Java 使用 HanLP + SimHash(分词)计算文章相似度
1. 引入 maven 依赖
<!-- Dependencies for the SimHash-based article similarity utility -->
<!-- jsoup: used by the utility class to strip HTML tags from the input text -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
<!-- HanLP (portable build): provides the StandardTokenizer used for word segmentation -->
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.8.2</version>
</dependency>
2. 创建工具类
// NOTE(review): the package name was lost in the published listing; declare the
// package that matches this file's location in your project.

import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.StandardTokenizer;
// NOTE(review): StringUtils.lowerCase is presumably Apache Commons Lang 3, which is
// not among the Maven dependencies listed above; confirm and add it if missing.
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.safety.Whitelist;

import java.math.BigInteger;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* 计算两篇⽂章相似度
*
* @Author HB
* @Date 2022/2/10 11:50
**/
public class SimilarityUtils {
private String tokens;//字符串
private BigInteger strSimHash;//字符产的hash值
private int hashbits;// 分词后的hash数;
private SimilarityUtils(String tokens,int hashbits){
this.hashbits = hashbits;
this.strSimHash =this.simHash();
}
/**
* 清除html标签
*
* @param content
* @return
*/
private String cleanResume(String content){
// 若输⼊为HTML,下⾯会过滤掉所有的HTML的tag
content = Jsoup.clean(content, ());
content = StringUtils.lowerCase(content);
String[] strings ={" ","\n","\r","\t","\\r","\\n","\\t"," "};
for(String s : strings){
content = placeAll(s,"");
content = placeAll(s,"");
}
return content;
}
/
**
* 这个是对整个字符串进⾏hash计算
*
* @return
*/
private BigInteger simHash(){
tokens =cleanResume(tokens);// cleanResume 删除⼀些特殊字符
int[] v =new int[this.hashbits];
List<Term> termList = StandardTokenizer.kens);// 对字符串进⾏分词//对分词的⼀些特殊处理 : ⽐如: 根据词性添加权重 , 过滤掉标点符号 , 过滤超频词汇等;
Map<String, Integer> weightOfNature =new HashMap<String, Integer>();// 词性的权重
weightOfNature.put("n",2);//给名词的权重是2;
Map<String, String> stopNatures =new HashMap<String, String>();//停⽤的词性如⼀些标点符号之类的;        stopNatures.put("w","");//
int overCount =5;//设定超频词汇的界限 ;
Map<String, Integer> wordCount =new HashMap<String, Integer>();
for(Term term : termList){
String word = term.word;//分词字符串
String nature = String();// 分词属性;
//  过滤超频词
ainsKey(word)){
int count = (word);
if(count > overCount){
continue;
}
wordCount.put(word, count +1);
}else{
wordCount.put(word,1);
}
// 过滤停⽤词性
ainsKey(nature)){
continue;
}
// 2、将每⼀个分词hash为⼀组固定长度的数列.⽐如 64bit 的⼀个整数.
BigInteger t =this.hash(word);
for(int i =0; i <this.hashbits; i++){
BigInteger bitmask =new BigInteger("1").shiftLeft(i);
// 3、建⽴⼀个长度为64的整数数组(假设要⽣成64位的数字指纹,也可以是其它数字),
// 对每⼀个分词hash后的数列进⾏判断,如果是1,那么数组的第⼀位和末尾⼀位加1,
// 中间的62位减⼀,也就是说,逢1加1,逢0减1.⼀直到把所有的分词hash数列全部判断完毕.
int weight =1;//添加权重
ainsKey(nature)){
weight = (nature);
}
if(t.and(bitmask).signum()!=0){
// 这⾥是计算整个⽂档的所有特征的向量和
v[i]+= weight;
}else{
v[i]-= weight;
}
}
}
BigInteger fingerprint =new BigInteger("0");
BigInteger fingerprint =new BigInteger("0");
for(int i =0; i <this.hashbits; i++){
if(v[i]>=0){
fingerprint = fingerprint.add(new BigInteger("1").shiftLeft(i));
}
}
return fingerprint;
}
/**
* 对单个的分词进⾏hash计算;
*
* @param source
* @return
*/
private BigInteger hash(String source){
if(source ==null|| source.length()==0){
return new BigInteger("0");
}else{
/**
* 当sourece 的长度过短,会导致hash算法失效,因此需要对过短的词补偿
*/
while(source.length()<3){
source = source + source.charAt(0);
}
char[] sourceArray = CharArray();
BigInteger x = BigInteger.valueOf(((long) sourceArray[0])<<7);
BigInteger m =new BigInteger("1000003");
BigInteger mask =new BigInteger("2").pow(this.hashbits).subtract(new BigInteger("1"));
for(char item : sourceArray){
BigInteger temp = BigInteger.valueOf((long) item);
x = x.multiply(m).xor(temp).and(mask);
}
x = x.xor(new BigInteger(String.valueOf(source.length())));
if(x.equals(new BigInteger("-1"))){
x =new BigInteger("-2");
weight是什么词性}
return x;
}
}
/**
* 计算海明距离,海明距离越⼩说明越相似;
* 等于0时证明完全相似
* @param other
* @return
*/
private int hammingDistance(SimilarityUtils other){
BigInteger m =new BigInteger("1").shiftLeft(this.hashbits).subtract(
new BigInteger("1"));
BigInteger x =(other.strSimHash).and(m);
int tot =0;
while(x.signum()!=0){
tot +=1;
x = x.and(x.subtract(new BigInteger("1")));
}
return tot;
}
/**
* 等于1时,完全相似
* @Author HB
* @param s2
* @return  double
* @Date 2022/2/12 10:47
*
**/
public double getSemblance(SimilarityUtils s2){
double i =(double)this.hammingDistance(s2);
return1- i /this.hashbits;
}
/**
* 相似率 >0.85 为相似  1为完全相似
*
* @param inValue 输⼊参数  outValue 对⽐值
* @return null
* @Author HB 相似率
* @Date 2022/2/10 10:57
**/
public static double getRatio(String inValue, String outValue){
SimilarityUtils hash1 =new SimilarityUtils(inValue,64);
SimilarityUtils hash2 =new SimilarityUtils(outValue,64);
Semblance(hash2);
}
}
3. 应用
public static void main(String[] args){
SimilarityUtils hash1 =new SimilarityUtils("⽼铁,加个关注呗666",64);        SimilarityUtils hash2 =new SimilarityUtils("⽼铁,加个关注呗6666",64);
//海明值计算
System.out.println(hash1.hammingDistance(hash2));
//相似率值
System.out.Semblance(hash2));
}
4.控制台输出结果

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。