Java使用hanlp+Hash(分词)计算文章相似度--688IT编程网

Java使用hanlp+Hash（分词）计算文章相似度

1. 引入maven依赖

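<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <!-- The <version> element was lost in extraction. The code below uses
         org.jsoup.safety.Whitelist, so pick a jsoup release that still provides it. -->
</dependency>

<dependency>
    <groupId>com.hankcs</groupId>
    <artifactId>hanlp</artifactId>
    <version>portable-1.8.2</version>
</dependency>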

2.创建工具类

package;

import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.StandardTokenizer;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.safety.Whitelist;

import java.math.BigInteger;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**

* 计算两篇文章相似度

* @Author HB

* @Date 2022/2/10 11:50

**/

public class SimilarityUtils {

/** Text to fingerprint; simHash() replaces it with its cleaned form. */
private String tokens;

/** SimHash fingerprint of the text, computed once by the constructor. */
private BigInteger strSimHash;

/** Number of bits in the fingerprint; also the length of the per-bit vote array. */
private int hashbits;

/**
 * Stores the input and immediately computes its SimHash fingerprint.
 *
 * @param tokens   text to fingerprint
 * @param hashbits number of bits in the fingerprint
 */
private SimilarityUtils(String tokens, int hashbits) {
    // Must be assigned before simHash() runs: simHash() reads this.tokens.
    this.tokens = tokens;
    this.hashbits = hashbits;
    this.strSimHash = this.simHash();
}

/**

* 清除html标签

* @param content

* @return

private String cleanResume(String content){

// 若输⼊为HTML，下⾯会过滤掉所有的HTML的tag

content = Jsoup.clean(content, ());

content = StringUtils.lowerCase(content);

String[] strings ={" ","\n","\r","\t","\\r","\\n","\\t"," "};

for(String s : strings){

content = placeAll(s,"");

}

return content;

}

* 这个是对整个字符串进⾏hash计算

* @return

private BigInteger simHash(){

tokens =cleanResume(tokens);// cleanResume 删除⼀些特殊字符

int[] v =new int[this.hashbits];

List<Term> termList = StandardTokenizer.kens);// 对字符串进⾏分词//对分词的⼀些特殊处理 : ⽐如: 根据词性添加权重 , 过滤掉标点符号 , 过滤超频词汇等;

Map<String, Integer> weightOfNature =new HashMap<String, Integer>();// 词性的权重

weightOfNature.put("n",2);//给名词的权重是2;

Map<String, String> stopNatures =new HashMap<String, String>();//停⽤的词性如⼀些标点符号之类的; stopNatures.put("w","");//

int overCount =5;//设定超频词汇的界限 ;

Map<String, Integer> wordCount =new HashMap<String, Integer>();

for(Term term : termList){

String word = term.word;//分词字符串

String nature = String();// 分词属性;

// 过滤超频词

ainsKey(word)){

int count = (word);

if(count > overCount){

continue;

}

wordCount.put(word, count +1);

}else{

wordCount.put(word,1);

}

// 过滤停⽤词性

ainsKey(nature)){

continue;

}

// 2、将每⼀个分词hash为⼀组固定长度的数列.⽐如 64bit 的⼀个整数.

BigInteger t =this.hash(word);

for(int i =0; i <this.hashbits; i++){

BigInteger bitmask =new BigInteger("1").shiftLeft(i);

// 3、建⽴⼀个长度为64的整数数组(假设要⽣成64位的数字指纹，也可以是其它数字),

// 对每⼀个分词hash后的数列进⾏判断，如果是1，那么数组的第⼀位和末尾⼀位加1,

// 中间的62位减⼀，也就是说，逢1加1，逢0减1.⼀直到把所有的分词hash数列全部判断完毕.

int weight =1;//添加权重

ainsKey(nature)){

weight = (nature);

}

if(t.and(bitmask).signum()!=0){

// 这⾥是计算整个⽂档的所有特征的向量和

v[i]+= weight;

}else{

v[i]-= weight;

}

BigInteger fingerprint =new BigInteger("0");

for(int i =0; i <this.hashbits; i++){

if(v[i]>=0){

fingerprint = fingerprint.add(new BigInteger("1").shiftLeft(i));

}

return fingerprint;

}

/**

* 对单个的分词进⾏hash计算;

* @param source

* @return

private BigInteger hash(String source){

if(source ==null|| source.length()==0){

return new BigInteger("0");

}else{

/**

* 当sourece 的长度过短，会导致hash算法失效，因此需要对过短的词补偿

while(source.length()<3){

source = source + source.charAt(0);

}

char[] sourceArray = CharArray();

BigInteger x = BigInteger.valueOf(((long) sourceArray[0])<<7);

BigInteger m =new BigInteger("1000003");

BigInteger mask =new BigInteger("2").pow(this.hashbits).subtract(new BigInteger("1"));

for(char item : sourceArray){

BigInteger temp = BigInteger.valueOf((long) item);

x = x.multiply(m).xor(temp).and(mask);

}

x = x.xor(new BigInteger(String.valueOf(source.length())));

if(x.equals(new BigInteger("-1"))){

x =new BigInteger("-2");

weight是什么词性}

return x;

}

/**

* 计算海明距离，海明距离越⼩说明越相似;

* 等于0时证明完全相似

* @param other

* @return

private int hammingDistance(SimilarityUtils other){

BigInteger m =new BigInteger("1").shiftLeft(this.hashbits).subtract(

new BigInteger("1"));

BigInteger x =(other.strSimHash).and(m);

int tot =0;

while(x.signum()!=0){

tot +=1;

x = x.and(x.subtract(new BigInteger("1")));

}

return tot;

}

/**

* 等于1时，完全相似

* @Author HB

* @param s2

* @return double

* @Date 2022/2/12 10:47

**/

public double getSemblance(SimilarityUtils s2){

double i =(double)this.hammingDistance(s2);

return1- i /this.hashbits;

}

/**
 * Computes the similarity of two texts using 64-bit fingerprints.
 * The article treats a score above 0.85 as similar; 1 means identical fingerprints.
 *
 * @param inValue  the input text
 * @param outValue the text to compare it with
 * @return the similarity score of the two texts
 */
public static double getRatio(String inValue, String outValue) {
    SimilarityUtils hash1 = new SimilarityUtils(inValue, 64);
    SimilarityUtils hash2 = new SimilarityUtils(outValue, 64);
    return hash1.getSemblance(hash2);
}

3.应用

public static void main(String[] args){

SimilarityUtils hash1 =new SimilarityUtils("⽼铁，加个关注呗666",64); SimilarityUtils hash2 =new SimilarityUtils("⽼铁，加个关注呗6666",64);

//海明值计算

System.out.println(hash1.hammingDistance(hash2));

//相似率值

System.out.Semblance(hash2));

}

4.控制台输出结果

688IT编程网

Java使用hanlp+Hash(分词)计算文章相似度

发表评论

推荐文章

java正则表达式选择题

一种基于正则表达式的DBC文件解析及报文分析方法[发明专利]

工龄小数点提取

非零金额正则表达式

提取文本中数字的函数

热门文章

excel文字递增函数公式

数字递增公式

notepad 正则变量运算

C++regex库常用函数及实例

js正则表达式之前瞻后顾与非捕获分组

indesign正则数字和英文之间的空格

C#匹配中文字符串的4种正则表达式分享

PHP正则表达式匹配中文字符

匹配中文汉字的正则表达式介绍

Python正则表达式如何进行字符串替换

orcl中用正则表达式

sql正则表达式excel

dataframe正则表达式

postgress sql正则

el-upload accept 正则表达式

半小时正则表达式

判断科学计数法的正则

根据url判断静态资源的方法

Java正则表达式-匹配正负浮点数

替换模糊匹配正则-hive

最新文章

一种基于正则表达式的DBC文件解析及报文分析方法[发明专利]

能被5整除的十进制整数的正规表达式

大于0小于等于1的正则表达式

linux grep 26个字母

java pattern 正则表达式

掌握文本编辑器中的搜索和替换技巧

标签列表

688IT编程网

Java使用hanlp+Hash(分词)计算文章相似度

发表评论

推荐文章

java正则表达式 选择题

一种基于正则表达式的DBC文件解析及报文分析方法[发明专利]

工龄小数点提取

非零金额 正则表达式

提取文本中数字的函数

热门文章

excel文字递增函数公式

数字递增公式

notepad 正则变量运算

C++regex库常用函数及实例

js正则表达式之前瞻后顾与非捕获分组

indesign正则数字和英文之间的空格

C#匹配中文字符串的4种正则表达式分享

PHP正则表达式匹配中文字符

匹配中文汉字的正则表达式介绍

Python正则表达式如何进行字符串替换

orcl中用正则表达式

sql正则表达式excel

dataframe正则表达式

postgress sql正则

el-upload accept 正则表达式

半小时 正则表达式

判断科学计数法的正则

根据url判断静态资源的方法

Java正则表达式-匹配正负浮点数

替换模糊匹配正则-hive

最新文章

一种基于正则表达式的DBC文件解析及报文分析方法[发明专利]

能被5整除的十进制整数的正规表达式

大于0小于等于1的正则表达式

linux grep 26个字母

java pattern 正则表达式

掌握文本编辑器中的搜索和替换技巧

标签列表

java正则表达式选择题

非零金额正则表达式

半小时正则表达式