text_similarity faiss
#!pip install faiss-cpu -i https://pypi.tuna.tsinghua.edu.cn/simple
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# 512 dimensions; data holds 2000 vectors, each drawn from a normal distribution
d = 512
n_data = 2000
np.random.seed(0)
data = []
mu = 3
sigma = 0.1
for i in range(n_data):
    data.append(np.random.normal(mu, sigma, d))
data = np.array(data).astype('float32')
# print(data[0])
print(data.shape)
# Check whether the 6th vector follows a normal distribution
import matplotlib.pyplot as plt
plt.hist(data[5])
plt.show()
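# Sanity-check sketch: the empirical mean/std of one sampled vector should be close
# to mu = 3 and sigma = 0.1 if the generation above worked as intended.
print('mean:', data[5].mean(), 'std:', data[5].std())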
# Exact index (brute-force search)
n_query = 10
mu = 3
sigma = 0.1
np.random.seed(12)
query = []
for i in range(n_query):
    query.append(np.random.normal(mu, sigma, d))
query = np.array(query).astype('float32')
import faiss
index = faiss.IndexFlatL2(d)  # build an exact IndexFlatL2 index
print(index.is_trained)  # if False, the index needs train() before adding data
# add the data to the index
index.add(data)
print(index.ntotal)  # number of vectors in the index
# an exact index needs no training and can be queried directly
k = 10  # number of results to return
query_self = data[:5]  # query with the data itself
# return the top-k results
dis, ind = index.search(query_self, k)
print(dis.shape)  # (5, 10)
print(ind.shape)  # (5, 10)
print(dis)  # distances for each query, in ascending order
print(ind)  # indices of the nearest neighbors for each query
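# Cross-check sketch: IndexFlatL2 should agree with a brute-force NumPy computation.
# Note that faiss returns *squared* L2 distances, so we compare against squared norms.
brute = ((data - query_self[0]) ** 2).sum(axis=1)
print(np.allclose(np.sort(brute)[:k], dis[0], atol=1e-2))  # expected: True (up to float32 rounding)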
# Inverted-file (IVF) fast index
nlist = 50  # number of Voronoi cells to split the database vectors into
k = 10
quantizer = faiss.IndexFlatL2(d)  # quantizer
# METRIC_L2 computes L2 distance; use faiss.METRIC_INNER_PRODUCT for inner product
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
print(index.is_trained)
# an inverted-file index must be trained; the training set should follow the same distribution as the database
index.train(data)
print(index.is_trained)
index.add(data)
# some accuracy is lost; nprobe is the number of cells to search
# cells are built by clustering, so similar vectors land in the same cell
index.nprobe = 1  # number of Voronoi cells to probe during search
#dis, ind = index.search(query, k)
dis, ind = index.search(query_self, k)
print(dis)
print(ind)
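# Recall sketch: nprobe trades speed for accuracy. index_flat is a helper built here
# only for comparison; the IVF results are checked against the exhaustive ground truth,
# so the exact numbers depend on the data and are illustrative only.
index_flat = faiss.IndexFlatL2(d)
index_flat.add(data)
_, gt = index_flat.search(query_self, k)
for nprobe in (1, 10, 50):
    index.nprobe = nprobe
    _, ivf_ind = index.search(query_self, k)
    recall = float((ivf_ind == gt).mean())  # fraction of positions matching the exact result
    print('nprobe=%d recall@%d=%.2f' % (nprobe, k, recall))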
# PQ (product quantization) reduces memory on top of the clustering
# Product quantization index
nlist = 50
m = 8  # number of sub-vectors each vector is split into; d must be divisible by m
k = 10
quantizer = faiss.IndexFlatL2(d)
# the final 8 means each sub-vector is encoded with 8 bits
index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 8)
index.train(data)  # IVFPQ also needs training before vectors can be added
index.add(data)
index.nprobe = 50
dis, ind = index.search(query_self, k)  # query with the data itself
print(dis)
print(ind)
"""
dis, ind = index.search(query, k)  # real queries
print(dis)
print(ind)
"""
#!pip install jieba -i https://pypi.tuna.tsinghua.edu.cn/simple
#!pip install editdistance -i https://pypi.tuna.tsinghua.edu.cn/simple
import re
import numpy as np
import pandas as pd
import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from tqdm import tqdm
from pprint import pprint
import os
# Load the stopword list
with open('', 'r', encoding='utf-8') as file:
    stopwords = [i[:-1] for i in file.readlines()]
#stopwords = [line.strip() for line in open('', encoding='UTF-8').readlines()]
# Load the data
news = pd.read_csv('sqlResult.csv',encoding='gb18030')
print(news.shape)
print(news.head(5))
# Handle missing values
print(news[news.content.isna()].head(5))
news = news.dropna(subset=['content'])
print(news.shape)
# Tokenization
def split_text(text):
    #return ' '.join([w for w in list(jieba.cut(re.sub('\s|[%s]' % (punctuation), '', text))) if w not in stopwords])
    text = text.replace(' ', '')
    text = text.replace('\n', '')
    text2 = jieba.cut(text.strip())
    result = ' '.join([w for w in text2 if w not in stopwords])
    return result
print(news.iloc[0].content)
print(split_text(news.iloc[0].content))
if not os.path.exists("corpus.pkl"):
    # tokenize every document
    corpus = list(map(split_text, [str(i) for i in news.content]))
    print(corpus[0])
    print(len(corpus))
    print(corpus[1])
    # save to a file so it can be reused next time
    with open('corpus.pkl', 'wb') as file:
        pickle.dump(corpus, file)
else:
    # load the previously processed result
    with open('corpus.pkl', 'rb') as file:
        corpus = pickle.load(file)
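# Consistency sketch: whichever branch ran, corpus should contain one entry per
# remaining news row (the cached pickle may be stale if the CSV changed).
print(len(corpus), news.shape[0])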
# Build the TF-IDF matrix of the corpus
countvectorizer = CountVectorizer(encoding='gb18030', min_df=0.015)
tfidftransformer = TfidfTransformer()
countvector = countvectorizer.fit_transform(corpus)
print(countvector.shape)
tfidf = tfidftransformer.fit_transform(countvector)
# save to a file so it can be reused next time
with open('tfidf.pkl', 'wb') as file:
    pickle.dump(tfidf, file)
print(tfidf.shape)
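# Inspection sketch: list the highest-weighted TF-IDF terms of the first document, a
# quick way to check that stopword removal and the min_df pruning look reasonable.
# (get_feature_names_out needs scikit-learn >= 1.0; older versions use get_feature_names.)
feature_names = countvectorizer.get_feature_names_out()
row = tfidf[0].toarray().ravel()
for idx in row.argsort()[::-1][:10]:
    print(feature_names[idx], round(float(row[idx]), 3))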
# Label whether an article is Xinhua's own news (source contains '新华')
label = list(map(lambda source: 1 if '新华' in str(source) else 0, news.source))
#print(label)
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(tfidf.toarray(), label, test_size=0.3, random_state=42)
clf = MultinomialNB()
clf.fit(X=X_train, y=y_train)
"""
# 进⾏CV=3折交叉验证
scores=cross_validate(clf, X_train, y_train, scoring=('accuracy','precision','recall','f1'), cv=3, return_train_score=True) pprint(scores)
"""
y_predict = clf.predict(X_test)
def show_test_result(y_true, y_pred):
    print('accuracy:', accuracy_score(y_true, y_pred))
    print('precision:', precision_score(y_true, y_pred))
    print('recall:', recall_score(y_true, y_pred))
    print('f1_score:', f1_score(y_true, y_pred))
show_test_result(y_test, y_predict)
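# Evaluation sketch: a confusion matrix makes the error types explicit; the false
# positives (predicted Xinhua but labeled otherwise) are exactly the candidate copies
# used in the next step.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_predict))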
# Use the model to detect plagiarized news
prediction = clf.predict(tfidf.toarray())
labels = np.array(label)
# compare_news_index has two columns: prediction (model output) and labels (ground truth)
compare_news_index = pd.DataFrame({'prediction': prediction, 'labels': labels})
# copy_news_index: articles that may be copies (predicted 1 but actually not from Xinhua)
copy_news_index = compare_news_index[(compare_news_index['prediction'] == 1) & (compare_news_index['labels'] == 0)].index
# articles that really are from Xinhua News Agency
xinhuashe_news_index = compare_news_index[(compare_news_index['labels'] == 1)].index
print('Number of possibly copied articles:', len(copy_news_index))
if not os.path.exists("label.pkl"):
    # cluster the articles with k-means
    from sklearn.preprocessing import Normalizer
    from sklearn.cluster import KMeans
    normalizer = Normalizer()
    scaled_array = normalizer.fit_transform(tfidf.toarray())
    # run K-Means over the full set of documents
    kmeans = KMeans(n_clusters=25, random_state=42, n_jobs=-1)  # note: n_jobs was removed in scikit-learn >= 1.0
    k_labels = kmeans.fit_predict(scaled_array)
    # save to a file so it can be reused next time
    with open('label.pkl', 'wb') as file:
        pickle.dump(k_labels, file)
    print(k_labels.shape)
    print(k_labels[0])
else:
    # load the previously processed result
    with open('label.pkl', 'rb') as file:
        k_labels = pickle.load(file)
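# Distribution sketch: the sizes of the 25 clusters over the full corpus; a heavily
# skewed split would make the cluster-based filtering below less effective.
print(np.bincount(np.asarray(k_labels)))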
if not os.path.exists("id_class.pkl"):
    # build id_class: document index -> cluster label
    id_class = {index: class_ for index, class_ in enumerate(k_labels)}
    # save to a file so it can be reused next time
    with open('id_class.pkl', 'wb') as file:
        pickle.dump(id_class, file)
else:
    # load the previously processed result
    with open('id_class.pkl', 'rb') as file:
        id_class = pickle.load(file)
if not os.path.exists("class_id.pkl"):
    from collections import defaultdict
    # build class_id: key is the cluster label, value is the set of document indices
    class_id = defaultdict(set)
    for index, class_ in id_class.items():
        # only keep documents published by Xinhua News Agency
        if index in xinhuashe_news_index.tolist():
            class_id[class_].add(index)
    # save to a file so it can be reused next time
    with open('class_id.pkl', 'wb') as file:
        pickle.dump(class_id, file)
else:
    # load the previously processed result
    with open('class_id.pkl', 'rb') as file:
        class_id = pickle.load(file)
# Print the number of documents in each cluster
count = 0
for k in class_id:
    print(count, len(class_id[k]))
    count += 1
# Find similar texts (the clustering result is used as a filter)
def find_similar_text(cpindex, top=10):
    # only search among Xinhua articles that fall in the same cluster as cpindex
    dist_dict = {i: cosine_similarity(tfidf[cpindex], tfidf[i]) for i in class_id[id_class[cpindex]]}
    # sort by similarity in descending order
    return sorted(dist_dict.items(), key=lambda x: x[1][0], reverse=True)[:top]
import editdistance
# Check the similarity for one specific article
#print(copy_news_index)
cpindex = 3352  # an index taken from copy_news_index
#print('in xinhuashe_news_index?', cpindex in xinhuashe_news_index)
#print('in copy_news_index?', cpindex in copy_news_index)
#print('3134 in xinhuashe_news_index?', 3134 in xinhuashe_news_index)
#print('3134 in copy_news_index?', 3134 in copy_news_index)
#print(cpindex)
similar_list = find_similar_text(cpindex)
print(similar_list)
print('Suspected copy:\n', news.iloc[cpindex].content)
# one similar original article
similar2 = similar_list[0][0]
print('Similar original:\n', news.iloc[similar2].content)
# Edit distance between the two articles
print('edit distance:', editdistance.eval(corpus[cpindex], corpus[similar2]))
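# Normalization sketch: raw edit distance grows with document length, so a ratio
# normalized by the longer text is easier to compare across article pairs.
a, b = corpus[cpindex], corpus[similar2]
print('normalized edit distance:', editdistance.eval(a, b) / max(len(a), len(b)))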
def find_similar_sentence(candidate, raw):
    similist = []
    cl = candidate.strip().split('。')
    ra = raw.strip().split('。')
    for c in cl:
        for r in ra:
            similist.append([c, r, editdistance.eval(c, r)])
    # the 5 most similar sentence pairs
    sort = sorted(similist, key=lambda x: x[2])[:5]
    for c, r, ed in sort:
        if c != '' and r != '':
            print('Suspected copied sentence: {0}\nSimilar original sentence: {1}\nEdit distance: {2}\n'.format(c, r, ed))
# Compare the suspected copy against its most similar original article
find_similar_sentence(news.iloc[cpindex].content, news.iloc[similar2].content)