4 Methods for Computing Sentence Similarity

Edit Distance
Edit distance counts the minimum number of edit operations needed to turn one string into the other: the more edits required, the larger the distance, and the less related the two strings. For example, converting "xiaoming" into "xiamin" takes two steps:

remove 'o'
remove 'g'

So the edit count, i.e. the distance, is 2.
!pip install distance

import distance

def edit_distance(s1, s2):
    # Levenshtein distance between the two strings
    return distance.levenshtein(s1, s2)

s1 = 'xiaoming'
s2 = 'xiamin'
print('Distance: ' + str(edit_distance(s1, s2)))
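The distance package hides the actual computation. As a reference, here is a minimal pure-Python sketch of the standard dynamic-programming Levenshtein algorithm (my own illustration, not from the original post); it returns the same count:

import numpy as np  # not needed here; plain lists suffice

def levenshtein(s1, s2):
    # previous[j] holds the distance between s1[:i-1] and s2[:j]
    previous = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1, start=1):
        current = [i]
        for j, c2 in enumerate(s2, start=1):
            insert_cost = current[j - 1] + 1
            delete_cost = previous[j] + 1
            replace_cost = previous[j - 1] + (c1 != c2)
            current.append(min(insert_cost, delete_cost, replace_cost))
        previous = current
    return previous[-1]

print(levenshtein('xiaoming', 'xiamin'))  # 2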
Jaccard Coefficient
The Jaccard coefficient compares the similarity and diversity of finite sample sets: the larger the coefficient, the more similar the samples. It is computed as the size of the intersection of the two samples divided by the size of their union.
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

def jaccard_similarity(s1, s2):
    def add_space(s):
        return ' '.join(list(s))
    # Insert spaces between the characters so each one becomes a token
    s1, s2 = add_space(s1), add_space(s2)
    # Convert to a term-frequency matrix
    cv = CountVectorizer(tokenizer=lambda s: s.split())
    corpus = [s1, s2]
    vectors = cv.fit_transform(corpus).toarray()
    # Intersection: element-wise minimum
    numerator = np.sum(np.min(vectors, axis=0))
    # Union: element-wise maximum
    denominator = np.sum(np.max(vectors, axis=0))
    # Jaccard coefficient
    return 1.0 * numerator / denominator

s1 = '你在干啥呢'
s2 = '你在干什么呢'
print(jaccard_similarity(s1, s2))
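Because neither sentence repeats a character, the same value also falls out of plain Python sets. A minimal set-based sketch of the same formula (an illustration, not from the original post):

def jaccard_set(s1, s2):
    set1, set2 = set(s1), set(s2)
    # |intersection| / |union|
    return len(set1 & set2) / len(set1 | set2)

print(jaccard_set('你在干啥呢', '你在干什么呢'))  # 4/7 ≈ 0.571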
TF Computation

Compute the similarity of the two vectors in the TF matrix as the cosine of the angle between them:

cos θ = a · b / (|a| |b|)
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from scipy.linalg import norm

def tf_similarity(s1, s2):
    def add_space(s):
        return ' '.join(list(s))
    # Insert spaces between the characters so each one becomes a token
    s1, s2 = add_space(s1), add_space(s2)
    # Convert to a term-frequency matrix
    cv = CountVectorizer(tokenizer=lambda s: s.split())
    corpus = [s1, s2]
    vectors = cv.fit_transform(corpus).toarray()
    # Cosine similarity of the two TF vectors
    return np.dot(vectors[0], vectors[1]) / (norm(vectors[0]) * norm(vectors[1]))

s1 = '你在干啥呢'
s2 = '你在干什么呢'
print(tf_similarity(s1, s2))
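Two notes on the code above. First, the custom tokenizer is needed because CountVectorizer's default token_pattern drops single-character tokens, which would discard every Chinese character here. Second, the result is easy to verify by hand: the combined vocabulary is 你 在 干 啥 呢 什 么, giving the vectors below (a hand-worked check, not from the original post):

import numpy as np

# Vocabulary: 你 在 干 啥 呢 什 么
v1 = np.array([1, 1, 1, 1, 1, 0, 0])  # 你在干啥呢
v2 = np.array([1, 1, 1, 0, 1, 1, 1])  # 你在干什么呢

# cos θ = a·b / (|a| |b|) = 4 / sqrt(5 * 6) ≈ 0.730
print(v1.dot(v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))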
An Advanced Model: BERT

For BERT's internal structure, see the article "From word2vec to BERT"; this post only covers the code. You can either download the BERT source code or use the model through TF-HUB; this time we use the downloaded source.

First, download the source code from GitHub, then download Google's pre-trained model; we choose BERT-Base, Chinese.
After the pre-trained model is downloaded and extracted, it contains the following files: vocab.txt is the vocabulary used for the Chinese text during training; bert_config.json holds the BERT parameters that can be adjusted for training; the remaining files store the model structure and weights.
Prepare the Dataset
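The original post does not show the data files themselves. Judging from the processor code below, train.tsv and dev.tsv are expected to hold one example per line, tab-separated, with the label ("0" or "1") in the first column and the sentence in the second, while test.tsv holds only the sentence. A hypothetical illustration of two train.tsv lines:

1	这部电影的剧情很精彩
0	节奏拖沓，不推荐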
Modify the processor
# Added to run_classifier.py, where csv, os, tf and tokenization are already imported.
class MoveProcessor(DataProcessor):
    """Processor for the move data set."""

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with tf.gfile.Open(input_file, "r") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
                lines.append(line)
            return lines

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            if set_type == "test":
                # test.tsv carries only the text, so a dummy label is used
                text_a = tokenization.convert_to_unicode(line[0])
                label = "0"
            else:
                # train.tsv / dev.tsv: label in column 0, text in column 1
                text_a = tokenization.convert_to_unicode(line[1])
                label = tokenization.convert_to_unicode(line[0])
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples
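Note that this processor fills only text_a and leaves text_b=None, so BERT is fine-tuned here as a single-sentence binary classifier; for a true sentence-pair similarity task, text_b would carry the second sentence of each pair.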
Modify the processor dictionary
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "xnli": XnliProcessor,
        "setest": MoveProcessor,
    }
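run_classifier.py lowercases the --task_name flag and uses it to look up the processor in this dictionary, so the key "setest" added here is exactly what the training command below passes as --task_name.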
Train the BERT Model
export BERT_BASE_DIR=/Users/xiaomingtai/Downloads/chinese_L-12_H-768_A-12
export MY_DATASET=/Users/xiaomingtai/Downloads/bert_model

python run_classifier.py \
  --data_dir=$MY_DATASET \
  --task_name=setest \
  --vocab_file=$BERT_BASE_DIR/vocab.txt \
  --bert_config_file=$BERT_BASE_DIR/bert_config.json \
  --output_dir=/Users/xiaomingtai/Downloads/ber_model_output/ \
  --do_train=true \
  --do_eval=true \
  --do_predict=true \
  --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
  --max_seq_length=128 \
  --train_batch_size=16 \
  --eval_batch_size=8 \
  --predict_batch_size=2
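When the run finishes, run_classifier.py reports eval metrics and, because --do_predict=true, writes a test_results.tsv into output_dir with one line of tab-separated class probabilities per test example. A minimal sketch for reading the predictions back (the path is simply the output_dir from the command above):

import csv

results_path = '/Users/xiaomingtai/Downloads/ber_model_output/test_results.tsv'
with open(results_path) as f:
    for row in csv.reader(f, delimiter='\t'):
        p0, p1 = map(float, row)  # probabilities for labels "0" and "1"
        print('predicted label:', int(p1 > p0), ' p(1) = %.4f' % p1)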