NLP Primer Series 1: Processing the Sogou News Corpus and Training word2vec Word Vectors

News corpus preprocessing

First, define a few helper functions that convert full-width digits and English letters in the raw text to their half-width forms.
def is_Qnumber(uchar):
    """Check whether a unicode character is a full-width digit."""
    if uchar >= u'\uff10' and uchar <= u'\uff19':
        return True
    else:
        return False

def is_Qalphabet(uchar):
    """Check whether a unicode character is a full-width English letter."""
    if (uchar >= u'\uff21' and uchar <= u'\uff3a') or (uchar >= u'\uff41' and uchar <= u'\uff5a'):
        return True
    else:
        return False

def Q2B(uchar):
    """Convert a single character from full-width to half-width."""
    inside_code = ord(uchar)
    if inside_code == 0x3000:
        inside_code = 0x0020
    else:
        inside_code -= 0xfee0
    if inside_code < 0x0020 or inside_code > 0x7e:  # not a half-width character after conversion, return the original character
        return uchar
    return chr(inside_code)

def stringQ2B(ustring):
    """Convert a whole string from full-width to half-width."""
    return "".join([Q2B(uchar) for uchar in ustring])

def stringpartQ2B(ustring):
    """Convert only the digits and letters in a string from full-width to half-width."""
    return "".join([Q2B(uchar) if is_Qnumber(uchar) or is_Qalphabet(uchar) else uchar for uchar in ustring])
The overall flow is: read the XML-formatted corpus, extract the text inside the content tags, strip the tag markers, and save the corpus as plain text. You can also segment the text before saving; the segmented corpus can be fed directly into word2vec to train word vectors.
import codecs
import re
import jieba
from tqdm import tqdm

file_path = r'/path/news_tensite_xml.dat'  # raw corpus
save_path = r'/path/'                      # processed corpus containing only the article content
seg_save_path = r'/path/SougouNews_'       # corpus with the article content segmented

# read file
print("read news dataset:", file_path)
with codecs.open(file_path, encoding='GB2312', errors="ignore") as fr:
    news_data = fr.read()

# extract the text between <content> and </content>
print("extract news content ...")
news_content = re.findall('<content>.*</content>', news_data)

# write to text file without segmentation
print("write content to text file ...")
with codecs.open(save_path, 'w', encoding='utf8') as fw:
    # tqdm only draws a progress bar and can be left out
    for item in tqdm(news_content):
        item = re.sub(r'<content>|</content>|\s', '', item)
        # item = stringQ2B(item)     # convert everything from full-width to half-width
        item = stringpartQ2B(item)   # convert only digits and letters from full-width to half-width
        if item != "":
            fw.write(item + '\n')
Next, the corpus is segmented and written to a file. The jieba segmentation tool is used here, but any other segmenter would work.
# segment and write to text file
with codecs.open(seg_save_path, 'w', encoding='utf8') as fw:
    for content in tqdm(news_content):
        content = re.sub(r'<content>|</content>|\s', '', content)
        # content = stringQ2B(content)     # convert everything from full-width to half-width
        content = stringpartQ2B(content)   # convert only digits and letters from full-width to half-width
        if content != "":
            # segmentation is done with jieba here; other segmenters can be used as well
            content_seg = jieba.cut(content.strip())
            fw.write(" ".join(content_seg) + "\n")
Once preprocessing is done, the segmented corpus can be used to train word2vec. At this point you can also append other corpora you have collected yourself to the segmented file, which makes the training corpus richer; a sketch of merging files is shown below.
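If you do want to merge several segmented corpora into a single training file, a minimal sketch could look like the following; other_corpus_seg.txt and merged_corpus.txt are hypothetical file names.

# concatenate several segmented text files (one document per line) into one training corpus
merged_path = r'/path/merged_corpus.txt'
with codecs.open(merged_path, 'w', encoding='utf8') as fw:
    for part in [seg_save_path, r'/path/other_corpus_seg.txt']:
        with codecs.open(part, encoding='utf8') as fr:
            for line in fr:
                fw.write(line)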
Training word vectors with word2vec
The theory behind word2vec is not covered here; a quick search will turn up plenty of tutorials. Instead, here is a concrete example of training word vectors on the corpus prepared above, using the word2vec model that ships with gensim.
import logging
import gensim.models as word2vec
from gensim.models.word2vec import LineSentence
def train_word2vec(dataset_path, model_path, size=100, window=5, binary=True):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    # wrap the corpus file as an iterable of sentences (one line = one sentence)
    sentences = LineSentence(dataset_path)
    # train the word2vec model
    # note: size/iter are the gensim 3.x argument names; in gensim 4+ they are vector_size/epochs
    model = word2vec.Word2Vec(sentences, size=size, window=window, min_count=5, workers=4, iter=10)
    # save the word2vec model
    if binary:
        model.wv.save_word2vec_format(model_path, binary=True)
    else:
        model.wv.save_word2vec_format(model_path, binary=False)
def load_word2vec_model(w2v_path):
    # load word vectors saved in word2vec binary format
    model = word2vec.KeyedVectors.load_word2vec_format(w2v_path, binary=True)
    return model

def calculate_most_similar(model, word):
    # print the words most similar to the given word together with their similarity scores
    similar_words = model.most_similar(word)
    print(word)
    for term in similar_words:
        print(term[0], term[1])
dataset_path = "/path/SougouNews_"
save_model_path = "word2vec_model.bin" # save_binary=True
# save_model_path = "" # save_binary=False
train_word2vec(dataset_path, save_model_path, size=100, window=5, binary=True)
After the word2vec model has been trained, you can compute the similarity between words and find the words most similar to a given word. The vectors can also serve as pretrained word embeddings for other NLP tasks.
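As a usage sketch, the helper functions defined above can be called like this; the query words are only examples and are assumed to appear in the vocabulary.

# load the binary model saved above and query it
model = load_word2vec_model(save_model_path)
calculate_most_similar(model, u"新闻")     # words most similar to the query word (assumed to be in the vocabulary)
print(model.similarity(u"新闻", u"报道"))  # cosine similarity between two words (both assumed to be in the vocabulary)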
