# NLP: sliding-window (n-gram) function
import re
def compute_ngrams(word, min_n=2, max_n=None):
    """Return the unique sliding-window n-grams of *word*.

    The word is first normalised by ``segword``: it is lower-cased, spaces
    are removed and every run of ASCII letters/digits is collapsed into a
    single upper-case tag character.  Windows are then taken over that
    collapsed string, so an English token such as ``"cad"`` always counts as
    one position and is never split in the middle.  Before a window is
    stored, each tag is expanded back to its original run followed by a
    single space (``"A模"`` becomes ``"cad 模"``).

    Args:
        word: Text to split; may mix ASCII tokens with other characters.
        min_n: Smallest window length, measured on the collapsed string.
            Values below 1 are treated as 1.
        max_n: Largest window length.  ``None`` (the default) means the full
            length of the collapsed string; larger values are clamped to it.

    Returns:
        A list of distinct n-grams, ordered by window length and then by
        start position (first occurrence wins).
    """
    extended_word, tag_dict = segword(word)

    total = len(extended_word)
    longest = total if max_n is None else min(total, max_n)
    shortest = max(min_n, 1)

    ngrams = []
    for ngram_length in range(shortest, longest + 1):
        for start in range(total - ngram_length + 1):
            window = extended_word[start:start + ngram_length]
            # Look tags up directly instead of testing str.isalpha(), which
            # is also true for CJK characters and would hit missing keys.
            ngrams.append(
                "".join(tag_dict[c] + " " if c in tag_dict else c for c in window)
            )

    # dict.fromkeys de-duplicates while keeping a deterministic order,
    # unlike list(set(...)) whose order changes between runs.
    return list(dict.fromkeys(ngrams))
def segword(word):
    """Collapse each ASCII letter/digit run in *word* into a one-character tag.

    The input is lower-cased, every run matching ``[a-zA-Z0-9]+`` is replaced
    by the next unused upper-case letter (``A``, ``B``, ...) and literal
    space characters are removed.  Because the text is lower-cased first,
    an upper-case ASCII letter in the result is always a tag.

    Args:
        word: Text to normalise.

    Returns:
        A ``(new_word, tag_word)`` tuple: the collapsed string and a dict
        mapping each tag character to the lower-cased run it replaced.

    Raises:
        IndexError: If *word* contains more than 26 letter/digit runs, since
            only the tags ``A``-``Z`` are available.
    """
    word = word.lower()
    tags = [chr(code) for code in range(ord("A"), ord("Z") + 1)]
    tag_word = {}

    def _tag_run(match):
        # Assign tags in order of appearance; len(tag_word) is the next index.
        tag = tags[len(tag_word)]
        tag_word[tag] = match.group(0)
        return tag

    # A single regex pass replaces whole runs only, so a run that is a
    # prefix of a later one (e.g. "a ab") cannot corrupt the result the way
    # chained str.replace calls would.
    new_word = re.sub(r"[a-zA-Z0-9]+", _tag_run, word)
    return new_word.replace(" ", ""), tag_word
# Demo: compute the n-grams of a mixed English/Chinese term and print them.
r = compute_ngrams("cad模具设计")
print(r)
# 版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
# 发表评论