Python jieba 分词小说与词频统计
1、知识点
"""
1)cut()
a) codecs.open() 解决编码问题
b) f.readline() 读取一行,也可以使用 f.readlines() 读取多行
c) words = " ".join(jieba.cut(line)) 分词,每个词用空格分隔
2)lcut()
返回一个 list 列表
"""
2、标点符号处理,并分词,存储到文件中
# Punctuation removed from every segmented line.  The first row is exactly
# what the original snippet listed.
# NOTE(review): the second row adds the full-width forms on the assumption
# that the ASCII marks in the original came from text extraction and the
# novel itself uses full-width punctuation -- confirm against the input file.
_FENCI_PUNCTUATION = (
    ",", "!", "“", "”", "。", "?", ":", "...", "、",
    ",", "!", "?", ":",
)


def fenCi(src="深渊主宰系统.txt", dst="深渊主宰系统_分词.txt"):
    """Segment a text file line by line with jieba and save the result.

    Each input line is cut with ``jieba.cut``, the tokens are joined with a
    single space, the punctuation in ``_FENCI_PUNCTUATION`` is removed, and
    the line is appended to the output file.  Lines are skipped when, after
    cleaning, they are shorter than 10 characters or start with ``-`` or
    ``.``.

    Trailing newlines are stripped before writing, so the output file is one
    continuous run of text rather than one line per input line (this matches
    the original snippet).

    :param src: path of the UTF-8 text file to segment.
    :param dst: path of the UTF-8 file to write.  The original snippet passed
        an empty string here, which cannot be opened; the default name is a
        placeholder chosen during review.
    :return: None
    """
    # Third-party dependency; imported locally so the function does not rely
    # on an import statement elsewhere in the file.
    import jieba

    min_length = 10  # cleaned lines shorter than this are dropped

    with open(src, "r", encoding="utf-8") as reader, \
            open(dst, "w", encoding="utf-8") as writer:
        for line in reader:
            # Join with a space so the token boundaries survive in the output.
            words = " ".join(jieba.cut(line))
            for mark in _FENCI_PUNCTUATION:
                words = words.replace(mark, "")

            # Progress/debug output kept from the original snippet.
            print(len(words))

            if len(words) < min_length or words.startswith(("-", ".")):
                continue

            writer.write(words.strip("\n"))
3、中文分词统计
# Punctuation removed from every token before counting.  The first row is
# exactly what the original snippet listed.
# NOTE(review): the second row adds the full-width forms on the assumption
# that the ASCII marks in the original came from text extraction -- confirm
# against the input file.
_ZHONGWEN_PUNCTUATION = (
    ",", "!", "“", "”", "。", "?", ":", "...", "、",
    ",", "!", "?", ":",
)


def zhongwen(path="深渊主宰系统.txt", top_n=15):
    """Print the most frequent multi-character words of a Chinese text.

    The whole file is read, segmented with ``jieba.lcut`` (which returns a
    list of tokens), and every token has the punctuation in
    ``_ZHONGWEN_PUNCTUATION`` and surrounding CR/LF removed.  Tokens that are
    empty or a single character after cleaning are ignored; the rest are
    counted.

    :param path: path of the UTF-8 text file to analyse.
    :param top_n: how many of the most frequent words to print.  Fewer lines
        are printed when the text has fewer distinct words.
    :return: None
    """
    from collections import Counter

    # Third-party dependency; imported locally so the function does not rely
    # on an import statement elsewhere in the file.
    import jieba

    with open(path, "r", encoding="utf-8") as reader:
        text = reader.read()

    counts = Counter()
    for token in jieba.lcut(text):
        for mark in _ZHONGWEN_PUNCTUATION:
            token = token.replace(mark, "")
        token = token.strip("\r\n")
        if len(token) <= 1:
            # Covers both the empty string and single-character tokens.
            continue
        counts[token] += 1

    # most_common() sorts by count, descending, and never over-indexes.
    for word, counter in counts.most_common(top_n):
        print("单词:{},次数:{}".format(word, counter))
4、英文分词统计
def get_txt(path="1.txt"):
    """Read a text file and normalise it for English word counting.

    The text is lower-cased and every character in the special-character set
    below is replaced by a single space, so that punctuation acts as a word
    separator (``a-b`` becomes ``a b`` rather than ``ab``).

    :param path: path of the UTF-8 text file to read.
    :return: the normalised text as a string.
    """
    specials = '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~'
    with open(path, "r", encoding="UTF-8") as reader:
        txt = reader.read().lower()
    # One translate() pass instead of one replace() call per character.
    return txt.translate(str.maketrans(specials, " " * len(specials)))
def yingwen(text=None, top_n=5):
    """Print the most frequent words of an English text.

    The text is split on whitespace, one-character words are ignored, and
    the remaining words are counted.  The ``top_n`` most frequent words are
    printed, most frequent first, one per line as ``word ->count`` with the
    word left-aligned and the count right-aligned in 5 columns each.

    :param text: text to analyse, used as given (no lower-casing or
        punctuation removal is applied here).  When omitted, the text is
        obtained from ``get_txt()``.
    :param top_n: how many of the most frequent words to print.  Fewer lines
        are printed when the text has fewer distinct words.
    :return: None
    """
    from collections import Counter

    if text is None:
        text = get_txt()

    counts = Counter(word for word in text.split() if len(word) != 1)

    # most_common() sorts by count, descending, and never over-indexes.
    for word, count in counts.most_common(top_n):
        print("{0:<5}->{1:>5}".format(word, count))

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。