Python 3.6 code for national policy text analysis
Based on the Python I have learned so far and the direction assigned by my advisor, I have done five things in total:
1. Crawling and saving targeted elderly-care policy texts from the government website.
2. Multi-document keyword extraction based on TF-IDF.
- Keyword extraction based on the TF-IDF algorithm (source: blog.csdn/zhangyu132/article/details/52128924)
import jieba.analyse
jieba.analyse.extract_tags(sentence, topK=20, withWeight=False, allowPOS=())
– sentence: the text to extract keywords from
– topK: how many of the highest-weighted TF-IDF keywords to return, default 20
– withWeight: whether to also return the keyword weights, default False
– allowPOS: only keep words with the listed parts of speech; default is empty, i.e. no filtering
- Keyword extraction based on the TextRank algorithm
jieba.analyse.textrank(sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')) can be used directly; the interface is the same as extract_tags, but note that it filters by part of speech by default (a short usage sketch of both interfaces follows the list below).
– Basic idea:
1. Segment the text from which keywords are to be extracted.
2. Build a graph from the co-occurrence relations between words within a fixed window (size 5 by default, adjustable through the span parameter).
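A minimal usage sketch of both interfaces on a single document (the file name 0.txt is only an example):

import jieba.analyse

sentence = open('0.txt', 'r', encoding='utf-8').read()  # any policy text file

# TF-IDF: top 10 keywords together with their weights
for word, weight in jieba.analyse.extract_tags(sentence, topK=10, withWeight=True):
    print(word, weight)

# TextRank: same call signature, but POS filtering is enabled by default
for word, weight in jieba.analyse.textrank(sentence, topK=10, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v')):
    print(word, weight)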
import jieba
import jieba.analyse
import pandas as pd

for i in range(23):
    # read each policy document (0.txt .. 22.txt)
    text = open(r'D:\\python3.6.5\\pycharm\\main\\output\\txt\\'+str(i)+'.txt', 'r', encoding='utf-8').read()
    # top-15 TF-IDF keywords of this document, with their weights
    words = jieba.analyse.extract_tags(text, topK=15, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v', 'nr', 'x'))
    for word, weight in words:
        #print('keywords based on TFIDF:'+'/'.join([word for word in words]))
        fileout = r'D:\python3.6.5\keywords.txt'  # the original output path was truncated; keywords.txt is a placeholder name
        with open(fileout, 'a', encoding='utf-8') as fr:
            fr.write(word + '/')  # append '/' after every keyword so the next step can split on it
        print(word, weight)
topK=15 means the top 15 keywords are extracted from each article. The next step (co-occurrence counting) needs a '/' between the words, otherwise the co-occurrences cannot be computed; the keywords extracted from each article are then put into one and the same xlsx file. The extraction result is shown in the figure.
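pandas is imported in the snippet above but never used; a minimal sketch of gathering the per-document keywords into a single xlsx with pandas could look like this (the file names are assumptions, chosen to match the test.xlsx read in the next step):

import jieba.analyse
import pandas as pd

rows = []
for i in range(23):
    text = open(str(i) + '.txt', 'r', encoding='utf-8').read()
    words = jieba.analyse.extract_tags(text, topK=15, allowPOS=('ns', 'n', 'vn', 'v', 'nr', 'x'))
    rows.append('/'.join(words))  # keep the '/' separators so the next step can split on them

# one document per row, all in the first column of a single xlsx file
pd.DataFrame(rows).to_excel('test.xlsx', index=False, header=False)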
3. Generation of the keyword co-occurrence matrix.
import xlrd

#Read one column from the spreadsheet; returns a list of the form ['/././','././','/././','/././']
def readxls_bycol(path, colnum):
    #path = r'D:\Ditys\python learning\学习任务\第4次任务20170222\test.xlsx'  # without the leading r this occasionally breaks
    #colnum = 2  # counted from 0! so the third column has index 2
    xl = xlrd.open_workbook(path)
    table = xl.sheets()[0]
    data = list(table.col_values(colnum))
    print(data)
    print('----------1---------')
    return data
#Process the column data; returns a set of all keywords that occurred, without duplicates
def deal_data(data):
    data_list = []
    data_set = set()
    for i in data:
        data_list.extend(i.split('/'))
    #data_list.sort()  # optional: sort in ascending order
    data_set = set(data_list)
    print(data_set)
    print('----------2---------')
    return data_set
#From the keyword set, build a 2-D list and fill its first row and first column; returns the framed 2-D list
def creat_list_2d(data_set):
    i = len(data_set) + 1
    #list1 = [['' for x in range(i)] for y in range(i)]
    list_2d = [[0 for col in range(i)] for row in range(i)]  # one way to build an empty 2-D list
    n = 1
    for row_1 in data_set:
        list_2d[0][n] = row_1  # fill the first row
        n += 1
        if n == i:
            break
    print(list_2d)
    m = 1
    print(data_set)
    for cols in data_set:  # fill the first column
        list_2d[m][0] = cols
        m += 1
        if m == i:
            break
    print(list_2d)
    print('----------3---------')
    return list_2d
#Count co-occurrences and fill the 2-D list; returns the filled list
def count_data(list_2d, data, data_set):
    data_formted = []
    for i in data:
        data_formted.append(i.split('/'))
    print(data_formted)
    print('----------4---------')
    for row in range(1, len(data_set) + 1):      # +1 so the last keyword's row/column is filled as well
        for col in range(1, len(data_set) + 1):
            if row == col:
                continue
            else:
                counter = 0
                for i in data_formted:
                    # count the documents in which both keywords appear
                    if list_2d[col][0] in i and list_2d[0][row] in i:
                        counter += 1
                list_2d[row][col] = counter
    print(list_2d)
    print('----------5---------')
    return list_2d
#Write the matrix into a txt file
def putdata_intotxt(path, matrix):
    with open(path, 'w', encoding='utf-8') as f:
        for row in range(0, len(matrix)):
            for col in range(0, len(matrix)):  # visit every cell of the 2-D list
                f.write(str(matrix[row][col]) + '\t')  # write() only accepts strings
            f.write('\n')
def main():
    #path_xls = r'test.xlsx'    # --- test data ---
    #path_txt = r'共现矩阵.txt'    # --- test data ---
    path_xls = r'D:\python3.6.5\gongci\test.xlsx'  # the r prefix is required
    path_txt = r'D:\python3.6.5\gongci\关键词共现矩阵.xlsx'  # written as tab-separated text despite the extension
    colnum = 0
    data = readxls_bycol(path_xls, colnum)
    data_set = deal_data(data)
    list_2d = creat_list_2d(data_set)
    matrix = count_data(list_2d, data, data_set)
    print(matrix)
    putdata_intotxt(path_txt, matrix)

if __name__ == '__main__':
    main()
    print('Your folder now has a co-occurrence matrix result ~ go have a look XP')
The result is shown in the figure.
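As a cross-check of count_data(), the same co-occurrence counts can be computed much more compactly with pandas; a sketch, assuming data is the list of '/'-joined keyword strings returned by readxls_bycol():

import pandas as pd

def cooccurrence(data):
    # one keyword set per document, dropping the empty strings left by trailing '/'
    docs = [set(row.split('/')) - {''} for row in data]
    keywords = sorted(set().union(*docs))
    # 0/1 presence matrix: one row per document, one column per keyword
    presence = pd.DataFrame([[int(k in doc) for k in keywords] for doc in docs], columns=keywords)
    matrix = presence.T.dot(presence)  # keyword-by-keyword co-occurrence counts
    for k in keywords:
        matrix.loc[k, k] = 0           # zero the diagonal, as count_data() does
    return matrix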
4. Generation of the policy word-cloud image (this is a standalone program; the steps above could also be merged into it).
# -*- encoding:utf-8 -*-
import urllib.request           # request module for fetching web pages
from bs4 import BeautifulSoup   # HTML parsing
import lxml                     # document parser used by BeautifulSoup
import time                     # time module
import os                       # operating-system helpers
import matplotlib.pyplot as plt # plotting library
import jieba                    # word segmentation
from PIL import Image           # image handling
from wordcloud import WordCloud, ImageColorGenerator  # word-cloud library
from collections import Counter # counts how often elements repeat in lists, dicts, strings, ...
import numpy as np              # scientific computing
t = time.localtime(time.time())    # convert to the local time zone; time.time() returns the current timestamp
# output folder named after the current date; the last component of this line was truncated in the original, tm_hour is assumed here
foldername = str(t.tm_year) + "-" + str(t.tm_mon) + "-" + str(t.tm_mday) + "-" + str(t.tm_hour)
picpath = 'D:\\python3.6.5\\pycharm\\main\\%s' % (foldername)
def txt(name, text):  # save a piece of text under picpath
    if not os.path.exists(picpath):  # create the folder if it does not exist yet
        os.makedirs(picpath)
    savepath = picpath + '\\' + name + '.txt'
    file = open(savepath, 'a', encoding='utf-8')
    file.write(text)
    # print(text)
    file.close()
    return picpath
def get_text(bs):
    # read the plain text of every <p> tag
    for p in bs.select('p'):
        t = p.get_text()
        # print(t)  # print the text
        txt('url2', t)
def FenCi(pathin, pathout1, pathout2, picturein, pictureout):
    # 1. read the txt file
    text = open(pathin, "r", encoding='utf-8').read()
    # 2. jieba segmentation, default accurate mode. A custom dictionary can be added with
    #    jieba.load_userdict(file_name), where file_name is a file-like object or the path of the dictionary.
    #    The custom dictionary has the same format as the default one, one entry per line:
    #    word, frequency (optional) and POS tag (optional), separated by spaces, in that order.
    cut_text = jieba.cut(text, cut_all=False)
    result = " ".join(cut_text)
    #print(result)
    with open(pathout1, 'a', encoding='utf-8') as f:
        f.write(result)
        print("save")
    # 3. word count
    with open(pathout1, 'r', encoding='utf-8') as fr:  # r: read only; w: write only
        data = jieba.cut(fr.read())
        data = dict(Counter(data))
    with open(pathout2, 'a', encoding='utf-8') as fw:  # path of the file that stores the word counts
        for k, v in data.items():
            fw.write('%s,%d\n' % (k, v))
    # 4. load the custom background image
    image = Image.open(picturein)
    graph = np.array(image)
    # 5. generate the word cloud
    #    with a custom background image, the size of the cloud is determined by the image size
    wc = WordCloud(font_path=r"D:\Python3.6.5\jieba\f", background_color='white', max_font_size=50, mask=graph)  # the font file name is truncated in the original
    wc.generate(result)  # this generate step was lost in the original; rebuilt here from the segmented text
    # 6. colour the words using the background image as reference
    image_color = ImageColorGenerator(graph)  # derive colour values from the background image
    wc.recolor(color_func=image_color)  # apply those colours (assumed; image_color is otherwise unused)
    wc.to_file(pictureout)
def readhtml(path):  # fetch and parse one page of search results
    res = urllib.request.urlopen(path)  # urlopen() returns the server response as an object
    html = res.read().decode('utf-8')   # read() on the response gives the raw html
    soupa = BeautifulSoup(html, 'lxml')
    result = soupa.find_all('div', class_='result')
    download_soup = BeautifulSoup(str(result), 'lxml')  # build a second BeautifulSoup object from the query result and keep parsing it
    urls = []
    url_all = download_soup.find_all('a')
    # collect all policy links
    for a_url in url_all:
        a_url = a_url.get('href')
        urls.append(a_url)
        url = a_url
        txt('url0', a_url)
        res = urllib.request.urlopen(url)  # fetch the policy page itself
        html = res.read().decode('utf-8')
        # print(html)
        txt('url1', html)
        soup = BeautifulSoup(html, 'lxml')
        get_text(soup)

for n in range(3):
    # the search url below was cut off at both ends in the original text
    url = r'v/s.htm?q=&n=10&p=' + str(n) + '&t=paper&advance=true&title=%E5%85%BB%E8%80%81&content=&puborg=&pcodeJiguan=&pcode'
    readhtml(url)
#picpath + '\\'
a = picpath + '\\'            # input txt path; the file name after the backslash was truncated in the original
b = picpath + '\\'            # output path for the segmented text; file name truncated as well
c = picpath + '\\result.csv'  # word-count output
d = r'D:\python3.6.5\jieba\1.jpg'   # background image
e = picpath + '\\wordcloud.png'     # word-cloud output
FenCi(a, b, c, d, e)
print('finish')
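matplotlib.pyplot is imported at the top of this step but never used; a minimal sketch for previewing the saved word cloud on screen, reusing the output path e from the call above:

import matplotlib.pyplot as plt
from PIL import Image

plt.imshow(Image.open(e))  # e is the word-cloud png produced by FenCi()
plt.axis('off')            # hide the axes around the image
plt.show()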
5. Generation of the keyword network relationship graph.
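No code for this step survives in the original post; a minimal sketch of one way to draw the network with networkx from the tab-separated matrix written in step 3 (the edge threshold of 2 co-occurrences and the SimHei font are assumptions):

import networkx as nx
import matplotlib.pyplot as plt

# the file written by putdata_intotxt() is plain tab-separated text despite its .xlsx extension
path = r'D:\python3.6.5\gongci\关键词共现矩阵.xlsx'
with open(path, encoding='utf-8') as f:
    rows = [line.rstrip('\t\n').split('\t') for line in f if line.strip()]

keywords = rows[0][1:]  # the first row holds the keyword labels
G = nx.Graph()
for i, row in enumerate(rows[1:]):
    for j, cell in enumerate(row[1:]):
        if i < j and cell.isdigit() and int(cell) >= 2:  # only keep pairs that co-occur at least twice
            G.add_edge(keywords[i], keywords[j], weight=int(cell))

plt.rcParams['font.sans-serif'] = ['SimHei']  # a font that can display Chinese labels
nx.draw_networkx(G, pos=nx.spring_layout(G), node_size=300, font_size=8)
plt.axis('off')
plt.show()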
