python爬取京东评论分析_Python爬取京东商品评论+词云展示!
利用python爬虫爬取京东商品评论数据,并绘制词云展示。
1. 爬取商品评论数据
在京东商城里搜索三只松鼠,选取一家店铺打开。
点开商品评价,选择只看当前商品评价,按时间排序查看,发现一页有10条评论。
打开谷歌的调试工具,点开Network查看,京东的商品评论信息是存放在json数据包中的。
分析Request URL,里面有一些关键参数:productId是这个商品的ID,sortType为评论的排序方式,page为第几页,pageSize表示这一页有10条评论数据。复制Request URL,在浏览器中打开这个链接,可以发现:
改变page参数的值可以实现翻页,效果如下:
python爬虫,正则匹配提取数据,保存到txt,代码如下:
import asyncio
import datetime
import logging
import re

import aiohttp
# Log every scraped comment with a timestamp and severity prefix.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

# Wall-clock start time; the script entry point subtracts it from the
# finishing time to report the total run duration.
start = datetime.datetime.now()
class Spider(object):
    """Concurrent scraper for the JD.com comment feed of product 4803334.

    Every page of the feed is downloaded with aiohttp, the comment bodies are
    extracted from the response text with a regular expression, and each
    comment is appended to ``output_path`` as one line (and logged).

    NOTE(review): this listing was recovered from a damaged copy in which
    URL-like fragments had been stripped. The ``.com`` hosts, the ``https://``
    scheme and the output file name were restored by hand -- confirm them
    before relying on this code.
    """

    # File the extracted comments are appended to.
    # NOTE(review): the original file name was lost; any writable path works.
    output_path = 'jd_comments.txt'

    def __init__(self):
        # At most 6 requests are in flight at any time (see ``scrape``).
        self.semaphore = asyncio.Semaphore(6)
        # Browser-like request headers.
        # NOTE(review): the cookie is a captured browser session and looks
        # truncated in two places (before "__jdv" and "jwotest_product");
        # presumably it has to be replaced with a fresh one.
        self.header = {
            "Host": "club.jd.com",
            "Cookie": (
                "shshshfpa=c003ed54-a640-d73d-ba32-67b4db85fd3e-1594895561; "
                "shshshfpb=i5%20TzLvWAV56AeaK%20C9q5ew%3D%3D; __jdu=629096461; "
                "unpl=V2_ZzNtbUVRFkZ8DUddfRxcBGIEE1hKXhBGIQEVVnNLD1IwBkBeclRCFnQUR1JnGloUZwEZXkZcQxVFCEdkeR1ZAmYBEV1yZ "
                "__jdv=122270672|baidu|-|organic|not set|1596847892017; areaId=0; "
                "ipLoc-djd=1-72-55653-0; PCSYCityID=CN_0_0_0; "
                "__jda=122270672.629096461.1595821561.1596847892.1597148792.3; __jdc=122270672; "
                "shshshfp=4866c0c0f31ebd5547336a334ca1ef1d; "
                "3AB9D23F7A4B3C9B=DNFMQBTRNFJAYXVX2JODGAGXZBU3L2TIVL3I36BT56BKFQR3CNHE5ZTVA76S56HSJ2TX62VY7ZJ2T "
                "jwotest_product=99; shshshsID=ba4014acbd1aea969254534eef9cf0cc_5_1597149339335; "
                "__jdb=122270672.5.629096461|3.1597148792; "
                "JSESSIONID=99A8EA65B8D93A7F7E8DAEE494D345BE.s1"
            ),
            "Connection": "keep-alive",
            "Referer": "https://item.jd.com/4803334.html",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
        }

    async def scrape(self, url):
        """Download ``url`` and return the response body as text.

        The semaphore caps concurrency; the context managers guarantee that
        the session and the response are released even when the request fails.
        """
        async with self.semaphore:
            async with aiohttp.ClientSession(headers=self.header) as session:
                async with session.get(url) as response:
                    return await response.text()

    async def scrape_page(self, page):
        """Fetch comment page number ``page`` and store its comments."""
        url = (
            'https://club.jd.com/comment/skuProductPageComments.action'
            '?callback=fetchJSON_comment98&productId=4803334&score=0&sortType=6'
            f'&page={page}&pageSize=10&isShadowSku=0&rid=0&fold=1'
        )
        text = await self.scrape(url)
        await self.parse(text)

    async def parse(self, text):
        """Append every comment body found in ``text`` to ``output_path``.

        A comment body is the value of a ``"content"`` field that directly
        follows a ``"guid"`` field in the raw response text.
        """
        content = re.findall('"guid":".*?","content":"(.*?)"', text)
        # UTF-8 is set explicitly so Chinese text is written the same way on
        # every platform, regardless of the locale's default encoding.
        with open(self.output_path, 'a+', encoding='utf-8') as f:
            for con in content:
                f.write(con + '\n')
                logging.info(con)

    async def _scrape_all(self, pages):
        """Scrape pages ``0 .. pages - 1`` concurrently."""
        await asyncio.gather(*(self.scrape_page(page) for page in range(0, pages)))

    def main(self, pages=100):
        """Scrape the first ``pages`` comment pages (100 by default)."""
        asyncio.run(self._scrape_all(pages))
# Script entry point: run the scraper, then report the elapsed wall-clock
# time measured from the module-level ``start`` timestamp.
if __name__ == '__main__':
    spider = Spider()
    spider.main()
    delta = (datetime.datetime.now() - start).total_seconds()
    print("⽤时:{:.3f}s".format(delta))
2. 词云展示
代码如下:
import jieba
import collections
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# NOTE(review): the three paths below were lost or truncated in the damaged
# listing and are placeholders -- point them at the scraped comments file, a
# stop-word list (one word per line) and a TrueType font with CJK glyphs.
COMMENTS_PATH = 'jd_comments.txt'
STOP_WORDS_PATH = 'stop_words.txt'
FONT_PATH = 'simhei.ttf'

with open(COMMENTS_PATH, encoding='utf-8') as f:
    data = f.read()

# Pre-processing: keep only runs of Chinese characters and join them with
# "/" so unrelated fragments are not merged into one token.
new_data = re.findall('[\u4e00-\u9fa5]+', data, re.S)
new_data = "/".join(new_data)

# Word segmentation (jieba full mode).
seg_list_exact = jieba.cut(new_data, cut_all=True)

# Load the stop words, one per line, without the trailing newline.
with open(STOP_WORDS_PATH, encoding='utf-8') as f:
    stop_words = {line.replace("\n", "") for line in f}

# Drop stop words and single-character tokens.
result_list = [
    word for word in seg_list_exact
    if word not in stop_words and len(word) > 1
]
print(result_list)

# Count the frequency of the remaining words.
word_counts = collections.Counter(result_list)

# Build the word cloud from the frequency table.
my_cloud = WordCloud(
    background_color='white',  # background colour (library default is black)
    width=800, height=550,
    font_path=FONT_PATH,       # a CJK-capable font is needed to render Chinese
    max_font_size=112,         # largest font size
    min_font_size=12,          # smallest font size
    random_state=80,           # fixed seed so the layout is reproducible
).generate_from_frequencies(word_counts)

# Show the generated image without axes.
plt.imshow(my_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()
运行效果如下:
源码获取点击源码
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论