python爬取京东商品信息及评论
准备
1. chrome浏览器 和 chromeDriver插件 (其他浏览器步骤类似)
2. python 环境
3. python selenium模块
代码
'''
爬取京东商品信息:
功能: 通过chromeDriver进行模拟访问需要爬取的京东商品详情页(例如 https://item.jd.com/100003196609.html),并且程序支持多个页面爬取,输入时以逗号分隔。
思路: 创建webdriver对象并且调用get方法请求url,进入页面后根据dom结构爬取一些简要信息,之后通过模拟点击商品评价按钮,再分别解析每个用户的评价信息,到每页的底部时,模拟点击下一页按钮
获取新的一页数据。
提取商品信息:
商品名称: {goods_name}
商品价格: {goods_price}
好评度: {percent_con}
评价标签: {tags}
评价类型
姓名:{username}
星级:{star}
⽂字:{word}
评价图⽚: {picList}
购买类型: {order_type}
购买⽇期:{order_date}
点赞⼈数: {likes}
评论人数: {sprite_comment}
'''
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
# Check, by class name, whether an element exists.
def isElementPresent(driver, element):
    """Report whether ``driver`` contains an element with the given class.

    Args:
        driver: Any object exposing ``find_element_by_class_name`` -- the
            callers in this file pass both the WebDriver and individual
            WebElements.
        element: The class name to look up.

    Returns:
        True when the lookup succeeds, False when it raises
        NoSuchElementException.
    """
    try:
        driver.find_element_by_class_name(element)
    except NoSuchElementException:
        # The lookup raises instead of returning None, so a raised
        # NoSuchElementException is the "not present" answer.
        return False
    return True
## Scrape the reviews that are currently displayed.
def get_evaluation(goods_detail):
    """Print one formatted record for every review under ``goods_detail``.

    Each element with class ``comment-item`` is read for: user name, star
    rating, review text, image URLs (only when a ``pic-list`` element is
    present), the first and last ``span`` of ``order-info``, the ``J-nice``
    text and the text of the third link under ``comment-op``.

    Args:
        goods_detail: Object to search in; ``get_good`` passes the WebDriver.

    Returns:
        None. Records are written to stdout. A NoSuchElementException raised
        while reading a review is printed and ends the scan of this page.
    """
    try:
        comments = goods_detail.find_elements_by_class_name('comment-item')
        for comment in comments:
            username = comment.find_element_by_class_name('user-info').text.replace("\n", "")
            # The last character of the element's class attribute is used as
            # the star count.
            star = comment.find_element_by_class_name('comment-star').get_attribute('class')[-1] + '星级'
            word = comment.find_element_by_class_name('comment-con').text.replace("\n", "")

            # Not every review carries pictures, so probe before reading.
            picList = []
            if isElementPresent(comment, 'pic-list'):
                pics = comment.find_element_by_class_name('pic-list').find_elements_by_tag_name('a')
                picList = [
                    pic.find_element_by_tag_name('img').get_attribute('src')
                    for pic in pics
                ]

            order_spans = comment.find_element_by_class_name('order-info').find_elements_by_tag_name('span')
            order_type = order_spans[0].text
            order_date = order_spans[-1].text
            likes = comment.find_element_by_class_name('J-nice').text
            sprite_comment = comment.find_element_by_class_name('comment-op').find_elements_by_tag_name('a')[2].text

            goods_content = f'''
姓名:{username}
星级:{star}
⽂字:{word}
评价图⽚: {picList}
购买类型: {order_type}
购买⽇期:{order_date}
点赞⼈数: {likes}
评论⼈数: {sprite_comment}
'''
            print(goods_content)
    except NoSuchElementException as e:
        print(e)
def get_good(driver):
    """Scrape the product page currently loaded in ``driver`` and print it.

    Prints the product name, price, positive-review rate, review tags and
    review filter types, then prints the reviews of the first review page
    and of every following page for as long as a ``ui-pager-next`` element
    is present.

    Args:
        driver: WebDriver that has already navigated to the product page.

    Returns:
        None. All results are written to stdout.
    """
    # Wait for the page data to load.
    time.sleep(2)

    goods_name = driver.find_element_by_class_name('sku-name').text.replace("\n", " ")
    goods_price = driver.find_element_by_class_name('price').text.replace("\n", " ")

    # Open the review tab: the fifth <li> of the detail tab bar.
    evaluation_btn = driver.find_element_by_id('detail').find_element_by_class_name('tab-main').find_elements_by_tag_name('li')[4]
    evaluation_btn.click()
    print()
    time.sleep(2)

    # Positive-review rate.
    percent_con = driver.find_element_by_class_name('percent-con').text.replace("\n", " ")
    # Review tags.
    tags = [tag.text for tag in driver.find_elements_by_class_name('tag-1')]
    # Review filter types: only entries whose data-tab attribute is 'trigger'.
    filter_items = driver.find_element_by_class_name('filter-list').find_elements_by_tag_name('li')
    types = [
        item.find_element_by_tag_name('a').text
        for item in filter_items
        if item.get_attribute('data-tab') == 'trigger'
    ]

    goods_content = f'''
商品名称: {goods_name}
商品价格: {goods_price}
好评度: {percent_con}
评价标签: {tags}
评价类型: {types}
\n
'''
    print(goods_content)

    # Reviews of the first page.
    get_evaluation(driver)
    n = 1
    # Walk the remaining pages until no "next page" control is found.
    while isElementPresent(driver, 'ui-pager-next'):
        element = driver.find_element_by_class_name('ui-pager-next')
        # The click is issued through injected JavaScript rather than
        # WebElement.click().
        driver.execute_script("arguments[0].click();", element)
        # Wait for the next page of reviews to load.
        time.sleep(2)
        n = n + 1
        get_evaluation(driver)
        print('%d商品页数:' % n)
    print('到底了.')
    print('商品总页数:%d' % n)
if __name__ == '__main__':
    # To run without a visible browser window, build the driver with
    # headless options instead:
    # chrome_options = Options()
    # chrome_options.add_argument('--headless')
    # chrome_options.add_argument('--disable-gpu')
    # driver = webdriver.Chrome(options=chrome_options)

    # Read the comma-separated product URLs; surrounding whitespace is
    # trimmed and empty entries (e.g. from a trailing comma) are skipped.
    urlList = input('请输⼊爬取商品url(以逗号分割):').strip()
    urlList = [url.strip() for url in urlList.split(',') if url.strip()]

    # The visible-window mode needs chromedriver and the Chrome browser.
    driver = webdriver.Chrome()
    driver.implicitly_wait(5)
    try:
        for url in urlList:
            driver.get(url)
            get_good(driver)
    finally:
        # quit() ends the whole browser session even when scraping raised;
        # close() would only close the current window.
        driver.quit()
第⼀个版本…京东python入门教程
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论