Python 爬虫源码(附注解):Python 爬虫系列之美团全站信息爬取实战
import requests
from bs4 import BeautifulSoup
import re
import json
import time
'''
@Author :王磊
@Time :2018/12/31
@Description:美团站点所有有效信息抓取(待完善)
'''
class MeiTuanSpider:
    """Crawler that walks every Meituan city site and scrapes category/shop data.

    Work in progress: only the food (美食) category is actually parsed.
    """

    def __init__(self):
        # Request headers for every HTTP call.
        # NOTE(review): the Cookie below is session-specific (contains uuid,
        # userId, tokens) and was hard-wrapped by the page extraction; it has
        # been rejoined here but must be refreshed from a live browser session
        # before the spider will work.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0",
            "Cookie": "_lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=167ffca928ec8-0e654d87ed4011-4d045769-100200-167ffca928ec8; __mta=210679722.1546184730073.1546184730073.1546184730073.1; _lxsdk=167ffca928ec8-0e654d87ed4011-4d045769-100200-167ffca928ec8; _ga=GA1.2.268215992.1546188830; _gid=GA1.2.2085490335.1546188830; mtcdn=K; lsu=; token2=k5KFzZOmjNtI4RXwSn-MBwHYS_QFAAAAqgcAAM17q21drlYFsEkrWY8nBciWgigr_vFCL5FDakc3B15Z318X6W3X_Dkc15OrK0yCPQ; u=646978641; n=XwR964951585; lt=k5KFzZOmjNtI4RXwSn-MBwHYS_QFAAAAqgcAAM17q21drlYFsEkrWY8nBciWgigr_vFCL5FDakc3B15Z318X6W3X_Dkc15OrK0yCPQ; ci=146; rvct=146%2C224%2C527%2C1114%2C1268%2C758%2C835%2C811%2C729%2C113%2C402; unc=XwR964951585; uuid=d927d5e7a70f4031900e.1546184723.2.0.0; client-id=03aeb51b-56e7-4809-b3a0-1fd44f5b4ea4; lat=40.74812; lng=107.400892; _lxsdk_s=16803187a83-b3c-b35-5ba%7C%7C171",
            # "Referer": "https://www.meituan.com/",  # original value lost in extraction ("as.meituan/") -- TODO confirm
            "Upgrade-Insecure-Requests": "1",
        }
        # Landing page that lists every city's Meituan homepage.
        # NOTE(review): host was stripped by extraction ("ituan/changecity/");
        # reconstructed as the canonical change-city URL.
        self.start_url = "https://www.meituan.com/changecity/"
def getHTML(self, url):
    """Fetch *url* with a GET request using the spider's headers.

    :param url: URL to request
    :return: response body decoded to str
    """
    # NOTE(review): the extracted source was truncated to
    # "resp = (url, headers=...)" / "t.decode(resp.apparent_encoding)";
    # reconstructed as a plain requests GET decoded with the encoding
    # requests detects from the body (apparent_encoding).
    resp = requests.get(url, headers=self.headers)
    return resp.content.decode(resp.apparent_encoding)
def getCityList(self, html):
    """Parse the change-city page into a list of city homepages.

    :param html: HTML of the city-selection page
    :return: list of dicts {"city_name": str, "url": str}
    """
    soup = BeautifulSoup(html, 'html.parser')
    spans = soup.findAll("span", attrs={"class": "cities"})
    citys = []
    for span in spans:
        # "repile" in the extracted source is a truncated "re.compile".
        for anchor in span.findAll("a", attrs={"class": re.compile("link.*?city.*?")}):
            # hrefs are protocol-relative ("//xx.meituan.com"), so prefix the scheme.
            citys.append({"city_name": anchor.text, "url": "https:" + anchor['href']})
    return citys
def getMoreList(self, html):
    """Extract the per-category "more" links from a city homepage.

    :param html: city homepage HTML
    :return: list of dicts {"type_name": str, "url": str}
    """
    soup = BeautifulSoup(html, 'html.parser')
    # "repile" in the extracted source is a truncated "re.compile";
    # the "moreList = []" initializer was fused onto the same line.
    anchors = soup.findAll("a", attrs={"class": re.compile("link.*?detail-more.*?")})
    moreList = []
    for anchor in anchors:
        # The category title sits in JSON-like data embedded in the tag markup.
        type_name = re.findall(r'"title":"(.*?)"', str(anchor))[0]
        moreList.append({"type_name": type_name, "url": anchor['href']})
    return moreList
def meiShiParser(self, html):
    """Parse every shop on a food (美食) category page: shop details,
    recommended goods, and user comments.

    :param html: category page HTML
    :return: None (results are printed; persistence not implemented yet)
    """
    # "repile" is a truncated "re.compile" throughout this method.
    req = re.compile(r'{"itemId":"(.*?)"', re.S)
    itemIds = re.findall(req, html)
    # NOTE(review): host stripped by extraction ("ituan/meishi/");
    # reconstructed as the canonical food-detail URL prefix.
    preURL = "https://www.meituan.com/meishi/"
    for itemId in itemIds:
        url = preURL + itemId + "/"
        detailHTML = self.getHTML(url)
        # The shop data is a JSON blob embedded between these two markers.
        shopDirtyInfo = re.findall(r'detailInfo":(.*?),"crumbNav', detailHTML)[0]
        # Shop info
        shopId, shopName = re.findall(r'"poiId":(.*?),"name":"(.*?)"', shopDirtyInfo)[0]
        # NOTE(review): a lazy group at the end of a pattern matches the empty
        # string, so this always yields '' -- probably meant r'"avgScore":(.*?),'.
        avgScore = re.findall(r'"avgScore": (.*?)', shopDirtyInfo)
        address = re.findall(r'"address":"(.*?)"', shopDirtyInfo)
        phone = re.findall(r'"phone":"(.*?)"', shopDirtyInfo)
        # Facilities/services offered by the shop
        extraInfos = re.findall(r'"text":"(.*?)"', shopDirtyInfo)
        # Recommended goods list
        req_good = re.compile(r'recommended":(.*?)]', re.S)
        goods = re.findall(req_good, shopDirtyInfo)[0]
        req_goodList = re.compile(r'{"id":"(.*?)","name":"(.*?)","price":(.*?),"frontImgUrl":"(.*?)"}', re.S)
        goodsList = re.findall(req_goodList, goods)
        # User comments: paged API, 10 comments per page.
        evaluateURL = ("https://www.meituan.com/meishi/api/poi/getMerchantComment?"
                       "uuid=d927d5e7a70f4031900e.1546184723.2.0.0&platform=1&partner=126"
                       "&originUrl=" + url + "&riskLevel=1&optimusCode=1&id=" + str(itemId) +
                       "&userId=646978641&offset=0&pageSize=10&sortType=1")
        # First call only reads the total comment count.
        totalPages = int(json.loads(self.getHTML(evaluateURL))['data']['total'])
        evaluateList = []
        for k in range(totalPages):
            offset = k * 10
            evaluateURL = ("https://www.meituan.com/meishi/api/poi/getMerchantComment?"
                           "uuid=d927d5e7a70f4031900e.1546184723.2.0.0&platform=1&partner=126"
                           "&originUrl=" + url + "&riskLevel=1&optimusCode=1&id=" + str(itemId) +
                           "&userId=646978641&offset=" + str(offset) + "&pageSize=10&sortType=1")
            # NOTE(review): the extracted source lost the line that fetched and
            # accumulated each comment page; presumably something like:
            #   evaluateList.append(json.loads(self.getHTML(evaluateURL))['data'])
            # Data volume is large, so the original only printed for testing.
            print(evaluateList)
            time.sleep(5)
        for good_id, good_name, good_price, good_img in goodsList:
            print("shopId: %s,shopName: %s, avgScore: %s, address: %s, phone: %s, extraInfos: %s, good_id: %s, good_name: %s, good_price: %s, good_img: %s" %
                  (shopId, shopName, avgScore, address, phone, extraInfos, good_id, good_name, good_price, good_img))
        print(evaluateList)
        # Throttle between shops to avoid being banned.
        time.sleep(10)
def spider(self):
    """Main crawl loop: enumerate cities, then categories, then dispatch each
    category page to its parser (only 美食 is implemented so far).

    :return: None
    """
    # NOTE(review): truncated calls reconstructed: "HTML(...)" -> self.getHTML,
    # "CityList(...)" -> self.getCityList, "MoreList(...)" -> self.getMoreList.
    # L123 also carried injected page junk ("python新手代码userid") before "html".
    html = self.getHTML(self.start_url)
    cityList = self.getCityList(html)
    type_detail = []
    # One city at a time.
    for city in cityList:
        cityIndexHtml = self.getHTML(city['url'])
        moreList = self.getMoreList(cityIndexHtml)
        # One category per city at a time.
        for more in moreList:
            moreType = more['type_name']
            moreURL = more['url']
            # ---- test block start (data volume is large) ----
            print(moreURL)
            aimHTML = self.getHTML(moreURL)
            exit(0)
            # ---- test block end ----
            type_detail.append({"city_name": city['city_name'],
                                "type_name": moreType,
                                "type_url": moreURL})
            time.sleep(5)
    for detail in type_detail:
        city_name = detail['city_name']
        type_name = detail['type_name']
        type_url = detail['type_url']
        aimHTML = self.getHTML(type_url)
        # Category literals were garbled with Kangxi-radical lookalikes in the
        # extraction (e.g. 美⾷); normalized to the standard characters here.
        if type_name == '美食':
            # NOTE(review): this branch's body was lost in extraction; the food
            # parser is the only one implemented, so dispatch to it -- confirm.
            self.meiShiParser(aimHTML)
        elif type_name == '外卖':
            pass
        elif type_name == '酒店星级':
            pass
        elif type_name == '热门城市':
            pass
        elif type_name == '火车票':
            pass
        elif type_name == '休闲娱乐':
            pass
        elif type_name == '生活服务':
            pass
        elif type_name == '丽人':
            pass
        elif type_name == '结婚':
            pass
        elif type_name == '儿童乐园':
            pass
        elif type_name == '幼儿教育':
            pass
        elif type_name == '亲子摄影':
            pass
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
python调用dll三种方式
« 上一篇
推荐文章
热门文章
-
随机森林算法的改进方法
2024-10-02 -
基于随机森林算法的风险预警模型研究
2024-10-02 -
Python中的随机森林算法详解
2024-10-02 -
随机森林发展历史
2024-10-02 -
如何使用随机森林进行时间序列数据模式识别(八)
2024-10-02 -
随机森林回归模型原理
2024-10-02 -
如何使用随机森林进行时间序列数据模式识别(六)
2024-10-02 -
如何使用随机森林进行时间序列数据预测(四)
2024-10-02 -
如何使用随机森林进行异常检测(六)
2024-10-02 -
随机森林算法和grandientboosting算法 -回复
2024-10-02 -
随机森林方法总结全面
2024-10-02 -
随机森林算法原理和步骤
2024-10-02 -
随机森林的原理
2024-10-02 -
随机森林 重要性
2024-10-02 -
随机森林算法
2024-10-02 -
机器学习中随机森林的原理
2024-10-02 -
随机森林算法原理
2024-10-02 -
使用计算机视觉技术进行动物识别的技巧
2024-10-02 -
基于crf命名实体识别实验总结
2024-10-02 -
transformer预测模型训练方法
2024-10-02
最新文章
-
随机森林算法介绍及R语言实现
2024-10-02 -
基于随机森林优化的神经网络算法在冬小麦产量预测中的应用研究_百度文 ...
2024-10-02 -
基于正则化贪心森林算法的情感分析方法研究
2024-10-02 -
随机森林算法和grandientboosting算法
2024-10-02 -
基于随机森林的图像分类算法研究
2024-10-02 -
随机森林结合直接正交信号校正的模型传递方法
2024-10-02
发表评论