Python Web Scraping in Practice + Data Analysis + Data Visualization (Meituan Food Data)
Part One: The Scraper
Notes on the scraper:
1. The scraper is structured in an object-oriented style.
2. The scraped data is stored in a MongoDB database (a quick inspection sketch follows the scraper code).
3. The scraper code is commented in detail.
4. Wuxi is used as the example city for the food data.
Code
import json
import random
import re
import time
from pymongo import MongoClient
import requests
from lxml import html
class MeituanSpider():
    def __init__(self):
        # Entry URL (the Wuxi food listing page)
        self.start_url = 'https://wx.meituan.com/meishi/'
        # Log in to your own account first and copy the logged-in Cookie and User-Agent to build the request headers
        self.headers = {
            # Replace with your own cookie
"Cookie":"_lxsdk_cuid=17567c82defc8-02c8aee262bc18-3e604000-144000-17567c82defc8; _hc.v=f17bef2e-9394-ea78-d6a7-940fc84143be.161 4495157; mtcdn=K; ci=70; rvct=70%2C52; lsu=; uuid=99cbecdfcd6342ca9753.1617116140.1.0.0; _lx_utm=utm_source%3Dbaidu%26utm_medium%3Dorg anic%26utm_term%3D%25E7%25BE%258E%25E5%259B%25A2; __mta=218988198.1617067475078.1617152337122.1617500202673.20; client-id=6cfc edec-72cb-470f-86a6-dddf64bc8869; lt=sOSqHk9WE66qIJX1xr-r9ytOpXsAAAAAJg0AAG99qBYNh2fwnJJ-MPffiG58lnM3m45u2teQdyug6LscHSf9jh_RDfo Fcgz4UhgqfA; u=2585285025; n=%E9%A9%AC%E5%B0%91%E7%88%B1%E4%BD%A0%E4%B9%88%E4%B9%88%E5%93%92; token2=sOSqHk9WE66 qIJX1xr-r9ytOpXsAAAAAJg0AAG99qBYNh2fwnJJ-MPffiG58lnM3m45u2teQdyug6LscHSf9jh_RDfoFcgz4UhgqfA; unc=%E9%A9%AC%E5%B0%91%E7%88 %B1%E4%BD%A0%E4%B9%88%E4%B9%88%E5%93%92; firstTime=1617500314442; _lxsdk=17567c82defc8-02c8aee262bc18-3e604000-144000-1756 7c82defc8; _lxsdk_s=1789a866045-cd1-379-f7b%7C%7C6",
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36", }
        # Initialize the MongoDB client
        self.client = MongoClient()

    # Build the list of URLs to crawl for one category
    def get_url_list(self, url, total_nums):
        url_temp = url + 'pn{}/'
        # Each page lists 15 restaurants, so the category's total count gives the number of pages
        pages = total_nums // 15 + 1 if total_nums % 15 != 0 else total_nums // 15
        url_list = [url_temp.format(i) for i in range(1, pages + 1)]
        return url_list
    # Request a URL and return the JSON string embedded in the response
    def parse_url(self, url):
        # self.headers['Cookie'] = random.choice(self.cookies)
        time.sleep(1)
        rest = requests.get(url, headers=self.headers)
        html_str = re.findall(r'window._appState = (.*?);</script>', rest.content.decode())[0]
        return html_str
    # Build each item and store it
    def get_content_list(self, html_str, item):
        json_html = json.loads(html_str)
        foods = json_html['poiLists']['poiInfos']
        for i in foods:
            item['food_id'] = i['poiId']
            item['food_url'] = "https://www.meituan.com/meishi/{}/".format(item['food_id'])
            item['title'] = i['title']
            item['avg_score'] = i['avgScore']
            item['avg_price'] = i['avgPrice']
            item['comments'] = i['allCommentNum']
            item['area'] = i['address'][0:3]
            item['address'] = i['address']
            print(item)
            self.save(item)
    # Save an item into the MongoDB database
    def save(self, item):
        # Insert a copy so the reused item dict is not left holding the generated _id between iterations
        self.client['test']['mt_foods'].insert_one(dict(item))

    # Main method
    def run(self):
        # First request the entry URL to get the URL of every food category
        # See Figure 1
        rest = requests.get(self.start_url, headers=self.headers)
        # The page has changed: the data is now embedded as JSON, so the XPath parsing below no longer applies
        # html_str = html.etree.HTML(rest.content.decode())
        # cate_list = html_str.xpath('//div[text()="分类"]/../ul/li')[1:]
        str_html = re.findall(r'window._appState = (.*?);</script>', rest.content.decode())[0]
        json_html = json.loads(str_html)
        cate_list = json_html['filters']['cates'][1:]
        item_list = []
        # For each category, collect its name and URL
        for i in cate_list:
            item = {}
            # The category URL needs anti-scraping handling:
            # the URL taken from the page is http://wx.meituan.com/meishi/c11/
            # while the real URL is https://wx.meituan.com/meishi/c11/
            # so http has to be replaced with https
            # cate_url = i.xpath('./a/@href')[0]
            cate_url = i['url']
            item['cate_url'] = cate_url.replace('http', 'https')
            # item['cate_name'] = i.xpath('./a/text()')[0]
            item['cate_name'] = i['name']
            item_list.append(item)
        # Walk through every category name and URL and crawl each one
        for i in item_list:
            # Sleep for 1 second to avoid being flagged as a crawler
            time.sleep(1)
            rest = requests.get(i['cate_url'], headers=self.headers)
            str_html = rest.content.decode()
            str_html = re.findall(r'window._appState = (.*?);</script>', str_html)[0]
            json_html = json.loads(str_html)
            total_nums = json_html['poiLists']['totalCounts']
            url_list = self.get_url_list(i['cate_url'], total_nums)
            for url in url_list:
                list_html = self.parse_url(url)
                self.get_content_list(list_html, i)

if __name__ == '__main__':
    meituan = MeituanSpider()
    meituan.run()
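After a run, a quick way to spot-check what landed in MongoDB is to query the collection directly. This is a minimal sketch; it assumes the test.mt_foods collection that the conversion script further below reads from:

from pymongo import MongoClient

client = MongoClient()
collection = client['test']['mt_foods']
# Total number of stored restaurants
print(collection.count_documents({}))
# Show a few documents without the internal _id field
for doc in collection.find({}, {'_id': 0}).limit(3):
    print(doc)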
Part Two: Data Analysis and Visualization
Notes on the data analysis and visualization:
1. This post uses the Flask framework for the data analysis and visualization (a minimal route sketch follows this list).
2. The architecture diagram of the project:
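To illustrate how the Flask side can serve the analysis results to the charts, here is a minimal sketch of one route. The database name, credentials, and the db_food_area_count table come from the analysis.py code below; the route path, response shape, and app setup are assumptions:

from flask import Flask, jsonify
import pymysql

app = Flask(__name__)

@app.route('/api/area_count')
def area_count():
    # Read the pre-computed "restaurants per district" numbers written by analysis.py
    conn = pymysql.connect(host='localhost', user='root', password='123456',
                           port=3306, database='mt_food', charset='utf8')
    with conn.cursor() as cursor:
        cursor.execute('select area, count from db_food_area_count')
        rows = cursor.fetchall()
    conn.close()
    # Return the data in a shape a charting library such as ECharts can consume
    return jsonify([{'area': r[0], 'count': r[1]} for r in rows])

if __name__ == '__main__':
    app.run(debug=True)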
Code
Data analysis code (analysis.py)
import pandas as pd
import numpy as np
import pymysql
# Preprocess the data
def pre_process(df):
    # Drop columns that are not useful, such as the URL and the food id
    df.drop('cate_url', inplace=True, axis=1)
    df.drop('food_id', inplace=True, axis=1)
    # Drop rows containing NaN values
    df.dropna(how='any', inplace=True)
    # Drop duplicate rows that share the same restaurant title
    df.drop_duplicates(subset=['title'], inplace=True)
    return df
# Number of restaurants in each food category in Wuxi
def food_cate_count(df):
    # Group by category and count the restaurants in each category
    grouped = df.groupby('cate_name')['title'].count().reset_index()
    data = [[i['cate_name'], i['title']] for i in grouped.to_dict(orient='records')]
    print(data)
    return data
# Average rating of restaurants in each food category in Wuxi
def food_cate_score(df):
    # Group by category and compute the average rating
    grouped = df.groupby(['cate_name'])['avg_score'].mean().reset_index()
    data = [[i['cate_name'], round(i['avg_score'], 1)] for i in grouped.to_dict(orient='records')]
    print(data)
    return data
# Average price of restaurants in each food category in Wuxi
def food_cate_price(df):
    # Group by category and compute the average price
    grouped = df.groupby(['cate_name'])['avg_price'].mean().reset_index()
    data = [[i['cate_name'], round(i['avg_price'], 1)] for i in grouped.to_dict(orient='records')]
    print(data)
    return data
# Top 10 most-reviewed restaurants in Wuxi
def food_comment_top10(df):
    # Sort by comment count and take the ten restaurants with the most comments
    food_comments = df.sort_values(by='comments', ascending=False)[['title', 'comments']][:10]
    data = [[i['title'], i['comments']] for i in food_comments.to_dict(orient='records')]
    print(data)
    return data
# Distribution of restaurant counts across districts of Wuxi
def food_area_count(df):
    # Group by district
    grouped = df.groupby(['area'])['title'].count().reset_index()
    data = [[i['area'], i['title']] for i in grouped.to_dict(orient='records')]
    print(data)
    return data
if __name__ == '__main__':
    # Read the food data
    df = pd.read_json('美食.json', lines=True)
    # Print basic information
    print(df.head(5))
    print(df.info())
    # Preprocess
    df = pre_process(df)
    # Number of restaurants in each food category in Wuxi
    # data = food_cate_count(df)
    # Average rating of restaurants in each food category in Wuxi
    # data = food_cate_score(df)
    # Top 10 most-reviewed restaurants in Wuxi
    # data = food_comment_top10(df)
    # Average price of restaurants in each food category in Wuxi
    # data = food_cate_price(df)
    # Number of restaurants in each district of Wuxi
    data = food_area_count(df)
    # Create the database connection
    conn = pymysql.connect(host='localhost', user='root', password='123456', port=3306, database='mt_food', charset='utf8')
    with conn.cursor() as cursor:
        # Number of restaurants in each food category in Wuxi
        # sql = "insert into db_food_cate_count(cate_name,count) values(%s,%s)"
        # Average rating of restaurants in each food category in Wuxi
        # sql = "insert into db_food_cate_score(cate_name,avg_score) values(%s,%s)"
        # Average price of restaurants in each food category in Wuxi
        # sql = "insert into db_food_cate_price(cate_name,avg_price) values(%s,%s)"
        # Top 10 most-reviewed restaurants in Wuxi
        # sql = "insert into db_food_comment_top10(name,comment) values(%s,%s)"
        # Number of restaurants in each district of Wuxi
        sql = "insert into db_food_area_count(area,count) values(%s,%s)"
        try:
            result = cursor.executemany(sql, data)
            if result:
                print('Rows inserted successfully')
            conn.commit()
        except pymysql.MySQLError as error:
            print(error)
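The INSERT statements above assume the target tables already exist in the mt_food database. The table and column names come from those statements; the column types and the id columns in the sketch below are assumptions, shown as a one-off helper you could run once before analysis.py:

import pymysql

# Create the target tables used by analysis.py (column types are assumptions)
DDL = [
    "create table if not exists db_food_cate_count (id int primary key auto_increment, cate_name varchar(50), `count` int)",
    "create table if not exists db_food_cate_score (id int primary key auto_increment, cate_name varchar(50), avg_score float)",
    "create table if not exists db_food_cate_price (id int primary key auto_increment, cate_name varchar(50), avg_price float)",
    "create table if not exists db_food_comment_top10 (id int primary key auto_increment, name varchar(100), `comment` int)",
    "create table if not exists db_food_area_count (id int primary key auto_increment, area varchar(50), `count` int)",
]

conn = pymysql.connect(host='localhost', user='root', password='123456',
                       port=3306, database='mt_food', charset='utf8')
with conn.cursor() as cursor:
    for stmt in DDL:
        cursor.execute(stmt)
conn.commit()
conn.close()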
Data conversion script: MongoDB to JSON (food_to_json.py)
import json
from pymongo import MongoClient
# Dump the data stored in MongoDB into a JSON lines file
def save_json(item_list):
    with open('美食.json', 'w', encoding='utf-8') as f:
        for item in item_list:
            json_item = json.dumps(item, ensure_ascii=False)
            f.write(json_item)
            f.write('\n')

if __name__ == '__main__':
    client = MongoClient()
    connection = client['test']['mt_foods']
    # Exclude MongoDB's internal _id field
    ret = connection.find({}, {'_id': 0})
    data_list = list(ret)
    save_json(data_list)
Database model file (models.py)
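A minimal sketch of what models.py might contain, assuming Flask-SQLAlchemy and the table and column names used in analysis.py above (column types and the id primary keys are assumptions):

from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy()

# Restaurants per food category
class FoodCateCount(db.Model):
    __tablename__ = 'db_food_cate_count'
    id = db.Column(db.Integer, primary_key=True)
    cate_name = db.Column(db.String(50))
    count = db.Column(db.Integer)

# Restaurants per district
class FoodAreaCount(db.Model):
    __tablename__ = 'db_food_area_count'
    id = db.Column(db.Integer, primary_key=True)
    area = db.Column(db.String(50))
    count = db.Column(db.Integer)

# The remaining tables (db_food_cate_score, db_food_cate_price, db_food_comment_top10) follow the same pattern.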