# Python script that scrapes movie information from the Douban Top 250 list (working code).
import os

import pandas as pd
import requests
from lxml import etree
# Movie records for the whole run; the __main__ block builds a DataFrame from this list.
MOVIES =[]
# Poster image URLs; the __main__ block pairs IMGURLS[i] with MOVIES[i] when downloading.
IMGURLS =[]
def get_html(url):
    """Fetch a page and return its HTML text.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or a failed request).
    """
    # A browser-like User-Agent is sent with every request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
    try:
        # The timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        print('获取源代码失败:%s' % e)
        return None
    if response.status_code == 200:
        print('成功获取源代码')
        return response.text
    print('获取源代码失败:HTTP %s' % response.status_code)
    return None
def parse_html(html):
    """Extract movie records and poster URLs from one list page.

    Args:
        html: HTML text of a page containing an ``<ol class="grid_view">``
            list with one ``<li>`` per movie.

    Returns:
        A ``(movies, imgurls)`` tuple. ``movies`` is a list of dicts with the
        keys ``name``, ``director_actor``, ``info``, ``rating_score``,
        ``rating_num`` and ``introduce`` (``None`` when the entry has no
        quote); ``imgurls`` is the list of poster image URLs in the same
        order. Both lists are empty when ``html`` is empty or ``None``.

    Raises:
        IndexError: If a list item lacks one of the required nodes.
    """
    movies = []
    imgurls = []
    if not html:
        # Nothing to parse, e.g. the download failed upstream.
        return movies, imgurls
    tree = etree.HTML(html)
    if tree is None:
        return movies, imgurls
    for li in tree.xpath("//ol[@class = 'grid_view']/li"):
        name = li.xpath(".//a/span[@class='title'][1]/text()")[0]
        # split() + join drops every whitespace run (spaces, newlines, nbsp),
        # so only the '/' separators need an explicit replace.
        director_actor = "".join(
            li.xpath(".//div[@class='bd']/p/text()[1]")[0].replace('/', '').split()
        )
        info = "".join(li.xpath(".//div[@class='bd']/p/text()[2]")[0].split())
        rating_score = li.xpath(".//span[@class='rating_num']/text()")[0]
        rating_num = li.xpath(".//div[@class='star']/span[4]/text()")[0]
        # The one-line quote is optional; not every entry has one.
        introduce = li.xpath(".//p[@class='quote']/span/text()")
        movies.append({
            'name': name,
            'director_actor': director_actor,
            'info': info,
            'rating_score': rating_score,
            'rating_num': rating_num,
            'introduce': introduce[0] if introduce else None,
        })
        imgurls.append(li.xpath(".//img/@src")[0])
    return movies, imgurls
def download_img(url, movie):
    """Download one poster image and save it as ``<movie name>.jpg``.

    The file is written into the ``movieposter`` folder under the data
    directory; the folder is created on first use. The file is addressed by
    its full path, so the process working directory is left untouched.

    Args:
        url: Address of the poster image.
        movie: Movie record; only its ``'name'`` key is read, to build the
            file name.
    """
    poster_dir = os.path.join(r'G:\爬⾍数据', 'movieposter')
    # Create the folder where it is actually used, not in whatever the
    # current working directory happens to be.
    os.makedirs(poster_dir, exist_ok=True)
    print('正在下载: %s' % url)
    img = requests.get(url, timeout=10).content
    with open(os.path.join(poster_dir, movie['name'] + '.jpg'), 'wb') as f:
        f.write(img)
if __name__ == '__main__':
    # The list is requested as 10 pages, each selected by a `start` offset
    # that advances in steps of 25.
    for page in range(10):
        url = 'https://movie.douban.com/top250?start=' + str(page * 25) + '&filter='
        html = get_html(url)
        if not html:
            # A failed fetch gives no HTML; skip the page instead of crashing.
            continue
        # Parse once and keep both results, accumulating them across pages.
        movies, imgurls = parse_html(html)
        MOVIES.extend(movies)
        IMGURLS.extend(imgurls)
    # Iterate over what was actually scraped rather than a fixed count of 250.
    for imgurl, movie in zip(IMGURLS, MOVIES):
        download_img(imgurl, movie)
    data_dir = r'G:\爬⾍数据'
    os.makedirs(data_dir, exist_ok=True)
    moviedata = pd.DataFrame(MOVIES)
    moviedata.to_csv(os.path.join(data_dir, 'movie.csv'))
    print('电影信息成功保存到本地')

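# Copyright notice: all content on this site comes from the Internet and is for demonstration only; do not use it for commercial or other illegal purposes. If your rights have been infringed, please contact us on QQ: 729038198 and we will remove the content within 24 hours.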