Python Crawler Framework Scrapy: Example (2)
Goal: use the Scrapy framework to crawl all parent categories, their sub-categories, the article links inside each sub-category, and the news content on those article pages, then save everything locally. The parent and sub-categories are shown in the figure below:
Clicking into the 国内 (Domestic) sub-category leads to a page like the following (partial screenshot):
Inspecting the page elements reveals the article links inside the sub-category, as shown in the figure below:
With these article links we can send requests to fetch the content of the corresponding news articles.
First, create the Scrapy project:
# create the project
scrapy startproject sinaNews
# create the spider
scrapy genspider sina "sina.com.cn"
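If both commands succeed, the generated project should look roughly like the layout below (the exact set of files varies slightly between Scrapy versions); the rest of this article edits items.py, spiders/sina.py, pipelines.py and settings.py.

sinaNews/
    scrapy.cfg
    sinaNews/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            sina.py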
1. Create the item file based on the fields to be crawled:
# -*- coding: utf-8 -*-
import scrapy
import sys

# Python 2 encoding workaround used by the original project
reload(sys)
sys.setdefaultencoding("utf-8")


class SinanewsItem(scrapy.Item):
    # Title and URL of the parent (top-level) category
    parentTitle = scrapy.Field()
    parentUrls = scrapy.Field()

    # Title and URL of the sub-category
    subTitle = scrapy.Field()
    subUrls = scrapy.Field()

    # Directory path where the sub-category is stored
    subFilename = scrapy.Field()

    # Article links under the sub-category
    sonUrls = scrapy.Field()

    # Article headline and body text
    head = scrapy.Field()
    content = scrapy.Field()
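A Scrapy Item behaves like a dictionary keyed by the Field names declared above, which is how the spider in the next step fills it in. A minimal sketch, purely illustrative and not part of the project code:

# illustrative only: the fields declared above are read and written like dict keys
item = SinanewsItem()
item['parentTitle'] = u'体育'                      # assign like a dict
item['parentUrls'] = 'http://sports.sina.com.cn/'
print(item['parentTitle'])                         # read back like a dict
# accessing a key that was never declared as a Field raises KeyError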
2. Write the spider file
# -*- coding: utf-8 -*-
import os
import sys

import scrapy

from sinaNews.items import SinanewsItem

# Python 2 encoding workaround used by the original project
reload(sys)
sys.setdefaultencoding("utf-8")


class SinaSpider(scrapy.Spider):
    name = "sina"
    allowed_domains = ["sina.com.cn"]
    start_urls = ['http://news.sina.com.cn/guide/']

    def parse(self, response):
        items = []
        # URLs and titles of all parent categories
        parentUrls = response.xpath('//div[@id="tab01"]/div/h3/a/@href').extract()
        parentTitle = response.xpath('//div[@id="tab01"]/div/h3/a/text()').extract()
        # URLs and titles of all sub-categories
        subUrls = response.xpath('//div[@id="tab01"]/div/ul/li/a/@href').extract()
        subTitle = response.xpath('//div[@id="tab01"]/div/ul/li/a/text()').extract()

        # Crawl every parent category
        for i in range(0, len(parentTitle)):
            # Path and name of the parent-category directory
            parentFilename = "./Data/" + parentTitle[i]
            # Create the directory if it does not exist
            if not os.path.exists(parentFilename):
                os.makedirs(parentFilename)

            # Crawl every sub-category
            for j in range(0, len(subUrls)):
                item = SinanewsItem()
                # Store the parent category's title and URL
                item['parentTitle'] = parentTitle[i]
                item['parentUrls'] = parentUrls[i]

                # Check whether the sub-category URL starts with the parent-category URL,
                # i.e. whether it belongs to this parent
                # (e.g. sports.sina.com.cn and sports.sina.com.cn/nba)
                if_belong = subUrls[j].startswith(item['parentUrls'])

                # If it belongs to this parent, put its storage directory under the parent directory
                if if_belong:
                    subFilename = parentFilename + '/' + subTitle[j]
                    # Create the directory if it does not exist
                    if not os.path.exists(subFilename):
                        os.makedirs(subFilename)
                    # Store the sub-category's URL, title and directory path
                    item['subUrls'] = subUrls[j]
                    item['subTitle'] = subTitle[j]
                    item['subFilename'] = subFilename
                    items.append(item)

        # Send a request for each sub-category URL; the response, along with the meta data,
        # is handed to the callback second_parse()
        for item in items:
            yield scrapy.Request(url=item['subUrls'], meta={'meta_1': item}, callback=self.second_parse)

    # Recursively request the article links found on each sub-category page
    def second_parse(self, response):
        # Retrieve the meta data carried by the response
        meta_1 = response.meta['meta_1']

        # Take all article links on the sub-category page
        sonUrls = response.xpath('//a/@href').extract()

        items = []
        for i in range(0, len(sonUrls)):
            # Check whether the link starts with the parent-category URL and ends with .shtml
            if_belong = sonUrls[i].endswith('.shtml') and sonUrls[i].startswith(meta_1['parentUrls'])

            # If it belongs to this parent category, collect the fields into one item for passing on
            if if_belong:
                item = SinanewsItem()
                item['parentTitle'] = meta_1['parentTitle']
                item['parentUrls'] = meta_1['parentUrls']
                item['subUrls'] = meta_1['subUrls']
                item['subTitle'] = meta_1['subTitle']
                item['subFilename'] = meta_1['subFilename']
                item['sonUrls'] = sonUrls[i]
                items.append(item)

        # Send a request for each article URL; the response, along with the meta data,
        # is handed to the callback detail_parse()
        for item in items:
            yield scrapy.Request(url=item['sonUrls'], meta={'meta_2': item}, callback=self.detail_parse)

    # Parse the article page and extract the headline and body text
    def detail_parse(self, response):
        item = response.meta['meta_2']
        content = ""
        head = response.xpath('//h1[@id="main_title"]/text()').extract()
        content_list = response.xpath('//div[@id="artibody"]/p/text()').extract()

        # Join the text of all <p> tags into one string
        for content_one in content_list:
            content += content_one

        item['head'] = head[0] if head else ""
        item['content'] = content

        yield item
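Before running the full crawl, it can help to check the XPath expressions interactively with scrapy shell. The URL below is the spider's start URL and the selectors are the ones used in parse() above; the [:5] slices are just to keep the output short:

scrapy shell "http://news.sina.com.cn/guide/"
# inside the shell:
>>> response.xpath('//div[@id="tab01"]/div/h3/a/text()').extract()[:5]
>>> response.xpath('//div[@id="tab01"]/div/ul/li/a/@href').extract()[:5]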
3. Write the pipelines file
# -*- coding: utf-8 -*-
import sys

from scrapy import signals

# Python 2 encoding workaround used by the original project
reload(sys)
sys.setdefaultencoding("utf-8")


class SinanewsPipeline(object):
    def process_item(self, item, spider):
        sonUrls = item['sonUrls']

        # The file name is the middle part of the article URL with / replaced by _,
        # saved as a .txt file
        filename = sonUrls[7:-6].replace('/', '_')
        filename += ".txt"

        fp = open(item['subFilename'] + '/' + filename, 'w')
        fp.write(item['content'])
        fp.close()

        return item
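The pipeline above relies on the Python 2 setdefaultencoding() hack to write UTF-8 text. A sketch of a slightly more defensive variant with the same behaviour, using an explicit encoding and os.path.join (field names are the ones defined in items.py):

# -*- coding: utf-8 -*-
import io
import os


class SinanewsPipeline(object):
    def process_item(self, item, spider):
        sonUrls = item['sonUrls']
        # strip the leading "http://" and the trailing ".shtml", as in the original pipeline
        filename = sonUrls[7:-6].replace('/', '_') + ".txt"
        path = os.path.join(item['subFilename'], filename)
        # io.open behaves the same on Python 2 and 3 and writes UTF-8 explicitly
        with io.open(path, 'w', encoding='utf-8') as fp:
            fp.write(item['content'])
        return item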
4. Configure the settings file
# enable the item pipeline
ITEM_PIPELINES = {
'sinaNews.pipelines.SinanewsPipeline': 300,
}
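The original project only enables the pipeline. If the target site responds slowly or throttles aggressive crawling, a couple of standard Scrapy settings can be added as well; the values below are illustrative defaults, not part of the original tutorial:

# be polite: wait between requests and identify the crawler
DOWNLOAD_DELAY = 1
ROBOTSTXT_OBEY = False  # adjust to your needs; newer Scrapy projects default this to True
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'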
Run the crawler:
scrapy crawl sina
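If the console output is too noisy, Scrapy's stock command-line options can trim it (these are standard Scrapy flags, not specific to this project):

scrapy crawl sina -s LOG_LEVEL=INFO
scrapy crawl sina --nolog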
The result looks like the screenshots below:
Opening the Data directory under the working directory shows the parent-category folders.
Opening a parent-category folder shows the sub-category folders:
Opening a sub-category folder shows the articles:
