python爬⾍请求库httpx和parsel解析库的使⽤测评
⽬录
requests + BeautifulSoup组合
requests + parsel组合
httpx同步 + parsel组合
httpx异步+ parsel组合
对⽐与总结
Python网络爬虫领域最近比较火的两个工具莫过于httpx和parsel了。httpx号称下一代的网络请求库,不仅支持requests库的所有操作,还能发送异步请求,为编写异步爬虫提供了便利。parsel最初集成在著名Python爬虫框架Scrapy中,后来独立出来成为一个单独的模块,支持XPath选择器、CSS选择器和正则表达式等多种解析提取方式。据说相比于BeautifulSoup,parsel的解析效率更高。
今天我们就以爬取链家网上的二手房在售房产信息为例,来测评下httpx和parsel这两个库。为了节约时间,我们以爬取上海市浦东新区总价500万元-800万元的房产为例。
requests + BeautifulSoup组合
整个项⽬代码如下所⽰:
# homelink_requests.py
# Author: ⼤江狗
from fake_useragent import UserAgent
import requests
from bs4 import BeautifulSoup
import csv
import re
import time
class HomeLinkSpider(object):
    """Scrape Lianjia second-hand listings with requests + BeautifulSoup.

    Result pages are downloaded with ``requests``, parsed with
    ``BeautifulSoup`` and the collected records are exported to a CSV file.
    """

    def __init__(self):
        """Set up the request headers, the result buffer and the target URL."""
        self.ua = UserAgent()
        # One random User-Agent is chosen here and reused for every request.
        self.headers = {"User-Agent": self.ua.random}
        self.data = list()
        self.path = "浦东_三房_500_800万.csv"
        self.url = "https://sh.lianjia.com/ershoufang/pudong/a3p5/"

    def get_max_page(self):
        """Return the total number of result pages, or ``None`` on failure.

        The count is read from the ``page-data`` attribute of the pagination
        ``div`` on the first result page.
        """
        import json  # local import: only this method needs it

        response = requests.get(self.url, headers=self.headers)
        if response.status_code != 200:
            print("请求失败 status:{}".format(response.status_code))
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        page_boxes = soup.select('div[class="page-box house-lst-page-box"]')
        if not page_boxes:
            # The page was served but carries no pagination block.
            print("未找到分页信息")
            return None
        # page-data holds a JSON string; it is parsed with json.loads rather
        # than eval() so that text coming from the network is never executed.
        return json.loads(page_boxes[0].attrs["page-data"])["totalPage"]

    def parse_page(self):
        """Fetch every result page and append one dict per listing to ``self.data``.

        Does nothing when the page count could not be determined.
        """
        max_page = self.get_max_page()
        if max_page is None:
            return

        # Compiled once instead of once per listing.
        floor_pattern = re.compile(r'\d{1,2}')
        year_pattern = re.compile(r'\d{4}')
        price_pattern = re.compile(r'\d+')

        for i in range(1, max_page + 1):
            url = 'https://sh.lianjia.com/ershoufang/pudong/pg{}a3p5/'.format(i)
            response = requests.get(url, headers=self.headers)
            soup = BeautifulSoup(response.text, 'html.parser')
            ul = soup.find_all("ul", class_="sellListContent")
            li_list = ul[0].select("li")
            for li in li_list:
                detail = dict()
                detail['title'] = li.select('div[class="title"]')[0].get_text()

                # Sample text:
                # 2室1厅 | 74.14平米 | 南 | 精装 | 高楼层(共6层) | 1999年建 | 板楼
                house_info = li.select('div[class="houseInfo"]')[0].get_text()
                house_info_list = house_info.split(" | ")
                detail['bedroom'] = house_info_list[0]
                detail['area'] = house_info_list[1]
                detail['direction'] = house_info_list[2]

                # search() matches anywhere in the string.
                floor_match = floor_pattern.search(house_info_list[4])
                detail['floor'] = floor_match.group() if floor_match else "未知"

                # Construction year.
                year_match = year_pattern.search(house_info_list[5])
                detail['year'] = year_match.group() if year_match else "未知"

                # Sample text "文兰小区 - 塘桥": community name, then area name.
                position_info = li.select('div[class="positionInfo"]')[0].get_text().split(' - ')
                detail['house'] = position_info[0]
                detail['location'] = position_info[1]

                # "650万" -> "650"
                total_price = li.select('div[class="totalPrice"]')[0].get_text()
                detail['total_price'] = price_pattern.search(total_price).group()

                # "单价64182元/平米" -> "64182"
                unit_price = li.select('div[class="unitPrice"]')[0].get_text()
                detail['unit_price'] = price_pattern.search(unit_price).group()

                self.data.append(detail)

    def write_csv_file(self):
        """Write ``self.data`` to ``self.path`` as a CSV file with a header row.

        Failures are reported on stdout instead of being raised.
        """
        head = ["标题", "小区", "房厅", "面积", "朝向", "楼层", "年份",
                "位置", "总价(万)", "单价(元/平方米)"]
        keys = ["title", "house", "bedroom", "area", "direction",
                "floor", "year", "location",
                "total_price", "unit_price"]
        try:
            # utf_8_sig writes a BOM at the start of the file.
            with open(self.path, 'w', newline='', encoding='utf_8_sig') as csv_file:
                writer = csv.writer(csv_file, dialect='excel')
                writer.writerow(head)
                for item in self.data:
                    writer.writerow([item[k] for k in keys])
            print("Write a CSV file to path %s Successful." % self.path)
        except Exception as e:
            # Deliberate best-effort: report the problem and keep the script alive.
            print("Fail to write CSV to path: %s, Case: %s" % (self.path, e))
# Script entry point: crawl every page, export the CSV and report the elapsed time.
if __name__ == '__main__':
    start = time.time()
    home_link_spider = HomeLinkSpider()
    home_link_spider.parse_page()
    home_link_spider.write_csv_file()
    end = time.time()
    print("耗时:{}秒".format(end - start))
注意:我们使⽤了fake_useragent, requests和BeautifulSoup,这些都需要通过pip事先安装好才能⽤。
现在我们来看下爬取结果,耗时约18.5秒,总共爬取580条数据。
requests + parsel组合
这次我们同样采用requests获取目标网页内容,使用parsel库(事先需通过pip安装)来解析。parsel库的用法和BeautifulSoup相似,都是先创建实例,然后使用各种选择器提取DOM元素和数据,但语法上稍有不同。BeautifulSoup有自己的语法规则,而parsel库支持标准的CSS选择器和XPath选择器,通过get方法或getall方法获取文本或属性值,使用起来更方便。
# Illustrative snippet: `response` is the object returned by requests.get(...).

# BeautifulSoup usage
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')
ul = soup.find_all("ul", class_="sellListContent")[0]

# Parsel usage, via the Selector class
from parsel import Selector
selector = Selector(response.text)
ul = selector.css('ul.sellListContent')[0]

# Parsel examples: extracting a text value or an attribute value
selector.css('div.title span::text').get()
selector.css('ul li a::attr(href)').get()
for li in selector.css('ul > li'):
    print(li.xpath('.//@href').get())
注:老版的parsel库使用extract_first()或extract()方法获取文本或属性值,新版中推荐分别使用get()和getall()方法(旧方法仍然可用)。全部代码如下所示:
# homelink_parsel.py
# Author: ⼤江狗
from fake_useragent import UserAgent
import requests
import csv
import re
import time
from parsel import Selector
class HomeLinkSpider(object):
    """Scrape Lianjia second-hand listings with requests + parsel.

    Result pages are downloaded with ``requests``, parsed with parsel's
    ``Selector`` and the collected records are exported to a CSV file.
    """

    def __init__(self):
        """Set up the request headers, the result buffer and the target URL."""
        self.ua = UserAgent()
        # One random User-Agent is chosen here and reused for every request.
        self.headers = {"User-Agent": self.ua.random}
        self.data = list()
        self.path = "浦东_三房_500_800万.csv"
        self.url = "https://sh.lianjia.com/ershoufang/pudong/a3p5/"

    def get_max_page(self):
        """Return the total number of result pages, or ``None`` on failure.

        The count is read from the ``page-data`` attribute of the pagination
        ``div`` on the first result page.
        """
        import json  # local import: only this method needs it

        response = requests.get(self.url, headers=self.headers)
        if response.status_code != 200:
            print("请求失败 status:{}".format(response.status_code))
            return None

        # Build a Selector over the page HTML.
        selector = Selector(response.text)
        # CSS selector for the pagination div.
        page_boxes = selector.css('div[class="page-box house-lst-page-box"]')
        if not page_boxes:
            # The page was served but carries no pagination block.
            print("未找到分页信息")
            return None
        # "./@page-data" is evaluated relative to the selected div; an
        # absolute "//@page-data" would search the whole document instead.
        page_data = page_boxes[0].xpath('./@page-data').get()
        # page-data holds a JSON string; it is parsed with json.loads rather
        # than eval() so that text coming from the network is never executed.
        max_page = json.loads(page_data)["totalPage"]
        print("最大页码数:{}".format(max_page))
        return max_page

    def parse_page(self):
        """Fetch every result page and append one dict per listing to ``self.data``.

        Does nothing when the page count could not be determined.
        """
        max_page = self.get_max_page()
        if max_page is None:
            return

        # Compiled once instead of once per listing.
        floor_pattern = re.compile(r'\d{1,2}')
        year_pattern = re.compile(r'\d{4}')
        price_pattern = re.compile(r'\d+')

        for i in range(1, max_page + 1):
            url = 'https://sh.lianjia.com/ershoufang/pudong/pg{}a3p5/'.format(i)
            response = requests.get(url, headers=self.headers)
            selector = Selector(response.text)
            ul = selector.css('ul.sellListContent')[0]
            li_list = ul.css('li')
            for li in li_list:
                detail = dict()
                detail['title'] = li.css('div.title a::text').get()

                # Sample text:
                # 2室1厅 | 74.14平米 | 南 | 精装 | 高楼层(共6层) | 1999年建 | 板楼
                house_info = li.css('div.houseInfo::text').get()
                house_info_list = house_info.split(" | ")
                detail['bedroom'] = house_info_list[0]
                detail['area'] = house_info_list[1]
                detail['direction'] = house_info_list[2]

                # search() matches anywhere in the string.
                floor_match = floor_pattern.search(house_info_list[4])
                detail['floor'] = floor_match.group() if floor_match else "未知"

                # Construction year.
                year_match = year_pattern.search(house_info_list[5])
                detail['year'] = year_match.group() if year_match else "未知"

                # Sample text "文兰小区 - 塘桥": community name, then area name.
                position_info = li.css('div.positionInfo a::text').getall()
                detail['house'] = position_info[0]
                detail['location'] = position_info[1]

                # "650万" -> "650"
                total_price = li.css('div.totalPrice span::text').get()
                detail['total_price'] = price_pattern.search(total_price).group()

                # "单价64182元/平米" -> "64182"
                unit_price = li.css('div.unitPrice span::text').get()
                detail['unit_price'] = price_pattern.search(unit_price).group()

                self.data.append(detail)

    def write_csv_file(self):
        """Write ``self.data`` to ``self.path`` as a CSV file with a header row.

        Failures are reported on stdout instead of being raised.
        """
        head = ["标题", "小区", "房厅", "面积", "朝向", "楼层",
                "年份", "位置", "总价(万)", "单价(元/平方米)"]
        keys = ["title", "house", "bedroom", "area",
                "direction", "floor", "year", "location",
                "total_price", "unit_price"]
        try:
            # utf_8_sig writes a BOM at the start of the file.
            with open(self.path, 'w', newline='', encoding='utf_8_sig') as csv_file:
                writer = csv.writer(csv_file, dialect='excel')
                writer.writerow(head)
                for item in self.data:
                    writer.writerow([item[k] for k in keys])
            print("Write a CSV file to path %s Successful." % self.path)
        except Exception as e:
            # Deliberate best-effort: report the problem and keep the script alive.
            print("Fail to write CSV to path: %s, Case: %s" % (self.path, e))
# Script entry point: crawl every page, export the CSV and report the elapsed time.
if __name__ == '__main__':
    start = time.time()
    home_link_spider = HomeLinkSpider()
    home_link_spider.parse_page()
    home_link_spider.write_csv_file()
    end = time.time()
    print("耗时:{}秒".format(end - start))
现在我们来看下爬取结果,爬取580条数据耗时约16.5秒,节省了2秒时间。可见parsel⽐BeautifulSoup解析效率是要⾼的,爬取任务少时差别不⼤,任务多的话差别可能会⼤些。
httpx同步 + parsel组合
我们现在来更进⼀步,使⽤httpx替代requests库。httpx发送同步请求的⽅式和requests库基本⼀样,所以我们只需要修改上例中两⾏代码,把requests替换成httpx即可, 其余代码⼀模⼀样。
from fake_useragent import UserAgent
import csv
import re
import time
from parsel import Selector
import httpx
class HomeLinkSpider(object):
def __init__(self):
    """Set up the request headers, the result buffer and the target URL."""
    self.ua = UserAgent()
    # One random User-Agent is chosen here and reused for every request.
    self.headers = {"User-Agent": self.ua.random}
    self.data = list()
    self.path = "浦东_三房_500_800万.csv"
    self.url = "https://sh.lianjia.com/ershoufang/pudong/a3p5/"
def get_max_page(self):
    """Return the total number of result pages, or ``None`` on failure.

    The count is read from the ``page-data`` attribute of the pagination
    ``div`` on the first result page.
    """
    import json  # local import: only this method needs it

    # Changed relative to the requests version: the page is fetched with httpx.
    response = httpx.get(self.url, headers=self.headers)
    if response.status_code != 200:
        print("请求失败 status:{}".format(response.status_code))
        return None

    # Build a Selector over the page HTML.
    selector = Selector(response.text)
    # CSS selector for the pagination div.
    page_boxes = selector.css('div[class="page-box house-lst-page-box"]')
    if not page_boxes:
        # The page was served but carries no pagination block.
        print("未找到分页信息")
        return None
    # "./@page-data" is evaluated relative to the selected div; an
    # absolute "//@page-data" would search the whole document instead.
    page_data = page_boxes[0].xpath('./@page-data').get()
    # page-data holds a JSON string; it is parsed with json.loads rather
    # than eval() so that text coming from the network is never executed.
    max_page = json.loads(page_data)["totalPage"]
    print("最大页码数:{}".format(max_page))
    return max_page
def parse_page(self):
max_page = _max_page()
for i in range(1, max_page + 1):
url = 'sh.lianjia/ershoufang/pudong/pg{}a3p5/'.format(i)
# 修改这⾥把requests换成httpx
response = (url, headers=self.headers)
selector = )
ul = selector.css('ul.sellListContent')[0]
li_list = ul.css('li')
for li in li_list:
detail = dict()
detail['title'] = li.css('div.title a::text').get()
# 2室1厅 | 74.14平⽶ | 南 | 精装 | ⾼楼层(共6层) | 1999年建 | 板楼
house_info = li.css('div.houseInfo::text').get()
house_info_list = house_info.split(" | ")
detail['bedroom'] = house_info_list[0]
detail['area'] = house_info_list[1]
detail['direction'] = house_info_list[2]
floor_pattern = repile(r'\d{1,2}')
match1 = re.search(floor_pattern, house_info_list[4]) # 从字符串任意位置匹配 if match1:
detail['floor'] = up()
else:
detail['floor'] = "未知"
# 匹配年份
year_pattern = repile(r'\d{4}')
match2 = re.search(year_pattern, house_info_list[5])
if match2:
detail['year'] = up()
else:
detail['year'] = "未知"
# ⽂兰⼩区 - 塘桥提取⼩区名和哈快
position_info = li.css('div.positionInfo a::text').getall()
detail['house'] = position_info[0]
detail['location'] = position_info[1]
# 650万,匹配650
price_pattern = repile(r'\d+')
total_price = li.css('alPrice span::text').get()
detail['total_price'] = re.search(price_pattern, total_price).group()
# 单价64182元/平米,匹配64182
unit_price = li.css('div.unitPrice span::text').get()
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论