python爬虫请求库httpx和parsel解析库的使用测评--688IT编程网

python爬虫请求库httpx和parsel解析库的使用测评

目录

requests + BeautifulSoup组合

requests + parsel组合

httpx同步 + parsel组合

httpx异步 + parsel组合

对比与总结

Python网络爬虫领域两个最新的比较火的工具莫过于httpx和parsel了。httpx号称新一代的网络请求库，不仅支持requests库的所有操作，还能发送异步请求，为编写异步爬虫提供了便利。parsel最初集成在著名Python爬虫框架Scrapy中，后独立出来成为一个单独的模块，支持XPath选择器、CSS选择器和正则表达式等多种解析提取方式。据说相比于BeautifulSoup，parsel的解析效率更高。

今天我们就以爬取链家网上的二手房在售房产信息为例，来测评下httpx和parsel这两个库。为了节约时间，我们以爬取上海市浦东新区总价500万元-800万元的房产为例。

requests + BeautifulSoup组合

整个项目代码如下所示：

# homelink_requests.py

# Author: ⼤江狗

from fake_useragent import UserAgent

import requests

from bs4 import BeautifulSoup

import csv

import re

import time

class HomeLinkSpider(object):
    """Crawl Lianjia second-hand listings with requests + BeautifulSoup.

    Listings are collected page by page into ``self.data`` (one dict per
    listing) and exported to the CSV file named by ``self.path``.
    """

    # Compiled once here instead of once per listing inside the loop.
    _FLOOR_PATTERN = re.compile(r'\d{1,2}')
    _YEAR_PATTERN = re.compile(r'\d{4}')
    _PRICE_PATTERN = re.compile(r'\d+')
    # Placeholder stored when a value cannot be extracted.
    _UNKNOWN = "未知"
    # Template for the numbered result pages; ``{}`` is the page number.
    _PAGE_URL = 'https://sh.lianjia.com/ershoufang/pudong/pg{}a3p5/'

    def __init__(self):
        """Pick a random User-Agent and set the output path and start URL."""
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}
        self.data = list()
        self.path = "浦东_三房_500_800万.csv"
        self.url = "https://sh.lianjia.com/ershoufang/pudong/a3p5/"

    @staticmethod
    def _first_match(pattern, text, default="未知"):
        """Return the first match of ``pattern`` in ``text``, else ``default``."""
        match = pattern.search(text)
        return match.group() if match else default

    @classmethod
    def _parse_house_info(cls, house_info):
        """Split a house-info string into the fields stored for a listing.

        Example input:
        ``2室1厅 | 74.14平米 | 南 | 精装 | 高楼层(共6层) | 1999年建 | 板楼``

        Returns:
            dict with the keys ``bedroom``, ``area``, ``direction``,
            ``floor`` and ``year``. ``floor`` and ``year`` fall back to
            "未知" when no number is found.
        """
        fields = house_info.split(" | ")
        # Pad short strings so a missing trailing field does not raise IndexError.
        fields.extend([""] * max(0, 6 - len(fields)))
        return {
            'bedroom': fields[0],
            'area': fields[1],
            'direction': fields[2],
            'floor': cls._first_match(cls._FLOOR_PATTERN, fields[4], cls._UNKNOWN),
            'year': cls._first_match(cls._YEAR_PATTERN, fields[5], cls._UNKNOWN),
        }

    def get_max_page(self):
        """Fetch the first result page and read the total page count.

        Returns:
            The ``totalPage`` value from the pagination div's ``page-data``
            attribute, or ``None`` when the response status is not 200.
        """
        # Local import: this listing's import header does not include json.
        import json

        response = requests.get(self.url, headers=self.headers)
        if response.status_code != 200:
            print("请求失败 status:{}".format(response.status_code))
            return None
        soup = BeautifulSoup(response.text, 'html.parser')
        a = soup.select('div[class="page-box house-lst-page-box"]')
        # page-data holds a JSON string; json.loads replaces the original
        # eval() so text received from the network is never executed.
        return json.loads(a[0].attrs["page-data"])["totalPage"]

    def parse_page(self):
        """Visit every result page and append one dict per listing to ``self.data``.

        Does nothing when the page count could not be determined. A listing
        item that lacks one of the expected div elements raises IndexError.
        """
        max_page = self.get_max_page()
        if max_page is None:
            # get_max_page has already reported the failed request.
            return
        for i in range(1, max_page + 1):
            url = self._PAGE_URL.format(i)
            response = requests.get(url, headers=self.headers)
            soup = BeautifulSoup(response.text, 'html.parser')
            ul = soup.find_all("ul", class_="sellListContent")
            if not ul:
                print("No listing container on page {} (status:{})".format(
                    i, response.status_code))
                continue
            for li in ul[0].select("li"):
                detail = dict()
                detail['title'] = li.select('div[class="title"]')[0].get_text()
                house_info = li.select('div[class="houseInfo"]')[0].get_text()
                detail.update(self._parse_house_info(house_info))
                # "<community> - <district>": community name, then location.
                position_info = li.select(
                    'div[class="positionInfo"]')[0].get_text().split(' - ')
                detail['house'] = position_info[0]
                detail['location'] = position_info[1]
                # e.g. "650万" -> "650"
                total_price = li.select('div[class="totalPrice"]')[0].get_text()
                detail['total_price'] = self._first_match(
                    self._PRICE_PATTERN, total_price, self._UNKNOWN)
                # e.g. "单价64182元/平米" -> "64182"
                unit_price = li.select('div[class="unitPrice"]')[0].get_text()
                detail['unit_price'] = self._first_match(
                    self._PRICE_PATTERN, unit_price, self._UNKNOWN)
                self.data.append(detail)

    def write_csv_file(self):
        """Write ``self.data`` to ``self.path`` as a CSV file with a header row.

        The file is encoded as ``utf_8_sig``. Failures are reported with a
        printed message rather than raised.
        """
        head = ["标题", "小区", "房厅", "面积", "朝向", "楼层", "年份",
                "位置", "总价(万)", "单价(元/平方米)"]
        keys = ["title", "house", "bedroom", "area", "direction",
                "floor", "year", "location",
                "total_price", "unit_price"]
        try:
            with open(self.path, 'w', newline='', encoding='utf_8_sig') as csv_file:
                writer = csv.writer(csv_file, dialect='excel')
                writer.writerow(head)
                for item in self.data:
                    writer.writerow([item[k] for k in keys])
            print("Write a CSV file to path %s Successful." % self.path)
        except (OSError, KeyError, csv.Error) as e:
            print("Fail to write CSV to path: %s, Case: %s" % (self.path, e))

if __name__ == '__main__':
    # Crawl every page, export the CSV, and report the elapsed wall time.
    start = time.time()
    home_link_spider = HomeLinkSpider()
    home_link_spider.parse_page()
    home_link_spider.write_csv_file()
    end = time.time()
    print("耗时：{}秒".format(end-start))

注意：我们使用了fake_useragent, requests和BeautifulSoup，这些都需要通过pip事先安装好才能用。

现在我们来看下爬取结果，耗时约18.5秒，总共爬取580条数据。

requests + parsel组合

这次我们同样采用requests获取目标网页内容，使用parsel库(事先需通过pip安装)来解析。Parsel库的用法和BeautifulSoup相似，都是先创建实例，然后使用各种选择器提取DOM元素和数据，但语法上稍有不同。BeautifulSoup有自己的语法规则，而Parsel库支持标准的css选择器和xpath选择器, 通过get方法或getall方法获取文本或属性值，使用起来更方便。

# Using BeautifulSoup (``response`` is a page already fetched with requests)
from bs4 import BeautifulSoup

soup = BeautifulSoup(response.text, 'html.parser')
ul = soup.find_all("ul", class_="sellListContent")[0]

# Using Parsel: build a Selector from the page text
from parsel import Selector

selector = Selector(text=response.text)
ul = selector.css('ul.sellListContent')[0]

# Reading a text value or an attribute value with Parsel
selector.css('div.title span::text').get()
selector.css('ul li a::attr(href)').get()

# CSS and XPath selectors can be mixed on the same selection
for li in selector.css('ul > li'):
    print(li.xpath('.//@href').get())

注：老版的parsel库使用extract_first()或extract()方法获取文本或属性值，在新版中推荐改用get()和getall()方法(分别对应extract_first()和extract())。全部代码如下所示：

# homelink_parsel.py

# Author: ⼤江狗

from fake_useragent import UserAgent

import requests

import csv

import re

import time

from parsel import Selector

class HomeLinkSpider(object):
    """Crawl Lianjia second-hand listings with requests + parsel.

    Listings are collected page by page into ``self.data`` (one dict per
    listing) and exported to the CSV file named by ``self.path``.
    """

    # Compiled once here instead of once per listing inside the loop.
    _FLOOR_PATTERN = re.compile(r'\d{1,2}')
    _YEAR_PATTERN = re.compile(r'\d{4}')
    _PRICE_PATTERN = re.compile(r'\d+')
    # Placeholder stored when a value cannot be extracted.
    _UNKNOWN = "未知"
    # Template for the numbered result pages; ``{}`` is the page number.
    _PAGE_URL = 'https://sh.lianjia.com/ershoufang/pudong/pg{}a3p5/'

    def __init__(self):
        """Pick a random User-Agent and set the output path and start URL."""
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}
        self.data = list()
        self.path = "浦东_三房_500_800万.csv"
        self.url = "https://sh.lianjia.com/ershoufang/pudong/a3p5/"

    @staticmethod
    def _first_match(pattern, text, default="未知"):
        """Return the first match of ``pattern`` in ``text``, else ``default``."""
        match = pattern.search(text)
        return match.group() if match else default

    @classmethod
    def _parse_house_info(cls, house_info):
        """Split a house-info string into the fields stored for a listing.

        Example input:
        ``2室1厅 | 74.14平米 | 南 | 精装 | 高楼层(共6层) | 1999年建 | 板楼``

        Returns:
            dict with the keys ``bedroom``, ``area``, ``direction``,
            ``floor`` and ``year``. ``floor`` and ``year`` fall back to
            "未知" when no number is found.
        """
        fields = house_info.split(" | ")
        # Pad short strings so a missing trailing field does not raise IndexError.
        fields.extend([""] * max(0, 6 - len(fields)))
        return {
            'bedroom': fields[0],
            'area': fields[1],
            'direction': fields[2],
            'floor': cls._first_match(cls._FLOOR_PATTERN, fields[4], cls._UNKNOWN),
            'year': cls._first_match(cls._YEAR_PATTERN, fields[5], cls._UNKNOWN),
        }

    def get_max_page(self):
        """Fetch the first result page and read the total page count.

        Returns:
            The ``totalPage`` value from the pagination div's ``page-data``
            attribute, or ``None`` when the response status is not 200.
        """
        # Local import: this listing's import header does not include json.
        import json

        response = requests.get(self.url, headers=self.headers)
        if response.status_code != 200:
            print("请求失败 status:{}".format(response.status_code))
            return None
        # Build a Selector from the page text.
        selector = Selector(text=response.text)
        # The pagination div carries the page count.
        a = selector.css('div[class="page-box house-lst-page-box"]')
        # './@page-data' reads the attribute of the matched div itself; the
        # original '//@page-data' searched from the document root instead.
        # page-data holds a JSON string; json.loads replaces the original
        # eval() so text received from the network is never executed.
        max_page = json.loads(a[0].xpath('./@page-data').get())["totalPage"]
        print("最大页码数:{}".format(max_page))
        return max_page

    def parse_page(self):
        """Visit every result page and append one dict per listing to ``self.data``.

        Does nothing when the page count could not be determined. A listing
        item with fewer than two position links raises IndexError.
        """
        max_page = self.get_max_page()
        if max_page is None:
            # get_max_page has already reported the failed request.
            return
        for i in range(1, max_page + 1):
            url = self._PAGE_URL.format(i)
            response = requests.get(url, headers=self.headers)
            selector = Selector(text=response.text)
            ul_list = selector.css('ul.sellListContent')
            if not ul_list:
                print("No listing container on page {} (status:{})".format(
                    i, response.status_code))
                continue
            for li in ul_list[0].css('li'):
                detail = dict()
                detail['title'] = li.css('div.title a::text').get()
                # .get() returns None when nothing matches; treat as empty text.
                house_info = li.css('div.houseInfo::text').get() or ""
                detail.update(self._parse_house_info(house_info))
                # Two links: community name, then location.
                position_info = li.css('div.positionInfo a::text').getall()
                detail['house'] = position_info[0]
                detail['location'] = position_info[1]
                # e.g. "650万" -> "650"
                total_price = li.css('div.totalPrice span::text').get() or ""
                detail['total_price'] = self._first_match(
                    self._PRICE_PATTERN, total_price, self._UNKNOWN)
                # e.g. "单价64182元/平米" -> "64182"
                unit_price = li.css('div.unitPrice span::text').get() or ""
                detail['unit_price'] = self._first_match(
                    self._PRICE_PATTERN, unit_price, self._UNKNOWN)
                self.data.append(detail)

    def write_csv_file(self):
        """Write ``self.data`` to ``self.path`` as a CSV file with a header row.

        The file is encoded as ``utf_8_sig``. Failures are reported with a
        printed message rather than raised.
        """
        head = ["标题", "小区", "房厅", "面积", "朝向", "楼层",
                "年份", "位置", "总价(万)", "单价(元/平方米)"]
        keys = ["title", "house", "bedroom", "area",
                "direction", "floor", "year", "location",
                "total_price", "unit_price"]
        try:
            with open(self.path, 'w', newline='', encoding='utf_8_sig') as csv_file:
                writer = csv.writer(csv_file, dialect='excel')
                writer.writerow(head)
                for item in self.data:
                    writer.writerow([item[k] for k in keys])
            print("Write a CSV file to path %s Successful." % self.path)
        except (OSError, KeyError, csv.Error) as e:
            print("Fail to write CSV to path: %s, Case: %s" % (self.path, e))

if __name__ == '__main__':
    # Crawl every page, export the CSV, and report the elapsed wall time.
    start = time.time()
    home_link_spider = HomeLinkSpider()
    home_link_spider.parse_page()
    home_link_spider.write_csv_file()
    end = time.time()
    print("耗时：{}秒".format(end-start))

现在我们来看下爬取结果，爬取580条数据耗时约16.5秒，节省了2秒时间。可见parsel比BeautifulSoup解析效率是要高的，爬取任务少时差别不大，任务多的话差别可能会大些。

httpx同步 + parsel组合

我们现在来更进一步，使用httpx替代requests库。httpx发送同步请求的方式和requests库基本一样，所以我们只需要修改上例中两行代码，把requests替换成httpx即可, 其余代码一模一样。

from fake_useragent import UserAgent

import csv

import re

import time

from parsel import Selector

import httpx

class HomeLinkSpider(object):
    """Crawl Lianjia second-hand listings with httpx (synchronous) + parsel.

    Listings are collected page by page into ``self.data`` (one dict per
    listing). Only the two request calls differ from the requests + parsel
    listing: ``requests.get`` becomes ``httpx.get``.
    """

    # Compiled once here instead of once per listing inside the loop.
    _FLOOR_PATTERN = re.compile(r'\d{1,2}')
    _YEAR_PATTERN = re.compile(r'\d{4}')
    _PRICE_PATTERN = re.compile(r'\d+')
    # Placeholder stored when a value cannot be extracted.
    _UNKNOWN = "未知"
    # Template for the numbered result pages; ``{}`` is the page number.
    _PAGE_URL = 'https://sh.lianjia.com/ershoufang/pudong/pg{}a3p5/'

    def __init__(self):
        """Pick a random User-Agent and set the output path and start URL."""
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}
        self.data = list()
        self.path = "浦东_三房_500_800万.csv"
        self.url = "https://sh.lianjia.com/ershoufang/pudong/a3p5/"

    @staticmethod
    def _first_match(pattern, text, default="未知"):
        """Return the first match of ``pattern`` in ``text``, else ``default``."""
        match = pattern.search(text)
        return match.group() if match else default

    @classmethod
    def _parse_house_info(cls, house_info):
        """Split a house-info string into the fields stored for a listing.

        Example input:
        ``2室1厅 | 74.14平米 | 南 | 精装 | 高楼层(共6层) | 1999年建 | 板楼``

        Returns:
            dict with the keys ``bedroom``, ``area``, ``direction``,
            ``floor`` and ``year``. ``floor`` and ``year`` fall back to
            "未知" when no number is found.
        """
        fields = house_info.split(" | ")
        # Pad short strings so a missing trailing field does not raise IndexError.
        fields.extend([""] * max(0, 6 - len(fields)))
        return {
            'bedroom': fields[0],
            'area': fields[1],
            'direction': fields[2],
            'floor': cls._first_match(cls._FLOOR_PATTERN, fields[4], cls._UNKNOWN),
            'year': cls._first_match(cls._YEAR_PATTERN, fields[5], cls._UNKNOWN),
        }

    def get_max_page(self):
        """Fetch the first result page and read the total page count.

        Returns:
            The ``totalPage`` value from the pagination div's ``page-data``
            attribute, or ``None`` when the response status is not 200.
        """
        # Local import: this listing's import header does not include json.
        import json

        # Changed line 1 of 2: httpx instead of requests.
        response = httpx.get(self.url, headers=self.headers)
        if response.status_code != 200:
            print("请求失败 status:{}".format(response.status_code))
            return None
        # Build a Selector from the page text.
        selector = Selector(text=response.text)
        # The pagination div carries the page count.
        a = selector.css('div[class="page-box house-lst-page-box"]')
        # './@page-data' reads the attribute of the matched div itself; the
        # original '//@page-data' searched from the document root instead.
        # page-data holds a JSON string; json.loads replaces the original
        # eval() so text received from the network is never executed.
        max_page = json.loads(a[0].xpath('./@page-data').get())["totalPage"]
        print("最大页码数:{}".format(max_page))
        return max_page

    def parse_page(self):
        """Visit every result page and append one dict per listing to ``self.data``.

        Does nothing when the page count could not be determined. A listing
        item with fewer than two position links raises IndexError.
        """
        max_page = self.get_max_page()
        if max_page is None:
            # get_max_page has already reported the failed request.
            return
        for i in range(1, max_page + 1):
            url = self._PAGE_URL.format(i)
            # Changed line 2 of 2: httpx instead of requests.
            response = httpx.get(url, headers=self.headers)
            selector = Selector(text=response.text)
            ul_list = selector.css('ul.sellListContent')
            if not ul_list:
                print("No listing container on page {} (status:{})".format(
                    i, response.status_code))
                continue
            for li in ul_list[0].css('li'):
                detail = dict()
                detail['title'] = li.css('div.title a::text').get()
                # .get() returns None when nothing matches; treat as empty text.
                house_info = li.css('div.houseInfo::text').get() or ""
                detail.update(self._parse_house_info(house_info))
                # Two links: community name, then location.
                position_info = li.css('div.positionInfo a::text').getall()
                detail['house'] = position_info[0]
                detail['location'] = position_info[1]
                # e.g. "650万" -> "650"
                total_price = li.css('div.totalPrice span::text').get() or ""
                detail['total_price'] = self._first_match(
                    self._PRICE_PATTERN, total_price, self._UNKNOWN)
                # e.g. "单价64182元/平米" -> "64182"
                unit_price = li.css('div.unitPrice span::text').get() or ""
                # NOTE(review): the source listing is cut off after the
                # unit-price lookup; the two statements below follow the
                # requests + parsel listing, which the article describes as
                # otherwise identical.
                detail['unit_price'] = self._first_match(
                    self._PRICE_PATTERN, unit_price, self._UNKNOWN)
                self.data.append(detail)

    # NOTE(review): write_csv_file and the __main__ driver are missing from
    # the truncated source; presumably the same as in the requests + parsel
    # listing - confirm against the full article.

688IT编程网

python爬虫请求库httpx和parsel解析库的使用测评

发表评论

推荐文章

随机森林算法介绍及R语言实现

基于随机森林优化的神经网络算法在冬小麦产量预测中的应用研究_百度文 ...

基于正则化贪心森林算法的情感分析方法研究

随机森林算法和grandientboosting算法

基于随机森林的图像分类算法研究

热门文章

随机森林算法的改进方法

基于随机森林算法的风险预警模型研究

Python中的随机森林算法详解

随机森林发展历史

如何使用随机森林进行时间序列数据模式识别(八)

随机森林回归模型原理

如何使用随机森林进行时间序列数据模式识别(六)

如何使用随机森林进行时间序列数据预测(四)

如何使用随机森林进行异常检测(六)

随机森林算法和grandientboosting算法 -回复

随机森林方法总结全面

随机森林算法原理和步骤

随机森林的原理

随机森林重要性

随机森林算法

机器学习中随机森林的原理

随机森林算法原理

使用计算机视觉技术进行动物识别的技巧

基于crf命名实体识别实验总结

transformer预测模型训练方法

最新文章

随机森林算法介绍及R语言实现

基于随机森林优化的神经网络算法在冬小麦产量预测中的应用研究_百度文 ...

基于正则化贪心森林算法的情感分析方法研究

随机森林算法和grandientboosting算法

基于随机森林的图像分类算法研究

随机森林结合直接正交信号校正的模型传递方法

标签列表

688IT编程网

python爬虫请求库httpx和parsel解析库的使用测评

发表评论

推荐文章

随机森林算法介绍及R语言实现

基于随机森林优化的神经网络算法在冬小麦产量预测中的应用研究_百度文 ...

基于正则化贪心森林算法的情感分析方法研究

随机森林算法和grandientboosting算法

基于随机森林的图像分类算法研究

热门文章

随机森林算法的改进方法

基于随机森林算法的风险预警模型研究

Python中的随机森林算法详解

随机森林发展历史

如何使用随机森林进行时间序列数据模式识别(八)

随机森林回归模型原理

如何使用随机森林进行时间序列数据模式识别(六)

如何使用随机森林进行时间序列数据预测(四)

如何使用随机森林进行异常检测(六)

随机森林算法和grandientboosting算法 -回复

随机森林方法总结全面

随机森林算法原理和步骤

随机森林的原理

随机森林 重要性

随机森林算法

机器学习中随机森林的原理

随机森林算法原理

使用计算机视觉技术进行动物识别的技巧

基于crf命名实体识别实验总结

transformer预测模型训练方法

最新文章

随机森林算法介绍及R语言实现

基于随机森林优化的神经网络算法在冬小麦产量预测中的应用研究_百度文 ...

基于正则化贪心森林算法的情感分析方法研究

随机森林算法和grandientboosting算法

基于随机森林的图像分类算法研究

随机森林结合直接正交信号校正的模型传递方法

标签列表

随机森林重要性