# Python爬虫: Q房网房源信息 (Qfang housing-listing scraper)
#爬⾍项⽬采取xpath解析
#爬取Q房源⽹的详情信息并保存为csv⽂件
#爬取具体内容有:"⼩区名称", "户型", "⾯积", "装修", "楼层", "朝向",
# "售价", "总价/万", "详情"
# 1. 导入模块 (import modules)
import requests
import time
from lxml import etree
import csv
# 2. 定义spider_page()函数爬取并返回页面信息
def spider_page(url):
    """Fetch the listing page at *url* and return its HTML text.

    Sends a browser-like User-Agent so the site serves the normal page,
    then sleeps two seconds to throttle request rate.

    NOTE(review): the cookie value was truncated in the source listing;
    it is session-specific and likely needs refreshing from a real
    browser session — confirm before relying on it.
    """
    headers = {
        'user-agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/70.0.3538.110 Safari/537.36'),
        'upgrade-insecure-requests': '1',
        'cookie': ('acw_tc=df6fef1a15477176336286817eeb02a7224b5ac26463f80afbe8cf7952; '
                   'qchatid=d59ef744-850a-427b-9340-264de69f268b; WINDOW_D'),
    }
    # Bug fix: the original assigned the tuple-like expression
    # `(url, headers=headers)` without ever calling requests.get,
    # and never returned the response body to the caller.
    response = requests.get(url, headers=headers)
    time.sleep(2)  # 延迟两秒时间 — be polite between requests
    return response.text
# 3. 创建csv保存函数
def csv_data(item):
    """Append one row (*item*, a sequence of field values) to fangwo_info.csv.

    Opens the file in append mode so successive calls accumulate rows;
    newline='' stops the csv module from emitting blank lines on Windows.
    """
    with open('fangwo_info.csv', 'a+', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(item)
# 4. 解析页面所需内容
def paser_info(url):
    """Download one listing page and write every house entry to the CSV.

    Each <li> under the #cycleListings element holds one listing; the
    positional span[2]/span[4]/... indices mirror the site's markup at
    scrape time. NOTE(review): these indices break silently if the page
    layout changes — verify against the live page.
    """
    html = spider_page(url)
    selector = etree.HTML(html)  # 以构造器的形式返回 — parse raw HTML into a tree
    house_infos = selector.xpath('//*[@id="cycleListings"]/ul/li')
    for house_info in house_infos:
        # The title looks like "<小区名称> <详情>"; split once on the first space.
        title = house_info.xpath('./div[1]/p[1]/a/text()')[0]
        parts = title.split(' ', 1)
        name = parts[0]
        # Robustness fix: the original indexed parts[1] unconditionally and
        # raised IndexError whenever the title contained no space.
        xiangq = parts[1] if len(parts) > 1 else ''
        style = house_info.xpath('./div[1]/p[2]/span[2]/text()')[0]      # 户型
        area = house_info.xpath('./div[1]/p[2]/span[4]/text()')[0]       # 面积
        decotored = house_info.xpath('./div[1]/p[2]/span[6]/text()')[0]  # 装修
        louceng = house_info.xpath('./div[1]/p[2]/span[8]/text()')[0].strip()  # 楼层
        chaoxiang = house_info.xpath('./div[1]/p[2]/span[10]/text()')[0]  # 朝向
        total = house_info.xpath('./div[2]/span[1]/text()')[0]            # 总价/万
        price = house_info.xpath('./div[2]/p/text()')[0]                  # 售价
        info = [name, style, area, decotored, louceng, chaoxiang, price, total, xiangq]
        csv_data(info)
        print("正在爬取", name)
# 5. 创建主函数
def main():
    """Write the CSV header row, then scrape listing pages 1 through 9."""
    # Header row; field order matches the rows written by paser_info().
    # (Source had this list and the csv_data() call fused onto one line.)
    info_title = ["名称", "户型", "面积", "装修", "楼层", "朝向", "售价", "总价/万", "详情"]
    csv_data(info_title)
    # Bug fix: the source template 'shenzhen.qfang/sale/f%s' lost its
    # scheme and TLD; qfang.com is the Q房网 domain — confirm against site.
    urls = ['https://shenzhen.qfang.com/sale/f%s' % x for x in range(1, 10)]
    for url in urls:
        paser_info(url)
# 6. 调用函数运行
# Entry point: run the scraper only when executed as a script.
if __name__ == '__main__':
    main()
# 最后爬取结果如下: (scraped results were shown in the original article)
# (The following lines are website boilerplate from the scraped page, kept as comments:)
# 版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。
# 发表评论