python解析html文件,提取标签中一个元素
1、对于本地html文件
# -*- coding: utf-8 -*-
# Parse a local HTML file with BeautifulSoup and save the href of every
# matched <a> tag to a text file, one link per line.
from bs4 import BeautifulSoup

# NOTE(review): the output file name was empty in the original listing
# (open('') always fails); 'links.txt' is a placeholder -- set as needed.
OUTPUT_PATH = 'links.txt'

# Load the document. The `with` block closes the file on exit, so no
# explicit close() call is needed afterwards.
with open('test.html', 'r', encoding='utf-8') as wb_data:
    Soup = BeautifulSoup(wb_data, 'lxml')  # hand the open file to the parser

print(Soup)  # dump everything that was read into Soup
print("!--------------\n")

# CSS selector path of the tags to extract (it can be copied straight from
# the browser's developer tools).
shot_name = Soup.select('body > div > div > table > tbody > tr > td > a')
# Alternative selector path:
# shot_name = Soup.select('body > div > div > div > ol > li > a')

# Unpack the result so that `sep` actually separates the matched tags.
print(*shot_name, sep='\n!!---------------\n')

# Extract tag content: get_text() returns the text of a tag, get('<attr>')
# returns the value of one of its attributes.
links = []
for shot in shot_name:
    href = shot.get('href')
    if href is None:
        # An <a> tag without an href attribute has nothing to record.
        continue
    # Trim leading/trailing slashes and backslashes from the link.
    links.append(href.strip('\\/'))

# writelines() expects an iterable of strings and adds no newlines itself.
with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    f.writelines(link + '\n' for link in links)
2、对于网页
# -*- coding: utf-8 -*-
# Download a web page with requests and extract link tags from it with
# BeautifulSoup, using CSS selector paths copied from the browser.
from bs4 import BeautifulSoup
import requests

# NOTE(review): the URL in the original listing was truncated
# ('hao.360/?a1004') and had no scheme, which requests rejects. The host
# below is presumed -- confirm before use.
url = 'https://hao.360.cn/?a1004'

# Fetch the page; the timeout keeps an unresponsive server from hanging
# the script, and raise_for_status() stops on an HTTP error response
# instead of parsing an error page.
wb_data = requests.get(url, timeout=10)
wb_data.raise_for_status()
soup = BeautifulSoup(wb_data.text, 'lxml')  # make the response body parseable

# Select page elements by CSS path (copied from the site); links are used
# as the example here.
# Copied paths: #famous-section > ul.learfix > li:nth-child(7) > a
#               #famous-section > ul.learfix > li:nth-child(1) > a
# Dropping ':nth-child(n)' matches the link in every <li> with a single
# query, so the result does not need to be appended to itself again.
# NOTE(review): 'learfix' may be a typo for 'clearfix' -- confirm against
# the actual page markup.
url_famous = soup.select('#famous-section > ul.learfix > li > a')
print(url_famous)

# Copied path: #focus_news > ul > li:nth-child(1) > a
url_focus = soup.select('#focus_news > ul > li > a')
print(url_focus)

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。