Python libraries: bs4 (BeautifulSoup) and Requests
import bs4
print(bs4.__version__)  # current version: 4.5.3 (2017-4-6)
Installing the third-party libraries:

C:\Python3\scripts\> pip install bs4       (installs bs4, i.e. BeautifulSoup)
C:\Python3\scripts\> pip install html5lib  (installs html5lib, an HTML5 parser that BeautifulSoup can use)
Open the local file zzzzz.html and parse it with BeautifulSoup:
from urllib import request
from bs4 import BeautifulSoup
import html5lib  # HTML5 parser

url = 'file:///C:/Python3/zz/zzzzz.html'
resp = request.urlopen(url)
html_doc = resp.read()

# Parse the document. 'lxml' is the parser; alternatives include 'html.parser', 'xml' and 'html5lib'.
soup = BeautifulSoup(html_doc, 'lxml')

print(soup.prettify())                    # output with standard indentation
print(soup.title)                         # the <title> tag
print(soup.title.string)                  # the text of the <title> tag
print(soup.find(id="div111"))             # find by id
print(soup.find(id="div111").get_text())  # all the text inside that tag
print(soup.find("p", {"class": "p444"}))  # find a <p> tag by class
print(soup.select('.p444'))               # CSS selector (returns a list)
for tag1 in soup.select('.p444'):
    print(tag1.string)
print(soup.select('.div2 .p222'))         # CSS selector, nested
print(soup.findAll("a"))                  # all <a> tags
for link in soup.findAll("a"):
    print(link.get("href"))
    print(link.string)
Using a regular expression:
import re

# Match <a> tags whose href contains "baidu.com". The original pattern was
# garbled by extraction; r"baidu\.com" is the assumed reconstruction.
data = soup.findAll("a", href=re.compile(r"baidu\.com"))
for tag22 in data:
    print(tag22.get("href"))
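For reference, here is a minimal self-contained sketch of the same calls run against an inline HTML snippet; the ids and classes mirror the examples above, and the HTML itself is made up for illustration:

from bs4 import BeautifulSoup

# A made-up document reusing the ids/classes from the examples above.
html_doc = """
<html><head><title>demo page</title></head>
<body>
  <div id="div111"><p class="p444">hello</p><p class="p444">world</p></div>
  <div class="div2"><p class="p222">nested</p></div>
  <a href="http://www.baidu.com/x">link1</a>
  <a href="http://example.org/y">link2</a>
</body></html>
"""

soup = BeautifulSoup(html_doc, 'html.parser')    # stdlib parser, no extra install
print(soup.title.string)                         # demo page
print(soup.find(id="div111").get_text())         # helloworld
print([t.string for t in soup.select('.p444')])  # ['hello', 'world']
print(soup.select('.div2 .p222')[0].string)      # nested
for link in soup.findAll("a"):
    print(link.get("href"), link.string)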
Exercise 1: parse a web page
Since I could not sort out the encoding problems on Win7, I settled for a standards-compliant HTML5 page for now. I used Liao Xuefeng's Python tutorial page for practice, scraping the table of contents on the left.
# -*- coding: utf-8 -*-
from urllib import request
from bs4 import BeautifulSoup
import html5lib  # HTML5 parser

url = "https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000"
resp = request.urlopen(url)
html_doc = resp.read()

# Parse the page. 'lxml' is the parser; alternatives: 'html.parser', 'xml', 'html5lib'.
soup = BeautifulSoup(html_doc, 'html.parser')
# soup = BeautifulSoup(html_doc, 'lxml')
# print(soup.prettify())  # output with standard indentation

f = open("c:\\Python3\\zz\\0.txt", "w+")
for tag1 in soup.select('.x-sidebar-left-content li a'):
    # ss = tag1.get_text()
    ss = tag1.string
    ss2 = tag1.get("href")
    print(ss, "---", "https://www.liaoxuefeng.com", ss2)
f.close()
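The Win7 encoding trouble mentioned above usually goes away if the output file is opened with an explicit encoding. A small variation of the loop (same selector, example output path) that actually writes the results to disk:

# Write the scraped table of contents to disk with an explicit encoding.
with open("c:\\Python3\\zz\\0.txt", "w", encoding="utf-8") as f:
    for tag1 in soup.select('.x-sidebar-left-content li a'):
        f.write("%s --- https://www.liaoxuefeng.com%s\n" % (tag1.string, tag1.get("href")))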
2017-10-18:
# python 3.6.0
import requests  # 2.18.4
import bs4       # 4.6.0
import html5lib

url = "https://www.bootcdn.cn/"
# url = "https://www.bootcdn.cn/all/"
headers = {'User-Agent': 'mozilla/5.0 (windows nt 6.1; wow64) applewebkit/537.36 (khtml, like gecko) chrome/61.0.3163.100 safari/537.36'}
r = requests.get(url, headers=headers)  # or simply: r = requests.get(url)
print(r.encoding)     # the response encoding
print(r.status_code)  # the HTTP status code

# 'lxml' is the parser; alternatives include 'html.parser', 'xml' and 'html5lib'.
soup = bs4.BeautifulSoup(r.content.decode("utf-8"), "lxml")
# soup = bs4.BeautifulSoup(r.content, "html5lib")
# aa = soup.decode("UTF-8", "ignore")
# print(soup.prettify())  # output with standard indentation
# Parse the data into a dict (package name -> star count).
element = soup.select('.packages-list-container .row')
starsList = {}
for item in element:
    # print(item.select("h4.package-name"))
    # print(item.select(".package-extra-info span"))
    # The key/value expressions were lost from the original; the selectors in
    # the comments above suggest name and star count, reconstructed as follows:
    name = item.select("h4.package-name")[0].get_text(strip=True)
    stars = item.select(".package-extra-info span")[0].get_text(strip=True)
    starsList[name] = stars
print(starsList)
# Save the dict to a text file.
import time
from datetime import datetime

try:
    f = open('1.txt', 'a+')
    t2 = datetime.fromtimestamp(float(time.time()))
    f.write('\n' + str(t2))
    f.write('\n' + str(starsList))
finally:
    if f:
        f.close()
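str(starsList) is awkward to read back later; if round-tripping matters, the standard-library json module is a drop-in alternative, sketched here (the file name is an example):

import json

# Write the dict as JSON so it can be re-loaded with json.load().
with open('stars.json', 'w', encoding='utf-8') as f:
    json.dump(starsList, f, ensure_ascii=False, indent=2)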
Scraping Liao Xuefeng's Python tutorial (first parse the left-hand table of contents with bs4, collect the links into a dict, save it to a text file, then fetch each page). There are 123 entries in total, but I only managed to pull down 28 files.
import requests
import bs4
import urllib

url = "https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000"
# r = requests.get(url)  # without a User-Agent header the site refuses the crawl
headers = {'User-Agent': 'mozilla/5.0 (windows nt 6.1; wow64) applewebkit/537.36 (khtml, like gecko) chrome/61.0.3163.100 safari/537.36'}
r = requests.get(url, headers=headers)
soup = bs4.BeautifulSoup(r.content.decode("utf-8"), "lxml")

# Build the dict {title: url} and save it to a text file.
f = open("c:\\Python3\\zz\\liaoxuefeng\\a.txt", "w+")
mylist = soup.select('#x-wiki-index .x-wiki-index-item')
myhrefdict = {}
for item in mylist:
    myhrefdict[item.string] = "https://www.liaoxuefeng.com" + item["href"]
f.write(str(myhrefdict))
f.close()

# Fetch the files.
i = 0
for key, val in myhrefdict.items():
    i += 1
    name = str(i) + '_' + key + '.html'
    link = val
    print(link, name)
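The loop above only prints each link and target file name. A minimal sketch of the missing download step (reusing requests and headers from above; note that chapter titles may contain characters that are illegal in Windows file names):

# Download each chapter into the same folder as a.txt.
i = 0
for key, val in myhrefdict.items():
    i += 1
    resp = requests.get(val, headers=headers)
    if resp.status_code == 200:
        fname = "c:\\Python3\\zz\\liaoxuefeng\\" + str(i) + '_' + key + '.html'
        with open(fname, 'wb') as out:
            out.write(resp.content)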
The Requests library:  2017-10-30
requests.get(url, params=None, **kwargs): the main method for fetching an HTML page; corresponds to HTTP GET
requests.head(url, **kwargs): fetch only a page's headers; corresponds to HTTP HEAD
requests.post(url, data=None, json=None, **kwargs): submit a POST request to a page; corresponds to HTTP POST
requests.put(url, data=None, **kwargs): submit a PUT request to a page; corresponds to HTTP PUT
requests.patch(url, data=None, **kwargs): submit a partial-modification request; corresponds to HTTP PATCH
requests.delete(url, **kwargs): submit a delete request to a page; corresponds to HTTP DELETE
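A quick sketch exercising a few of these methods; httpbin.org is a public echo service used purely for illustration:

import requests

r = requests.get("https://httpbin.org/get", params={"q": "bs4"})
print(r.status_code, r.url)           # 200 https://httpbin.org/get?q=bs4

r = requests.head("https://httpbin.org/get")
print(r.headers.get("Content-Type"))  # headers only, no body is fetched

r = requests.post("https://httpbin.org/post", data={"name": "value"})
print(r.json()["form"])               # {'name': 'value'}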
Proxies:  2018-2-5
import requests

proxies = {
    "http": "10.10.1.10:3128",
    "https": "10.10.1.10:1080",
}
requests.get("aaa", proxies=proxies)  # "aaa" stands in for the target URL, as in the original note
...
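If the proxy requires authentication, requests accepts credentials embedded in the proxy URL (user, pass and the address below are placeholders):

proxies = {
    "http": "http://user:pass@10.10.1.10:3128/",  # placeholder credentials
}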
