基于Python的爬虫(一)
一.爬虫的基本流程:
# 1、发起请求:
  使用http库向目标站点发起请求,即发送一个Request
  Request包含:请求头、请求体等
# 2、获取响应内容
  如果服务器能正常响应,则会得到一个Response
  Response包含:html,json,图片,视频等
# 3、解析内容
  解析html数据:正则表达式,第三方解析库如Beautifulsoup,pyquery等
  解析json数据:json模块
  解析二进制数据:以b的方式写入文件
# 4、保存数据
  数据库
  文件
二.我们来爬一个校花网
import requests
import re
# Crawler in three steps: fetch -> parse -> save
# Step 1: send the request
def get_page(url):
    """GET *url* and return the response body as html text.

    NOTE(review): reconstructed from a garbled paste — the `requests.get`
    call and the `.text` return value were stripped by extraction.
    """
    index_res = requests.get(url)
    return index_res.text
# Step 2: parse the data
# Parse the index (listing) page
def parse_index(index_page):
    """Yield absolute detail-page urls scraped from the index html.

    Relative hrefs are prefixed with the site root so every yielded url
    is fully qualified.
    """
    detail_urls = re.findall('<div class="items">.*?href="(.*?)"', index_page, re.S)
    for detail_url in detail_urls:
        if not detail_url.startswith('http'):
            # the paste lost the scheme and the .com suffix here
            detail_url = 'http://www.xiaohuar.com' + detail_url
        yield detail_url
# Parse a detail page
def parse_detail(detail_page):
    """Yield the media src url from a detail page if it is an .mp4 file.

    Only the first match is considered; non-mp4 sources are ignored.
    """
    video_urls = re.findall('id="media".*?src="(.*?)"', detail_page, re.S)
    if video_urls:
        video_url = video_urls[0]
        # `dswith` in the paste was a truncated `.endswith`
        if video_url.endswith('.mp4'):
            yield video_url
# Step 3: save the data
import uuid


def save_video(video_url):
    """Download *video_url* and store it under a random uuid4 .mp4 name.

    Best-effort: any failure is swallowed so one broken video does not
    abort the whole crawl. NOTE(review): consider at least logging the error.
    """
    try:
        res = requests.get(video_url)
        with open(r'D:\pachong\movies\%s.mp4' % uuid.uuid4(), 'wb') as f:
            f.write(res.content)
            f.flush()
    except Exception:
        pass
if __name__ == '__main__':
    # crawl the first 5 listing pages, then every detail page on each
    base_url = 'http://www.xiaohuar.com/list-3-{}.html'
    for line in range(5):
        index_url = base_url.format(line)
        index_page = get_page(index_url)
        detail_urls = parse_index(index_page)
        for detail_url in detail_urls:
            detail_page = get_page(detail_url)
            video_urls = parse_detail(detail_page)
            for video_url in video_urls:
                save_video(video_url)
并发版:
# pip3 install requests
import requests
import re
from concurrent.futures import ThreadPoolExecutor
# Shared thread pool driving the whole crawl; 50 workers for I/O-bound fetches.
pool = ThreadPoolExecutor(50)
# Crawler in three steps: fetch -> parse -> save
# Step 1: send the request
def get_page(url):
    """GET *url* and return the html text (runs inside the thread pool).

    NOTE(review): reconstructed — the `requests.get` call and `.text`
    return value were stripped by extraction.
    """
    print('%s GET start ...' % url)
    index_res = requests.get(url)
    return index_res.text
# Step 2: parse the data
# Parse the index (listing) page
def parse_index(index_page):
    """Done-callback for a finished index-page download.

    *index_page* is the Future returned by pool.submit; its result is the
    html text. Each detail url found is submitted back to the pool with
    parse_detail chained as the next done-callback.
    """
    # grab the html text out of the completed Future
    res = index_page.result()
    detail_urls = re.findall('<div class="items">.*?href="(.*?)"', res, re.S)
    for detail_url in detail_urls:
        if not detail_url.startswith('http'):
            # the paste lost the scheme and the .com suffix here
            detail_url = 'http://www.xiaohuar.com' + detail_url
        pool.submit(get_page, detail_url).add_done_callback(parse_detail)
# Parse a detail page
def parse_detail(detail_page):
    """Done-callback for a finished detail-page download.

    Extracts the media src from the Future's html result and, when it is
    an .mp4 url, submits a save_video job to the pool.
    """
    res = detail_page.result()
    video_urls = re.findall('id="media".*?src="(.*?)"', res, re.S)
    if video_urls:
        video_url = video_urls[0]
        # `dswith` in the paste was a truncated `.endswith`
        if video_url.endswith('.mp4'):
            pool.submit(save_video, video_url)
# Step 3: save the data
import uuid


def save_video(video_url):
    """Download *video_url* and write it to disk under a uuid4 .mp4 name.

    Best-effort: failures are swallowed so one bad video does not kill a
    pool worker. NOTE(review): consider at least logging the error.
    """
    try:
        res = requests.get(video_url)
        with open(r'D:\tank\day01\movies\%s.mp4' % uuid.uuid4(), 'wb') as f:
            f.write(res.content)
            f.flush()
        print('%s done ...' % video_url)
    except Exception:
        pass
if __name__ == '__main__':
    # kick off the 5 listing pages; everything else flows through callbacks
    base_url = 'http://www.xiaohuar.com/list-3-{}.html'
    for line in range(5):
        index_url = base_url.format(line)
        pool.submit(get_page, index_url).add_done_callback(parse_index)
三.requests的基本使用
请求的两种方式:
import requests
from urllib.parse import urlencode

# request url — the query string must be url-encoded by hand in this variant
base_url = 'https://www.baidu.com/s?' + urlencode({"wd": "美女"})
# request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
# GET request
res = requests.get(base_url, headers=headers)
# print(res)          # the Response object
# print(res.text)     # the whole html text
# print(res.content)  # the raw bytes
with open('meinv.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
每次url编码会很麻烦,所以可以在GET内添加params参数即可:
import requests

# request url
base_url = 'https://www.baidu.com/s?'
# request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
# GET request — requests url-encodes the query for us via `params`
res = requests.get(base_url, headers=headers, params={"wd": "黄云"})
with open('小云云.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
get请求访问知乎:
# Visit zhihu's explore page with a browser User-Agent
# request url
zhi_url = 'https://www.zhihu.com/explore'
# request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
# GET request
res = requests.get(zhi_url, headers=headers)
with open('知乎.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
get请求访问github:
# Request headers for a logged-in page — auth is carried entirely by the Cookie
url = 'https://github.com/settings/emails'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    # NOTE(review): the Cookie value was truncated in the paste — restore a full, live session cookie
    'Cookie': 'has_recent_activity=1; _ga=GA1.2.1150787574.1561264746; _octo=GH1.1.800236184.1561264778; _device_id=e38cc770a7f91ac7001f3b1e23185943; user_session=HtVIP7s1AnJA8pBp9PPJN5onsJZ_AJ0mnhXKm-IkGuPYMzDi',
}
# Alternative header set (email page); also truncated in the paste
headers_2 = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    'Cookie': 'has_recent_activity=1; _ga=GA1.2.1150787574.1561264746; _octo=GH1.1.800236184.1561264778; _device_id=e38cc770a7f91ac7001f3b1e23185943; user_session=HtVIP7s1AnJA8pBp9PPJN5onsJZ_AJ0mnhXKm-IkGuPYMzDi',
}
# GET request with the session cookie attached
# res = requests.get(url, headers=headers_2)
res = requests.get(url, headers=headers)
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
# Prints True when logged in (the account's qq number appears on the page)
print('1059239165' in res.text)
# True
2.post请求
import requests
import re

# Step 1: GET https://github.com/login  >>>> scrape the authenticity token
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
login_res = requests.get('https://github.com/login', headers=headers)
# the hidden CSRF token embedded in the login form
authenticity_token = re.findall('name="authenticity_token".*?value="(.*?)"', login_res.text, re.S)[0]
print(
    authenticity_token
)
# Step 2: collect the cookies set by the login page
cookies = {}
# merge the login-page cookies into the dict we will send back
cookies.update(login_res.cookies.get_dict())
print(cookies)
# Step 3: POST the credentials to the session endpoint
# request method: POST
# request url:    https://github.com/session
# request body
form_data = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": authenticity_token,
    "login": "pengsima",
    "password": "oa09116611",
    "webauthn-support": " supported"
}
# json variant (not what github expects — kept for reference):
# requests.post('https://github.com/session', headers=headers, json=form_data)
res = requests.post('https://github.com/session', headers=headers, data=form_data, cookies=cookies)
# print(res.status_code)
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
import requests

baidu = 'https://www.baidu.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
res = requests.get(baidu, headers=headers)
# response status code
print(res.status_code)
print(res)
# response headers
print(res.headers)
# response text and final url
print(res.text)
print(res.url)
# cookies (jar object and plain dict)
print(res.cookies)
print(res.cookies.get_dict())
print(res.encoding)
# res.encoding = 'utf-8'   # force the decode encoding if detection is wrong
# print(res.encoding)
# redirect history
print(res.history)
# raw bytes
print(res.content)
下载⼀张图⽚:
# NOTE(review): the image url was truncated/fused in the paste — restore the full src parameter
bo = ('http://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000'
      '&sec=1551942493340&di=afa19a1f5a3a4fbdec983baaeb1954f0&imgtype=0'
      '&src=http%3A%2F%2Fws%2Fwenyu%2Flxsj%2F201611%2FW020161114')
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
# stream=True: do not load the whole body into memory at once
res = requests.get(bo, headers=headers, stream=True)
with open('bo2.jpg', 'wb') as f:
    for line in res.iter_content():
        # f.write(res.content)   # non-streaming alternative
        f.write(line)
补充:
取消重定向(默认为True):
allow_redirects=False
4.session⽤法:
import requests
import re

# A Session carries cookies across requests automatically
session = requests.session()
# Step 1: GET the login page to scrape the CSRF token
'''
name="authenticity_token" value="/pE5/yY3Ibm1z0CgiSrqZheBOGQl+rPLs491/TOUL0sRIaQFQzS/s/er5eC/xxEO2AGY0l39b0rEStW/A6Bngg=="
'''
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Mobile Safari/537.36',
    # 'Cookies'
}
login_res = session.get('https://github.com/login', headers=headers)
authenticity_token = re.findall('name="authenticity_token".*?value="(.*?)"', login_res.text, re.S)[0]
# Step 2: POST the credentials — no manual cookie handling needed,
# the session re-sends whatever the login page set
# request method: POST
# request url:    https://github.com/session
# request body
form_data = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": authenticity_token,
    "login": "hdjasbfsas",
    "password": "yy9797910",
}
res = session.post('https://github.com/session', headers=headers, data=form_data)
# print('pengsima' in res.text)
print(res.status_code)
# print(res.text)
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
5.json格式反序列化:
import requests
import json

# NOTE(review): the url lost its host prefix in the paste (presumably toutiao.com) — verify
res = requests.get('https://www.toutiao.com/stream/widget/local_weather/city/')
print(res.text)
# two equivalent ways to deserialize the json body
print(json.loads(res.text))
print(res.json())
补充:
'''
requests advanced usage
for reference only!
'''
import requests

# SSL: raises when the site's certificate cannot be verified
res = requests.get('https://www.xiaohuar.com/')
print(res.text)
# fix 1: skip certificate verification
res = requests.get('https://www.xiaohuar.com/', verify=False)
print(res.text)
# fix 2: additionally silence the InsecureRequestWarning
import urllib3
urllib3.disable_warnings()
res = requests.get('https://www.xiaohuar.com/', verify=False)
print(res.text)
# fix 3: supply a client certificate explicitly
# NOTE(review): the cert path was truncated in the paste — '/path/cert' is a guess
import urllib3
urllib3.disable_warnings()
res = requests.get('https://www.xiaohuar.com/', verify=False,
                   cert=('/path/cert', '/path/key'))
print(res.text)
# Use a proxy
res = requests.get('https://www.baidu.com/', headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'},
    # proxies={
    #     'http': '112.85.130.66:9999',
    #     # 'https': '112.85.130.66:9999',
    # }
    # NOTE(review): for a SOCKS proxy the scheme should be 'socks5://host:port'
    # and requires `pip install requests[socks]` — confirm against docs
    proxies={
        'sock': 'sock://ip:port'
    })
print(res.text)
# Timeout setting — an unreachably small timeout raises requests.exceptions.ConnectTimeout
import requests

res = requests.get('https://www.baidu.com',
                   timeout=0.0001)
print(res.text)
# HTTP basic authentication
import requests
from requests.auth import HTTPBasicAuth

# 'xxx' is a placeholder url for a basic-auth protected endpoint
r = requests.get('xxx', auth=HTTPBasicAuth('user', 'password'))
print(r.status_code)
# Upload a file as multipart/form-data
import requests

# NOTE(review): the host was stripped in the paste — httpbin's /post echo endpoint fits the path
files = {'file': open('a.jpg', 'rb')}
response = requests.post('http://httpbin.org/post', files=files)
print(response.status_code)

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。