Python: How to Set a Proxy IP and Disguise a Crawler as a Browser
1. Disguising a Python crawler as a browser
# import the urllib.request module
import urllib.request
# set the request header
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")
# create an opener
opener = urllib.request.build_opener()
# add the headers to the opener
opener.addheaders = [headers]
# install the opener globally
urllib.request.install_opener(opener)
# open the page with urlopen
data = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
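A side note (my addition, not in the original): installing the opener globally is optional. The opener's own open() method makes the same request without touching global state; a minimal sketch reusing the headers tuple from above, with a placeholder url:
opener = urllib.request.build_opener()
opener.addheaders = [headers]
# open the page directly through this opener; urlopen elsewhere is unaffected
data = opener.open("http://www.example.com").read().decode('utf-8', 'ignore')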
2. Setting a proxy
# define the proxy IP
proxy_addr = "122.241.72.191:808"
# set up the proxy
proxy = urllib.request.ProxyHandler({'http': proxy_addr})
# create an opener
opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
# install the opener globally
urllib.request.install_opener(opener)
# open the page with urlopen
data = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
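To confirm that traffic really goes through the proxy, one can fetch a service that echoes the caller's IP. A sketch (my addition, not in the original), assuming http://httpbin.org/ip as the echo service:
import urllib.request
proxy = urllib.request.ProxyHandler({'http': "122.241.72.191:808"})
opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
urllib.request.install_opener(opener)
# the echoed IP should be the proxy's address, not the local machine's
print(urllib.request.urlopen("http://httpbin.org/ip").read().decode('utf-8', 'ignore'))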
3. Setting a proxy and simulating a browser at the same time
# define the proxy IP
proxy_addr = "122.241.72.191:808"
# create a request
req = urllib.request.Request(url)
# add headers
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")
# set up the proxy
proxy = urllib.request.ProxyHandler({'http': proxy_addr})
# create an opener
opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
# install the opener globally
urllib.request.install_opener(opener)
# open the page with urlopen
data = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
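A common extension of this pattern (my addition, not the author's) is rotating through a pool of proxies so that repeated requests do not all come from one IP. A sketch with a hypothetical proxy list:
import random
import urllib.request
# hypothetical proxy pool; in practice these come from a proxy provider
proxy_pool = ["122.241.72.191:808", "61.135.217.7:80", "118.114.77.47:8080"]
def open_with_random_proxy(url):
    # pick a proxy at random for this request
    proxy = urllib.request.ProxyHandler({'http': random.choice(proxy_pool)})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    req = urllib.request.Request(url)
    req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")
    return opener.open(req).read().decode('utf-8', 'ignore')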
4. Adding multiple fields to the request header
import urllib.request
page_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
    "Host": "www.baidu.com",
    "Cookie": "xxxxxxxx"
}
req = urllib.request.Request(url, headers=page_headers)
data = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
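For reuse across pages, this pattern fits naturally in a small helper; get_page is a hypothetical name of mine, not from the original:
import urllib.request
def get_page(url, headers):
    # build a Request carrying the full header dict and return the decoded body
    req = urllib.request.Request(url, headers=headers)
    return urllib.request.urlopen(req).read().decode('utf-8', 'ignore')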
5. Adding POST request parameters
import urllib.request
import urllib.parse
# set the POST parameters
page_data = urllib.parse.urlencode([
    ('pn', page_num),
    ('kd', keywords)
])
# set the headers
page_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
    'Connection': 'keep-alive',
    'Host': 'www.lagou.com',
    'Origin': 'www.lagou.com',
    'Cookie': 'JSESSIONID=ABAAABAABEEAAJA8F28C00A88DC4D771796BB5C6FFA2DDA; user_trace_token=20170715131136-d58c1f22f6434e9992fc0b35819a572b',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Referer': 'www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98?labelWords=&fromSearch=true&suginput=',
    'X-Anit-Forge-Token': 'None',
    'X-Requested-With': 'XMLHttpRequest'
}
# open the page
req = urllib.request.Request(url, headers=page_headers)
data = urllib.request.urlopen(req, data=page_data.encode('utf-8')).read().decode('utf-8')
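page_num and keywords are assumed to be defined before the urlencode call above; for illustration, placeholder values (my own, not from the original):
page_num = 1 # page number, sent as the 'pn' form field
keywords = '数据挖掘' # search keyword, sent as the 'kd' form field and matching the Referer above
page_data = urllib.parse.urlencode([('pn', page_num), ('kd', keywords)])
# urlencode returns a str; it must be encoded to bytes before use as POST data
post_bytes = page_data.encode('utf-8')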
6. Using PhantomJS to simulate browser requests
# 1. Download PhantomJS, install it locally, and set the environment variable
from selenium import webdriver
bs = webdriver.PhantomJS()
# open the url
bs.get(url)
# get the page source
url_data = bs.page_source
# save the rendered page as an image
bs.get_screenshot_as_file(filename)
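Worth noting (my addition, not in the original): recent Selenium releases have deprecated PhantomJS support, and headless Chrome is the usual replacement. A minimal sketch, assuming a recent Selenium with chromedriver on the PATH and a placeholder url:
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument('--headless') # run Chrome without a visible window
driver = webdriver.Chrome(options=options)
driver.get("http://www.example.com") # placeholder url
url_data = driver.page_source
driver.quit()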
7. Setting the user-agent and cookies in PhantomJS
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = ("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")
bs = webdriver.PhantomJS(desired_capabilities=dcap)
bs.get(url)
# delete all cookies
bs.delete_all_cookies()
# set a cookie
# cookie format: inspect it in the browser; a cookie needs the fields domain, name, value, and path
cookie = {
    'domain': '.www.baidu.com', # note the leading dot
    'name': 'xxxx',
    'value': 'xxxx',
    'path': 'xxxx'
}
# add the cookie to PhantomJS
bs.add_cookie(cookie)
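A cookie added this way is only sent on subsequent requests, so the page is usually reloaded afterwards; a short follow-up sketch (my addition):
# reload the page so the newly added cookie is included in the request
bs.get(url)
url_data = bs.page_source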
8. Using a webdriver tool
# 1. Download a webdriver executable (e.g. chromedriver) and the matching browser
# 2. Put it in some directory, e.g. C:\
from selenium import webdriver
driver = webdriver.Chrome(executable_path="C:\\chromedriver.exe") # adjust to the actual driver location
# open the url
driver.get(url)
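As with the PhantomJS example, the loaded page can then be read and the browser released; a brief usage note (my addition):
# page_source reflects the DOM after JavaScript has run
url_data = driver.page_source
# release the browser process when done
driver.quit()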
That is everything in this post on setting a proxy IP and disguising a crawler as a browser in Python. Hopefully it serves as a useful reference, and thank you for your continued support.