关于Python爬取网页返回521状态码的解决方案 — 文章目录
# 项目场景: Python3.8
问题描述:
在使用Python爬虫爬取网页的列表页中的详情页时,返回的详情页的html文件的数据长度有限。
原因分析:
频繁爬取目标网站,触发了目标网站的反爬虫措施。
解决方案:
方法一:
换一个VPN,也就是换一台电脑(另一个出口IP)执行程序。
方法二:
复制目标网页的Headers添加到代码中,
根据目标情况不同修改。
方法三:
两次访问⽬标详情页代码⼀def askURL (url ): head = { # 模拟浏览器头部信息,向⾖瓣服务器发送消息 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Cookie': 'mfw_uuid=61dc38ef-2c67-45ce-ed26-c30fa04f2418; oad_n=a:3:{s:3:"oid";i:1029;s:2:"dm";s:15:"www.mafengwo";s:2:"ft";s:19:"2022-01-10 'Host': 'www.mafengwo', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.107
} # ⽤户代理,表⽰告诉⾖瓣服务器,我们是什么类型的机器、浏览器(本质上是告诉浏览器,我们可以接收什么⽔平的⽂件内容) request = urllib .request .Request (url , headers =head ) html = "" try : response = urllib .request .urlopen (request ) html = response .read ().decode ("utf-8") except urllib .error .URLError as e : if hasattr (e , "code"): print (e .code ) if hasattr (e , "reason"): print (e .reason ) return html }1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
url编码处理17
18
19
20
21
22
23
24
25
26import execjs import requests import re head = { # 模拟浏览器头部信息,向⾖瓣服务器发送消息 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Host': 'www.mafengwo', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.}url = 'www.mafengwo/poi/5423409.html'# response = (url)# # cookie1# cookie1 = kies # # js 代码# js_code = def get_521_content (url ,head ): req = requests .get (url , headers =head ) cookies = req .cookies 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
代码⼆ cookies = '; '.join (['='.join (item ) for item in cookies .items ()]) txt_521 = req .text txt_521 = ''.join (re .findall ('<script>(.*?)</script>', txt_521)) return (txt_521, cookies )def fixed_fun (function ): func_return = function .replace ('eval', 'return') content = execjs pile (func_return ) req = requests .get (url , headers =head ) evaled_func = ''.join (re .findall ('<script>(.*?)</script>', req .text )) # print(js_con) # fn = js_con.split('=').split(' ') # evaled_func = content.call(fn) # print(evaled_func) mode_func = evaled_func .replace ('while(window._phantom||window.__phantomas){};', ''). \ replace ('kie=', 'return').replace (';if((function(){try{return !!window.addEventListener;}', ''). \ replace ("catch(e){return false;}})()){document.addEventListener('DOMContentLoaded',l,false);}", ''). \ replace ("else{document.attachEvent('onreadystatechange',l);}", '').replace ( r"setTimeout('location.href=place(/[\?|&]captcha-challenge/,\'\')',1500);", '') content = execjs pile (mode_func ) cookies = content .call ('l') __jsl_clearance = cookies .split (';')[0] r
eturn __jsl_clearance def cookie_dict (js , id ): dict = {} js = js .split ('=') id = id .split ('=') dict [js [0]] = js [1] dict [id [0]] = id [1] return dict if __name__ == '__main__': func = get_521_content (url ,head ) content = func [0] cookie_id = func [1] cookie_js = fixed_fun (func [0]) dicted_cookie = cookie_dict (cookie_js , cookie_id ) head = { # 模拟浏览器头部信息,向⾖瓣服务器发送消息 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Host': 'www.mafengwo', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.107 'Cookie': cookie_id + ';' + cookie_js } req = requests .get (url , headers =head ) print (req .status_code )2930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687# resouce:blog.csdn/qq_41879417/article/details/101701120?spm=1001.2101.3001.6661.1&utm_medium=distribute.pc_
1
# resouce:blog.csdn/qq_41879417/article/details/101701120?spm=1001.2101.3001.6661.1&utm_medium=distribute.pc_ # -*- coding: utf-8 -*-# @Time : 2022/1/16 9:11# @Author : sherlock # @File : creeper_2_521.py # @Project : creeper import execjs import re import requests url = 'www.mafengwo/poi/5423409.html'head = { # 模拟浏览器头部信息,向⾖瓣服务器发送消息 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Host': 'www.mafengwo', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.}def get_521_content (url ): req = requests .get (url , headers =head , timeout =5) print (req .status_code , req .text ) if req .status_code == 521: cookies = dict (req .cookies .items ()) print (cookies ) js_con = ''.join (re .findall ('<script>(.*?)</script>', req .text )) if js_con : __jsl_clearance = fixed_fun (js_con , url ) if __jsl_clearance : key , value = __jsl_clearance .split ('=') cookies [key ] = value return cookies # 执⾏js 代码获取cookies 的__jsl_clearance 的键值def fixed_fun (js_con , url ): # js_con 第⼀次请求获取的js 内容 func_return = js_con .replace ('eval(', 'return(') print ('第⼀次替换eval==》return 后: ', func_return )
content = execjs pile (func_return ) # fn = js_con.split('=')[0].split(' ')[1] # 只有['kie'] fn = js_con .split ('=')[0].split (' ')[1] evaled_func = content .call (fn ) print ('第⼀次执⾏js 代码后: ', evaled_func ) fn = evaled_func .split ('=')[0].split (' ')[1] # 获取动态函数名 aa = evaled_func .split ("<a href=\\'/\\'>") # 获取<a>标签的内容 aa = aa [1].split ("</a>")[0] if len (aa ) >= 2 else '' mode_func = evaled_func . \ replace ( "setTimeout('location.href=location.pathname+place(/[\\?|&]captcha-challenge/,\\'\\')',1500);kie=", 'return'). \ replace (';if((function(){try{return !!window.addEventListener;}', ''). \ replace ( "}catch(e){return false;}})()){document.addEventListener('DOMContentLoaded'," + fn + ",false)}else{document.attachEvent('onreadystatechange'," ''). \ replace (123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
代码三 replace ( "if((function(){try{return !!window.addEventListener;}catch(e){return false;}})()){document.addEventListener('DOMContentLoaded'," + fn + ",false)}else ''). \ replace ("return'__jsl_clearance", "var window={};return '__jsl_clearance"). \ replace ( "var " + fn + "=ateElement('div');" + fn + ".innerHTML='<a href=\\'/\\'>" + aa + "</a>';" + fn + "=" + fn + ".firstChild.href", "var " + fn + "='" + url + "'") print ('第⼆次替换后的js 代码:', mode_func ) t
ry : content = execjs pile (mode_func ) cookies = content .call (fn ) __jsl_clearance = cookies .split (';')[0] print (__jsl_clearance ) return __jsl_clearance except : print ('js 执⾏错误:', mode_func ) return None # 携带解密后的cookies 第⼆次爬取详情页def con_spider (cookies , url ): response = requests .get (url , headers =head , cookies =cookies , timeout =5) if response .status_code == 200: response .encoding = 'utf-8' print (response .status_code ) print (response .text ) return response else : print ('第⼆次爬取错误状态码:', response .status_code ) return None if __name__ == "__main__": cookies = get_521_content (url ) con_spider (cookies , url )66676869707172737475767778798081828384858687888990919293949596979899100101# resource:wwwblogs/gongs/p/10524710.html import execjs import re import requests url = 'www.mafengwo/poi/5423409.html'head = { # 模拟浏览器头部信息,向⾖瓣服务器发送消息 "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6", "Cache-Control": "max-age=0", "Connection": "keep-alive", "Host": "www.mafengwo", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072}123456789101112131415161718192021222324
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论