Java 做爬虫解决 521 错误
最近做爬虫时碰到了521错误。5开头的状态码(5xx)都是服务器端错误,而521错误有很大可能是请求头参数不对,比如下面这个:
这是错误的
这是正确的
就是这一堆请求头里可能少了某些字段,或者某些字段有误;而在爬虫中遇到这个错误,又极大可能是少了Cookie参数,比如在这里就是少了Cookie参数。展开那个Cookie:
Cookie: yd_cookie=a26be905-40bb-4e4b52bf7b24f2580a068ce65463cbf5a91d; _ydclearance=36aa3e07d8ff72cc2c52b076-2862-4bac-b1b0-9850d839a
可以看到就是两个字段,一个yd_cookie,还有一个_ydclearance。那不就是少这两个参数嘛。yd_cookie可以看到在第一次错误的响应头中已经有了:
Set-Cookie: yd_cookie=a26be905-40bb-4e4b52bf7b24f2580a068ce65463cbf5a91d; Expires=1544419245; Path=/; HttpOnly
那只要用正则截取一下就好了,这样就只剩下一个_ydclearance。这个怎么解决呢?首先用各种拦截请求的工具(我用的是Firefox)直接看请求:
可以看到实际上请求了两次,第一次就是521错误,第二次才是200的正确返回。
看下521错误的返回
<html><body><script language = "javascript">
function fp(PD) {
var qo, mo = "",
no = "",
oo = [0x9a, 0x6f, 0x28, 0xea, 0xe9, 0xeb, 0x70, 0x71, 0xd3, 0xf4, 0x5d, 0x20, 0x1e, 0x60, 0xa2, 0x64, 0xe5, 0xe8, 0xca, 0xd2, 0x1c, 0x58, 0xd7, 0xfa, 0x19, 0x  qo = "qo=234; do{oo[qo]=(-oo[qo])&0xff; oo[qo]=(((oo[qo]>>1)|((oo[qo]<<7)&0xff))-141)&0xff;} while(--qo>=2);";
eval(qo);
qo = 233;
do {
oo[qo] = (oo[qo] - oo[qo - 1]) & 0xff;
} while (--qo >= 3);
qo = 1;
for(;;) {
if(qo > 233) break;
oo[qo] = ((((((oo[qo] + 197) & 0xff) + 240) & 0xff) << 4) & 0xff) | (((((oo[qo] + 197) & 0xff) + 240) & 0xff)
>> 4);
qo++;
}
po = "";
for(qo = 1; qo < oo.length - 1; qo++)
if(qo % 6) po += String.fromCharCode(oo[qo] ^ PD);
eval("qo=eval;qo(po);");
}
</script> </body></html>
一堆像密码一样的JS代码,不用管它,看最后一句:
eval("qo=eval;qo(po);");
这句绕来绕去实际上就是eval(po)。把这一句改成return po,让函数直接返回解码后的字符串,再补上一次调用fp(38),提取出这个方法:
// Decodes the obfuscated byte table `oo` into a script string and returns it.
// Compared with the fp() in the 521 response, the final eval("qo=eval;qo(po);")
// has been replaced by `return po`, so the decoded text is returned, not executed.
// PD: integer key that every kept byte is XOR-ed with when the output is built.
function fp(PD) {
var qo, mo = "",
no = "",
// NOTE(review): the byte table on the next line is cut off in this listing (it stops at "0x") and the following `qo = "..."` assignment is fused onto the same line; restore the full table from a live 521 response before running this.
oo = [0x9a, 0x6f, 0x28, 0xea, 0xe9, 0xeb, 0x70, 0x71, 0xd3, 0xf4, 0x5d, 0x20, 0x1e, 0x60, 0xa2, 0x64, 0xe5, 0xe8, 0xca, 0xd2, 0x1c, 0x58, 0xd7, 0xfa, 0x19, 0x  qo = "qo=234; do{oo[qo]=(-oo[qo])&0xff; oo[qo]=(((oo[qo]>>1)|((oo[qo]<<7)&0xff))-141)&0xff;} while(--qo>=2);";
// First pass: run the string above as code; it negates each byte, rotates it right by one bit and subtracts 141, for indices 234 down to 2.
eval(qo);
qo = 233;
// Second pass: subtract the preceding element from each element, for indices 233 down to 3.
do {
oo[qo] = (oo[qo] - oo[qo - 1]) & 0xff;
} while (--qo >= 3);
qo = 1;
// Third pass over indices 1..233: add 197 and then 240 (each masked to 8 bits), then swap the high and low nibbles.
for(;;) {
if(qo > 233) break;
oo[qo] = ((((((oo[qo] + 197) & 0xff) + 240) & 0xff) << 4) & 0xff) | (((((oo[qo] + 197) & 0xff) + 240) & 0xff) >> 4);
qo++;
}
po = "";
// Build the output string: indices divisible by 6 are skipped, every other byte is XOR-ed with PD.
for(qo = 1; qo < oo.length - 1; qo++)
if(qo % 6) po += String.fromCharCode(oo[qo] ^ PD);
return po;
}
// Invoke the decoder with key 38, the value used in this walkthrough.
fp(38);
把它放到浏览器控制台中执行一下,得到如下输出:
"kie='_ydclearance=36aa3e07d8ff72cc2c52b076-2862-4bac-b1b0-9850d839acc0-1544419245; expires=Mon, 10-Dec-18 05:20:45 GMT; do
_ydclearance终于出来了,现在只要把_ydclearance和yd_cookie拼起来,设置到请求头的Cookie中就可以了。
JAVA代码
public class HandleCrawler {
public static void setCookie() throws IOException, ScriptException {
CloseableHttpResponse response = ApacheHttpUtil.sendGet(Constant.proxyUrl);
StatusLine().getStatusCode()==521) {
String yd_cookie = AllHeaders());
ProxyRequest.logger.info("yd_cookie is :"+yd_cookie);
HttpEntity entity = Entity();
String String(entity,"utf-8");
String runString = getRunString(html);
String fuction = html.substring(html.indexOf("function")).replace("</script> </body></html>",runString+";").replace("eval(\"qo=eval;qo(po);\")","return po");            ProxyRequest.logger.info("fuction is :"+fuction);
ScriptEngineManager m = new ScriptEngineManager(); //获取JavaScript执⾏引擎
ScriptEngine engine = m.getEngineByName("JavaScript"); //执⾏JavaScript代码
String origin = (String) engine.eval(fuction);
ProxyRequest.logger.info("origin ydclearance is :"+origin);
String ydclearance = getYdclearance(origin);
ProxyRequest.logger.info("ydclearance is :"+ydclearance);
Constant.COOKIE = "yd_cookie="+yd_cookie+"; _ydclearance="+ydclearance;
}
}
private static String getYdCookie(Header[] headers){
String yd_cookie = null;
for(Header header:headers){
if (Name().equals("Set-Cookie")){
yd_cookie = Value();
}
}
Pattern pattern = Patternpile("(?<=yd_cookie=).+?(?=; Expires=)");
Matcher matcher = pattern.matcher(yd_cookie);
while (matcher.find()){
yd_cookie = up(0);
}
return yd_cookie;
}
private static String getYdclearance(String origin){
String ydclearance = null;
Pattern pattern = Patternpile("(?<=_ydclearance=).+?(?=; expires=)");
Matcher matcher = pattern.matcher(origin);
while (matcher.find()){
ydclearance = up(0);
}
return ydclearance;
}
private static String getRunString(String html){
Pattern pattern = Patternpile("(?<=load=setTimeout\\(\").+?(?=\", 200\\))");
Matcher matcher = pattern.matcher(html);
while (matcher.find()){
up(0);
}
return null;
}
}

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。