[Problem solved] How to deal with a crawler getting blocked (fetching data from 10jqka / Straight Flush)
Recently I set out to accumulate data from finance websites and settled on the Data Center of 10jqka (Straight Flush Finance).
1. The first attempt was, naturally, the usual requests.Session() approach: spoof the User-Agent, land on the 10jqka home page first, then move into the Data Center to pull the data. The result: I could not even get access that way. I then copied the Cookie from a real browser visit into session.headers, and the data finally came through (a minimal sketch of that setup follows this list);
2. Next came the endless problem of only ever getting the first five pages of data. I tried spoofing a different IP for every page (failed, still only five pages); putting a 60~120 second pause between two requests (helped a little, but access was still refused eventually); finally I even suspected that 10jqka might, by default, refuse every page after the first five you visit, so I requested the 71 pages in a random order. As you can guess, it made no difference whatsoever: after five pages the requests were refused just the same;
3. My conclusion was that the Cookie is the culprit, but I had no way to switch to a fresh Cookie for every request. Stretching the interval between requests even further does help, but if I have to wait two minutes per page I might as well copy the pages by hand;
4. Then selenium came to mind. I used to think that whatever selenium could do, a plain crawler could do just as well, and that having to launch a real browser and render every page makes selenium terribly inefficient: with requests I might scrape 100 pages in the time selenium scrapes 30, so for a long while I regarded selenium as a rather pointless scraping tool. Not this time: against 10jqka, requests.Session() turned out to be a paper tiger, all show and no substance, so I brought selenium back for this crawler;
5. In fact, even with selenium, a single webdriver object still runs into the same roughly five-page limit on 10jqka; but if I create a brand-new webdriver object for every single visit, the limit never kicks in (a stripped-down version of that loop also follows this list; the full script is further below).
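For reference, roughly what step 1 boils down to: a requests.Session() carrying a browser-like User-Agent and a Cookie copied from a real browser visit. The header values below are placeholders; fill in your own.

import requests

# Minimal sketch of step 1: spoofed User-Agent plus a Cookie copied from the browser.
# Both header values are placeholders; paste in the ones from your own browser session.
session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0",
    "Cookie": "v=...; Hm_lvt_...=...",      # copy the full Cookie header from the browser's dev tools
})

session.get("http://www.10jqka.com.cn/")                       # warm up on the home page first
html = session.get("http://data.10jqka.com.cn/funds/ggzjl/").text
print(len(html))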
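And here is the core trick from step 5, stripped to its essence. This is only an illustrative sketch: the table_url argument is assumed to be a paginated URL with a "{}" slot for the page number, and the full class below does the actual work.

from selenium import webdriver

def fetch_pages(table_url, pages):
    """Sketch of step 5: a brand-new Firefox webdriver for every page.

    table_url is assumed to contain a "{}" slot for the page number.
    """
    sources = []
    for page in range(1, pages + 1):
        b = webdriver.Firefox()              # fresh browser, fresh session, fresh cookies
        b.get(table_url.format(page))
        sources.append(b.page_source)        # keep the raw HTML for later parsing
        b.quit()                             # throw the browser away before the next page
    return sources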
Enough talk, straight to the code ↓↓↓
Tips: I use Firefox, so the selenium code here is Firefox-based; for other browsers, replacing Firefox in the code with Chrome (or the like) should be enough. Also, create a folder named with today's date (e.g. 2019-01-27) in the directory containing this .py file before running, or let the small snippet below create it for you.
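If you prefer not to create that folder by hand, a couple of lines like these will do it automatically (a convenience addition, not part of the script below):

import os
import time

# Create a folder named after today's date (e.g. 2019-01-27) next to the script, if missing.
folder = time.strftime("%Y-%m-%d")
os.makedirs(folder, exist_ok=True)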
# -*- coding:UTF-8 -*-
import os
import re
import json
import time
import numpy
import pandas
import random
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
"""
作者:囚⽣CY
平台:CSDN
时间:2019/01/27
转载请注明原作者
创作不易,仅供分享
"""
class StraightFlush():
    def __init__(self,
        userAgent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0",
        cookie="v=AigMPjtB4I7PKMwKRvAplY1u-Rc7UYxbbrVg3-JZdKOWPcYFimFc677FMG4x; log=; \
Hm_lvt_78c58f01938e4d85eaf619eae71b4ed1=1548080158; \
Hm_lpvt_78c58f01938e4d85eaf619eae71b4ed1=1548080196; \
Hm_lvt_60bad21af9c824a4a0530d5dbf4357ca=1548080175; \
Hm_lpvt_60bad21af9c824a4a0530d5dbf4357ca=1548080196; \
Hm_lvt_f79b64788a4e377c608617fba4c736e2=1548080175; \
Hm_lpvt_f79b64788a4e377c608617fba4c736e2=1548080196; \
vvvv=1"
    ):
        """ Constructor: the User-Agent and Cookie can be overridden """
        self.userAgent = userAgent
        self.cookie = cookie
        """ Commonly used fixed parameters """
        self.date = time.strftime("%Y-%m-%d")                   # date on which the class is initialised
        self.workSpace = os.getcwd()                             # working directory
        self.mainURL = "http://www.10jqka.com.cn/"
        self.dataURL = "http://data.10jqka.com.cn/"
        self.stockFlowURL = self.dataURL + "funds/ggzjl/"        # money flow of individual stocks
        self.conceptFlowURL = self.dataURL + "funds/gnzjl/"      # money flow of concept sectors
        self.industryFlowURL = self.dataURL + "funds/hyzjl/"     # money flow of industry sectors
        # Paging suffix appended to the URLs above; "{}" is filled with the page number.
        # The exact value was garbled in the source text, so check it against the site's current URL pattern.
        self.orderSuffix = "field/zdf/order/desc/page/{}/ajax/1/"
        self.stockTableURL = self.stockFlowURL + self.orderSuffix
        self.conceptTableURL = self.conceptFlowURL + self.orderSuffix
        self.industryTableURL = self.industryFlowURL + self.orderSuffix
        self.headers = {
            "User-Agent": self.userAgent,
            "Cookie": self.cookie,
        }
        self.session = requests.Session()
        self.session.headers = self.headers
        """ Initialisation: visit the home page once to warm up the session """
        self.session.get(self.mainURL)
    def parse_money_flow_stock(self):                            # money flow of individual A-share stocks
        self.session.get(self.dataURL)
        html = self.session.get(self.stockFlowURL).text
        soup = BeautifulSoup(html, "lxml")
        spans = soup.find_all("span")
        ths = soup.find_all("th")
        """ Use the strings inside the <th> tags as the CSV header """
        flag = False
        with open(r"{}\{}\money_flow_stock_{}.csv".format(self.workSpace, self.date, self.date), "a") as f:
            for th in ths:
                aLabel = th.find_all("a")
                string = aLabel[0].string if len(aLabel) else th.string
                if flag: f.write(",{}".format(string))
                else:
                    flag = True
                    f.write(str(string))
            f.write("\n")
        """ Walk the <span> tags to find the total number of pages (a span that looks like "1/71") """
        for span in spans:
            string = str(span.string)
            if len(string) > 2 and string[:2] == "1/":
                page = int(string[2:])
                break
        """ Fetch the money-flow table page by page """
        for i in range(1, page + 1):
            b = webdriver.Firefox()                               # a brand-new webdriver for every page
            b.get(self.stockTableURL.format(i))
            html = b.page_source
            soup = BeautifulSoup(html, "lxml")
            trs = soup.find_all("tr")
            with open(r"{}\{}\money_flow_stock_{}.csv".format(self.workSpace, self.date, self.date), "a") as f:
                for tr in trs[1:]:                                # skip the first <tr>, which is the header row
                    flag = False
                    tds = tr.find_all("td")
                    for td in tds:
                        string = str(td.string)
                        string = string.replace(" ", "").replace("\t", "").replace("\n", "")
                        if flag: f.write(",{}".format(string))
                        else:
                            flag = True
                            f.write(string)
                    f.write("\n")
            b.quit()
        return True
    def parse_money_flow_concept(self):                          # money flow of A-share concept sectors
        self.session.get(self.dataURL)
        html = self.session.get(self.conceptFlowURL).text
        soup = BeautifulSoup(html, "lxml")
        spans = soup.find_all("span")
        ths = soup.find_all("th")
        """ Use the strings inside the <th> tags as the CSV header """
        flag = False
        with open(r"{}\{}\money_flow_concept_{}.csv".format(self.workSpace, self.date, self.date), "a") as f:
            for th in ths:
                aLabel = th.find_all("a")
                # The header parsing for the concept and industry tables is this clumsy because the
                # approach used for individual stocks oddly returns None here, even though the two
                # page layouts look identical, which puzzles me.
                if len(aLabel):
                    tag = str(aLabel[0])
                    index1 = tag.find(">")
                    index2 = tag.find("<", index1)
                    string = tag[index1 + 1:index2]
                else: string = th.string
                if flag: f.write(",{}".format(string))
                else:
                    flag = True
                    f.write(str(string))
            f.write("\n")
        """ Walk the <span> tags to find the total number of pages """
        for span in spans:
            string = str(span.string)
            if len(string) > 2 and string[:2] == "1/":
                page = int(string[2:])
                break
        """ Fetch the money-flow table page by page """
        for i in range(1, page + 1):
            b = webdriver.Firefox()                               # a brand-new webdriver for every page
            b.get(self.conceptTableURL.format(i))
            html = b.page_source
            soup = BeautifulSoup(html, "lxml")
            trs = soup.find_all("tr")
            with open(r"{}\{}\money_flow_concept_{}.csv".format(self.workSpace, self.date, self.date), "a") as f:
                for tr in trs[1:]:                                # skip the first <tr>, which is the header row
                    flag = False
                    tds = tr.find_all("td")
                    for td in tds:
                        string = str(td.string)
                        string = string.replace(" ", "").replace("\t", "").replace("\n", "")
                        if flag: f.write(",{}".format(string))
                        else:
                            flag = True
                            f.write(string)
                    f.write("\n")
            b.quit()
        return True
    def parse_money_flow_industry(self):                         # money flow of A-share industry sectors
        self.session.get(self.dataURL)
        html = self.session.get(self.industryFlowURL).text
        soup = BeautifulSoup(html, "lxml")
        spans = soup.find_all("span")
        ths = soup.find_all("th")
        """ Use the strings inside the <th> tags as the CSV header """
        flag = False
        with open(r"{}\{}\money_flow_industry_{}.csv".format(self.workSpace, self.date, self.date), "a") as f:
            for th in ths:
                aLabel = th.find_all("a")
                # Same clumsy header parsing as for the concept sectors: the simpler method used
                # for individual stocks only returns None here, despite the identical page layout.
                if len(aLabel):
                    tag = str(aLabel[0])
                    index1 = tag.find(">")
                    index2 = tag.find("<", index1)
                    string = tag[index1 + 1:index2]
                else: string = th.string
                if flag: f.write(",{}".format(string))
                else:
                    flag = True
                    f.write(str(string))
            f.write("\n")
        """ Walk the <span> tags to find the total number of pages """
        for span in spans:
            string = str(span.string)
            if len(string) > 2 and string[:2] == "1/":
                page = int(string[2:])
                break
        """ Fetch the money-flow table page by page """
        for i in range(1, page + 1):
            b = webdriver.Firefox()                               # a brand-new webdriver for every page
            b.get(self.industryTableURL.format(i))
            html = b.page_source
            soup = BeautifulSoup(html, "lxml")
            trs = soup.find_all("tr")
            with open(r"{}\{}\money_flow_industry_{}.csv".format(self.workSpace, self.date, self.date), "a") as f:
                for tr in trs[1:]:                                # skip the first <tr>, which is the header row
                    flag = False
                    tds = tr.find_all("td")
                    for td in tds:
                        string = str(td.string)
                        string = string.replace(" ", "").replace("\t", "").replace("\n", "")
                        if flag: f.write(",{}".format(string))
                        else:
                            flag = True
                            f.write(string)
                    f.write("\n")
            b.quit()
        return True
if __name__ == "__main__":
    print("Starting test...")
    sf = StraightFlush()
    sf.parse_money_flow_stock()
    sf.parse_money_flow_concept()
    sf.parse_money_flow_industry()
