Crawling CNKI (中国知网) Articles, References, and Citing Documents with Python
A couple of days ago my advisor gave me a task: download from CNKI the reference and citing-document data for the source articles returned by an advanced search. I browsed some related blog posts online, but none of them quite fit, so I am writing this up in the hope that it helps anyone with a similar need.
Getting straight to the point, here are the requirements. An advanced search filtered by Chinese Library Classification number, year, and source category returns a little over 5,000 articles.
Requirement 1: obtain the basic information of these 5,000+ articles.
Requirement 2: obtain the references of these 5,000+ articles.
Requirement 3: obtain the citing documents of these 5,000+ articles.
These three requirements are fairly clear; the pages involved are the advanced-search result list, the article detail page, and the reference/citation list frames.
Let's start with requirement 1: enter the search conditions in CNKI's advanced search and collect the returned article information. Clicking the search button does not change the URL in the address bar, so we take another route: open DevTools -> Network and click search again. The browser sends two requests: the first is a POST carrying the search conditions, which returns some parameters; the second then carries those parameters to fetch the data. Clicking the page-turn buttons exposes a changing URL, and two important parameters stand out: curpage and RecordsPerPage, the current page and the number of records per page (50 at most). We could simulate both requests, but for a single search it is enough to copy the parameters returned by the browser's first request and only simulate the second one. One more important point: the request must carry the cookie, otherwise the correct data is not returned. With the analysis done, we can start writing code.
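Since nothing works without a valid cookie, it can be worth smoke-testing the cookie string copied from DevTools before kicking off a long crawl. A minimal sketch under that assumption (the cookie value is a placeholder, and the kns.cnki.net host and query parameters mirror the request observed above):

import requests

# placeholder: paste the Cookie header copied from DevTools after a manual search
RAW_COOKIE = 'ASP.NET_SessionId=...; SID_kns=...; cnkiUserKey=...'

session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36',
    'Cookie': RAW_COOKIE,
})

# request the first result page; a very short body usually means the cookie has expired
url = ('https://kns.cnki.net/kns/brief/brief.aspx?curpage=1&RecordsPerPage=50'
       '&QueryID=2&ID=&turnpage=1&tpagemode=L&dbPrefix=CJFQ&Fields='
       '&DisplayMode=listmode&PageName=ASP.brief_result_aspx&isinEn=2')
resp = session.get(url)
print(len(resp.text))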
def download_search_page(self):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'Ecp_ClientId=2200630175601165678; cnkiUserKey=d6737e43-6a79-d00c-9a04-a03c2c11ee30; Ecp_IpLoginFail=200701183.202.194.16; ASP.NET_SessionId=edraumuckd12e2nqz3tywjsk; SID_kns=123113; SID_klogin=125141',
        'Host': 'kns.cnki.net',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
    }
    page = 70
    while page < 104:
        try:
            url = f"https://kns.cnki.net/kns/brief/brief.aspx?curpage={page}&RecordsPerPage=50&QueryID=2&ID=&turnpage=1&tpagemode=L&dbPrefix=CJFQ&Fields=&DisplayMode=listmode&PageName=ASP.brief_result_aspx&isinEn=2"
            response = requests.get(url, headers=headers)
            with open(f'{self.search_html_dir}/{page}.html', mode='w', encoding='utf-8') as f:
                f.write(response.text)
            print(f'{page} 下载完成')
            # get_file_size is an assumed helper returning the size of the saved file;
            # a near-empty page means the cookie has expired
            if self.get_file_size(f"{self.search_html_dir}/{page}.html") < 50:
                raise Exception("cookie失效")
            page += 1
        except Exception as e:
            print(f'{page}下载失败\t正在睡眠请耐心等待', e)
            time.sleep(30)
def parse_search_article_info(self):
    for file in os.listdir(self.search_html_dir):
        file_path = os.path.join(self.search_html_dir, file)
        items = []
        try:
            # read_html is an assumed helper that returns the text of a saved page
            text = self.read_html(file_path)
            response = HTML(text)
            tr_list = response.xpath('//table[@class="GridTableContent"]/tr[@bgcolor]')
            for tr in tr_list:
                item = {}
                item['title'] = tr.xpath('td[2]/a/text()')[0]
                href = tr.xpath('td[2]/a/@href')[0]
                params = parse_qs(urlparse(href).query)
                dbcode = params['DbCode'][0]
                dbname = params['dbname'][0]
                filename = params['filename'][0]
                item['filename'] = filename
                item['article_url'] = f'https://kns.cnki.net/KCMS/detail/detail.aspx?dbcode={dbcode}&dbname={dbname}&filename={filename}'
                item['authors'] = '; '.join(tr.xpath('td[@class="author_flag"]/a/text()'))
                item['journal'] = tr.xpath('td[@class="cjfdyxyz"]/a/text()')[0].strip()
                item['publish_time'] = tr.xpath('td[5]/text()')[0].strip().split()[0]
                try:
                    item['cited_num'] = tr.xpath('td[6]/span[@class="KnowledgeNetcont"]/a/text()')[0]
                except IndexError:
                    item['cited_num'] = 0
                try:
                    item['download_num'] = tr.xpath('td[7]/span[@class="downloadCount"]/a/text()')[0]
                except IndexError:
                    item['download_num'] = 0
                items.append(item)
            df = DataFrame(data=items)
            df.set_index(keys='filename', inplace=True)
            # self.db.conn is the assumed SQLite connection behind the db wrapper
            df.to_sql("paper_info", self.db.conn, if_exists='append')
            print(f'{file_path}\t解析完成')
        except Exception as e:
            print(f'{file_path}\t插入失败')
            traceback.print_exc()
Downloading and parsing the advanced-search result list
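One caveat: the methods above and below lean on a handful of helpers that the post never shows — self.read_html, self.write_html, self.get_file_size, self.get_dir_all_files, and a small self.db wrapper whose execute/fetchone methods and conn attribute back the read_sql/to_sql calls — plus the usual module-level imports (requests, os, re, time, traceback, ThreadPoolExecutor from concurrent.futures, urlparse/parse_qs from urllib.parse, HTML from lxml.etree, DataFrame/read_sql from pandas). Their exact implementations are not given, so the following is only a sketch that matches how they are called:

import os
import sqlite3


class Database:
    """Thin sqlite3 wrapper; the post's self.db is assumed to look roughly like this."""

    def __init__(self, path='cnki.db'):
        self.conn = sqlite3.connect(path)

    def execute(self, sql, params=()):
        self.conn.execute(sql, params)
        self.conn.commit()

    def fetchone(self, sql, params=()):
        return self.conn.execute(sql, params).fetchone()


class SpiderHelpersMixin:
    """Assumed helper methods used by the crawler methods shown in this post."""

    def read_html(self, file_path):
        # read a previously saved page from disk
        with open(file_path, encoding='utf-8') as f:
            return f.read()

    def write_html(self, text, file_path):
        # save a downloaded page to disk
        with open(file_path, mode='w', encoding='utf-8') as f:
            f.write(text)

    def get_file_size(self, file_path):
        # size of a saved file in KB; a near-empty file usually means the cookie expired
        return os.path.getsize(file_path) / 1024

    def get_dir_all_files(self, *dirs):
        # yield the full path of every file under the given directories
        for d in dirs:
            for name in os.listdir(d):
                yield os.path.join(d, name)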
This information is still not complete: keywords, abstracts and the like have to come from the article detail page, which we can reach via the article_url collected in the previous step.
def spider_article_detail_page(self):
    if len(os.listdir(self.paper_html_dir)) > 0:
        # skip articles whose detail page has already been downloaded
        files = {file.replace('.html', '') for file in os.listdir(self.paper_html_dir)}
        files = "('" + "','".join(files) + "')"
        paper_info = read_sql(f"SELECT article_url FROM paper_info where filename not in {files}", self.db.conn)
    else:
        paper_info = read_sql("SELECT article_url FROM paper_info", self.db.conn)
    with ThreadPoolExecutor() as pool:
        pool.map(self.download_article_detail, paper_info['article_url'])
    # paper_info['article_url'].apply(self.download_article_detail)

def download_article_detail(self, url):
    filename = parse_qs(urlparse(url).query)['filename'][0]
    filepath = f'{self.paper_html_dir}/{filename}.html'
    response = requests.get(url)
    self.write_html(response.text, filepath)
    if self.get_file_size(file_path=filepath) < 5:
        print(f'{url}\t下载失败')
        exit()
    print(f'{url}\t下载完成')
def parse_article_detail(self):
    # log file for pages that fail to parse (the original file name was lost; this one is assumed)
    f = open('parse_error_files.txt', mode='a')
    for file in os.listdir(self.paper_html_dir):
        filename = file.replace('.html', '')
        file_path = os.path.join(self.paper_html_dir, file)
        try:
            text = self.read_html(file_path)
            response = HTML(text)
            institution = '; '.join(response.xpath('//div[@class="orgn"]/span/a/text()'))
            try:
                summary = response.xpath('//span[@id="ChDivSummary"]/text()')[0]
            except IndexError:
                summary = ''
            keywords = ''.join([word.strip() for word in response.xpath(
                '//label[@id="catalog_KEYWORD"]/following-sibling::a/text()')]).strip(';')
            try:
                cls_num = response.xpath('//label[@id="catalog_ZTCLS"]/parent::p/text()')[0]
            except IndexError:
                cls_num = ''
            self.db.execute(
                "update paper_info set summary=?, institution=?, keywords=?, cls_num=? where filename=?",
                params=(summary, institution, keywords, cls_num, filename))
            print(f'{filename} 更新完毕')
        except Exception as e:
            print(f'{filename} 更新失败', e)
            f.write(f'{file_path}\n')
    f.close()
Downloading and parsing the article detail pages
Now for requirements 2 and 3: fetching each article's references and citing documents. Clicking the reference list on an article page and watching how the request changes shows that, compared with the article URL, the request carries one extra parameter, RefType: 1 for references and 3 for citing documents. The request also has to carry a Referer header, which we set to the current article's URL.
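To make that concrete, here is a minimal sketch of how the list-frame URL is derived from a detail-page URL (the dbcode/dbname/filename values are made up, and the kns.cnki.net host is an assumption):

from urllib.parse import urlparse

# hypothetical detail-page URL; the parameter values are only illustrative
detail_url = 'https://kns.cnki.net/KCMS/detail/detail.aspx?dbcode=CJFQ&dbname=CJFDLAST2020&filename=EXAMPLE123'
query = urlparse(detail_url).query

refer_url = f'https://kns.cnki.net/kcms/detail/frame/list.aspx?{query}&RefType=1&vl='  # references
cited_url = f'https://kns.cnki.net/kcms/detail/frame/list.aspx?{query}&RefType=3&vl='  # citing documents
headers = {'Referer': detail_url}  # the frame request must carry the article page as Referer

With that, on to the code: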
def download_article_refer_cited_page(self):
    paper_info = read_sql("SELECT article_url FROM paper_info", self.db.conn)
    # error-log file for failed frame requests (the original name was lost; this one is assumed)
    self.error_f = open('refer_cited_error_urls.txt', mode='w')
    with ThreadPoolExecutor() as pool:
        pool.map(self.download_reference_page, paper_info['article_url'])
        pool.map(self.download_cited_page, paper_info['article_url'])
    self.error_f.close()
def download_reference_page(self, url):
    """
    Download the reference-list page of the given article.
    :param url:
    :return:
    """
    query = urlparse(url).query
    filename = parse_qs(query)['filename'][0]
    refer_url = f"https://kns.cnki.net/kcms/detail/frame/list.aspx?{query}&RefType=1&vl="
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36',
            'Referer': f'https://kns.cnki.net/KCMS/detail/detail.aspx?{query}',
        }
        response = requests.get(refer_url, headers=headers)
        if response.status_code == 200:
            self.write_html(response.text, f'{self.paper_refer_html_dir}/{filename}.html')
        else:
            raise Exception(f"请求异常, 状态码为:{response.status_code}")
    except Exception as e:
        self.error_f.write(refer_url + '\n')
        print(f'{refer_url}\t下载失败', e)
def download_cited_page(self, url):
    query = urlparse(url).query
    filename = parse_qs(query)['filename'][0]
    cited_url = f"https://kns.cnki.net/kcms/detail/frame/list.aspx?{query}&RefType=3&vl="
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36',
            'Referer': f'https://kns.cnki.net/KCMS/detail/detail.aspx?{query}',
        }
        response = requests.get(cited_url, headers=headers)
        if response.status_code == 200:
            self.write_html(response.text, f'{self.paper_cited_html_dir}/{filename}.html')
        else:
            raise Exception(f"请求异常, 状态码为:{response.status_code}")
    except Exception as e:
        self.error_f.write(cited_url + '\n')
        print(f'{cited_url}\t下载失败', e)
def get_error_refer_cited_page(self):
    # retry the frame URLs that previously failed (assumed error-log file name, see above)
    with open('refer_cited_error_urls.txt') as f:
        for line in f:
            url = line.strip()
            if url.endswith("RefType=3&vl="):
                self.download_cited_page(url.replace("RefType=3&vl=", ""))
            elif url.endswith("RefType=1&vl="):
                self.download_reference_page(url.replace("RefType=1&vl=", ""))
def get_all_refer_cited_page_url(self):
    # collect URLs of the extra pages (each frame shows at most 10 entries per page)
    f = open('more_refer_', mode='a')
    for file_path in self.get_dir_all_files(self.paper_refer_html_dir, self.paper_cited_html_dir):
        filename = file_path.split('\\')[-1].replace('.html', '')
        req_type = 1 if file_path.__contains__('refer') else 3
        response = HTML(self.read_html(file_path))
        nodes = response.xpath('//span[@name="pcount"]')
        for node in nodes:
            pcount = int(node.xpath('text()')[0])
            if pcount > 10:
                article_url = \
                    self.db.fetchone("select article_url from paper_info where filename=?", params=(filename,))[0]
                query = urlparse(article_url).query
                pages = int(pcount / 10) + 1
                CurDBCode = node.xpath('@id')[0].replace('pc_', '')
                for page in range(2, pages + 1):
                    url = f"https://kns.cnki.net/kcms/detail/frame/list.aspx?{query}&RefType={req_type}&vl=&CurDBCode={CurDBCode}&page={page}"
                    f.write(f'{url}\n')
                    print(f'{url}\t写入成功')
    f.close()
def download_all_refer_cited_page(self):
    # error-log file for failed page requests (assumed name, see above)
    self.error_f = open('refer_cited_error_urls.txt', mode='w')

    def download_page(url):
        query = parse_qs(urlparse(url).query)
        page = query['page'][0]
        CurDbCode = query['CurDBCode'][0]
        filename = query['filename'][0]
        refType = query['RefType'][0]
        if refType == '1':
            file_path = f'{self.paper_refer_html_dir}/{filename}_{CurDbCode}_{page}.html'
        else:
            file_path = f'{self.paper_cited_html_dir}/{filename}_{CurDbCode}_{page}.html'
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36',
                'Referer': url,
            }
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                self.write_html(response.text, file_path)
            else:
                raise Exception(f"请求异常, 状态码为:{response.status_code}")
        except Exception as e:
            self.error_f.write(url + '\n')
            print(f'{url}\t下载失败', e)

    with open('more_refer_') as f:
        urls = [line.strip() for line in f]
    with ThreadPoolExecutor() as pool:
        pool.map(download_page, urls)
    self.error_f.close()
def download_all_error_refer_cited_page(self):
    # retry the extra pages that previously failed (assumed error-log file name, see above)
    with open('refer_cited_error_urls.txt') as f:
        for line in f:
            url = line.strip()
            query = parse_qs(urlparse(url).query)
            page = query['page'][0]
            CurDbCode = query['CurDBCode'][0]
            filename = query['filename'][0]
            refType = query['RefType'][0]
            if refType == '1':
                file_path = f'{self.paper_refer_html_dir}/{filename}_{CurDbCode}_{page}.html'
            else:
                file_path = f'{self.paper_cited_html_dir}/{filename}_{CurDbCode}_{page}.html'
            try:
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36',
                    'Referer': url,
                }
                response = requests.get(url, headers=headers)
                if response.status_code == 200:
                    self.write_html(response.text, file_path)
                    print(f'{url}\t下载成功')
                else:
                    raise Exception(f"请求异常, 状态码为:{response.status_code}")
            except Exception as e:
                print(f'{url}\t下载失败', e)
Downloading the reference and citing-document pages
Next comes parsing these pages. The tricky part is that reference and citing-document entries are formatted differently depending on the document type, so each type needs its own regular expression. The handling below is already worked out and can be used directly.
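As a quick illustration, here is how the main regex used in parse_reference_cited_article_detail below slices one of these entries (the sample string is invented; real entries vary by database):

import re

# a made-up entry just to show what each capture group grabs
info = '某某研究[J].某某学报,张三,2018'
res = re.search(r'(.*?)\[(.*?)\]\.(.*?),(.*?),(\d{4})', info)
print(res.group(1))  # '某某研究' -> stored as title
print(res.group(2))  # 'J'       -> stored as article_type
print(res.group(3))  # '某某学报' -> stored as journal
print(res.group(4))  # '张三'     -> stored as author
print(res.group(5))  # '2018'    -> the year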
def get_article_refer_num(self):
    def parse_refer_num(filename):
        try:
            response1 = HTML(self.read_html(f'{self.paper_refer_html_dir}/{filename}.html'))
            refer_pcounts = response1.xpath('//span[@name="pcount"]/text()')
            if refer_pcounts:
                refer_num = sum(int(num) for num in refer_pcounts)
            else:
                refer_num = 0
            self.db.execute("update paper_info set refer_num=? where filename=?", params=(refer_num, filename))
            print(f'{filename}\t{refer_num}')
        except Exception as e:
            print(f'{filename}\t解析失败', e)

    paper_info = read_sql("SELECT filename FROM paper_info", self.db.conn)
    paper_info['filename'].apply(parse_refer_num)
@timeit
def parse_refer_cited_info(self):
    self.error_f = open('refer_cited_error_urls.txt', mode='a')  # assumed error-log file name
    refer_file_list = []
    cited_file_list = []
    for file in self.get_dir_all_files(self.paper_refer_html_dir, self.paper_cited_html_dir):
        if file.__contains__('refer'):
            refer_file_list.append(file)
        elif file.__contains__('cited'):
            cited_file_list.append(file)
    refer_data_list = []
    for file in refer_file_list:
        self.parse_reference_cited_article_detail(file, relation='参考文献', data_list=refer_data_list)
    refer_data = DataFrame(data=refer_data_list)
    refer_data.drop_duplicates(subset=['origin_article', 'dbcode', 'pid', 'relation'], inplace=True)
    refer_data.to_csv('res/参考文献.csv', index=False, encoding='utf_8_sig')
    # refer_data.to_sql("reference_article", self.db.conn, if_exists='append', index=False)
    cited_data_list = []
    for file in cited_file_list:
        self.parse_reference_cited_article_detail(file, relation='引证文献', data_list=cited_data_list)
    cited_data = DataFrame(data=cited_data_list)
    print(cited_data.info())
    cited_data.drop_duplicates(subset=['origin_article', 'dbcode', 'pid', 'relation'], inplace=True)
    print(cited_data.info())
    cited_data.to_csv('res/引证文献.csv', index=False, encoding='utf_8_sig')
    # cited_data.to_sql("cited_article", self.db.conn, if_exists='append', index=False)
    self.error_f.close()
def parse_reference_cited_article_detail(self, file, relation, data_list):
    filename = file.split('\\')[-1].replace('.html', '')
    if len(filename.split('_')) > 1:
        filename = filename.split('_', maxsplit=1)[0]
    response = HTML(self.read_html(file))
    essayBoxs = response.xpath('//div[@class="essayBox"]')
    for box in essayBoxs:
        db_title = box.xpath('div[@class="dbTitle"]/text()')[0]
        db_code = box.xpath('div[@class="dbTitle"]/b/span[@name="pcount"]/@id')[0].replace('pc_', '')
        essays = box.xpath('ul[contains(@class, "ebBd")]/li')
        for essay in essays:
            item = ArticleItem()
            item.dbcode = db_code
            item.dbtitle = db_title
            try:
                item.pid = essay.xpath('em[1]/text()')[0].strip().replace('[', '').replace(']', '')
            except IndexError:
                continue
            if db_code == 'CBBD':
                info = essay.xpath('text()')[0].replace('\n', '').replace('', '')
                try:
                    res = re.search(r'(.*?)\[(.*?)\]\.(.*?),(.*?),(\d{4})', info)
                    item.title = res.group(1)
                    item.article_type = res.group(2)
                    item.journal = res.group(3)
                    item.author = res.group(4)
                except AttributeError as e:
                    # entries without a trailing year fall back to a looser pattern
                    res = re.search(r'(.*?)\[(.*?)\]\.(.*?),(.*?),', info)
                    item.title = res.group(1)
                    item.article_type = res.group(2)
                    item.journal = res.group(3)
                    item.author = res.group(4)
            elif db_code == 'CJFQ':
                try:
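The truncated CJFQ branch and the other database types are presumably handled the same way, each with its own regular expression. To close, here is a rough sketch of the order in which the methods above might be invoked; the class name CnkiSpider and the wiring are my assumptions, since the post never shows the class definition or an entry point:

if __name__ == '__main__':
    spider = CnkiSpider()  # hypothetical class holding the methods shown above

    # requirement 1: search result list and article details
    spider.download_search_page()
    spider.parse_search_article_info()
    spider.spider_article_detail_page()
    spider.parse_article_detail()

    # requirements 2 and 3: reference / citing-document frames
    spider.download_article_refer_cited_page()
    spider.get_error_refer_cited_page()        # retry failed frame downloads
    spider.get_article_refer_num()
    spider.get_all_refer_cited_page_url()      # frames with more than 10 entries span extra pages
    spider.download_all_refer_cited_page()
    spider.download_all_error_refer_cited_page()
    spider.parse_refer_cited_info()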