Python爬虫爬取上海黄金交易所历史交易数据
为什么
因为想做上海黄⾦的量化交易,⼜信不过⽹上那些忽悠的神乎其神的App。于是⾃⼰动⼿,丰⾐⾜⾷。
如何做
⾸先要获取权威的交易数据,上海黄⾦交易所官⽹就有历年的交易数据。所以打算⽤熟悉的Python写个爬⾍⾃动获取。
1. ⼯具准备
Python 3.6 + requests + lxml + json
2. ⽹站解析
⾸先到上海黄⾦交易所每⽇⾏情页列表(⾸页 > 数据资讯 > 历史⾏情数据 > 每⽇⾏情),分析该列表每页显⽰10天
的数据列表,点开后才是每天每个交易合约的交易数据。并且每⼀页的URL采⽤参数⽅式进⾏定位,如:“sjzx/mrhqsj?p=2 ”
表示第二页。所以只需要一个简单的循环就可以访问到需要的页面。
其次要找到具体数据页面列表的 XPath,可以使用 Chrome 浏览器自带的开发者模式,找到需要的数据,直接点右键 Copy > Copy XPath。
3. 上代码
# -*- coding: UTF-8 -*-
# 本模块从上海黄⾦交易所官⽹下载历史交易数据
import os
import time
from Lib.Web import get_Html, get_list, get_List_xpath, add_host
from Lib.os import save_list, save_list_A, save_list_B, makdir, BASE_PATH
# Default HTTP request headers: a desktop-Chrome User-Agent so the exchange
# site serves the normal HTML pages to the scraper.
# (Original used curly quotes ‘…‘ — a syntax error from PDF extraction —
# and the UA string was split across two lines; both fixed here.)
headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/87.0.4280.88 Safari/537.36'),
}
def get_table(title, url, headers):
    """Download one day's quote page and parse its data table.

    Parameters:
        title: trading-date string; stored into every row under '交易⽇期'.
        url: URL of that day's quote-detail page.
        headers: HTTP headers forwarded to get_Html().

    Returns:
        list[dict]: one dict per data row, keyed by the header row's column
        names plus '交易⽇期'. Returns [] when the page has no table rows.
    """
    html = get_Html(url, headers)
    # The first-column cells are fetched only to learn how many <tr> rows
    # the table has; each row is then re-queried individually below.
    first_col = get_list(html, '//div[@class="content"]/table/tbody/tr/td[1]/text()')
    row_count = len(first_col)

    def _clean(cell):
        # Scraped cells carry embedded layout whitespace; strip it all.
        return str(cell).replace('\t', '').replace('\n', '').replace('\r', '')

    column_names = []
    table = []
    for r in range(1, row_count + 1):
        cells = get_list(html, '//div[@class="content"]/table/tbody/tr[%d]/td/text()' % r)
        if r == 1:
            # Row 1 is the header row: remember the column names.
            column_names = [_clean(c) for c in cells]
            continue
        row = {'交易⽇期': title}
        # zip() stops at the shorter sequence, so a ragged row can no longer
        # raise IndexError (the original swallowed that with a bare
        # `except Exception: pass`, silently keeping a partial row).
        for name, cell in zip(column_names, cells):
            row[name] = _clean(cell)
        table.append(row)
    return table
if __name__ == "__main__":
# 获得下载链接
for r in range(1, 201):
filename = ‘list_%d.txt‘ % r
cache_dir = "goldlist"
html = get_Html(url, headers)
if not ists(os.path.join(BASE_PATH, cache_dir)):
makdir(os.path.join(BASE_PATH, cache_dir))
filename = os.path.join(BASE_PATH, cache_dir, filename)
if ists(filename):
print("跳过:%s" % filename)
continue
a = ‘/html/body/div[6]/div/div[2]/div[2]/div[2]/ul/li/a/span[2]/text()‘
b = ‘/html/body/div[6]/div/div[2]/div[2]/div[2][email protected]
lst = get_List_xpath(html, a, b)
for item in lst:
lst[item] = add_host(url, lst[item])
save_list_A(filename, lst)
print(‘获取历史⾏情第%d页‘ % r)
time.sleep(3)
# 下载⾏情数据
for r in range(1, 201):
filename = ‘list_%d.txt‘ % r
cache_dir = "goldlist"
filename = os.path.join(BASE_PATH, cache_dir, filename)
if ists(filename):
with open(filename, ‘r‘, encoding=‘utf-8‘) as f:
line = f.readline()
item, url = line.split(‘\t‘)
filename = os.path.join(BASE_PATH, cache_dir, "%s.txt" % item) if ists(filename):
print("跳过:%s" % filename)
continue
doc = get_table(item, str(url).replace(‘\n‘, ‘‘), headers) save_list_B(filename, doc)
print("保存:%s" % filename)
time.sleep(3)
其中使用到我自己为了方便而建立的库函数:
def get_host(url):
    """Return the scheme + authority of *url*, e.g. 'https://www.sge.com.cn'.

    Uses ``netloc`` rather than ``hostname``: ``hostname`` lowercases the
    host and silently drops any ':port' suffix, so the original produced
    broken absolute URLs in add_host() for non-default ports.
    """
    parts = urlparse(url)
    return parts.scheme + '://' + parts.netloc


def add_host(url, path):
    """Join a site-relative *path* onto the scheme + host taken from *url*."""
    return get_host(url) + path
def get_Html(url, headers, cookies=None, params=None):
    """Fetch *url* with requests and return it parsed as an lxml document.

    NOTE(review): this function was mangled by the article's PDF extraction
    (the `requests.get` call and the `etree.HTML(...)` return were lost);
    reconstructed to the obvious form — confirm against the author's
    original Lib.Web module.

    Parameters:
        url: page to fetch.
        headers: HTTP headers dict.
        cookies: optional cookie jar/dict.
        params: optional query-string parameters.

    Returns:
        lxml.etree._Element: the parsed HTML root, ready for .xpath().
    """
    if cookies:
        resp = requests.get(url=url, headers=headers, cookies=cookies, params=params)
    else:
        resp = requests.get(url=url, headers=headers, params=params)
    return etree.HTML(resp.text)
def get_list(html, xpath):
    """Evaluate *xpath* against the parsed document *html*.

    Thin convenience wrapper: returns whatever the document's .xpath()
    yields (a list of matched nodes / text values).
    """
    matches = html.xpath(xpath)
    return matches
def save_list_B(filename, data):
    """Serialize *data* (a list of row dicts) to *filename* as one JSON document.

    ensure_ascii=False keeps the Chinese column names human-readable in the
    file. Uses f.write instead of the original f.writelines — writelines on
    a str iterates it character by character. (Parameter renamed from the
    builtin-shadowing name ``list``; every call in this module is positional.)
    """
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(json.dumps(data, ensure_ascii=False))
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论