import requests
import json
from lxml import etree
def load_cookies():
    cookie_json = {}
    try:
        with open('export.json', 'r') as cookies_file:
            cookie_json = json.load(cookies_file)
    except (OSError, json.JSONDecodeError):
        print("JSON load failed")
    finally:
        return cookie_json


def main():
    page_list = []
    for i in range(1, 8):
        agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"
        headers = {
            "Host": "www.amazon.com",
            "Referer": "https://www.amazon.com/s?k=anime+figure+one+piece",
            "User-Agent": agent
        }
        session = requests.session()
        session.headers = headers
        # Load the cookies exported by cookie.py into the session's cookie jar
        requests.utils.add_dict_to_cookiejar(session.cookies, load_cookies())
        url = "https://www.amazon.com/s?k=anime+figure+one+piece&page=%d" % i
        response = session.get(url)
        print(response.text[:300])
        # with open("test.html", "wb") as f:
        #     f.write(response.text.encode('utf-8'))
        page_text = response.text
        tree = etree.HTML(page_text)
        # Product links on the search results page
        a_list = tree.xpath('//a[@class="a-link-normal a-text-normal"]/@href')
        # print(a_list)
        for j in a_list:
            page_url = "https://www.amazon.com" + j
            page_list.append(page_url)
        print("Done")
    print(page_list)
    for k in page_list:
        # The original output filename was lost; "page_urls.txt" is a placeholder
        with open("page_urls.txt", "a+") as f:
            f.write(k + "\n")


if __name__ == '__main__':
    main()
In addition, the cookies are packaged into a separate cookie.py file; the cookies below have been partially redacted:
import json

def save_cookies(cookies):
    cookies_file = 'export.json'
    with open(cookies_file, 'w') as f:
        json.dump(cookies, f)

def main():
    cookies = {
        "ad-privacy": "0",
        "ad-id": "A5d94GSIhp6aw774YK1k",
        "i18n-prefs": "USD",
        "ubid-main": "138-2864646",
        "session-id-time": "20887201l",
        "sst-main": "Sst1|PQHBhLoYbd9ZEWfvYwULGppMCd6ESJYxmWAC3tFvsK_7-FgrCJtViwGLNnJcPk6NS08WtWl7f_Ng7tElRchY70dGzOfHe6LfeLVA2EvS_KTJUFbqiKQUt4xJcjOsog_081jnWYQRp5lAFHerRS0K30zO4KWlaGuxYf-GlWHrIlX0DCB0hiuS4F69FaHInbcKlPZphULojbSs4y3YC_Z2098BiZK5mzna84daFvmQk7GS1uIEV9BJ-7zXSaIE1i0RnRBqEDqCw",
        "sess-at-main": "7B9/7TbljVmxe9FQP8pj4/TirM4hXdoh0io=",
        "at-main": "Atza|IwEBICsAvrvpljvBn6U0aVHZtVAdHNTj8I9XMXpj0_akGclan8n4it62oe4MadfnSheGBfJeVJwRmrV41ZbllH48hNM32FGo4DJGoeXE01gDei-_2PGNH3jKU79B8rzg8MaHRootDMSwFmj4vNmPtnvl6qrbfZoPSmey12IuWq9ijSx3MuCbpJ2wt4Sp7ixf7jWHW6VfaZ849AJkOBDonSHp9o",
        "sp-cdn": "L5Z:CN",
        "session-id": "141-56579-5761416",
        "x-main": "2XkJe2ehs13TDTsRlELJt12FINPkJSfDKLuc5XjGgy2akeyGa45?wYKN4wcIC",
        "session-token": "HfpLyDT70a2q+Ktd9sYUopKOKUeQndXMlbDcwP8sQNGA/ZeUA9ZNGNXOPRvXV8E6pUjeI7j/RR9iDCr5S7W0sRLmHT27PAvbN3TXsyaLvvPhsn4e3hUvhgdJn/xK/BfioKniukounAKZnYZLNcGf44ZiX8sRfdIjOiOx9GvAvl+hnPfJmWi/l73tqO6/G+PPf8uc0vq7Xubsgw2SuSXzqwq0gHEtE6HcbA6AeyyE59DCuH+CdV3p2mVSxUcvmF+ToO6vewLuMl1Omfc+tQ==",
        "lc-main": "en_US",
        # "csm-hit": "tb:s-YM0DR0KTNG964PT0RMS0|1627973759639&t:1627973760082&adb:adblk_no",
        # "csm-hit": "tb:s-K3VN7V41Z5H7N250A9NE|1627974200332&t:1627974200622&adb:adblk_no",
        "csm-hit": "tb:6CJBWDDJGRZPB09G+b-K3VN7V41Z5H7N250A9NE|1627974443&t:1627974446683&adb:adblk_no"
    }
    save_cookies(cookies)
if __name__ == '__main__':
    main()

Getting the cookies with Selenium:
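The fetching code itself wasn't preserved here, so this is only a minimal sketch of how it might look, assuming Selenium with a local chromedriver and reusing the save_cookies helper above: log in manually in the opened browser window, then the script dumps the session's cookies into export.json.

from selenium import webdriver
from cookie import save_cookies  # the helper defined above

driver = webdriver.Chrome()
driver.get("https://www.amazon.com/")
input("Log in / pass any checks in the browser, then press Enter...")

# Selenium returns a list of cookie dicts; flatten them to the
# {name: value} shape that save_cookies writes to export.json
cookies = {c["name"]: c["value"] for c in driver.get_cookies()}
save_cookies(cookies)
driver.quit()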
6. By the time I had crawled the URLs it was already late. I figured the rest would be as simple as before: attach the cookies, fetch each URL, run the page through an XPath parse, done. So I went to sleep first.
7. I continued the next morning and found that, of course, it was not that simple. When crawling the individual product pages the cookies change along with the requests: on every single request the value of the csm-hit cookie field is different. And for some reason the code from last night no longer crawled anything either. It broke. But there had to be a way.
8. And there was: Selenium. Automating the browser and requesting the page URLs one by one to fetch each detail page works nicely, and you don't have to feed it cookies at all.
9. First, re-collect yesterday's URLs. (Headless mode failed to pick up all of the URLs, so I commented it out; see the sketch below.)
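A minimal sketch of that re-collection, assuming Selenium and lxml; the XPath mirrors the requests version above, page_urls.txt is the same placeholder filename, and the headless option stays commented out as noted.

from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
# options.add_argument("--headless")  # headless mode missed some URLs, so keep it off

driver = webdriver.Chrome(options=options)
page_list = []
for i in range(1, 8):
    driver.get("https://www.amazon.com/s?k=anime+figure+one+piece&page=%d" % i)
    tree = etree.HTML(driver.page_source)
    for href in tree.xpath('//a[@class="a-link-normal a-text-normal"]/@href'):
        page_list.append("https://www.amazon.com" + href)
driver.quit()

with open("page_urls.txt", "a+") as f:
    for url in page_list:
        f.write(url + "\n")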
14. Poking at it more carefully, I noticed that as soon as my mouse passed over one of these thumbnail images,
15. the li tags underneath immediately multiplied, and opening them one by one showed that they were indeed the high-resolution images.
16. So that was the way: before grabbing each page's HTML, first hover over / click each of these thumbnails, then grab the page HTML, parse it with XPath, and write the extracted title, description, and price to a file. Done. (A sketch follows below.)
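A minimal sketch of that hover-then-parse loop, assuming Selenium's ActionChains. The CSS selector for the thumbnails and the XPath expressions for title and price are illustrative guesses rather than verified selectors, and figures.txt is a hypothetical output file; the description column would be extracted the same way.

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
with open("page_urls.txt") as f:
    urls = [line.strip() for line in f if line.strip()]

for url in urls:
    driver.get(url)
    # Hover over every thumbnail so the high-res image <li> tags get injected
    for thumb in driver.find_elements(By.CSS_SELECTOR, "li.imageThumbnail"):
        ActionChains(driver).move_to_element(thumb).perform()
    tree = etree.HTML(driver.page_source)
    title = tree.xpath('string(//span[@id="productTitle"])').strip()        # guessed selector
    price = tree.xpath('string(//span[@class="a-price"]/span[1])').strip()  # guessed selector
    with open("figures.txt", "a+") as out:  # hypothetical output file
        out.write("%s\t%s\t%s\n" % (title, price, url))

driver.quit()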
