# 用Python和selenium下载pdf文件 (Download PDF files with Python and Selenium)
import random
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# Automatically download SPE papers from a conference listing page.
#
# Listing page: <BASE_URL>/conferences/SPE/17ADIP/all?start=0&rows=700
# Author: Carl Wu (吴文旷)

# NOTE(review): the scheme and host were missing from the URL in the original
# source (only the path survived). OnePetro is presumed here because the
# script targets SPE conference papers -- confirm before running.
BASE_URL = "https://www.onepetro.org"

# The starting URL
start_url = BASE_URL + "/conferences/SPE/17ADIP/all?start=0&rows=700"

# Directory Chrome is told to save downloads into.
DOWNLOAD_DIR = "D:\\Carl\\dev\\test\\"

# NOTE(review): the original path literal was truncated right after
# "...\Google\Chrome\" (which also left the raw string unterminated). The
# driver executable name below is an assumption -- point this at the real
# chromedriver binary.
CHROMEDRIVER_PATH = r"C:\Users\Carl\AppData\Local\Google\Chrome\chromedriver.exe"

# credentials
username = 'xxxxxxx'
password = 'xxxxxxx'

# Fixed waits, in seconds. The page is driven with plain sleeps rather than
# explicit waits, so these must be long enough for each step to finish.
HOME_PAGE_LOAD_WAIT = 6
LOGIN_FORM_WAIT = 2
BEFORE_SUBMIT_WAIT = 1
AFTER_LOGIN_WAIT = 28
DOWNLOAD_BASE_WAIT = 22


def create_browser():
    """Start Chrome with the PDF viewer plugin disabled and a download dir set.

    Returns:
        The ``webdriver.Chrome`` instance; the caller must ``quit()`` it.
    """
    # Configure Chrome options.
    options = webdriver.ChromeOptions()
    profile = {
        "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
        "download.default_directory": DOWNLOAD_DIR,
    }
    options.add_experimental_option("prefs", profile)
    return webdriver.Chrome(CHROMEDRIVER_PATH, chrome_options=options)


def login(browser):
    """Open ``start_url`` and log in with the module-level credentials.

    Args:
        browser: A running Selenium WebDriver.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If any of the
            expected login elements cannot be found on the page.
    """
    # Open the URL.
    browser.get(start_url)

    # Wait a few seconds for the home page to load, then click the
    # Login/Register link; the username and password inputs are only
    # visible after that click.
    time.sleep(HOME_PAGE_LOAD_WAIT)
    elem_register = browser.find_element_by_id('p13n-menu')
    elem_register.click()
    time.sleep(LOGIN_FORM_WAIT)

    # Enter the username.
    user_blank = browser.find_element_by_id('l-email')
    user_blank.clear()
    user_blank.send_keys(username)

    # Enter the password.
    password_blank = browser.find_element_by_id('l-password')
    password_blank.clear()
    password_blank.send_keys(password)

    # Locate the login button on the login form. The button is found, but
    # (per the original author) clicking it has no effect, so it is only
    # printed here for diagnostics.
    elem_login = browser.find_element_by_xpath(
        '//form[@id="sgk-login-form"]/fieldset/div[@class="form-actions"]/input[@class="btn btn-pri"]')
    print(elem_login.get_attribute('value'))
    print(elem_login.get_attribute('name'))

    time.sleep(BEFORE_SUBMIT_WAIT)
    # Because clicking the button does not work, submit the form by pressing
    # Enter in the password field instead.
    password_blank.send_keys(Keys.ENTER)
    time.sleep(AFTER_LOGIN_WAIT)


def download_papers(browser):
    """Visit every "Get PDF" link on the current page, one at a time.

    Args:
        browser: A running Selenium WebDriver, already on the listing page.
    """
    # Collect all SPE paper links; their link text is "Get PDF".
    all_papers_link = browser.find_elements_by_link_text("Get PDF")

    # Read every href up front: navigating with browser.get() inside the loop
    # could invalidate the element handles still waiting to be processed.
    paper_urls = [link.get_attribute('href') for link in all_papers_link]

    # Print each link and fetch the PDF document.
    for url in paper_urls:
        if not url:
            # An anchor without an href gives nothing to download.
            continue
        print(url)
        browser.get(url)
        # Pause a randomized 23-27 seconds between requests.
        time.sleep(DOWNLOAD_BASE_WAIT + random.randint(1, 5))


def main():
    """Log in and download every paper, always closing the browser afterwards."""
    browser = create_browser()
    try:
        login(browser)
        download_papers(browser)
    finally:
        # Quit even when a step above fails, so Chrome is not left running.
        browser.quit()


if __name__ == "__main__":
    main()

# 版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。