全国图书馆参考咨询联盟模拟登陆及爬取可爬取的图片
一、编程思路
1. 模拟登陆采用 selenium + PhantomJS。采用 Chrome、Firefox 这些,我的电脑无法截取验证码位置,读者可以自行尝试;验证码识别可采用 tesserocr,我采用手动输入。
2. 查询:获取搜索框,用户输入关键字并查询。
3. 页面信息:F12 查看即可。若采用 find_element_by_xpath() 查询,需注意 element 返回的是第一个节点信息,elements 返回的是一个列表。
4. 书本具体信息:F12 查看,后面操作很简单。
5. 文献传递页面:这个地方是最难的。右键查看“文献传递”这个按钮,点击其中 href 是无法进入的,这只是一种绑定关系,需要仔细观察进入文献传递页面前后 network 中第一个文本中的信息,里面存在很多 url,
只有 refer 点击可以进入。分析 refer url 里面的元素,在进入前的那个页面的 url 中可得到,后面采用切片即可。
6. 下载书名页:此处我采用的是观察图片链接之间的关系,从而获取。这个地方需要注意的是,图片需要不断地滑动滑动条才能加载,否则无法下载。
7. 保存图片:注意 'w' 和 'wb' 的使用即可。
8. 最后需要注意爬取频率,否则很容易被发现。
# NOTE(review): the pasted source garbled the selenium module paths
# ("selenium.webdrivermon.…"); restored to the official selenium 3 layout.
from selenium import webdriver
import time
# import tesserocr
import pytesseract
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from PIL import Image
import requests
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# from pyquery import PyQuery as pq
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from lxml import etree
from urllib.parse import quote
import os

# cookie_bro = browser.get_cookies()  # NOTE(review): original line was garbled
# cookie1 = cookie_bro[1]['value']
# print('获取cookie成功')
def login():
    """Log in to the union-catalogue site loaded in the global ``browser``.

    Fills in the username/password, selects the institution from the drop-down,
    screenshots the page and crops out the captcha image so the user can read
    it, then submits the form with the manually-typed captcha.

    Returns:
        The post-login URL, or ``None`` if the wait/click raised.
    """
    # Username field
    input_user = browser.find_element_by_id("userName")
    input_user.send_keys("ckho")
    time.sleep(2)
    # Password field
    input_pwd = browser.find_element_by_id("passWord")
    input_pwd.send_keys("chen135********")
    time.sleep(2)
    # Institution drop-down (value 7320 is this account's library)
    input_gid = Select(browser.find_element_by_id("gid"))
    input_gid.select_by_value("7320")
    browser.save_screenshot("screen.png")
    # Locate the captcha <img> and crop it out of the full-page screenshot.
    code_element = browser.find_element_by_id("vimg")
    print(code_element.location)  # e.g. {'x': 548, 'y': 523}
    left = code_element.location['x']
    top = code_element.location['y']
    right = code_element.size['width'] + left
    bottom = code_element.size['height'] + top
    im = Image.open("screen.png")
    img = im.crop((left, top, right, bottom))
    img.save("screen4.png")
    # Captcha is typed in manually (OCR via tesserocr was abandoned upstream).
    try:
        input_verify = wait.until(
            EC.element_to_be_clickable((By.ID, "verifyCode"))
        )
        result = input("请输入验证码")
        input_verify.send_keys(result)
        # Click the login button once it is clickable.
        enter = wait.until(
            EC.element_to_be_clickable((By.ID, "submit"))
        )
        enter.click()
        print("登录成功")
        browser.save_screenshot("screen6.png")
        return browser.current_url
    except BaseException:
        # NOTE(review): BaseException is overly broad; kept for compatibility.
        print(" Enter Error")
#查询书籍信息并且⽤户可选择页数
def index_page(url):
    """Search the catalogue for a user-supplied book title.

    Prints the first results page, then navigates to the page number the
    user asks for and prints that page too.

    Args:
        url: the URL to retry from on timeout.

    Returns:
        The URL of the results page currently shown, or ``None`` on timeout.
    """
    book_name = input("请输入查的书名")
    browser.find_element_by_id("sw").send_keys(book_name)
    browser.find_element_by_xpath('//*[@id="f2"]/div[2]/input[1]').click()
    print("当前页数为第一页")
    all_book_information()
    page = input("请输入想看页数:")
    print("...正在爬取第" + str(page) + "页")
    current_url = browser.current_url  # URL of the first results page
    try:
        if int(page) > 1:
            # NOTE(review): garbled line restored as a navigation call.
            browser.get(current_url)
            print(current_url)
            # Page-number input box
            input_page = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#jpage"))
            )
            # "Go" button
            sumbit = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, '#pageinfo > input[type=button]:nth-child(13)'))
            )
            input_page.clear()
            input_page.send_keys(str(page))
            sumbit.click()
            all_book_information()
        return browser.current_url  # URL after the (possible) page change
    except TimeoutError:
        # BUG FIX: the original retried with index_page() — missing its
        # required argument, which would raise TypeError.
        index_page(url)
def all_book_information():
    """Print every book entry (index, text) found on the current results page."""
    addres = '//table[@class="book1"]'
    addres_list = browser.find_elements_by_xpath(addres)
    book_list = []
    for book in addres_list:
        # NOTE(review): truncated line restored — collect each entry's text.
        book_list.append(book.text)
    for i in enumerate(book_list, start=1):
        print(i)
#获取每本书具体链接并且返回每本书具体信息
def get_detail_book(url):
    """Open the detail page of the book the user picks and print its fields.

    Args:
        url: URL of the results page listing the books.

    Returns:
        The detail page URL of the chosen book.
    """
    number = input("请输入你想要了解书的编号:")
    browser.get(url)
    addres = '//table[{}][@class="book1"]//a[@class="px14"]'.format(number)
    book_url = browser.find_element_by_xpath(addres).get_attribute("href")
    browser.get(book_url)
    detail_book_information = browser.find_elements_by_xpath('//div[@class="tubox"]//dd')
    for book in detail_book_information:
        # NOTE(review): truncated line restored — print each detail row.
        print(book.text)
    return browser.current_url
#进⼊图书馆⽂献传递页⾯
def sent_book_emial(url):
    """Menu for the book detail page: document delivery or image downloads.

    (Function name kept as-is for compatibility; 'email' is misspelled
    upstream.)  Choice 1 drives the document-delivery form; 4-8 download the
    title / preface / copyright / contents / body page images; anything else
    restarts the search flow.

    Args:
        url: the book detail page URL returned by ``get_detail_book``.
    """
    bqy_url = browser.find_element_by_xpath('//*[@id="libinfo"]/div[1]/div/a[1]').get_attribute("href")
    all_page_name(bqy_url)
    answer = input("是否需要下载此书是请输入是1 否2 看其他书按3 ,下载书名页4 下载前言页5 下载版权页6 下载目录页7 下载正文页8")
    if int(answer) == 1:
        # NOTE(review): the paste stripped the URL scheme and ".net"; restored.
        base_url = 'http://book.ucdrs.superlib.net/gofirstdrs.jsp?'
        browser.get(url)
        # The on-page "document delivery" href cannot be opened directly (it
        # is only an event binding); its query string must be re-attached to
        # the gofirstdrs.jsp endpoint observed in the network panel.
        sent_href = browser.find_element_by_xpath('//*[@id="libinfo"]/div[2]//a').get_attribute("href")
        list1 = sent_href.split("?", 1)
        list2 = list1[1].split("'", 1)
        tscx_url = base_url + list2[0]
        browser.get(tscx_url)
        browser.save_screenshot("screen5.png")
        book_download()
    elif int(answer) == 2:
        print("\n")
        print("本次查询结束,欢迎下次使用!")
    elif int(answer) == 4:
        browser.get(url)
        bqy_url = browser.find_element_by_xpath('//*[@id="libinfo"]/div[1]/div/a[1]').get_attribute("href")
        base_url = bqy_download(bqy_url)
        smy_img(base_url)
    elif int(answer) == 5:
        browser.get(url)
        bqy_url = browser.find_element_by_xpath('//*[@id="libinfo"]/div[1]/div/a[1]').get_attribute("href")
        base_url = bqy_download(bqy_url)
        qyy_img(base_url)
    elif int(answer) == 6:
        browser.get(url)
        bqy_url = browser.find_element_by_xpath('//*[@id="libinfo"]/div[1]/div/a[1]').get_attribute("href")
        base_url = bqy_download(bqy_url)
        # BUG FIX: the original called bqy_url(base_url) — a string, not a
        # function; the copyright-page downloader was clearly intended.
        bqy_img(base_url)
    elif int(answer) == 7:
        browser.get(url)
        bqy_url = browser.find_element_by_xpath('//*[@id="libinfo"]/div[1]/div/a[1]').get_attribute("href")
        base_url = bqy_download(bqy_url)
        mly_img(base_url)
    elif int(answer) == 8:
        browser.get(url)
        bqy_url = browser.find_element_by_xpath('//*[@id="libinfo"]/div[1]/div/a[1]').get_attribute("href")
        base_url = bqy_download(bqy_url)
        zwy_img(base_url)
    else:
        # Any other answer: restart the whole search flow from the home page.
        url = "http://www.ucdrs.superlib.net/"
        browser.get(url)
        all_book_url_page = index_page(url)
        detail_book_url = get_detail_book(all_book_url_page)
        sent_book_emial(detail_book_url)
def all_page_name(url):
    """Print which sections of the book are viewable.

    Reads the text of the page-jump widget on the reader page and prints it
    (with spaces removed) so the user knows which download options are valid.
    """
    browser.get(url)
    page_widget = browser.find_element_by_id("pagejump")
    # NOTE(review): garbled line restored — strip spaces from the widget text.
    t1 = page_widget.text.replace(" ", "")
    print("该书可看部分仅有:" + t1 + "请按照此选择下载,否则可能导致下载错误")
#图书下载
def book_download():
    """Fill in the document-delivery form: page range, e-mail, captcha, submit.

    The site limits each delivery request to 50 pages per book.
    """
    all_page = browser.find_element_by_xpath('//*[@id="content"]/form/ul/li[3]/p[1]').text
    print(all_page)
    print("每本图书咨询每次不超过50页")
    input1 = input("请输入想看的书初始页")
    input2 = input("请输入想看的书的末页")
    browser.find_element_by_id("frompage").send_keys(input1)
    browser.find_element_by_id("endpage").send_keys(input2)
    email = input("请输入你的邮箱账号")
    browser.find_element_by_id("email").send_keys(email)
    verifycode1 = input("请输入验证码")
    browser.find_element_by_id("verifycode").send_keys(verifycode1)
    # "sumbit" is the site's own (misspelled) CSS class — do not "correct" it.
    browser.find_element_by_xpath('//li[@class="sumbit"]').click()
#返回图⽚的url共同部分
def bqy_download(url):
    """Open the reader page and return the common prefix of its image URLs.

    Returns:
        ``first_img_url`` minus its last 13 characters (the per-page suffix),
        i.e. the base to which page names like ``bok001`` are appended.
    """
    browser.get(url)
    print(url)
    # Give the page time to lazy-load its images, mimicking a human reader;
    # without this wait the first image src is not yet populated.
    time.sleep(4)
    browser.save_screenshot("screen8.png")
    first_img_url = browser.find_element_by_xpath('//*[@id="reader"]/div/div[1]/input').get_attribute("src")
    print(first_img_url)
    base_url = first_img_url[0:-13]  # strip the 13-char per-page suffix
    print(base_url)
    return base_url
#下载书名页
def smy_img(base_url):
    """Download the title page (书名页): the single image named ``bok001``."""
    i = 1
    print("仅下载1页")
    while i < 2:
        img_url = base_url + 'bok00{}'.format(i) + '?zoom=0&f=0'
        i += 1
        # NOTE(review): garbled lines restored — fetch and save the image.
        response = requests.get(img_url)
        print(img_url)
        # NOTE(review): the original saved into the 前言页 (preface) folder;
        # kept as-is to preserve behavior, though 书名页 was probably intended.
        with open("D:/pycharm/实战案例/前言页/" + str(i - 1) + '.png', "wb") as f:
            f.write(response.content)
        print("success download")
        time.sleep(2)  # throttle requests to avoid detection
#下载版权页
def bqy_img(base_url):
    """Download the copyright page (版权页): the single image named ``leg001``."""
    i = 1
    print("仅下载1页")
    while i < 2:
        img_url = base_url + 'leg00{}'.format(i) + '?zoom=0&f=0'
        i += 1
        # NOTE(review): garbled lines restored — fetch and save the image.
        response = requests.get(img_url)
        print(img_url)
        with open("D:/pycharm/实战案例/版权页/" + str(i - 1) + '.png', "wb") as f:
            f.write(response.content)
        print("success download")
#下载前言页
def qyy_img(base_url):
    """Download the preface pages (前言页): images ``fow001``…``fow005``."""
    i = 1
    print("仅下载5页")
    while i < 6:
        img_url = base_url + 'fow00{}'.format(i) + '?zoom=0&f=0'
        i += 1
        # NOTE(review): garbled lines restored — fetch and save the image.
        response = requests.get(img_url)
        print(img_url)
        with open("D:/pycharm/实战案例/前言页/" + str(i - 1) + '.png', "wb") as f:
            f.write(response.content)
        print("success download")
        # try:
        #     response.headers["Accept-Encoding"]
        # except:
        #     break
        time.sleep(2)  # throttle requests to avoid detection
#下载⽬录页
def mly_img(base_url):
    """Download the contents pages (目录页): images ``!00001``…``!00003``."""
    i = 1
    print("仅下载3页")
    while i < 4:
        img_url = base_url + '!0000{}'.format(i) + '?zoom=0&f=0'
        i += 1
        # NOTE(review): garbled lines restored — fetch and save the image.
        response = requests.get(img_url)
        print(img_url)
        with open("D:/pycharm/实战案例/目录页/" + str(i - 1) + '.png', "wb") as f:
            f.write(response.content)
        print("success download")
        time.sleep(2)  # throttle requests to avoid detection
#下载正⽂页
def zwy_img(base_url):
    """Download the body pages (正文页): up to 15 images named 000001…000015."""
    i = 1
    print("仅下载15页")
    # BUG FIX: the original looped `while i < 12` (11 pages despite announcing
    # 15) and branched on `i < 16`, which was always true inside that loop, so
    # the two-digit filename branch was unreachable.  Pages 1-9 use the
    # six-character name 00000N; pages 10-15 use 0000NN.
    while i < 16:
        if i < 10:
            img_url = base_url + '00000{}'.format(i) + '?zoom=0&f=0'
        else:
            img_url = base_url + '0000{}'.format(i) + '?zoom=0&f=0'
        i += 1
        response = requests.get(img_url)
        print(img_url)
        with open("D:/pycharm/实战案例/正文页/" + str(i - 1) + '.png', "wb") as f:
            f.write(response.content)
        print("success download")
        time.sleep(2)  # throttle requests to avoid detection
if __name__ == '__main__':
    # NOTE(review): the paste stripped the URL scheme and ".net"; restored.
    # (The original also carried a large commented-out requests headers/cookies
    # block here, itself garbled; removed — everything is driven via selenium.)
    url = "http://www.ucdrs.superlib.net/login/login.action"
    browser = webdriver.PhantomJS()
    browser.get(url)
    wait = WebDriverWait(browser, 8)
    print("欢迎使用图书查询小程序")
    login()  # log in (captcha is typed manually)
    all_book_url_page = index_page(url)  # search and pick a results page
    detail_book_url = get_detail_book(all_book_url_page)  # open book detail
    sent_book_emial(detail_book_url)  # delivery / download menu
若有错误,请留言告诉我,谢谢!
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论