Python 爬虫:第七章 动态加载数据处理 selenium 模块(25)
第七章 动态加载数据处理 selenium 模块
selenium 基础操作
example 医药局
selenium 的其他自动化操作
example 淘宝
from selenium import webdriver
from  lxml import  etree
from  time import  sleep
# Basic selenium usage: load a page in Chrome, read the page source from the
# browser and print the company names found in it.

# Instantiate a browser object, passing in the path to the browser driver.
bro = webdriver.Chrome(executable_path='./chromedriver')
try:
    # Have the browser issue a request for the given URL.
    # NOTE(review): this URL looks truncated by text extraction (no scheme,
    # incomplete host) - restore the real address before running.
    bro.get('v:81/xk/')
    # page_source is the source of the page currently shown by the browser.
    page_text = bro.page_source
    # Parse the company names out of the page source.
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//ul[@id="gzlist"]/li')
    for li in li_list:
        name = li.xpath('./dl/@title')[0]
        print(name)
    sleep(5)
finally:
    # Close the browser even if loading or parsing fails.
    bro.quit()
# Other selenium automation: type into a search box, click a button, then
# move back and forward through the browser history.
from selenium import webdriver
from time import sleep

# The previous example has already quit its browser, so start a fresh one.
bro = webdriver.Chrome(executable_path='./chromedriver')
try:
    bro.get('https://www.taobao.com/')
    # Locate the search input element.
    search_input = bro.find_element_by_id('q')
    # Interact with the element: type the search term.
    search_input.send_keys('Iphone')
    # Execute a piece of JavaScript.
    # NOTE(review): the JavaScript call this step refers to is absent from the
    # source (presumably an execute_script call) - confirm and restore.
    sleep(2)
    # Click the search button.
    btn = bro.find_element_by_css_selector('.btn-search')
    btn.click()
    bro.get('https://www.baidu.com')
    sleep(2)
    # Go back one page in the history.
    bro.back()
    sleep(2)
    # Go forward one page in the history.
    bro.forward()
    sleep(5)
finally:
    # Close the browser even if one of the steps above fails.
    bro.quit()
动作链和iframe处理
模拟登陆
谷歌无头浏览器+反检测:实现无可视化界面、实现规避检测
# Action chains and iframe handling: drag an element that lives inside an
# iframe by holding the mouse button and moving in small steps.
from selenium import webdriver
# Import the class that implements action chains.
from selenium.webdriver import ActionChains
from time import sleep

bro = webdriver.Chrome(executable_path='./chromedriver')
try:
    bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    # An element inside an iframe can only be located after switching the
    # browser's lookup scope to that iframe.
    bro.switch_to.frame('iframeResult')
    div = bro.find_element_by_id('draggable')
    # Build the action chain.
    action = ActionChains(bro)
    # Press and hold the mouse button on the target element.
    action.click_and_hold(div)
    for i in range(5):
        # perform() executes the queued actions immediately.
        # move_by_offset(x, y): x is horizontal, y is vertical.
        action.move_by_offset(17, 0).perform()
        sleep(0.5)
    # Release the mouse button. Actions are only queued until perform() is
    # called, so the release must be performed explicitly.
    action.release().perform()
finally:
    # Close the browser even if one of the steps above fails.
    bro.quit()
# Simulated login: fill in and submit the QQ Zone login form, which is
# embedded in an iframe.
from selenium import webdriver
from time import sleep

bro = webdriver.Chrome(executable_path='./chromedriver')
try:
    bro.get('https://qzone.qq.com/')
    # The login form elements are looked up inside the 'login_frame' iframe.
    bro.switch_to.frame('login_frame')
    # Presumably switches the form to account/password login - verify on page.
    a_tag = bro.find_element_by_id("switcher_plogin")
    a_tag.click()
    userName_tag = bro.find_element_by_id('u')
    password_tag = bro.find_element_by_id('p')
    sleep(1)
    # Sample credentials from the tutorial; replace with a real account.
    userName_tag.send_keys('328410948')
    sleep(1)
    password_tag.send_keys('123456789')
    sleep(1)
    btn = bro.find_element_by_id('login_button')
    btn.click()
    sleep(3)
finally:
    # Close the browser even if an element cannot be found.
    bro.quit()
实战:基于selenium 实现12306模拟登录
# Headless Chrome plus anti-detection: run without a visible window and
# drop the 'enable-automation' switch.
from selenium import webdriver
from time import sleep
# Used for the headless (no visible window) settings.
from selenium.webdriver.chrome.options import Options
# Used for the anti-detection setting.
from selenium.webdriver import ChromeOptions

# All settings go on ONE options object. In Selenium 3 the deprecated
# chrome_options= keyword replaces options= when both are passed, so the
# original call silently dropped the anti-detection setting.
option = ChromeOptions()
# Run without a visible window.
option.add_argument('--headless')
option.add_argument('--disable-gpu')
# Reduce the risk of selenium being detected by the site.
option.add_experimental_option('excludeSwitches', ['enable-automation'])
# Keep the original module-level name available.
chrome_options = option

bro = webdriver.Chrome(executable_path='./chromedriver', options=option)
try:
    # Headless browser, no visible window (the original note also mentions
    # PhantomJS).
    bro.get('https://www.baidu.com')
    print(bro.page_source)
    sleep(2)
finally:
    # Close the browser even if the request fails.
    bro.quit()
#下述代码为超级鹰提供的示例代码
import  requests
from  hashlib import  md5
class Chaojiying_Client(object):
    """Client for the Chaojiying captcha-recognition HTTP API."""

    def __init__(self, username, password, soft_id):
        """Store the account data and the defaults sent with every request.

        username: account name
        password: plain-text password; only its MD5 hex digest is kept and sent
        soft_id: software ID (generated in the user center)
        """
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        # Form fields included in every request.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload an image for recognition and return the decoded JSON reply.

        im: image bytes
        codetype: captcha type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        # requests needs a full URL including the scheme.
        r = requests.post(
            'http://upload.chaojiying.net/Upload/Processing.php',
            data=params,
            files=files,
            headers=self.headers,
        )
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised image and return the decoded JSON reply.

        im_id: image ID of the captcha being reported
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post(
            'http://upload.chaojiying.net/Upload/ReportError.php',
            data=params,
            headers=self.headers,
        )
        return r.json()
# chaojiying = Chaojiying_Client('bobo328410948', 'bobo328410948', '899370')  # 用户中心>>软件ID 生成一个替换 96001
# im = open('12306.jpg', 'rb').read()  # 本地图片文件路径来替换 a.jpg 有时WIN系统须要//
# print(chaojiying.PostPic(im, 9004)['pic_str'])
#上述代码为超级鹰提供的示例代码
# Open the 12306 login page with selenium, cut the captcha image out of a
# full-page screenshot, send it to Chaojiying for recognition and collect
# the coordinates that have to be clicked.
from selenium import webdriver
import time
from PIL import Image
from selenium.webdriver import ActionChains

bro = webdriver.Chrome(executable_path='./chromedriver')
bro.get('https://kyfw.12306.cn/otn/login/init')
time.sleep(1)
# save_screenshot captures the current page and saves it to a file.
bro.save_screenshot('aa.png')

# Work out the top-left and bottom-right corners of the captcha image; these
# define the region to crop.
code_img_ele = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
location = code_img_ele.location  # x, y of the captcha's top-left corner
print('location:', location)
size = code_img_ele.size  # width and height of the captcha element
print('size:', size)
# (left, top, right, bottom)
rangle = (
    int(location['x']),
    int(location['y']),
    int(location['x'] + size['width']),
    int(location['y'] + size['height']),
)

# The captcha region is now known; crop it out of the screenshot.
code_img_name = './code.png'
with Image.open('./aa.png') as page_img:
    # crop cuts the image down to the given region.
    frame = page_img.crop(rangle)
    frame.save(code_img_name)

# Submit the captcha image to Chaojiying for recognition.
# Sample account from the tutorial; replace with your own credentials and a
# software ID generated in the user center.
chaojiying = Chaojiying_Client('bobo328410948', 'bobo328410948', '899370')
with open(code_img_name, 'rb') as img_file:
    im = img_file.read()
# Call the service once and reuse the answer for printing and parsing.
result = chaojiying.PostPic(im, 9004)['pic_str']
print(result)

# Coordinates of the points to click: [[x1, y1], [x2, y2], ...].
# The parsing expects 'x,y' pairs separated by '|'; splitting on '|' also
# covers the single-point case, which yields one element.
all_list = [
    [int(x), int(y)]
    for x, y in (point.split(',')[:2] for point in result.split('|'))
]
# NOTE(review): the source ends here - clicking the collected points and
# closing the browser (bro.quit()) are not part of the visible script.

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。