#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Scrape Baijiahao (百家号) articles with Python.
import xlwt
from selenium import webdriver
# from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
import requests
import hashlib
from lxml import etree
class Bjh():
    """Scrape the article and video lists of a Baijiahao page into an .xls file.

    A Chrome session with mobile emulation loads the page, scrolls until the
    page's ``s-loader`` element reports state ``"2"``, extracts one row of text
    fields per list item with lxml XPath queries, and writes the rows to one
    worksheet per content type.

    NOTE(review): the XPath expressions and the meaning of loader state "2"
    (presumably "nothing more to load") are tied to the page markup at the
    time of writing -- confirm against the live page.
    """

    # Container XPath of the list items, then the XPaths tried in order for
    # the title field and for the read-count field of each item.
    _ARTICLE_ITEMS = '//*[@id="article"]/div/div/div'
    _ARTICLE_TITLE = (
        'div/div/div/div/div/div/div[2]/div[1]/text()',
        'div/div/div/div/div[1]/text()',
    )
    _ARTICLE_READ = (
        'div/div/div/div/div[3]/span/text()',
        'div/div/div/div/div/div/div[2]/div[2]/span/text()',
    )
    _VIDEO_ITEMS = '//*[@id="video"]/div/div/div'
    _VIDEO_TITLE = (
        'div/div/div[1]/div[2]/div/div[1]/text()',
        'div/div/div/div/div[1]/text()',
    )
    _VIDEO_READ = (
        'div/div/div[1]/div[2]/div/div[3]/span/text()',
        'div/div/div/div/div/div/div[2]/div[2]/span/text()',
    )

    def __init__(self, driver_path="chromedriver",
                 user_data_dir=r"C:\Users\redhat\AppData\Local\Google\Chrome\User Data"):
        """Create the output workbook and start Chrome in mobile emulation.

        Args:
            driver_path: Path of the chromedriver executable.
            user_data_dir: Chrome profile directory passed as ``user-data-dir``.
        """
        self.wb = xlwt.Workbook()
        chrome_options = webdriver.ChromeOptions()
        mobile_emulation = {'deviceName': 'iPhone 6/7/8 Plus'}
        chrome_options.add_experimental_option('mobileEmulation', mobile_emulation)
        chrome_options.add_argument("user-data-dir=" + user_data_dir)
        # Selenium 3 keyword names, matching the find_element_by_* calls below.
        self.driver = webdriver.Chrome(executable_path=driver_path,
                                       chrome_options=chrome_options)

    def open(self, url):
        """Navigate the browser to *url*."""
        self.driver.get(url=url)

    def hua(self):
        """Wait one second, then scroll to the bottom of the page."""
        time.sleep(1)
        print("滑动加载中")
        self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")

    @staticmethod
    def _loader_state(element):
        """Return the single state character encoded in a loader's innerHTML.

        The character is the third from the end of the second line of the
        markup; an IndexError is raised when the markup has no second line.
        """
        inner_html = element.get_attribute('innerHTML')
        return str(inner_html).split("\n")[1][-3:-2]

    def check_state(self):
        """Return the state character of the first ``s-loader`` element."""
        loader = self.driver.find_element_by_class_name("s-loader")
        return self._loader_state(loader)

    def check_state_video(self):
        """Return the state character of the second ``s-loader`` element."""
        loader = self.driver.find_elements_by_class_name("s-loader")[1]
        return self._loader_state(loader)

    @staticmethod
    def _first_match(node, xpaths):
        """Return the result of the first XPath in *xpaths* that matches anything.

        Returns the (empty) result of the last XPath when none of them match.
        """
        found = []
        for path in xpaths:
            found = node.xpath(path)
            if found:
                break
        return found

    def _collect_rows(self, items_xpath, title_xpaths, read_xpaths):
        """Parse the current page and return one ``title + read`` list per item."""
        tree = etree.HTML(self.driver.page_source)
        return [
            self._first_match(item, title_xpaths) + self._first_match(item, read_xpaths)
            for item in tree.xpath(items_xpath)
        ]

    def get_article(self):
        """Return the text rows of every item under the ``article`` container."""
        return self._collect_rows(self._ARTICLE_ITEMS, self._ARTICLE_TITLE,
                                  self._ARTICLE_READ)

    def tab_video(self):
        """Click the fourth tab of the page header to switch to the video list."""
        ele = self.driver.find_element_by_xpath(
            '//*[@id="app"]/div/div[3]/div/div[1]/div/div/div[4]')
        ele.click()

    def get_video(self):
        """Return the text rows of every item under the ``video`` container."""
        return self._collect_rows(self._VIDEO_ITEMS, self._VIDEO_TITLE,
                                  self._VIDEO_READ)

    def write(self, title, type, data):
        """Write *data* to a new worksheet named *type*.

        Args:
            title: Unused here; kept so existing callers keep working.
            type: Sheet name. ``"article"`` rows get up to four columns,
                every other type up to three.
            data: Iterable of rows (lists of cell values). Empty rows are
                skipped and do not consume a worksheet row.
        """
        ws = self.wb.add_sheet(type)
        max_cols = 4 if type == "article" else 3
        index = 0
        for row in data:
            if not row:
                continue
            # Rows can be shorter than max_cols when an XPath matched fewer
            # text nodes; write what is there instead of raising IndexError.
            for col, value in enumerate(row[:max_cols]):
                ws.write(index, col, value)
            index += 1

    def _scroll_until_loaded(self, read_state):
        """Keep scrolling until ``read_state()`` returns ``"2"``."""
        while True:
            self.hua()
            if read_state() == "2":
                break

    def run(self, url, title):
        """Scrape articles, then videos, from *url* and save ``<title>.xls``.

        The browser is always shut down, even when a step raises.
        """
        try:
            self.open(url)
            self._scroll_until_loaded(self.check_state)
            self.write(title, "article", self.get_article())

            # Three refreshes with the original pauses before switching tabs.
            for pause in (1, 1, 3):
                self.driver.refresh()
                time.sleep(pause)

            # Best-effort: the tab may not be clickable yet, so report the
            # error, refresh and try again until the click succeeds.
            while True:
                try:
                    time.sleep(1)
                    self.tab_video()
                    break
                except Exception as e:
                    print(e)
                    self.driver.refresh()

            self._scroll_until_loaded(self.check_state_video)
            self.write(title, "video", self.get_video())
            self.wb.save(title + ".xls")
        finally:
            # quit() closes every window and ends the chromedriver session.
            self.driver.quit()
if __name__ == '__main__':
    # Ask for the output name (the workbook is saved as "<title>.xls") and the
    # page to scrape; strip stray whitespace that pasting tends to add.
    title = input("请输⼊标题:").strip()
    url = input("请输⼊url:").strip()
    bjh = Bjh()
    bjh.run(url, title)

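# Copyright notice: all content on this site comes from the Internet and is for
# demonstration only; do not use it for commercial or other illegal purposes.
# If your rights have been infringed, contact us via QQ: 729038198 and we will
# remove the content within 24 hours.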