爬虫爬取小说网站，实现单本小说的自动下载
import requests,random,os,re
from bs4 import BeautifulSoup
HomeUrl = 'www.22ff'
header = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36' }
# 搜索小说，确定是否存在
def get_book_url(bookName):
try:
url = 'www.22ff/txt/{}'.format(bookName)
soup = (url=url,headers=header)
html = , 'lxml')
# print(soup.status_code)
book_list = html.select('irong ul li a')
link = None
for i in range(book_list.__len__()-3):
if bookName == book_list[i].get_text():
# ⼩说的下载页⾯地址
link = HomeUrl+book_list[i+3].get('href')
# print(link)
return link
except Exception as e:
print(e)
print('这⾥出错')
# get_book_url('大国文娱')
# 创建保存小说的文件夹
def mkdirs():
path = './⼩说/'
if not ists(path):
os.mkdir(path)
print('⽂件夹已创建成功...')
return path
else:
网页html下载# print('⽂件夹已存在...')
return path
# print(checkFile())
# 判断小说是否已下载
def checkFile(path,bookName):
flag = False
for each in os.walk(path):
for i in each[2]:
if bookName == i:
flag = True
return flag
# print(checkFile('./小说/','宇宙交易系统'))
# 进入下载页面，获取下载链接。如果用 selenium 写，这个函数就很简单；我这个用的笨方法，你们如果有兴趣自己改改试试
def down_book(bookName):
url1 = get_book_url(bookName)
if url1 == None:
print('暂未搜索到该⼩说...')
else:
print('已搜索到该资源,正在获取链接...')
try:
try:
data = (url1, headers=header)
if data.status_code == 200:
html = , 'lxml')
# 通过字符串切割,到真正的下载链接
link = html.select('div.down_bar script')[0]
listlink = str(link).split('"')[1]
ur = listlink.split('/')
# 通过拼接等⽅法,获取下载链接
RightUrl = ur[0]+'//67.229.159.202/'+ur[3]+'/'+ur[4]+'/'+ur[5]
return RightUrl
else:
print('⽹页请求出错,获取链接失败...')
except Exception as e:
print(e)
return None
# down_book('大国文娱')
# 主要负责下载小说
def main():
while True:
bookName = input('请输⼊要下载的⼩说名字,[n或N退出]:')
if bookName == 'n' or bookName =='N':
break
else:
# 获取⽂件保存的路径
path = mkdirs()
# 要下载的⼩说名
book = bookName+'.txt'
flag = checkFile(path,book)
if flag == False:
# 获取下载链接
RightUrl = down_book(bookName)
if RightUrl != None:
print('成功获取链接,正在下载,请耐⼼等待...')
try:
bookdata = (url=RightUrl,headers=header)
if bookdata.status_code == 200:
with open(path+book,'wb') as f:
f.t)
size = size(path+bookName+'.txt')
print('⼩说下载成功!共%.2fM' %(size/1024/1024))
else:
print('⽹页请求出错,⼩说下载失败...')
except Exception as e:
print(e)
else:
print('⽆法获取链接...')
else:
print('该⼩说已存在...')
if __name__ == '__main__':
# pass
main()
input('回车键结束...')
只要输入小说名称，就会查找该小说，如果笔趣阁有这部小说，就会自动下载。（在获取小说的实际下载链接的时候，我用了很笨的方法，用了各种切割、拼接。但是如果用 selenium，很容易就可以获取了，自己去尝试吧！能力有限，各位海涵~~~）
如果想理解代码，可以自己尝试着单独运行代码中的每个函数，输出它们的值，慢慢理解各个函数的功能。
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论