23hh novel site (23hh小说网) crawler 1.0, Python. Changes in this version:
1. decode('gbk') is now decode('gbk','replace'), so bytes that cannot be decoded are replaced with ? instead of aborting the crawl.
2. The crawl is multi-threaded instead of single-threaded, which makes it noticeably faster.
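A minimal sketch of what change 1 does (the byte string below is a made-up example, not taken from the site):

raw = '\xd6\xd0\xce\xc4\xff'           # GBK bytes for "中文" plus one invalid trailing byte
# raw.decode('gbk')                    # would raise UnicodeDecodeError on the last byte
text = raw.decode('gbk', 'replace')    # invalid bytes become the replacement character u'\ufffd'
print text.encode('utf-8')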
#!/usr/bin/env python
# -*- coding:utf-8 -*-

# --------------------------------------------
# Program:  [23hh novel site] crawler
# Version:  0.2.2
# Author:   Silence
# Date:     2014-04-08
# Features: 1. Given a table-of-contents page, fetch every chapter of the novel and save them to file
#           2. Given the page of the chapter currently being read, fetch that chapter and every one after it
#           3. Retry mechanism when a page fails
# ---------------------------------------------
import threading
import urllib2
import re
import os
from Queue import Queue

class Spider_Thread(threading.Thread):
    """Crawling pages in a single thread is slow, so the work is spread
    across several threads; page info is handed to each worker through a queue."""

    def __init__(self, t_name, queue):
        threading.Thread.__init__(self, name = t_name)
        self.data = queue
        self.novel = None
        self.errorInfo = {}

    def run(self):
        while self.data.qsize() > 0:
            pageInfo = self.data.get()
            print 'Thread %s is crawling page %d' % (self.getName(), pageInfo.keys()[0])
            try:
                self.novel = Novel_Tool(pageInfo.values()[0]['pageurl'], 'N')
                decodePageResp = self.novel.getDecodePage(pageInfo.values()[0]['pageurl'])
                pageContent = self.novel.getPageContent(decodePageResp)
                self.novel.writeToFile(pageContent, pageInfo.values()[0]['pagename'])
            except Exception, e:
                print 'Error while crawling page %d' % pageInfo.keys()[0]
                self.errorInfo[pageInfo.keys()[0]] = pageInfo.values()[0]
                pass
        if self.errorInfo.__len__() > 0:
            print 'Pages that failed:\n', self.errorInfo
        self.novel = None

# Mostly provides shared helper methods; not meant to be used as the main entry point
class Novel_Tool():

    def __init__(self, weburl, saveAsOne):
        self.url = weburl
        self.headers = {
            'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
        }
        self.saveAsOne = saveAsOne
        self.pagesInfo = {}
        self.errorPage = []

    # Work out the page encoding; these novel sites tend to use gbk.
    # Some less well-behaved sites do not actually use the charset they declare
    # in the page, but that case is ignored for now.
    def getPageType(self, content):
        pattern = re.compile('charset=.*?"')
        pagetype = pattern.search(content).group()
        pagetype = pagetype[8:len(pagetype) - 1]
        return pagetype
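    # For example (hypothetical markup, for illustration only): on a page whose head contains
    #   <meta http-equiv="Content-Type" content="text/html; charset=gbk" />
    # getPageType returns 'gbk', which getDecodePage below passes straight to str.decode().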

    def start(self):
        if self.url.find('html') > 0:
            self.spiderPagesFromCurrent()
        else:
            pageInfos = self.getAllUrlsAndNames()
            self.spiderAllPagesFromOne(pageInfos)
        self.doStat()

    def doStat(self):
        print 'Tried %d chapters in total, of which %d were fetched successfully' % (self.pagesInfo.__len__(), self.pagesInfo.__len__() - self.errorPage.__len__())
        print 'Failed chapters:', self.errorPage

    def retryErrorPage(self, errorPages):
        print 'Retrying the failed pages....'
        self.spiderAllPagesFromOne(errorPages)

    def spiderPagesFromCurrent(self):
        pageurl = self.url
        index = 1
        while pageurl.find('index.html') == -1:
            try:
                decodePageResp = self.getDecodePage(pageurl)
                pageContent = self.getPageContent(decodePageResp)

                self.writeToFile(pageContent, self.getPageTitle(decodePageResp))
                pageurl = self.getNextPage(decodePageResp)
            except Exception, e:
                print 'Error while crawling page %d' % index
                self.errorPage.append({index: pageurl})
                pass
            finally:
                index = index + 1

    # Walk through every chapter in order and crawl it
    def spiderAllPagesFromOne(self, pageInfos):
        for index, pageInfo in pageInfos.items():
            print 'Crawling page %d...' % index
            try:
                decodePageResp = self.getDecodePage(pageInfo['pageurl'])
                pageContent = self.getPageContent(decodePageResp)
                self.writeToFile(pageContent, pageInfo['pagename'])
            except Exception, e:
                print 'Error while crawling page %d' % index
                self.errorPage.append({index: pageInfo['pageurl']})
                pass

    # Get the chapter title
    def getPageTitle(self, content):
        charToTitleRex = re.compile('h1>(.|\s)*?</h1')
        pageTitle = charToTitleRex.search(content).group()
        pageTitle = pageTitle[3:len(pageTitle) - 4]
        return pageTitle

    def writeToFile(self, content, filename):
        # Save under a Novels directory next to the script; if a non-directory
        # entry named Novels is already there, move it aside as Novels.bak first
        if os.path.exists(os.getcwd() + '/Novels'):
            if not os.path.isdir(os.getcwd() + '/Novels'):
                os.rename('Novels', 'Novels.bak')
                os.mkdir(os.getcwd() + '/Novels')
        else:
            os.mkdir(os.getcwd() + '/Novels')

        if self.saveAsOne == 'N':
            ofile = open(os.getcwd() + '/Novels/' + filename, 'w')
        else:
            # single-file mode: append every chapter to one file
            # (the exact file name is an assumption here)
            ofile = open(os.getcwd() + '/Novel.txt', 'a')

        try:
            ofile.write(content)
        except Exception, e:
            print 'Error while saving page', filename, '!'
            pass
        finally:
            ofile.close()

    def getDecodePage(self, pageurl):
        req = urllib2.Request(
            url = pageurl,
            headers = self.headers
        )
        # print pageInfo['pageurl']
        pageResponse = urllib2.urlopen(req).read()
        decodePageResp = pageResponse.decode(self.getPageType(pageResponse), 'replace').encode('utf-8')
        return decodePageResp

    # Chapter content
    def getPageContent(self, decodePageResp):
        contentPattern = re.compile('(<dd id="contents">)((.|\s)*?)(</dd>)')
        content = contentPattern.search(decodePageResp).group(2)
        content = self.replaceWebTag(content)
        return content

    # Get the URL of the next page
    def getNextPage(self, content):
        # First locate the footer link block
        footlinkRex = re.compile('(footlink">)(.*?)</dd>')
        foot = footlinkRex.search(content).group(2)
        pattern = re.compile(r'(返回目录.*?(<a.*?">下一页))')
        m = pattern.search(foot).groups()
        nextUrl = m[len(m) - 1][9:m[len(m) - 1].find('">')]

        return self.url[0:self.url.rfind('/') + 1] + nextUrl

    def getAllUrlsAndNames(self):
        # Request the table-of-contents page and collect every chapter name and link
        req = urllib2.Request(
            url = self.url,
            headers = self.headers
        )
        myResponse = urllib2.urlopen(req).read()
        decodeResp = myResponse.decode(self.getPageType(myResponse)).encode('utf-8')

        print 'Analysing the table-of-contents page, please wait...'
        pageRex = re.compile('<a href=".*?</td>')   # regex for the chapter page links
        pageUrlRex = re.compile('".*?"')            # regex for the chapter url
        pageNameRex = re.compile('>.*?<')           # regex for the chapter name

        pages = pageRex.findall(decodeResp)
        index = 1
        for page in pages:
            pageurl = pageUrlRex.search(page).group()
            pageurl = pageurl[1:len(pageurl) - 1]
            pageurl = self.url + pageurl

            pagename = pageNameRex.search(page).group()
            pagename = pagename[1:len(pagename) - 1]

            # print pagename + ' ' + pageurl
            self.pagesInfo[index] = {
                'pagename' : pagename,
                'pageurl' : pageurl
            }
            index = index + 1
        print 'Table of contents analysed; this novel has %d chapters' % len(self.pagesInfo)
        return self.pagesInfo

    def getNovelName(self, content):
        titleRex = re.compile('<h1>.*?</h1>')
        title = titleRex.search(content).group()
        return title[4:len(title) - 5]

    # Clean up the web tags that appear inside the chapter text
    def replaceWebTag(self, content):
        charToNoneRex = re.compile(r'&nbsp;')
        charToNewLineRex = re.compile("<br />|<br>|<br/>")

        content = charToNoneRex.sub("", content)
        content = charToNewLineRex.sub("\n", content)
        return content

if __name__ == '__main__':
    print u"""
# * # * # * # * # * # * # * # * # * # * # * # * # * # * # * # * # * # * # * # * #
# Program:  [23hh novel site] crawler                                             #
# Version:  1.0                                                                   #
# Author:   Silence                                                               #
# Date:     2014-04-08                                                            #
# Usage:    enter the table-of-contents URL of the novel you want when prompted;  #
#           crawling then starts automatically                                    #
# Features: 1. Given a table-of-contents page, fetch every chapter listed on it   #
#              (default: 争霸天下 on 23hh);                                        #
#           2. Given the page of the chapter currently being read, fetch that     #
#              chapter and every one after it;                                    #
#           Chapters are saved under a Novels directory next to the script;       #
#           if a Novels entry already exists there it is renamed to Novels.bak    #
# * # * # * # * # * # * # * # * # * # * # * # * # * # * # * # * # * # * # * # * #"""

    novelUrl = raw_input('Enter the URL of the novel to crawl (default: the chapter list of 争霸天下 on 23hh)\n')
    if novelUrl == '':
        novelUrl = 'http://www.23hh.com/book/43/43957/'
    elif novelUrl.find('html') > 0:
        novelUrl = novelUrl

    saveAsOne = raw_input('Save everything as one file? Y for yes, N for no\n')
    if saveAsOne not in ['Y', 'N']:
        saveAsOne = 'N'

    Novel = Novel_Tool(novelUrl, saveAsOne)

    if not novelUrl.find('html') > 0:
        queue = Queue()
        pageInfos = Novel.getAllUrlsAndNames()
        for key, value in pageInfos.items():
            queue.put({key: value})

        thread1 = Spider_Thread('thread1', queue)
        thread2 = Spider_Thread('thread2', queue)
        thread3 = Spider_Thread('thread3', queue)
        thread4 = Spider_Thread('thread4', queue)
        thread1.start()
        thread2.start()
        thread3.start()
        thread4.start()
    else:
        Novel.start()
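
If the main script should wait for the workers to finish before exiting, a minimal sketch (reusing the Spider_Thread class and the queue built above; this variant is an alternative to the four thread1..thread4 lines, not part of the script itself) could look like this:

    threads = [Spider_Thread('thread%d' % i, queue) for i in range(1, 5)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()    # block until this worker has drained its share of the queue
    print 'All worker threads finished'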