Python爬虫之淘宝宝贝图片抓取
写在前面的话:家里有人开淘宝店,作为一个小的淘宝店主,经常要做的就是从别人的店铺(当然是批发商)把图片一张一张存下来,然后再自己用 PS 修得好看一点,再上架。这样一张张存图挺烦人的,刚好最近在学习 Python,发现用它来做这件事真心方便。
总的来说,其实也并没有什么技术含量,只是熟悉一下 Python 语言和正则表达式的使用。
主要步骤:
1、抓取页面的 HTML 代码
import urllib
import urllib2


# Fetch the HTML source of a page.
def getHtml(url):
    """Return the raw response body for ``url``, or None on a URL error.

    The request is sent with the module-level ``headers`` dict (defined in
    the complete listing further down).  When ``urllib2.URLError`` is raised
    its reason is printed and None is returned instead of a body.
    """
    request = urllib2.Request(url, headers=headers)
    try:
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()
    except urllib2.URLError as e:
        print(e.reason)
        return None
2、分析页面中的详情图片部分和主图部分
淘宝的 HTML 页面相当整齐,可读性不错,很快就可以找到他们的描述页位置:descUrl .. location.protocol = 'http:.......'
可以写一个正则表达式,把它提取出来
import re


# Extract the description-page URL.
def descUrl(html):
    """Return all description-page addresses found in ``html``.

    The pattern looks for the ``descUrl`` assignment and captures the text
    that follows ``'//`` in its ``http:`` branch, so each returned item has
    no scheme prefix.  The result is a list (empty when nothing matches).
    """
    reg = r"descUrl.*?location.protocol==='http:' \? '//(.*?)'.?:"
    desurlre = re.compile(reg, re.I)
    return desurlre.findall(html)
再获取这个详情页地址,就可以提取出所有的图片地址了。
# Extract every image address.
def getImglist(html):
    """Return the value of every ``src="..."`` attribute found in ``html``.

    Matching is case-insensitive; the result is a list of strings in
    document order (empty when there is no match).
    """
    reg = r'src=\"(.*?)\"'
    imgre = re.compile(reg, re.I)
    return imgre.findall(html)
3、下载图片
获取到图片的 url 后,接下来就是把图片下载下来,这里做一个保存到指定路径的方法。
因此需要再加一个创建路径的函数
# Create the directory when it does not exist yet.
def createDir(path):
    """Make sure ``path`` exists as a directory.

    A missing path is created with ``os.makedirs`` (parents included).
    When ``path`` already exists as a regular file, ``os.mkdir`` is still
    attempted and raises ``OSError``.  An existing directory is left alone.
    """
    if not os.path.exists(path):
        os.makedirs(path)
    elif os.path.isfile(path):
        # mkdir on an existing file raises OSError rather than silently
        # continuing with an unusable location.
        os.mkdir(path)
保存图片
# Save every image in the list.
def saveImgTo(imglist, path):
    """Download each URL in ``imglist`` into the directory ``path``.

    Files are named ``1.<ext>``, ``2.<ext>`` ... where ``<ext>`` is the text
    after the last dot of the URL.  The counter only advances after a
    successful download, so a failed URL does not consume a number.
    Failures are reported with ``==> err`` and the loop continues.
    """
    createDir(path)
    imgIndex = 1
    for imgurl in imglist:
        filetype = imgurl.rsplit('.', 1)[-1]
        print("saving " + imgurl)
        try:
            urllib.urlretrieve(imgurl, path + "/" + str(imgIndex) + '.' + filetype)
            imgIndex += 1
            print("==> ok!")
        except Exception:
            # Best effort: one bad image must not abort the rest, but
            # Ctrl-C / SystemExit are no longer swallowed.
            print("==> err")
以下为一份完整代码:传入存储路径,保存 URL 列表文件中所有 url 对应的淘宝或其他网页图片。新手上路,写得不好的地方轻拍:
#coding=utf-8

import re
import urllib
import urllib2
import cookielib
import StringIO
import gzip
import os
import sys

# Request headers passed to urllib2.Request by getHtml; they present a
# desktop Chrome User-Agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'
}


# Decompress gzip data.
def gzdecode(data):
    """Return the decompressed bytes of the gzip-compressed string ``data``."""
    # Local import keeps this block self-contained; io.BytesIO is the
    # byte-oriented in-memory buffer, which is what gzip payloads need.
    import io

    gziper = gzip.GzipFile(fileobj=io.BytesIO(data))
    try:
        return gziper.read()  # read the decompressed data
    finally:
        gziper.close()

# Fetch the HTML source of a page.
def getHtml(url):
    """Return the raw response body for ``url``, or None on a URL error.

    The request is sent with the module-level ``headers`` dict.  When
    ``urllib2.URLError`` is raised its reason is printed and None is
    returned instead of a body.
    """
    request = urllib2.Request(url, headers=headers)
    try:
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()
    except urllib2.URLError as e:
        print(e.reason)
        return None

# Create the directory when it does not exist yet.
def createDir(path):
    """Make sure ``path`` exists as a directory.

    A missing path is created with ``os.makedirs`` (parents included).
    When ``path`` already exists as a regular file, ``os.mkdir`` is still
    attempted and raises ``OSError``.  An existing directory is left alone.
    """
    if not os.path.exists(path):
        os.makedirs(path)
    elif os.path.isfile(path):
        # mkdir on an existing file raises OSError rather than silently
        # continuing with an unusable location.
        os.mkdir(path)

# Extract the description-page URL.
def descUrl(html):
    """Return all description-page addresses found in ``html``.

    The pattern looks for the ``descUrl`` assignment and captures the text
    that follows ``'//`` in its ``http:`` branch, so each returned item has
    no scheme prefix.  The result is a list (empty when nothing matches).
    """
    reg = r"descUrl.*?location.protocol==='http:' \? '//(.*?)'.?:"
    desurlre = re.compile(reg, re.I)
    return desurlre.findall(html)

# Extract every image address.
def getImglist(html):
    """Return the value of every ``src="..."`` attribute found in ``html``.

    Matching is case-insensitive; the result is a list of strings in
    document order (empty when there is no match).
    """
    reg = r'src=\"(.*?)\"'
    imgre = re.compile(reg, re.I)
    return imgre.findall(html)
# Extract and download the main (title) images.
def getTitleImg(html, path):
    """Download the images listed in the page's ``auctionImages`` array.

    The first ``auctionImages ... [ ... ]`` occurrence in ``html`` is split
    on commas; each entry has its surrounding quotes removed and is
    prefixed with ``http:`` before being saved into ``path`` as
    ``title1.<ext>``, ``title2.<ext>`` ...  The counter only advances after
    a successful download.  If the page has no ``auctionImages`` array,
    nothing is downloaded.
    """
    createDir(path)
    reg = r'auctionImages.*?\[(.*?)\]'
    imgre = re.compile(reg, re.I)
    matches = imgre.findall(html)
    if not matches:
        # No main-image array on this page: nothing to do (previously an
        # IndexError aborted the whole run here).
        print("==> err")
        return
    imglist = matches[0].split(',')
    titleIndex = 1
    for imgurl in imglist:
        print("img ==== > " + imgurl)
        # Entries may be separated by ", " - drop the whitespace first so
        # the quote characters are actually at the ends.
        imgurl = imgurl.strip().strip('"')
        if not imgurl:
            continue
        imgurl = 'http:' + imgurl
        print(imgurl)
        filetype = imgurl.rsplit('.', 1)[-1]
        try:
            urllib.urlretrieve(imgurl, path + "/title" + str(titleIndex) + '.' + filetype)
            titleIndex += 1
            print("==> ok!")
        except Exception:
            # Best effort: keep going with the remaining images.
            print("==> err")

# Save every image in the list.
def saveImgTo(imglist, path):
    """Download each URL in ``imglist`` into the directory ``path``.

    Files are named ``1.<ext>``, ``2.<ext>`` ... where ``<ext>`` is the text
    after the last dot of the URL.  The counter only advances after a
    successful download, so a failed URL does not consume a number.
    Failures are reported with ``==> err`` and the loop continues.
    """
    createDir(path)
    imgIndex = 1
    for imgurl in imglist:
        filetype = imgurl.rsplit('.', 1)[-1]
        print("saving " + imgurl)
        try:
            urllib.urlretrieve(imgurl, path + "/" + str(imgIndex) + '.' + filetype)
            imgIndex += 1
            print("==> ok!")
        except Exception:
            # Best effort: one bad image must not abort the rest, but
            # Ctrl-C / SystemExit are no longer swallowed.
            print("==> err")

# From one Taobao item page, download the main and detail images.
def getTaoBaoImg(url, savePath):
    """Save the main images and the description images of a Taobao item.

    The item page is fetched, its main images are saved via getTitleImg,
    then the description page address is extracted with descUrl, fetched,
    and every image it references is saved via saveImgTo.
    """
    html = getHtml(url)
    if html is None:
        # The page could not be fetched; there is nothing to parse.
        return
    getTitleImg(html, savePath)
    desurl = descUrl(html)
    if not desurl:
        # No description address on the page: skip the detail images.
        print("==> err")
        return
    # descUrl captures the address after the leading '//', so the scheme has
    # to be added back before the URL can be opened.
    # NOTE(review): the prefix literal was empty in the published listing;
    # "http://" is the reconstruction - confirm against the original script.
    desurl = "http://" + desurl[0]
    print("desurl = " + desurl)
    print("----------------------------------------------------------")
    # Fetch the HTML of the item description page.
    desHtml = getHtml(desurl)
    if desHtml is None:
        return
    imglist = getImglist(desHtml)
    saveImgTo(imglist, savePath)
# ------------------------------------- begin: non-Taobao pages -----------------------------------------
# Extract the list of detail-image URLs from a non-Taobao page.
def getOtherImgurllist(html):
    """Return the value of every ``src="..."`` attribute found in ``html``.

    Matching is case-sensitive and uses ``re.S``, so an attribute value may
    span line breaks.  The result is a list (empty when nothing matches).
    """
    reg = r'src="(.*?)"'
    desre = re.compile(reg, re.S)
    return desre.findall(html)


# Download the detail images of a non-Taobao page.
def getOtherImg(url, savePath):
    """Fetch ``url`` and save every image it references into ``savePath``."""
    html = getHtml(url)
    if html is None:
        # The page could not be fetched; there is nothing to extract.
        return
    imglist = getOtherImgurllist(html)
    saveImgTo(imglist, savePath)

# Extract the main images of a non-Taobao page.
def getOthertitleImg(html, savePath):
    """Placeholder: not implemented yet, it only prints ``todo:``."""
    print("todo:")

# ------------------------------------- end: non-Taobao pages -----------------------------------------

# Save the original address.
def saveUrl(url, savePath):
    """Write ``<savePath>/url.htm``, a small page that redirects to ``url``.

    The page uses a ``meta refresh`` tag, so opening the saved file in a
    browser leads back to the page the images came from.  ``url`` is
    inserted into the markup unchanged.
    """
    page = """<html>
<head>
<meta http-equiv="Content-Language" content="zh-CN">
<meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=gb2312">
<meta http-equiv="refresh" content="0.1;url=""" + url + """\">
<title></title>
</head>
<body>
</body>
</html>"""
    # The context manager closes the file even if the write fails.
    with open(savePath + "/url.htm", "w") as output:
        output.write(page)


# Default output directory; the first command-line argument overrides it.
savepath = "img"

# NOTE(review): the name of the URL list file was lost from the published
# listing (the literal is empty). Set it to the text file that holds one
# page URL per line before running the script.
url_list_file = ''

with open(url_list_file, 'r') as url_file:
    # splitlines() handles \n as well as \r\n endings; splitting on '\r\n'
    # alone left an LF-only file as one giant "URL".
    urls = url_file.read().splitlines()
print(urls)

if len(sys.argv) > 1 and sys.argv[1]:
    savepath = sys.argv[1]

print(savepath)

# Each accepted URL gets its own numbered sub-directory under savepath.
urlIndex = 1
for url in urls:
    # Entries shorter than 10 characters are skipped.
    if len(url) < 10:
        continue
    urlSavePath = savepath + '/' + str(urlIndex)
    createDir(urlSavePath)
    saveUrl(url, urlSavePath)

    print(url)
    if url.find('taobao') != -1:
        getTaoBaoImg(url, urlSavePath)
    else:
        getOtherImg(url, urlSavePath)
    urlIndex += 1

print("success!")
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论