Python crawler: read a batch of URLs from a CSV file, scrape the required information, and save it to a database or a local CSV file
1. Crawling scenario:
Most crawling jobs involve a list page plus the detail pages behind it. The usual approach is to first scrape the list page for its summary fields and the links to the detail pages, then read those detail-page URLs back out and crawl the detail pages themselves.
2. Crawling steps:
a. Use the pandas data-cleaning package or numpy to work with the CSV file: read it, remove duplicates, save it back, or write it straight into a database (a short sketch follows this list).
b. Request each URL (GET/POST) with the requests package to obtain the HTML source. This only applies to pages that are not loaded dynamically, i.e. the fields you need actually show up when you print the source; if they do not, a dedicated scraping tool is usually more convenient.
c. Once you have the source, use the BeautifulSoup package: its select and find_all methods return the elements you want, and in Chrome you can usually just copy the CSS selector of the element you need.
d. Append the scraped values to lists.
e. Finally write the list data into the database. When looping over the URLs, save each URL's record as soon as it is scraped instead of waiting until every URL has been read: with a large data set the run may die halfway and all the data would be lost, whereas saving one record at a time lets you pick the crawl up where it stopped.
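For step a, here is a minimal sketch of the pandas clean-up (the file name urls.csv is only a placeholder, not a path from the original project):

# minimal sketch of step a: read the csv of urls, drop duplicate rows, write it back
# (urls.csv is a placeholder path)
import pandas as pd
data = pd.read_csv("urls.csv", encoding="utf_8_sig")
data = data.drop_duplicates()  # drop_duplicates() removes duplicate rows
# utf_8_sig keeps Chinese text from turning into mojibake when the file is opened in Excel
data.to_csv("urls.csv", index=False, encoding="utf_8_sig")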
3. Full code:
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
# pandas display options: show the full csv content instead of truncating it with ellipses
pd.set_option('display.max_columns', 28000)
pd.set_option('display.width', 28000)
pd.set_option('display.max_colwidth', 28000)
import MySQLdb
import datetime
import time
import sys
reload(sys)
sys.setdefaultencoding('utf8')
# mmsi
mmsiList = []
# country of each vessel
cityList = []
# size
sizeList = []
# callSign
callSignList = []
# GRT
grtList = []
# DWT
dwtList = []
# owner
ownerList = []
# build
buildList = []
#
# Send the request and return the HTML response
def get_html(url):
    # use a browser User-Agent header
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    # Proxy support: free proxies usually only live for a short while, replace with your own
    resp = requests.get(url, headers=headers)
    # proxies = {'http': '111.23.10.27:8080'}
    # try:
    #     # plain GET with the Requests library
    #     print "url :" + url
    #     resp = requests.get(url, headers=headers)
    #     print ('no proxy ip used')
    # except:
    #     print ('using proxy ip')
    #     # if the request gets blocked, fall back to the proxy
    #     resp = requests.get(url, headers=headers, proxies=proxies)
    return resp
# Return every page that should be crawled
def all_page():
    # A few hard-coded example urls that were used while testing:
    # url2 = "shiptracking/vessels/alvsnabben-3-mmsi-265547220-imo-0"
    # url3 = "shiptracking/vessels/zefyros-mmsi-240485000-imo-9324382"
    # url4 = "shiptracking/vessels/xin-hai-shan-mmsi-413492410-imo-9830563"
    # url5 = "shiptracking/vessels/pilot22-mmsi-477995056-imo-0"
    # url6 = "shiptracking/vessels/earl-w-redd-mmsi-367765170-imo-0"
    # url7 = "shiptracking/vessels/confide-mmsi-244710115-imo-0"
    # url8 = "shiptracking/vessels/christina-m-mmsi-205415890-imo-0"
    # url9 = "shiptracking/vessels/olieservice-4-mmsi-244670165-imo-0"
    # url10 = "shiptracking/vessels/sineo-mmsi-244700521-imo-0"
    # url11 = "shiptracking/vessels/bow-engineer-mmsi-258767000-imo-9317860"
    # listurl = [url2, url3, url4, url5, url6, url7, url8, url9, url10, url11]
    # for b in listurl:
    #     print "b : " + b
    # print listurl
    # return listurl
    # Read the url column out of the csv file instead
    listur2 = read_csv_info("C:/pcdata/shipdata/shipxiangqing/MyShipTracking1and512.csv")
    return listur2
# Read the csv file with pandas/numpy and return the url column as a list for the crawler
def read_csv_info(file_name):
    # Optional pandas clean-up: read the csv, drop duplicate rows and save it back
    # data = pd.read_csv(file_name, encoding="utf_8_sig")
    # print (data)
    # # drop_duplicates() removes duplicate rows
    # data_quchong = data.drop_duplicates()
    # # encoding utf_8_sig prevents garbled characters when writing the csv
    # data_quchong.to_csv(file_name, encoding="utf_8_sig")
    # print ("deduplication done!")
    # print (data_quchong)
    # Read one column of the csv with numpy into an array: skip the header row,
    # take the first column (index 0), keep the values as strings
    lists = np.loadtxt(file_name, delimiter=",", skiprows=1, usecols=0, dtype=str)
    print "**********************"
    print lists
    print "**********************"
    return lists
# Scrape the required fields from every detail page
def get_data():
    # loop over every page returned by all_page()
    for url in all_page():
        print "html url:" + url
        # fetch the HTML source for this url
        html = get_html(url).text
        # print (html)
        # parse with the lxml parser (html.parser also works)
        soup = BeautifulSoup(html, 'lxml')
        # mmsi
        mmsi = soup.select("#content_in > div > div.listbox.listbox_tr1.ads_160_right > div.listbox_content.can_select.tablevessel > div.vessels_ll > table > tbody > tr:nth-child(3) > td:nth-child(2)")
        for m in mmsi:
            print "mmsi : " + m.get_text()
            mmsiList.append(m.get_text())
        # country
        city = soup.select('#content_in > div > div.listbox.listbox_tr1.ads_160_right > div.listbox_content.can_select.tablevessel > div.vessels_ll > table > tbody > tr:nth-child(2) > td:nth-child(2)')
        for c in city:
            print "country : " + c.get_text()
            cityList.append(c.get_text())
        # call sign
        callSign = soup.select('#content_in > div > div.listbox.listbox_tr1.ads_160_right > div.listbox_content.can_select.tablevessel > div.vessels_main_data.cell > table > tbody > tr:nth-child(5) > td:nth-child(2)')
        for h in callSign:
            print "call sign : " + h.get_text()
            callSignList.append(h.get_text())
        # size
        size = soup.select("#content_in > div > div.listbox.listbox_tr1.ads_160_right > div.listbox_content.can_select.tablevessel > div.vessels_ll > table > tbody > tr:nth-child(7) > td:nth-child(2)")
        for s in size:
            print "size : " + s.get_text()
            sizeList.append(s.get_text())
        # GRT
        grt = soup.select("#content_in > div > div.listbox.listbox_tr1.ads_160_right > div.listbox_content.can_select.tablevessel > div.vessels_ll > table > tbody > tr:nth-child(10) > td:nth-child(2)")
        for g in grt:
            print "GRT : " + g.get_text()
            grtList.append(g.get_text())
        # DWT
        dwt = soup.select("#content_in > div > div.listbox.listbox_tr1.ads_160_right > div.listbox_content.can_select.tablevessel > div.vessels_ll > table > tbody > tr:nth-child(11) > td:nth-child(2)")
        for d in dwt:
            print "DWT : " + d.get_text()
            dwtList.append(d.get_text())
        # owner
        owner = soup.select("#content_in > div > div.listbox.listbox_tr1.ads_160_right > div.listbox_content.can_select.tablevessel > div.vessels_ll > table > tbody > tr:nth-child(12) > td:nth-child(2)")
        for o in owner:
            print "owner : " + o.get_text()
            ownerList.append(o.get_text())
        # build
        build = soup.select("#content_in > div > div.listbox.listbox_tr1.ads_160_right > div.listbox_content.can_select.tablevessel > div.vessels_ll > table > tbody > tr:nth-child(13) > td:nth-child(2)")
        for bi in build:
            print "build : " + bi.get_text()
            buildList.append(bi.get_text())
        # save this url's record before moving on to the next url
        for mmsiLists, cityLists, callSignLists, sizeLists, grtLists, dwtLists, ownerLists, buildLists in zip(mmsiList, cityList, callSignList, sizeList, grtList, dwtList, ownerList, buildList):
            xiang = [mmsiLists, cityLists, callSignLists, sizeLists, grtLists, dwtLists, ownerLists, buildLists]
            # open the database connection
            conn = MySQLdb.Connect(
                host='47.100.162.232',
                port=3306,
                user='dac',
                passwd='dac',
                db='cmp_dac',
                charset='utf8'
            )
            # get an operation cursor with cursor()
            cursor = conn.cursor()
            try:
                sql_insert = "INSERT INTO t_ship_detail(mmsi,country,call_sign,s_size,grt,dwt,owner,build) VALUES(%s,%s,%s,%s,%s,%s,%s,%s)"
                cursor.execute(sql_insert, xiang)
                # commit to the database
                conn.commit()
                print ('ship detail saved!')
            except Exception as e:
                # roll back if anything went wrong
                print e
                conn.rollback()
            cursor.close()
            # close the database connection
            conn.close()
        # clear the lists so the next url starts from scratch
        print ('clearing the lists...')
        del mmsiList[:]
        del cityList[:]
        del callSignList[:]
        del sizeList[:]
        del grtList[:]
        del dwtList[:]
        del ownerList[:]
        del buildList[:]
# Save the lists to a local csv file; fine for small data sets, use the database for large ones
# def insert_csv():
#     mmsi_column = pd.Series(mmsiList, name='mmsi')
#     city_column = pd.Series(cityList, name='city')
#     callSign_column = pd.Series(callSignList, name='call_sign')
#     size_column = pd.Series(sizeList, name='size')
#     grt_column = pd.Series(grtList, name='grt')
#     dwt_column = pd.Series(dwtList, name='dwt')
#     owner_column = pd.Series(ownerList, name='owner')
#     build_column = pd.Series(buildList, name='build')
#     save = pd.DataFrame({'mmsi': mmsi_column, 'city': city_column, 'call_sign': callSign_column, 'size': size_column, 'grt': grt_column, 'dwt': dwt_column, 'owner': owner_column, 'build': build_column})
#     save.to_csv(r"C:/pcdata/shipdata/shipxiangqing/MyShipTracking1and512data.csv")
if __name__ == "__main__":
    get_data()
    # insert_csv()
    # test reading the urls from the csv file
    # all_page()
    print ("crawl finished...")