Python Web Crawler: Scraping and Visual Analysis of 51job (前程无忧) Data

I. Background of the Topic
Why was this topic chosen, and what is the expected goal of the data analysis? (10 points)
This project uses a web crawler to collect job listings from 51job (前程无忧), cleans the scraped data to extract the usable fields, and then analyzes those fields across several dimensions. 51job offers companies and job seekers a full range of human-resources services, including recruitment, job search, employment, and training; the aim here is to turn its listings into data that reads clearly and intuitively.
II. Design Plan for the Themed Web Crawler (10 points)
1. Crawler name: "51job crawler with data cleaning and analysis".
2. Content to scrape and characteristics of the data:
The crawler analyzes the structure of the site's pages to extract data. Since 51job is a recruitment and job-search site, the scraped fields cover company name, job title, salary, required work experience, education, city, number of openings, company size, and so on. These multi-dimensional records are then cleaned and filtered; the cleaned data contains no duplicate rows and no null values, which makes it more reliable.
3. Overview of the crawler design:
The crawler works in several steps: set request headers so the site does not identify the client as a crawler, fetch each results page with requests, parse the response with lxml's etree, locate the target data, and save it to a CSV file. A minimal sketch of this pipeline follows; the full program appears in Section IV.
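The sketch below fetches a single results page and writes three of the fields to a CSV file. The search URL matches the one used in Section IV; the shortened User-Agent string and the output file name qcwy_demo.csv are placeholders chosen for this example.

import csv
import json
import requests
from lxml import etree

# A browser-like User-Agent so the site serves the normal search page
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html'
r = requests.get(url, headers=headers, timeout=30)

# The listings live in a JSON object embedded in a <script> tag
html = etree.HTML(r.text)
script = html.xpath('//script[@type="text/javascript"]/text()')[0]
payload = script.strip().replace('window.__SEARCH_RESULT__ = ', '')
jobs = json.loads(payload)['engine_search_result']

# Append a few fields per job to a CSV file (placeholder file name)
with open('qcwy_demo.csv', 'a+', encoding='gbk', newline='') as f:
    writer = csv.writer(f)
    for job in jobs:
        writer.writerow([job['company_name'], job['job_name'], job['providesalary_text']])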
III. Structural Analysis of the Target Pages (10 points)
Data source: https://search.51job.com
Relevant page source: (screenshot of the page source not reproduced here)
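The important structural feature of the search page is that the job listings are not in the visible HTML: they are embedded as a JSON object assigned to window.__SEARCH_RESULT__ inside a <script type="text/javascript"> tag, which is what the crawler in Section IV extracts. A trimmed illustration of its shape, with purely hypothetical values but the field names actually used by the crawler, looks like this:

<script type="text/javascript">
window.__SEARCH_RESULT__ = {
    "engine_search_result": [
        {
            "company_name": "某某科技有限公司",
            "job_name": "Python开发工程师",
            "providesalary_text": "1-1.5万/月",
            "jobwelf": "五险一金 年终奖",
            "attribute_text": ["上海", "3-4年经验", "本科", "招2人"],
            "companysize_text": "150-500人",
            "companyind_text": "计算机软件"
        }
    ]
}
</script>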
IV. Web Crawler Program Design (10 points)
1. Data scraping
import requests
import time
import re
import csv
import json
import pandas as pd
from lxml import etree
from urllib.parse import urlencode

# Create the CSV file and write the header row
# (newline='' keeps csv.writer from inserting blank lines on Windows)
file = open('qcwy.csv', 'a+', encoding='gbk', newline='')
writer = csv.writer(file)
writer.writerow(['公司', '岗位', '薪资', '福利', '工作经验', '学历', '城市', '招聘人数', '公司规模', '公司方向'])
file.close()

# Loop over the result pages (pages 1-9 here)
for page in range(1, 10):
    try:
        url0 = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,{}.html?'.format(page)
        # Set request headers so the site does not flag the client as a crawler
        headers = {
            'Connection': 'keep-alive',
            'Host': 'search.51job.com',
            # 'Cookie': 'guid=011c029d4be2f1535b1488058cc65d73; _ujz=MTYwMzQwMDYxMA%3D%3D; ps=needv%3D0; 51job=cuid%3D160340061%26%7C%26cusername%3Dphone_133********_201907142883%26%7C%26cpassword%3
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
        }
        # Fixed query-string parameters appended to the URL
        params = {
            'lang': 'c',
            'postchannel': '0000',
            'workyear': '99',
            'cotype': '99',
            'degreefrom': '99',
            'jobterm': '99',
            'companysize': '99',
            'ord_field': '0',
            'dibiaoid': '0',
            'line': '',
            'welfare': '',
        }
        # Assemble the final URL
        url = url0 + urlencode(params)
        print(url)
        # Send the request; raise an error if it takes longer than 30 seconds
        r = requests.get(url, headers=headers, timeout=30)
        # Parse the response text into an HTML tree
        html = etree.HTML(r.text)
        # Locate the <script> tag that carries the search results
        nr = html.xpath('//script[@type="text/javascript"]/text()')[0].replace('\n', '').replace('\t', '').replace('window.__SEARCH_RESULT__ = ', '')
        # Convert the string to JSON
        datas = json.loads(nr)['engine_search_result']
        # Loop over the postings and pull out each field
        for sjs in datas:
            # attribute_text holds city, experience, education and headcount,
            # but experience or education may be missing
            if len(sjs['attribute_text']) == 4:
                workyear = sjs['attribute_text'][1]
                education = sjs['attribute_text'][2]
                city = sjs['attribute_text'][0]
                renshu = sjs['attribute_text'][-1]
            else:
                city = sjs['attribute_text'][0]
                renshu = sjs['attribute_text'][-1]
                test = sjs['attribute_text'][1]
                # Decide whether the middle field is experience or education
                if '经验' in test:
                    workyear = test
                    education = '无'
                else:
                    education = test
                    workyear = '无'
            company_name = sjs['company_name']
            job_name = sjs['job_name']
            providesalary_text = sjs['providesalary_text'].replace('\\', '')
            jobwelf = sjs['jobwelf'].replace('\\', '')
            companysize_text = sjs['companysize_text'].replace('\\', '')
            companyind_text = sjs['companyind_text'].replace('\\', '')
            # Fall back to '无' (none) for empty fields
            if not providesalary_text:
                providesalary_text = '无'
            if not jobwelf:
                jobwelf = '无'
            if not companysize_text:
                companysize_text = '无'
            if not companyind_text:
                companyind_text = '无'
            file = open('qcwy.csv', 'a+', encoding='gbk', newline='')
            writer = csv.writer(file)
            # Append one row per posting, then close the file so the row is flushed
            writer.writerow([company_name, job_name, providesalary_text, jobwelf, workyear, education, city, renshu, companysize_text, companyind_text])
            file.close()
            print(company_name, job_name, providesalary_text, jobwelf, workyear, education, city, renshu, companysize_text, companyind_text)
    # Error handling: report the exception and move on
    except Exception as e:
        print(e)
    time.sleep(1)

# Read the CSV back and export it to Excel (the .xlsx output name is assumed)
datas = pd.read_csv('qcwy.csv', encoding='gbk')
datas.to_excel('qcwy.xlsx', index=False)
One practical note on the append mode used above: because qcwy.csv is opened with 'a+', re-running the crawler appends a fresh header row and duplicate data rows to the existing file. This is one reason the cleaning step below checks duplicated() and applies drop_duplicates().
2. Data cleaning and processing
import pandas as pd
qcwy = pd.read_csv('qcwy.csv', encoding='gbk')
qcwy.head()
# The welfare column is not needed, so drop it
qcwy.drop('福利', axis=1, inplace=True)
qcwy.head()
# Check for duplicate rows, then drop them
qcwy.duplicated()
qcwy = qcwy.drop_duplicates()
qcwy.head()
# Count missing values column by column
qcwy['公司'].isnull().value_counts()
qcwy['岗位'].isnull().value_counts()
qcwy['薪资'].isnull().value_counts()
qcwy['工作经验'].isnull().value_counts()
qcwy['学历'].isnull().value_counts()
qcwy['城市'].isnull().value_counts()
qcwy['招聘人数'].isnull().value_counts()
qcwy['公司规模'].isnull().value_counts()
qcwy['公司方向'].isnull().value_counts()
qcwy['薪资'] = qcwy['薪资'].map(str.strip)   # remove whitespace on both sides
qcwy['薪资'] = qcwy['薪资'].map(str.lstrip)  # remove leading whitespace (already covered by strip)
qcwy['薪资'] = qcwy['薪资'].map(str.rstrip)  # remove trailing whitespace (already covered by strip)

qcwy.describe()
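At this point the salary column is still free text such as '1-1.5万/月'. A sketch of one way to turn it into an average monthly figure in RMB is shown below; the pattern assumes the common 51job formats ('X-Y万/月', 'X-Y千/月', 'X-Y万/年'), and 月薪 is a new, illustrative column name rather than one produced by the crawler.

import re

def parse_salary(text):
    # Match e.g. '1-1.5万/月'; return None for formats the pattern does not cover
    m = re.match(r'([\d.]+)-([\d.]+)(万|千)/(月|年)', str(text))
    if not m:
        return None
    low, high, unit, period = m.groups()
    avg = (float(low) + float(high)) / 2
    avg *= 10000 if unit == '万' else 1000  # 万 = 10,000 RMB, 千 = 1,000 RMB
    if period == '年':                      # convert yearly pay to a monthly figure
        avg /= 12
    return avg

qcwy['月薪'] = qcwy['薪资'].map(parse_salary)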
3. Data visualization
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt

# Pie chart of the education-requirement breakdown
plt.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese labels correctly
gw_score = qcwy['学历'].value_counts()  # count postings per education level
plt.title("学历占比图")  # chart title
# autopct sets the format of the percentage text inside each wedge
plt.pie(gw_score.values, labels=gw_score.index, autopct='%1.1f%%')
plt.show()
qcwy = pd.read_csv('qcwy.csv', encoding='gbk')
# Both columns are stored as text (e.g. '招5人', '150-500人'); extract the
# leading number so the values can be plotted and fitted as numbers
qcwy['招聘人数'] = pd.to_numeric(qcwy['招聘人数'].str.extract(r'(\d+)')[0], errors='coerce')
qcwy['公司规模'] = pd.to_numeric(qcwy['公司规模'].str.extract(r'(\d+)')[0], errors='coerce')
# Distribution of the number of openings per posting
sns.distplot(qcwy['招聘人数'].dropna())

# Scatter plot of openings against company size with a fitted trend line
sns.regplot(x='招聘人数', y='公司规模', data=qcwy)
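Note that distplot is deprecated in seaborn 0.11 and later; on a current seaborn release the equivalent distribution plot would be:

# histogram with a kernel-density overlay, the modern replacement for distplot
sns.histplot(qcwy['招聘人数'].dropna(), kde=True)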
import seaborn as sns
from scipy.optimize import leastsq
plt.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese labels correctly

# Variables for the fit; drop rows where either value is missing
fit_data = qcwy[['公司规模', '招聘人数']].dropna()
gsgm = fit_data['公司规模']
zprs = fit_data['招聘人数']

# Quadratic model y = a*x^2 + b*x + c
def func(params, x):
    a, b, c = params
    return a * x * x + b * x + c

# Residuals between the model and the observed values
def error_func(params, x, y):
    return func(params, x) - y

def main():
    plt.figure(figsize=(8, 6))
    P0 = [1, 9.0, 1]  # initial guess for a, b, c
    Para = leastsq(error_func, P0, args=(gsgm.values, zprs.values))
    a, b, c = Para[0]
    print("a=", a, "b=", b, "c=", c)
    # Plot the sample points
    plt.scatter(gsgm, zprs, color="green", label="样本数据", linewidth=2)
    x = np.linspace(10, 1000, 400)
    y = a * x * x + b * x + c
    # Fitted curve, with a legend in the corner of the plot
    plt.plot(x, y, color="red", label="拟合曲线", linewidth=2)
    # Axis labels
    plt.xlabel('公司规模')
    plt.ylabel('招聘人数')
    # Title
    plt.title("公司规模与招聘人数回归方程")
    plt.grid()
    plt.legend()
    plt.show()

main()
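For a plain quadratic fit like this one, numpy can compute the same coefficients directly, without a hand-written residual function; a minimal equivalent using the same gsgm and zprs series defined above would be:

# degree-2 polynomial least squares; returns coefficients highest power first
a, b, c = np.polyfit(gsgm, zprs, 2)
print("a=", a, "b=", b, "c=", c)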
V. Appendix: Complete Program Code
import requests
import time
import re
import csv
import json
import pandas as pd
from lxml import etree
from urllib.parse import urlencode

# Create the CSV file and write the header row
# (newline='' keeps csv.writer from inserting blank lines on Windows)
file = open('qcwy.csv', 'a+', encoding='gbk', newline='')
writer = csv.writer(file)
writer.writerow(['公司', '岗位', '薪资', '福利', '工作经验', '学历', '城市', '招聘人数', '公司规模', '公司方向'])
file.close()

# Loop over the result pages (pages 1-9 here)
for page in range(1, 10):
    try:
        url0 = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,{}.html?'.format(page)
        # Set request headers so the site does not flag the client as a crawler
        headers = {
            'Connection': 'keep-alive',
            'Host': 'search.51job.com',
            # 'Cookie': 'guid=011c029d4be2f1535b1488058cc65d73; _ujz=MTYwMzQwMDYxMA%3D%3D; ps=needv%3D0; 51job=cuid%3D160340061%26%7C%26cusername%3Dphone_133********_201907142883%26%7C%26cpassword%3
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
        }
        # Fixed query-string parameters appended to the URL
        params = {
            'lang': 'c',
            'postchannel': '0000',
            'workyear': '99',
            'cotype': '99',
            'degreefrom': '99',
            'jobterm': '99',
            'companysize': '99',
            'ord_field': '0',
            'dibiaoid': '0',
            'line': '',
            'welfare': '',
        }
        # Assemble the final URL
        url = url0 + urlencode(params)
        print(url)
        # Send the request; raise an error if it takes longer than 30 seconds
        r = requests.get(url, headers=headers, timeout=30)
        # Parse the response text into an HTML tree
        html = etree.HTML(r.text)
        # Locate the <script> tag that carries the search results
        nr = html.xpath('//script[@type="text/javascript"]/text()')[0].replace('\n', '').replace('\t', '').replace('window.__SEARCH_RESULT__ = ', '')
        # Convert the string to JSON
        datas = json.loads(nr)['engine_search_result']
        # Loop over the postings and pull out each field
        for sjs in datas:
            # attribute_text holds city, experience, education and headcount,
            # but experience or education may be missing
            if len(sjs['attribute_text']) == 4:
                workyear = sjs['attribute_text'][1]
                education = sjs['attribute_text'][2]
                city = sjs['attribute_text'][0]
                renshu = sjs['attribute_text'][-1]
            else:
                city = sjs['attribute_text'][0]
                renshu = sjs['attribute_text'][-1]
                test = sjs['attribute_text'][1]
                # Decide whether the middle field is experience or education
                if '经验' in test:
                    workyear = test
                    education = '无'
                else:
                    education = test
                    workyear = '无'
            company_name = sjs['company_name']
            job_name = sjs['job_name']
            providesalary_text = sjs['providesalary_text'].replace('\\', '')
            jobwelf = sjs['jobwelf'].replace('\\', '')
            companysize_text = sjs['companysize_text'].replace('\\', '')
            companyind_text = sjs['companyind_text'].replace('\\', '')
            # Fall back to '无' (none) for empty fields
            if not providesalary_text:
                providesalary_text = '无'
            if not jobwelf:
                jobwelf = '无'
            if not companysize_text:
                companysize_text = '无'
            if not companyind_text:
                companyind_text = '无'
            file = open('qcwy.csv', 'a+', encoding='gbk', newline='')
            writer = csv.writer(file)
            # Append one row per posting, then close the file so the row is flushed
            writer.writerow([company_name, job_name, providesalary_text, jobwelf, workyear, education, city, renshu, companysize_text, companyind_text])
            file.close()
            print(company_name, job_name, providesalary_text, jobwelf, workyear, education, city, renshu, companysize_text, companyind_text)
    # Error handling: report the exception and move on
    except Exception as e:
        print(e)
    time.sleep(1)

# Read the CSV back and export it to Excel (the .xlsx output name is assumed)
datas = pd.read_csv('qcwy.csv', encoding='gbk')
datas.to_excel('qcwy.xlsx', index=False)

# ---- Data cleaning and processing ----
import pandas as pd
qcwy = pd.read_csv('qcwy.csv', encoding='gbk')
qcwy.head()

# The welfare column is not needed, so drop it
qcwy.drop('福利', axis=1, inplace=True)
qcwy.head()

# Check for duplicate rows, then drop them
qcwy.duplicated()
qcwy = qcwy.drop_duplicates()
qcwy.head()

# Count missing values column by column
qcwy['公司'].isnull().value_counts()
qcwy['岗位'].isnull().value_counts()
qcwy['薪资'].isnull().value_counts()
qcwy['工作经验'].isnull().value_counts()
qcwy['学历'].isnull().value_counts()
qcwy['城市'].isnull().value_counts()
qcwy['招聘人数'].isnull().value_counts()
qcwy['公司规模'].isnull().value_counts()
qcwy['公司方向'].isnull().value_counts()

qcwy['薪资'] = qcwy['薪资'].map(str.strip)   # remove whitespace on both sides
qcwy['薪资'] = qcwy['薪资'].map(str.lstrip)  # remove leading whitespace (already covered by strip)
qcwy['薪资'] = qcwy['薪资'].map(str.rstrip)  # remove trailing whitespace (already covered by strip)

qcwy.describe()

# ---- Data visualization ----
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt

# Pie chart of the education-requirement breakdown
plt.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese labels correctly
gw_score = qcwy['学历'].value_counts()  # count postings per education level
plt.title("学历占比图")  # chart title
# autopct sets the format of the percentage text inside each wedge
plt.pie(gw_score.values, labels=gw_score.index, autopct='%1.1f%%')
plt.show()

qcwy = pd.read_csv('qcwy.csv', encoding='gbk')
# Both columns are stored as text (e.g. '招5人', '150-500人'); extract the
# leading number so the values can be plotted and fitted as numbers
qcwy['招聘人数'] = pd.to_numeric(qcwy['招聘人数'].str.extract(r'(\d+)')[0], errors='coerce')
qcwy['公司规模'] = pd.to_numeric(qcwy['公司规模'].str.extract(r'(\d+)')[0], errors='coerce')
# Distribution of the number of openings per posting
sns.distplot(qcwy['招聘人数'].dropna())

# Scatter plot of openings against company size with a fitted trend line
sns.regplot(x='招聘人数', y='公司规模', data=qcwy)

import seaborn as sns
from scipy.optimize import leastsq
plt.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese labels correctly

# Variables for the fit; drop rows where either value is missing
fit_data = qcwy[['公司规模', '招聘人数']].dropna()
gsgm = fit_data['公司规模']
zprs = fit_data['招聘人数']

# Quadratic model y = a*x^2 + b*x + c
def func(params, x):
    a, b, c = params
    return a * x * x + b * x + c

# Residuals between the model and the observed values
def error_func(params, x, y):
    return func(params, x) - y

def main():
    plt.figure(figsize=(8, 6))
    P0 = [1, 9.0, 1]  # initial guess for a, b, c
    Para = leastsq(error_func, P0, args=(gsgm.values, zprs.values))
    a, b, c = Para[0]
    print("a=", a, "b=", b, "c=", c)
    # Plot the sample points
    plt.scatter(gsgm, zprs, color="green", label="样本数据", linewidth=2)
    x = np.linspace(10, 1000, 400)
    y = a * x * x + b * x + c
    # Fitted curve, with a legend in the corner of the plot
    plt.plot(x, y, color="red", label="拟合曲线", linewidth=2)
    # Axis labels
    plt.xlabel('公司规模')
    plt.ylabel('招聘人数')
    # Title
    plt.title("公司规模与招聘人数回归方程")
    plt.grid()
    plt.legend()
    plt.show()

main()