NetEase Cloud Music Hot Comments: Full Source Code (Python)
Web scraping & data analysis
Runtime environment: Python 3.6
For ease of understanding, the code for scraping NetEase Cloud Music hot comments is split into two parts:
1. First scrape the URL of every song in each playlist and export them to music1_01.csv
2. Then scrape the hot comments of each song and export them to hotCommets_01.csv
music_01.ipynb
import logging
import requests
from pyquery import PyQuery as pq
import pandas as pd
import random
import time
# headers must be filled in, otherwise the site will not serve the pages properly
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'}
# set the log format and output level
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')
def scrape_index(url):
    response = requests.get(url, headers=headers)
    logging.info('scrape index %s...', url)  # pass url as a logging argument with a comma instead of %-formatting the string
    try:
        if response.status_code == 200:
            return parse_index(response.text)  # hand the HTML to parse_index to get the playlist URL list
        else:
            logging.error('invalid status %s while scraping url %s', response.status_code, url)
    except Exception:
        logging.error('error occurred while scraping %s', url, exc_info=True)  # exc_info=True appends the exception traceback to the log message

def parse_index(html):
    doc = pq(html)  # parse the HTML with pyquery
    a = doc('#m-pl-container .dec .s-fc0')  # '#' matches an id, '.' matches a class
    a1 = a.items()  # when several elements match, items() returns a generator we can loop over
    return a1
def scrape_detail(url):
    response = requests.get(url, headers=headers)
    logging.info('scraping detail %s...', url)
    try:
        if response.status_code == 200:
            logging.info('detail url succeeded')
            return parse_detail(response.json())  # the API returns JSON
        else:
            logging.error('invalid status %s while scraping url %s', response.status_code, url)
    except Exception:
        logging.error('error occurred while scraping %s', url, exc_info=True)
'''
Hot-comment API: https://music.163.com/api/v1/resource/comments/R_SO_4_{song ID}?limit=20&offset=0
So once we know a song's ID we can fetch its hot comments.
'''
def parse_detail(html):
    list_02 = []
    jobs = html['result']['tracks']
    for j in jobs:
        dic = {}  # build a dict per track with its name and id
        dic['name'] = j['name']
        dic['id'] = j['id']
        list_02.append(dic)
    return list_02
def get_list():
    list_01 = []
    url = 'https://music.163.com/discover/playlist/?order=hot&cat=%E5%8D%8E%E8%AF%AD&limit=35&offset={page}'
    for page in range(0, 35, 35):  # try one page first; to crawl everything, change this to range(0, 1295, 35)
        url1 = url.format(page=page)
        url_list = []
        for i in scrape_index(url1):  # each item yielded by the generator is still a pyquery object
            i_url = i.attr('href')  # read the href attribute with attr()
            '''
            Both the playlists and the comments are fetched through NetEase Cloud Music's GET APIs, which is fast and efficient!
            Playlist API:
            https://music.163.com/api/playlist/detail?id={playlist ID}
            Hot-comment API:
            https://music.163.com/api/v1/resource/comments/R_SO_4_{song ID}?limit=20&offset=0
            '''
            detail_url = f'https://music.163.com/api{i_url.replace("?", "/detail?")}'  # rewrite the scraped URL into the format the API expects
            url_list.append(detail_url)
        list_01.extend(url_list)  # extend merges the lists
        time.sleep(5 + random.random())  # be a polite crawler
    return list_01
def save_date(data):
    df1 = pd.DataFrame(data)
    df2 = pd.concat([df, df1])
    df3 = df2.drop_duplicates(subset=None, keep='first', inplace=False)
    df3.to_csv('music_163_02.csv', index_label="index_label", encoding='utf-8-sig')  # index_label: column label for the index column
df = pd.DataFrame(columns=('name', 'id'))
def main():
    detail_list = []
    url_01 = get_list()
    for l in url_01:
        logging.info('detail url is %s', l)
        detail_list_part = scrape_detail(l)
        detail_list.extend(detail_list_part)  # merge the parts into the final, complete list of playlist tracks
        time.sleep(5 + random.random())
    save_date(detail_list)
if __name__ == '__main__':
    main()
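If you want to sanity-check the playlist API format described above before running the full crawl, a minimal standalone sketch looks like this (the playlist ID below is only a hypothetical example, and the response layout is assumed to match what parse_detail() reads):
import requests
test_headers = {'User-Agent': 'Mozilla/5.0'}
test_url = 'https://music.163.com/api/playlist/detail?id=24381616'  # hypothetical playlist ID, replace with a real one
resp = requests.get(test_url, headers=test_headers)
tracks = resp.json()['result']['tracks']  # same structure that parse_detail() walks
for t in tracks[:5]:
    print(t['id'], t['name'])  # song ID and song name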
hotCommets_01.ipynb
import pandas as pd
import requests
import logging
import time
import random
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'}
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')
def scrape_comment(url, name):
    logging.info('scraping comments %s', url)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return parse_comment(response.json(), name)  # the NetEase hot-comment API returns JSON
        else:
            logging.error('invalid status_code %s while scraping %s', response.status_code, url)
    except Exception:
        logging.error('can not scrape %s', url)
def parse_comment(html, name):
    data = []
    jobs = html['hotComments']
    for job in jobs:
        dic = {}
        dic['nickname'] = job['user']['nickname']
        dic['userid'] = job['user']['userId']
        dic['content'] = job['content'].replace('\n', '')  # strip line breaks from the comment text
        dic['likecount'] = job['likedCount']
        dic['time'] = stampToTime(job['time'])  # convert the timestamp
        dic['name'] = name
        data.append(dic)
    return data
def stampToTime(stamp):
    '''
    The API returns a 13-digit millisecond timestamp, which has to be turned into a time string:
    divide by 1000 to convert ms to s,
    convert the resulting 10-digit Unix timestamp into a time tuple with localtime() (local/Beijing time),
    then format the tuple into a string with strftime.
    '''
    timeStamp = float(stamp / 1000)
    timeArray = time.localtime(timeStamp)
    date = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
    return date
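# Quick sanity check for stampToTime (hypothetical input value; the result assumes the
# machine's local timezone is UTC+8 / Beijing time, since localtime() is used above):
# stampToTime(1577808000000) -> '2020-01-01 00:00:00'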
def main():
    df = pd.read_csv('music_163_03.csv', header=0)  # load the song IDs saved by the previous script; header=0 uses the first row as column names
    data_comment = []
    for index, row in df.iterrows():  # iterrows() is a generator over the rows, yielding the row index and the row itself
        name = row['name']
        '''
        NetEase Cloud Music hot-comment API:
        limit: number of records returned per page, default 20, can be changed
        offset: paging offset, which must be a multiple of limit
        type: the type of resource being queried
        https://music.163.com/api/v1/resource/comments/R_SO_4_{song ID}?limit=20&offset=0
        '''
        url = f'https://music.163.com/api/v1/resource/comments/R_SO_4_{row["id"]}?limit=20&offset=0'  # this post only scrapes the first page of hot comments
        data1 = scrape_comment(url, name)
        data_comment.extend(data1)
        df1 = pd.DataFrame(data_comment)
        df1.to_csv('hotComments_06p.csv', encoding='utf-8-sig')  # note: utf-8-sig, not utf-8
        logging.info('scraping id %s', index)
        time.sleep(random.random())  # be a polite crawler
if __name__ == '__main__':
    main()
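The offset parameter described in the docstring above makes it possible to page past the first 20 comments. Below is a minimal sketch of how paging could look; the song ID is a placeholder, and the 'comments' key for ordinary (non-hot) comments is an assumption, since the script above only ever reads 'hotComments':
import requests
import time
import random
headers = {'User-Agent': 'Mozilla/5.0'}
song_id = 186016  # hypothetical song ID, for illustration only
limit = 20
collected = []
for page in range(3):  # fetch the first 3 pages instead of only page 1
    offset = page * limit  # offset has to be a multiple of limit
    url = f'https://music.163.com/api/v1/resource/comments/R_SO_4_{song_id}?limit={limit}&offset={offset}'
    resp = requests.get(url, headers=headers)
    if resp.status_code == 200:
        data = resp.json()
        # hot comments are only meaningful on the first page; later pages are assumed to
        # carry ordinary comments under 'comments' (a key the original script does not use)
        collected.extend(data.get('hotComments', []) if page == 0 else data.get('comments', []))
    time.sleep(1 + random.random())  # stay polite, like the main crawler
print(len(collected), 'comments collected')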
Analysing the comment data
import matplotlib.pyplot as plt
import pandas as pd
import jieba
import jieba.analyse
import numpy as np
from PIL import Image
from wordcloud import WordCloud
'''
Read the hot-comment file we just scraped:
df1 = pd.read_csv('hotComments_06.csv', index_col=0)
This raised "ERROR: Buffer overflow caught", a buffer overflow,
because some single items in the csv contain \r (carriage returns).
Fix: pass lineterminator='\n' so that only \n is treated as a line break.
'''
df3 = pd.read_csv('hotComments_06.csv', index_col=0, lineterminator='\n')
# stray spaces make identically typed comments compare as different,
# so strip() removes leading and trailing whitespace
df3['content1'] = df3['content'].apply(lambda x: x.strip())
# some sentences still contain \r; since \n is our line terminator, those \r characters are not real text and need to be removed
df3['content1'] = df3['content1'].apply(lambda x: x.replace('\r', ''))
df4 = df3.drop(['content'], axis=1)
# sort by number of likes
df5 = df4.sort_values(by='likecount', ascending=False)
df5.head(10).to_csv('strat_TOP10.csv', index=False, encoding='utf-8-sig')
# count how often identical sentences repeat and sort by that count
df6 = df4.groupby('content1').size().sort_values(ascending=False).reset_index(name='count')
df6.head(15).to_csv('hot_copy01.csv', index=False, encoding='utf-8-sig')  # index=False: do not export the index
# which users have the most hot comments, and what do they look like?
df10 = df4.groupby('userid').count().sort_values(by='content1', ascending=False)
# the user with the most hot comments is 424311909
df11 = df4[df4['userid'] == 424311909]
'''
hist: histogram
bins: number of bars
density: bool, show as a density
'''
plt.hist(df11['likecount'], bins=200, density=True)  # distribution of like counts for this user's hot comments
plt.xlim((0, 6000))
plt.title('424311909用户的点赞分布')
plt.savefig('start_424311909.png', dpi=100)
plt.show()
'''
Look at the distribution of comment lengths.
len(df11['content1']) --- out: 133
len() on the Series only gives the number of rows, not the length of each comment
'''
df12 = df11['content1'].map(len)  # map(len) computes the length of every single cell
plt.hist(df12, bins=20, density=True)
plt.title('424311909用户的评论长度分布')
plt.savefig('len_424311909.png', dpi=100)
plt.show()
'''
Keyword extraction with the TextRank algorithm in the jieba library.
See the official docs for details: github.com/fxsjy/jieba
'''
segments = []
for index, row in df4.iterrows():
    content = row['content1']
    words = jieba.analyse.textrank(content, topK=3, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
    for w in words:  # pair every extracted keyword with a 1 so the words can be counted later
        segments.append({'word': w, 'counts': 1})
df_w = pd.DataFrame(segments)
df_w.to_csv('jieba_01.csv', index=False, encoding='utf-8-sig')
# build the word cloud with the wordcloud library
# join the segmented words into one space-separated string, which is what WordCloud expects
text = ' '.join(df_w['word'])
'''
2222.png is the image used as the mask; it has to be converted to a numpy array before use.
Read the png with PIL, convert it to a numpy array, and pass it as the mask parameter of WordCloud.
'''
mask_cir = np.array(Image.open('2222.png'))
wordc = WordCloud(
    background_color='white',
    mask=mask_cir,
    font_path='f',  # path to a Chinese font file (download one, e.g. via Baidu) so the cloud can render Chinese
    max_words=1000
).generate(text)
plt.imshow(wordc)
plt.axis('off')  # hide the axes, which looks cleaner
plt.savefig('图3.jpg', dpi=600, bbox_inches='tight', quality=95)  # bbox_inches='tight' trims the surrounding whitespace
plt.show()
End. Corrections and feedback are welcome~