记第一天使用node做爬虫——爬取猫眼电影票房总榜以及指定电影的豆瓣评论前200条
首先,我是一个做前端的应届生,今天朋友想让我帮忙爬取猫眼电影票房总榜的数据,但是我之前一点都没接触过爬虫,但我还是说:okk,我试试;
然后试试就逝世,各种坑接踵而来;
提前声明:这篇文章暂时只是获取到了页面的数据,还没有使用正则提取关键数据;(后续会继续更新)—已更新
代码:
// Load superagent, which sends the GET/POST requests for us
const superagent = require('superagent');
// Page to fetch: the box-office detail page of a single movie
const url = 'https://piaofang.maoyan.com/movie/344264';
// Alternative target: the box-office ranking of one year
// const url = 'https://piaofang.maoyan.com/mdb/rank/query?type=0&id=2021'
superagent
  .get(url)
  // Session cookie copied from a browser; it is tied to one session and has to
  // be replaced with a current value before running this script.
  .set('Cookie', 'mta=248378680.1622353618161.1622358743253.1622360863750.5; _lxsdk_cuid=179bbcfa476c8-08ab923c0a6f91-d7e1938-e1000-179bbcfa476c8; theme=moviepro; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1622360700; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1622360700; _lxsdk=EEFEF990C11A11EB88F7CB3FB083BC96E951611E0C3843B5B875568FCDE2885A; _lxsdk_s=179bc3bb2e4-e1c-418-4b0%7C%7C8')
  // Present the request as coming from a desktop Chrome browser
  .set('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36')
  .then(res => {
    // Dump the whole response object for inspection
    console.log(res);
  })
  .catch(err => {
    console.log(err);
  });
这个代码里面借助了superagent来模拟浏览器访问猫眼电影的服务器,此时无论是访问某一部电影还是访问票房总榜,数据都可以爬取到;
具体的使用正则提取关键数据还在学习ing。。。。。。
下图是部分爬取到的数据;
后端代码:
// Load Koa
const koa = require("koa2")
// Create the application instance
const app = new koa();
// Load the router and create an instance
const Router = require('koa-router')
const router = new Router();
// CORS: allow requests from any origin, then hand over to the next middleware
app.use(async (ctx, next) => {
  ctx.set("Access-Control-Allow-Origin", "*")
  await next()
})
// Load superagent, which sends the GET/POST requests for us
const superagent = require('superagent');
// Load cheerio, which parses the fetched HTML string
const cheerio = require('cheerio')
// Crawler credentials
// Session cookie and browser User-Agent sent with every Maoyan request; without
// the cookie the server answers 403 (forbidden).
// NOTE(review): the cookie values below are captured browser sessions and have
// to be replaced with current ones before the crawler is run.
const cookie = '__mta=248378680.1622353618161.1622822135342.1622825432418.31; _lxsdk_cuid=179bbcfa476c8-08ab923c0a6f91-d7e1938-e1000-179bbcfa476c8; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1622360700; _lxsdk=EEFEF990C11A11EB88F7CB3FB083BC96E951611E0C3843B5B875568FCDE2885A; theme=moviepro; _lxsdk_s=179d7eef51d-c2e-cc-a4c%7C%7C2';
const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36';
// Session cookie sent with the Douban comment requests
let doubanCookie = 'll="118281"; bid=UEuG1A0t0w8; _vwo_uuid_v2=D38900B3B458B847163B795EEAEB0FDE0|defa03deedbec2f1680989c2da76c7ba; __utmz=30149280.1622560751.4.2.utmcsr=search.douban|utmccn=(referral)|utmcmd=referral|utmcct=/movie/subject_search; __utmz=223695111.1622560751.4.2.utmcsr=search.douban|utmccn=(referral)|utmcmd=referral|utmcct=/movie/subject_search; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1622645515%2C%22https%3A%2F%2Fsearch.douban%2Fmovie%2Fsubject_search%3Fsearch_text%3D%25E7%2596%25AF%25E7%258B%2582%25E5%258A%25A8%25E7%2589%25A9%25E5%259F%258E%26cat%3D1002%22%5D; _pk_id.100001.4cf6=e59993bfdc54083f.1622356663.5.1622645515.1622561683.; _pk_ses.100001.4cf6=*; __utma=30149280.1966141415.1622356664.1622560751.1622645515.5; __utmb=30149280.0.10.1622645515; __utmc=30149280; __utma=223695111.772182386.1622356664.1622560751.1622645515.5; __utmb=223695111.0.10.1622645515; __utmc=223695111';
// Fetch the Maoyan box-office ranking of one year.
// `year` is appended to the query string as the `id` parameter.
// Returns a promise that resolves to the first 50 entries of `data.list` from
// the JSON response. On failure the promise resolves (it does not reject) to
// the error object, so callers receive the error as the result value.
function getYearMovieList(year){
  const url = 'https://piaofang.maoyan.com/mdb/rank/query?type=0&id=' + year;
  return superagent
    .get(url)
    .set('Cookie', cookie)
    .set('User-Agent', userAgent)
    .then(res => {
      // The endpoint answers with JSON text; the ranking sits under data.list
      const data = JSON.parse(res.text).data.list;
      return data.slice(0, 50);
    })
    .catch(err => {
      return err;
    })
}
// Fetch the Maoyan detail page of one movie and extract its key fields.
// `id` is appended to the detail-page URL; `name` is only used in the failure
// log message.
// Returns a promise that resolves to an object with the extracted fields, or to
// undefined when the request or the extraction fails (the failure is logged).
// A field whose element is missing from the page is returned blank instead of
// failing the whole record.
function getMovieDetail(id, name){
  // One persona entry after all whitespace has been removed (hence "<divclass");
  // capture group 2 is the entry's value.
  const personaItemRegex = /<divclass="persona-item-key">(.*?)<\/div><divclass="persona-item-value">(.*?)<\/div>/;
  // Remove every whitespace character; a missing element (null html) becomes ''
  const compact = html => (html ? html.replace(/\s+/g, '') : '');
  // Value part of a persona entry, or '' when the markup does not match
  const personaValue = html => {
    const match = personaItemRegex.exec(html);
    return match ? match[2] : '';
  };

  const url = 'https://piaofang.maoyan.com/movie/' + id;
  return superagent
    .get(url)
    .set('Cookie', cookie)
    .set('User-Agent', userAgent)
    .then(res => {
      // cheerio gives us a jQuery-like API over the fetched HTML
      const $ = cheerio.load(res.text);
      const movieTypeText = compact($(".info-category").html());
      // NOTE(review): the original selector was unreadable ("..ellipsis-1");
      // ".ellipsis-1" is the recoverable part — confirm against the live page.
      const movieCountryText = compact($(".ellipsis-1").html());
      const scoringNumDom = $(".detail-score-count").html() ? $(".detail-score-count").html() : '';
      const score = $(".rating-num").html();

      // Keep only the text in front of the first nested tag / first slash
      const movieType = movieTypeText.split('<')[0];
      const movieCountry = movieCountryText.split('/')[0];
      // Keep the number in front of the '观' character; 0 when there is no count
      const scoringNum = scoringNumDom === '' ? 0 : scoringNumDom.split('观')[0];
      const maleRatio = personaValue(compact($(".male").html()));
      const femaleRatio = personaValue(compact($(".female").html()));

      let cityRatioArr = [];
      if ($(".persona-item").html()) {
        $(".persona-item").each(function(i, el){
          cityRatioArr.push(personaValue(compact($(this).html())));
        })
      } else {
        cityRatioArr = ['', '', '', ''];
      }
      return {
        movieType,
        movieCountry,
        maleRatio,
        femaleRatio,
        scoringNum,
        score,
        // NOTE(review): index order kept exactly as in the original code;
        // confirm it matches the order of the persona items on the page.
        firstCity: cityRatioArr[3],
        secondCity: cityRatioArr[1],
        thirdCity: cityRatioArr[2],
        forthCity: cityRatioArr[0],
      }
    })
    .catch(err => {
      console.log("id为:" + id + ",影片名为《" + name + "》的电影详细数据获取失败")
    })
}
// Fetch the Maoyan all-time box-office ranking page and return its entries.
// Returns a promise that resolves to the first 150 entries of
// AppData.data.list (despite the "Top250" in the name), or to an empty array
// when the page has no third <script> element. On failure the promise resolves
// (it does not reject) to the error object.
function getAllTop250(){
  const url = 'https://piaofang.maoyan.com/mdb/rank';
  return superagent
    .get(url)
    .set('Cookie', cookie)
    .set('User-Agent', userAgent)
    .then(res => {
      const $ = cheerio.load(res.text);
      let result = []
      $("script").each(function(i, el){
        // Only the third <script> element is inspected
        if (i === 2) {
          // Everything after the first "var" is executed, which is expected to
          // assign the global `AppData` read on the next line.
          // SECURITY(review): this eval runs script text downloaded from a
          // remote server inside the Node process; consider extracting the
          // object literal and parsing it instead of executing it.
          eval($(this).html().split('var')[1])
          result = AppData.data.list.slice(0, 150)
        }
      })
      return result;
    })
    .catch(err => {
      return err;
    })
}
// Fetch the short comments of one movie from Douban.
// `movieId` is the Douban subject id used in the comments URL.
// Requests 15 pages of 20 comments each, one page after another. According to
// the original author only the first 200 comments can be fetched; the pages
// for comments 201-300 fail. A failed page is logged and skipped.
// Returns a promise that resolves to an array of { text, recommend } objects.
async function getMovieComment(movieId){
  const commentsList = []
  for (let i = 0; i < 15; i++) {
    // Offset of the first comment on this page
    const start = i * 20;
    const url = 'https://movie.douban.com/subject/' + movieId + '/comments?start=' + start + '&limit=20&status=P&sort=new_score';
    await superagent
      .get(url)
      .set('Cookie', doubanCookie)
      .set('User-Agent', userAgent)
      .then(res => {
        const $ = cheerio.load(res.text);
        $(".comment-item").each(function(index, el){
          commentsList.push({
            // Comment text with every whitespace character removed
            text: $(".short", $(this)).html().replace(/\n|\s*/g, '').trim(),
            // The rating element's title attribute, or a placeholder when the
            // comment carries no rating
            recommend: $(".rating", $(this)).attr("title") || '此用户暂无推荐'
          })
        })
      })
      .catch(err => {
        console.log('从第' + (start + 1) + '条评论开始,后面二十条评论爬取失败!错误码:' + err.status);
        return err;
      })
  }
  return commentsList;
}
// GET /year?year=<year> — box-office ranking of one year
router.get('/year', async (ctx, next) => {
  const year = ctx.query.year;
  // The crawler functions already return promises, so they are awaited directly
  ctx.body = await getYearMovieList(year);
})
// GET /movie?id=<id>&name=<title> — key fields of a single movie
router.get('/movie', async (ctx, next) => {
  const movieId = ctx.query.id;
  const movieName = ctx.query.name;
  ctx.body = await getMovieDetail(movieId, movieName);
})
// GET /all — all-time box-office ranking
router.get('/all', async (ctx, next) => {
  ctx.body = await getAllTop250();
})
// GET /douban?id=<id> — Douban short comments of a single movie
router.get('/douban', async (ctx, next) => {
  const movieId = ctx.query.id;
  ctx.body = await getMovieComment(movieId);
})
// Mount the routes. app.use() accepts a single middleware, so allowedMethods()
// is registered with its own chained use() call.
app.use(router.routes()).use(router.allowedMethods())
// Port to listen on: the PORT environment variable, or 5555 when it is not set
const port = process.env.PORT || 5555;
// Start the HTTP server
app.listen(port, () => {
  console.log(`server start at port: ${port}!!`)
})
前端代码:
index.js代码
// Fetch the detail fields of every movie in `result`, merge them into the
// matching entry (the entries are modified in place), then export all entries
// to an Excel file named `filename`.
// The detail requests are sent one at a time. A failed request is logged and
// leaves that entry without detail fields instead of stalling the export.
async function getDetail(result, filename){
  for (let i = 0; i < result.length; i++) {
    const detail = await new Promise((resolve) => {
      $.ajax({
        // Query values are encoded so that titles containing '&', '#' or
        // spaces cannot break the query string
        url: 'http://localhost:5555/movie?id=' + encodeURIComponent(result[i].movieId) + '&name=' + encodeURIComponent(result[i].movieName),
        type: 'GET',
        async: true,
        success: (res) => {
          resolve(res)
        },
        error: (xhr) => {
          console.warn('detail request failed for movie id ' + result[i].movieId + ', status: ' + (xhr && xhr.status));
          resolve({})
        }
      })
    })
    Object.assign(result[i], detail)
  }
  // Column headers of the sheet, in the same order as the keys in `order`
  const title = ['电影id', '电影名称', '上映时间', '类型', '国家', '评分', '评分人数', '想看用户比例 - 男', '想看用户比例 - 女', '想看用户比例 - 一线城市', '想看用户比例 - 二线城市', '想看用户比例 - 三线城市', '想看用户比例 - 四线城市', '票房/万', '平均票价', '场均人数'];
  // Entry keys to export, one per column
  const order = ['movieId', 'movieName', 'releaseInfo', 'movieType', 'movieCountry', 'score', 'scoringNum', 'maleRatio', 'femaleRatio', 'firstCity', 'secondCity', 'thirdCity', 'forthCity', 'boxDesc', 'avgViewBoxDesc', 'avgShowViewDesc']
  JSONToExcelConvertor(result, filename, title, order);
}
// Ask the backend for the box-office ranking of `year`, then fetch the detail
// of every movie in it and export everything to Excel via getDetail().
// The returned promise rejects when the ranking request fails.
async function getMovieList(year){
  const result = await new Promise((resolve, reject) => {
    $.ajax({
      url: 'http://localhost:5555/year?year=' + year,
      type: 'GET',
      async: true,
      success: (res) => {
        resolve(res)
      },
      error: (xhr) => {
        reject(new Error('ranking request failed for year ' + year + ', status: ' + (xhr && xhr.status)))
      }
    })
  })
  // Awaited so that a failure inside getDetail reaches the caller
  await getDetail(result, year + "年猫眼电影top50")
}

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。