java新闻爬取
本来想爬今日头条,在网上找了很多方法,走了很多弯路,异步刷新的问题始终没能解决(本人是爬虫小白)。后来发现json数据和本地cookie也有关,感觉前路艰难,于是果断换到网易新闻。网易新闻相对来说获取数据比较简单:通过谷歌浏览器F12分析包数据,发现网易异步刷新的包和访问路径有关,再用在线json解析工具验证,发现数据可以正常解析,这让我欣喜不已。
json数据:
废话不多说,直接上代码
//⽹易新闻类型
// NetEase channel ids; the entry used is picked by the `width` index declared in main.
String[] typeArray =
        {"BBM54PGAwangning", "BCR1UC1Qwangning", "BD29LPUBwangning", "BD29MJTVwangning", "C275ML7Gwangning"};
String type = typeArray[width];
// Base URL of the NetEase news-list endpoint.
// NOTE(review): the published text had lost the scheme and ".com" ("3g.163/touch/..."); the host
// was restored as 3g.163.com — confirm https is accepted by the HTTP helper in use.
String url1 = "https://3g.163.com/touch/reconstruct/article/list/";
// Base URL of the NetEase article pages (docid + ".html" is appended by the caller).
String url2 = "https://3g.163.com/news/article/";
//根据新闻列表url,获取新闻docid,并把docid存储到list中
private static List<String> getDocid(String url,int num,String type) {
String json = null;
List<String> id=new ArrayList<>();
Map map=null;
JSONArray parseArray=null;
String jsonStrM="";
json = JSONUtils.loadJson(url+type+"/"+num+"-10.html");
String jsonStr = StringUtils.substringBeforeLast(json, ")");
String jsonStrO = StringUtils.substringAfter(jsonStr,"artiList(");
Map parse = (Map) JSONObject.parse(jsonStrO);
parseArray = (JSONArray) (type);
for(int j=0;j<parseArray.size();j++){
map = ((j);
id.add((String) ("docid"));
}
return id;
}
//根据内容url2获取新闻信息并进行存储
private static void getContent(String url2, List<String> ids) {
System.out.println("存储开始!!");
String url = null;
Connection connection = t(url2);
int i = 1;
for (;i<ids.size();i++){
url = (i)+".html";
connection = t(url);
try {
Document document = ();
//获取新闻标题
Elements title = document.select("[class=title]");
//获取新闻来源和⽂章发布时间
Elements articleInfo = document.select("[class=info]");
Elements src = articleInfo.select("[class=source js-source]");
Elements time = articleInfo.select("[class=time js-time]");
//获取新闻内容
Elements contentEle = document.select("[class=page js-page on]");
DBCollection dbCollection= null;
try {
dbCollection = MongoDB();
} catch (Exception e) {
e.printStackTrace();
}
BasicDBObject obj = new BasicDBObject();
obj.put("title", src.html());
obj.put("srcFrom", src.html());
obj.put("time", time.html());
obj.put("content", contentEle.html());
dbCollection.insert(obj);
DBCursor dbCursor = dbCollection.find();
while(dbCursor.hasNext()){
Map map = (();
}
} catch (IOException e) {
e.printStackTrace();
}
}
System.out.println("本次共计存储"+i*0.8+"条数据");
}
//设置爬取深度,循环多次获取docid
/**
 * Collects docids across several list pages by calling {@code getDocid} at offsets
 * 0, 10, 20, ... below {@code num}.
 *
 * <p>The published loop used {@code i <= num}, which requested one page more than asked for
 * (num = 30 fetched offsets 0, 10, 20 and 30). The bound is now exclusive, so {@code num} is the
 * number of list entries requested, rounded up to the next multiple of 10.
 *
 * @param url1 base URL of the list endpoint, passed through to {@code getDocid}
 * @param num  number of list entries to request; values {@code <= 0} yield an empty list
 * @param type channel id, passed through to {@code getDocid}
 * @return all docids from the requested pages, in page order
 */
private static List<String> getIds(String url1, int num, String type) {
    List<String> ids = new ArrayList<>();
    for (int offset = 0; offset < num; offset += 10) {
        ids.addAll(getDocid(url1, offset, type));
    }
    return ids;
}
public static void main(String[] args) throws Exception {
//爬取条数,10的倍数,⽹易新闻每10条预留⼤约2个⼴告位,所以爬取新闻的真实条数⼤约为80%
int deep = 30;
/
/爬取宽度,0:⾸页,1:社会,2:国内,3:国际,4:历史
int width = 1;安卓在线解析json
//⽹易新闻类型
String[] typeArray=
{"BBM54PGAwangning","BCR1UC1Qwangning","BD29LPUBwangning","BD29MJTVwangning","C275ML7Gwangning"};
String type = typeArray[width];
//⽹易新闻列表url
String url1 = "3g.163/touch/reconstruct/article/list/";
//⽹易新闻内容url
String url2 = "3g.163/news/article/";
List<String> ids = new ArrayList<>();
/
/根据url1,爬取条数,新闻类型获取新闻docid
ids = getIds(url1,deep,type);
//根据url2,新闻docid获取内容并存储到MongoDB
getContent(url2,ids);
}
为了方便存取比较大的数据量,使用了MongoDB数据库进行存储。
列表
内容
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论