Python3:爬取新浪、网易、今日头条、UC四大网站新闻标题及内容
Python3:爬取新浪、网易、今日头条、UC四大网站新闻标题及内容
以爬取相应网站的社会新闻内容为例:
一、新浪:
新浪网的新闻比较好爬取,我是用BeautifulSoup直接解析的,它并没有使用JS异步加载,直接爬取就行了。
'''
新浪新闻:news.sina/society/
Date:20180920
Author:lizm
Description:获取新浪新闻
'''
import requests
from bs4 import BeautifulSoup
from urllib import request
import sys
import re
import os
def getNews(title, url, m):
    """Download one Sina news article and save its text and images.

    The article text is written to ``<sys.path[0]>/news/<title>.txt`` and
    every image found inside an ``img_wrapper`` div of the article is saved as
    ``<sys.path[0]>/news/<title>/<title><index>.jpg``.

    Args:
        title: Headline of the article; after sanitising it is used as the
            file and directory name.
        url: Address of the article page.
        m: Running number of the article, only used in the progress message.

    Returns:
        Always 0 (also when the page has no ``div.article`` element and
        nothing is saved).
    """
    from urllib.parse import urljoin

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    req = request.Request(url)
    with request.urlopen(req) as response:
        # Drop undecodable bytes so pages that are not pure UTF-8 still parse.
        page = response.read().decode('utf-8', "ignore")
    soup = BeautifulSoup(page, 'lxml')
    tag = soup.find('div', class_='article')
    if tag is None:
        return 0

    # Publication time and publishing site; either may be absent on a page.
    source = soup.find('div', 'date-source')
    fb_date = (source.span.string if source and source.span else None) or ''
    fb_www = (source.a.string if source and source.a else None) or ''

    # Strip whitespace and punctuation that is awkward in file names.
    rep = re.compile(r"""[\s+\.\!\/_,$%^*(+\"\']+|[+<>?、~*()]+""")
    title = rep.sub('', title)
    # An ASCII colon is illegal in Windows file names; use the full-width one.
    # NOTE(review): both arguments were ':' in the source text, which is a
    # no-op — presumably the full-width form was lost; confirm.
    title = title.replace(':', '\uff1a')

    news_dir = os.path.join(sys.path[0], "news")
    os.makedirs(news_dir, exist_ok=True)
    filename = os.path.join(news_dir, title + ".txt")
    with open(filename, 'w', encoding='utf8') as file_object:
        file_object.write(fb_date + "" + fb_www)
        file_object.write("\n")
        file_object.write("⽹址:" + url)
        file_object.write("\n")
        file_object.write(title)
        file_object.write(tag.get_text())

    image_dir = os.path.join(news_dir, title)
    for i, image in enumerate(tag.find_all('div', 'img_wrapper')):
        src = image.img.get('src') if image.img is not None else None
        if not src:
            continue
        # Create the per-article image directory only when it is needed.
        os.makedirs(image_dir, exist_ok=True)
        # urljoin resolves protocol-relative ("//host/path") and relative
        # image addresses against the article URL.
        image_url = urljoin(url, src)
        html = requests.get(image_url, headers=headers)
        # Images are binary data, so write the raw response bytes.
        with open(os.path.join(image_dir, title + str(i) + ".jpg"), 'wb') as f:
            f.write(html.content)
    print('成功爬取第', m, '个新闻', title)
    return 0
#获取社会新闻(最新的162条新闻)
def getTitle(url):
    """Read the Sina society news index page and download every listed article.

    Each ``<li>`` inside ``ul.seo_data_list`` that contains a link is printed
    and passed to ``getNews`` together with a running counter.

    Args:
        url: Address of the news index page.
    """
    req = request.Request(url)
    with request.urlopen(req) as response:
        page = response.read().decode('utf8')
    soup = BeautifulSoup(page, 'lxml')
    news_list = soup.find('ul', class_='seo_data_list')
    if news_list is None:
        # Page layout changed or request was redirected: nothing to crawl.
        return
    y = 0
    for tag in news_list.find_all('li'):
        if tag.a is None:
            continue
        headline = tag.a.string
        link = tag.a.get('href')
        print(y, headline, link)
        getNews(headline, link, y)
        y += 1
# Script entry point: crawl the Sina society news index.
if __name__ == '__main__':
    # NOTE(review): the host was truncated in the source text
    # ('news.sina/society/'); restored to Sina's public news host — confirm.
    url = 'http://news.sina.com.cn/society/'
    getTitle(url)
二、网易:
网易新闻的标题及内容是使用js异步加载的,单纯地下载网页源代码是没有标题及内容的,我们可以在Network的js中找到我们需要的内容。这里我使用了正则表达式来获取我们需要的标题及其链接,并使用了BeautifulSoup来获取相应标题的内容。
import re
from urllib import request
from bs4 import BeautifulSoup
def download(title, url):
    """Fetch one NetEase article and save its body text as a UTF-8 file.

    Args:
        title: Headline of the article; used as the file name after
            characters that are illegal in Windows file names are removed.
        url: Address of the article page.
    """
    with request.urlopen(url) as req:
        res = req.read()
    soup = BeautifulSoup(res, 'lxml')
    tag = soup.find('div', class_='post_text')
    if tag is None:
        # No article body on this page, so there is nothing to write.
        return
    # Remove : " | / \ * < > ? in a single pass.
    title = title.translate(str.maketrans('', '', ':"|/\\*<>?'))
    file_name = r'D:\code\python\spider_news\NetEase_news\sociaty\\' + title + '.txt'
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(tag.get_text())
# Script entry point: read the NetEase JSONP news lists and download each item.
if __name__ == '__main__':
    # NOTE(review): the hosts were truncated in the source text
    # ('temp.163/...'); restored to 'temp.163.com' — confirm.
    urls = ['http://temp.163.com/special/00804KVA/cm_shehui.js?callback=data_callback',
            'http://temp.163.com/special/00804KVA/cm_shehui_02.js?callback=data_callback',
            'http://temp.163.com/special/00804KVA/cm_shehui_03.js?callback=data_callback']
    title_pattern = r'"title":"(.*?)",'
    link_pattern = r'"tlink":"(.*?)",'
    for url in urls:
        with request.urlopen(url) as req:
            res = req.read().decode('gbk')
        news_title = re.findall(title_pattern, res)
        news_url = re.findall(link_pattern, res)
        # zip stops at the shorter list, so a missing title or link cannot
        # cause an IndexError.
        for i, (headline, link) in enumerate(zip(news_title, news_url)):
            download(headline, link)
            print('正在爬取第' + str(i) + '个新闻', headline)
三、头条:
头条的新闻跟前两个也都不一样,它的标题和链接是封装到json文件中的,但是它json文件的url参数是通过一个js随机算法变化的,所以我们需要模拟json文件的参数,否则我们找不到json文件的具体url。我是通过这篇博客才了解到url获取方法的,而且也解决了总是下载重复新闻的问题。该网站自带反爬机制,需要添加cookie。关于新闻的内容,我用正则表达式提取了中文。
from urllib import request
import requests
import json
import time
import math
import hashlib
import re
from bs4 import BeautifulSoup
def get_url(max_behot_time, AS, CP):
    """Build the Toutiao society-news feed URL for one page of results.

    Args:
        max_behot_time: Paging cursor; used for both the ``max_behot_time``
            and ``max_behot_time_tmp`` query parameters.
        AS: Value of the ``as`` query parameter.
        CP: Value of the ``cp`` query parameter.

    Returns:
        The complete feed URL as a string.
    """
    # NOTE(review): the host was truncated in the source text ('utiao/...');
    # restored to 'https://www.toutiao.com' — confirm.
    url = 'https://www.toutiao.com/api/pc/feed/?category=news_society&utm_source=toutiao&widen=1' \
          '&max_behot_time={0}' \
          '&max_behot_time_tmp={0}' \
          '&tadrequire=true' \
          '&as={1}' \
          '&cp={2}'.format(max_behot_time, AS, CP)
    return url
def get_ASCP(now=None):
    """Compute the ``as`` / ``cp`` request parameters from a timestamp.

    The timestamp (whole seconds) is rendered as upper-case hex, its decimal
    string is MD5-hashed, and characters of the hash and of the hex string
    are interleaved to form the two values.

    Args:
        now: Unix timestamp in seconds. Defaults to the current time; pass a
            fixed value to get a reproducible result.

    Returns:
        A tuple ``(AS, CP)`` of strings. When the hex form of the timestamp
        is not exactly 8 characters long, a fixed fallback pair is returned.
    """
    t = int(math.floor(time.time())) if now is None else int(now)
    e = hex(t).upper()[2:]
    if len(e) != 8:
        return '479BB4B7254C150', '7E0AC8874BB0985'

    i = hashlib.md5(str(t).encode(encoding='utf-8')).hexdigest().upper()
    n = i[0:5]   # first five hash characters
    a = i[-5:]   # last five hash characters
    s = ''.join(n[o] + e[o] for o in range(5))
    r = ''.join(e[o + 3] + a[o] for o in range(5))
    # NOTE(review): the prefix is 'AL' in the source text; other published
    # versions of this signature may use a different prefix — confirm.
    AS = 'AL' + s + e[-3:]
    CP = e[0:3] + r + 'E1'
    return AS, CP
def download(title, news_url):
    """Fetch one Toutiao article and save the Chinese text of its content.

    The page source is searched for ``content:...,`` fragments; every run of
    Chinese characters found in them is written once, one run per line, in
    order of first appearance.

    Args:
        title: Headline of the article; used as the file name after
            characters that are illegal in Windows file names are removed.
        news_url: Address of the article page.

    Returns:
        0 when the response status is not 200 or no content fragment is
        found; otherwise None after the file has been written.
    """
    with request.urlopen(news_url) as req:
        if req.getcode() != 200:
            return 0
        res = req.read().decode('utf-8')

    pat1 = r'content:(.*?),'
    pat2 = re.compile('[\u4e00-\u9fa5]+')
    result1 = re.findall(pat1, res)
    if not result1:
        return 0
    print(result1)
    result2 = re.findall(pat2, str(result1))
    # dict.fromkeys removes duplicates while keeping first-seen order.
    result3 = list(dict.fromkeys(result2))

    # Remove : " | / \ * < > ? in a single pass.
    title = title.translate(str.maketrans('', '', ':"|/\\*<>?'))
    # Explicit UTF-8 so the output does not depend on the platform default.
    with open(r'D:\code\python\spider_news\Toutiao_news\society\\' + title + '.txt', 'w', encoding='utf-8') as file_object:
        file_object.write('\t\t\t\t')
        file_object.write(title)
        file_object.write('\n')
        file_object.write('该新闻地址:')
        file_object.write(news_url)
        file_object.write('\n')
        for line in result3:
            file_object.write(line)
            file_object.write('\n')
def get_item(url):
    """Request one page of the Toutiao feed and download every news item.

    Items that carry an ``ad_label`` key are printed and skipped.

    Args:
        url: Feed URL as built by ``get_url``.

    Returns:
        The ``max_behot_time`` value from the response's ``next`` object, to
        be used as the cursor for the following page.
    """
    # The site rejects requests without this cookie.
    cookies = {'tt_webid': '6478612551432734221'}
    wbdata = requests.get(url, cookies=cookies)
    wbdata2 = json.loads(wbdata.text)
    for news in wbdata2['data']:
        title = news['title']
        # NOTE(review): the host was truncated in the source text ('utiao');
        # restored to 'https://www.toutiao.com' — confirm.
        news_url = 'https://www.toutiao.com' + news['source_url']
        print(title, news_url)
        if 'ad_label' in news:
            print(news['ad_label'])
            continue
        download(title, news_url)
    next_data = wbdata2['next']
    next_max_behot_time = next_data['max_behot_time']
    return next_max_behot_time
# Script entry point: page through the Toutiao feed refresh + 1 times.
if __name__ == '__main__':
    refresh = 50
    # The first request uses cursor 0; each response supplies the next cursor.
    next_max_behot_time = 0
    for x in range(0, refresh + 1):
        print('第{0}次:'.format(x))
        AS, CP = get_ASCP()
        url = get_url(next_max_behot_time, AS, CP)
        next_max_behot_time = get_item(url)
四、UC:
UC和新浪差不多,没有太复杂的反爬虫,直接解析爬取就好。
from bs4 import BeautifulSoup
from urllib import request
def download(title, url):
    """Fetch one UC news article and save its text as a UTF-8 file.

    Args:
        title: Headline of the article; used as the file name after
            characters that are illegal in Windows file names are removed.
        url: Address of the article page.

    Returns:
        0 when the page has no ``div.sm-article-content`` element; otherwise
        None after the file has been written.
    """
    req = request.Request(url)
    with request.urlopen(req) as response:
        page = response.read().decode('utf-8')
    soup = BeautifulSoup(page, 'lxml')
    tag = soup.find('div', class_='sm-article-content')
    if tag is None:
        return 0
    # Remove : " | / \ * < > ? in a single pass.
    title = title.translate(str.maketrans('', '', ':"|/\\*<>?'))
    with open(r'D:\code\python\spider_news\UC_news\society\\' + title + '.txt', 'w', encoding='utf-8') as file_object:
        file_object.write('\t\t\t\t')
        file_object.write(title)
        file_object.write('\n')
        file_object.write('该新闻地址:')
        file_object.write(url)
        file_object.write('\n')
        file_object.write(tag.get_text())
# Script entry point: fetch the UC society news index and download each item.
if __name__ == '__main__':
    # NOTE(review): the host was truncated in the source text ('news.uc');
    # restored to 'https://news.uc.cn' — confirm.
    base = 'https://news.uc.cn'
    # The same index page is requested seven times, as in the original;
    # presumably the page serves different items on each load — verify.
    for i in range(0, 7):
        url = base + '/c_shehui/'
        with request.urlopen(url) as res:
            req = res.read().decode('utf-8')
        soup = BeautifulSoup(req, 'lxml')
        for x in soup.find_all('div', class_='txt-area-title'):
            if x.a is None or x.a.string is None or not x.a.get('href'):
                # Skip entries without a usable headline or link.
                continue
            news_url = base + x.a.get('href')
            print(x.a.string, news_url)
            download(x.a.string, news_url)

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。