Python3:爬取新浪、网易、今日头条、UC四大网站新闻标题及内容

Python3：爬取新浪、网易、今日头条、UC四大网站新闻标题及内容

Python3：爬取新浪、网易、今日头条、UC四大网站新闻标题及内容

以爬取相应网站的社会新闻内容为例：

一、新浪：

新浪网的新闻比较好爬取，我是用BeautifulSoup直接解析的，它并没有使用JS异步加载，直接爬取就行了。

'''

新浪新闻：http://news.sina.com.cn/society/

Date：20180920

Author：lizm

Description：获取新浪新闻

'''

import requests

from bs4 import BeautifulSoup

from urllib import request

import sys

import re

import os

def getNews(title, url, m):
    """Download one news article and save its text and images under ``news/``.

    The page at *url* is fetched and parsed; the text of its ``div.article``
    element is written to ``<sys.path[0]>/news/<title>.txt`` and every image
    found in a ``div.img_wrapper`` inside the article is saved as
    ``<sys.path[0]>/news/<title>/<title><index>.jpg``.

    Args:
        title: Headline of the article. Punctuation is stripped from it and
            the result is used as the file and image-directory name.
        url: Address of the article page.
        m: Running number of the article; only used in the progress message.

    Returns:
        Always 0, including when the page has no ``div.article`` element
        (in which case nothing is written).
    """
    # Local import so the block does not depend on a file-level change.
    from urllib.parse import urljoin

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }

    req = request.Request(url)
    with request.urlopen(req) as response:
        # Bytes that are not valid UTF-8 are dropped instead of raising.
        page = response.read().decode('utf-8', "ignore")

    soup = BeautifulSoup(page, 'lxml')
    tag = soup.find('div', class_='article')
    if tag is None:
        return 0

    # Publication time (<span>) and publishing site (<a>) share one container.
    # Missing pieces fall back to '' so a layout change does not abort the save.
    fb_date = ''
    fb_www = ''
    date_source = soup.find('div', 'date-source')
    if date_source is not None:
        if date_source.span is not None:
            fb_date = date_source.span.string or ''
        if date_source.a is not None:
            fb_www = date_source.a.string or ''

    # Strip whitespace and punctuation from the title so it can be a file name.
    rep = re.compile(r"[\s+\.\!\/_,$%^*(+\"\']+|[+<>?、~*（）]+")
    title = rep.sub('', title)
    title = title.replace(':', '：')

    news_dir = sys.path[0] + "/news/"
    os.makedirs(news_dir, exist_ok=True)
    filename = news_dir + title + ".txt"
    with open(filename, 'w', encoding='utf8') as file_object:
        file_object.write(fb_date + "" + fb_www)
        file_object.write("\n")
        file_object.write("⽹址:" + url)
        file_object.write("\n")
        file_object.write(title)
        file_object.write(tag.get_text())

    # Images go into a directory named after the article. Full paths are used
    # instead of os.chdir() so the process working directory is left untouched.
    image_dir = news_dir + title
    for i, image in enumerate(tag.find_all('div', 'img_wrapper')):
        src = image.img.get('src') if image.img is not None else None
        if not src:
            continue
        os.makedirs(image_dir, exist_ok=True)
        # urljoin resolves relative and protocol-relative ("//host/...")
        # sources against the article address.
        image_url = urljoin(url, src)
        reply = requests.get(image_url, headers=headers)
        # Image data is binary, so the file is opened in 'wb' mode.
        with open(os.path.join(image_dir, title + str(i) + ".jpg"), 'wb') as f:
            f.write(reply.content)

    print('成功爬取第', m,'个新闻',title)
    return 0

#获取社会新闻（最新的162条新闻）

def getTitle(url):
    """Walk the article list on the page at *url* and save every linked article.

    Each ``<li>`` of the ``ul.seo_data_list`` element that contains a link is
    printed and handed to ``getNews`` together with a running counter.

    Args:
        url: Address of the listing page.

    Returns:
        None.
    """
    req = request.Request(url)
    with request.urlopen(req) as response:
        page = response.read().decode('utf8')

    soup = BeautifulSoup(page, 'lxml')
    listing = soup.find('ul', class_='seo_data_list')
    if listing is None:
        # The list element is absent, so there is nothing to crawl.
        return

    y = 0  # counts the links passed on to getNews
    for tag in listing.find_all('li'):
        if tag.a is not None:
            href = tag.a.get('href')
            print(y, tag.a.string, href)
            getNews(tag.a.string, href, y)
            y += 1

if __name__ == '__main__':
    # NOTE(review): the extracted text read 'news.sina/society/', which
    # urlopen rejects because it has no scheme; the scheme and domain were
    # restored here - confirm the address.
    url = 'http://news.sina.com.cn/society/'
    getTitle(url)

二、网易：

网易新闻的标题及内容是使用js异步加载的，单纯的下载网页源代码是没有标题及内容的，我们可以在Network的js中找到我们需要的内容，这里我使用了正则表达式来获取我们需要的标题及其链接，并使用了BeautifulSoup来获取相应标题的内容。

import re

from urllib import request

from bs4 import BeautifulSoup

def download(title, url, save_dir=r'D:\code\python\spider_news\NetEase_news\sociaty\\'):
    """Fetch one article page and save the text of its ``div.post_text``.

    Args:
        title: Headline of the article; characters that are not allowed in
            Windows file names are removed before it is used as file name.
        url: Address of the article page.
        save_dir: Directory prefix the ``<title>.txt`` file is written to.
            It is concatenated as-is, so it must end with a path separator.

    Returns:
        None. Nothing is written when the page has no ``div.post_text``.
    """
    with request.urlopen(url) as req:
        res = req.read()

    soup = BeautifulSoup(res, 'lxml')
    tag = soup.find('div', class_='post_text')
    if tag is None:
        return

    # Remove : " | / \ * < > ? in a single pass.
    title = title.translate(str.maketrans('', '', ':"|/\\*<>?'))

    file_name = save_dir + title + '.txt'
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(tag.get_text())

if __name__ == '__main__':
    # NOTE(review): the extracted text had no scheme and a shortened host
    # ('temp.163/...'); both were restored here - confirm the addresses.
    urls = ['http://temp.163.com/special/00804KVA/cm_shehui.js?callback=data_callback',
            'http://temp.163.com/special/00804KVA/cm_shehui_02.js?callback=data_callback',
            'http://temp.163.com/special/00804KVA/cm_shehui_03.js?callback=data_callback']

    # The feeds are JavaScript callbacks; headlines and links are pulled out
    # of the raw text with regular expressions.
    pat1 = r'"title":"(.*?)",'
    pat2 = r'"tlink":"(.*?)",'

    for url in urls:
        with request.urlopen(url) as req:
            res = req.read().decode('gbk')

        news_title = re.findall(pat1, res)
        news_url = re.findall(pat2, res)

        # zip() pairs each headline with its link and stops at the shorter
        # list, so a mismatch in counts cannot raise IndexError.
        for i, (headline, link) in enumerate(zip(news_title, news_url)):
            download(headline, link)
            print('正在爬取第' + str(i) + '个新闻', headline)

三、头条：

头条的新闻跟前两个也都不一样，它的标题和链接是封装到json文件中的，但是它json文件的url参数是通过一个js随机算法变化的，所以我们需要模拟json文件的参数，否则我们找不到json文件的具体url，我是通过这篇博客才了解到url获取方法的，而且也解决了总是下载重复新闻的问题，该网站自带反爬机制，需要添加cookie。关于新闻的内容，我用了正则表达式提取了中文。

from urllib import request

import requests

import json

import time

import math

import hashlib

import re

from bs4 import BeautifulSoup

def get_url(max_behot_time, AS, CP):
    """Build the feed URL for one page of society news.

    Args:
        max_behot_time: Value placed in both the ``max_behot_time`` and
            ``max_behot_time_tmp`` query parameters.
        AS: Value of the ``as`` query parameter.
        CP: Value of the ``cp`` query parameter.

    Returns:
        The complete URL as a string.
    """
    # NOTE(review): the host was garbled to 'utiao' in the extracted text;
    # 'https://www.toutiao.com' was restored here - confirm.
    template = (
        'https://www.toutiao.com/api/pc/feed/'
        '?category=news_society&utm_source=toutiao&widen=1'
        '&max_behot_time={0}'
        '&max_behot_time_tmp={0}'
        '&tadrequire=true'
        '&as={1}'
        '&cp={2}'
    )
    return template.format(max_behot_time, AS, CP)

def get_ASCP(timestamp=None):
    """Derive the ``AS`` / ``CP`` pair that is passed on to ``get_url``.

    Both values are built by interleaving the upper-case hex form of a Unix
    timestamp with characters from the upper-case MD5 hex digest of the same
    timestamp written as a decimal string.

    Args:
        timestamp: Unix time in seconds. Defaults to the current time; pass
            a fixed value to get a reproducible result.

    Returns:
        A tuple ``(AS, CP)`` of two strings. When the hex form of the
        timestamp is not exactly 8 characters long, a fixed fallback pair is
        returned instead.
    """
    if timestamp is None:
        t = int(math.floor(time.time()))
    else:
        t = int(timestamp)

    e = hex(t).upper()[2:]  # hex digits without the '0X' prefix
    i = hashlib.md5(str(t).encode(encoding='utf-8')).hexdigest().upper()

    if len(e) != 8:
        return '479BB4B7254C150', '7E0AC8874BB0985'

    n = i[:5]    # first five digest characters
    a = i[-5:]   # last five digest characters
    s = ''.join(n[o] + e[o] for o in range(5))
    r = ''.join(e[o + 3] + a[o] for o in range(5))

    # NOTE(review): other published versions of this routine use the prefix
    # 'A1' here; the value is kept exactly as found - confirm.
    AS = 'AL' + s + e[-3:]
    CP = e[:3] + r + 'E1'
    return AS, CP

def download(title, news_url, save_dir=r'D:\code\python\spider_news\Toutiao_news\society\\'):
    """Fetch one article page and save the Chinese text found in its content.

    The page source is searched for ``content:...,`` fragments; every run of
    Chinese characters inside them is written on its own line, with repeated
    runs written only once.

    Args:
        title: Headline of the article; characters that are not allowed in
            Windows file names are removed before it is used as file name.
        news_url: Address of the article page.
        save_dir: Directory prefix the ``<title>.txt`` file is written to.
            It is concatenated as-is, so it must end with a path separator.

    Returns:
        0 when the response status is not 200 or no content fragment is
        found; otherwise None after the file has been written.
    """
    with request.urlopen(news_url) as req:
        if req.getcode() != 200:
            return 0
        res = req.read().decode('utf-8')

    pat1 = r'content:(.*?),'
    pat2 = re.compile('[\u4e00-\u9fa5]+')

    result1 = re.findall(pat1, res)
    if len(result1) == 0:
        return 0
    print(result1)

    result2 = pat2.findall(str(result1))
    # dict.fromkeys keeps the first occurrence of each run, in order.
    result3 = list(dict.fromkeys(result2))

    # Remove : " | / \ * < > ? in a single pass.
    title = title.translate(str.maketrans('', '', ':"|/\\*<>?'))

    # An explicit encoding keeps the output independent of the platform
    # default, matching the other crawlers in this file.
    with open(save_dir + title + '.txt', 'w', encoding='utf-8') as file_object:
        file_object.write('\t\t\t\t')
        file_object.write(title)
        file_object.write('\n')
        file_object.write('该新闻地址：')
        file_object.write(news_url)
        file_object.write('\n')
        for line in result3:
            file_object.write(line)
            file_object.write('\n')

#print('正在爬取')

def get_item(url):
    """Fetch one feed page, download every non-ad article, return the next cursor.

    Args:
        url: Feed address as produced by ``get_url``.

    Returns:
        The ``max_behot_time`` value from the ``next`` object of the feed
        response, to be used for the following request.
    """
    cookies = {'tt_webid': '6478612551432734221'}
    wbdata = requests.get(url, cookies=cookies)
    wbdata2 = json.loads(wbdata.text)

    for news in wbdata2['data']:
        title = news['title']
        # NOTE(review): the host prefix was garbled to 'utiao' in the
        # extracted text; 'https://www.toutiao.com' was restored - confirm.
        news_url = 'https://www.toutiao.com' + news['source_url']
        print(title, news_url)

        # Entries carrying an 'ad_label' key are printed and skipped.
        if 'ad_label' in news:
            print(news['ad_label'])
            continue

        download(title, news_url)

    next_data = wbdata2['next']
    next_max_behot_time = next_data['max_behot_time']
    return next_max_behot_time

if __name__ == '__main__':
    refresh = 50

    # The first request uses 0; every later request uses the cursor returned
    # by the previous page.
    max_behot_time = 0
    for x in range(0, refresh + 1):
        print('第{0}次：'.format(x))
        AS, CP = get_ASCP()
        url = get_url(max_behot_time, AS, CP)
        max_behot_time = get_item(url)

四、UC

UC和新浪差不多，没有太复杂的反爬虫，直接解析爬取就好。

from bs4 import BeautifulSoup

from urllib import request

def download(title, url, save_dir=r'D:\code\python\spider_news\UC_news\society\\'):
    """Fetch one article page and save the text of its ``div.sm-article-content``.

    Args:
        title: Headline of the article; characters that are not allowed in
            Windows file names are removed before it is used as file name.
        url: Address of the article page.
        save_dir: Directory prefix the ``<title>.txt`` file is written to.
            It is concatenated as-is, so it must end with a path separator.

    Returns:
        0 when the page has no ``div.sm-article-content`` element; otherwise
        None after the file has been written.
    """
    req = request.Request(url)
    with request.urlopen(req) as response:
        page = response.read().decode('utf-8')

    soup = BeautifulSoup(page, 'lxml')
    tag = soup.find('div', class_='sm-article-content')
    if tag is None:
        return 0

    # Remove : " | / \ * < > ? in a single pass.
    title = title.translate(str.maketrans('', '', ':"|/\\*<>?'))

    with open(save_dir + title + '.txt', 'w', encoding='utf-8') as file_object:
        file_object.write('\t\t\t\t')
        file_object.write(title)
        file_object.write('\n')
        file_object.write('该新闻地址：')
        file_object.write(url)
        file_object.write('\n')
        file_object.write(tag.get_text())

#print('正在爬取')

if__name__ == '__main__':

for i in range(0,7):

url = 'news.uc/c_shehui/'

# headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36", # "cookie":"sn=3957284397500558579; _uc_pramas=%7B%22fr%22%3A%22pc%22%7D"}

688IT编程网

Python3:爬取新浪、网易、今日头条、UC四大网站新闻标题及内容_百度文 ...

发表评论

推荐文章

java正则表达式选择题

一种基于正则表达式的DBC文件解析及报文分析方法[发明专利]

工龄小数点提取

非零金额正则表达式

提取文本中数字的函数

热门文章

利用正则表达式实现文本数据提取与处理

正则表达式零宽断言详解

文本匹配规则

excel中使用正则

1-31正则表达式

anki之高级筛选

BUAA_OO_2021_第一单元总结

insert语句递增写法

sublime text 3在行前插入递增数字序号的方法

字符串只允许数字和英文的正则

powerbuilder 正则表达式

Shell脚本编写的高级技巧利用正则表达式进行字符串匹配

JAVA正则表达式的三种模式:贪婪,勉强和占有的讨论

go regexp匹配规则

oracle regexp_substr 实现原理

基本的元字符回溯引用和前后查匹配模式

elasticsearch query dsl正则

oracle sql正则表达式

GA-设置目标

仅匹配全角片假名的正则表达式

最新文章

java正则表达式选择题

工龄小数点提取

非零金额正则表达式

提取文本中数字的函数

vue数字相加小数点变长-概述说明以及解释

vue validate 正则验证小数长度

标签列表

688IT编程网

Python3:爬取新浪、网易、今日头条、UC四大网站新闻标题及内容_百度文 ...

发表评论

推荐文章

java正则表达式 选择题

一种基于正则表达式的DBC文件解析及报文分析方法[发明专利]

工龄小数点提取

非零金额 正则表达式

提取文本中数字的函数

热门文章

利用正则表达式实现文本数据提取与处理

正则表达式零宽断言详解

文本匹配规则

excel中使用正则

1-31正则表达式

anki之高级筛选

BUAA_OO_2021_第一单元总结

insert语句递增写法

sublime text 3在行前插入递增数字序号的方法

字符串只允许数字和英文的正则

powerbuilder 正则表达式

Shell脚本编写的高级技巧利用正则表达式进行字符串匹配

JAVA正则表达式的三种模式:贪婪,勉强和占有的讨论

go regexp匹配规则

oracle regexp_substr 实现原理

基本的元字符 回溯引用和前后查 匹配模式

elasticsearch query dsl正则

oracle sql正则表达式

GA-设置目标

仅匹配全角片假名的正则表达式

最新文章

java正则表达式 选择题

工龄小数点提取

非零金额 正则表达式

提取文本中数字的函数

vue数字相加小数点变长-概述说明以及解释

vue validate 正则验证小数长度

标签列表

java正则表达式选择题

非零金额正则表达式

基本的元字符回溯引用和前后查匹配模式

java正则表达式选择题

非零金额正则表达式