Python爬虫——多线程threading模块爬取抖音用户信息
爬⾍背景:
原来的数据库中有1.5亿左右的用户id,但其中有1.2亿的用户资料是不完整的(没有粉丝数量、点赞数量等,算是无用数据)。现在老板要求为这些资料不全的用户更新信息,咋办?
刚开始的想法是使用主从模式+scrapy爬取,但是写着写着觉得麻烦(写python的都很懒,scrapy还是比较臃肿的)。然后突然想到:处理爬虫这种存在大量io的操作时,python中的多线程是非常有用的,而且省服务器资源(其他的爬虫也在服务器,能省一点是一点,毕竟是自己用的,太卡的话上班都不爽),开始干!
不怕笑话,就2个函数。。。。
我觉得代码不重要,重要的是处理的方式,以及多多利用多线程的这种方法。各种原因不想搭scrapy的时候,只需要几行代码,就能将爬虫速度提升n倍,岂不美哉?
一.
说一下我的爬虫思路(架构?),不存在花里胡哨的操作与各种专业名词,只讲究实用
1.我的整个爬虫所用到的有django1.8.2 + requests + redis + pgsql。由于这里django的作用很小(代码中有讲),就不讲了
2.django操作pg存数据,redis作为队列
3.由于使用了redis作为队列,每次rpop都是原子操作,不会有两个线程取到同一个user_id,所以不存在多线程中的锁(Lock)的隐患,队列本身就是安全的,锁来锁去什么的不用管
4.我的这个模式,自认为还是比较好用的,简单快捷,剩下的会在下面代码中注释说明,只copy了部分代码块
二.
说一下我这个里面的多线程的注意事项吧:
1.将爬虫用while True包裹起来,目的是为了保持线程的存活;至于单个线程的退出条件,可以用for、while之类的条件来限制
2.函数中不要用return,不要用return,不要用return,重要的事情说三遍(一旦return,线程函数就结束了,这个线程也随之退出)
@staticmethod
def get_user_info_from_app_new(**kwargs):
    """Worker loop: pop user ids from Redis, fetch each profile, upsert it.

    Intended to be used as a thread target. The function never returns: the
    outer ``while True`` keeps the worker thread alive, and each popped
    ``user_id`` is attempted up to ``max_retries`` times with freshly
    generated request parameters and a randomly chosen device type/brand.
    The first successful database update ends the retries for that id.

    Args:
        **kwargs: Forwarded unchanged to ``_params_for_app`` on every attempt.

    NOTE(review): this block was reconstructed from a whitespace-mangled
    paste in which several ``name.attr`` tokens were lost. The HTTP call
    (``requests.get``), the ``jdata.get`` / ``user.get`` receivers, the
    ``or`` in the "limited user" check and the URL scheme/host suffix are
    best-effort restorations -- confirm against the original project.
    """
    max_retries = 150  # device ids are scarce, so one user may need many attempts
    # NOTE(review): the pasted URL had no scheme, no ".com" and stray spaces
    # inside the query string; restored here so the request can be issued.
    url_template = (
        'https://aweme-eagle.snssdk.com/aweme/v1/user/?user_id={}'
        '&retry_type=no_retry&iid={}&device_id={}&ac=wifi&channel=xiaomi'
        '&aid=1128&app_name=aweme&version_code=179&version_name=1.6.8'
        '&device_platform=android&ssmix=a&device_type={}&device_brand={}'
        '&language=zh&os_api=22&os_version=5.1.1&openudid={}'
        '&manifest_version_code=179&resolution=720*1280&dpi=240'
        '&update_version_code=1792&_rticket={}&ts={}&as={}&cp={}'
    )
    while True:
        # Take one user_id off the Redis list that serves as the work queue.
        user_id = redis_client.rpop(REDIS_DOUYIN_NULL_INFO_USER)
        if user_id is None:
            # Empty queue: wait a little instead of sending 150 requests for
            # the id "None"; the thread stays alive and polls again.
            time.sleep(1)
            continue
        for retry in range(max_retries):
            url_as, url_cp, iid, device_id, openudid = _params_for_app(**kwargs)
            # Each attempt uses a random "type;brand" entry from the device set.
            device_info = redis_client.srandmember(REDIS_DOUYIN_SET_DEVICE_TYPE_BRAND)
            device = device_info.split(';')
            d_type = device[0]
            d_brand = device[1]
            url = url_template.format(
                user_id, iid, device_id, d_type, d_brand, openudid,
                int(time.time() * 1000), int(time.time()), url_as, url_cp)
            resp = requests.get(url, headers=app_headers)
            try:
                jdata = json.loads(resp.text)
            except ValueError:
                logging.info('retry for no json data decode, {}, {}'.format(user_id, resp.text))
                continue
            if jdata.get('status_code') != 0 or not jdata.get('user'):
                logging.info('get user info from app error, {}, {}, times:{}'.format(user_id, jdata, retry))
                continue
            user = jdata.get('user')
            try:
                # NOTE(review): the connective between the two tests was lost
                # in the paste; ``or`` is assumed. This only affects logging.
                if user.get('cover_url') is None or user.get("nickname", u'已重置') == u"已重置":
                    logging.info('user has been limited, {}, {}'.format(user_id, jdata))
                musician = user.get('original_musician', {})
                # Direct indexing is deliberate: a missing mandatory key
                # raises and sends this attempt to the retry path below.
                payloads = {
                    'user': user,
                    'city': user.get('city'),
                    'locations': user.get('location'),
                    'activity_digg_count': user['activity']['digg_count'],
                    'use_music_count': user['activity']['use_music_count'],
                    'apple_account': user['apple_account'],
                    'head_url': user['avatar_larger']['url_list'][0],
                    'gender': user.get('gender'),
                    'bind_phone': user['bind_phone'],
                    'birthday': user['birthday'],
                    'country': user['country'],
                    'province': user['province'],
                    'constellation_id': user['constellation'],
                    'verify_info': user['custom_verify'],
                    'dongtai_count': user['dongtai_count'],
                    'like_num': user['favoriting_count'],
                    'follower_quantity': user['follower_count'],
                    'followed_quantity': user['following_count'],
                    'ins_id': user['ins_id'],
                    'mplatform_followers_count': user['mplatform_followers_count'],
                    'name': user['nickname'],
                    'room_id': user['room_id'],
                    'musician_digg_count': musician.get('digg_count'),
                    'musician_music_count': musician.get('music_count'),
                    'musician_music_used_count': musician.get('music_used_count'),
                    'share_qrcode_uri': user['share_qrcode_uri'],
                    'short_id': user['short_id'],
                    'signature': user['signature'],
                    'total_digg_count': user['total_favorited'],
                    'uid': user['uid'],
                    'unique_id': user['unique_id'],
                    'weibo_name': user.get('weibo_name'),
                    'weibo_url': user.get('weibo_url'),
                    'feed_num': user.get('aweme_count'),
                    'with_fusion_shop_entry': user.get('with_fusion_shop_entry'),
                    'with_item_commerce_entry': user.get('with_commerce_entry'),
                    'verification_type': user.get('verification_type'),
                    'custom_verify': user.get('custom_verify'),
                    'enterprise_verify_reason': user.get('enterprise_verify_reason'),
                }
            except Exception:
                # Best-effort crawler: record the malformed payload and retry
                # rather than let the worker thread die.
                # TODO: consider pushing user_id onto a "failed" Redis list here.
                logging.exception('bad user payload, {}'.format(user_id))
                continue
            try:
                # update_or_create inserts or updates keyed on douyin_id, so
                # no separate de-duplication step is needed.
                dd_user, is_create = DouyinUser.objects.update_or_create(
                    douyin_id=user_id,
                    defaults={
                        'name': payloads['name'],
                        'verify_info': payloads['verify_info'],
                        'signature': payloads['signature'],
                        'follower_quantity': payloads['follower_quantity'],
                        'followed_quantity': payloads['followed_quantity'],
                        'total_digg_count': payloads['total_digg_count'],
                        'feed_num': payloads['feed_num'],
                        'like_num': payloads['like_num'],
                    })
            except Exception:
                logging.exception('update user failed, {}'.format(user_id))
                continue
            # Single-argument print() behaves the same on Python 2 and 3.
            print('%s UPDATE USER INFO SUCCESS' % (dd_user,))
            # One success is enough for this user_id; stop retrying (without
            # returning, so the thread moves on to the next queued id).
            break
        else:
            logging.info('give up after {} retries, {}'.format(max_retries, user_id))
主函数
def get_new(self, thread_count=10):
    """Start the crawler worker threads.

    Each thread runs ``self.get_user_info_from_app_new`` with no arguments.
    The threads are started and not joined; this method returns immediately.

    Args:
        thread_count: Number of worker threads to start. Defaults to 10, the
            value that was previously hard-coded.
    """
    for _ in range(thread_count):
        # ``target`` must be passed by keyword: the first positional
        # parameter of Thread is ``group``, which has to be None.
        worker = Thread(target=self.get_user_info_from_app_new)
        worker.start()
是的,多加2行就搞定了,速度提升10倍!
最后:
注意
在windows上,要把主函数放在main函数里面调用
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论