Python 3 crawling with Scrapy: detailed notes on sending POST requests (with code). A simple Scrapy POST request (starting with the approach I actually use).
What will you learn from this article? For study purposes only; if you have questions, please leave a comment.
import scrapy

# Sending a POST request. There is no real application here; sending a POST
# with Request directly is fairly simple.
# Goal: hit the Baidu Translate suggestion endpoint (a POST request); the search term here is "dog".
class PostSpider(scrapy.Spider):
    name = 'post'
    # allowed_domains = ['']
    start_urls = ['https://fanyi.baidu.com/sug']

    # start_requests() is a method of the parent class; start_urls is consumed inside it
    def start_requests(self):
        # note: this is the original implementation
        # for url in self.start_urls:
        #     # send a GET request directly
        #     yield scrapy.Request(url=url, callback=self.parse)
        data = {
            'kw': 'dog'
        }
        # override the parent method so the start URLs are requested via POST
        for url in self.start_urls:
            # send the POST request via FormRequest
            yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse)

    def parse(self, response):
        # the response here is JSON, not page source, so response.xpath is not needed
        print(response.text)
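For comparison, the same request can also be sent with a plain scrapy.Request by setting method and body yourself; a minimal sketch (my own illustration, not from the original post), which is exactly why FormRequest is usually the more convenient choice:

    def start_requests(self):
        # equivalent to the FormRequest above, but the body is encoded by hand
        yield scrapy.Request(
            url='https://fanyi.baidu.com/sug',
            method='POST',
            body='kw=dog',  # form-encoded payload
            headers={'Content-Type': 'application/x-www-form-urlencoded'},
            callback=self.parse,
        )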
The Request object
Source code (from Scrapy):
"""
This module implements the Request class which is used to represent HTTP
requests in Scrapy.
See documentation in docs/topics/request-response.rst
"""
import six
webstorm怎么做网页
from w3lib.url import safe_url_string
matlab 函数拟合polyfitfrom scrapy.http.headers import Headers
from scrapy.utils.python import to_bytes
from ackref import object_ref
from scrapy.utils.url import escape_ajax
from scrapy.httpmon import obsolete_setter
class Request(object_ref):
#初始化
def __init__(self, url, callback=None, method='GET', headers=None, body=None,
cookies=None, meta=None, encoding='utf-8', priority=0,
cookies=None, meta=None, encoding='utf-8', priority=0,
dont_filter=False, errback=None, flags=None):
self._encoding = encoding # this one has to be set firstw3c标准组织中负责标准制定
self._set_url(url)
self._set_body(body)
assert isinstance(priority, int), "Request priority not an integer: %r" % priority
self.priority = priority
if callback is not None and not callable(callback):
raise TypeError('callback must be a callable, got %s' % type(callback).__name__)#raise⾃动引出异常 if errback is not None and not callable(errback):
raise TypeError('errback must be a callable, got %s' % type(errback).__name__)
assert callback or not errback, "Cannot use errback without a callback"
self.callback = callback
self.headers = Headers(headers or {}, encoding=encoding)
self.dont_filter = dont_filter
self._meta = dict(meta) if meta else None
self.flags = [] if flags is None else list(flags)
@property
def meta(self):
if self._meta is None:
self._meta = {}
return self._meta
def _get_url(self):
return self._url
def _set_url(self, url):
if not isinstance(url, six.string_types):
raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
s = safe_url_string(url, ding)
self._url = escape_ajax(s)
if ':' not in self._url:
raise ValueError('Missing scheme in request url: %s' % self._url)
url = property(_get_url, obsolete_setter(_set_url, 'url'))
def _get_body(self):
return self._body
def _set_body(self, body):
if body is None:
self._body = b''
else:
self._body = to_bytes(body, ding)
body = property(_get_body, obsolete_setter(_set_body, 'body'))
@property
def encoding(self):正则表达式数字字母特殊符号包含三种
return self._encoding
def __str__(self):
return "<%s %s>" % (hod, self.url)
__repr__ = __str__
def copy(self):
"""Return a copy of this Request"""
place()
def replace(self, *args, **kwargs):
"""Create a new Request with the same attributes except for those
given new values.
"""
for x in ['url', 'method', 'headers', 'body', 'cookies', 'meta', 'flags',
'encoding', 'priority', 'dont_filter', 'callback', 'errback']:
kwargs.setdefault(x, getattr(self, x))
cls = kwargs.pop('cls', self.__class__)
return cls(*args, **kwargs)
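To see how those pieces behave, a small sketch you can run in a Python shell (the URL is a placeholder of my own):

from scrapy import Request

req = Request('https://example.com/page', priority=10)
print(req)                                             # <GET https://example.com/page>, from __str__
copy_of_req = req.copy()                               # new object with the same attributes (uses replace())
post_req = req.replace(method='POST', body='kw=dog')   # new Request with method and body overridden
print(post_req.method, post_req.body)                  # POST b'kw=dog'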
The parameters I use most often:
url: the URL to request and to process next.
callback: specifies which function handles the Response returned by this request.
method: usually does not need to be set; the default is GET. It can be set to "GET", "POST", "PUT", etc., and the string must be uppercase.
meta: used very often; it passes data between different requests and is a dict.
For example:

request_with_cookies = Request(
    url="http://www.example.com",
    cookies={'currency': 'USD', 'country': 'UY'},
    meta={'dont_merge_cookies': True}
)
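Since meta is listed above as the parameter used most often, here is a small sketch (my own illustration; the URLs and item fields are made up) of passing data from one callback to the next through it:

import scrapy

class MetaDemoSpider(scrapy.Spider):
    name = 'meta_demo'
    start_urls = ['https://example.com/list']

    def parse(self, response):
        item = {'list_url': response.url}
        # hand the partially-built item to the next callback via meta
        yield scrapy.Request(
            'https://example.com/detail',
            callback=self.parse_detail,
            meta={'item': item},
        )

    def parse_detail(self, response):
        item = response.meta['item']   # read it back on the response side
        item['detail_url'] = response.url
        yield item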
encoding: just use the default 'utf-8'.
dont_filter: tells the scheduler not to filter this request. Useful when you want to run the same request multiple times and ignore the duplicates filter. Defaults to False. (The underlying issue is whether the request's callback gets executed; the Scrapy documentation mentions it:
If the spider doesn't define an allowed_domains attribute, or the attribute is empty, the offsite middleware will allow all requests.
If the request has the dont_filter attribute set, the offsite middleware will allow the request even if its domain is not listed in allowed domains.
See also my other post on the Scrapy "Filtered offsite request" error.)
errback: specifies the error-handling function.
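For errback, a minimal sketch of wiring up an error handler (the spider name, URL, and handler name are my own):

import scrapy

class ErrbackDemoSpider(scrapy.Spider):
    name = 'errback_demo'

    def start_requests(self):
        yield scrapy.Request(
            'https://example.com/maybe-broken',
            callback=self.parse,
            errback=self.handle_error,  # invoked when the request fails (DNS error, timeout, non-2xx, ...)
        )

    def parse(self, response):
        self.logger.info('Got %s', response.url)

    def handle_error(self, failure):
        # failure is a twisted Failure wrapping the original exception
        self.logger.error('Request failed: %s', repr(failure))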
The Response object
Part of the source code:
# partial source
class Response(object_ref):

    def __init__(self, url, status=200, headers=None, body='', flags=None, request=None):
        self.headers = Headers(headers or {})
        self.status = int(status)
        self._set_body(body)
        self._set_url(url)
        self.request = request
        self.flags = [] if flags is None else list(flags)

    @property
    def meta(self):
        try:
            return self.request.meta
        except AttributeError:
            raise AttributeError("Response.meta not available, this response "
                                 "is not tied to any request")
Most of this mirrors the Request class above.
status: the HTTP status code of the response
_set_body(body): sets the response body
_set_url(url): sets the response URL
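Inside a spider callback these attributes are read directly; a small sketch (assuming a JSON response, like the Baidu example above):

import json

def parse(self, response):  # a spider callback
    print(response.status)            # HTTP status code, e.g. 200
    print(response.url)               # the response URL
    print(response.headers)           # a Headers object
    print(response.meta)              # the meta dict carried over from the Request
    data = json.loads(response.body)  # response.body is bytes; parse the JSON payload
    return data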
Simulating a GitHub login with POST
Method 1: parse the login form parameters and send the login request
You can send the POST request with the yield scrapy.FormRequest(url, formdata, callback) method.
# -*- coding: utf-8 -*-
import scrapy
import re


class GithubSpider(scrapy.Spider):
    name = 'github'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    def parse(self, response):
        # build the POST parameters from the login page's hidden form fields
        authenticity_token = response.xpath("//input[@name='authenticity_token']/@value").extract_first()
        utf8 = response.xpath("//input[@name='utf8']/@value").extract_first()
        commit = response.xpath("//input[@name='commit']/@value").extract_first()
        post_data = dict(
            login="453045311@qq.com",
            password="****8",
            authenticity_token=authenticity_token,
            utf8=utf8,
            commit=commit
        )
        yield scrapy.FormRequest(
            "https://github.com/session",
            formdata=post_data,
            callback=self.login_parse
        )

    def login_parse(self, response):
        # after logging in, check whether the username appears in the page
        print(re.findall("yangge", response.body.decode()))
Method 2: log in using the pre-populated form data
Use the FormRequest.from_response() method.
Websites usually pre-populate certain form fields (such as session data or the authentication token on a login page) through <input type="hidden"> elements.
When scraping pages with Scrapy, if you want to pre-fill or override form fields such as the username and password, you can use the FormRequest.from_response() method, as sketched below.
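A sketch of the same GitHub login rewritten with FormRequest.from_response() (this variant is my own illustration, not the original post's code): from_response() reads the hidden fields (authenticity_token, utf8, commit) out of the login form automatically, so only the fields to override need to be supplied.

# -*- coding: utf-8 -*-
import scrapy
import re


class Github2Spider(scrapy.Spider):
    name = 'github2'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    def parse(self, response):
        # from_response() pre-fills the hidden <input> fields of the login form;
        # we only override login and password
        yield scrapy.FormRequest.from_response(
            response,
            formdata={'login': '453045311@qq.com', 'password': '****8'},
            callback=self.after_login,
        )

    def after_login(self, response):
        print(re.findall("yangge", response.body.decode()))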