import copy
from collections.abc import Sequence

from scrapy.http import HtmlResponse, Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Spider
from scrapy.utils.spider import iterate_spider_output


def _identity(x):
    return x


def _identity_process_request(request, response):
    return request


def _get_method(method, spider):
    if callable(method):
        return method
    if isinstance(method, str):
        return getattr(spider, method, None)
    return None


_default_link_extractor = LinkExtractor()


class Rule:
    def __init__(
        self,
        link_extractor=None,
        callback=None,
        cb_kwargs=None,
        follow=None,
        process_links=None,
        process_request=None,
        errback=None,
    ):
        self.link_extractor = link_extractor or _default_link_extractor
        self.callback = callback
        self.errback = errback
        self.cb_kwargs = cb_kwargs or {}
        self.process_links = process_links or _identity
        self.process_request = process_request or _identity_process_request
        # Follow extracted links by default only when no callback is set.
        self.follow = follow if follow is not None else not callback

    def _compile(self, spider):
        # Resolve string method names into bound methods on the spider.
        self.callback = _get_method(self.callback, spider)
        self.errback = _get_method(self.errback, spider)
        self.process_links = _get_method(self.process_links, spider)
        self.process_request = _get_method(self.process_request, spider)
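
# Illustrative sketch (not part of the module itself): a Rule is usually
# declared with a string callback name, which Rule._compile resolves to a
# bound spider method via _get_method. The spider instance and URL pattern
# below are hypothetical.
#
#     rule = Rule(LinkExtractor(allow=r'/item/'), callback='parse_item')
#     compiled = copy.copy(rule)
#     compiled._compile(spider)  # compiled.callback is now spider.parse_item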


class CrawlSpider(Spider):
    rules: Sequence[Rule] = ()

    def __init__(self, *a, **kw):
        super().__init__(*a, **kw)
        self._compile_rules()

    def _parse(self, response, **kwargs):
        return self._parse_response(
            response=response,
            callback=self.parse_start_url,
            cb_kwargs=kwargs,
            follow=True,
        )

    def parse_start_url(self, response, **kwargs):
        # Override to process responses for the start URLs.
        return []

    def process_results(self, response, results):
        # Override to post-process the output of rule callbacks.
        return results

    def _build_request(self, rule_index, link):
        return Request(
            url=link.url,
            callback=self._callback,
            errback=self._errback,
            meta=dict(rule=rule_index, link_text=link.text),
        )

    def _requests_to_follow(self, response):
        if not isinstance(response, HtmlResponse):
            return
        # Deduplicate links across all rules for this response.
        seen = set()
        for rule_index, rule in enumerate(self._rules):
            links = [
                lnk
                for lnk in rule.link_extractor.extract_links(response)
                if lnk not in seen
            ]
            for link in rule.process_links(links):
                seen.add(link)
                request = self._build_request(rule_index, link)
                yield rule.process_request(request, response)

    def _callback(self, response):
        rule = self._rules[response.meta['rule']]
        return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)

    def _errback(self, failure):
        rule = self._rules[failure.request.meta['rule']]
        return self._handle_failure(failure, rule.errback)

    def _parse_response(self, response, callback, cb_kwargs, follow=True):
        if callback:
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for request_or_item in iterate_spider_output(cb_res):
                yield request_or_item

        if follow and self._follow_links:
            for request_or_item in self._requests_to_follow(response):
                yield request_or_item

    def _handle_failure(self, failure, errback):
        if errback:
            results = errback(failure) or ()
            for request_or_item in iterate_spider_output(results):
                yield request_or_item

    def _compile_rules(self):
        # Compile copies of the class-level rules so the originals stay intact.
        self._rules = []
        for rule in self.rules:
            self._rules.append(copy.copy(rule))
            self._rules[-1]._compile(self)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        spider._follow_links = crawler.settings.getbool(
            'CRAWLSPIDER_FOLLOW_LINKS', True
        )
        return spider
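
# Minimal usage sketch (illustrative; the spider name, domain, and URL
# patterns are hypothetical). With no callback, a Rule's follow defaults
# to True, so matching links are crawled without producing items.
#
#     class ExampleSpider(CrawlSpider):
#         name = 'example'
#         allowed_domains = ['example.com']
#         start_urls = ['https://example.com/']
#         rules = (
#             Rule(LinkExtractor(allow=r'/item/'), callback='parse_item'),
#             Rule(LinkExtractor(allow=r'/page/')),  # pagination: follow only
#         )
#
#         def parse_item(self, response):
#             yield {'url': response.url, 'title': response.css('title::text').get()}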