Scrapy的extensions:EXTENSIONS
EXTENSIONS
EXTENSIONS_BASE
注意:关于scrapy爬虫extensions 执行顺序的问题
查看默认启用的扩展配置:scrapy settings --get EXTENSIONS_BASE
{"scrapy.extensions.corestats.CoreStats": 0,
"scrapy.extensions.telnet.TelnetConsole": 0,
"scrapy.extensions.memusage.MemoryUsage": 0,
"scrapy.extensions.memdebug.MemoryDebugger": 0,
"scrapy.extensions.closespider.CloseSpider": 0,
"scrapy.extensions.feedexport.FeedExporter": 0,
"scrapy.extensions.logstats.LogStats": 0,
"scrapy.extensions.spiderstate.SpiderState": 0,
"scrapy.extensions.throttle.AutoThrottle": 0
}
EXTENSIONS 设置是一个 dict,其中键是扩展路径,值定义扩展的加载顺序。EXTENSIONS 设置会与 Scrapy 中定义的 EXTENSIONS_BASE 设置(不应被覆盖)合并,然后按值排序,得到最终启用扩展的有序列表。
由于扩展通常不相互依赖,因此它们的加载顺序在大多数情况下是不相关的。这就是 EXTENSIONS_BASE 设置以相同顺序(0)定义所有扩展的原因。但是,如果您需要添加一个依赖于已经加载的其他扩展的扩展,则可以利用这个特性
需要重点关注的几个默认extensions
- from scrapy.extensions.corestats import CoreStats
- from scrapy.extensions.logstats import LogStats
- from scrapy.extensions.throttle import AutoThrottle
关于signal
scrapy.crawler.signals中定义了scrapy的各种状态
E:\python3.7.6\Lib\site-packages\scrapy\crawler.py,中初始化self.signals = SignalManager(self)
E:\python3.7.6\Lib\site-packages\scrapy\signalmanager.py(SignalManager),基于from pydispatch import dispatcher这个模块
import logging
from twisted.internet.defer import DeferredList, Deferred
from twisted.python.failure import Failure
from pydispatch.dispatcher import Anonymous, Any, disconnect, getAllReceivers, liveReceivers
from pydispatch.robustapply import robustApply
from scrapy.exceptions import StopDownload
from scrapy.utils.defer import maybeDeferred_coro
from scrapy.utils.log import failure_to_exc_info
def send_catch_log(signal=Any, sender=Anonymous, *arguments, **named):
    """Like pydispatcher.robust.sendRobust but it also logs errors and returns
    Failures instead of exceptions.
    """
    # Exception types that are caught but deliberately NOT logged.
    # NOTE(review): `_IgnoredException` and `logger` are module-level names
    # from scrapy.utils.signal that are not shown in this excerpt.
    dont_log = (named.pop('dont_log', _IgnoredException), StopDownload)
    spider = named.get('spider', None)
    responses = []
    # getAllReceivers: given the sender and signal, look up every registered
    # receiver; liveReceivers filters out dead (garbage-collected) ones.
    for receiver in liveReceivers(getAllReceivers(sender, signal)):
        try:
            # Invoke the receiver; robustApply only forwards the keyword
            # arguments that the receiver's signature actually accepts.
            response = robustApply(receiver, signal=signal, sender=sender, *arguments, **named)
            if isinstance(response, Deferred):
                # Plain send_catch_log cannot wait on Deferreds; async
                # handlers must go through send_catch_log_deferred instead.
                logger.error("Cannot return deferreds from signal handler: %(receiver)s",
                             {'receiver': receiver}, extra={'spider': spider})
        except dont_log:
            # Expected/ignorable failure: record it but stay quiet.
            result = Failure()
        except Exception:
            result = Failure()
            logger.error("Error caught on signal handler: %(receiver)s",
                         {'receiver': receiver},
                         exc_info=True, extra={'spider': spider})
        else:
            # No exception raised: record the handler's return value.
            result = response
        responses.append((receiver, result))
    # One (receiver, result-or-Failure) pair per connected receiver.
    return responses
from pydispatch import dispatcher
from scrapy.utils import signal as _signal
class SignalManager:
    """Facade over ``pydispatch.dispatcher`` used by Scrapy's crawler.

    Every helper defaults the ``sender`` keyword argument to the manager's
    own ``sender`` object, so callers only need to name the signal.
    """
    # FIX: the original snippet had the prose "例子:" (Example:) fused onto the
    # final return statement, which made the class syntactically invalid.

    def __init__(self, sender=dispatcher.Anonymous):
        # Default sender attached to every connect/send/disconnect call
        # unless the caller supplies one explicitly.
        self.sender = sender

    def connect(self, receiver, signal, **kwargs):
        """
        Connect a receiver function to a signal.

        The signal can be any object, although Scrapy comes with some
        predefined signals that are documented in the :ref:`topics-signals`
        section.

        :param receiver: the function to be connected
        :type receiver: collections.abc.Callable

        :param signal: the signal to connect to
        :type signal: object
        """
        kwargs.setdefault('sender', self.sender)
        return dispatcher.connect(receiver, signal, **kwargs)

    def disconnect(self, receiver, signal, **kwargs):
        """
        Disconnect a receiver function from a signal. This has the
        opposite effect of the :meth:`connect` method, and the arguments
        are the same.
        """
        kwargs.setdefault('sender', self.sender)
        return dispatcher.disconnect(receiver, signal, **kwargs)

    def send_catch_log(self, signal, **kwargs):
        """
        Send a signal, catch exceptions and log them.

        The keyword arguments are passed to the signal handlers (connected
        through the :meth:`connect` method).
        """
        kwargs.setdefault('sender', self.sender)
        return _signal.send_catch_log(signal, **kwargs)

    def send_catch_log_deferred(self, signal, **kwargs):
        """
        Like :meth:`send_catch_log` but supports returning
        :class:`~twisted.internet.defer.Deferred` objects from signal handlers.

        Returns a Deferred that gets fired once all signal handlers
        deferreds were fired. Send a signal, catch exceptions and log them.

        The keyword arguments are passed to the signal handlers (connected
        through the :meth:`connect` method).
        """
        kwargs.setdefault('sender', self.sender)
        return _signal.send_catch_log_deferred(signal, **kwargs)

    def disconnect_all(self, signal, **kwargs):
        """
        Disconnect all receivers from the given signal.

        :param signal: the signal to disconnect from
        :type signal: object
        """
        kwargs.setdefault('sender', self.sender)
        return _signal.disconnect_all(signal, **kwargs)
from pydispatch import dispatcher
from settings import TASK_END_SIGNAL, TASK_START_SIGNAL
class TaskSignal(object):
    """Emits task lifecycle signals through the pydispatch dispatcher."""

    def task_start(self, app_task):
        """Broadcast TASK_START_SIGNAL with *app_task* as sender; return it."""
        dispatcher.send(signal=TASK_START_SIGNAL, sender=app_task)
        return app_task

    def task_close(self, app_task):
        """Broadcast TASK_END_SIGNAL with *app_task* as sender; return it."""
        dispatcher.send(signal=TASK_END_SIGNAL, sender=app_task)
        return app_task
def _start_spider(sender, **kwargs):
print(sender, kwargs)
print("任务开始")
def _finish_spider(sender, **kwargs):
print(sender, kwargs)
print("任务结束")
# Register the receivers. sender=dispatcher.Any means "react to this signal
# no matter which object emitted it".
dispatcher.connect(
    receiver=_start_spider,
    signal=TASK_START_SIGNAL,
    sender=dispatcher.Any
)
dispatcher.connect(
    receiver=_finish_spider,
    signal=TASK_END_SIGNAL,
    sender=dispatcher.Any
)
if __name__ == '__main__':
    # BUG FIX: the original line ended with a stray trailing comma
    # (`s = {...},`), which silently made `s` a one-element *tuple*
    # instead of the intended task dict.
    s = {"task": "demo", "name": "xx"}
    ts = TaskSignal()
    # NOTE(review): only task_start is fired here, so _finish_spider never
    # runs in this demo -- presumably intentional for brevity.
    ts.task_start(s)
关于scrapy的爬虫状态统计CoreStats
class CoreStats:
关于scrapy的爬虫状态统计LogStats
class LogStats:
本博客所有文章除特别声明外,均采用 CC BY-NC-SA 4.0 许可协议。转载请注明来源 desperado!