Step 1: run the command scrapy crawl xx. Execution reaches \Lib\site-packages\scrapy\cmdline.py and calls def execute(argv=None, settings=None):
Inside execute, the settings are initialized: settings = get_project_settings().
The project environment is configured in init_env: os.environ['SCRAPY_SETTINGS_MODULE'] = cfg.get('settings', project), which records the dotted path of the project's settings module as read from scrapy.cfg.
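As a rough sketch of how this environment variable ties into get_project_settings() (the project name myproject below is hypothetical, used only for illustration):

import os
from scrapy.utils.project import get_project_settings

# init_env's effect, reproduced by hand: the dotted settings-module path is
# placed in SCRAPY_SETTINGS_MODULE, and get_project_settings() imports that
# module to build the Settings object.
os.environ['SCRAPY_SETTINGS_MODULE'] = 'myproject.settings'  # hypothetical

settings = get_project_settings()
print(settings.get('BOT_NAME'))  # value defined in myproject/settings.py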
Next, cmd.crawler_process = CrawlerProcess(settings) is initialized. CrawlerProcess is the class that runs multiple Scrapy spiders in a single process. It reads cls_path = settings.get('SPIDER_LOADER_CLASS') and loads the spider loader class defined in site-packages\scrapy\spiderloader.py:
# scrapy/spiderloader.py (excerpt; imports shown for context)
import traceback
import warnings
from collections import defaultdict

from zope.interface import implementer

from scrapy.interfaces import ISpiderLoader
from scrapy.utils.misc import walk_modules
from scrapy.utils.spider import iter_spider_classes


@implementer(ISpiderLoader)
class SpiderLoader:
    def __init__(self, settings):
        self.spider_modules = settings.getlist('SPIDER_MODULES')
        self.warn_only = settings.getbool('SPIDER_LOADER_WARN_ONLY')
        self._spiders = {}
        self._found = defaultdict(list)
        self._load_all_spiders()

    def _load_spiders(self, module):
        for spcls in iter_spider_classes(module):
            self._found[spcls.name].append((module.__name__, spcls.__name__))
            self._spiders[spcls.name] = spcls

    def _load_all_spiders(self):
        for name in self.spider_modules:
            try:
                for module in walk_modules(name):
                    self._load_spiders(module)
            except ImportError:
                if self.warn_only:
                    warnings.warn(
                        f"\n{traceback.format_exc()}Could not load spiders "
                        f"from module '{name}'. "
                        "See above traceback for details.",
                        category=RuntimeWarning,
                    )
                else:
                    raise
        self._check_name_duplicates()
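To see the loader in action, here is a minimal usage sketch; 'xx' stands in for the spider name passed on the command line:

from scrapy.spiderloader import SpiderLoader
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
loader = SpiderLoader.from_settings(settings)  # runs _load_all_spiders() above
print(loader.list())            # names of all spiders found in SPIDER_MODULES
spider_cls = loader.load('xx')  # raises KeyError if no spider is named 'xx'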
Then _run_print_help(parser, _run_command, cmd, args, opts) runs the command, which ultimately executes Crawler.crawl in \scrapy\crawler.py:
@defer.inlineCallbacks
def crawl(self, *args, **kwargs):
    if self.crawling:
        raise RuntimeError("Crawling already taking place")
    self.crawling = True
    try:
        self.spider = self._create_spider(*args, **kwargs)
        self.engine = self._create_engine()
        start_requests = iter(self.spider.start_requests())
        yield self.engine.open_spider(self.spider, start_requests)
        yield defer.maybeDeferred(self.engine.start)
    except Exception:
        self.crawling = False
        if self.engine is not None:
            yield self.engine.close()
        raise
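The same crawl() path can also be driven programmatically through the public CrawlerProcess API. A minimal sketch; the spider below is invented for illustration:

import scrapy
from scrapy.crawler import CrawlerProcess

class QuotesSpider(scrapy.Spider):  # hypothetical demo spider
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        for text in response.css('span.text::text').getall():
            yield {'text': text}

process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
process.crawl(QuotesSpider)  # schedules Crawler.crawl(); returns a Deferred
process.start()              # starts the Twisted reactor and blocks until done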
Starting Twisted:
class CrawlerProcess(CrawlerRunner):
    def start(self, stop_after_crawl=True):
        from twisted.internet import reactor
        if stop_after_crawl:
            d = self.join()
            if d.called:
                return
            d.addBoth(self._stop_reactor)

        resolver_class = load_object(self.settings["DNS_RESOLVER"])
        resolver = create_instance(resolver_class, self.settings, self, reactor=reactor)
        resolver.install_on_reactor()
        tp = reactor.getThreadPool()
        tp.adjustPoolsize(maxthreads=self.settings.getint('REACTOR_THREADPOOL_MAXSIZE'))
        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
        reactor.run(installSignalHandlers=False)
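The stop_after_crawl logic above can be approximated by hand with CrawlerRunner, which leaves reactor management to the caller. A sketch, again with an invented spider:

import scrapy
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

class DemoSpider(scrapy.Spider):  # hypothetical one-request spider
    name = 'demo'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        yield {'title': response.css('title::text').get()}

configure_logging()
runner = CrawlerRunner()
runner.crawl(DemoSpider)
d = runner.join()                    # Deferred fired when all crawls finish
d.addBoth(lambda _: reactor.stop())  # manual equivalent of _stop_reactor
reactor.run()                        # same reactor.run() CrawlerProcess.start makes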
The Scraper component controls the flow of data between the spider and the item pipelines:
class Scraper:
    def __init__(self, crawler):
        self.slot = None
        self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
        itemproc_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
        self.itemproc = itemproc_cls.from_crawler(crawler)
        self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
        self.crawler = crawler
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
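By default, ITEM_PROCESSOR points at scrapy.pipelines.ItemPipelineManager, which feeds each scraped item through the enabled pipelines' process_item methods, processing up to CONCURRENT_ITEMS items in parallel per response. A minimal pipeline sketch, with hypothetical names:

from scrapy.exceptions import DropItem

class DropShortTextPipeline:
    # Called by the item processor for every item the spider yields.
    def process_item(self, item, spider):
        if len(item.get('text', '')) < 10:
            raise DropItem('text too short')  # item never reaches later pipelines
        return item                           # passed on to the next pipeline

# Enabled in settings.py (hypothetical module path):
# ITEM_PIPELINES = {'myproject.pipelines.DropShortTextPipeline': 300}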