diff --git a/finscraper/extensions.py b/finscraper/extensions.py
new file mode 100644
index 0000000..831b275
--- /dev/null
+++ b/finscraper/extensions.py
@@ -0,0 +1,61 @@
+"""Module for Scrapy extension."""
+
+
+import logging
+
+from collections import defaultdict
+
+from scrapy import signals
+from scrapy.exceptions import NotConfigured, CloseSpider
+
+from tqdm.auto import tqdm
+
+from finscraper.utils import TqdmLogger
+
+
+class ProgressBar:
+
+    def __init__(self, crawler):
+        self.progress_bar_enabled = (
+            crawler.settings.get('PROGRESS_BAR_ENABLED', False))
+        self.closespider_itemcount = (
+            crawler.settings.get('CLOSESPIDER_ITEMCOUNT', None))
+        self.counter = defaultdict(int)
+
+        if not self.progress_bar_enabled:
+            raise NotConfigured
+
+        crawler.signals.connect(self.on_response,
+                                signal=signals.response_received)
+        crawler.signals.connect(self.on_item_scraped,
+                                signal=signals.item_scraped)
+        crawler.signals.connect(self.on_error, signal=signals.spider_error)
+
+        logger = logging.getLogger()
+        self.progress_bar = tqdm(desc='Progress', unit=' items',
+                                 total=self.closespider_itemcount,
+                                 file=TqdmLogger(logger))
+        self.itemcount = 0
+        self.pagecount = 0
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)
+
+    def on_error(self, failure, response, spider):
+        self.counter['errorcount'] += 1
+        self.progress_bar.set_postfix({
+            'pages': self.counter['pagecount'],
+            'errors': self.counter['errorcount']
+        })
+
+    def on_response(self, response, request, spider):
+        self.counter['pagecount'] += 1
+        self.progress_bar.set_postfix({
+            'pages': self.counter['pagecount'],
+            'errors': self.counter['errorcount']
+        })
+
+    def on_item_scraped(self, item, spider):
+        self.counter['itemcount'] += 1
+        self.progress_bar.update()
diff --git a/finscraper/middlewares.py b/finscraper/middlewares.py
index b2cfb0c..b41854c 100644
--- a/finscraper/middlewares.py
+++ b/finscraper/middlewares.py
@@ -1,5 +1,6 @@
 """Module for Scrapy middleware."""
 
+
 import logging
 
 logger = logging.getLogger(__name__)
@@ -11,20 +12,31 @@
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 
+# Monkey patch, see https://github.com/pypa/pipenv/issues/2609
+import webdriver_manager.utils
+def console(text, bold=False):
+    pass
+webdriver_manager.utils.console = console
+
 from webdriver_manager.chrome import ChromeDriverManager
 
 
-class DownloaderMiddlewareWithJs(object):
-    # https://stackoverflow.com/questions/31174330/passing-selenium-response-url-to-scrapy
+class DownloaderMiddlewareWithJs:
+
+    def __init__(self, settings):
+        self.log_enabled = settings.get('LOG_ENABLED', False)
+        self.log_level = settings.get('LOG_LEVEL', logging.NOTSET)
+        self.progress_bar_enabled = settings.get('PROGRESS_BAR_ENABLED', False)
 
     @classmethod
     def from_crawler(cls, crawler):
-        middleware = cls()
+        middleware = cls(crawler.settings)
         crawler.signals.connect(middleware.spider_opened,
                                 signals.spider_opened)
         crawler.signals.connect(middleware.spider_closed,
                                 signals.spider_closed)
-        return middleware
+
+        return middleware
 
     def spider_opened(self, spider):
         options = Options()
@@ -32,15 +44,19 @@ def spider_opened(self, spider):
         options.add_argument("--disable-extensions")
         options.add_argument("--disable-gpu")
         #options.add_argument("--no-sandbox") # linux only
-        self.driver = webdriver.Chrome(
-            ChromeDriverManager().install(),
-            options=options)
+        if self.progress_bar_enabled:
+            options.add_argument('--disable-logging')
+            for name in ['selenium.webdriver.remote.remote_connection',
+                         'requests', 'urllib3']:
+                logging.getLogger(name).propagate = False
+
+        self.driver = webdriver.Chrome(ChromeDriverManager().install(),
+                                       options=options)
 
     def spider_closed(self, spider):
         self.driver.close()
 
     def process_request(self, request, spider):
-        return_response = False
         run_js = request.meta.get('run_js')
         run_js_wait_sec = request.meta.get('run_js_wait_sec', 0)
         scroll_to_bottom = request.meta.get('scroll_to_bottom')
diff --git a/finscraper/scrapy_spiders/isarticle.py b/finscraper/scrapy_spiders/isarticle.py
index 47a6f23..87e979d 100644
--- a/finscraper/scrapy_spiders/isarticle.py
+++ b/finscraper/scrapy_spiders/isarticle.py
@@ -21,7 +21,7 @@ def __init__(
             category=None,
             follow_link_extractor=None,
             item_link_extractor=None,
-            allow_chromedriver=False,
+            allow_chromedriver=False,  # TODO: Allow setting through a property
             *args, **kwargs):
         """Fetch IltaSanomat news articles.
 
diff --git a/finscraper/scrapy_spiders/mixins.py b/finscraper/scrapy_spiders/mixins.py
index 50d4efc..943827e 100644
--- a/finscraper/scrapy_spiders/mixins.py
+++ b/finscraper/scrapy_spiders/mixins.py
@@ -2,6 +2,7 @@
 
 
 from scrapy import Request
+from scrapy.exceptions import CloseSpider
 
 
 class FollowAndParseItemMixin:
@@ -13,6 +14,7 @@ class FollowAndParseItemMixin:
        item pages from.
     3) `parse_item` function: Parse item from response.
     """
+    itemcount = 0
     item_link_extractor = None
     follow_link_extractor = None
     custom_settings = {
@@ -34,8 +36,13 @@ def start_requests(self):
 
     def parse(self, resp, to_parse=False):
         """Parse items and follow links based on defined link extractors."""
+        if (self.itemcount and
+            self.itemcount == self.settings.get('CLOSESPIDER_ITEMCOUNT', 0)):
+            raise CloseSpider
+
         if to_parse:
             yield self._parse_item(resp)
+            self.itemcount += 1
 
         # Parse items and further on extract links from those pages
         item_links = self.item_link_extractor.extract_links(resp)
diff --git a/finscraper/settings.py b/finscraper/settings.py
index 7b13e57..7c15160 100644
--- a/finscraper/settings.py
+++ b/finscraper/settings.py
@@ -60,6 +60,7 @@
 # See https://docs.scrapy.org/en/latest/topics/extensions.html
 EXTENSIONS = {
     'scrapy.extensions.closespider.CloseSpider': 200,
+    'finscraper.extensions.ProgressBar': 500
 }
 
 # Configure item pipelines
diff --git a/finscraper/spiders.py b/finscraper/spiders.py
index 6144cc8..9c0f19b 100644
--- a/finscraper/spiders.py
+++ b/finscraper/spiders.py
@@ -10,17 +10,25 @@
     _ISArticleItem
 
 
-__jobdir_doc__ = '''
-jobdir (None or str, optional): Working directory of the spider.
+__wrapper_doc__ = '''
+jobdir (str or None, optional): Working directory of the spider.
     Defaults to None, which creates a temp directory to be used. Note that
     this directory will only be deleted through the `clear` method!
+progress_bar (bool, optional): Whether to enable the progress bar or not. This
+    parameter is ignored if `log_level` is not None. Defaults to True.
+log_level (str or None, optional): Level of logging to display.
+    Should be in ['debug', 'info', 'warn', 'error', 'critical'] or None.
+    When None, logging is disabled. Defaults to None. Note that this parameter
+    can be overridden through Scrapy settings (LOG_LEVEL, LOG_ENABLED) when
+    calling the `scrape` method, and the progress bar is not displayed when
+    `log_level` is not None.
 '''
 
 
 def _get_docstring(spider_cls, item_cls):
     return (
         spider_cls.__init__.__doc__.strip()
-        + indent(__jobdir_doc__, ' ' * 12)
+        + indent(__wrapper_doc__, ' ' * 12)
         + indent(item_cls.__doc__, ' ' * 4))
 
 
@@ -28,7 +36,7 @@ class ISArticle(_SpiderWrapper):
     __doc__ = _get_docstring(_ISArticleSpider, _ISArticleItem)
     def __init__(self, category=None, follow_link_extractor=None,
                  item_link_extractor=None, allow_chromedriver=False,
-                 jobdir=None):
+                 jobdir=None, progress_bar=True, log_level=None):
         super(ISArticle, self).__init__(
             spider_cls=_ISArticleSpider,
             spider_params=dict(
@@ -37,13 +45,16 @@ def __init__(self, category=None, follow_link_extractor=None,
                 item_link_extractor=item_link_extractor,
                 allow_chromedriver=allow_chromedriver
             ),
-            jobdir=jobdir)
+            jobdir=jobdir,
+            progress_bar=progress_bar,
+            log_level=log_level)
 
 
 class ILArticle(_SpiderWrapper):
     __doc__ = _get_docstring(_ILArticleSpider, _ILArticleItem)
     def __init__(self, category=None, follow_link_extractor=None,
-                 item_link_extractor=None, jobdir=None):
+                 item_link_extractor=None, jobdir=None, progress_bar=True,
+                 log_level=None):
         super(ILArticle, self).__init__(
             spider_cls=_ILArticleSpider,
             spider_params=dict(
@@ -51,4 +62,6 @@ def __init__(self, category=None, follow_link_extractor=None,
                 follow_link_extractor=follow_link_extractor,
                 item_link_extractor=item_link_extractor
             ),
-            jobdir=jobdir)
+            jobdir=jobdir,
+            progress_bar=progress_bar,
+            log_level=log_level)
diff --git a/finscraper/utils.py b/finscraper/utils.py
index 87c572a..e30d178 100644
--- a/finscraper/utils.py
+++ b/finscraper/utils.py
@@ -1,4 +1,92 @@
-"""Module for utility functions."""
+"""Module for utility functions and classes."""
+
+
+import io
+import logging
+
+import pickle
+import re
+
+from tqdm.auto import tqdm
+
+
+class TqdmLogger(io.StringIO):
+
+    def __init__(self, logger):
+        self.logger = logger
+        self.buf = ''
+        self.ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
+
+    def write(self, buf):
+        self.buf = buf
+
+    def flush(self):
+        if self.buf.strip() != '':
+            self.logger.log(logging.WARN, self.buf)
+
+
+class QueueHandler(logging.Handler):
+
+    def __init__(self, queue):
+        """Sends events to a queue, allowing multiprocessing.
+
+        Args:
+            queue (multiprocessing.Queue): Queue object to use.
+
+        This handler checks for picklability before saving items into the queue.
+        Modified from: https://gist.github.com/vsajip/591589
+        """
+        logging.Handler.__init__(self)
+        self.queue = queue
+
+    def _get_picklable_attrs(self, record):
+        # TODO: More performant way to do the same
+        attrdict = {}
+        for attr in vars(record):
+            value = getattr(record, attr)
+            try:
+                pickle.dumps(value)
+                attrdict[attr] = value
+            except AttributeError:
+                pass
+            except:
+                pass
+
+        if type(record.args) == tuple:
+            attrdict['args'] = record.args
+        else:
+            args = {}
+            for attr, value in record.args.items():
+                try:
+                    pickle.dumps(value)
+                    args[attr] = value
+                except AttributeError:
+                    args[attr] = str(value)
+                    pass
+                except:
+                    pass
+            attrdict['args'] = args
+        new_record = logging.makeLogRecord(attrdict)
+        return new_record
+
+    def enqueue(self, record):
+        self.queue.put_nowait(record)
+
+    def prepare(self, record):
+        record = self._get_picklable_attrs(record)
+        self.format(record)
+        record.msg = record.message
+        record.args = None
+        record.exc_info = None
+        return record
+
+    def emit(self, record):
+        try:
+            self.enqueue(self.prepare(record))
+        except (KeyboardInterrupt, SystemExit):
+            raise
+        except:
+            self.handleError(record)
 
 
 def strip_join(text_list):
diff --git a/finscraper/wrappers.py b/finscraper/wrappers.py
index eb7f563..d5d7051 100644
--- a/finscraper/wrappers.py
+++ b/finscraper/wrappers.py
@@ -2,18 +2,19 @@
 
 
 import json
+import logging
 import pickle
 import shutil
+import sys
 import tempfile
 import uuid
 
+from logging.handlers import QueueListener
 from multiprocessing import Process, Queue
 from pathlib import Path
 
 import pandas as pd
 
-from twisted.internet import reactor
-
 from scrapy import Request
 from scrapy.crawler import CrawlerProcess, CrawlerRunner
 from scrapy.exceptions import CloseSpider
@@ -21,42 +22,98 @@
 from scrapy.spiders import Spider
 from scrapy.utils.log import configure_logging
 
+from twisted.internet import reactor
+
+from finscraper.utils import QueueHandler
+
+
+def _run_as_process(func, spider_cls, spider_params, settings):
+    # Setup logging / progress bar
+    # (queuehandler --> listener --> root logger --> streamhandler)
+    progress_bar_enabled = settings['PROGRESS_BAR_ENABLED']
+    log_enabled = settings['LOG_ENABLED']
+    log_stdout = settings['LOG_STDOUT']
+    q_log = None
+    ql = None
+    if log_enabled or progress_bar_enabled:
+        handler = logging.StreamHandler()
+        if progress_bar_enabled:
+            handler.terminator = ''
+            handler.setFormatter(logging.Formatter('%(message)s'))
+        else:
+            handler.setFormatter(logging.Formatter(settings.get('LOG_FORMAT')))
+
+        q_log = Queue(-1)
+        ql = QueueListener(q_log, handler)
+        ql.start()
 
 
-def _run_func_as_process(func, *args, **kwargs):
+        logger = logging.getLogger()
+        logger.setLevel(settings.get('LOG_LEVEL'))
+        logger.addHandler(handler)
+
+    # Start function as a separate process
     q = Queue()
-    p = Process(target=func, args=(q, *args),
-                kwargs=kwargs)
+    p = Process(target=func,
+                args=(q, q_log, spider_cls, spider_params, settings))
     p.start()
     result = q.get()
     p.join()
-    if result is not None:
+    if ql:
+        ql.stop()
+
+    if isinstance(result, BaseException):
         raise result
 
 
-def _run_spider_func(q, spider_cls, spider_params, settings):
+def _run_spider_func(q, q_log, spider_cls, spider_params, settings):
     try:
-        configure_logging()
+        configure_logging(settings, install_root_handler=False)
+        if q_log is not None:
+            # Setup logging (worker --> queuehandler --> root logger)
+            if not settings['LOG_ENABLED']:  # Disables STDOUT :o
+                logging.getLogger('scrapy').propagate = False
+            qh = QueueHandler(q_log)
+            logger = logging.getLogger()
+            logger.setLevel(settings.get('LOG_LEVEL'))
+            qh.setLevel(settings.get('LOG_LEVEL'))
+            qh.setFormatter(logging.Formatter(settings.get('LOG_FORMAT')))
+            logger.addHandler(qh)
+
+        # Start crawling
         runner = CrawlerRunner(settings)
         deferred = runner.crawl(spider_cls, **spider_params)
         deferred.addBoth(lambda _: reactor.stop())
         reactor.run()
         q.put(None)
     except Exception as e:
-        Queue.put(e)
+        q.put(e)
 
 
 class _SpiderWrapper:
+    _log_levels = {
+        'debug': logging.DEBUG,
+        'info': logging.INFO,
+        'warn': logging.WARN,
+        'error': logging.ERROR,
+        'critical': logging.CRITICAL
+    }
 
-    def __init__(self, spider_cls, spider_params, jobdir=None):
+    def __init__(self, spider_cls, spider_params, jobdir=None,
+                 progress_bar=True, log_level=None):
         self.spider_cls = spider_cls
         self.spider_params = spider_params
         if jobdir is None:
             self._jobdir = Path(tempfile.gettempdir()) / str(uuid.uuid4())
-        else:
+        elif type(jobdir) == str:
             self._jobdir = Path(jobdir)
+        else:
+            raise ValueError(f'Jobdir {jobdir} is not of type str or None')
         self._jobdir.mkdir(parents=True, exist_ok=True)
+
+        self.log_level = log_level
+        self.progress_bar = progress_bar and self.log_level is None
 
         self._items_save_path = self._jobdir / 'items.jl'
         self._spider_save_path = self._jobdir / 'spider.pkl'
@@ -69,6 +126,32 @@ def __init__(self, spider_cls, spider_params, jobdir=None):
     def jobdir(self):
         return str(self._jobdir)
 
+    @property
+    def log_level(self):
+        return self._log_level
+
+    @log_level.setter
+    def log_level(self, log_level):
+        if log_level is None:
+            self._log_level = log_level
+        elif (type(log_level) == str
+              and log_level.strip().lower() in self._log_levels):
+            self._log_level = self._log_levels[log_level.strip().lower()]
+        else:
+            raise ValueError(
+                f'Log level should be in {self._log_levels.keys()}')
+
+    @property
+    def progress_bar(self):
+        return self._progress_bar
+
+    @progress_bar.setter
+    def progress_bar(self, progress_bar):
+        if type(progress_bar) == bool:
+            self._progress_bar = progress_bar
+        else:
+            raise ValueError(f'Progress bar "{progress_bar}" not boolean')
+
     @property
     def items_save_path(self):
         return str(self._items_save_path)
@@ -79,22 +162,33 @@ def spider_save_path(self):
 
     def _run_spider(self, itemcount=10, timeout=120, pagecount=0,
                     errorcount=0, settings=None):
-        settings_ = Settings()
-        settings_.setmodule('finscraper.settings', priority='project')
-        settings_['JOBDIR'] = self.jobdir
-        settings_['FEEDS'] = {self.items_save_path: {'format': 'jsonlines'}}
-        settings_['CLOSESPIDER_ITEMCOUNT'] = itemcount
-        settings_['CLOSESPIDER_TIMEOUT'] = timeout
-        settings_['CLOSESPIDER_PAGECOUNT'] = pagecount
-        settings_['CLOSESPIDER_ERRORCOUNT'] = errorcount
+        _settings = Settings()
+        _settings.setmodule('finscraper.settings', priority='project')
+
+        _settings['JOBDIR'] = self.jobdir
+        _settings['FEEDS'] = {self.items_save_path: {'format': 'jsonlines'}}
+
+        _settings['CLOSESPIDER_ITEMCOUNT'] = itemcount
+        _settings['CLOSESPIDER_TIMEOUT'] = timeout
+        _settings['CLOSESPIDER_PAGECOUNT'] = pagecount
+        _settings['CLOSESPIDER_ERRORCOUNT'] = errorcount
+
+        _settings['LOG_STDOUT'] = True
+        _settings['LOG_LEVEL'] = self.log_level or logging.NOTSET
+        _settings['LOG_ENABLED'] = self.log_level is not None
+        # Logging dominates progress bar
+        _settings['PROGRESS_BAR_ENABLED'] = self.progress_bar
+
+        # Will always be prioritized --> conflicts are possible
         if settings is not None:
-            settings_.update(settings)
+            _settings.update(settings)
+
         try:
-            _run_func_as_process(
+            _run_as_process(
                 func=_run_spider_func,
                 spider_cls=self.spider_cls,
                 spider_params=self.spider_params,
-                settings=settings_
+                settings=_settings
             )
         except KeyboardInterrupt:
             pass
diff --git a/requirements.txt b/requirements.txt
index 4af32eb..b49f4ed 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 pandas==1.0.3
 selenium==3.141.0
 scrapy==2.1.0
+tqdm==4.46.0
 webdriver-manager==2.4.0
diff --git a/setup.py b/setup.py
index 5404920..4552a03 100644
--- a/setup.py
+++ b/setup.py
@@ -1,11 +1,13 @@
 import setuptools
 
+
 with open('README.md', 'r') as f:
     long_description = f.read()
 
+
 setuptools.setup(
     name='finscraper',
-    version='0.0.1dev0',
+    version='0.0.1dev1',
     license='MIT',
     description='Web scraping API for Finnish websites',
     long_description=long_description,
@@ -18,6 +20,7 @@
         'pandas>=1.0.3',
         'selenium>=3.141.0',
         'scrapy>=2.1.0',
+        'tqdm>=4.46.0',
         'webdriver-manager>=2.4.0'
     ],
     packages=setuptools.find_packages(),
diff --git a/tests/test_spiders.py b/tests/test_spiders.py
index d80e610..dc60245 100644
--- a/tests/test_spiders.py
+++ b/tests/test_spiders.py
@@ -2,6 +2,7 @@
 
 
 import json
+import logging
 import tempfile
 
 from pathlib import Path
@@ -9,11 +10,14 @@
 from finscraper.spiders import ISArticle, ILArticle
 
 
+# TODO: Implement utility test function that performs common Spider checks
+
+
 def test_ISArticle_with_category():
     # Test scraping, no chromedriver
     spider = ISArticle('ulkomaat').scrape(20)
     df = spider.get()
-    assert len(df) > 0
+    assert len(df) >= 20
     assert len(df.columns) == 8
 
     # Test scraping with chromedriver
@@ -24,52 +28,52 @@ def test_ISArticle_with_category():
 
     # Test continuing scraping
     df2 = spider.scrape(10).get()
-    assert len(df2) > len(df)
+    assert len(df2) >= len(df) + 10
 
     # Save and load spider
     jobdir = spider.save()
     spider = ISArticle.load(jobdir)
 
     df3 = spider.scrape(10).get()
-    assert len(df3) > len(df2)
+    assert len(df3) >= len(df2) + 10
 
 
 def test_ISArticle_no_params():
     # Test scraping
     spider = ISArticle().scrape(10)
     df = spider.get()
-    assert len(df) >= 10
+    assert len(df) == 10
     assert len(df.columns) == 8
 
-    # Test continuing scraping
+    # Test continuing scraping (poor results, no driver)
     df2 = spider.scrape(10).get()
-    assert len(df2) > len(df)
+    assert len(df2) >= len(df) + 10
 
     # Save and load spider
     jobdir = spider.save()
     spider = ISArticle.load(jobdir)
 
     df3 = spider.scrape(10).get()
-    assert len(df3) > len(df2)
+    assert len(df3) >= len(df2) + 10
 
 
 def test_ILArticle_with_category():
     # Test scraping
-    spider = ILArticle('politiikka').scrape(10)
+    spider = ILArticle('ulkomaat').scrape(5)
     df = spider.get()
-    assert len(df) >= 10
+    assert len(df) >= 5
     assert len(df.columns) == 8
 
     # Test continuing scraping
     df2 = spider.scrape(10).get()
-    assert len(df2) > len(df)
+    assert len(df2) >= len(df) + 10
 
     # Save and load spider
     jobdir = spider.save()
     spider = ILArticle.load(jobdir)
 
     df3 = spider.scrape(10).get()
-    assert len(df3) > len(df2)
+    assert len(df3) >= len(df2) + 10
 
 
 def test_ILArticle_no_params():
@@ -79,16 +83,16 @@ def test_ILArticle_no_params():
     assert len(df) >= 10
     assert len(df.columns) == 8
 
-    # Test continuing scraping 
+    # Test continuing scraping
     df2 = spider.scrape(10).get()
-    assert len(df2) > len(df)
+    assert len(df2) >= len(df) + 10
 
     # Save and load spider
     jobdir = spider.save()
     spider = ILArticle.load(jobdir)
 
     df3 = spider.scrape(10).get()
-    assert len(df3) > len(df2)
+    assert len(df3) >= len(df2) + 10
 
 
 def test_spider_save_load_with_jobdir():
@@ -135,3 +139,51 @@ def test_spider_clear():
     spider.clear()
     assert not Path(spider_save_path).exists()
     assert not Path(items_save_path).exists()
+
+
+def test_spider_logging():
+    # Log level None, no settings
+    spider = ISArticle(log_level=None)
+    spider.scrape(1)
+
+    # Log level given, no settings
+    spider = ISArticle(log_level='info')
+    spider.scrape(1)
+    assert True
+
+    # Log level None, settings given
+    spider = ISArticle(log_level=None)
+    spider.scrape(1, settings={'LOG_LEVEL': logging.DEBUG})
+    assert True
+
+    spider = ISArticle(log_level='info')
+    spider.scrape(1, settings={'LOG_ENABLED': False})
+    assert True
+
+    # Setting the attribute afterwards
+    try:
+        spider.log_level = 'test'
+    except ValueError:
+        assert True
+    except:
+        assert False
+    spider.log_level = 'info'
+    assert spider.log_level == logging.INFO
+
+    # TODO: Test the output
+
+
+def test_spider_progress_bar():
+    # Progress bar true by default
+    spider = ILArticle()
+    spider.scrape(1)
+    assert spider.progress_bar == True
+    assert len(spider.get()) > 0
+
+    # Progress bar disabled when log level is given
+    spider = ILArticle(log_level='info')
+    spider.scrape(1)
+    assert spider.progress_bar == False
+    assert len(spider.get()) > 0
+
+    # TODO: Test the output
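
For reference, a minimal usage sketch of the wrapper API after this patch. The spider classes, the progress_bar and log_level parameters, the settings override accepted by scrape, and the save/load round trip are taken from the diff and its tests above; the chosen category and item counts are only illustrative.

import logging

from finscraper.spiders import ILArticle, ISArticle

# Defaults: progress bar enabled, logging disabled (progress_bar=True, log_level=None).
spider = ILArticle()
df = spider.scrape(10).get()   # scraped items accumulate in the spider's jobdir
print(len(df), list(df.columns))

# Giving a log level turns logging on and the progress bar off for that spider.
spider = ISArticle('ulkomaat', log_level='info')
spider.scrape(5)

# Scrapy settings passed to scrape() take priority over the wrapper attributes.
spider.scrape(5, settings={'LOG_LEVEL': logging.DEBUG})

# Persist the spider state and continue later from the same jobdir.
jobdir = spider.save()
spider = ISArticle.load(jobdir)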