Version 0.0.1dev1 (#15)
* Bump version

* Added spider logging

* First draft of progress bar, docs still missing

* Add progress bar doc

* Fix progress bar log mess up

* Reduce IL scraped items
jmyrberg authored May 8, 2020
1 parent af39687 commit ce2b07b
Showing 11 changed files with 390 additions and 54 deletions.
61 changes: 61 additions & 0 deletions finscraper/extensions.py
@@ -0,0 +1,61 @@
"""Module for Scrapy extension."""


import logging

from collections import defaultdict

from scrapy import signals
from scrapy.exceptions import NotConfigured, CloseSpider

from tqdm.auto import tqdm

from finscraper.utils import TqdmLogger


class ProgressBar:
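    """Scrapy extension that shows a tqdm progress bar for scraped items,
    driven by the response_received, item_scraped and spider_error signals."""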

def __init__(self, crawler):
self.progress_bar_enabled = (
crawler.settings.get('PROGRESS_BAR_ENABLED', False))
self.closespider_itemcount = (
crawler.settings.get('CLOSESPIDER_ITEMCOUNT', None))
self.counter = defaultdict(int)

if not self.progress_bar_enabled:
raise NotConfigured

crawler.signals.connect(self.on_response,
signal=signals.response_received)
crawler.signals.connect(self.on_item_scraped,
signal=signals.item_scraped)
crawler.signals.connect(self.on_error, signal=signals.spider_error)

logger = logging.getLogger()
self.progress_bar = tqdm(desc='Progress', unit=' items',
total=self.closespider_itemcount,
file=TqdmLogger(logger))
self.itemcount = 0
self.pagecount = 0

@classmethod
def from_crawler(cls, crawler):
return cls(crawler)

def on_error(self, failure, response, spider):
self.counter['errorcount'] += 1
self.progress_bar.set_postfix({
'pages': self.counter['pagecount'],
'errors': self.counter['errorcount']
})

def on_response(self, response, request, spider):
self.counter['pagecount'] += 1
self.progress_bar.set_postfix({
'pages': self.counter['pagecount'],
'errors': self.counter['errorcount']
})

def on_item_scraped(self, item, spider):
self.counter['itemcount'] += 1
self.progress_bar.update()
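
The extension stays dormant unless `PROGRESS_BAR_ENABLED` is set, and it uses `CLOSESPIDER_ITEMCOUNT` as the bar's total. A minimal sketch of enabling it outside finscraper's own wrappers (the demo spider and URL are placeholders, not part of this commit):

# Sketch only: DemoSpider and its URL are hypothetical; the setting names match
# those read in ProgressBar.__init__ above.
import scrapy
from scrapy.crawler import CrawlerProcess

class DemoSpider(scrapy.Spider):
    name = 'demo'
    start_urls = ['https://example.com']

    def parse(self, response):
        yield {'url': response.url}  # each yielded item advances the bar

settings = {
    'EXTENSIONS': {'finscraper.extensions.ProgressBar': 500},
    'PROGRESS_BAR_ENABLED': True,   # without this the extension raises NotConfigured
    'CLOSESPIDER_ITEMCOUNT': 10,    # becomes the tqdm total
}

process = CrawlerProcess(settings)
process.crawl(DemoSpider)
process.start()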
32 changes: 24 additions & 8 deletions finscraper/middlewares.py
@@ -1,5 +1,6 @@
"""Module for Scrapy middleware."""


import logging
logger = logging.getLogger(__name__)

@@ -11,36 +12,51 @@
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Monkey patch, see https://github.com/pypa/pipenv/issues/2609
import webdriver_manager.utils
def console(text, bold=False):
pass
webdriver_manager.utils.console = console

from webdriver_manager.chrome import ChromeDriverManager


class DownloaderMiddlewareWithJs(object):
# https://stackoverflow.com/questions/31174330/passing-selenium-response-url-to-scrapy
class DownloaderMiddlewareWithJs:
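    """Downloader middleware that can render a request with a headless Selenium
    Chrome driver when the request sets the corresponding meta flags."""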

def __init__(self, settings):
self.log_enabled = settings.get('LOG_ENABLED', False)
self.log_level = settings.get('LOG_LEVEL', logging.NOTSET)
self.progress_bar_enabled = settings.get('PROGRESS_BAR_ENABLED', False)

@classmethod
def from_crawler(cls, crawler):
middleware = cls()
middleware = cls(crawler.settings)
crawler.signals.connect(middleware.spider_opened,
signals.spider_opened)
crawler.signals.connect(middleware.spider_closed,
signals.spider_closed)
return middleware

return middleware

def spider_opened(self, spider):
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-extensions")
options.add_argument("--disable-gpu")
#options.add_argument("--no-sandbox") # linux only
self.driver = webdriver.Chrome(
ChromeDriverManager().install(),
options=options)
if self.progress_bar_enabled:
options.add_argument('--disable-logging')
for name in ['selenium.webdriver.remote.remote_connection',
'requests', 'urllib3']:
logging.getLogger(name).propagate = False

self.driver = webdriver.Chrome(ChromeDriverManager().install(),
options=options)

def spider_closed(self, spider):
self.driver.close()

def process_request(self, request, spider):
return_response = False
run_js = request.meta.get('run_js')
run_js_wait_sec = request.meta.get('run_js_wait_sec', 0)
scroll_to_bottom = request.meta.get('scroll_to_bottom')
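
The rest of `process_request` is truncated above, but the meta keys it reads are visible. A hedged sketch of a spider opting into JavaScript rendering per request; the URL, wait time, and exact semantics of the flags are assumptions:

# Sketch only: meta key names match those read in process_request above; their
# precise behaviour is assumed, since the remainder of the method is not shown here.
import scrapy

class JsDemoSpider(scrapy.Spider):
    name = 'js_demo'

    def start_requests(self):
        yield scrapy.Request(
            'https://example.com',        # illustrative URL
            meta={
                'run_js': True,           # render the page with the Selenium driver
                'run_js_wait_sec': 2,     # assumed: seconds to wait after loading
                'scroll_to_bottom': True  # assumed: scroll before capturing the page
            },
            callback=self.parse)

    def parse(self, response):
        yield {'url': response.url}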
2 changes: 1 addition & 1 deletion finscraper/scrapy_spiders/isarticle.py
@@ -21,7 +21,7 @@ def __init__(
category=None,
follow_link_extractor=None,
item_link_extractor=None,
allow_chromedriver=False,
allow_chromedriver=False,  # TODO: Allow setting through a property
*args,
**kwargs):
"""Fetch IltaSanomat news articles.
7 changes: 7 additions & 0 deletions finscraper/scrapy_spiders/mixins.py
@@ -2,6 +2,7 @@


from scrapy import Request
from scrapy.exceptions import CloseSpider


class FollowAndParseItemMixin:
@@ -13,6 +14,7 @@ class FollowAndParseItemMixin:
item pages from.
3) `parse_item` function: Parse item from response.
"""
itemcount = 0
item_link_extractor = None
follow_link_extractor = None
custom_settings = {
@@ -34,8 +36,13 @@ def start_requests(self):

def parse(self, resp, to_parse=False):
"""Parse items and follow links based on defined link extractors."""
if (self.itemcount and
self.itemcount == self.settings.get('CLOSESPIDER_ITEMCOUNT', 0)):
raise CloseSpider

if to_parse:
yield self._parse_item(resp)
self.itemcount += 1

# Parse items and further on extract links from those pages
item_links = self.item_link_extractor.extract_links(resp)
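
A hedged sketch of a spider built on the mixin. The attribute names follow the (partially truncated) docstring above, while the start URL, extractor patterns, and item fields are placeholders:

# Sketch under assumptions: FollowAndParseItemMixin is combined with a regular
# Scrapy spider; the concrete patterns, fields and start_urls are illustrative only.
import scrapy
from scrapy.linkextractors import LinkExtractor
from finscraper.scrapy_spiders.mixins import FollowAndParseItemMixin

class DemoItemSpider(FollowAndParseItemMixin, scrapy.Spider):
    name = 'demo_items'
    start_urls = ['https://example.com']                       # assumed entry point
    follow_link_extractor = LinkExtractor(allow=r'/section/')  # pages to keep crawling
    item_link_extractor = LinkExtractor(allow=r'/article/')    # pages parsed as items

    def parse_item(self, response):
        # Parse a single item from an item page, as required by the mixin.
        return {'url': response.url,
                'title': response.css('title::text').get()}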
1 change: 1 addition & 0 deletions finscraper/settings.py
@@ -60,6 +60,7 @@
# See https://docs.scrapy.org/en/latest/topics/extensions.html
EXTENSIONS = {
'scrapy.extensions.closespider.CloseSpider': 200,
'finscraper.extensions.ProgressBar': 500
}

# Configure item pipelines
27 changes: 20 additions & 7 deletions finscraper/spiders.py
@@ -10,25 +10,33 @@
_ISArticleItem


__jobdir_doc__ = '''
jobdir (None or str, optional): Working directory of the spider.
__wrapper_doc__ = '''
jobdir (str or None, optional): Working directory of the spider.
Defaults to None, which creates a temp directory to be used.
Note that this directory will only be deleted through the
`clear` method!
progress_bar (bool, optional): Whether to enable the progress bar or not.
    This parameter is ignored if `log_level` is not None. Defaults to True.
log_level (str or None, optional): Level of logging to display.
    Should be in ['debug', 'info', 'warn', 'error', 'critical'] or None.
    When None, logging is disabled. Defaults to None. Note that this
    parameter can be overridden through Scrapy settings (LOG_LEVEL,
    LOG_ENABLED) when calling the `scrape` method, and the progress bar
    is not displayed when `log_level` is not None.
'''


def _get_docstring(spider_cls, item_cls):
return ( spider_cls.__init__.__doc__.strip()
+ indent(__jobdir_doc__, ' ' * 12)
+ indent(__wrapper_doc__, ' ' * 12)
+ indent(item_cls.__doc__, ' ' * 4))


class ISArticle(_SpiderWrapper):
__doc__ = _get_docstring(_ISArticleSpider, _ISArticleItem)
def __init__(self, category=None, follow_link_extractor=None,
item_link_extractor=None, allow_chromedriver=False,
jobdir=None):
jobdir=None, progress_bar=True, log_level=None):
super(ISArticle, self).__init__(
spider_cls=_ISArticleSpider,
spider_params=dict(
@@ -37,18 +45,23 @@ def __init__(self, category=None, follow_link_extractor=None,
item_link_extractor=item_link_extractor,
allow_chromedriver=allow_chromedriver
),
jobdir=jobdir)
jobdir=jobdir,
progress_bar=progress_bar,
log_level=log_level)


class ILArticle(_SpiderWrapper):
__doc__ = _get_docstring(_ILArticleSpider, _ILArticleItem)
def __init__(self, category=None, follow_link_extractor=None,
item_link_extractor=None, jobdir=None):
item_link_extractor=None, jobdir=None, progress_bar=True,
log_level=None):
super(ILArticle, self).__init__(
spider_cls=_ILArticleSpider,
spider_params=dict(
category=category,
follow_link_extractor=follow_link_extractor,
item_link_extractor=item_link_extractor
),
jobdir=jobdir)
jobdir=jobdir,
progress_bar=progress_bar,
log_level=log_level)
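
A hedged usage sketch of the wrappers defined above. The `scrape` call is the one referenced in the docstring, but its argument and the accessor used to read the results are assumptions about the `_SpiderWrapper` interface, which is not shown in this diff:

# Sketch only: the scrape() argument and get() accessor are assumed, not shown above.
from finscraper.spiders import ILArticle

spider = ILArticle(progress_bar=True,   # show the tqdm bar while scraping
                   log_level=None)      # logging off, so the bar is displayed
spider.scrape(10)                       # assumed: fetch roughly ten articles
articles = spider.get()                 # assumed: return the scraped items
print(len(articles))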
90 changes: 89 additions & 1 deletion finscraper/utils.py
@@ -1,4 +1,92 @@
"""Module for utility functions."""
"""Module for utility functions and classes."""


import io
import logging

import pickle
import re

from tqdm.auto import tqdm


class TqdmLogger(io.StringIO):
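    """File-like object that redirects tqdm's output to a logger."""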

def __init__(self, logger):
self.logger = logger
self.buf = ''
self.ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')

def write(self, buf):
self.buf = buf

def flush(self):
if self.buf.strip() != '':
self.logger.log(logging.WARN, self.buf)
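
`TqdmLogger` is the `file` object that the ProgressBar extension hands to tqdm; a minimal sketch of the same pattern in isolation:

# Minimal sketch mirroring how extensions.py builds its progress bar: tqdm writes
# into TqdmLogger, which forwards each refresh to the logging module at WARN level.
import logging
from tqdm.auto import tqdm
from finscraper.utils import TqdmLogger

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger('demo')

with tqdm(desc='Progress', unit=' items', total=3, file=TqdmLogger(logger)) as bar:
    for _ in range(3):
        bar.update()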


class QueueHandler(logging.Handler):

def __init__(self, queue):
"""Sends events to a queue, allowing multiprocessing.
Args:
queue (multiprocessing.Queue): Queue object to use.
This handler checks for picklability before saving items into queue.
Modified from: https://gist.github.com/vsajip/591589
"""
logging.Handler.__init__(self)
self.queue = queue

    def _get_picklable_attrs(self, record):
        # TODO: More performant way to do the same
        attrdict = {}
        for attr in vars(record):
            value = getattr(record, attr)
            try:
                pickle.dumps(value)
                attrdict[attr] = value
            except Exception:
                # Skip attributes that cannot be pickled
                pass

        if isinstance(record.args, tuple):
            attrdict['args'] = record.args
        else:
            args = {}
            for attr, value in record.args.items():
                try:
                    pickle.dumps(value)
                    args[attr] = value
                except AttributeError:
                    # Fall back to the string representation
                    args[attr] = str(value)
                except Exception:
                    # Skip arguments that cannot be pickled at all
                    pass
            attrdict['args'] = args
        new_record = logging.makeLogRecord(attrdict)
        return new_record

def enqueue(self, record):
self.queue.put_nowait(record)

def prepare(self, record):
record = self._get_picklable_attrs(record)
self.format(record)
record.msg = record.message
record.args = None
record.exc_info = None
return record

def emit(self, record):
try:
self.enqueue(self.prepare(record))
except (KeyboardInterrupt, SystemExit):
raise
except:
self.handleError(record)
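
The handler's job is to make log records safe to ship across processes. A hedged sketch of pushing records through a `multiprocessing.Queue` and reading them back; the consumer side here is a plain loop, not something this commit provides:

# Sketch only: the consumer call is illustrative; finscraper's own listener is not shown here.
import logging
import multiprocessing

from finscraper.utils import QueueHandler

queue = multiprocessing.Queue()
logger = logging.getLogger('worker')
logger.setLevel(logging.INFO)
logger.addHandler(QueueHandler(queue))

logger.info('scraped %d items', 5)   # record is pickle-checked, prepared and enqueued

record = queue.get()                 # a listener process would normally do this
print(record.getMessage())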


def strip_join(text_list):