Commit 9ff9817
Add suomi24page spider (#49)

* Add suomi24page spider

jmyrberg authored May 23, 2020
1 parent 93469c7 commit 9ff9817

Showing 10 changed files with 248 additions and 14 deletions.
1 change: 1 addition & 0 deletions .travis.yml
@@ -14,6 +14,7 @@ env:
   - SPIDER=ylearticle
   - SPIDER=demipage
   - SPIDER=vauvapage
+  - SPIDER=suomi24page
   - SPIDER=oikotieapartment
   - SPIDER="not spider"
 script: pytest -v -m "$SPIDER"
1 change: 1 addition & 0 deletions README.md
@@ -12,6 +12,7 @@ The library provides an easy-to-use API for fetching data from various Finnish websites:
 | [Iltalehti](https://www.il.fi) | News article | `ILArticle` |
 | [YLE Uutiset](https://www.yle.fi/uutiset) | News article | `YLEArticle` |
 | [Demi](https://demi.fi) | Discussion thread | `DemiPage` |
+| [Suomi24](https://keskustelu.suomi24.fi) | Discussion thread | `Suomi24Page` |
 | [Vauva](https://www.vauva.fi) | Discussion thread | `VauvaPage` |
 | [Oikotie Asunnot](https://asunnot.oikotie.fi/myytavat-asunnot) | Apartment ad | `OikotieApartment` |
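For reference, a minimal usage sketch of the new spider, following the same `scrape`/`get` pattern the README shows for the other spiders (this assumes `get()` returns a pandas DataFrame as in those examples; the column selection is illustrative):

```python
from finscraper.spiders import Suomi24Page

# Scrape 10 discussion-thread pages and load the results into a DataFrame.
spider = Suomi24Page()
spider.scrape(10)
threads = spider.get()
print(threads[['title', 'author', 'n_comments']].head())
```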
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
-0.0.1dev20
+0.0.1dev21
8 changes: 8 additions & 0 deletions docs/source/finscraper.scrapy_spiders.rst
@@ -44,6 +44,14 @@ finscraper.scrapy\_spiders.oikotieapartment module
    :undoc-members:
    :show-inheritance:
 
+finscraper.scrapy\_spiders.suomi24page module
+---------------------------------------------
+
+.. automodule:: finscraper.scrapy_spiders.suomi24page
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 finscraper.scrapy\_spiders.vauvapage module
 -------------------------------------------
8 changes: 4 additions & 4 deletions finscraper/scrapy_spiders/mixins.py
@@ -87,10 +87,10 @@ def parse(self, resp, to_parse=False):
         for link in follow_links:
             if self._follow_selenium:
                 yield SeleniumCallbackRequest(
-                    link.url, callback=self.parse, meta=self.items_meta,
+                    link.url, callback=self.parse, meta=self.follow_meta,
                     selenium_callback=self.follow_selenium_callback,
-                    priority=10, cb_kwargs={'to_parse': True})
+                    priority=10, cb_kwargs={'to_parse': False})
             else:
                 yield Request(
-                    link.url, callback=self.parse, meta=self.items_meta,
-                    priority=10, cb_kwargs={'to_parse': True})
+                    link.url, callback=self.parse, meta=self.follow_meta,
+                    priority=10, cb_kwargs={'to_parse': False})
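The hunk above changes follow-link requests to carry `follow_meta` and `to_parse=False`, so pages reached through generic navigation links are crawled for further links but no longer parsed as items. A simplified sketch of the intended dispatch, not the mixin's actual code (`items_meta`, `follow_meta`, and the two link extractors are attributes the mixin is assumed to provide):

```python
from scrapy import Request

def parse(self, resp, to_parse=False):
    # Emit an item only when this page was reached via an item link.
    if to_parse:
        yield self._parse_item(resp)
    # Item links: the target page should be parsed (to_parse=True).
    for link in self.item_link_extractor.extract_links(resp):
        yield Request(link.url, callback=self.parse, meta=self.items_meta,
                      priority=10, cb_kwargs={'to_parse': True})
    # Follow links: keep crawling but do not parse as items (the fix above).
    for link in self.follow_link_extractor.extract_links(resp):
        yield Request(link.url, callback=self.parse, meta=self.follow_meta,
                      priority=10, cb_kwargs={'to_parse': False})
```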
204 changes: 204 additions & 0 deletions finscraper/scrapy_spiders/suomi24page.py
@@ -0,0 +1,204 @@
"""Module for Suomi24Page spider."""


import time

from scrapy import Item, Field, Selector
from scrapy.spiders import Spider
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Identity, MapCompose, Compose

from finscraper.scrapy_spiders.mixins import FollowAndParseItemMixin
from finscraper.utils import strip_join, safe_cast_int, strip_elements, \
    drop_empty_elements


class _Suomi24PageSpider(FollowAndParseItemMixin, Spider):
    name = 'suomi24page'
    start_urls = ['https://keskustelu.suomi24.fi']
    follow_link_extractor = LinkExtractor(
        allow_domains=('keskustelu.suomi24.fi',),
        allow=(),
        deny=(r'\?',),
        deny_domains=(),
        canonicalize=True
    )
    item_link_extractor = LinkExtractor(
        allow_domains=('keskustelu.suomi24.fi',),
        allow=(r'/t/[0-9]+/[A-Za-z0-9\-]+',),
        deny=(r'\?',),
        deny_domains=(),
        canonicalize=True
    )
    custom_settings = {}

    def __init__(self, *args, **kwargs):
        """Fetch comments from suomi24.fi."""
        super(_Suomi24PageSpider, self).__init__(*args, **kwargs)

    def _parse_comment_response(self, response):
        l = ItemLoader(item=_Suomi24CommentResponseItem(), selector=response)
        l.add_xpath('author',
                    '//*[contains(@class, "Username")]//text()')
        l.add_xpath('date',
                    '//*[contains(@class, "Timestamp")]//text()')
        l.add_xpath('quotes', '//blockquote//text()', strip_join)
        l.add_xpath('content', '//p[contains(@class, "Text")]//text()')
        return l.load_item()

    def _parse_comment(self, comment):
        l = ItemLoader(item=_Suomi24CommentItem(), selector=comment)
        l.add_xpath('author',
                    '(//*[contains(@class, "Username")])[1]//text()')
        l.add_xpath('date',
                    '(//*[contains(@class, "Timestamp")])[1]//text()')
        l.add_xpath('quotes',
                    '(//article)[1]//blockquote//text()')
        l.add_xpath('content',
                    '(//article)[1]//p[contains(@class, "Text")]//text()')

        responses = []
        responses_xpath = '//li[contains(@class, "CommentResponsesItem")]'
        for response in comment.xpath(responses_xpath):
            responses.append(
                self._parse_comment_response(Selector(text=response.get())))
        l.add_value('responses', responses)
        return l.load_item()

    def _parse_item(self, resp):
        l = ItemLoader(item=_Suomi24PageItem(), response=resp)
        l.add_value('url', resp.url)
        l.add_value('time', int(time.time()))
        l.add_xpath('title', '//*[contains(@*, "thread-title")]//text()')
        l.add_xpath('published',
                    '(//*[contains(@class, "Timestamp")])[1]//text()')
        l.add_xpath('author',
                    '(//*[contains(@class, "Username")])[1]//text()')
        l.add_xpath('content',
                    '(//*[contains(@*, "thread-body-text")])[1]//text()')
        l.add_xpath('n_comments',
                    '(//*[contains(@*, "stats-comments")])[1]//text()')
        l.add_xpath('views',
                    '(//*[contains(@*, "stats-views")])[1]//text()')

        comments = []
        comment_xpath = '//li[contains(@class, "CommentItem")]'
        for comment in resp.xpath(comment_xpath):
            comments.append(self._parse_comment(Selector(text=comment.get())))
        l.add_value('comments', comments)
        return l.load_item()


class _Suomi24CommentResponseItem(Item):
    """
    Returned comment response fields:
    * author (str): Author of the comment response.
    * date (str): Publish time of the comment response.
    * quotes (list of str): List of quotes in the comment response.
    * content (str): Contents of the comment response.
    """
    author = Field(
        input_processor=strip_elements,
        output_processor=TakeFirst()
    )
    date = Field(
        input_processor=strip_join,
        output_processor=Compose(strip_elements, TakeFirst())
    )
    quotes = Field(
        input_processor=drop_empty_elements,
        output_processor=Identity()
    )
    content = Field(
        input_processor=strip_join,
        output_processor=TakeFirst()
    )


class _Suomi24CommentItem(Item):
    """
    Returned comment fields:
    * author (str): Author of the comment.
    * date (str): Publish time of the comment.
    * quotes (list of str): List of quotes in the comment.
    * responses (list of dict): Response comments to this comment.
    * content (str): Contents of the comment.
    """
    author = Field(
        input_processor=strip_elements,
        output_processor=TakeFirst()
    )
    date = Field(
        input_processor=strip_join,
        output_processor=Compose(strip_elements, TakeFirst())
    )
    quotes = Field(
        input_processor=Identity(),
        output_processor=Identity()
    )
    responses = Field(
        input_processor=Identity(),
        output_processor=Identity()
    )
    content = Field(
        input_processor=strip_join,
        output_processor=TakeFirst()
    )


class _Suomi24PageItem(Item):
    __doc__ = """
    Returned page fields:
    * url (str): URL of the scraped web page.
    * time (int): UNIX timestamp of the scraping.
    * title (str): Title of the thread.
    * content (str): Content of the first message.
    * comments (list of dict): Comments of the thread page.
    * published (str): Publish time of the thread.
    * author (str): Author of the thread.
    * n_comments (int): Number of comments in the thread.
    * views (str): Number of views.
    """ + _Suomi24CommentItem.__doc__ + _Suomi24CommentResponseItem.__doc__
    url = Field(
        input_processor=Identity(),
        output_processor=TakeFirst()
    )
    time = Field(
        input_processor=Identity(),
        output_processor=TakeFirst()
    )
    title = Field(
        input_processor=strip_join,
        output_processor=TakeFirst()
    )
    content = Field(
        input_processor=strip_join,
        output_processor=TakeFirst()
    )
    comments = Field(
        input_processor=Identity(),
        output_processor=Identity()
    )
    published = Field(
        input_processor=strip_join,
        output_processor=Compose(strip_elements, TakeFirst())
    )
    author = Field(
        input_processor=strip_join,
        output_processor=TakeFirst()
    )
    n_comments = Field(
        input_processor=MapCompose(safe_cast_int),
        output_processor=TakeFirst()
    )
    views = Field(
        input_processor=strip_join,
        output_processor=TakeFirst()
    )
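To make the `Field` definitions above concrete, here is a small runnable sketch of how an input/output processor pair behaves; `strip_elements` is replaced with a simplified stand-in, since the `finscraper.utils` internals are not part of this diff:

```python
from scrapy.loader.processors import Compose, TakeFirst

# Simplified stand-in for finscraper.utils.strip_elements (assumed to
# strip surrounding whitespace from each extracted text node).
def strip_elements(text_list):
    return [text.strip() for text in text_list]

# The 'date' fields use Compose(strip_elements, TakeFirst()): strip every
# extracted string, then keep the first non-empty one.
process_date = Compose(strip_elements, TakeFirst())
print(process_date(['  23.05.2020 10:15  ', ' ']))  # -> '23.05.2020 10:15'
```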
13 changes: 13 additions & 0 deletions finscraper/spiders.py
@@ -10,6 +10,8 @@
     _ISArticleItem
 from finscraper.scrapy_spiders.demipage import _DemiPageSpider, \
     _DemiPageItem
+from finscraper.scrapy_spiders.suomi24page import _Suomi24PageSpider, \
+    _Suomi24PageItem
 from finscraper.scrapy_spiders.vauvapage import _VauvaPageSpider, \
     _VauvaPageItem
 from finscraper.scrapy_spiders.ylearticle import _YLEArticleSpider, \
@@ -84,6 +86,17 @@ def __init__(self, jobdir=None, progress_bar=True, log_level=None):
             log_level=log_level)
 
 
+class Suomi24Page(_SpiderWrapper):
+    __doc__ = _get_docstring(_Suomi24PageSpider, _Suomi24PageItem)
+    def __init__(self, jobdir=None, progress_bar=True, log_level=None):
+        super(Suomi24Page, self).__init__(
+            spider_cls=_Suomi24PageSpider,
+            spider_params=dict(),
+            jobdir=jobdir,
+            progress_bar=progress_bar,
+            log_level=log_level)
+
+
 class VauvaPage(_SpiderWrapper):
     __doc__ = _get_docstring(_VauvaPageSpider, _VauvaPageItem)
     def __init__(self, jobdir=None, progress_bar=True, log_level=None):
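Because the wrapper forwards `jobdir`, `progress_bar`, and `log_level` straight to `_SpiderWrapper` (per the signature above), a crawl can be persisted by pinning the job directory. A hedged sketch; the path is an arbitrary example, and resume-on-rerun is assumed from the usual semantics of a Scrapy job directory:

```python
from finscraper.spiders import Suomi24Page

# With a fixed jobdir, crawl state is saved on disk between runs.
spider = Suomi24Page(jobdir='jobs/suomi24', progress_bar=True)
spider.scrape(5)
df = spider.get()
```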
2 changes: 1 addition & 1 deletion finscraper/utils.py
@@ -115,7 +115,7 @@ def strip_elements(text_list):
 
 def drop_empty_elements(text_list):
     return [text for text in text_list
-            if text.strip() != '' and text is not None]
+            if text is not None and isinstance(text, str) and text.strip() != '']
 
 def safe_cast_int(text):
     try:
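The reordering in `drop_empty_elements` matters because `and` short-circuits left to right; a sketch contrasting the two conditions:

```python
def drop_empty_elements_old(text_list):
    # Old order: .strip() runs before the None check, so a None element
    # raises AttributeError before 'text is not None' is ever evaluated.
    return [text for text in text_list
            if text.strip() != '' and text is not None]

def drop_empty_elements_new(text_list):
    # New order: None and non-string values are filtered out before .strip().
    return [text for text in text_list
            if text is not None and isinstance(text, str) and text.strip() != '']

print(drop_empty_elements_new(['a', '  ', None, 'b']))  # -> ['a', 'b']
# drop_empty_elements_old(['a', '  ', None, 'b'])  # raises AttributeError
```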
1 change: 1 addition & 0 deletions pytest.ini
@@ -10,4 +10,5 @@ markers =
     ylearticle: Mark test as a ylearticle test.
     demipage: Mark test as a demipage test.
     vauvapage: Mark test as a vauvapage test.
+    suomi24page: Mark test as a suomi24page test.
     oikotieapartment: Mark test as a oikotieapartment test.
22 changes: 14 additions & 8 deletions tests/test_spiders.py
@@ -7,7 +7,7 @@
 pytestmark = [pytest.mark.spider]
 
 from finscraper.spiders import ILArticle, ISArticle, YLEArticle, VauvaPage, \
-    OikotieApartment, DemiPage
+    OikotieApartment, DemiPage, Suomi24Page
 
 from tests.utils import calc_field_emptiness
 
@@ -32,6 +32,18 @@
         'n_fields': 8,
         'mark': pytest.mark.ylearticle
     },
+    {
+        'class': DemiPage,
+        'params': [None],
+        'n_fields': 6,
+        'mark': pytest.mark.demipage
+    },
+    {
+        'class': Suomi24Page,
+        'params': [None],
+        'n_fields': 9,
+        'mark': pytest.mark.suomi24page
+    },
     {
         'class': VauvaPage,
         'params': [None],
@@ -43,13 +55,7 @@
         'params': [None],
         'n_fields': 80,
         'mark': pytest.mark.oikotieapartment
-    },
-    {
-        'class': DemiPage,
-        'params': [None],
-        'n_fields': 6,
-        'mark': pytest.mark.demipage
-    },
+    }
 ]
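The `'n_fields': 9` value corresponds to the nine top-level fields that `_Suomi24PageItem` declares in the new module; once merged, the marked tests can be run with `pytest -v -m suomi24page`, mirroring the Travis script. A quick sanity check, with the field names copied from the item class above:

```python
# Top-level fields declared on _Suomi24PageItem.
suomi24_fields = ['url', 'time', 'title', 'content', 'comments',
                  'published', 'author', 'n_comments', 'views']
assert len(suomi24_fields) == 9  # agrees with 'n_fields': 9 above
```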


