Pipeline, Ksenia Sizikova - 22FPL1 #93

Closed
wants to merge 47 commits into from
199 changes: 198 additions & 1 deletion lab_5_scrapper/scrapper.py
@@ -2,9 +2,65 @@
Crawler implementation.
"""
# pylint: disable=too-many-arguments, too-many-instance-attributes, unused-import, undefined-variable
import json
import pathlib
import re
import shutil
from datetime import datetime
from random import randrange
from time import sleep
from typing import Pattern, Union

import requests
from bs4 import BeautifulSoup

from core_utils import constants
from core_utils.article.article import Article
from core_utils.article.io import to_meta, to_raw
from core_utils.config_dto import ConfigDTO


class IncorrectSeedURLError(Exception):
"""
The seed URL does not match the expected pattern.
"""


class NumberOfArticlesOutOfRangeError(Exception):
"""
The number of articles is outside the range of 1 to 150.
"""


class IncorrectNumberOfArticlesError(Exception):
"""
The number of articles is not a positive integer.
"""


class IncorrectHeadersError(Exception):
"""
The headers are not stored in a dictionary.
"""


class IncorrectEncodingError(Exception):
"""
The encoding is not a string.
"""


class IncorrectTimeoutError(Exception):
"""
The timeout is not an integer or is outside the allowed range.
"""


class IncorrectVerifyError(Exception):
"""
The certificate verification flag or the headless mode flag is not a boolean.
"""


class Config:
"""
@@ -18,6 +74,17 @@ def __init__(self, path_to_config: pathlib.Path) -> None:
Args:
path_to_config (pathlib.Path): Path to configuration.
"""
self.path_to_config = path_to_config
self.config = self._extract_config_content()
self._validate_config_content()

self._encoding = self.config.encoding
self._headers = self.config.headers
self._headless_mode = self.config.headless_mode
self._num_articles = self.config.total_articles
self._seed_urls = self.config.seed_urls
self._should_verify_certificate = self.config.should_verify_certificate
self._timeout = self.config.timeout

def _extract_config_content(self) -> ConfigDTO:
"""
@@ -26,11 +93,42 @@ def _extract_config_content(self) -> ConfigDTO:
Returns:
ConfigDTO: Config values
"""
with open(self.path_to_config, 'r', encoding='utf-8') as f:
config = json.load(f)

return ConfigDTO(**config)

def _validate_config_content(self) -> None:
"""
Ensure configuration parameters are not corrupt.
"""
config = self._extract_config_content()

if not isinstance(config.seed_urls, list):
raise IncorrectSeedURLError

for seed_url in config.seed_urls:
if not re.match(r"https?://(www.)?vtomske\.ru", seed_url):
raise IncorrectSeedURLError

if not isinstance(config.total_articles, int) or config.total_articles <= 0:
raise IncorrectNumberOfArticlesError

if config.total_articles > 150:
raise NumberOfArticlesOutOfRangeError

if not isinstance(config.headers, dict):
raise IncorrectHeadersError

if not isinstance(config.encoding, str):
raise IncorrectEncodingError

if not isinstance(config.timeout, int) or not 0 <= config.timeout < 60:
raise IncorrectTimeoutError

if (not isinstance(config.should_verify_certificate, bool)
or not isinstance(config.headless_mode, bool)):
raise IncorrectVerifyError

def get_seed_urls(self) -> list[str]:
"""
@@ -39,6 +137,7 @@ def get_seed_urls(self) -> list[str]:
Returns:
list[str]: Seed urls
"""
return self._seed_urls

def get_num_articles(self) -> int:
"""
@@ -47,6 +146,7 @@ def get_num_articles(self) -> int:
Returns:
int: Total number of articles to scrape
"""
return self._num_articles

def get_headers(self) -> dict[str, str]:
"""
@@ -55,6 +155,7 @@ def get_headers(self) -> dict[str, str]:
Returns:
dict[str, str]: Headers
"""
return self._headers

def get_encoding(self) -> str:
"""
@@ -63,6 +164,7 @@ def get_encoding(self) -> str:
Returns:
str: Encoding
"""
return self._encoding

def get_timeout(self) -> int:
"""
@@ -71,6 +173,7 @@ def get_timeout(self) -> int:
Returns:
int: Number of seconds to wait for response
"""
return self._timeout

def get_verify_certificate(self) -> bool:
"""
@@ -79,6 +182,7 @@ def get_verify_certificate(self) -> bool:
Returns:
bool: Whether to verify certificate or not
"""
return self._should_verify_certificate

def get_headless_mode(self) -> bool:
"""
@@ -87,6 +191,7 @@ def get_headless_mode(self) -> bool:
Returns:
bool: Whether to use headless mode or not
"""
return self._headless_mode


def make_request(url: str, config: Config) -> requests.models.Response:
@@ -100,6 +205,14 @@ def make_request(url: str, config: Config) -> requests.models.Response:
Returns:
requests.models.Response: A response from a request
"""
sleep(randrange(3))

return requests.get(
url=url,
timeout=config.get_timeout(),
headers=config.get_headers(),
verify=config.get_verify_certificate()
)


class Crawler:
@@ -116,6 +229,9 @@ def __init__(self, config: Config) -> None:
Args:
config (Config): Configuration
"""
self.config = config
self.urls = []
self.base_url = "https://vtomske.ru"

def _extract_url(self, article_bs: BeautifulSoup) -> str:
"""
@@ -128,10 +244,38 @@ def _extract_url(self, article_bs: BeautifulSoup) -> str:
str: Url from HTML
"""

# Scan the 'mainbar' block for anchors, build absolute URLs from their hrefs
# and return the first one that is neither a seed page nor already collected.
mainbar = article_bs.find(class_='mainbar')
if mainbar:
links = mainbar.find_all('a')
for link in links:
href = link.get('href')
if href:
url = self.base_url + href
if url not in self.get_search_urls() and url not in self.urls:
return url
return ''

def find_articles(self) -> None:
"""
Find articles.
"""
seed_urls = self.get_search_urls()

for seed_url in seed_urls:
response = make_request(seed_url, self.config)
if not response.ok:
continue

article_soup = BeautifulSoup(response.text, features='lxml')
new_url = self._extract_url(article_soup)
while new_url:
if len(self.urls) == self.config.get_num_articles():
break
self.urls.append(new_url)
new_url = self._extract_url(article_soup)

if len(self.urls) == self.config.get_num_articles():
break

def get_search_urls(self) -> list:
"""
Expand All @@ -140,6 +284,7 @@ def get_search_urls(self) -> list:
Returns:
list: seed_urls param
"""
return self.config.get_seed_urls()


# 10
@@ -160,6 +305,10 @@ def __init__(self, full_url: str, article_id: int, config: Config) -> None:
article_id (int): Article id
config (Config): Configuration
"""
self.full_url = full_url
self.article_id = article_id
self.config = config
self.article = Article(full_url, article_id)

def _fill_article_with_text(self, article_soup: BeautifulSoup) -> None:
"""
@@ -168,6 +317,10 @@ def _fill_article_with_text(self, article_soup: BeautifulSoup) -> None:
Args:
article_soup (bs4.BeautifulSoup): BeautifulSoup instance
"""
body = article_soup.find('div', class_='material-content')
if body:
content = body.find_all('p')
self.article.text = '\n'.join([p_tag.text for p_tag in content])

def _fill_article_with_meta_information(self, article_soup: BeautifulSoup) -> None:
"""
@@ -176,8 +329,29 @@ def _fill_article_with_meta_information(self, article_soup: BeautifulSoup) -> None:
Args:
article_soup (bs4.BeautifulSoup): BeautifulSoup instance
"""
cont = article_soup.find('div', class_='material-content')
if cont:
title = cont.find('h1')
if title:
self.article.title = title.text

author = article_soup.find('a', class_='material-author')
if not author:
self.article.author.append('NOT FOUND')
else:
self.article.author.append(author.text.strip())

date = article_soup.find('time', class_='material-date')
if date:
date_str = date.attrs.get('datetime')
if isinstance(date_str, str):
self.article.date = self.unify_date_format(date_str)

tags = article_soup.find_all(class_='material-tags')
for tag in tags:
self.article.topics.append(tag.text)

def unify_date_format(self, date_str: str) -> datetime.datetime:
def unify_date_format(self, date_str: str) -> datetime:
"""
Unify date format.

@@ -187,6 +361,7 @@ def unify_date_format(self, date_str: str) -> datetime.datetime:
Returns:
datetime.datetime: Datetime object
"""
return datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S%z')

def parse(self) -> Union[Article, bool, list]:
"""
@@ -195,6 +370,12 @@ def parse(self) -> Union[Article, bool, list]:
Returns:
Union[Article, bool, list]: Article instance
"""
response = make_request(self.full_url, self.config)
if response.ok:
article_bs = BeautifulSoup(response.text, features='html.parser')
self._fill_article_with_text(article_bs)
self._fill_article_with_meta_information(article_bs)
return self.article


def prepare_environment(base_path: Union[pathlib.Path, str]) -> None:
@@ -204,12 +385,28 @@ def prepare_environment(base_path: Union[pathlib.Path, str]) -> None:
Args:
base_path (Union[pathlib.Path, str]): Path where articles are stored
"""
base_path = pathlib.Path(base_path)
if base_path.exists():
shutil.rmtree(base_path)
base_path.mkdir(parents=True)


def main() -> None:
"""
Entrypoint for scrapper module.
"""
configuration = Config(constants.CRAWLER_CONFIG_PATH)
crawler = Crawler(configuration)
prepare_environment(constants.ASSETS_PATH)

crawler.find_articles()
i = 1
for url in crawler.urls:
parser = HTMLParser(full_url=url, article_id=i, config=configuration)
article = parser.parse()
if isinstance(article, Article):
to_raw(article)
to_meta(article)
i += 1


if __name__ == "__main__":
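For orientation, a minimal sketch of driving the parser by hand, separate from main(); the import path and the article URL are illustrative assumptions rather than values from this PR:

from core_utils import constants
from lab_5_scrapper.scrapper import Config, HTMLParser

# Load and validate the crawler configuration used by main().
configuration = Config(constants.CRAWLER_CONFIG_PATH)

# Hypothetical article address; any real vtomske.ru article URL would do here.
parser = HTMLParser(full_url="https://vtomske.ru/news/12345-example",
                    article_id=1,
                    config=configuration)

article = parser.parse()
if article:
    print(article.title)  # headline taken from the 'material-content' block
    print(article.date)   # timezone-aware datetime built by unify_date_format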
10 changes: 5 additions & 5 deletions lab_5_scrapper/scrapper_config.json
@@ -1,9 +1,9 @@
{
"seed_urls": [],
"headers": {},
"total_articles_to_find_and_parse": 0,
"encoding": "",
"timeout": 0,
"seed_urls": ["https://vtomske.ru/"],
"headers": {"Accept":"*/*", "User-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 YaBrowser/24.1.0.0 Safari/537.36"},
"total_articles_to_find_and_parse": 3,
"encoding": "utf-8",
"timeout": 5,
"should_verify_certificate": true,
"headless_mode": true
}
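ConfigDTO itself is provided by core_utils and is not part of this diff; based on how scrapper.py unpacks the JSON above into it and then reads config.total_articles, its shape is assumed to be roughly the following sketch (not the library's actual definition):

from dataclasses import dataclass

@dataclass
class ConfigDTOSketch:
    """Illustrative stand-in for core_utils.config_dto.ConfigDTO."""
    seed_urls: list[str]
    total_articles_to_find_and_parse: int
    headers: dict[str, str]
    encoding: str
    timeout: int
    should_verify_certificate: bool
    headless_mode: bool

    @property
    def total_articles(self) -> int:
        # scrapper.py reads config.total_articles while the JSON key is
        # total_articles_to_find_and_parse; the real DTO is assumed to bridge the two.
        return self.total_articles_to_find_and_parse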
2 changes: 1 addition & 1 deletion lab_5_scrapper/settings.json
@@ -1,3 +1,3 @@
{
"target_score": 0
"target_score": 8
}