diff --git a/lab_5_scrapper/scrapper.py b/lab_5_scrapper/scrapper.py
index c060232..3b4463f 100644
--- a/lab_5_scrapper/scrapper.py
+++ b/lab_5_scrapper/scrapper.py
@@ -1,10 +1,64 @@
 """
 Crawler implementation.
 """
+import datetime
+import json
 # pylint: disable=too-many-arguments, too-many-instance-attributes, unused-import, undefined-variable
 import pathlib
+import re
+import shutil
 from typing import Pattern, Union
 
+import requests
+from bs4 import BeautifulSoup
+
+from core_utils import constants
+from core_utils.article.article import Article
+from core_utils.article.io import to_meta, to_raw
+from core_utils.config_dto import ConfigDTO
+
+
+class IncorrectSeedURLError(Exception):
+    """
+    Seed URL does not match the standard pattern
+    """
+
+
+class NumberOfArticlesOutOfRangeError(Exception):
+    """
+    Total number of articles is out of range from 1 to 150
+    """
+
+
+class IncorrectNumberOfArticlesError(Exception):
+    """
+    Total number of articles to parse is not a positive integer
+    """
+
+
+class IncorrectHeadersError(Exception):
+    """
+    Headers are not in the form of a dictionary
+    """
+
+
+class IncorrectEncodingError(Exception):
+    """
+    Encoding is not a string
+    """
+
+
+class IncorrectTimeoutError(Exception):
+    """
+    Timeout value is not a positive integer less than 60
+    """
+
+
+class IncorrectVerifyError(Exception):
+    """
+    Verify certificate value is not True or False
+    """
+
 
 class Config:
     """
@@ -18,6 +72,17 @@ def __init__(self, path_to_config: pathlib.Path) -> None:
         Args:
             path_to_config (pathlib.Path): Path to configuration.
         """
+        self.path_to_config = path_to_config
+        self._validate_config_content()
+        self.config = self._extract_config_content()
+
+        self._seed_urls = self.config.seed_urls
+        self._num_articles = self.config.total_articles
+        self._headers = self.config.headers
+        self._encoding = self.config.encoding
+        self._timeout = self.config.timeout
+        self._should_verify_certificate = self.config.should_verify_certificate
+        self._headless_mode = self.config.headless_mode
 
     def _extract_config_content(self) -> ConfigDTO:
         """
@@ -26,11 +91,49 @@ def _extract_config_content(self) -> ConfigDTO:
         Returns:
             ConfigDTO: Config values
         """
+        with open(self.path_to_config, "r", encoding="utf-8") as f:
+            conf = json.load(f)
+        return ConfigDTO(
+            seed_urls=conf["seed_urls"],
+            total_articles_to_find_and_parse=conf["total_articles_to_find_and_parse"],
+            headers=conf["headers"],
+            encoding=conf["encoding"],
+            timeout=conf["timeout"],
+            should_verify_certificate=conf["should_verify_certificate"],
+            headless_mode=conf["headless_mode"]
+        )
 
     def _validate_config_content(self) -> None:
         """
         Ensure configuration parameters are not corrupt.
""" + with open(self.path_to_config, 'r', encoding='utf-8') as f: + conf = json.load(f) + + if not (isinstance(conf['seed_urls'], list) + and all(re.match(r"https?://(www.)?", seed_url) for seed_url in conf['seed_urls'])): + raise IncorrectSeedURLError + + num = conf['total_articles_to_find_and_parse'] + + if not isinstance(num, int) or (num <= 0): + raise IncorrectNumberOfArticlesError + + if num < 1 or num > 150: + raise NumberOfArticlesOutOfRangeError + + if not isinstance(conf['headers'], dict): + raise IncorrectHeadersError + + if not isinstance(conf['encoding'], str): + raise IncorrectEncodingError + + if not (isinstance(conf['timeout'], int) and (0 < conf['timeout'] < 60)): + raise IncorrectTimeoutError + + if not isinstance(conf['should_verify_certificate'], bool) \ + or not isinstance(conf['headless_mode'], bool): + raise IncorrectVerifyError def get_seed_urls(self) -> list[str]: """ @@ -39,6 +142,7 @@ def get_seed_urls(self) -> list[str]: Returns: list[str]: Seed urls """ + return self._seed_urls def get_num_articles(self) -> int: """ @@ -47,6 +151,7 @@ def get_num_articles(self) -> int: Returns: int: Total number of articles to scrape """ + return self._num_articles def get_headers(self) -> dict[str, str]: """ @@ -55,6 +160,7 @@ def get_headers(self) -> dict[str, str]: Returns: dict[str, str]: Headers """ + return self._headers def get_encoding(self) -> str: """ @@ -63,6 +169,7 @@ def get_encoding(self) -> str: Returns: str: Encoding """ + return self._encoding def get_timeout(self) -> int: """ @@ -71,6 +178,7 @@ def get_timeout(self) -> int: Returns: int: Number of seconds to wait for response """ + return self._timeout def get_verify_certificate(self) -> bool: """ @@ -79,6 +187,7 @@ def get_verify_certificate(self) -> bool: Returns: bool: Whether to verify certificate or not """ + return self._should_verify_certificate def get_headless_mode(self) -> bool: """ @@ -87,6 +196,7 @@ def get_headless_mode(self) -> bool: Returns: bool: Whether to use headless mode or not """ + return self._headless_mode def make_request(url: str, config: Config) -> requests.models.Response: @@ -100,6 +210,8 @@ def make_request(url: str, config: Config) -> requests.models.Response: Returns: requests.models.Response: A response from a request """ + return requests.get(url=url, timeout=config.get_timeout(), + headers=config.get_headers(), verify=config.get_verify_certificate()) class Crawler: @@ -116,6 +228,9 @@ def __init__(self, config: Config) -> None: Args: config (Config): Configuration """ + self.config = config + self.urls = [] + self.url_pattern = self.config.get_seed_urls()[0].split('/format')[0] def _extract_url(self, article_bs: BeautifulSoup) -> str: """ @@ -127,11 +242,31 @@ def _extract_url(self, article_bs: BeautifulSoup) -> str: Returns: str: Url from HTML """ + url = "" + links = article_bs.find_all('a', class_="qZbm2") + for link in links: + url = link.get('href') + url = self.url_pattern + url[len("/text")::] + if url not in self.urls: + break + + return url def find_articles(self) -> None: """ Find articles. 
""" + seed_urls = self.get_search_urls() + + while len(self.urls) < self.config.get_num_articles(): + for seed_url in seed_urls: + response = make_request(seed_url, self.config) + if not response.ok: + continue + + article_bs = BeautifulSoup(response.text, "html.parser") + extracted = self._extract_url(article_bs) + self.urls.append(extracted) def get_search_urls(self) -> list: """ @@ -140,6 +275,7 @@ def get_search_urls(self) -> list: Returns: list: seed_urls param """ + return self.config.get_seed_urls() # 10 @@ -160,6 +296,10 @@ def __init__(self, full_url: str, article_id: int, config: Config) -> None: article_id (int): Article id config (Config): Configuration """ + self.full_url = full_url + self.article_id = article_id + self.config = config + self.article = Article(self.full_url, self.article_id) def _fill_article_with_text(self, article_soup: BeautifulSoup) -> None: """ @@ -168,6 +308,13 @@ def _fill_article_with_text(self, article_soup: BeautifulSoup) -> None: Args: article_soup (bs4.BeautifulSoup): BeautifulSoup instance """ + raw_text = '' + text_blocks = article_soup.find_all('div', class_='uiArticleBlockText_i9h2o') + for text_block in text_blocks: + if text_block.string: + raw_text += f'\n{text_block.string}' + + self.article.text = raw_text def _fill_article_with_meta_information(self, article_soup: BeautifulSoup) -> None: """ @@ -176,6 +323,14 @@ def _fill_article_with_meta_information(self, article_soup: BeautifulSoup) -> No Args: article_soup (bs4.BeautifulSoup): BeautifulSoup instance """ + headline = article_soup.find("h1", class_="title_ip27z") + self.article.title = headline.text + author = article_soup.find("div", class_="name_GQmWc") + if not author: + self.article.author = ["NOT FOUND"] + + else: + self.article.author = [author.text] def unify_date_format(self, date_str: str) -> datetime.datetime: """ @@ -195,6 +350,13 @@ def parse(self) -> Union[Article, bool, list]: Returns: Union[Article, bool, list]: Article instance """ + response = make_request(self.full_url, self.config) + if response.ok: + article_bs = BeautifulSoup(response.text, "html.parser") + self._fill_article_with_text(article_bs) + self._fill_article_with_meta_information(article_bs) + + return self.article def prepare_environment(base_path: Union[pathlib.Path, str]) -> None: @@ -204,13 +366,31 @@ def prepare_environment(base_path: Union[pathlib.Path, str]) -> None: Args: base_path (Union[pathlib.Path, str]): Path where articles stores """ + if base_path.exists(): + shutil.rmtree(base_path) + base_path.mkdir(parents=True) def main() -> None: """ Entrypoint for scrapper module. 
""" + configuration = Config(path_to_config=constants.CRAWLER_CONFIG_PATH) + + prepare_environment(base_path=constants.ASSETS_PATH) + + crawler = Crawler(config=configuration) + crawler.find_articles() + urls = crawler.urls + + for index, url in enumerate(urls): + parser = HTMLParser(full_url=url, article_id=index + 1, config=configuration) + article = parser.parse() + if isinstance(article, Article): + to_raw(article) + to_meta(article) + print("done!") if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/lab_5_scrapper/scrapper_config.json b/lab_5_scrapper/scrapper_config.json index 771fe42..c0bc3bc 100644 --- a/lab_5_scrapper/scrapper_config.json +++ b/lab_5_scrapper/scrapper_config.json @@ -1,9 +1,23 @@ { - "seed_urls": [], - "headers": {}, - "total_articles_to_find_and_parse": 0, - "encoding": "", - "timeout": 0, + "seed_urls": ["https://74.ru/text/format/mention/", + "https://74.ru/text/format/mention/?page=2", + "https://74.ru/text/format/mention/?page=3", + "https://74.ru/text/format/mention/?page=4", + "https://74.ru/text/format/mention/?page=5", + "https://74.ru/text/format/mention/?page=6", + "https://74.ru/text/format/mention/?page=7", + "https://74.ru/text/format/mention/?page=8", + "https://74.ru/text/format/mention/?page=9", + "https://74.ru/text/format/mention/?page=10"], + "headers": { + "cookie" : "stg_returning_visitor=Mon%2C%2008%20Apr%202024%2009:30:34%20GMT; stg_traffic_source_priority=1; _ga_KLCW8G3CY6=GS1.1.1717361661.1.0.1717361661.0.0.0; _ga=GA1.1.562201468.1717361661; stg_last_interaction=Sun%2C%2002%20Jun%202024%2020:54:22%20GMT", + "accept" : "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", + "accept-language" : "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7", + "user-agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36" + }, + "total_articles_to_find_and_parse": 100, + "encoding": "utf-8", + "timeout": 15, "should_verify_certificate": true, "headless_mode": true } diff --git a/lab_5_scrapper/settings.json b/lab_5_scrapper/settings.json index 88a8b07..96b048b 100644 --- a/lab_5_scrapper/settings.json +++ b/lab_5_scrapper/settings.json @@ -1,3 +1,3 @@ { - "target_score": 4 -} + "target_score": 6 +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 8b13789..7b08cb8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ - +requests==2.31.0 +beautifulsoup4==4.12.2 \ No newline at end of file