Scrapper, Rostislav Hmelevski - 22FPL1 #83

Closed · wants to merge 62 commits
62 commits
5b88425
[SYNC] Aligned config with Hello, LLM! (#2)
demid5111 Mar 31, 2024
69772f0
Merge branch 'main' of https://github.com/RostislavHmelevski/2023-2-l…
RostislavHmelevski Apr 1, 2024
abc6dae
test
RostislavHmelevski Apr 24, 2024
4bf763c
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova Apr 25, 2024
ac02a85
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova Apr 29, 2024
2f0da8e
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova Apr 30, 2024
8ac5f79
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova May 8, 2024
52b4578
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova May 12, 2024
0a9e7b2
Merge remote-tracking branch 'origin/main' into HEAD
Vasilisa-Blyudova May 20, 2024
e7c3c9d
almost done with the work on '4' but help with stage 3.2 pls
RostislavHmelevski May 26, 2024
67f713e
almost done with the work on '4' but help with stage 3.2 pls
RostislavHmelevski May 26, 2024
3138d53
almost done with the work on '4' but help with stage 3.2 pls
RostislavHmelevski May 26, 2024
f163df6
some steps forward
RostislavHmelevski Jun 1, 2024
d78e557
this was awkward
RostislavHmelevski Jun 1, 2024
c3930f7
this was akward 2.0
RostislavHmelevski Jun 1, 2024
755189a
this was akward 2.0
RostislavHmelevski Jun 1, 2024
1314c1a
still trying
RostislavHmelevski Jun 1, 2024
2317fa7
still trying
RostislavHmelevski Jun 1, 2024
c4249fd
still trying
RostislavHmelevski Jun 1, 2024
0c33364
still trying
RostislavHmelevski Jun 1, 2024
8958e09
maybe
RostislavHmelevski Jun 2, 2024
76a5c01
maybe
RostislavHmelevski Jun 2, 2024
488b860
maybe
RostislavHmelevski Jun 2, 2024
7803187
maybe
RostislavHmelevski Jun 2, 2024
0b739f7
maybe
RostislavHmelevski Jun 2, 2024
dbfddab
maybe
RostislavHmelevski Jun 2, 2024
3a46d1c
maybe
RostislavHmelevski Jun 2, 2024
ed0fc11
maybe
RostislavHmelevski Jun 2, 2024
df387db
maybe
RostislavHmelevski Jun 2, 2024
42ba7bb
requirements check
RostislavHmelevski Jun 2, 2024
9bd3007
check
RostislavHmelevski Jun 2, 2024
3ea458d
check
RostislavHmelevski Jun 2, 2024
b2a9881
check
RostislavHmelevski Jun 2, 2024
7704b14
check
RostislavHmelevski Jun 2, 2024
bd80bc4
check
RostislavHmelevski Jun 2, 2024
5390e88
check
RostislavHmelevski Jun 2, 2024
80d260e
check
RostislavHmelevski Jun 2, 2024
fcf0a1f
check
RostislavHmelevski Jun 2, 2024
94b74f3
check
RostislavHmelevski Jun 2, 2024
98b7597
check
RostislavHmelevski Jun 2, 2024
c2085d9
check
RostislavHmelevski Jun 2, 2024
53a8209
please be right
RostislavHmelevski Jun 2, 2024
23a9261
checking requirements
RostislavHmelevski Jun 2, 2024
659871e
checking requirements
RostislavHmelevski Jun 2, 2024
670b566
Update
Vasilisa-Blyudova Jun 2, 2024
a91975d
for 6
RostislavHmelevski Jun 2, 2024
d0bc23f
Merge remote-tracking branch 'origin/main'
RostislavHmelevski Jun 2, 2024
c4b6be0
for 6
RostislavHmelevski Jun 2, 2024
d466f21
final hope
RostislavHmelevski Jun 2, 2024
820524e
final hope
RostislavHmelevski Jun 2, 2024
7f59eb5
final hope
RostislavHmelevski Jun 2, 2024
3ad818e
final hope
RostislavHmelevski Jun 2, 2024
b14b84c
final hope
RostislavHmelevski Jun 2, 2024
f2cb8c4
final hope
RostislavHmelevski Jun 2, 2024
d0c2e95
final hope
RostislavHmelevski Jun 2, 2024
d25f6a5
final hope
RostislavHmelevski Jun 2, 2024
373179b
final hope
RostislavHmelevski Jun 2, 2024
dee06f5
final hope
RostislavHmelevski Jun 2, 2024
7a6a77b
final hope
RostislavHmelevski Jun 2, 2024
4c28e8c
final hope
RostislavHmelevski Jun 2, 2024
e29eb75
final hope
RostislavHmelevski Jun 2, 2024
5298faf
final hope
RostislavHmelevski Jun 2, 2024
174 changes: 173 additions & 1 deletion lab_5_scrapper/scrapper.py
@@ -2,9 +2,63 @@
Crawler implementation.
"""
# pylint: disable=too-many-arguments, too-many-instance-attributes, unused-import, undefined-variable
import datetime
import json
import pathlib
import re
import shutil
from typing import Pattern, Union

import requests
from bs4 import BeautifulSoup

from core_utils.article.article import Article
from core_utils.article.io import to_meta, to_raw
from core_utils.config_dto import ConfigDTO
from core_utils.constants import ASSETS_PATH, CRAWLER_CONFIG_PATH


class IncorrectSeedURLError(Exception):
"""
Seed URL does not match standard pattern.
"""


class IncorrectNumberOfArticlesError(Exception):
"""
Total number of articles to parse is not integer.
"""


class NumberOfArticlesOutOfRangeError(Exception):
"""
Total number of articles is out of range from 1 to 150.
"""


class IncorrectHeadersError(Exception):
"""
Headers are not in a form of dictionary.
"""


class IncorrectEncodingError(Exception):
"""
encoding must be specified as a string.
"""


class IncorrectTimeoutError(Exception):
"""
timeout value must be a positive integer less than 60.
"""


class IncorrectVerifyError(Exception):
"""
verify certificate value must either be True or False.
"""


class Config:
"""
@@ -18,6 +72,16 @@ def __init__(self, path_to_config: pathlib.Path) -> None:
Args:
path_to_config (pathlib.Path): Path to configuration.
"""
self.path_to_config = path_to_config
self._validate_config_content()
self.config = self._extract_config_content()
self._seed_urls = self.config.seed_urls
self._num_articles = self.config.total_articles
self._headers = self.config.headers
self._encoding = self.config.encoding
self._timeout = self.config.timeout
self._should_verify_certificate = self.config.should_verify_certificate
self._headless_mode = self.config.headless_mode

def _extract_config_content(self) -> ConfigDTO:
"""
@@ -26,11 +90,50 @@ def _extract_config_content(self) -> ConfigDTO:
Returns:
ConfigDTO: Config values
"""
with open(self.path_to_config, 'r', encoding='utf-8') as file:
config = json.load(file)
return ConfigDTO(
seed_urls=config["seed_urls"],
total_articles_to_find_and_parse=config["total_articles_to_find_and_parse"],
headers=config["headers"],
encoding=config["encoding"],
timeout=config["timeout"],
should_verify_certificate=config["should_verify_certificate"],
headless_mode=config["headless_mode"]
)

def _validate_config_content(self) -> None:
"""
Ensure configuration parameters are not corrupt.
"""
with open(self.path_to_config, 'r', encoding='utf-8') as f:
config = json.load(f)

if not isinstance(config['seed_urls'], list):
raise IncorrectSeedURLError

if not all(seed_url.startswith('https://donday.ru/') for seed_url in config['seed_urls']):
raise IncorrectSeedURLError

if (not isinstance(config['total_articles_to_find_and_parse'], int) or
config['total_articles_to_find_and_parse'] <= 0):
raise IncorrectNumberOfArticlesError

if not 1 < config['total_articles_to_find_and_parse'] <= 150:
raise NumberOfArticlesOutOfRangeError

if not isinstance(config['headers'], dict):
raise IncorrectHeadersError

if not isinstance(config['encoding'], str):
raise IncorrectEncodingError

if not isinstance(config['timeout'], int) or not 0 < config['timeout'] < 60:
raise IncorrectTimeoutError

if (not isinstance(config['should_verify_certificate'], bool) or
not isinstance(config['headless_mode'], bool)):
raise IncorrectVerifyError
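# Illustrative sketch (not part of this PR's changes), assuming a config file
# written to a hypothetical path; a non-integer timeout makes the validation
# above raise IncorrectTimeoutError.
#
# import json
# import pathlib
#
# broken = {
#     "seed_urls": ["https://donday.ru/"],
#     "headers": {},
#     "total_articles_to_find_and_parse": 3,
#     "encoding": "utf-8",
#     "timeout": "ten",  # not an int in the range (0, 60)
#     "should_verify_certificate": True,
#     "headless_mode": True,
# }
# path = pathlib.Path("broken_config.json")  # hypothetical file name
# path.write_text(json.dumps(broken), encoding="utf-8")
# try:
#     Config(path)
# except IncorrectTimeoutError:
#     print("timeout must be a positive integer below 60")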

def get_seed_urls(self) -> list[str]:
"""
@@ -39,6 +142,7 @@ def get_seed_urls(self) -> list[str]:
Returns:
list[str]: Seed urls
"""
return self._seed_urls

def get_num_articles(self) -> int:
"""
@@ -47,6 +151,7 @@ def get_num_articles(self) -> int:
Returns:
int: Total number of articles to scrape
"""
return self._num_articles

def get_headers(self) -> dict[str, str]:
"""
@@ -55,6 +160,7 @@ def get_headers(self) -> dict[str, str]:
Returns:
dict[str, str]: Headers
"""
return self._headers

def get_encoding(self) -> str:
"""
@@ -63,6 +169,7 @@ def get_encoding(self) -> str:
Returns:
str: Encoding
"""
return self._encoding

def get_timeout(self) -> int:
"""
@@ -71,6 +178,7 @@ def get_timeout(self) -> int:
Returns:
int: Number of seconds to wait for response
"""
return self._timeout

def get_verify_certificate(self) -> bool:
"""
@@ -79,6 +187,7 @@ def get_verify_certificate(self) -> bool:
Returns:
bool: Whether to verify certificate or not
"""
return self._should_verify_certificate

def get_headless_mode(self) -> bool:
"""
@@ -87,6 +196,7 @@ def get_headless_mode(self) -> bool:
Returns:
bool: Whether to use headless mode or not
"""
return self._headless_mode


def make_request(url: str, config: Config) -> requests.models.Response:
@@ -100,6 +210,11 @@ def make_request(url: str, config: Config) -> requests.models.Response:
Returns:
requests.models.Response: A response from a request
"""
res = requests.get(url=url,
timeout=config.get_timeout(),
headers=config.get_headers(),
verify=config.get_verify_certificate())
return res
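# Minimal usage sketch (not part of this PR's changes); assumes a valid
# crawler configuration already exists at CRAWLER_CONFIG_PATH:
#
# configuration = Config(CRAWLER_CONFIG_PATH)
# response = make_request('https://donday.ru/', configuration)
# print(response.status_code, response.encoding)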


class Crawler:
@@ -116,6 +231,8 @@ def __init__(self, config: Config) -> None:
Args:
config (Config): Configuration
"""
self.config = config
self.urls = []

def _extract_url(self, article_bs: BeautifulSoup) -> str:
"""
@@ -128,18 +245,35 @@ def _extract_url(self, article_bs: BeautifulSoup) -> str:
str: Url from HTML
"""

article_url = article_bs.find('a').get('href')
return article_url

def find_articles(self) -> None:
"""
Find articles.
"""

for url in self.get_search_urls():
response = make_request(url, self.config)
if not response.ok:
continue
soup = BeautifulSoup(response.text, "html.parser")
contents = soup.find_all('div', id='dle-content')
max_articles = self.config.get_num_articles()
for content in contents[:max_articles]:
for item in content.find_all('h3', class_='btl'):
url_news = self._extract_url(item)
if url_news not in self.urls:
self.urls.append(url_news)
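# Sketch of the markup this method assumes (not part of this PR's changes;
# the sample HTML below is illustrative, the real donday.ru layout may differ):
#
# sample = ('<div id="dle-content">'
#           '<h3 class="btl"><a href="https://donday.ru/example.html">Title</a></h3>'
#           '</div>')
# demo_soup = BeautifulSoup(sample, "html.parser")
# item = demo_soup.find('h3', class_='btl')
# print(item.find('a').get('href'))  # -> https://donday.ru/example.html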

def get_search_urls(self) -> list:
"""
Get seed_urls param.

Returns:
list: seed_urls param
"""
return self.config.get_seed_urls()


# 10
@@ -160,6 +294,10 @@ def __init__(self, full_url: str, article_id: int, config: Config) -> None:
article_id (int): Article id
config (Config): Configuration
"""
self.full_url = full_url
self.article_id = article_id
self.config = config
self.article = Article(self.full_url, self.article_id)

def _fill_article_with_text(self, article_soup: BeautifulSoup) -> None:
"""
@@ -168,6 +306,11 @@ def _fill_article_with_text(self, article_soup: BeautifulSoup) -> None:
Args:
article_soup (bs4.BeautifulSoup): BeautifulSoup instance
"""
allnews = article_soup.find(itemprop="articleBody")
text_split = allnews.text.replace('\n', '').split()
text = ' '.join(text_split)
clear_text = '. '.join(text.split('. ')[:-2])
self.article.text = clear_text

def _fill_article_with_meta_information(self, article_soup: BeautifulSoup) -> None:
"""
@@ -176,6 +319,14 @@ def _fill_article_with_meta_information(self, article_soup: BeautifulSoup) -> None:
Args:
article_soup (bs4.BeautifulSoup): BeautifulSoup instance
"""
title_find = article_soup.find(itemprop="headline")
self.article.title = title_find.text.replace('\n', '')
author = article_soup.find(class_="argauthor")
self.article.author = [author.text.replace('\n', '').strip()]
topics = article_soup.find(class_="argcat")
self.article.topics = topics.text.replace('\n', '')
time = article_soup.find('time', itemprop="datePublished")
self.article.time = time.text.replace('\n', '')

def unify_date_format(self, date_str: str) -> datetime.datetime:
"""
@@ -195,6 +346,13 @@ def parse(self) -> Union[Article, bool, list]:
Returns:
Union[Article, bool, list]: Article instance
"""
response = make_request(self.full_url, self.config)
if response.ok:
article_bs = BeautifulSoup(response.text, features='lxml')
self._fill_article_with_text(article_bs)
self._fill_article_with_meta_information(article_bs)

return self.article
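# unify_date_format is left unimplemented in this diff and parse() keeps the
# publication date as raw text; a sketch of one possible conversion, assuming the
# site renders dates like "2 июня 2024 12:30" (both the format and the month map
# below are assumptions, not taken from this PR):
#
# MONTHS = {"января": 1, "февраля": 2, "марта": 3, "апреля": 4, "мая": 5,
#           "июня": 6, "июля": 7, "августа": 8, "сентября": 9, "октября": 10,
#           "ноября": 11, "декабря": 12}
#
# def unify_date_format_sketch(date_str: str) -> datetime.datetime:
#     day, month, year, clock = date_str.split()
#     hours, minutes = clock.split(':')
#     return datetime.datetime(int(year), MONTHS[month], int(day),
#                              int(hours), int(minutes))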


def prepare_environment(base_path: Union[pathlib.Path, str]) -> None:
@@ -204,13 +362,27 @@ def prepare_environment(base_path: Union[pathlib.Path, str]) -> None:
Args:
base_path (Union[pathlib.Path, str]): Path where articles stores
"""
if base_path.exists():
shutil.rmtree(base_path)
base_path.mkdir(parents=True)


def main() -> None:
"""
Entrypoint for scrapper module.
"""
conf = Config(CRAWLER_CONFIG_PATH)
prepare_environment(ASSETS_PATH)
crawler = Crawler(conf)
crawler.find_articles()

for i, url in enumerate(crawler.urls, 1):
parser = HTMLParser(url, i, conf)
article = parser.parse()
to_raw(article)
to_meta(article)
print('Done')


if __name__ == "__main__":
main()
main()
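The entrypoint above recreates ASSETS_PATH from scratch and then writes one raw text file and one metadata file per parsed article via to_raw and to_meta. A quick, hypothetical post-run check (the N_raw.txt / N_meta.json naming pattern is produced by core_utils and is assumed here, not confirmed by this diff):

from core_utils.constants import ASSETS_PATH

for saved_file in sorted(ASSETS_PATH.iterdir()):
    print(saved_file.name)  # expected to look like 1_meta.json, 1_raw.txt, 2_meta.json, ...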
6 changes: 3 additions & 3 deletions lab_5_scrapper/scrapper_config.json
@@ -1,9 +1,9 @@
 {
-  "seed_urls": [],
+  "seed_urls": ["https://donday.ru/"],
   "headers": {},
-  "total_articles_to_find_and_parse": 0,
+  "total_articles_to_find_and_parse": 3,
   "encoding": "",
-  "timeout": 0,
+  "timeout": 10,
   "should_verify_certificate": true,
   "headless_mode": true
 }
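The committed config still leaves "headers" and "encoding" empty. A hypothetical, more fully populated version could be generated like this (the User-Agent string and the utf-8 encoding are illustrative assumptions, not values taken from this PR):

import json

example_config = {
    "seed_urls": ["https://donday.ru/"],
    "headers": {"User-Agent": "Mozilla/5.0 (compatible; lab-scrapper)"},  # assumed value
    "total_articles_to_find_and_parse": 3,
    "encoding": "utf-8",  # assumed value
    "timeout": 10,
    "should_verify_certificate": True,
    "headless_mode": True,
}

with open("lab_5_scrapper/scrapper_config.json", "w", encoding="utf-8") as config_file:
    json.dump(example_config, config_file, indent=2, ensure_ascii=False)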
2 changes: 1 addition & 1 deletion lab_5_scrapper/settings.json
@@ -1,3 +1,3 @@
 {
-  "target_score": 0
+  "target_score": 6
 }
1 change: 1 addition & 0 deletions lab_5_scrapper/target_score.txt
@@ -0,0 +1 @@
+6