Commit

added realisation lab5
shoodeen committed Jun 2, 2024
1 parent 36b2065 commit 2eaeb42
Showing 4 changed files with 204 additions and 9 deletions.
182 changes: 181 additions & 1 deletion lab_5_scrapper/scrapper.py
@@ -1,10 +1,64 @@
"""
Crawler implementation.
"""
import datetime
import json
# pylint: disable=too-many-arguments, too-many-instance-attributes, unused-import, undefined-variable
import pathlib
import re
import shutil
from typing import Pattern, Union

import requests
from bs4 import BeautifulSoup

from core_utils import constants
from core_utils.article.article import Article
from core_utils.article.io import to_meta, to_raw
from core_utils.config_dto import ConfigDTO


class IncorrectSeedURLError(Exception):
"""
Seed URL does not match standard pattern
"""


class NumberOfArticlesOutOfRangeError(Exception):
"""
Total number of articles is out of range from 1 to 150
"""


class IncorrectNumberOfArticlesError(Exception):
"""
Total number of articles to parse is not a positive integer
"""


class IncorrectHeadersError(Exception):
"""
Headers are not in the form of a dictionary
"""


class IncorrectEncodingError(Exception):
"""
Encoding is not a string
"""


class IncorrectTimeoutError(Exception):
"""
Timeout value is not a positive integer less than 60
"""


class IncorrectVerifyError(Exception):
"""
Verify certificate value is not True or False
"""


class Config:
"""
@@ -18,6 +72,17 @@ def __init__(self, path_to_config: pathlib.Path) -> None:
Args:
path_to_config (pathlib.Path): Path to configuration.
"""
self.path_to_config = path_to_config
self._validate_config_content()
self.config = self._extract_config_content()

self._seed_urls = self.config.seed_urls
self._num_articles = self.config.total_articles
self._headers = self.config.headers
self._encoding = self.config.encoding
self._timeout = self.config.timeout
self._should_verify_certificate = self.config.should_verify_certificate
self._headless_mode = self.config.headless_mode

def _extract_config_content(self) -> ConfigDTO:
"""
@@ -26,11 +91,49 @@ def _extract_config_content(self) -> ConfigDTO:
Returns:
ConfigDTO: Config values
"""
with open(self.path_to_config, "r", encoding="utf-8") as f:
conf = json.load(f)
return ConfigDTO(
seed_urls=conf["seed_urls"],
total_articles_to_find_and_parse=conf["total_articles_to_find_and_parse"],
headers=conf["headers"],
encoding=conf["encoding"],
timeout=conf["timeout"],
should_verify_certificate=conf["should_verify_certificate"],
headless_mode=conf["headless_mode"]
)

def _validate_config_content(self) -> None:
"""
Ensure configuration parameters are not corrupt.
"""
with open(self.path_to_config, 'r', encoding='utf-8') as f:
conf = json.load(f)

if not (isinstance(conf['seed_urls'], list)
and all(re.match(r"https?://(www\.)?", seed_url) for seed_url in conf['seed_urls'])):
raise IncorrectSeedURLError

num = conf['total_articles_to_find_and_parse']

if not isinstance(num, int) or (num <= 0):
raise IncorrectNumberOfArticlesError

if num < 1 or num > 150:
raise NumberOfArticlesOutOfRangeError

if not isinstance(conf['headers'], dict):
raise IncorrectHeadersError

if not isinstance(conf['encoding'], str):
raise IncorrectEncodingError

if not (isinstance(conf['timeout'], int) and (0 < conf['timeout'] < 60)):
raise IncorrectTimeoutError

if not isinstance(conf['should_verify_certificate'], bool) \
or not isinstance(conf['headless_mode'], bool):
raise IncorrectVerifyError

def get_seed_urls(self) -> list[str]:
"""
@@ -39,6 +142,7 @@ def get_seed_urls(self) -> list[str]:
Returns:
list[str]: Seed urls
"""
return self._seed_urls

def get_num_articles(self) -> int:
"""
@@ -47,6 +151,7 @@ def get_num_articles(self) -> int:
Returns:
int: Total number of articles to scrape
"""
return self._num_articles

def get_headers(self) -> dict[str, str]:
"""
@@ -55,6 +160,7 @@ def get_headers(self) -> dict[str, str]:
Returns:
dict[str, str]: Headers
"""
return self._headers

def get_encoding(self) -> str:
"""
@@ -63,6 +169,7 @@ def get_encoding(self) -> str:
Returns:
str: Encoding
"""
return self._encoding

def get_timeout(self) -> int:
"""
@@ -71,6 +178,7 @@ def get_timeout(self) -> int:
Returns:
int: Number of seconds to wait for response
"""
return self._timeout

def get_verify_certificate(self) -> bool:
"""
@@ -79,6 +187,7 @@ def get_verify_certificate(self) -> bool:
Returns:
bool: Whether to verify certificate or not
"""
return self._should_verify_certificate

def get_headless_mode(self) -> bool:
"""
@@ -87,6 +196,7 @@ def get_headless_mode(self) -> bool:
Returns:
bool: Whether to use headless mode or not
"""
return self._headless_mode


def make_request(url: str, config: Config) -> requests.models.Response:
@@ -100,6 +210,8 @@ def make_request(url: str, config: Config) -> requests.models.Response:
Returns:
requests.models.Response: A response from a request
"""
return requests.get(url=url, timeout=config.get_timeout(),
headers=config.get_headers(), verify=config.get_verify_certificate())


class Crawler:
@@ -116,6 +228,9 @@ def __init__(self, config: Config) -> None:
Args:
config (Config): Configuration
"""
self.config = config
self.urls = []
self.url_pattern = self.config.get_seed_urls()[0].split('/format')[0]

def _extract_url(self, article_bs: BeautifulSoup) -> str:
"""
@@ -127,11 +242,31 @@ def _extract_url(self, article_bs: BeautifulSoup) -> str:
Returns:
str: Url from HTML
"""
url = ""
links = article_bs.find_all('a', class_="qZbm2")
for link in links:
url = link.get('href')
url = self.url_pattern + url[len("/text")::]
if url not in self.urls:
break

return url

def find_articles(self) -> None:
"""
Find articles.
"""
seed_urls = self.get_search_urls()

while len(self.urls) < self.config.get_num_articles():
for seed_url in seed_urls:
response = make_request(seed_url, self.config)
if not response.ok:
continue

article_bs = BeautifulSoup(response.text, "html.parser")
extracted = self._extract_url(article_bs)
self.urls.append(extracted)

def get_search_urls(self) -> list:
"""
@@ -140,6 +275,7 @@ def get_search_urls(self) -> list:
Returns:
list: seed_urls param
"""
return self.config.get_seed_urls()


# 10
@@ -160,6 +296,10 @@ def __init__(self, full_url: str, article_id: int, config: Config) -> None:
article_id (int): Article id
config (Config): Configuration
"""
self.full_url = full_url
self.article_id = article_id
self.config = config
self.article = Article(self.full_url, self.article_id)

def _fill_article_with_text(self, article_soup: BeautifulSoup) -> None:
"""
@@ -168,6 +308,13 @@ def _fill_article_with_text(self, article_soup: BeautifulSoup) -> None:
Args:
article_soup (bs4.BeautifulSoup): BeautifulSoup instance
"""
raw_text = ''
text_blocks = article_soup.find_all('div', class_='uiArticleBlockText_i9h2o')
for text_block in text_blocks:
if text_block.string:
raw_text += f'\n{text_block.string}'

self.article.text = raw_text

def _fill_article_with_meta_information(self, article_soup: BeautifulSoup) -> None:
"""
@@ -176,6 +323,14 @@ def _fill_article_with_meta_information(self, article_soup: BeautifulSoup) -> None:
Args:
article_soup (bs4.BeautifulSoup): BeautifulSoup instance
"""
headline = article_soup.find("h1", class_="title_ip27z")
self.article.title = headline.text
author = article_soup.find("div", class_="name_GQmWc")
if not author:
self.article.author = ["NOT FOUND"]

else:
self.article.author = [author.text]

def unify_date_format(self, date_str: str) -> datetime.datetime:
"""
@@ -195,6 +350,13 @@ def parse(self) -> Union[Article, bool, list]:
Returns:
Union[Article, bool, list]: Article instance
"""
response = make_request(self.full_url, self.config)
if response.ok:
article_bs = BeautifulSoup(response.text, "html.parser")
self._fill_article_with_text(article_bs)
self._fill_article_with_meta_information(article_bs)

return self.article


def prepare_environment(base_path: Union[pathlib.Path, str]) -> None:
@@ -204,13 +366,31 @@ def prepare_environment(base_path: Union[pathlib.Path, str]) -> None:
Args:
base_path (Union[pathlib.Path, str]): Path where articles are stored
"""
base_path = pathlib.Path(base_path)
if base_path.exists():
shutil.rmtree(base_path)
base_path.mkdir(parents=True)


def main() -> None:
"""
Entrypoint for scrapper module.
"""
configuration = Config(path_to_config=constants.CRAWLER_CONFIG_PATH)

prepare_environment(base_path=constants.ASSETS_PATH)

crawler = Crawler(config=configuration)
crawler.find_articles()
urls = crawler.urls

for index, url in enumerate(urls):
parser = HTMLParser(full_url=url, article_id=index + 1, config=configuration)
article = parser.parse()
if isinstance(article, Article):
to_raw(article)
to_meta(article)
print("done!")


if __name__ == "__main__":
main()
main()
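
unify_date_format is shown above only as an unchanged stub; this commit does not implement it. A minimal sketch of one possible implementation, assuming the article page exposes an ISO 8601 timestamp such as "2024-06-02T20:54:22" (an assumption about the site's markup, not something taken from this commit):

import datetime

def unify_date_format(date_str: str) -> datetime.datetime:
    # Hypothetical sketch: parse an ISO 8601 timestamp into a datetime object.
    return datetime.datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S")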
24 changes: 19 additions & 5 deletions lab_5_scrapper/scrapper_config.json
@@ -1,9 +1,23 @@
{
"seed_urls": [],
"headers": {},
"total_articles_to_find_and_parse": 0,
"encoding": "",
"timeout": 0,
"seed_urls": ["https://74.ru/text/format/mention/",
"https://74.ru/text/format/mention/?page=2",
"https://74.ru/text/format/mention/?page=3",
"https://74.ru/text/format/mention/?page=4",
"https://74.ru/text/format/mention/?page=5",
"https://74.ru/text/format/mention/?page=6",
"https://74.ru/text/format/mention/?page=7",
"https://74.ru/text/format/mention/?page=8",
"https://74.ru/text/format/mention/?page=9",
"https://74.ru/text/format/mention/?page=10"],
"headers": {
"cookie" : "stg_returning_visitor=Mon%2C%2008%20Apr%202024%2009:30:34%20GMT; stg_traffic_source_priority=1; _ga_KLCW8G3CY6=GS1.1.1717361661.1.0.1717361661.0.0.0; _ga=GA1.1.562201468.1717361661; stg_last_interaction=Sun%2C%2002%20Jun%202024%2020:54:22%20GMT",
"accept" : "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-language" : "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7",
"user-agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
},
"total_articles_to_find_and_parse": 100,
"encoding": "utf-8",
"timeout": 15,
"should_verify_certificate": true,
"headless_mode": true
}
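
A quick way to sanity-check this configuration against the Config class added above (a hypothetical usage sketch, assuming the repository root is on PYTHONPATH so that lab_5_scrapper.scrapper and the course's core_utils package resolve):

import pathlib

from lab_5_scrapper.scrapper import Config

config = Config(pathlib.Path("lab_5_scrapper/scrapper_config.json"))
print(config.get_num_articles())  # expected: 100
print(config.get_timeout())       # expected: 15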
4 changes: 2 additions & 2 deletions lab_5_scrapper/settings.json
@@ -1,3 +1,3 @@
{
"target_score": 4
}
"target_score": 6
}
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1 +1,2 @@

requests==2.31.0
beautifulsoup4==4.12.2
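
The two pinned dependencies above can be installed in the usual way:

pip install -r requirements.txt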
