Dataset Collector #1, Bykova Ekaterina - 19FPL2 #38

Open: wants to merge 36 commits into base: main

Changes from all commits
8 changes: 4 additions & 4 deletions article.py
@@ -46,7 +46,7 @@ def save_raw(self):
indent=4,
ensure_ascii=False,
separators=(',', ': '))

@staticmethod
def from_meta_json(json_path: str):
"""
@@ -90,17 +90,17 @@ def _get_meta(self):
'id': self.article_id,
'url': self.url,
'title': self.title,
'date': self._date_to_text(),
'date': self.date,
'author': self.author,
'topics': self.topics
}

def _date_to_text(self):
"""
Converts datetime object to text
"""
return self.date.strftime("%Y-%m-%d %H:%M:%S")

def _get_raw_text_path(self):
"""
Returns path for requested raw article
4 changes: 4 additions & 0 deletions constants.py
@@ -7,3 +7,7 @@
PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__))
ASSETS_PATH = os.path.join(PROJECT_ROOT, 'tmp', 'articles')
CRAWLER_CONFIG_PATH = os.path.join(PROJECT_ROOT, 'crawler_config.json')
HEADERS = {
[Contributor review comment: good]
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
' Chrome/89.0.4389.90 Safari/537.36'
}
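A quick note on the header value: adjacent string literals in Python are concatenated at compile time, so the two lines above form a single user-agent string. A minimal sanity check, assuming the constants module above is importable:

from constants import HEADERS

# The two adjacent literals merge into a single user-agent value at import time.
print(HEADERS['user-agent'])   # Mozilla/5.0 (...) ... Chrome/89.0.4389.90 Safari/537.36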
6 changes: 3 additions & 3 deletions crawler_config.json
@@ -1,5 +1,5 @@
{
"base_urls": [],
"total_articles_to_find_and_parse": 0,
"max_number_articles_to_get_from_one_seed": 0
"base_urls": ["https://www.e1.ru/news/"],
"total_articles_to_find_and_parse": 5,
"max_number_articles_to_get_from_one_seed": 5
}
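For illustration, a minimal sketch of how these three keys are consumed; they correspond to the values that validate_config() in scrapper.py returns and the Crawler constructor takes. The sketch assumes the file sits in the project root:

import json

with open('crawler_config.json', encoding='utf-8') as file:
    config = json.load(file)

seed_urls = config['base_urls']                                # seed pages to crawl
max_articles = config['total_articles_to_find_and_parse']      # overall cap on collected articles
per_seed = config['max_number_articles_to_get_from_one_seed']  # cap taken from each seed page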
70 changes: 61 additions & 9 deletions pipeline.py
@@ -2,7 +2,12 @@
Pipeline for text processing implementation
"""

from pathlib import Path
from typing import List
from pymorphy2 import MorphAnalyzer
from pymystem3 import Mystem
from article import Article
from constants import ASSETS_PATH


class EmptyDirectoryError(Exception):
@@ -27,63 +32,110 @@ class MorphologicalToken:
"""
Stores language params for each processed token
"""
def __init__(self, original_word, normalized_form):
pass
def __init__(self, normalized_form, original_word):
self.original_word = original_word
self.normalized_form = normalized_form
self.mystem_tags = ''
self.pymorphy_tags = ''

def __str__(self):
return "MorphologicalToken instance here"
return f"{self.normalized_form}<{self.mystem_tags}>({self.pymorphy_tags})"

def public_method(self):
pass


class CorpusManager:
"""
Works with articles and stores them
"""

def __init__(self, path_to_raw_txt_data: str):
pass
self.path_to_raw_txt_data = path_to_raw_txt_data
self._storage = {}
self._scan_dataset()

def _scan_dataset(self):
"""
Register each dataset entry
"""
pass
for file in Path(self.path_to_raw_txt_data).rglob('*_raw.txt'):
id_each = int(file.parts[-1].split('_')[0])
self._storage[id_each] = Article(url=None, article_id=id_each)

def get_articles(self):
"""
Returns storage params
"""
return self._storage

def public_method(self):
pass


class TextProcessingPipeline:
"""
Process articles from corpus manager
"""

def __init__(self, corpus_manager: CorpusManager):
pass
self.corpus_manager = corpus_manager
self.raw_text = ''

def run(self):
"""
Runs pipeline process scenario
"""
pass
for article in self.corpus_manager.get_articles().values():
self.raw_text = article.get_raw_text()
processed_text = list(map(str, self._process()))
article.save_processed(' '.join(processed_text))

def _process(self) -> List[MorphologicalToken]:
"""
Performs processing of each text
"""
process = Mystem().analyze(self.raw_text)
morph_analyzer = MorphAnalyzer()  # build once; creating an analyzer per token is expensive
tokens = []

for tok in process:
if tok.get('analysis') and tok.get('text'):
morph_token = MorphologicalToken(original_word=tok['text'], normalized_form=tok['analysis'][0]['lex'])
morph_token.mystem_tags = tok['analysis'][0]['gr']
morph_token.pymorphy_tags = morph_analyzer.parse(word=morph_token.original_word)[0].tag
tokens.append(morph_token)

return tokens

def public_method(self):
pass


def validate_dataset(path_to_validate):
"""
Validates folder with assets
"""
pass
path = Path(path_to_validate)

if not path.exists():
raise FileNotFoundError

if not path.is_dir():
raise NotADirectoryError

if not list(path.iterdir()):
raise EmptyDirectoryError


def main():
print('Your code goes here')
validate_dataset(ASSETS_PATH)

corpus_manager = CorpusManager(path_to_raw_txt_data=ASSETS_PATH)
pipeline = TextProcessingPipeline(corpus_manager=corpus_manager)

pipeline.run()


if __name__ == "__main__":
# YOUR CODE HERE
main()
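For illustration, a minimal sketch of what the _process() step does for a single analyzed word, assuming pymystem3 and pymorphy2 are installed; the sample word and the exact tag strings are illustrative and depend on the analyzers' dictionaries:

from pymystem3 import Mystem
from pymorphy2 import MorphAnalyzer

result = Mystem().analyze("кошками")       # list of dicts; analyzed words carry an 'analysis' list
item = result[0]
lemma = item['analysis'][0]['lex']         # normalized form, e.g. 'кошка'
mystem_tags = item['analysis'][0]['gr']    # mystem grammar string
pymorphy_tags = MorphAnalyzer().parse(item['text'])[0].tag

# MorphologicalToken.__str__ would render roughly: кошка<...mystem tags...>(...pymorphy tags...)
print(f"{lemma}<{mystem_tags}>({pymorphy_tags})")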
5 changes: 5 additions & 0 deletions requirements.txt
@@ -0,0 +1,5 @@
beautifulsoup4==4.9.3
lxml==4.6.2
pymorphy2==0.9.1
pymystem3==0.2.0
requests==2.25.1
97 changes: 85 additions & 12 deletions scrapper.py
@@ -1,6 +1,12 @@
"""
Crawler implementation
"""
import os
import json
import shutil
import requests
from bs4 import BeautifulSoup
from article import Article
from constants import CRAWLER_CONFIG_PATH, HEADERS, ASSETS_PATH


class IncorrectURLError(Exception):
@@ -31,38 +37,65 @@ class Crawler:
"""
Crawler implementation
"""
def __init__(self, seed_urls: list, max_articles: int):
pass
def __init__(self, seed_urls: list, max_articles: int, max_articles_per_seed: int):
self.seed_urls = seed_urls
self.total_max_articles = max_articles
self.max_articles_per_seed = max_articles_per_seed
self.urls = []

@staticmethod
def _extract_url(article_bs):
pass
article_link = article_bs.find('h2', class_="G9ax").find('a').get('href')
return 'https://www.e1.ru' + article_link

def find_articles(self):
"""
Finds articles
"""
pass
for url in self.seed_urls:
response = requests.get(url, headers=HEADERS)
if not response:
raise IncorrectURLError

page_soup = BeautifulSoup(response.content, features='lxml')
article_soup = page_soup.find_all('article', class_="G9alp")

for article_bs in article_soup[:self.max_articles_per_seed]:
article_url = self._extract_url(article_bs)
self.urls.append(article_url)

if len(self.urls) == self.total_max_articles:
break

if len(self.urls) == self.total_max_articles:
break

def get_search_urls(self):
"""
Returns seed_urls param
"""
pass
return self.seed_urls


class ArticleParser:
"""
ArticleParser implementation
"""
def __init__(self, full_url: str, article_id: int):
pass
self.full_url = full_url
self.article_id = article_id
self.article = Article(url=full_url, article_id=article_id)

def _fill_article_with_text(self, article_soup):
pass
article_text = article_soup.find('div', class_="GFahz").find('div').find_all('p')
for par in article_text:
self.article.text += par.text.strip() + '\n'

def _fill_article_with_meta_information(self, article_soup):
pass
self.article.title = article_soup.find('h2', class_="CRqd CRsn JPax").find('span').text
self.article.author = 'NOT FOUND'
self.article.topics = article_soup.find('a', class_="CRqz CRsv JPall").find('span').text
self.article.date = article_soup.find('time', class_="HHkz").find('a').text

@staticmethod
def unify_date_format(date_str):
@@ -75,23 +108,63 @@ def parse(self):
"""
Parses each article
"""
pass
response = requests.get(self.full_url, headers=HEADERS)
if not response:
raise IncorrectURLError

article_soup = BeautifulSoup(response.text, 'lxml')
self._fill_article_with_text(article_soup)
self._fill_article_with_meta_information(article_soup)
return self.article


def prepare_environment(base_path):
"""
Creates ASSETS_PATH folder if not created and removes existing folder
"""
pass
if os.path.exists(base_path):
shutil.rmtree(base_path)  # drop any previous dataset, as the docstring describes
os.makedirs(base_path)


def validate_config(crawler_path):
"""
Validates given config
"""
pass
with open(crawler_path, 'r', encoding='utf-8') as config:
params = json.load(config)

seed_urls = params.get('base_urls')
max_articles = params.get('total_articles_to_find_and_parse')
max_articles_per_seed = params.get('max_number_articles_to_get_from_one_seed')

if not isinstance(seed_urls, list):
raise IncorrectURLError
for url in seed_urls:
if not isinstance(url, str) or not url.startswith('http'):
raise IncorrectURLError

if not isinstance(max_articles, int) or max_articles < 0:
raise IncorrectNumberOfArticlesError

if not isinstance(max_articles_per_seed, int) or max_articles_per_seed > 100:
raise NumberOfArticlesOutOfRangeError

return seed_urls, max_articles, max_articles_per_seed


if __name__ == '__main__':
# YOUR CODE HERE
pass
seed_urls_list, max_num_articles, max_num_per_seed = validate_config(CRAWLER_CONFIG_PATH)
crawler = Crawler(seed_urls=seed_urls_list,
max_articles=max_num_articles,
max_articles_per_seed=max_num_per_seed)
crawler.find_articles()
prepare_environment(ASSETS_PATH)
for article_id_num, article_url in enumerate(crawler.urls, 1):
parser = ArticleParser(full_url=article_url, article_id=article_id_num)
article = parser.parse()
article.save_raw()
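For illustration, a small sketch of the naming contract that ties the two scripts together: scrapper.py saves each article under its id via Article.save_raw(), and pipeline.py's CorpusManager._scan_dataset() recovers that id from the file name. The exact file name format is an assumption (N_raw.txt, as the *_raw.txt glob suggests):

from pathlib import Path

article_id = 1
raw_name = f"{article_id}_raw.txt"                     # name Article.save_raw() is assumed to produce
recovered_id = int(Path(raw_name).name.split('_')[0])  # what _scan_dataset() extracts
assert recovered_id == article_id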
2 changes: 1 addition & 1 deletion target_score.txt
@@ -2,4 +2,4 @@
6

# Target score for pipeline.py:
0
8