Dataset Collector #1, Chudinova Alla - 19FPL2 #37

Open · wants to merge 14 commits into main
3 changes: 2 additions & 1 deletion .pylintrc
@@ -143,7 +143,8 @@ disable=print-statement,
        bad-continuation,
        unused-argument,
        unnecessary-pass,
-       import-error
+       import-error,
+       too-few-public-methods

# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
6 changes: 3 additions & 3 deletions article.py
@@ -46,7 +46,7 @@ def save_raw(self):
                      indent=4,
                      ensure_ascii=False,
                      separators=(',', ': '))

    @staticmethod
    def from_meta_json(json_path: str):
        """
@@ -94,13 +94,13 @@ def _get_meta(self):
            'author': self.author,
            'topics': self.topics
        }

    def _date_to_text(self):
        """
        Converts datetime object to text
        """
        return self.date.strftime("%Y-%m-%d %H:%M:%S")

    def _get_raw_text_path(self):
        """
        Returns path for requested raw article
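For reference, a minimal sketch (not part of the diff) of the timestamp string that `_date_to_text` produces with the `"%Y-%m-%d %H:%M:%S"` format; the date value below is made up:

```python
# Sketch: output format of Article._date_to_text; the datetime value is illustrative.
from datetime import datetime

date = datetime(2021, 3, 1, 0, 0)
print(date.strftime("%Y-%m-%d %H:%M:%S"))  # 2021-03-01 00:00:00
```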
3 changes: 2 additions & 1 deletion config/raw_metadata_test.py
@@ -4,6 +4,7 @@
import unittest
import requests
from constants import ASSETS_PATH, CRAWLER_CONFIG_PATH
+from bs4 import BeautifulSoup


class RawDataValidator(unittest.TestCase):
@@ -48,7 +49,7 @@ def test_validate_metadata(self):
                        msg="Can not open URL: <{}>. Check how you collect URLs".format(
                            metadata[1]['url']))

-        html_source = requests.get(metadata[1]['url']).text
+        html_source = BeautifulSoup(requests.get(metadata[1]['url']).content, features='lxml').text

        self.assertTrue(metadata[1]['title'] in
                        html_source,
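Side note on the change above: comparing the title against `BeautifulSoup(...).text` checks the visible page text rather than raw markup, so a title interrupted by tags still passes the substring test. A tiny sketch with an invented HTML fragment:

```python
# Sketch: why the substring check is done against the soup's text, not the raw HTML.
# The HTML fragment is invented for illustration.
from bs4 import BeautifulSoup

html = '<dd class="title">Пример <b>заголовка</b></dd>'
print('Пример заголовка' in html)                              # False: the <b> tag splits the phrase
print('Пример заголовка' in BeautifulSoup(html, 'lxml').text)  # True: markup is stripped
```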
2 changes: 1 addition & 1 deletion config/student_text_preprocess_score_eight_test.py
@@ -4,7 +4,7 @@
from constants import ASSETS_PATH


TAGS = ["A", "ADV", "S", "V", "PR", "ANUM", "CONJ", "SPRO", "APRO", "PART", "NUM", "ADVPRO"]
TAGS = ["A", "ADV", "S", "V", "PR", "ANUM", "CONJ", "SPRO", "APRO", "PART", "NUM", "ADVPRO", "INTJ"]


class StudentTextPreprocessTest(unittest.TestCase):
6 changes: 3 additions & 3 deletions crawler_config.json
@@ -1,5 +1,5 @@
{
-  "base_urls": [],
-  "total_articles_to_find_and_parse": 0,
-  "max_number_articles_to_get_from_one_seed": 0
+  "base_urls": ["https://mordovia-news.ru/"],
+  "total_articles_to_find_and_parse": 10,
+  "max_number_articles_to_get_from_one_seed": 3
}
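For orientation, a minimal sketch (not part of the PR) of how these fields are consumed: `validate_config` in scrapper.py loads the JSON and hands the seed URLs and the article limit to the `Crawler`. The literal file name below is an assumption; in the project it is resolved via `CRAWLER_CONFIG_PATH`.

```python
# Sketch: reading the crawler config; mirrors what validate_config returns.
import json

with open('crawler_config.json', 'r', encoding='utf-8') as file:
    settings = json.load(file)

seed_urls = settings['base_urls']                            # ['https://mordovia-news.ru/']
max_articles = settings['total_articles_to_find_and_parse']  # 10
```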
60 changes: 49 additions & 11 deletions pipeline.py
@@ -2,8 +2,15 @@
Pipeline for text processing implementation
"""

+import os
from typing import List

+from pymystem3 import Mystem
+from pymorphy2 import MorphAnalyzer
+
+from constants import ASSETS_PATH
+from article import Article


class EmptyDirectoryError(Exception):
    """
@@ -28,61 +35,92 @@ class MorphologicalToken:
    Stores language params for each processed token
    """
    def __init__(self, original_word, normalized_form):
-        pass
+        self.normalized_form = normalized_form
+        self.original_word = original_word
+        self.tags = []
+        self.morphy_tags = []

    def __str__(self):
-        return "MorphologicalToken instance here"
+        return f'{self.normalized_form}<{self.tags}>({self.morphy_tags})'


class CorpusManager:
    """
    Works with articles and stores them
    """
    def __init__(self, path_to_raw_txt_data: str):
-        pass
+        self._storage = dict()
+        self._path = path_to_raw_txt_data
+        self._scan_dataset()

    def _scan_dataset(self):
        """
        Register each dataset entry
        """
-        pass
+        for file in os.listdir(self._path):
+            if file.endswith('_raw.txt'):
+                self._storage[int(file[:-8])] = Article(url=None, article_id=int(file[:-8]))

    def get_articles(self):
        """
        Returns storage params
        """
-        pass
+        return self._storage


class TextProcessingPipeline:
    """
    Process articles from corpus manager
    """
    def __init__(self, corpus_manager: CorpusManager):
-        pass
+        self.corpus_manager = corpus_manager

    def run(self):
        """
        Runs pipeline process scenario
        """
-        pass
+        for article in self.corpus_manager.get_articles().values():
+            original_text = article.get_raw_text().lower()
+            processed_text = self._process(original_text)
+            article.save_processed(' '.join([str(token) for token in processed_text]))

-    def _process(self) -> List[type(MorphologicalToken)]:
+    @staticmethod
+    def _process(text) -> List[type(MorphologicalToken)]:
        """
        Performs processing of each text
        """
-        pass
+        analyze = Mystem().analyze(text)
+        morph = MorphAnalyzer()
+        tokens = []
+        for feature in analyze:
+            if 'analysis' not in feature or not feature['analysis']:
+                continue
+            token = MorphologicalToken(feature['text'],
+                                       feature['analysis'][0]['lex'])
+            token.tags = feature['analysis'][0]['gr']
+            token.morphy_tags = morph.parse(token.original_word)[0].tag
+            tokens.append(token)
+        return tokens


def validate_dataset(path_to_validate):
    """
    Validates folder with assets
    """
-    pass
+    if not os.path.exists(path_to_validate):
+        raise FileNotFoundError
+    if not os.path.isdir(path_to_validate):
+        raise NotADirectoryError
+    if not os.listdir(path_to_validate):
+        raise EmptyDirectoryError


def main():
-    print('Your code goes here')
+    validate_dataset(ASSETS_PATH)
+    corpus_manager = CorpusManager(path_to_raw_txt_data=ASSETS_PATH)
+    pipeline = TextProcessingPipeline(corpus_manager)
+    pipeline.run()
+    print('Text processing pipeline has just finished')


if __name__ == "__main__":
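For context, `_process` above indexes into the structure returned by `pymystem3`'s `Mystem.analyze`, where each recognized word carries an `analysis` list with a lemma (`lex`) and a grammar string (`gr`). A minimal sketch of that shape and of the string each `MorphologicalToken` is saved as; the sample word and the exact tag strings are illustrative, not taken from the dataset:

```python
# Sketch: the Mystem output shape that _process relies on, and the token's saved form.
from pymystem3 import Mystem
from pymorphy2 import MorphAnalyzer

result = Mystem().analyze("мама")
# result is roughly:
# [{'analysis': [{'lex': 'мама', 'gr': 'S,жен,од=им,ед'}], 'text': 'мама'}, {'text': '\n'}]

lemma = result[0]['analysis'][0]['lex']    # 'мама'
tags = result[0]['analysis'][0]['gr']      # e.g. 'S,жен,од=им,ед'
morphy_tags = MorphAnalyzer().parse('мама')[0].tag

# MorphologicalToken.__str__ would render this token as:
print(f'{lemma}<{tags}>({morphy_tags})')   # e.g. мама<S,жен,од=им,ед>(NOUN,anim,femn sing,nomn)
```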
5 changes: 5 additions & 0 deletions requirements.txt
@@ -0,0 +1,5 @@
requests == 2.25.1
beautifulsoup4 == 4.9.3
lxml == 4.6.2
pymystem3
pymorphy2
84 changes: 68 additions & 16 deletions scrapper.py
@@ -2,6 +2,15 @@
Crawler implementation
"""

+import re
+import os
+import json
+import datetime
+import requests
+from bs4 import BeautifulSoup
+from article import Article
+from constants import CRAWLER_CONFIG_PATH, ASSETS_PATH


class IncorrectURLError(Exception):
    """
@@ -32,66 +41,109 @@ class Crawler:
    Crawler implementation
    """
    def __init__(self, seed_urls: list, max_articles: int):
-        pass

-    @staticmethod
-    def _extract_url(article_bs):
-        pass
+        self.search_urls = seed_urls
+        self.max_articles = max_articles
+        self.found_urls = []
+        self.link_pattern = r'/?news-\d+-\d+\.htm'

+    def _extract_url(self, article_bs):
+        links = []
+        for link in article_bs.find_all('a', href=True):
+            potential_link = re.match(self.link_pattern, link['href'])
+            if potential_link:
+                links.append(potential_link.group(0))
+        return links

    def find_articles(self):
        """
        Finds articles
        """
-        pass
+        for url in self.search_urls:
+            request = requests.get(url).content
+            soup = BeautifulSoup(request,
+                                 features='lxml')
+            for article_url in self._extract_url(soup):
+                if len(self.found_urls) != self.max_articles \
+                        and url+article_url not in self.found_urls:
+                    self.found_urls.append(url+article_url)
+        print(f'Found {len(self.found_urls)} links to articles to process')

    def get_search_urls(self):
        """
        Returns seed_urls param
        """
-        pass
+        return self.found_urls


class ArticleParser:
    """
    ArticleParser implementation
    """
    def __init__(self, full_url: str, article_id: int):
-        pass
+        self.article = Article(url=full_url, article_id=article_id)

    def _fill_article_with_text(self, article_soup):
-        pass
+        self.article.text = article_soup.find('dd', class_='text').text

    def _fill_article_with_meta_information(self, article_soup):
-        pass
+        self.article.title = article_soup.find('dd', class_='title').text.strip()
+        self.article.author = 'NOT FOUND'
+        self.article.topics = article_soup.find('span', class_='title_text').find_all('a')[1].text
+        self.article.date = self.unify_date_format(article_soup.find('span', class_='title_data').text[-10:])

    @staticmethod
    def unify_date_format(date_str):
        """
        Unifies date format
        """
-        pass
+        return datetime.datetime.strptime(date_str, "%d.%m.%Y")

    def parse(self):
        """
        Parses each article
        """
-        pass
+        request = requests.get(self.article.url).content
+        soup = BeautifulSoup(request, features='lxml')
+        self._fill_article_with_meta_information(soup)
+        self._fill_article_with_text(soup)
+        self.article.save_raw()


def prepare_environment(base_path):
    """
    Creates ASSETS_PATH folder if not created and removes existing folder
    """
-    pass
+    if not os.path.exists(base_path):
+        os.makedirs(base_path)


def validate_config(crawler_path):
    """
    Validates given config
    """
-    pass
+    with open(crawler_path, 'r', encoding='utf-8') as data:
+        settings = json.load(data)
+    url_pattern = 'https://'

+    for url in settings['base_urls']:
+        if url_pattern not in url:
+            raise IncorrectURLError

+    if not isinstance(settings['total_articles_to_find_and_parse'], int):
+        raise IncorrectNumberOfArticlesError

+    if settings['total_articles_to_find_and_parse'] > 100:
+        raise NumberOfArticlesOutOfRangeError
+    return settings['base_urls'], settings['total_articles_to_find_and_parse']


if __name__ == '__main__':
-    # YOUR CODE HERE
-    pass
+    urls, num_articles = validate_config(CRAWLER_CONFIG_PATH)
+    prepare_environment(ASSETS_PATH)

+    crawler = Crawler(seed_urls=urls, max_articles=num_articles)
+    crawler.find_articles()

+    for _article_id, _article_link in enumerate(crawler.get_search_urls()):
+        parser = ArticleParser(_article_link, _article_id+1)
+        parser.parse()
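As an illustration (not part of the PR) of how `Crawler.link_pattern` filters links: `re.match` anchors at the start of the href, so only relative links of the `news-<id>-<id>.htm` form survive and get prefixed with the seed URL in `find_articles`. The hrefs below are invented:

```python
# Sketch: how the link_pattern filters hrefs; the example hrefs are made up.
import re

link_pattern = r'/?news-\d+-\d+\.htm'
seed = 'https://mordovia-news.ru/'

hrefs = ['news-12-3456.htm', '/news-7-89.htm', 'about.htm', 'https://example.com/news-1-2.htm']
found = [seed + match.group(0)
         for match in (re.match(link_pattern, href) for href in hrefs)
         if match]
print(found)
# ['https://mordovia-news.ru/news-12-3456.htm', 'https://mordovia-news.ru//news-7-89.htm']
# Note the double slash in the second entry: find_articles concatenates url + article_url as-is.
```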
4 changes: 2 additions & 2 deletions target_score.txt
@@ -1,5 +1,5 @@
# Target score for scrapper.py:
-6
+8
Contributor:

Yes, that's a good switch.

Author:

No, I do not think so. It is evident I'd better switch to 4 or 6 later.


# Target score for pipeline.py:
-0
+8