Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dataset Collector #1, Kazyulina Marina - 19FPL1 #42

Open
wants to merge 55 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 49 commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
3261adb
target score change
marina-kaz Feb 28, 2021
f471f4a
completed stages 1 and 2
marina-kaz Mar 8, 2021
742558c
uhm whatever i'm doing here i want to get upded branch that'all i ask…
marina-kaz Mar 9, 2021
dbcaa4d
build for four
marina-kaz Mar 9, 2021
0c35e77
build for four, fixed target score
marina-kaz Mar 9, 2021
8a09da7
build for idk what score
marina-kaz Mar 9, 2021
6fee8b3
fixed linting a little
marina-kaz Mar 9, 2021
aafdefd
major link work
marina-kaz Mar 9, 2021
0cc3a91
added requirements
marina-kaz Mar 9, 2021
e3e8eda
changed target score
marina-kaz Mar 9, 2021
1355f14
i dont understand why test fails
marina-kaz Mar 11, 2021
56fbb38
oooohh
marina-kaz Mar 11, 2021
cfeae30
i have questions
marina-kaz Mar 11, 2021
39e8923
didn't work
marina-kaz Mar 11, 2021
6e5ec03
working on
marina-kaz Mar 11, 2021
4733964
there is no pleasing you
marina-kaz Mar 11, 2021
ead1f17
i'm experimenting
marina-kaz Mar 11, 2021
cf7e97b
added stuff for 10, tests will fail?
marina-kaz Mar 11, 2021
54a49ed
fixed my favorite lint
marina-kaz Mar 11, 2021
2854e58
optimized a few things
marina-kaz Mar 11, 2021
358f74b
improved text formation
marina-kaz Mar 11, 2021
f7aa7f6
uhm
marina-kaz Mar 11, 2021
9f45627
Merge remote-tracking branch 'upstream/main' into HEAD
dmitry-uraev Mar 12, 2021
e75381c
fixed some problems from review
marina-kaz Mar 13, 2021
225e72a
fixed lint
marina-kaz Mar 13, 2021
dc7c08a
fixed config valid
marina-kaz Mar 13, 2021
b147b47
optimized
marina-kaz Mar 13, 2021
d09c9f3
fixed date format
marina-kaz Mar 13, 2021
731448c
removed user interaction and links folder
marina-kaz Mar 15, 2021
fd304d7
removed user interaction and links folder[2]
marina-kaz Mar 15, 2021
d7afc62
Merge remote-tracking branch 'upstream/main' into main
marina-kaz Mar 26, 2021
23182bb
initial commit with the build for 8 and bad lintering
marina-kaz Mar 26, 2021
0ee4d5e
build for 8 with a lot of lintering interface kostyly
marina-kaz Mar 26, 2021
f1a59fa
added features for 10, did not upd tests
marina-kaz Mar 26, 2021
631ec92
updated tests
marina-kaz Mar 26, 2021
21e4fd8
build for 10 except i did not refactor
marina-kaz Mar 26, 2021
1b73cb3
refactor from os to pathlib
marina-kaz Mar 26, 2021
7a7a01f
Merge remote-tracking branch 'upstream/main' into main
marina-kaz Mar 30, 2021
19ff11d
fixed everything
marina-kaz Mar 30, 2021
c30d85a
now everything
marina-kaz Mar 30, 2021
c58526b
i don't understand
marina-kaz Mar 30, 2021
0a9c9fe
fixed regular expression
marina-kaz Mar 30, 2021
db17bf6
added COM to tags
marina-kaz Mar 30, 2021
7ea4755
added requested changes, ready to fight linter
marina-kaz Apr 4, 2021
7e85982
fought linter
marina-kaz Apr 4, 2021
a83469e
fixed dataset validation
marina-kaz Apr 4, 2021
27280a0
adjusted ds validator
marina-kaz Apr 4, 2021
4efc05d
i am sorry for my terrible commit history
marina-kaz Apr 4, 2021
b45ee40
adjusted ds validator
marina-kaz Apr 4, 2021
5c3a245
fixed several drawbacks
marina-kaz Apr 5, 2021
f6c7034
fixed several drawbacks
marina-kaz Apr 5, 2021
b8399cc
fixed several drawbacks
marina-kaz Apr 5, 2021
b8dc298
oh well I noticed smt else
marina-kaz Apr 5, 2021
96f7e8e
turned get articles into authentic getter
marina-kaz Apr 5, 2021
ff570e9
fixed lintering
marina-kaz Apr 5, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions article.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# pylint: disable=R0902
"""
Article implementation
"""
Expand Down Expand Up @@ -29,6 +30,7 @@ def __init__(self, url, article_id):
self.author = ''
self.topics = []
self.text = ''
self.pos_frequencies = {}

def save_raw(self):
"""
Expand All @@ -46,7 +48,7 @@ def save_raw(self):
indent=4,
ensure_ascii=False,
separators=(',', ': '))

demid5111 marked this conversation as resolved.
Show resolved Hide resolved
@staticmethod
def from_meta_json(json_path: str):
"""
Expand All @@ -62,6 +64,7 @@ def from_meta_json(json_path: str):
article.date = date_from_meta(meta.get('date', None))
article.author = meta.get('author', None)
article.topics = meta.get('topics', None)
article.pos_frequencies = meta.get('pos_frequencies', None)

# intentionally leave it empty
article.text = None
Expand Down Expand Up @@ -92,15 +95,16 @@ def _get_meta(self):
'title': self.title,
'date': self._date_to_text(),
'author': self.author,
'topics': self.topics
'topics': self.topics,
'pos_frequencies': self.pos_frequencies
}

def _date_to_text(self):
    """Render the article's publication datetime as 'YYYY-MM-DD HH:MM:SS' text."""
    # format() delegates to datetime.__format__, which applies strftime rules.
    return format(self.date, "%Y-%m-%d %H:%M:%S")

def _get_raw_text_path(self):
"""
Returns path for requested raw article
Expand Down
2 changes: 1 addition & 1 deletion config/student_text_preprocess_score_eight_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from constants import ASSETS_PATH


TAGS = ["A", "ADV", "S", "V", "PR", "ANUM", "CONJ", "SPRO", "APRO", "PART", "NUM", "ADVPRO"]
TAGS = ["A", "ADV", "S", "V", "PR", "ANUM", "CONJ", "SPRO", "APRO", "PART", "NUM", "ADVPRO", "COM"]


class StudentTextPreprocessTest(unittest.TestCase):
Expand Down
6 changes: 6 additions & 0 deletions constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,9 @@
PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__))
ASSETS_PATH = os.path.join(PROJECT_ROOT, 'tmp', 'articles')
demid5111 marked this conversation as resolved.
Show resolved Hide resolved
# Crawler configuration file (seed URLs, article limits) lives at the project root.
CRAWLER_CONFIG_PATH = os.path.join(PROJECT_ROOT, 'crawler_config.json')
# Directory and file where the crawler persists the article links it discovers.
LINKS_STORAGE_DIR = os.path.join(PROJECT_ROOT, 'links')
LINKS_STORAGE_FILE = os.path.join(LINKS_STORAGE_DIR, 'links.txt')

# Site root prepended to the relative hrefs found on seed pages
# (the source site omits the scheme+host in its <a href> attributes).
URL_START = 'https://burunen.ru'
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if it is seed url or any other configuration of the crawler, then use configuration file

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nope, it is the beginning of every url in my source

you see, as in many webpages, the href tags in my source are filled with the continuation of the url, omitting the 'https://burunen.ru', so each and every time i try to request the link i found automatically i have to concatenate it with this beginning first.

i thought about making an additional attribute to the Crawler class (something like self.url_start = ''), but then i decided that if it is constant, it should be placed among other constants... am i wrong?

# Desktop-browser User-Agent sent with every request — presumably to avoid the
# site refusing or altering responses for the default python-requests agent
# (NOTE(review): confirm the site actually requires this).
HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)'
                         ' AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
12 changes: 9 additions & 3 deletions crawler_config.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
{
"base_urls": [],
"total_articles_to_find_and_parse": 0,
"max_number_articles_to_get_from_one_seed": 0
"base_urls": ["https://burunen.ru/news/society/",
"https://burunen.ru/news/culture/",
"https://burunen.ru/news/economy/",
"https://burunen.ru/news/sports/",
"https://burunen.ru/news/incidents/",
"https://burunen.ru/news/politic/"
],
"total_articles_to_find_and_parse": 10,
"max_number_articles_to_get_from_one_seed": 5
}
85 changes: 71 additions & 14 deletions pipeline.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
# pylint: disable=R0903

"""
Pipeline for text processing implementation
"""

from typing import List

from pathlib import Path
from pymorphy2 import MorphAnalyzer
from pymystem3 import Mystem

from article import Article
from constants import ASSETS_PATH


class EmptyDirectoryError(Exception):
"""
Expand All @@ -28,61 +36,110 @@ class MorphologicalToken:
Stores language params for each processed token
"""
def __init__(self, original_word, normalized_form):
    """Keep the surface form, its normal form, and (initially empty) tag slots."""
    self.original = original_word
    self.normalized = normalized_form
    # Tag attributes are filled in later by the pipeline; '' means "not tagged yet".
    self.mystem_tags = ''
    self.pymorphy_tags = ''

def __str__(self):
    """Serialize the token as 'normalized<mystem tags>(pymorphy tags)'."""
    # str.format stringifies pymorphy's tag object the same way str() does.
    return '{}<{}>({})'.format(self.normalized, self.mystem_tags, self.pymorphy_tags)


class CorpusManager:
    """
    Works with articles and stores them.

    The dataset directory is scanned lazily: the first call to get_articles()
    registers every '<id>_raw.txt' entry, subsequent calls reuse the cached
    storage. (The original rescanned on every call — flagged in review as
    "it cannot be in the plain getter".)
    """
    def __init__(self, path_to_raw_txt_data: str):
        self.path_to_raw = path_to_raw_txt_data
        self._storage = {}

    def _scan_dataset(self):
        """
        Register each dataset entry: one Article per '<id>_raw.txt' file.
        """
        # glob() matches the file-name template directly instead of
        # filtering iterdir() output by suffix (reviewer suggestion).
        # NOTE(review): scans ASSETS_PATH rather than self.path_to_raw,
        # as in the original — confirm these are always the same directory.
        for file in Path(ASSETS_PATH).glob('*_raw.txt'):
            index = file.name.split('_raw.txt')[0]
            self._storage[index] = Article(url=None, article_id=int(index))

    def get_articles(self):
        """
        Returns the article storage, scanning the dataset on first access only.
        """
        if not self._storage:
            self._scan_dataset()
        return self._storage


class TextProcessingPipeline:
    """
    Process articles from corpus manager: tokenize, normalize and tag each text.
    """
    def __init__(self, corpus_manager: CorpusManager):
        self.corpus = corpus_manager
        self.current_raw_text = ''

    def run(self):
        """
        Runs pipeline process scenario: processes every article's raw text
        and saves the space-joined token representation.
        """
        for article in self.corpus.get_articles().values():
            self.current_raw_text = article.get_raw_text()
            tokens = self._process()
            article.save_processed(' '.join(map(str, tokens)))

    def _process(self) -> List[MorphologicalToken]:
        """
        Performs processing of the current text.

        Returns a MorphologicalToken per alphabetic word. Instead of the
        original broad try/except IndexError (flagged in review), mystem's
        'analysis' list is checked explicitly: unknown words fall back to the
        surface form, taking pymorphy tags only when pymorphy recognises them.
        """
        mystem = Mystem()
        pymorphy = MorphAnalyzer()
        tokens = []
        for word in mystem.analyze(self.current_raw_text):
            original = word['text'].strip()
            if not original.isalpha():
                continue
            analyses = word.get('analysis')
            if analyses:
                token = MorphologicalToken(original_word=original,
                                           normalized_form=analyses[0]['lex'])
                token.mystem_tags = analyses[0]['gr'].strip()
                token.pymorphy_tags = pymorphy.parse(original)[0].tag
            else:
                token = MorphologicalToken(original_word=original,
                                           normalized_form=original)
                # Parse once and reuse (the original parsed twice in this branch).
                parsed_tag = pymorphy.parse(original)[0].tag
                if str(parsed_tag) != 'UNKN':
                    token.pymorphy_tags = parsed_tag
            tokens.append(token)
        return tokens


def validate_dataset(path_to_validate):
    """
    Validates folder with assets.

    Raises:
        FileNotFoundError: the path does not exist.
        NotADirectoryError: the path exists but is not a directory.
        EmptyDirectoryError: the directory contains no files at all.
        InconsistentDatasetError: raw texts and meta files do not match 1:1.
    """
    path = Path(path_to_validate)
    if not path.exists():
        raise FileNotFoundError
    if not path.is_dir():
        raise NotADirectoryError
    if not list(path.iterdir()):
        raise EmptyDirectoryError
    # BUGFIX: in the original the names were swapped ('metas' held *_raw.txt
    # files and vice versa) — harmless for the symmetric check, but misleading.
    # Comparing the sorted id lists covers both the count and the id match.
    raw_ids = sorted(int(file.name.split('_')[0]) for file in path.glob('*_raw.txt'))
    meta_ids = sorted(int(file.name.split('_')[0]) for file in path.glob('*_meta.json'))
    if raw_ids != meta_ids:
        raise InconsistentDatasetError


def main():
    """Entry point: validate the dataset, then run the text processing pipeline."""
    # Removed the stale placeholder print('Your code goes here') left over
    # from the pre-implementation stub.
    validate_dataset(ASSETS_PATH)
    print('validated dataset')
    corpus_manager = CorpusManager(ASSETS_PATH)
    print('onto processing')
    pipeline = TextProcessingPipeline(corpus_manager=corpus_manager)
    pipeline.run()


if __name__ == "__main__":
Expand Down
49 changes: 47 additions & 2 deletions pos_frequency_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,53 @@
"""
Implementation of POSFrequencyPipeline for score ten only.
"""
import re

from pathlib import Path

from visualizer import visualize
from pipeline import CorpusManager
demid5111 marked this conversation as resolved.
Show resolved Hide resolved

from constants import ASSETS_PATH


class POSFrequencyPipeline:
    """
    Counts POS-tag frequencies for each processed article, saves them into the
    article's meta file and renders a frequency chart image.
    """
    def __init__(self, corpus: CorpusManager):
        self.corpus = corpus
        self.current_article = None

    def run(self):
        """Process every article: count tags, update meta, draw the chart."""
        for article in self.corpus.get_articles().values():
            self.current_article = article
            frequencies = self._count_frequencies()
            self._update_meta(frequencies)
            image_path = Path(ASSETS_PATH) / f'{article.article_id}_image.png'
            visualize(frequencies, image_path)

    def _count_frequencies(self):
        """Return a {tag: count} mapping built from the article's processed text."""
        path = Path(ASSETS_PATH) / f'{self.current_article.article_id}_processed.txt'
        with open(path, encoding='utf-8') as file:
            contents = file.read()
        # Tags are serialized as '<S,...' / '<V=...': grab the leading uppercase run.
        frequencies = {}
        # Single pass instead of the original list.count per tag (O(n^2) -> O(n)).
        for tag in re.findall(r"<([A-Z]+)[,=]?", contents):
            frequencies[tag] = frequencies.get(tag, 0) + 1
        return frequencies

    def _update_meta(self, frequencies):
        """Re-save the article's meta file with the computed POS frequencies."""
        meta_path = Path(ASSETS_PATH) / f'{self.current_article.article_id}_meta.json'
        # from_meta_json is a @staticmethod; it is invoked via the instance
        # so this module does not need to import Article directly.
        article = self.current_article.from_meta_json(meta_path)
        article.pos_frequencies = frequencies
        article.text = article.get_raw_text()
        article.save_raw()


def main():
    """Entry point: build POS-frequency statistics over the whole corpus."""
    visualizer = POSFrequencyPipeline(CorpusManager(ASSETS_PATH))
    visualizer.run()


if __name__ == "__main__":
marina-kaz marked this conversation as resolved.
Show resolved Hide resolved
main()
6 changes: 6 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
beautifulsoup4==4.9.0
pymorphy2==0.9.1
pymystem3==0.2.0
requests==2.23.0
numpy==1.20.1
matplotlib==3.4.0
Loading