From dae378d9a69622a9e0127645232d551e3fbfef8b Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Sat, 6 Mar 2021 14:01:58 +0300 Subject: [PATCH 01/33] ao --- crawler_config.json | 6 +++--- requirements.txt | 3 +++ scrapper.py | 19 +++++++++++++++++-- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/crawler_config.json b/crawler_config.json index e60ce0f7..419c0303 100644 --- a/crawler_config.json +++ b/crawler_config.json @@ -1,5 +1,5 @@ { - "base_urls": [], - "total_articles_to_find_and_parse": 0, - "max_number_articles_to_get_from_one_seed": 0 + "base_urls": ["https://express-kamchatka1.ru/sobytiya.html"], + "total_articles_to_find_and_parse": 5, + "max_number_articles_to_get_from_one_seed": 5 } \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index e69de29b..97b9fc20 100644 --- a/requirements.txt +++ b/requirements.txt @@ -0,0 +1,3 @@ +requests==2.25.1 +beautifulsoup4==4.9.3 +lxml==4.6.2 \ No newline at end of file diff --git a/scrapper.py b/scrapper.py index 43aecef5..8f5685a7 100644 --- a/scrapper.py +++ b/scrapper.py @@ -1,7 +1,18 @@ """ Crawler implementation """ - +import article +import json +import os +import random +import re +import requests +import datetime + +from bs4 import BeautifulSoup +from constants import CRAWLER_CONFIG_PATH +from constants import PROJECT_ROOT +from time import sleep class IncorrectURLError(Exception): """ @@ -94,4 +105,8 @@ def validate_config(crawler_path): if __name__ == '__main__': # YOUR CODE HERE - pass + headers = { + 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', + 'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36' + } + response = requests.get('https://express-kamchatka1.ru/sobytiya.html', headers=headers) From 277f0b9cece3d862905ba43f490fd2147a965882 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Thu, 11 Mar 2021 07:57:26 +0300 Subject: [PATCH 02/33] try --- constants.py | 3 ++ crawler_config.json | 4 +- scrapper.py | 104 ++++++++++++++++++++++++++++++++++---------- 3 files changed, 85 insertions(+), 26 deletions(-) diff --git a/constants.py b/constants.py index 12d85256..913a418e 100644 --- a/constants.py +++ b/constants.py @@ -7,3 +7,6 @@ PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__)) ASSETS_PATH = os.path.join(PROJECT_ROOT, 'tmp', 'articles') CRAWLER_CONFIG_PATH = os.path.join(PROJECT_ROOT, 'crawler_config.json') +HEADERS = { + 'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36' + } diff --git a/crawler_config.json b/crawler_config.json index 419c0303..1c4f02db 100644 --- a/crawler_config.json +++ b/crawler_config.json @@ -1,5 +1,5 @@ { "base_urls": ["https://express-kamchatka1.ru/sobytiya.html"], - "total_articles_to_find_and_parse": 5, - "max_number_articles_to_get_from_one_seed": 5 + "total_articles_to_find_and_parse": 15, + "max_number_articles_to_get_from_one_seed": 15 } \ No newline at end of file diff --git a/scrapper.py b/scrapper.py index 8f5685a7..677b5e62 100644 --- a/scrapper.py +++ b/scrapper.py @@ -1,18 +1,17 @@ """ Crawler implementation """ -import article import json import os -import random -import re import requests -import datetime +from datetime import datetime from bs4 import BeautifulSoup +from article import Article from constants import 
CRAWLER_CONFIG_PATH -from constants import PROJECT_ROOT -from time import sleep +from constants import HEADERS +from urllib.parse import urlparse + class IncorrectURLError(Exception): """ @@ -42,24 +41,53 @@ class Crawler: """ Crawler implementation """ - def __init__(self, seed_urls: list, max_articles: int): - pass + def __init__(self, seed_urls: list, max_articles: int, max_articles_per_seed: int): + self.seed_urls = seed_urls + self.total_max_articles = max_articles + self.max_articles_per_seed = max_articles_per_seed + + self.urls = [] @staticmethod def _extract_url(article_bs): - pass + articles = article_bs.find_all('div', {'itemprop': 'blogPost'}) + current_seed_links = [] + for blog_tag in articles: + article_name_tag = blog_tag.find('h2', {'itemprop': 'name'}) + article_link = article_name_tag.a + current_seed_links.append(article_link.attrs['href']) + return current_seed_links def find_articles(self): """ Finds articles """ - pass + for url in self.seed_urls: + url_parsed = urlparse(url) + url_scheme, url_domain = url_parsed.scheme, url_parsed.netloc + url_base = '{}://{}'.format(url_scheme, url_domain) + + # Change user-agent to avoid 403 error + response = requests.get(url, headers=HEADERS) # make a request to seed url + if response: + content = response.text + links = self._extract_url(BeautifulSoup(content, 'html.parser')) + full_links = [] + + for link in links: + if link.startswith('/'): + full_links.append(url_base + link) + else: + full_links.append(link) + + self.urls.extend(full_links[:max_articles_per_seed]) + assert len(self.urls) >= self.total_max_articles def get_search_urls(self): """ Returns seed_urls param """ - pass + return self.seed_urls class ArticleParser: @@ -67,46 +95,74 @@ class ArticleParser: ArticleParser implementation """ def __init__(self, full_url: str, article_id: int): - pass + self.full_url = full_url + self.article_id = article_id + self.article = Article(full_url, article_id) def _fill_article_with_text(self, article_soup): - pass + self.article.text = article_soup.find("div", class_="leading-0").text def _fill_article_with_meta_information(self, article_soup): - pass + self.article.title = article_soup.find('div', class_="page-header").text.strip() + self.article.views = article_soup.find('div', class_="hits").find('meta').text + self.article.date = self.unify_date_format(article_soup.find('div', class_="create").find('time').text) + self.article.author = 'NOT FOUND' @staticmethod def unify_date_format(date_str): """ Unifies date format """ - pass + return datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S") def parse(self): """ Parses each article """ - pass + response = requests.get(self.full_url, headers=HEADERS) + article_soup = BeautifulSoup(response.content, features='lxml') + self._fill_article_with_text(article_soup) + self._fill_article_with_meta_information(article_soup) + self.article.save_raw() def prepare_environment(base_path): """ Creates ASSETS_PATH folder if not created and removes existing folder """ - pass + if not os.path.isdir(base_path): + os.makedirs(base_path) def validate_config(crawler_path): """ Validates given config """ - pass + with open(crawler_path, 'r', encoding='utf-8') as config: + params = json.load(config) + + if 'base_urls' not in params or not all([isinstance(url, str) for url in params['base_urls']]): + raise IncorrectURLError + + if params['total_articles_to_find_and_parse'] > 100: + raise NumberOfArticlesOutOfRangeError + + if not isinstance(params['total_articles_to_find_and_parse'], int): + raise 
IncorrectNumberOfArticlesError + + return params['base_urls'], params['total_articles_to_find_and_parse'], params['total_articles_to_find_and_parse'] if __name__ == '__main__': - # YOUR CODE HERE - headers = { - 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', - 'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36' - } - response = requests.get('https://express-kamchatka1.ru/sobytiya.html', headers=headers) + #YOUR CODE HERE + try: + seed_urls, max_articles, max_articles_per_seed = validate_config(CRAWLER_CONFIG_PATH) + crawler = Crawler(seed_urls=seed_urls, + max_articles=max_articles, + max_articles_per_seed=max_articles_per_seed) + crawler.find_articles() + + for i, url in enumerate(crawler.urls): + parser = ArticleParser(full_url=url, article_id=i) + except (IncorrectURLError, IncorrectNumberOfArticlesError, NumberOfArticlesOutOfRangeError, UnknownConfigError): + exit(1) From 4782ec2dd693d1cb047c5f48ae2228aeb02f1f32 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Thu, 11 Mar 2021 08:15:28 +0300 Subject: [PATCH 03/33] maybe... --- scrapper.py | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/scrapper.py b/scrapper.py index 677b5e62..ae58c0ea 100644 --- a/scrapper.py +++ b/scrapper.py @@ -138,19 +138,32 @@ def validate_config(crawler_path): """ Validates given config """ - with open(crawler_path, 'r', encoding='utf-8') as config: - params = json.load(config) - - if 'base_urls' not in params or not all([isinstance(url, str) for url in params['base_urls']]): - raise IncorrectURLError - - if params['total_articles_to_find_and_parse'] > 100: - raise NumberOfArticlesOutOfRangeError - - if not isinstance(params['total_articles_to_find_and_parse'], int): - raise IncorrectNumberOfArticlesError - - return params['base_urls'], params['total_articles_to_find_and_parse'], params['total_articles_to_find_and_parse'] + try: + with open(crawler_path, 'r', encoding='utf-8') as config: + params = json.load(config) + + seed_urls = params.get('base_urls') + max_articles = params.get('total_articles_to_find_and_parse') + max_articles_per_seed = params.get('max_number_articles_to_get_from_one_seed') + + if not isinstance(seed_urls, list): + raise IncorrectURLError + for url in seed_urls: + if not isinstance(url, str) or not url.startswith('http'): + raise IncorrectURLError + + if not isinstance(max_articles, int) or max_articles < 0: + raise IncorrectNumberOfArticlesError + + if not isinstance(max_articles_per_seed, int) or max_articles_per_seed > max_articles: + raise NumberOfArticlesOutOfRangeError + + except(IncorrectURLError, IncorrectNumberOfArticlesError, NumberOfArticlesOutOfRangeError) as error: + raise error + except: + raise UnknownConfigError + else: + return seed_urls, max_articles, max_articles_per_seed if __name__ == '__main__': From 5eb76689bde500811491bfcb8120032961ab0b8c Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Thu, 11 Mar 2021 09:03:01 +0300 Subject: [PATCH 04/33] please --- scrapper.py | 33 +++++++++++++++++++-------------- target_score.txt | 2 +- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/scrapper.py b/scrapper.py index ae58c0ea..2632b700 100644 --- a/scrapper.py +++ b/scrapper.py @@ -4,13 +4,18 @@ import json import os import requests +import random +import 
shutil from datetime import datetime from bs4 import BeautifulSoup from article import Article from constants import CRAWLER_CONFIG_PATH from constants import HEADERS +from constants import ASSETS_PATH +from constants import PROJECT_ROOT from urllib.parse import urlparse +from time import sleep class IncorrectURLError(Exception): @@ -130,9 +135,9 @@ def prepare_environment(base_path): """ Creates ASSETS_PATH folder if not created and removes existing folder """ - if not os.path.isdir(base_path): - os.makedirs(base_path) - + if os.path.exists(os.path.join(base_path, 'tmp', 'articles')): + shutil.rmtree(os.path.join(base_path, 'tmp', 'articles')) + os.makedirs(os.path.join(base_path, 'tmp', 'articles')) def validate_config(crawler_path): """ @@ -168,14 +173,14 @@ def validate_config(crawler_path): if __name__ == '__main__': #YOUR CODE HERE - try: - seed_urls, max_articles, max_articles_per_seed = validate_config(CRAWLER_CONFIG_PATH) - crawler = Crawler(seed_urls=seed_urls, - max_articles=max_articles, - max_articles_per_seed=max_articles_per_seed) - crawler.find_articles() - - for i, url in enumerate(crawler.urls): - parser = ArticleParser(full_url=url, article_id=i) - except (IncorrectURLError, IncorrectNumberOfArticlesError, NumberOfArticlesOutOfRangeError, UnknownConfigError): - exit(1) + seed_urls, max_articles, max_articles_per_seed = validate_config(CRAWLER_CONFIG_PATH) + crawler = Crawler(seed_urls=seed_urls, + max_articles=max_articles, + max_articles_per_seed=max_articles_per_seed) + crawler.find_articles() + prepare_environment(PROJECT_ROOT) + for i, url in enumerate(crawler.urls): + parser = ArticleParser(full_url=url, article_id=i) + sleep(random.randint(2, 5)) + articles = parser.parse() + articles.save_raw() \ No newline at end of file diff --git a/target_score.txt b/target_score.txt index dd08e182..3ee0e68c 100644 --- a/target_score.txt +++ b/target_score.txt @@ -1,5 +1,5 @@ # Target score for scrapper.py: -6 +8 # Target score for pipeline.py: 0 From e8c318a79ad1c6ea7861e7906f88dc72a7940d87 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Thu, 11 Mar 2021 09:05:02 +0300 Subject: [PATCH 05/33] op,otkat --- target_score.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target_score.txt b/target_score.txt index 3ee0e68c..dd08e182 100644 --- a/target_score.txt +++ b/target_score.txt @@ -1,5 +1,5 @@ # Target score for scrapper.py: -8 +6 # Target score for pipeline.py: 0 From 425c4808d7c06151494e0c1dc7974f6bcb04aa3d Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Thu, 11 Mar 2021 09:21:24 +0300 Subject: [PATCH 06/33] moya popitka nomer pyat --- scrapper.py | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/scrapper.py b/scrapper.py index 2632b700..eae7faec 100644 --- a/scrapper.py +++ b/scrapper.py @@ -147,28 +147,22 @@ def validate_config(crawler_path): with open(crawler_path, 'r', encoding='utf-8') as config: params = json.load(config) - seed_urls = params.get('base_urls') - max_articles = params.get('total_articles_to_find_and_parse') - max_articles_per_seed = params.get('max_number_articles_to_get_from_one_seed') - - if not isinstance(seed_urls, list): + if 'base_urls' not in params or not isinstance(params['base_urls'], list) or \ + not all([isinstance(link, str) for link in params['base_urls']]): raise IncorrectURLError - for url in seed_urls: - if not isinstance(url, str) or not url.startswith('http'): - raise IncorrectURLError - if 
not isinstance(max_articles, int) or max_articles < 0: + if 'max_number_articles_to_get_from_one_seed' not in params or \ + not isinstance(params['max_number_articles_to_get_from_one_seed'], int) or \ + 'total_articles_to_find_and_parse' not in params or \ + not isinstance(params['total_articles_to_find_and_parse'], int): raise IncorrectNumberOfArticlesError - if not isinstance(max_articles_per_seed, int) or max_articles_per_seed > max_articles: + if 'total_articles_to_find_and_parse' in params and \ + isinstance(params['total_articles_to_find_and_parse'], int) and \ + params['total_articles_to_find_and_parse'] > 100: raise NumberOfArticlesOutOfRangeError - except(IncorrectURLError, IncorrectNumberOfArticlesError, NumberOfArticlesOutOfRangeError) as error: - raise error - except: - raise UnknownConfigError - else: - return seed_urls, max_articles, max_articles_per_seed + return params['base_urls'], params['total_articles_to_find_and_parse'], params['max_number_articles_to_get_from_one_seed'] if __name__ == '__main__': @@ -181,6 +175,6 @@ def validate_config(crawler_path): prepare_environment(PROJECT_ROOT) for i, url in enumerate(crawler.urls): parser = ArticleParser(full_url=url, article_id=i) - sleep(random.randint(2, 5)) articles = parser.parse() - articles.save_raw() \ No newline at end of file + articles.save_raw() + sleep(random.randint(2, 5)) \ No newline at end of file From 739dcee42f266c0086adc8ae84a403789ae704fd Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Thu, 11 Mar 2021 15:18:08 +0300 Subject: [PATCH 07/33] meow' --- scrapper.py | 49 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/scrapper.py b/scrapper.py index eae7faec..4294583f 100644 --- a/scrapper.py +++ b/scrapper.py @@ -4,8 +4,6 @@ import json import os import requests -import random -import shutil from datetime import datetime from bs4 import BeautifulSoup @@ -13,9 +11,7 @@ from constants import CRAWLER_CONFIG_PATH from constants import HEADERS from constants import ASSETS_PATH -from constants import PROJECT_ROOT from urllib.parse import urlparse -from time import sleep class IncorrectURLError(Exception): @@ -72,8 +68,7 @@ def find_articles(self): url_scheme, url_domain = url_parsed.scheme, url_parsed.netloc url_base = '{}://{}'.format(url_scheme, url_domain) - # Change user-agent to avoid 403 error - response = requests.get(url, headers=HEADERS) # make a request to seed url + response = requests.get(url, headers=HEADERS) if response: content = response.text links = self._extract_url(BeautifulSoup(content, 'html.parser')) @@ -88,6 +83,7 @@ def find_articles(self): self.urls.extend(full_links[:max_articles_per_seed]) assert len(self.urls) >= self.total_max_articles + def get_search_urls(self): """ Returns seed_urls param @@ -108,7 +104,7 @@ def _fill_article_with_text(self, article_soup): self.article.text = article_soup.find("div", class_="leading-0").text def _fill_article_with_meta_information(self, article_soup): - self.article.title = article_soup.find('div', class_="page-header").text.strip() + self.article.title = article_soup.find('div', class_="page-header").find('h2').find('a').text.strip() self.article.views = article_soup.find('div', class_="hits").find('meta').text self.article.date = self.unify_date_format(article_soup.find('div', class_="create").find('time').text) self.article.author = 'NOT FOUND' @@ -118,7 +114,7 @@ def unify_date_format(date_str): """ Unifies date format """ - return 
datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S") + return datetime.strptime(date_str, "%Y-%m-%d") def parse(self): """ @@ -135,9 +131,8 @@ def prepare_environment(base_path): """ Creates ASSETS_PATH folder if not created and removes existing folder """ - if os.path.exists(os.path.join(base_path, 'tmp', 'articles')): - shutil.rmtree(os.path.join(base_path, 'tmp', 'articles')) - os.makedirs(os.path.join(base_path, 'tmp', 'articles')) + if not os.path.exists(os.path.join(base_path, 'tmp', 'articles')): + os.makedirs(os.path.join(base_path, 'tmp', 'articles')) def validate_config(crawler_path): """ @@ -147,22 +142,28 @@ def validate_config(crawler_path): with open(crawler_path, 'r', encoding='utf-8') as config: params = json.load(config) - if 'base_urls' not in params or not isinstance(params['base_urls'], list) or \ - not all([isinstance(link, str) for link in params['base_urls']]): + seed_urls = params.get('base_urls') + max_articles = params.get('total_articles_to_find_and_parse') + max_articles_per_seed = params.get('max_number_articles_to_get_from_one_seed') + + if not isinstance(seed_urls, list): raise IncorrectURLError + for url in seed_urls: + if not isinstance(url, str) or not url.startswith('http'): + raise IncorrectURLError - if 'max_number_articles_to_get_from_one_seed' not in params or \ - not isinstance(params['max_number_articles_to_get_from_one_seed'], int) or \ - 'total_articles_to_find_and_parse' not in params or \ - not isinstance(params['total_articles_to_find_and_parse'], int): + if not isinstance(max_articles, int) or max_articles < 0: raise IncorrectNumberOfArticlesError - if 'total_articles_to_find_and_parse' in params and \ - isinstance(params['total_articles_to_find_and_parse'], int) and \ - params['total_articles_to_find_and_parse'] > 100: + if not isinstance(max_articles_per_seed, int) or max_articles_per_seed > max_articles: raise NumberOfArticlesOutOfRangeError - return params['base_urls'], params['total_articles_to_find_and_parse'], params['max_number_articles_to_get_from_one_seed'] + except(IncorrectURLError, IncorrectNumberOfArticlesError, NumberOfArticlesOutOfRangeError) as error: + raise error + except: + raise UnknownConfigError + else: + return seed_urls, max_articles, max_articles_per_seed if __name__ == '__main__': @@ -172,9 +173,7 @@ def validate_config(crawler_path): max_articles=max_articles, max_articles_per_seed=max_articles_per_seed) crawler.find_articles() - prepare_environment(PROJECT_ROOT) + prepare_environment(ASSETS_PATH) for i, url in enumerate(crawler.urls): parser = ArticleParser(full_url=url, article_id=i) - articles = parser.parse() - articles.save_raw() - sleep(random.randint(2, 5)) \ No newline at end of file + parser.parse() From 6d5aa100951d58f0276a3fa3efbe10531feb63cf Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Thu, 11 Mar 2021 15:32:52 +0300 Subject: [PATCH 08/33] fuki-mazfuki --- article.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/article.py b/article.py index 1d759cd2..515df527 100644 --- a/article.py +++ b/article.py @@ -46,7 +46,7 @@ def save_raw(self): indent=4, ensure_ascii=False, separators=(',', ': ')) - + + @staticmethod def from_meta_json(json_path: str): """ @@ -94,13 +94,13 @@ def _get_meta(self): 'author': self.author, 'topics': self.topics } - ++ def _date_to_text(self): """ Converts datetime object to text """ return self.date.strftime("%Y-%m-%d %H:%M:%S") - ++ def _get_raw_text_path(self): """ Returns path for requested raw article From 
d174872920a0ba82451c7f704065de9a3e20e370 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Thu, 11 Mar 2021 15:36:20 +0300 Subject: [PATCH 09/33] uzhas --- article.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/article.py b/article.py index 515df527..718a3b3b 100644 --- a/article.py +++ b/article.py @@ -46,7 +46,7 @@ def save_raw(self): indent=4, ensure_ascii=False, separators=(',', ': ')) - + + @staticmethod def from_meta_json(json_path: str): """ @@ -94,13 +94,13 @@ def _get_meta(self): 'author': self.author, 'topics': self.topics } -+ + def _date_to_text(self): """ Converts datetime object to text """ return self.date.strftime("%Y-%m-%d %H:%M:%S") -+ + def _get_raw_text_path(self): """ Returns path for requested raw article From a6a5e24d16a391d5f5c6eba3b6b27c4d01fd8a1b Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Thu, 11 Mar 2021 17:12:13 +0300 Subject: [PATCH 10/33] 'puk' --- constants.py | 2 +- scrapper.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/constants.py b/constants.py index 913a418e..b3b9ba89 100644 --- a/constants.py +++ b/constants.py @@ -8,5 +8,5 @@ ASSETS_PATH = os.path.join(PROJECT_ROOT, 'tmp', 'articles') CRAWLER_CONFIG_PATH = os.path.join(PROJECT_ROOT, 'crawler_config.json') HEADERS = { - 'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36' + 'user-agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36' } diff --git a/scrapper.py b/scrapper.py index 4294583f..19babfbf 100644 --- a/scrapper.py +++ b/scrapper.py @@ -101,12 +101,12 @@ def __init__(self, full_url: str, article_id: int): self.article = Article(full_url, article_id) def _fill_article_with_text(self, article_soup): - self.article.text = article_soup.find("div", class_="leading-0").text + self.article.text = article_soup.find("div", class_="item-page").text def _fill_article_with_meta_information(self, article_soup): - self.article.title = article_soup.find('div', class_="page-header").find('h2').find('a').text.strip() - self.article.views = article_soup.find('div', class_="hits").find('meta').text - self.article.date = self.unify_date_format(article_soup.find('div', class_="create").find('time').text) + self.article.title = article_soup.find('div', class_="page-header").find('h2').text + self.article.views = article_soup.find('dd', class_="hits").find('meta').text + self.article.date = self.unify_date_format(article_soup.find('dd', class_="create").find('time').text) self.article.author = 'NOT FOUND' @staticmethod @@ -173,6 +173,7 @@ def validate_config(crawler_path): max_articles=max_articles, max_articles_per_seed=max_articles_per_seed) crawler.find_articles() + prepare_environment(ASSETS_PATH) for i, url in enumerate(crawler.urls): parser = ArticleParser(full_url=url, article_id=i) From 747f0ec4e28f338f3ea2e9be9abbaec568aa3dfb Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Fri, 12 Mar 2021 23:14:23 +0300 Subject: [PATCH 11/33] zzz --- article.py | 3 +++ scrapper.py | 45 ++++++++++++++++++--------------------------- 2 files changed, 21 insertions(+), 27 deletions(-) diff --git a/article.py b/article.py index 718a3b3b..75f33b0c 100644 --- a/article.py +++ b/article.py @@ -47,6 +47,7 @@ def save_raw(self): ensure_ascii=False, separators=(',', ': ')) + @staticmethod def 
from_meta_json(json_path: str): """ @@ -95,12 +96,14 @@ def _get_meta(self): 'topics': self.topics } + def _date_to_text(self): """ Converts datetime object to text """ return self.date.strftime("%Y-%m-%d %H:%M:%S") + def _get_raw_text_path(self): """ Returns path for requested raw article diff --git a/scrapper.py b/scrapper.py index 19babfbf..e0577767 100644 --- a/scrapper.py +++ b/scrapper.py @@ -11,7 +11,8 @@ from constants import CRAWLER_CONFIG_PATH from constants import HEADERS from constants import ASSETS_PATH -from urllib.parse import urlparse +from time import sleep +import random class IncorrectURLError(Exception): @@ -51,37 +52,24 @@ def __init__(self, seed_urls: list, max_articles: int, max_articles_per_seed: in @staticmethod def _extract_url(article_bs): - articles = article_bs.find_all('div', {'itemprop': 'blogPost'}) - current_seed_links = [] - for blog_tag in articles: - article_name_tag = blog_tag.find('h2', {'itemprop': 'name'}) - article_link = article_name_tag.a - current_seed_links.append(article_link.attrs['href']) - return current_seed_links + article_link = article_bs.find('h2', {'itemprop': 'name'}).find('a').get('href') + return(article_link) def find_articles(self): """ Finds articles """ for url in self.seed_urls: - url_parsed = urlparse(url) - url_scheme, url_domain = url_parsed.scheme, url_parsed.netloc - url_base = '{}://{}'.format(url_scheme, url_domain) - + sleep(random.randint(2, 8)) response = requests.get(url, headers=HEADERS) - if response: - content = response.text - links = self._extract_url(BeautifulSoup(content, 'html.parser')) - full_links = [] - - for link in links: - if link.startswith('/'): - full_links.append(url_base + link) - else: - full_links.append(link) - - self.urls.extend(full_links[:max_articles_per_seed]) - assert len(self.urls) >= self.total_max_articles + if not response: + continue + link = BeautifulSoup(response.content, features='lxml') + articles_soup = link.find_all('li') + for article_bs in articles_soup[:max_articles_per_seed]: + self.urls.append(self._extract_url(article_bs)) + if len(self.urls) == max_articles: + return self.urls def get_search_urls(self): @@ -101,10 +89,13 @@ def __init__(self, full_url: str, article_id: int): self.article = Article(full_url, article_id) def _fill_article_with_text(self, article_soup): - self.article.text = article_soup.find("div", class_="item-page").text + article_text = article_soup.find_all('p') + for par in article_text: + if 'class' not in par.attrs: + self.article.text += par.text.strip() + ' ' def _fill_article_with_meta_information(self, article_soup): - self.article.title = article_soup.find('div', class_="page-header").find('h2').text + self.article.title = article_soup.find('dev',class_='page-header').find('h2').text self.article.views = article_soup.find('dd', class_="hits").find('meta').text self.article.date = self.unify_date_format(article_soup.find('dd', class_="create").find('time').text) self.article.author = 'NOT FOUND' From a25e84a79114a2b5d3126893caa787f4ab4aefdd Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Fri, 12 Mar 2021 23:35:08 +0300 Subject: [PATCH 12/33] no coment' --- scrapper.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scrapper.py b/scrapper.py index e0577767..38b9bd2a 100644 --- a/scrapper.py +++ b/scrapper.py @@ -53,7 +53,7 @@ def __init__(self, seed_urls: list, max_articles: int, max_articles_per_seed: in @staticmethod def _extract_url(article_bs): article_link = article_bs.find('h2', 
{'itemprop': 'name'}).find('a').get('href') - return(article_link) + print(article_link) def find_articles(self): """ @@ -95,7 +95,7 @@ def _fill_article_with_text(self, article_soup): self.article.text += par.text.strip() + ' ' def _fill_article_with_meta_information(self, article_soup): - self.article.title = article_soup.find('dev',class_='page-header').find('h2').text + self.article.title = article_soup.find('div',class_='page-header').find('h2').text self.article.views = article_soup.find('dd', class_="hits").find('meta').text self.article.date = self.unify_date_format(article_soup.find('dd', class_="create").find('time').text) self.article.author = 'NOT FOUND' @@ -146,7 +146,7 @@ def validate_config(crawler_path): if not isinstance(max_articles, int) or max_articles < 0: raise IncorrectNumberOfArticlesError - if not isinstance(max_articles_per_seed, int) or max_articles_per_seed > max_articles: + if max_articles_per_seed > 100: raise NumberOfArticlesOutOfRangeError except(IncorrectURLError, IncorrectNumberOfArticlesError, NumberOfArticlesOutOfRangeError) as error: From 144a74c5c551b6ecfb582c079ff097d6417f99fc Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Fri, 12 Mar 2021 23:40:13 +0300 Subject: [PATCH 13/33] f --- scrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapper.py b/scrapper.py index 38b9bd2a..305a83bb 100644 --- a/scrapper.py +++ b/scrapper.py @@ -146,7 +146,7 @@ def validate_config(crawler_path): if not isinstance(max_articles, int) or max_articles < 0: raise IncorrectNumberOfArticlesError - if max_articles_per_seed > 100: + if not isinstance(max_articles_per_seed,int) or max_articles_per_seed > 100: raise NumberOfArticlesOutOfRangeError except(IncorrectURLError, IncorrectNumberOfArticlesError, NumberOfArticlesOutOfRangeError) as error: From d3382bae2639b5af109c5e03d3d58aef917dafbf Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Sun, 14 Mar 2021 23:04:52 +0300 Subject: [PATCH 14/33] uspeshno? 
--- article.py | 5 +-- constants.py | 3 +- crawler_config.json | 6 ++-- scrapper.py | 75 +++++++++++++++++++++++---------------------- 4 files changed, 45 insertions(+), 44 deletions(-) diff --git a/article.py b/article.py index 75f33b0c..95ea374d 100644 --- a/article.py +++ b/article.py @@ -47,7 +47,6 @@ def save_raw(self): ensure_ascii=False, separators=(',', ': ')) - @staticmethod def from_meta_json(json_path: str): """ @@ -91,19 +90,17 @@ def _get_meta(self): 'id': self.article_id, 'url': self.url, 'title': self.title, - 'date': self._date_to_text(), + 'date': self.date, 'author': self.author, 'topics': self.topics } - def _date_to_text(self): """ Converts datetime object to text """ return self.date.strftime("%Y-%m-%d %H:%M:%S") - def _get_raw_text_path(self): """ Returns path for requested raw article diff --git a/constants.py b/constants.py index b3b9ba89..917cc470 100644 --- a/constants.py +++ b/constants.py @@ -8,5 +8,6 @@ ASSETS_PATH = os.path.join(PROJECT_ROOT, 'tmp', 'articles') CRAWLER_CONFIG_PATH = os.path.join(PROJECT_ROOT, 'crawler_config.json') HEADERS = { - 'user-agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36' + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' + 'Chrome/89.0.4389.82 Safari/537.36' } diff --git a/crawler_config.json b/crawler_config.json index 1c4f02db..e2a71584 100644 --- a/crawler_config.json +++ b/crawler_config.json @@ -1,5 +1,5 @@ { - "base_urls": ["https://express-kamchatka1.ru/sobytiya.html"], - "total_articles_to_find_and_parse": 15, - "max_number_articles_to_get_from_one_seed": 15 + "base_urls": ["https://www.e1.ru/news/"], + "total_articles_to_find_and_parse": 5, + "max_number_articles_to_get_from_one_seed": 10 } \ No newline at end of file diff --git a/scrapper.py b/scrapper.py index 305a83bb..74f9ef8d 100644 --- a/scrapper.py +++ b/scrapper.py @@ -3,16 +3,13 @@ """ import json import os +import random import requests -from datetime import datetime from bs4 import BeautifulSoup from article import Article -from constants import CRAWLER_CONFIG_PATH -from constants import HEADERS -from constants import ASSETS_PATH +from constants import CRAWLER_CONFIG_PATH, HEADERS, ASSETS_PATH from time import sleep -import random class IncorrectURLError(Exception): @@ -47,30 +44,33 @@ def __init__(self, seed_urls: list, max_articles: int, max_articles_per_seed: in self.seed_urls = seed_urls self.total_max_articles = max_articles self.max_articles_per_seed = max_articles_per_seed - self.urls = [] @staticmethod def _extract_url(article_bs): - article_link = article_bs.find('h2', {'itemprop': 'name'}).find('a').get('href') - print(article_link) + url_article = article_bs.find('h2', class_="G3ax").find('a') + article_link = url_article.attrs['href'] + return 'https://www.e1.ru' + article_link def find_articles(self): """ Finds articles """ for url in self.seed_urls: - sleep(random.randint(2, 8)) response = requests.get(url, headers=HEADERS) if not response: - continue - link = BeautifulSoup(response.content, features='lxml') - articles_soup = link.find_all('li') - for article_bs in articles_soup[:max_articles_per_seed]: - self.urls.append(self._extract_url(article_bs)) - if len(self.urls) == max_articles: - return self.urls - + raise IncorrectURLError + if response.status_code == 200: + sleep(random.randrange(2, 6)) + response.encoding = 'utf-8' + page_soup = BeautifulSoup(response.content, features='lxml') + article_soup = 
page_soup.find_all('article', class_="G3aj7") + for article in article_soup: + seed_url = self._extract_url(article) + self.urls.append(seed_url) + if len(self.urls) <= max_articles and article not in self.urls: + seed_url = self._extract_url(article) + self.urls.append(seed_url) def get_search_urls(self): """ @@ -89,41 +89,44 @@ def __init__(self, full_url: str, article_id: int): self.article = Article(full_url, article_id) def _fill_article_with_text(self, article_soup): - article_text = article_soup.find_all('p') + article_text = article_soup.find('div', class_="F-af3").find_all('p') for par in article_text: - if 'class' not in par.attrs: - self.article.text += par.text.strip() + ' ' + self.article.text += par.text.strip() + '\n' def _fill_article_with_meta_information(self, article_soup): - self.article.title = article_soup.find('div',class_='page-header').find('h2').text - self.article.views = article_soup.find('dd', class_="hits").find('meta').text - self.article.date = self.unify_date_format(article_soup.find('dd', class_="create").find('time').text) + self.article.title = article_soup.find('h2', {'itemprop': 'headline'}).find('span').text + self.article.annotation = article_soup.find('p', class_="CLpj JZaj-").find('span').text self.article.author = 'NOT FOUND' + self.article.date = article_soup.find('time', class_="G-k1").find('a').text.strip() @staticmethod def unify_date_format(date_str): """ Unifies date format """ - return datetime.strptime(date_str, "%Y-%m-%d") + pass def parse(self): """ Parses each article """ response = requests.get(self.full_url, headers=HEADERS) - article_soup = BeautifulSoup(response.content, features='lxml') + if not response: + raise IncorrectURLError + + article_soup = BeautifulSoup(response.text, 'lxml') self._fill_article_with_text(article_soup) self._fill_article_with_meta_information(article_soup) - self.article.save_raw() + return self.article def prepare_environment(base_path): """ Creates ASSETS_PATH folder if not created and removes existing folder """ - if not os.path.exists(os.path.join(base_path, 'tmp', 'articles')): - os.makedirs(os.path.join(base_path, 'tmp', 'articles')) + if not os.path.exists(base_path): + os.makedirs(base_path) + def validate_config(crawler_path): """ @@ -146,13 +149,11 @@ def validate_config(crawler_path): if not isinstance(max_articles, int) or max_articles < 0: raise IncorrectNumberOfArticlesError - if not isinstance(max_articles_per_seed,int) or max_articles_per_seed > 100: + if not isinstance(max_articles_per_seed, int) or max_articles_per_seed > 100: raise NumberOfArticlesOutOfRangeError except(IncorrectURLError, IncorrectNumberOfArticlesError, NumberOfArticlesOutOfRangeError) as error: raise error - except: - raise UnknownConfigError else: return seed_urls, max_articles, max_articles_per_seed @@ -163,9 +164,11 @@ def validate_config(crawler_path): crawler = Crawler(seed_urls=seed_urls, max_articles=max_articles, max_articles_per_seed=max_articles_per_seed) - crawler.find_articles() - + art = crawler.find_articles() + print(art) prepare_environment(ASSETS_PATH) - for i, url in enumerate(crawler.urls): - parser = ArticleParser(full_url=url, article_id=i) - parser.parse() + for article_id, article_url in enumerate(crawler.urls): + parser = ArticleParser(article_url, article_id+1) + article = parser.parse() + article.save_raw() + sleep((random.randrange(2, 6))) From 57a712d4b64409a56dd1c35ff41ea75a60cf56a0 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Sun, 14 Mar 2021 
23:21:22 +0300 Subject: [PATCH 15/33] uspeh --- crawler_config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawler_config.json b/crawler_config.json index e2a71584..1a34d92c 100644 --- a/crawler_config.json +++ b/crawler_config.json @@ -1,5 +1,5 @@ { "base_urls": ["https://www.e1.ru/news/"], "total_articles_to_find_and_parse": 5, - "max_number_articles_to_get_from_one_seed": 10 + "max_number_articles_to_get_from_one_seed": 15 } \ No newline at end of file From 89ff700276052b6981f8299b3c5ae476fbaf3a8c Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Sun, 14 Mar 2021 23:28:21 +0300 Subject: [PATCH 16/33] test --- crawler_config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawler_config.json b/crawler_config.json index 1a34d92c..e2a71584 100644 --- a/crawler_config.json +++ b/crawler_config.json @@ -1,5 +1,5 @@ { "base_urls": ["https://www.e1.ru/news/"], "total_articles_to_find_and_parse": 5, - "max_number_articles_to_get_from_one_seed": 15 + "max_number_articles_to_get_from_one_seed": 10 } \ No newline at end of file From 3d3c72d9020788c7d4d8887bd4a2bdeacf493611 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Mon, 15 Mar 2021 00:29:34 +0300 Subject: [PATCH 17/33] popitaemsya snova --- crawler_config.json | 6 +++--- scrapper.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/crawler_config.json b/crawler_config.json index e2a71584..94524902 100644 --- a/crawler_config.json +++ b/crawler_config.json @@ -1,5 +1,5 @@ { - "base_urls": ["https://www.e1.ru/news/"], - "total_articles_to_find_and_parse": 5, - "max_number_articles_to_get_from_one_seed": 10 + "base_urls": ["https://www.e1.ru/news/"], + "total_articles_to_find_and_parse": 6, + "max_number_articles_to_get_from_one_seed": 10 } \ No newline at end of file diff --git a/scrapper.py b/scrapper.py index 74f9ef8d..b971a6ff 100644 --- a/scrapper.py +++ b/scrapper.py @@ -97,6 +97,7 @@ def _fill_article_with_meta_information(self, article_soup): self.article.title = article_soup.find('h2', {'itemprop': 'headline'}).find('span').text self.article.annotation = article_soup.find('p', class_="CLpj JZaj-").find('span').text self.article.author = 'NOT FOUND' + self.article.topics = article_soup.find('a', class_="CLpx CLrt JZak9").find('span').text self.article.date = article_soup.find('time', class_="G-k1").find('a').text.strip() @staticmethod From 9647f8274fba7b0e8b23c6116601364f71d6847a Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Mon, 15 Mar 2021 11:33:42 +0300 Subject: [PATCH 18/33] n --- scrapper.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scrapper.py b/scrapper.py index b971a6ff..d1eb1685 100644 --- a/scrapper.py +++ b/scrapper.py @@ -165,11 +165,9 @@ def validate_config(crawler_path): crawler = Crawler(seed_urls=seed_urls, max_articles=max_articles, max_articles_per_seed=max_articles_per_seed) - art = crawler.find_articles() - print(art) + crawler.find_articles() prepare_environment(ASSETS_PATH) for article_id, article_url in enumerate(crawler.urls): - parser = ArticleParser(article_url, article_id+1) + parser = ArticleParser(article_url, article_id + 1) article = parser.parse() article.save_raw() - sleep((random.randrange(2, 6))) From 1a9d86c3358a8ca4aa90380e62874a297b41e710 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Mon, 15 Mar 2021 11:36:33 +0300 Subject: [PATCH 19/33] g --- 
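Note (outside the diff that follows): this commit re-adds a randomized pause after each article is parsed and saved, so successive requests to the news site are spaced out instead of fired back-to-back. A minimal, standalone sketch of that throttling idea is given below; the URL list and user-agent string are placeholders for illustration, not values taken from the project.

    import random
    from time import sleep

    import requests

    HEADERS = {'user-agent': 'Mozilla/5.0'}  # placeholder UA string, not the project's real header

    def fetch_politely(urls):
        """Download pages one by one, pausing 2-5 seconds between requests."""
        pages = []
        for url in urls:
            response = requests.get(url, headers=HEADERS)
            if response.ok:  # keep only successful responses
                pages.append(response.text)
            sleep(random.randrange(2, 6))  # same delay range the commit uses
        return pages

    # Example: fetch_politely(['https://example.com/page1', 'https://example.com/page2'])
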
scrapper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scrapper.py b/scrapper.py index d1eb1685..2453b6ed 100644 --- a/scrapper.py +++ b/scrapper.py @@ -168,6 +168,7 @@ def validate_config(crawler_path): crawler.find_articles() prepare_environment(ASSETS_PATH) for article_id, article_url in enumerate(crawler.urls): - parser = ArticleParser(article_url, article_id + 1) + parser = ArticleParser(article_url, article_id+1) article = parser.parse() article.save_raw() + sleep((random.randrange(2, 6))) From a3ac3f1087bd301f9a1e6ced4b6935f5728a72b8 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Sun, 21 Mar 2021 19:38:16 +0300 Subject: [PATCH 20/33] p --- constants.py | 4 ++-- crawler_config.json | 2 +- scrapper.py | 14 ++++++-------- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/constants.py b/constants.py index 917cc470..a6b20ca9 100644 --- a/constants.py +++ b/constants.py @@ -8,6 +8,6 @@ ASSETS_PATH = os.path.join(PROJECT_ROOT, 'tmp', 'articles') CRAWLER_CONFIG_PATH = os.path.join(PROJECT_ROOT, 'crawler_config.json') HEADERS = { - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' - 'Chrome/89.0.4389.82 Safari/537.36' + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' + 'Chrome/88.0.4324.152 YaBrowser/21.2.3.100 Yowser/2.5 Safari/537.36' } diff --git a/crawler_config.json b/crawler_config.json index 94524902..6e532dee 100644 --- a/crawler_config.json +++ b/crawler_config.json @@ -1,5 +1,5 @@ { "base_urls": ["https://www.e1.ru/news/"], - "total_articles_to_find_and_parse": 6, + "total_articles_to_find_and_parse": 5, "max_number_articles_to_get_from_one_seed": 10 } \ No newline at end of file diff --git a/scrapper.py b/scrapper.py index 2453b6ed..ecf59c25 100644 --- a/scrapper.py +++ b/scrapper.py @@ -48,8 +48,7 @@ def __init__(self, seed_urls: list, max_articles: int, max_articles_per_seed: in @staticmethod def _extract_url(article_bs): - url_article = article_bs.find('h2', class_="G3ax").find('a') - article_link = url_article.attrs['href'] + article_link = article_bs.find('h2', class_="G3ax").find('a').get('href') return 'https://www.e1.ru' + article_link def find_articles(self): @@ -64,7 +63,7 @@ def find_articles(self): sleep(random.randrange(2, 6)) response.encoding = 'utf-8' page_soup = BeautifulSoup(response.content, features='lxml') - article_soup = page_soup.find_all('article', class_="G3aj7") + article_soup = page_soup.find_all('article', class_="G3ajx") for article in article_soup: seed_url = self._extract_url(article) self.urls.append(seed_url) @@ -94,11 +93,11 @@ def _fill_article_with_text(self, article_soup): self.article.text += par.text.strip() + '\n' def _fill_article_with_meta_information(self, article_soup): - self.article.title = article_soup.find('h2', {'itemprop': 'headline'}).find('span').text - self.article.annotation = article_soup.find('p', class_="CLpj JZaj-").find('span').text + self.article.title = article_soup.find('h2', class_="CVq3 CVtb KHax").find('span').text + self.article.annotation = article_soup.find('p', class_="CVq- KHaj1").find('span').text self.article.author = 'NOT FOUND' - self.article.topics = article_soup.find('a', class_="CLpx CLrt JZak9").find('span').text - self.article.date = article_soup.find('time', class_="G-k1").find('a').text.strip() + self.article.topics = article_soup.find('a', class_="CVrn CVtj KHak5").find('span').text + self.article.date = 
article_soup.find('time', class_="HDkz").find('a').text @staticmethod def unify_date_format(date_str): @@ -171,4 +170,3 @@ def validate_config(crawler_path): parser = ArticleParser(article_url, article_id+1) article = parser.parse() article.save_raw() - sleep((random.randrange(2, 6))) From ee1e49697cfb0c2b3a0589ddb907bea9c8be1d50 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Sun, 21 Mar 2021 19:56:38 +0300 Subject: [PATCH 21/33] pp --- scrapper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scrapper.py b/scrapper.py index ecf59c25..155c7296 100644 --- a/scrapper.py +++ b/scrapper.py @@ -48,7 +48,7 @@ def __init__(self, seed_urls: list, max_articles: int, max_articles_per_seed: in @staticmethod def _extract_url(article_bs): - article_link = article_bs.find('h2', class_="G3ax").find('a').get('href') + article_link = article_bs.find('h2', class_="G1ax").find('a').get('href') return 'https://www.e1.ru' + article_link def find_articles(self): @@ -63,7 +63,7 @@ def find_articles(self): sleep(random.randrange(2, 6)) response.encoding = 'utf-8' page_soup = BeautifulSoup(response.content, features='lxml') - article_soup = page_soup.find_all('article', class_="G3ajx") + article_soup = page_soup.find_all('article', class_="G1ajx") for article in article_soup: seed_url = self._extract_url(article) self.urls.append(seed_url) @@ -166,7 +166,7 @@ def validate_config(crawler_path): max_articles_per_seed=max_articles_per_seed) crawler.find_articles() prepare_environment(ASSETS_PATH) - for article_id, article_url in enumerate(crawler.urls): - parser = ArticleParser(article_url, article_id+1) + for article_id, article_url in enumerate(crawler.urls, 1): + parser = ArticleParser(article_url, article_id) article = parser.parse() article.save_raw() From e651fd02dcba9a489acac7a6299253a0085fbc2f Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Sun, 21 Mar 2021 21:09:03 +0300 Subject: [PATCH 22/33] goo --- scrapper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scrapper.py b/scrapper.py index 155c7296..e30e5643 100644 --- a/scrapper.py +++ b/scrapper.py @@ -64,12 +64,12 @@ def find_articles(self): response.encoding = 'utf-8' page_soup = BeautifulSoup(response.content, features='lxml') article_soup = page_soup.find_all('article', class_="G1ajx") - for article in article_soup: + for article in article_soup[:max_articles_per_seed]: seed_url = self._extract_url(article) self.urls.append(seed_url) - if len(self.urls) <= max_articles and article not in self.urls: - seed_url = self._extract_url(article) - self.urls.append(seed_url) + if len(self.urls) == max_articles: + return self.urls + def get_search_urls(self): """ From e4368959bffba5989d7d545d99955e6acf86f37e Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Thu, 1 Apr 2021 17:28:30 +0300 Subject: [PATCH 23/33] start pipe --- constants.py | 4 +- crawler_config.json | 2 +- pipeline.py | 114 +++++++++++++++++++++++++++++--------------- requirements.txt | 5 +- scrapper.py | 14 +++--- 5 files changed, 88 insertions(+), 51 deletions(-) diff --git a/constants.py b/constants.py index a6b20ca9..6d9ad653 100644 --- a/constants.py +++ b/constants.py @@ -8,6 +8,6 @@ ASSETS_PATH = os.path.join(PROJECT_ROOT, 'tmp', 'articles') CRAWLER_CONFIG_PATH = os.path.join(PROJECT_ROOT, 'crawler_config.json') HEADERS = { - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 
- 'Chrome/88.0.4324.152 YaBrowser/21.2.3.100 Yowser/2.5 Safari/537.36' + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' + 'Chrome/88.0.4324.182 YaBrowser/21.2.4.165 Yowser/2.5 Safari/537.36' } diff --git a/crawler_config.json b/crawler_config.json index 6e532dee..70338ee4 100644 --- a/crawler_config.json +++ b/crawler_config.json @@ -1,5 +1,5 @@ { "base_urls": ["https://www.e1.ru/news/"], "total_articles_to_find_and_parse": 5, - "max_number_articles_to_get_from_one_seed": 10 + "max_number_articles_to_get_from_one_seed": 5 } \ No newline at end of file diff --git a/pipeline.py b/pipeline.py index b6847326..69fd8bf6 100644 --- a/pipeline.py +++ b/pipeline.py @@ -1,15 +1,27 @@ """ Pipeline for text processing implementation """ +from pymystem3 import Mystem +from typing import List +from pathlib import Path +from article import Article +from constants import ASSETS_PATH -class ArticleNotFoundError(Exception): + +class EmptyDirectoryError(Exception): """ Custom error """ -class EmptyDirectoryError(Exception): +class InconsistentDatasetError(Exception): + """ + Custom error + """ + + +class UnknownDatasetError(Exception): """ Custom error """ @@ -19,80 +31,106 @@ class MorphologicalToken: """ Stores language params for each processed token """ - def __init__(self, normalized_form, tags, original_word): - pass - - def to_text(self): - """ - Converts instance to str format - """ - pass + def __init__(self, normalized_form, original_word): + self.original_word = original_word + self.normalized_form = normalized_form + self.mystem_tags = '' def __str__(self): - pass + return f"{self.normalized_form}<{self.mystem_tags}>" class CorpusManager: """ Works with articles and stores them """ + def __init__(self, path_to_raw_txt_data: str): - pass + self.path_to_raw_txt_date = path_to_raw_txt_data + self._storage = {} - def get_articles_meta(self): - """ - Gets article metadata - """ - pass + self._scan_dataset() - def get_raw_text(self, text_id): + def _scan_dataset(self): """ - Opens processed text + Register each dataset entry """ - pass + for file in Path(self.path_to_raw_txt_date).glob('*_raw.txt'): + id = str(file).split('\\')[-1].split('_')[0] + self._storage[id] = Article(url=None, article_id=id) - def write_processed_text(self, text_id, processed_text): + def get_articles(self): """ - Writes processed text + Returns storage params """ - pass + return self._storage class TextProcessingPipeline: """ Process articles from corpus manager """ + def __init__(self, corpus_manager: CorpusManager): - pass + self.corpus_manager = corpus_manager + self._text = '' def run(self): """ Runs pipeline process scenario """ - pass + for article in self.corpus_manager.get_articles().values(): + self.text_ = article.get_raw_text() + processed_text = list(map(str, self._process())) + article.save_processed(' '.join(processed_text)) - @staticmethod - def normalize_and_tag_text(text) -> str: + def _process(self) -> List[type(MorphologicalToken)]: """ - Processes each token and creates MorphToken class instance + Performs processing of each text """ - pass + text = self.text_ + result = Mystem().analyze(text) + tokens = [] - @staticmethod - def transform_tokens_to_text(tokens: list) -> str: - """ - Transforms given list of tokens to str - """ - pass + for word in result: + try: + token = MorphologicalToken(original_word=word['text'], normalized_form=word['analysis'][0]['lex']) + token.mystem_tags = word['analysis'][0]['gr'] + except (IndexError, KeyError): + if not 
word['text'].isnumeric(): + continue + token = MorphologicalToken(original_word=word['text'], normalized_form=word['text']) + tokens.append(token) -def validate_given_path(path_to_validate): + return tokens + + +def validate_dataset(path_to_validate): """ Validates folder with assets """ - pass + path = Path(path_to_validate) + + if not path.exists(): + raise FileNotFoundError + + if not path.is_dir(): + raise NotADirectoryError + + if not list(path.iterdir()): + raise EmptyDirectoryError + + +def main(): + validate_dataset(ASSETS_PATH) + + corpus_manager = CorpusManager(path_to_raw_txt_data=ASSETS_PATH) + pipeline = TextProcessingPipeline(corpus_manager=corpus_manager) + + pipeline.run() if __name__ == "__main__": # YOUR CODE HERE - pass + main() diff --git a/requirements.txt b/requirements.txt index 97b9fc20..7cc4f073 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ -requests==2.25.1 beautifulsoup4==4.9.3 -lxml==4.6.2 \ No newline at end of file +lxml==4.6.2 +pymystem3==0.2.0 +requests==2.25.1 \ No newline at end of file diff --git a/scrapper.py b/scrapper.py index e30e5643..f90e25db 100644 --- a/scrapper.py +++ b/scrapper.py @@ -48,7 +48,7 @@ def __init__(self, seed_urls: list, max_articles: int, max_articles_per_seed: in @staticmethod def _extract_url(article_bs): - article_link = article_bs.find('h2', class_="G1ax").find('a').get('href') + article_link = article_bs.find('h2', class_="G3ad").find('a').get('href') return 'https://www.e1.ru' + article_link def find_articles(self): @@ -63,14 +63,13 @@ def find_articles(self): sleep(random.randrange(2, 6)) response.encoding = 'utf-8' page_soup = BeautifulSoup(response.content, features='lxml') - article_soup = page_soup.find_all('article', class_="G1ajx") + article_soup = page_soup.find_all('article', class_="G3aj-") for article in article_soup[:max_articles_per_seed]: seed_url = self._extract_url(article) self.urls.append(seed_url) if len(self.urls) == max_articles: return self.urls - def get_search_urls(self): """ Returns seed_urls param @@ -88,16 +87,15 @@ def __init__(self, full_url: str, article_id: int): self.article = Article(full_url, article_id) def _fill_article_with_text(self, article_soup): - article_text = article_soup.find('div', class_="F-af3").find_all('p') + article_text = article_soup.find('div', class_="GDagz").find('div').find_all('p') for par in article_text: self.article.text += par.text.strip() + '\n' def _fill_article_with_meta_information(self, article_soup): - self.article.title = article_soup.find('h2', class_="CVq3 CVtb KHax").find('span').text - self.article.annotation = article_soup.find('p', class_="CVq- KHaj1").find('span').text + self.article.title = article_soup.find('h2', class_="C7r1 C7t- KBad").find('span').text self.article.author = 'NOT FOUND' - self.article.topics = article_soup.find('a', class_="CVrn CVtj KHak5").find('span').text - self.article.date = article_soup.find('time', class_="HDkz").find('a').text + self.article.topics = article_soup.find('a', class_="C7sl C7uh KBal9").find('span').text + self.article.date = article_soup.find('time', class_="HDk-").find('a').text @staticmethod def unify_date_format(date_str): From ea8cbaeb40f051686f1c5b87f053f3ab6ada218a Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Thu, 1 Apr 2021 17:59:20 +0300 Subject: [PATCH 24/33] target score 6 --- pipeline.py | 1 - target_score.txt | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index 69fd8bf6..4458a408 100644 
--- a/pipeline.py +++ b/pipeline.py @@ -4,7 +4,6 @@ from pymystem3 import Mystem from typing import List from pathlib import Path - from article import Article from constants import ASSETS_PATH diff --git a/target_score.txt b/target_score.txt index dd08e182..a7013d3a 100644 --- a/target_score.txt +++ b/target_score.txt @@ -2,4 +2,4 @@ 6 # Target score for pipeline.py: -0 +6 From 0411522f5d0bed25b6a4176f778d0a432c626379 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Fri, 2 Apr 2021 15:11:34 +0300 Subject: [PATCH 25/33] pymorphy try --- constants.py | 2 +- pipeline.py | 22 +++++++++++----------- requirements.txt | 1 + scrapper.py | 12 ++++++------ target_score.txt | 2 +- 5 files changed, 20 insertions(+), 19 deletions(-) diff --git a/constants.py b/constants.py index 6d9ad653..22208549 100644 --- a/constants.py +++ b/constants.py @@ -9,5 +9,5 @@ CRAWLER_CONFIG_PATH = os.path.join(PROJECT_ROOT, 'crawler_config.json') HEADERS = { 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' - 'Chrome/88.0.4324.182 YaBrowser/21.2.4.165 Yowser/2.5 Safari/537.36' + 'Chrome/89.0.4389.90 Safari/537.36' } diff --git a/pipeline.py b/pipeline.py index 4458a408..11be3ef1 100644 --- a/pipeline.py +++ b/pipeline.py @@ -1,6 +1,7 @@ """ Pipeline for text processing implementation """ +from pymorphy2 import MorphAnalyzer from pymystem3 import Mystem from typing import List from pathlib import Path @@ -34,9 +35,10 @@ def __init__(self, normalized_form, original_word): self.original_word = original_word self.normalized_form = normalized_form self.mystem_tags = '' + self.pymorphy_tags = '' def __str__(self): - return f"{self.normalized_form}<{self.mystem_tags}>" + return f"{self.normalized_form}<{self.mystem_tags}>({self.pymorphy_tags})" class CorpusManager: @@ -47,14 +49,13 @@ class CorpusManager: def __init__(self, path_to_raw_txt_data: str): self.path_to_raw_txt_date = path_to_raw_txt_data self._storage = {} - self._scan_dataset() def _scan_dataset(self): """ Register each dataset entry """ - for file in Path(self.path_to_raw_txt_date).glob('*_raw.txt'): + for file in Path(self.path_to_raw_txt_date).rglob('*_raw.txt'): id = str(file).split('\\')[-1].split('_')[0] self._storage[id] = Article(url=None, article_id=id) @@ -72,14 +73,14 @@ class TextProcessingPipeline: def __init__(self, corpus_manager: CorpusManager): self.corpus_manager = corpus_manager - self._text = '' + self.raw_text = '' def run(self): """ Runs pipeline process scenario """ for article in self.corpus_manager.get_articles().values(): - self.text_ = article.get_raw_text() + self.raw_text = article.get_raw_text() processed_text = list(map(str, self._process())) article.save_processed(' '.join(processed_text)) @@ -87,22 +88,21 @@ def _process(self) -> List[type(MorphologicalToken)]: """ Performs processing of each text """ - text = self.text_ - result = Mystem().analyze(text) + result = Mystem().analyze(self.raw_text) tokens = [] for word in result: try: token = MorphologicalToken(original_word=word['text'], normalized_form=word['analysis'][0]['lex']) token.mystem_tags = word['analysis'][0]['gr'] + tokens.append(token) except (IndexError, KeyError): if not word['text'].isnumeric(): continue - token = MorphologicalToken(original_word=word['text'], normalized_form=word['text']) - - tokens.append(token) + for token in tokens: + token.pymorphy_tags = MorphAnalyzer().parse(token.original_word)[0].tag - return tokens + return tokens def validate_dataset(path_to_validate): 
diff --git a/requirements.txt b/requirements.txt index 7cc4f073..06ccd482 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ beautifulsoup4==4.9.3 lxml==4.6.2 +pymorphy2==0.9.1 pymystem3==0.2.0 requests==2.25.1 \ No newline at end of file diff --git a/scrapper.py b/scrapper.py index f90e25db..436b02e2 100644 --- a/scrapper.py +++ b/scrapper.py @@ -48,7 +48,7 @@ def __init__(self, seed_urls: list, max_articles: int, max_articles_per_seed: in @staticmethod def _extract_url(article_bs): - article_link = article_bs.find('h2', class_="G3ad").find('a').get('href') + article_link = article_bs.find('h2', class_="G9ax").find('a').get('href') return 'https://www.e1.ru' + article_link def find_articles(self): @@ -63,7 +63,7 @@ def find_articles(self): sleep(random.randrange(2, 6)) response.encoding = 'utf-8' page_soup = BeautifulSoup(response.content, features='lxml') - article_soup = page_soup.find_all('article', class_="G3aj-") + article_soup = page_soup.find_all('article', class_="G9alp") for article in article_soup[:max_articles_per_seed]: seed_url = self._extract_url(article) self.urls.append(seed_url) @@ -87,15 +87,15 @@ def __init__(self, full_url: str, article_id: int): self.article = Article(full_url, article_id) def _fill_article_with_text(self, article_soup): - article_text = article_soup.find('div', class_="GDagz").find('div').find_all('p') + article_text = article_soup.find('div', class_="GFahz").find('div').find_all('p') for par in article_text: self.article.text += par.text.strip() + '\n' def _fill_article_with_meta_information(self, article_soup): - self.article.title = article_soup.find('h2', class_="C7r1 C7t- KBad").find('span').text + self.article.title = article_soup.find('h2', class_="CRqd CRsn JPax").find('span').text self.article.author = 'NOT FOUND' - self.article.topics = article_soup.find('a', class_="C7sl C7uh KBal9").find('span').text - self.article.date = article_soup.find('time', class_="HDk-").find('a').text + self.article.topics = article_soup.find('a', class_="CRqz CRsv JPall").find('span').text + self.article.date = article_soup.find('time', class_="HHkz").find('a').text @staticmethod def unify_date_format(date_str): diff --git a/target_score.txt b/target_score.txt index a7013d3a..72ecf9ea 100644 --- a/target_score.txt +++ b/target_score.txt @@ -2,4 +2,4 @@ 6 # Target score for pipeline.py: -6 +8 From d3b073ca322a2956165c2d1c9166a731b4fe5a1b Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Fri, 2 Apr 2021 15:22:15 +0300 Subject: [PATCH 26/33] fix lint --- pipeline.py | 12 ++++++++---- scrapper.py | 5 ++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pipeline.py b/pipeline.py index 11be3ef1..59e185c5 100644 --- a/pipeline.py +++ b/pipeline.py @@ -1,10 +1,11 @@ """ Pipeline for text processing implementation """ + +from pathlib import Path +from typing import List from pymorphy2 import MorphAnalyzer from pymystem3 import Mystem -from typing import List -from pathlib import Path from article import Article from constants import ASSETS_PATH @@ -56,8 +57,8 @@ def _scan_dataset(self): Register each dataset entry """ for file in Path(self.path_to_raw_txt_date).rglob('*_raw.txt'): - id = str(file).split('\\')[-1].split('_')[0] - self._storage[id] = Article(url=None, article_id=id) + id_each = str(file).split('\\')[-1].split('_')[0] + self._storage[id] = Article(url=None, article_id=id_each) def get_articles(self): """ @@ -104,6 +105,9 @@ def _process(self) -> List[type(MorphologicalToken)]: return 
tokens + def public_method(self): + pass + def validate_dataset(path_to_validate): """ diff --git a/scrapper.py b/scrapper.py index 436b02e2..36538745 100644 --- a/scrapper.py +++ b/scrapper.py @@ -1,15 +1,14 @@ """ Crawler implementation """ -import json import os +import json import random +from time import sleep import requests - from bs4 import BeautifulSoup from article import Article from constants import CRAWLER_CONFIG_PATH, HEADERS, ASSETS_PATH -from time import sleep class IncorrectURLError(Exception): From 70e0459ab62a796402281d637e8c7277d70210d6 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Fri, 2 Apr 2021 15:25:24 +0300 Subject: [PATCH 27/33] pls --- pipeline.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pipeline.py b/pipeline.py index 59e185c5..0bbe5fe9 100644 --- a/pipeline.py +++ b/pipeline.py @@ -41,6 +41,9 @@ def __init__(self, normalized_form, original_word): def __str__(self): return f"{self.normalized_form}<{self.mystem_tags}>({self.pymorphy_tags})" + def public_method(self): + pass + class CorpusManager: """ @@ -66,6 +69,9 @@ def get_articles(self): """ return self._storage + def public_method(self): + pass + class TextProcessingPipeline: """ From 0f7d4a9e1d24ad5f50f023809ca2c097fc15c512 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Fri, 2 Apr 2021 15:37:58 +0300 Subject: [PATCH 28/33] maybe --- scrapper.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/scrapper.py b/scrapper.py index 36538745..ba2bc933 100644 --- a/scrapper.py +++ b/scrapper.py @@ -63,10 +63,10 @@ def find_articles(self): response.encoding = 'utf-8' page_soup = BeautifulSoup(response.content, features='lxml') article_soup = page_soup.find_all('article', class_="G9alp") - for article in article_soup[:max_articles_per_seed]: - seed_url = self._extract_url(article) + for articles in article_soup[:max_num_per_seed]: + seed_url = self._extract_url(articles) self.urls.append(seed_url) - if len(self.urls) == max_articles: + if len(self.urls) == max_num_articles: return self.urls def get_search_urls(self): @@ -83,7 +83,7 @@ class ArticleParser: def __init__(self, full_url: str, article_id: int): self.full_url = full_url self.article_id = article_id - self.article = Article(full_url, article_id) + self.article = Article(url=full_url, article_id=article_id) def _fill_article_with_text(self, article_soup): article_text = article_soup.find('div', class_="GFahz").find('div').find_all('p') @@ -156,11 +156,11 @@ def validate_config(crawler_path): if __name__ == '__main__': - #YOUR CODE HERE - seed_urls, max_articles, max_articles_per_seed = validate_config(CRAWLER_CONFIG_PATH) - crawler = Crawler(seed_urls=seed_urls, - max_articles=max_articles, - max_articles_per_seed=max_articles_per_seed) + # YOUR CODE HERE + seed_urls_list, max_num_articles, max_num_per_seed = validate_config(CRAWLER_CONFIG_PATH) + crawler = Crawler(seed_urls=seed_urls_list, + max_articles=max_num_articles, + max_articles_per_seed=max_num_per_seed) crawler.find_articles() prepare_environment(ASSETS_PATH) for article_id, article_url in enumerate(crawler.urls, 1): From 77ec68052874a3577aec0dd30c43f496220de735 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Fri, 2 Apr 2021 15:40:10 +0300 Subject: [PATCH 29/33] pofig --- target_score.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target_score.txt b/target_score.txt index 72ecf9ea..a7013d3a 100644 --- 
a/target_score.txt +++ b/target_score.txt @@ -2,4 +2,4 @@ 6 # Target score for pipeline.py: -8 +6 From b225c89b8df1d0e7cc35d512e4f5d16b9674b136 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Fri, 2 Apr 2021 15:47:39 +0300 Subject: [PATCH 30/33] fix scrapper lint --- scrapper.py | 4 ++-- target_score.txt | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scrapper.py b/scrapper.py index ba2bc933..8b8eb905 100644 --- a/scrapper.py +++ b/scrapper.py @@ -163,7 +163,7 @@ def validate_config(crawler_path): max_articles_per_seed=max_num_per_seed) crawler.find_articles() prepare_environment(ASSETS_PATH) - for article_id, article_url in enumerate(crawler.urls, 1): - parser = ArticleParser(article_url, article_id) + for article_id_num, article_url in enumerate(crawler.urls, 1): + parser = ArticleParser(full_url=article_url, article_id=article_id_num) article = parser.parse() article.save_raw() diff --git a/target_score.txt b/target_score.txt index a7013d3a..6b09f939 100644 --- a/target_score.txt +++ b/target_score.txt @@ -1,5 +1,5 @@ # Target score for scrapper.py: -6 +8 # Target score for pipeline.py: -6 +8 From 5725b05eefda3d76ef97a0cb07ab260f0354ad28 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Fri, 2 Apr 2021 16:10:49 +0300 Subject: [PATCH 31/33] may be --- scrapper.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/scrapper.py b/scrapper.py index 8b8eb905..1dbb69b0 100644 --- a/scrapper.py +++ b/scrapper.py @@ -3,8 +3,6 @@ """ import os import json -import random -from time import sleep import requests from bs4 import BeautifulSoup from article import Article @@ -58,16 +56,19 @@ def find_articles(self): response = requests.get(url, headers=HEADERS) if not response: raise IncorrectURLError - if response.status_code == 200: - sleep(random.randrange(2, 6)) - response.encoding = 'utf-8' + page_soup = BeautifulSoup(response.content, features='lxml') article_soup = page_soup.find_all('article', class_="G9alp") + for articles in article_soup[:max_num_per_seed]: seed_url = self._extract_url(articles) self.urls.append(seed_url) + if len(self.urls) == max_num_articles: - return self.urls + break + + if len(self.urls) == max_num_articles: + break def get_search_urls(self): """ From c9f1c59e6fe1fd5b751222b5d2dd352414c07172 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Fri, 2 Apr 2021 16:26:21 +0300 Subject: [PATCH 32/33] pp --- target_score.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target_score.txt b/target_score.txt index 6b09f939..72ecf9ea 100644 --- a/target_score.txt +++ b/target_score.txt @@ -1,5 +1,5 @@ # Target score for scrapper.py: -8 +6 # Target score for pipeline.py: 8 From 3b4bdb851624305f2eb3ac72dc58f9a555b24655 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Fri, 2 Apr 2021 17:05:10 +0300 Subject: [PATCH 33/33] is this win? 
--- pipeline.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/pipeline.py b/pipeline.py index 0bbe5fe9..302f9805 100644 --- a/pipeline.py +++ b/pipeline.py @@ -60,8 +60,8 @@ def _scan_dataset(self): Register each dataset entry """ for file in Path(self.path_to_raw_txt_date).rglob('*_raw.txt'): - id_each = str(file).split('\\')[-1].split('_')[0] - self._storage[id] = Article(url=None, article_id=id_each) + id_each = int(file.parts[-1].split('_')[0]) + self._storage[id_each] = Article(url=None, article_id=id_each) def get_articles(self): """ @@ -95,19 +95,15 @@ def _process(self) -> List[type(MorphologicalToken)]: """ Performs processing of each text """ - result = Mystem().analyze(self.raw_text) + process = Mystem().analyze(self.raw_text) tokens = [] - for word in result: - try: - token = MorphologicalToken(original_word=word['text'], normalized_form=word['analysis'][0]['lex']) - token.mystem_tags = word['analysis'][0]['gr'] - tokens.append(token) - except (IndexError, KeyError): - if not word['text'].isnumeric(): - continue - for token in tokens: - token.pymorphy_tags = MorphAnalyzer().parse(token.original_word)[0].tag + for tok in process: + if tok.get('analysis') and tok.get('text'): + morph_token = MorphologicalToken(original_word=tok['text'], normalized_form=tok['analysis'][0]['lex']) + morph_token.mystem_tags = tok['analysis'][0]['gr'] + morph_token.pymorphy_tags = MorphAnalyzer().parse(word=morph_token.original_word)[0].tag + tokens.append(morph_token) return tokens
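
For reference, a minimal standalone sketch of the Mystem + pymorphy2 token processing that the series converges on in this last patch. It assumes pymystem3 (which fetches the mystem binary on first run) and pymorphy2 with its Russian dictionaries are installed; MorphologicalToken below is a simplified stand-in for the class in pipeline.py, and both analyzers are created once up front, since MorphAnalyzer loads its dictionaries at construction and re-creating it for every word, as the patched _process does, is comparatively expensive.

from pymorphy2 import MorphAnalyzer
from pymystem3 import Mystem


class MorphologicalToken:
    """Simplified stand-in for the token class defined in pipeline.py."""

    def __init__(self, original_word, normalized_form):
        self.original_word = original_word
        self.normalized_form = normalized_form
        self.mystem_tags = ''
        self.pymorphy_tags = ''

    def __str__(self):
        return f"{self.normalized_form}<{self.mystem_tags}>({self.pymorphy_tags})"


def process(raw_text):
    mystem = Mystem()           # instantiate once, not per token
    pymorphy = MorphAnalyzer()  # dictionary loading happens here
    tokens = []
    for tok in mystem.analyze(raw_text):
        # Mystem returns dicts; 'analysis' is present and non-empty only for words
        # it could lemmatize, so punctuation and digits are skipped by this check.
        if tok.get('analysis') and tok.get('text'):
            token = MorphologicalToken(original_word=tok['text'],
                                       normalized_form=tok['analysis'][0]['lex'])
            token.mystem_tags = tok['analysis'][0]['gr']
            token.pymorphy_tags = pymorphy.parse(token.original_word)[0].tag
            tokens.append(token)
    return tokens


if __name__ == '__main__':
    for token in process('Мама мыла раму.'):
        print(token)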