Dataset Collector #1, Chudinova Alla - 19FPL2 #37

Open
wants to merge 14 commits into main
article.py: 6 changes (3 additions, 3 deletions)
@@ -46,7 +46,7 @@ def save_raw(self):
                      indent=4,
                      ensure_ascii=False,
                      separators=(',', ': '))

    @staticmethod
    def from_meta_json(json_path: str):
        """
@@ -94,13 +94,13 @@ def _get_meta(self):
            'author': self.author,
            'topics': self.topics
        }

    def _date_to_text(self):
        """
        Converts datetime object to text
        """
        return self.date.strftime("%Y-%m-%d %H:%M:%S")

    def _get_raw_text_path(self):
        """
        Returns path for requested raw article
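
A small aside on the two date formats used in this project: unify_date_format in scrapper.py (further down in this diff) parses dates as "%d.%m.%Y", while _date_to_text above serializes them as "%Y-%m-%d %H:%M:%S". A minimal sketch of that round trip, using a made-up date value:

import datetime

date = datetime.datetime.strptime('01.03.2021', "%d.%m.%Y")  # as parsed by unify_date_format
print(date.strftime("%Y-%m-%d %H:%M:%S"))  # 2021-03-01 00:00:00, the format written by _date_to_text
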
crawler_config.json: 6 changes (3 additions, 3 deletions)
@@ -1,5 +1,5 @@
{
-    "base_urls": [],
-    "total_articles_to_find_and_parse": 0,
-    "max_number_articles_to_get_from_one_seed": 0
+    "base_urls": ["https://mordovia-news.ru/"],
+    "total_articles_to_find_and_parse": 3,
+    "max_number_articles_to_get_from_one_seed": 3
}
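
For reference, a minimal sketch of how these fields are consumed (it mirrors validate_config further down in this diff; the file name and key names are taken from the config above):

import json

with open('crawler_config.json', 'r', encoding='utf-8') as file:
    settings = json.load(file)

seed_urls = settings['base_urls']                            # ["https://mordovia-news.ru/"]
max_articles = settings['total_articles_to_find_and_parse']  # 3
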
requirements.txt: 3 changes (3 additions, 0 deletions)
@@ -0,0 +1,3 @@
+requests == 2.25.1
+beautifulsoup4 == 4.9.3
+lxml == 4.6.2
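
A quick way to check that these pinned versions are the ones importable in the working environment (a sketch; the __version__ attributes are the standard ones exposed by these packages):

import requests
import bs4
from lxml import etree

# Compare the installed versions against the pins in requirements.txt
print(requests.__version__)  # expected 2.25.1
print(bs4.__version__)       # expected 4.9.3
print(etree.__version__)     # expected 4.6.2 (lxml may append a build suffix)
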
scrapper.py: 84 changes (68 additions, 16 deletions)
@@ -2,6 +2,15 @@
Crawler implementation
"""

+import re
+import os
+import json
+import datetime
+import requests
+from bs4 import BeautifulSoup
+from article import Article
+from constants import CRAWLER_CONFIG_PATH, ASSETS_PATH


class IncorrectURLError(Exception):
"""
@@ -32,66 +41,109 @@ class Crawler:
    Crawler implementation
    """
    def __init__(self, seed_urls: list, max_articles: int):
-        pass
-
-    @staticmethod
-    def _extract_url(article_bs):
-        pass
+        self.search_urls = seed_urls
+        self.max_articles = max_articles
+        self.found_urls = []
+        self.link_pattern = r'/?news-\d+-\d+\.htm'
+
+    def _extract_url(self, article_bs):
+        links = []
+        for link in article_bs.find_all('a', href=True):
+            potential_link = re.match(self.link_pattern, link['href'])
+            if potential_link:
+                links.append(potential_link.group(0))
+        return links

    def find_articles(self):
        """
        Finds articles
        """
-        pass
+        for url in self.search_urls:
+            request = requests.get(url).content
+            soup = BeautifulSoup(request,
+                                 features='lxml')
+            for article_url in self._extract_url(soup):
+                if len(self.found_urls) != self.max_articles \
+                        and url+article_url not in self.found_urls:
+                    self.found_urls.append(url+article_url)
+        print(f'Found {len(self.found_urls)} links to articles to process')

    def get_search_urls(self):
        """
        Returns seed_urls param
        """
-        pass
+        return self.found_urls


class ArticleParser:
    """
    ArticleParser implementation
    """
    def __init__(self, full_url: str, article_id: int):
-        pass
+        self.article = Article(url=full_url, article_id=article_id)

    def _fill_article_with_text(self, article_soup):
-        pass
+        self.article.text = article_soup.find('dd', class_='text').text

    def _fill_article_with_meta_information(self, article_soup):
-        pass
+        self.article.title = article_soup.find('dd', class_='title').text.strip()
+        self.article.author = 'NOT FOUND'
+        self.article.topics = article_soup.find('span', class_='title_text').find_all('a')[1].text
+        self.article.date = self.unify_date_format(article_soup.find('span', class_='title_data').text[-10:])

    @staticmethod
    def unify_date_format(date_str):
        """
        Unifies date format
        """
-        pass
+        return datetime.datetime.strptime(date_str, "%d.%m.%Y")

    def parse(self):
        """
        Parses each article
        """
-        pass
+        request = requests.get(self.article.url).content
+        soup = BeautifulSoup(request, features='lxml')
+        self._fill_article_with_meta_information(soup)
+        self._fill_article_with_text(soup)
+        self.article.save_raw()


def prepare_environment(base_path):
    """
    Creates ASSETS_PATH folder if not created and removes existing folder
    """
-    pass
+    if not os.path.exists(base_path):
+        os.makedirs(base_path)


def validate_config(crawler_path):
    """
    Validates given config
    """
-    pass
+    with open(crawler_path, 'r', encoding='utf-8') as data:
+        settings = json.load(data)
+    url_pattern = 'https://'
+
+    for url in settings['base_urls']:
+        if url_pattern not in url:
+            raise IncorrectURLError
+
+    if not isinstance(settings['total_articles_to_find_and_parse'], int):
+        raise IncorrectNumberOfArticlesError
+
+    if settings['total_articles_to_find_and_parse'] > 100:
+        raise NumberOfArticlesOutOfRangeError
+    return settings['base_urls'], settings['total_articles_to_find_and_parse']


if __name__ == '__main__':
-    # YOUR CODE HERE
-    pass
+    urls, num_articles = validate_config(CRAWLER_CONFIG_PATH)
+    prepare_environment(ASSETS_PATH)
+
+    crawler = Crawler(seed_urls=urls, max_articles=num_articles)
+    crawler.find_articles()
+
+    for _article_id, _article_link in enumerate(crawler.get_search_urls()):
+        parser = ArticleParser(_article_link, _article_id+1)
+        parser.parse()
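
To make the link-collection step concrete, a short illustration of what Crawler._extract_url keeps with the pattern r'/?news-\d+-\d+\.htm'; the hrefs are made-up examples, not taken from the site:

import re

link_pattern = r'/?news-\d+-\d+\.htm'

# Hypothetical hrefs, for illustration only
print(re.match(link_pattern, '/news-12345-6.htm'))  # match -> relative link gets collected
print(re.match(link_pattern, 'news-777-1.htm'))     # match -> collected as well
print(re.match(link_pattern, '/about.htm'))         # None -> skipped
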
target_score.txt: 2 changes (1 addition, 1 deletion)
@@ -1,5 +1,5 @@
# Target score for scrapper.py:
-6
+8
Contributor commented:

Yes, that's a good switch.

Author commented:

No, I do not think so. It is evident I'd better switch to 4 or 6 later.


# Target score for pipeline.py:
0