diff --git a/crawler_config.json b/crawler_config.json
index e60ce0f7..d0e4b628 100644
--- a/crawler_config.json
+++ b/crawler_config.json
@@ -1,5 +1,5 @@
 {
-  "base_urls": [],
-  "total_articles_to_find_and_parse": 0,
-  "max_number_articles_to_get_from_one_seed": 0
+  "base_urls": ["http://www.znamyatrud.ru/news-7.html"],
+  "total_articles_to_find_and_parse": 3,
+  "max_number_articles_to_get_from_one_seed": 3
 }
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index e69de29b..79075d23 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+requests==2.25.1
+beautifulsoup4==4.9.3
+lxml==4.6.2
\ No newline at end of file
diff --git a/scrapper.py b/scrapper.py
index 43aecef5..ec32e836 100644
--- a/scrapper.py
+++ b/scrapper.py
@@ -1,7 +1,18 @@
 """
 Crawler implementation
 """
-
+import datetime
+import json
+import os
+import random
+import re
+from time import sleep
+import requests
+from bs4 import BeautifulSoup
+from article import Article
+from constants import ASSETS_PATH
+from constants import CRAWLER_CONFIG_PATH
+HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36'}
 
 class IncorrectURLError(Exception):
     """
@@ -32,23 +43,37 @@ class Crawler:
     """
     Crawler implementation
     """
-    def __init__(self, seed_urls: list, max_articles: int):
-        pass
+    def __init__(self, seed_urls: list, max_articles: int, max_articles_per_seed: int):
+        self.seed_urls = seed_urls
+        self.max_articles = max_articles
+        self.max_articles_per_seed = max_articles_per_seed
+        self.urls = []
 
     @staticmethod
     def _extract_url(article_bs):
-        pass
+        return article_bs.find('a').attrs['href']
 
     def find_articles(self):
         """
        Finds articles
         """
-        pass
+        for seed_url in self.seed_urls:
+            response = requests.get(seed_url, headers=HEADERS)
+            sleep(random.randrange(2, 6))  # pause between requests to avoid hammering the site
+            if not response:  # 4xx/5xx responses evaluate as falsy
+                raise IncorrectURLError
+            b_soup = BeautifulSoup(response.content, features='lxml')
+            article_soup = b_soup.find_all('h1', class_='entry-title')
+            for article_bs in article_soup[:self.max_articles_per_seed]:
+                article_url = self._extract_url(article_bs)
+                if len(self.urls) < self.max_articles and article_url not in self.urls:
+                    self.urls.append(article_url)
+        return self.urls
 
     def get_search_urls(self):
         """
         Returns seed_urls param
         """
-        pass
+        return self.seed_urls
 
 class ArticleParser:
@@ -56,42 +81,71 @@ class ArticleParser:
     ArticleParser implementation
     """
     def __init__(self, full_url: str, article_id: int):
-        pass
+        self.full_url = full_url
+        self.article_id = article_id
+        self.article = Article(full_url, article_id)
 
     def _fill_article_with_text(self, article_soup):
-        pass
+        self.article.text = article_soup.find('div', class_='onemidnew').text
 
     def _fill_article_with_meta_information(self, article_soup):
-        pass
+        self.article.title = article_soup.find('h2', class_='mnname').text.strip()
+        self.article.author = 'NOT FOUND'
+        for topic in article_soup.find_all('a', rel='tag'):
+            self.article.topics.append(topic.text)
+        self.article.date = self.unify_date_format(article_soup.find('div', class_='mndate').text.strip())
 
     @staticmethod
     def unify_date_format(date_str):
         """
         Unifies date format
         """
-        pass
+        return datetime.datetime.strptime(date_str, '%d.%m.%Y')
 
     def parse(self):
         """
         Parses each article
         """
-        pass
+        response = requests.get(self.full_url, headers=HEADERS)
+        article_bs = BeautifulSoup(response.content, 'lxml')
+        self._fill_article_with_text(article_bs)
+        self._fill_article_with_meta_information(article_bs)
+        self.article.save_raw()
+        return self.article
 
 
 def prepare_environment(base_path):
     """
     Creates ASSETS_PATH folder if not created and removes existing folder
     """
-    pass
+    if not os.path.exists(base_path):
+        os.makedirs(base_path)
 
 
 def validate_config(crawler_path):
     """
     Validates given config
     """
-    pass
+    with open(crawler_path, 'r', encoding='utf-8') as file:
+        config = json.load(file)
+    for base_url in config['base_urls']:
+        if not re.match(r'https?://', base_url):  # accept both http and https seed URLs
+            raise IncorrectURLError
+    total_articles = config['total_articles_to_find_and_parse']
+    if not isinstance(total_articles, int):
+        raise IncorrectNumberOfArticlesError
+    if total_articles > 100:
+        raise NumberOfArticlesOutOfRangeError
+    return config['base_urls'], total_articles, \
+        config['max_number_articles_to_get_from_one_seed']
 
 
 if __name__ == '__main__':
     # YOUR CODE HERE
-    pass
+    seed_urls, max_articles, max_articles_per_seed = validate_config(CRAWLER_CONFIG_PATH)
+    prepare_environment(ASSETS_PATH)
+    crawler = Crawler(seed_urls, max_articles, max_articles_per_seed)
+    crawler.find_articles()
+    for i, article_url in enumerate(crawler.urls):
+        parser = ArticleParser(full_url=article_url, article_id=i + 1)
+        parser.parse()
diff --git a/target_score.txt b/target_score.txt
index dd08e182..2675de3c 100644
--- a/target_score.txt
+++ b/target_score.txt
@@ -1,5 +1,8 @@
 # Target score for scrapper.py:
-6
+4
 
 # Target score for pipeline.py:
 0
+
+# Skip pipeline checks:
+1