From dae378d9a69622a9e0127645232d551e3fbfef8b Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Sat, 6 Mar 2021 14:01:58 +0300 Subject: [PATCH 01/33] ao --- crawler_config.json | 6 +++--- requirements.txt | 3 +++ scrapper.py | 19 +++++++++++++++++-- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/crawler_config.json b/crawler_config.json index e60ce0f7..419c0303 100644 --- a/crawler_config.json +++ b/crawler_config.json @@ -1,5 +1,5 @@ { - "base_urls": [], - "total_articles_to_find_and_parse": 0, - "max_number_articles_to_get_from_one_seed": 0 + "base_urls": ["https://express-kamchatka1.ru/sobytiya.html"], + "total_articles_to_find_and_parse": 5, + "max_number_articles_to_get_from_one_seed": 5 } \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index e69de29b..97b9fc20 100644 --- a/requirements.txt +++ b/requirements.txt @@ -0,0 +1,3 @@ +requests==2.25.1 +beautifulsoup4==4.9.3 +lxml==4.6.2 \ No newline at end of file diff --git a/scrapper.py b/scrapper.py index 43aecef5..8f5685a7 100644 --- a/scrapper.py +++ b/scrapper.py @@ -1,7 +1,18 @@ """ Crawler implementation """ - +import article +import json +import os +import random +import re +import requests +import datetime + +from bs4 import BeautifulSoup +from constants import CRAWLER_CONFIG_PATH +from constants import PROJECT_ROOT +from time import sleep class IncorrectURLError(Exception): """ @@ -94,4 +105,8 @@ def validate_config(crawler_path): if __name__ == '__main__': # YOUR CODE HERE - pass + headers = { + 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', + 'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36' + } + response = requests.get('https://express-kamchatka1.ru/sobytiya.html', headers=headers) From 277f0b9cece3d862905ba43f490fd2147a965882 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Thu, 11 Mar 2021 07:57:26 +0300 Subject: [PATCH 02/33] try --- constants.py | 3 ++ crawler_config.json | 4 +- scrapper.py | 104 ++++++++++++++++++++++++++++++++++---------- 3 files changed, 85 insertions(+), 26 deletions(-) diff --git a/constants.py b/constants.py index 12d85256..913a418e 100644 --- a/constants.py +++ b/constants.py @@ -7,3 +7,6 @@ PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__)) ASSETS_PATH = os.path.join(PROJECT_ROOT, 'tmp', 'articles') CRAWLER_CONFIG_PATH = os.path.join(PROJECT_ROOT, 'crawler_config.json') +HEADERS = { + 'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36' + } diff --git a/crawler_config.json b/crawler_config.json index 419c0303..1c4f02db 100644 --- a/crawler_config.json +++ b/crawler_config.json @@ -1,5 +1,5 @@ { "base_urls": ["https://express-kamchatka1.ru/sobytiya.html"], - "total_articles_to_find_and_parse": 5, - "max_number_articles_to_get_from_one_seed": 5 + "total_articles_to_find_and_parse": 15, + "max_number_articles_to_get_from_one_seed": 15 } \ No newline at end of file diff --git a/scrapper.py b/scrapper.py index 8f5685a7..677b5e62 100644 --- a/scrapper.py +++ b/scrapper.py @@ -1,18 +1,17 @@ """ Crawler implementation """ -import article import json import os -import random -import re import requests -import datetime +from datetime import datetime from bs4 import BeautifulSoup +from article import Article from constants import 
CRAWLER_CONFIG_PATH -from constants import PROJECT_ROOT -from time import sleep +from constants import HEADERS +from urllib.parse import urlparse + class IncorrectURLError(Exception): """ @@ -42,24 +41,53 @@ class Crawler: """ Crawler implementation """ - def __init__(self, seed_urls: list, max_articles: int): - pass + def __init__(self, seed_urls: list, max_articles: int, max_articles_per_seed: int): + self.seed_urls = seed_urls + self.total_max_articles = max_articles + self.max_articles_per_seed = max_articles_per_seed + + self.urls = [] @staticmethod def _extract_url(article_bs): - pass + articles = article_bs.find_all('div', {'itemprop': 'blogPost'}) + current_seed_links = [] + for blog_tag in articles: + article_name_tag = blog_tag.find('h2', {'itemprop': 'name'}) + article_link = article_name_tag.a + current_seed_links.append(article_link.attrs['href']) + return current_seed_links def find_articles(self): """ Finds articles """ - pass + for url in self.seed_urls: + url_parsed = urlparse(url) + url_scheme, url_domain = url_parsed.scheme, url_parsed.netloc + url_base = '{}://{}'.format(url_scheme, url_domain) + + # Change user-agent to avoid 403 error + response = requests.get(url, headers=HEADERS) # make a request to seed url + if response: + content = response.text + links = self._extract_url(BeautifulSoup(content, 'html.parser')) + full_links = [] + + for link in links: + if link.startswith('/'): + full_links.append(url_base + link) + else: + full_links.append(link) + + self.urls.extend(full_links[:max_articles_per_seed]) + assert len(self.urls) >= self.total_max_articles def get_search_urls(self): """ Returns seed_urls param """ - pass + return self.seed_urls class ArticleParser: @@ -67,46 +95,74 @@ class ArticleParser: ArticleParser implementation """ def __init__(self, full_url: str, article_id: int): - pass + self.full_url = full_url + self.article_id = article_id + self.article = Article(full_url, article_id) def _fill_article_with_text(self, article_soup): - pass + self.article.text = article_soup.find("div", class_="leading-0").text def _fill_article_with_meta_information(self, article_soup): - pass + self.article.title = article_soup.find('div', class_="page-header").text.strip() + self.article.views = article_soup.find('div', class_="hits").find('meta').text + self.article.date = self.unify_date_format(article_soup.find('div', class_="create").find('time').text) + self.article.author = 'NOT FOUND' @staticmethod def unify_date_format(date_str): """ Unifies date format """ - pass + return datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S") def parse(self): """ Parses each article """ - pass + response = requests.get(self.full_url, headers=HEADERS) + article_soup = BeautifulSoup(response.content, features='lxml') + self._fill_article_with_text(article_soup) + self._fill_article_with_meta_information(article_soup) + self.article.save_raw() def prepare_environment(base_path): """ Creates ASSETS_PATH folder if not created and removes existing folder """ - pass + if not os.path.isdir(base_path): + os.makedirs(base_path) def validate_config(crawler_path): """ Validates given config """ - pass + with open(crawler_path, 'r', encoding='utf-8') as config: + params = json.load(config) + + if 'base_urls' not in params or not all([isinstance(url, str) for url in params['base_urls']]): + raise IncorrectURLError + + if params['total_articles_to_find_and_parse'] > 100: + raise NumberOfArticlesOutOfRangeError + + if not isinstance(params['total_articles_to_find_and_parse'], int): + raise 
IncorrectNumberOfArticlesError + + return params['base_urls'], params['total_articles_to_find_and_parse'], params['total_articles_to_find_and_parse'] if __name__ == '__main__': - # YOUR CODE HERE - headers = { - 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', - 'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36' - } - response = requests.get('https://express-kamchatka1.ru/sobytiya.html', headers=headers) + #YOUR CODE HERE + try: + seed_urls, max_articles, max_articles_per_seed = validate_config(CRAWLER_CONFIG_PATH) + crawler = Crawler(seed_urls=seed_urls, + max_articles=max_articles, + max_articles_per_seed=max_articles_per_seed) + crawler.find_articles() + + for i, url in enumerate(crawler.urls): + parser = ArticleParser(full_url=url, article_id=i) + except (IncorrectURLError, IncorrectNumberOfArticlesError, NumberOfArticlesOutOfRangeError, UnknownConfigError): + exit(1) From 4782ec2dd693d1cb047c5f48ae2228aeb02f1f32 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Thu, 11 Mar 2021 08:15:28 +0300 Subject: [PATCH 03/33] maybe... --- scrapper.py | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/scrapper.py b/scrapper.py index 677b5e62..ae58c0ea 100644 --- a/scrapper.py +++ b/scrapper.py @@ -138,19 +138,32 @@ def validate_config(crawler_path): """ Validates given config """ - with open(crawler_path, 'r', encoding='utf-8') as config: - params = json.load(config) - - if 'base_urls' not in params or not all([isinstance(url, str) for url in params['base_urls']]): - raise IncorrectURLError - - if params['total_articles_to_find_and_parse'] > 100: - raise NumberOfArticlesOutOfRangeError - - if not isinstance(params['total_articles_to_find_and_parse'], int): - raise IncorrectNumberOfArticlesError - - return params['base_urls'], params['total_articles_to_find_and_parse'], params['total_articles_to_find_and_parse'] + try: + with open(crawler_path, 'r', encoding='utf-8') as config: + params = json.load(config) + + seed_urls = params.get('base_urls') + max_articles = params.get('total_articles_to_find_and_parse') + max_articles_per_seed = params.get('max_number_articles_to_get_from_one_seed') + + if not isinstance(seed_urls, list): + raise IncorrectURLError + for url in seed_urls: + if not isinstance(url, str) or not url.startswith('http'): + raise IncorrectURLError + + if not isinstance(max_articles, int) or max_articles < 0: + raise IncorrectNumberOfArticlesError + + if not isinstance(max_articles_per_seed, int) or max_articles_per_seed > max_articles: + raise NumberOfArticlesOutOfRangeError + + except(IncorrectURLError, IncorrectNumberOfArticlesError, NumberOfArticlesOutOfRangeError) as error: + raise error + except: + raise UnknownConfigError + else: + return seed_urls, max_articles, max_articles_per_seed if __name__ == '__main__': From 5eb76689bde500811491bfcb8120032961ab0b8c Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Thu, 11 Mar 2021 09:03:01 +0300 Subject: [PATCH 04/33] please --- scrapper.py | 33 +++++++++++++++++++-------------- target_score.txt | 2 +- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/scrapper.py b/scrapper.py index ae58c0ea..2632b700 100644 --- a/scrapper.py +++ b/scrapper.py @@ -4,13 +4,18 @@ import json import os import requests +import random +import 
shutil from datetime import datetime from bs4 import BeautifulSoup from article import Article from constants import CRAWLER_CONFIG_PATH from constants import HEADERS +from constants import ASSETS_PATH +from constants import PROJECT_ROOT from urllib.parse import urlparse +from time import sleep class IncorrectURLError(Exception): @@ -130,9 +135,9 @@ def prepare_environment(base_path): """ Creates ASSETS_PATH folder if not created and removes existing folder """ - if not os.path.isdir(base_path): - os.makedirs(base_path) - + if os.path.exists(os.path.join(base_path, 'tmp', 'articles')): + shutil.rmtree(os.path.join(base_path, 'tmp', 'articles')) + os.makedirs(os.path.join(base_path, 'tmp', 'articles')) def validate_config(crawler_path): """ @@ -168,14 +173,14 @@ def validate_config(crawler_path): if __name__ == '__main__': #YOUR CODE HERE - try: - seed_urls, max_articles, max_articles_per_seed = validate_config(CRAWLER_CONFIG_PATH) - crawler = Crawler(seed_urls=seed_urls, - max_articles=max_articles, - max_articles_per_seed=max_articles_per_seed) - crawler.find_articles() - - for i, url in enumerate(crawler.urls): - parser = ArticleParser(full_url=url, article_id=i) - except (IncorrectURLError, IncorrectNumberOfArticlesError, NumberOfArticlesOutOfRangeError, UnknownConfigError): - exit(1) + seed_urls, max_articles, max_articles_per_seed = validate_config(CRAWLER_CONFIG_PATH) + crawler = Crawler(seed_urls=seed_urls, + max_articles=max_articles, + max_articles_per_seed=max_articles_per_seed) + crawler.find_articles() + prepare_environment(PROJECT_ROOT) + for i, url in enumerate(crawler.urls): + parser = ArticleParser(full_url=url, article_id=i) + sleep(random.randint(2, 5)) + articles = parser.parse() + articles.save_raw() \ No newline at end of file diff --git a/target_score.txt b/target_score.txt index dd08e182..3ee0e68c 100644 --- a/target_score.txt +++ b/target_score.txt @@ -1,5 +1,5 @@ # Target score for scrapper.py: -6 +8 # Target score for pipeline.py: 0 From e8c318a79ad1c6ea7861e7906f88dc72a7940d87 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Thu, 11 Mar 2021 09:05:02 +0300 Subject: [PATCH 05/33] op,otkat --- target_score.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target_score.txt b/target_score.txt index 3ee0e68c..dd08e182 100644 --- a/target_score.txt +++ b/target_score.txt @@ -1,5 +1,5 @@ # Target score for scrapper.py: -8 +6 # Target score for pipeline.py: 0 From 425c4808d7c06151494e0c1dc7974f6bcb04aa3d Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Thu, 11 Mar 2021 09:21:24 +0300 Subject: [PATCH 06/33] moya popitka nomer pyat --- scrapper.py | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/scrapper.py b/scrapper.py index 2632b700..eae7faec 100644 --- a/scrapper.py +++ b/scrapper.py @@ -147,28 +147,22 @@ def validate_config(crawler_path): with open(crawler_path, 'r', encoding='utf-8') as config: params = json.load(config) - seed_urls = params.get('base_urls') - max_articles = params.get('total_articles_to_find_and_parse') - max_articles_per_seed = params.get('max_number_articles_to_get_from_one_seed') - - if not isinstance(seed_urls, list): + if 'base_urls' not in params or not isinstance(params['base_urls'], list) or \ + not all([isinstance(link, str) for link in params['base_urls']]): raise IncorrectURLError - for url in seed_urls: - if not isinstance(url, str) or not url.startswith('http'): - raise IncorrectURLError - if 
not isinstance(max_articles, int) or max_articles < 0: + if 'max_number_articles_to_get_from_one_seed' not in params or \ + not isinstance(params['max_number_articles_to_get_from_one_seed'], int) or \ + 'total_articles_to_find_and_parse' not in params or \ + not isinstance(params['total_articles_to_find_and_parse'], int): raise IncorrectNumberOfArticlesError - if not isinstance(max_articles_per_seed, int) or max_articles_per_seed > max_articles: + if 'total_articles_to_find_and_parse' in params and \ + isinstance(params['total_articles_to_find_and_parse'], int) and \ + params['total_articles_to_find_and_parse'] > 100: raise NumberOfArticlesOutOfRangeError - except(IncorrectURLError, IncorrectNumberOfArticlesError, NumberOfArticlesOutOfRangeError) as error: - raise error - except: - raise UnknownConfigError - else: - return seed_urls, max_articles, max_articles_per_seed + return params['base_urls'], params['total_articles_to_find_and_parse'], params['max_number_articles_to_get_from_one_seed'] if __name__ == '__main__': @@ -181,6 +175,6 @@ def validate_config(crawler_path): prepare_environment(PROJECT_ROOT) for i, url in enumerate(crawler.urls): parser = ArticleParser(full_url=url, article_id=i) - sleep(random.randint(2, 5)) articles = parser.parse() - articles.save_raw() \ No newline at end of file + articles.save_raw() + sleep(random.randint(2, 5)) \ No newline at end of file From 739dcee42f266c0086adc8ae84a403789ae704fd Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Thu, 11 Mar 2021 15:18:08 +0300 Subject: [PATCH 07/33] meow' --- scrapper.py | 49 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/scrapper.py b/scrapper.py index eae7faec..4294583f 100644 --- a/scrapper.py +++ b/scrapper.py @@ -4,8 +4,6 @@ import json import os import requests -import random -import shutil from datetime import datetime from bs4 import BeautifulSoup @@ -13,9 +11,7 @@ from constants import CRAWLER_CONFIG_PATH from constants import HEADERS from constants import ASSETS_PATH -from constants import PROJECT_ROOT from urllib.parse import urlparse -from time import sleep class IncorrectURLError(Exception): @@ -72,8 +68,7 @@ def find_articles(self): url_scheme, url_domain = url_parsed.scheme, url_parsed.netloc url_base = '{}://{}'.format(url_scheme, url_domain) - # Change user-agent to avoid 403 error - response = requests.get(url, headers=HEADERS) # make a request to seed url + response = requests.get(url, headers=HEADERS) if response: content = response.text links = self._extract_url(BeautifulSoup(content, 'html.parser')) @@ -88,6 +83,7 @@ def find_articles(self): self.urls.extend(full_links[:max_articles_per_seed]) assert len(self.urls) >= self.total_max_articles + def get_search_urls(self): """ Returns seed_urls param @@ -108,7 +104,7 @@ def _fill_article_with_text(self, article_soup): self.article.text = article_soup.find("div", class_="leading-0").text def _fill_article_with_meta_information(self, article_soup): - self.article.title = article_soup.find('div', class_="page-header").text.strip() + self.article.title = article_soup.find('div', class_="page-header").find('h2').find('a').text.strip() self.article.views = article_soup.find('div', class_="hits").find('meta').text self.article.date = self.unify_date_format(article_soup.find('div', class_="create").find('time').text) self.article.author = 'NOT FOUND' @@ -118,7 +114,7 @@ def unify_date_format(date_str): """ Unifies date format """ - return 
datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S") + return datetime.strptime(date_str, "%Y-%m-%d") def parse(self): """ @@ -135,9 +131,8 @@ def prepare_environment(base_path): """ Creates ASSETS_PATH folder if not created and removes existing folder """ - if os.path.exists(os.path.join(base_path, 'tmp', 'articles')): - shutil.rmtree(os.path.join(base_path, 'tmp', 'articles')) - os.makedirs(os.path.join(base_path, 'tmp', 'articles')) + if not os.path.exists(os.path.join(base_path, 'tmp', 'articles')): + os.makedirs(os.path.join(base_path, 'tmp', 'articles')) def validate_config(crawler_path): """ @@ -147,22 +142,28 @@ def validate_config(crawler_path): with open(crawler_path, 'r', encoding='utf-8') as config: params = json.load(config) - if 'base_urls' not in params or not isinstance(params['base_urls'], list) or \ - not all([isinstance(link, str) for link in params['base_urls']]): + seed_urls = params.get('base_urls') + max_articles = params.get('total_articles_to_find_and_parse') + max_articles_per_seed = params.get('max_number_articles_to_get_from_one_seed') + + if not isinstance(seed_urls, list): raise IncorrectURLError + for url in seed_urls: + if not isinstance(url, str) or not url.startswith('http'): + raise IncorrectURLError - if 'max_number_articles_to_get_from_one_seed' not in params or \ - not isinstance(params['max_number_articles_to_get_from_one_seed'], int) or \ - 'total_articles_to_find_and_parse' not in params or \ - not isinstance(params['total_articles_to_find_and_parse'], int): + if not isinstance(max_articles, int) or max_articles < 0: raise IncorrectNumberOfArticlesError - if 'total_articles_to_find_and_parse' in params and \ - isinstance(params['total_articles_to_find_and_parse'], int) and \ - params['total_articles_to_find_and_parse'] > 100: + if not isinstance(max_articles_per_seed, int) or max_articles_per_seed > max_articles: raise NumberOfArticlesOutOfRangeError - return params['base_urls'], params['total_articles_to_find_and_parse'], params['max_number_articles_to_get_from_one_seed'] + except(IncorrectURLError, IncorrectNumberOfArticlesError, NumberOfArticlesOutOfRangeError) as error: + raise error + except: + raise UnknownConfigError + else: + return seed_urls, max_articles, max_articles_per_seed if __name__ == '__main__': @@ -172,9 +173,7 @@ def validate_config(crawler_path): max_articles=max_articles, max_articles_per_seed=max_articles_per_seed) crawler.find_articles() - prepare_environment(PROJECT_ROOT) + prepare_environment(ASSETS_PATH) for i, url in enumerate(crawler.urls): parser = ArticleParser(full_url=url, article_id=i) - articles = parser.parse() - articles.save_raw() - sleep(random.randint(2, 5)) \ No newline at end of file + parser.parse() From 6d5aa100951d58f0276a3fa3efbe10531feb63cf Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Thu, 11 Mar 2021 15:32:52 +0300 Subject: [PATCH 08/33] fuki-mazfuki --- article.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/article.py b/article.py index 1d759cd2..515df527 100644 --- a/article.py +++ b/article.py @@ -46,7 +46,7 @@ def save_raw(self): indent=4, ensure_ascii=False, separators=(',', ': ')) - + + @staticmethod def from_meta_json(json_path: str): """ @@ -94,13 +94,13 @@ def _get_meta(self): 'author': self.author, 'topics': self.topics } - ++ def _date_to_text(self): """ Converts datetime object to text """ return self.date.strftime("%Y-%m-%d %H:%M:%S") - ++ def _get_raw_text_path(self): """ Returns path for requested raw article From 
d174872920a0ba82451c7f704065de9a3e20e370 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Thu, 11 Mar 2021 15:36:20 +0300 Subject: [PATCH 09/33] uzhas --- article.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/article.py b/article.py index 515df527..718a3b3b 100644 --- a/article.py +++ b/article.py @@ -46,7 +46,7 @@ def save_raw(self): indent=4, ensure_ascii=False, separators=(',', ': ')) - + + @staticmethod def from_meta_json(json_path: str): """ @@ -94,13 +94,13 @@ def _get_meta(self): 'author': self.author, 'topics': self.topics } -+ + def _date_to_text(self): """ Converts datetime object to text """ return self.date.strftime("%Y-%m-%d %H:%M:%S") -+ + def _get_raw_text_path(self): """ Returns path for requested raw article From a6a5e24d16a391d5f5c6eba3b6b27c4d01fd8a1b Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Thu, 11 Mar 2021 17:12:13 +0300 Subject: [PATCH 10/33] 'puk' --- constants.py | 2 +- scrapper.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/constants.py b/constants.py index 913a418e..b3b9ba89 100644 --- a/constants.py +++ b/constants.py @@ -8,5 +8,5 @@ ASSETS_PATH = os.path.join(PROJECT_ROOT, 'tmp', 'articles') CRAWLER_CONFIG_PATH = os.path.join(PROJECT_ROOT, 'crawler_config.json') HEADERS = { - 'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36' + 'user-agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36' } diff --git a/scrapper.py b/scrapper.py index 4294583f..19babfbf 100644 --- a/scrapper.py +++ b/scrapper.py @@ -101,12 +101,12 @@ def __init__(self, full_url: str, article_id: int): self.article = Article(full_url, article_id) def _fill_article_with_text(self, article_soup): - self.article.text = article_soup.find("div", class_="leading-0").text + self.article.text = article_soup.find("div", class_="item-page").text def _fill_article_with_meta_information(self, article_soup): - self.article.title = article_soup.find('div', class_="page-header").find('h2').find('a').text.strip() - self.article.views = article_soup.find('div', class_="hits").find('meta').text - self.article.date = self.unify_date_format(article_soup.find('div', class_="create").find('time').text) + self.article.title = article_soup.find('div', class_="page-header").find('h2').text + self.article.views = article_soup.find('dd', class_="hits").find('meta').text + self.article.date = self.unify_date_format(article_soup.find('dd', class_="create").find('time').text) self.article.author = 'NOT FOUND' @staticmethod @@ -173,6 +173,7 @@ def validate_config(crawler_path): max_articles=max_articles, max_articles_per_seed=max_articles_per_seed) crawler.find_articles() + prepare_environment(ASSETS_PATH) for i, url in enumerate(crawler.urls): parser = ArticleParser(full_url=url, article_id=i) From 747f0ec4e28f338f3ea2e9be9abbaec568aa3dfb Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Fri, 12 Mar 2021 23:14:23 +0300 Subject: [PATCH 11/33] zzz --- article.py | 3 +++ scrapper.py | 45 ++++++++++++++++++--------------------------- 2 files changed, 21 insertions(+), 27 deletions(-) diff --git a/article.py b/article.py index 718a3b3b..75f33b0c 100644 --- a/article.py +++ b/article.py @@ -47,6 +47,7 @@ def save_raw(self): ensure_ascii=False, separators=(',', ': ')) + @staticmethod def 
from_meta_json(json_path: str): """ @@ -95,12 +96,14 @@ def _get_meta(self): 'topics': self.topics } + def _date_to_text(self): """ Converts datetime object to text """ return self.date.strftime("%Y-%m-%d %H:%M:%S") + def _get_raw_text_path(self): """ Returns path for requested raw article diff --git a/scrapper.py b/scrapper.py index 19babfbf..e0577767 100644 --- a/scrapper.py +++ b/scrapper.py @@ -11,7 +11,8 @@ from constants import CRAWLER_CONFIG_PATH from constants import HEADERS from constants import ASSETS_PATH -from urllib.parse import urlparse +from time import sleep +import random class IncorrectURLError(Exception): @@ -51,37 +52,24 @@ def __init__(self, seed_urls: list, max_articles: int, max_articles_per_seed: in @staticmethod def _extract_url(article_bs): - articles = article_bs.find_all('div', {'itemprop': 'blogPost'}) - current_seed_links = [] - for blog_tag in articles: - article_name_tag = blog_tag.find('h2', {'itemprop': 'name'}) - article_link = article_name_tag.a - current_seed_links.append(article_link.attrs['href']) - return current_seed_links + article_link = article_bs.find('h2', {'itemprop': 'name'}).find('a').get('href') + return(article_link) def find_articles(self): """ Finds articles """ for url in self.seed_urls: - url_parsed = urlparse(url) - url_scheme, url_domain = url_parsed.scheme, url_parsed.netloc - url_base = '{}://{}'.format(url_scheme, url_domain) - + sleep(random.randint(2, 8)) response = requests.get(url, headers=HEADERS) - if response: - content = response.text - links = self._extract_url(BeautifulSoup(content, 'html.parser')) - full_links = [] - - for link in links: - if link.startswith('/'): - full_links.append(url_base + link) - else: - full_links.append(link) - - self.urls.extend(full_links[:max_articles_per_seed]) - assert len(self.urls) >= self.total_max_articles + if not response: + continue + link = BeautifulSoup(response.content, features='lxml') + articles_soup = link.find_all('li') + for article_bs in articles_soup[:max_articles_per_seed]: + self.urls.append(self._extract_url(article_bs)) + if len(self.urls) == max_articles: + return self.urls def get_search_urls(self): @@ -101,10 +89,13 @@ def __init__(self, full_url: str, article_id: int): self.article = Article(full_url, article_id) def _fill_article_with_text(self, article_soup): - self.article.text = article_soup.find("div", class_="item-page").text + article_text = article_soup.find_all('p') + for par in article_text: + if 'class' not in par.attrs: + self.article.text += par.text.strip() + ' ' def _fill_article_with_meta_information(self, article_soup): - self.article.title = article_soup.find('div', class_="page-header").find('h2').text + self.article.title = article_soup.find('dev',class_='page-header').find('h2').text self.article.views = article_soup.find('dd', class_="hits").find('meta').text self.article.date = self.unify_date_format(article_soup.find('dd', class_="create").find('time').text) self.article.author = 'NOT FOUND' From a25e84a79114a2b5d3126893caa787f4ab4aefdd Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Fri, 12 Mar 2021 23:35:08 +0300 Subject: [PATCH 12/33] no coment' --- scrapper.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scrapper.py b/scrapper.py index e0577767..38b9bd2a 100644 --- a/scrapper.py +++ b/scrapper.py @@ -53,7 +53,7 @@ def __init__(self, seed_urls: list, max_articles: int, max_articles_per_seed: in @staticmethod def _extract_url(article_bs): article_link = article_bs.find('h2', 
{'itemprop': 'name'}).find('a').get('href') - return(article_link) + print(article_link) def find_articles(self): """ @@ -95,7 +95,7 @@ def _fill_article_with_text(self, article_soup): self.article.text += par.text.strip() + ' ' def _fill_article_with_meta_information(self, article_soup): - self.article.title = article_soup.find('dev',class_='page-header').find('h2').text + self.article.title = article_soup.find('div',class_='page-header').find('h2').text self.article.views = article_soup.find('dd', class_="hits").find('meta').text self.article.date = self.unify_date_format(article_soup.find('dd', class_="create").find('time').text) self.article.author = 'NOT FOUND' @@ -146,7 +146,7 @@ def validate_config(crawler_path): if not isinstance(max_articles, int) or max_articles < 0: raise IncorrectNumberOfArticlesError - if not isinstance(max_articles_per_seed, int) or max_articles_per_seed > max_articles: + if max_articles_per_seed > 100: raise NumberOfArticlesOutOfRangeError except(IncorrectURLError, IncorrectNumberOfArticlesError, NumberOfArticlesOutOfRangeError) as error: From 144a74c5c551b6ecfb582c079ff097d6417f99fc Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Fri, 12 Mar 2021 23:40:13 +0300 Subject: [PATCH 13/33] f --- scrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapper.py b/scrapper.py index 38b9bd2a..305a83bb 100644 --- a/scrapper.py +++ b/scrapper.py @@ -146,7 +146,7 @@ def validate_config(crawler_path): if not isinstance(max_articles, int) or max_articles < 0: raise IncorrectNumberOfArticlesError - if max_articles_per_seed > 100: + if not isinstance(max_articles_per_seed,int) or max_articles_per_seed > 100: raise NumberOfArticlesOutOfRangeError except(IncorrectURLError, IncorrectNumberOfArticlesError, NumberOfArticlesOutOfRangeError) as error: From d3382bae2639b5af109c5e03d3d58aef917dafbf Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Sun, 14 Mar 2021 23:04:52 +0300 Subject: [PATCH 14/33] uspeshno? 
--- article.py | 5 +-- constants.py | 3 +- crawler_config.json | 6 ++-- scrapper.py | 75 +++++++++++++++++++++++---------------------- 4 files changed, 45 insertions(+), 44 deletions(-) diff --git a/article.py b/article.py index 75f33b0c..95ea374d 100644 --- a/article.py +++ b/article.py @@ -47,7 +47,6 @@ def save_raw(self): ensure_ascii=False, separators=(',', ': ')) - @staticmethod def from_meta_json(json_path: str): """ @@ -91,19 +90,17 @@ def _get_meta(self): 'id': self.article_id, 'url': self.url, 'title': self.title, - 'date': self._date_to_text(), + 'date': self.date, 'author': self.author, 'topics': self.topics } - def _date_to_text(self): """ Converts datetime object to text """ return self.date.strftime("%Y-%m-%d %H:%M:%S") - def _get_raw_text_path(self): """ Returns path for requested raw article diff --git a/constants.py b/constants.py index b3b9ba89..917cc470 100644 --- a/constants.py +++ b/constants.py @@ -8,5 +8,6 @@ ASSETS_PATH = os.path.join(PROJECT_ROOT, 'tmp', 'articles') CRAWLER_CONFIG_PATH = os.path.join(PROJECT_ROOT, 'crawler_config.json') HEADERS = { - 'user-agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36' + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' + 'Chrome/89.0.4389.82 Safari/537.36' } diff --git a/crawler_config.json b/crawler_config.json index 1c4f02db..e2a71584 100644 --- a/crawler_config.json +++ b/crawler_config.json @@ -1,5 +1,5 @@ { - "base_urls": ["https://express-kamchatka1.ru/sobytiya.html"], - "total_articles_to_find_and_parse": 15, - "max_number_articles_to_get_from_one_seed": 15 + "base_urls": ["https://www.e1.ru/news/"], + "total_articles_to_find_and_parse": 5, + "max_number_articles_to_get_from_one_seed": 10 } \ No newline at end of file diff --git a/scrapper.py b/scrapper.py index 305a83bb..74f9ef8d 100644 --- a/scrapper.py +++ b/scrapper.py @@ -3,16 +3,13 @@ """ import json import os +import random import requests -from datetime import datetime from bs4 import BeautifulSoup from article import Article -from constants import CRAWLER_CONFIG_PATH -from constants import HEADERS -from constants import ASSETS_PATH +from constants import CRAWLER_CONFIG_PATH, HEADERS, ASSETS_PATH from time import sleep -import random class IncorrectURLError(Exception): @@ -47,30 +44,33 @@ def __init__(self, seed_urls: list, max_articles: int, max_articles_per_seed: in self.seed_urls = seed_urls self.total_max_articles = max_articles self.max_articles_per_seed = max_articles_per_seed - self.urls = [] @staticmethod def _extract_url(article_bs): - article_link = article_bs.find('h2', {'itemprop': 'name'}).find('a').get('href') - print(article_link) + url_article = article_bs.find('h2', class_="G3ax").find('a') + article_link = url_article.attrs['href'] + return 'https://www.e1.ru' + article_link def find_articles(self): """ Finds articles """ for url in self.seed_urls: - sleep(random.randint(2, 8)) response = requests.get(url, headers=HEADERS) if not response: - continue - link = BeautifulSoup(response.content, features='lxml') - articles_soup = link.find_all('li') - for article_bs in articles_soup[:max_articles_per_seed]: - self.urls.append(self._extract_url(article_bs)) - if len(self.urls) == max_articles: - return self.urls - + raise IncorrectURLError + if response.status_code == 200: + sleep(random.randrange(2, 6)) + response.encoding = 'utf-8' + page_soup = BeautifulSoup(response.content, features='lxml') + article_soup = 
page_soup.find_all('article', class_="G3aj7") + for article in article_soup: + seed_url = self._extract_url(article) + self.urls.append(seed_url) + if len(self.urls) <= max_articles and article not in self.urls: + seed_url = self._extract_url(article) + self.urls.append(seed_url) def get_search_urls(self): """ @@ -89,41 +89,44 @@ def __init__(self, full_url: str, article_id: int): self.article = Article(full_url, article_id) def _fill_article_with_text(self, article_soup): - article_text = article_soup.find_all('p') + article_text = article_soup.find('div', class_="F-af3").find_all('p') for par in article_text: - if 'class' not in par.attrs: - self.article.text += par.text.strip() + ' ' + self.article.text += par.text.strip() + '\n' def _fill_article_with_meta_information(self, article_soup): - self.article.title = article_soup.find('div',class_='page-header').find('h2').text - self.article.views = article_soup.find('dd', class_="hits").find('meta').text - self.article.date = self.unify_date_format(article_soup.find('dd', class_="create").find('time').text) + self.article.title = article_soup.find('h2', {'itemprop': 'headline'}).find('span').text + self.article.annotation = article_soup.find('p', class_="CLpj JZaj-").find('span').text self.article.author = 'NOT FOUND' + self.article.date = article_soup.find('time', class_="G-k1").find('a').text.strip() @staticmethod def unify_date_format(date_str): """ Unifies date format """ - return datetime.strptime(date_str, "%Y-%m-%d") + pass def parse(self): """ Parses each article """ response = requests.get(self.full_url, headers=HEADERS) - article_soup = BeautifulSoup(response.content, features='lxml') + if not response: + raise IncorrectURLError + + article_soup = BeautifulSoup(response.text, 'lxml') self._fill_article_with_text(article_soup) self._fill_article_with_meta_information(article_soup) - self.article.save_raw() + return self.article def prepare_environment(base_path): """ Creates ASSETS_PATH folder if not created and removes existing folder """ - if not os.path.exists(os.path.join(base_path, 'tmp', 'articles')): - os.makedirs(os.path.join(base_path, 'tmp', 'articles')) + if not os.path.exists(base_path): + os.makedirs(base_path) + def validate_config(crawler_path): """ @@ -146,13 +149,11 @@ def validate_config(crawler_path): if not isinstance(max_articles, int) or max_articles < 0: raise IncorrectNumberOfArticlesError - if not isinstance(max_articles_per_seed,int) or max_articles_per_seed > 100: + if not isinstance(max_articles_per_seed, int) or max_articles_per_seed > 100: raise NumberOfArticlesOutOfRangeError except(IncorrectURLError, IncorrectNumberOfArticlesError, NumberOfArticlesOutOfRangeError) as error: raise error - except: - raise UnknownConfigError else: return seed_urls, max_articles, max_articles_per_seed @@ -163,9 +164,11 @@ def validate_config(crawler_path): crawler = Crawler(seed_urls=seed_urls, max_articles=max_articles, max_articles_per_seed=max_articles_per_seed) - crawler.find_articles() - + art = crawler.find_articles() + print(art) prepare_environment(ASSETS_PATH) - for i, url in enumerate(crawler.urls): - parser = ArticleParser(full_url=url, article_id=i) - parser.parse() + for article_id, article_url in enumerate(crawler.urls): + parser = ArticleParser(article_url, article_id+1) + article = parser.parse() + article.save_raw() + sleep((random.randrange(2, 6))) From 57a712d4b64409a56dd1c35ff41ea75a60cf56a0 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Sun, 14 Mar 2021 
23:21:22 +0300 Subject: [PATCH 15/33] uspeh --- crawler_config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawler_config.json b/crawler_config.json index e2a71584..1a34d92c 100644 --- a/crawler_config.json +++ b/crawler_config.json @@ -1,5 +1,5 @@ { "base_urls": ["https://www.e1.ru/news/"], "total_articles_to_find_and_parse": 5, - "max_number_articles_to_get_from_one_seed": 10 + "max_number_articles_to_get_from_one_seed": 15 } \ No newline at end of file From 89ff700276052b6981f8299b3c5ae476fbaf3a8c Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Sun, 14 Mar 2021 23:28:21 +0300 Subject: [PATCH 16/33] test --- crawler_config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawler_config.json b/crawler_config.json index 1a34d92c..e2a71584 100644 --- a/crawler_config.json +++ b/crawler_config.json @@ -1,5 +1,5 @@ { "base_urls": ["https://www.e1.ru/news/"], "total_articles_to_find_and_parse": 5, - "max_number_articles_to_get_from_one_seed": 15 + "max_number_articles_to_get_from_one_seed": 10 } \ No newline at end of file From 3d3c72d9020788c7d4d8887bd4a2bdeacf493611 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Mon, 15 Mar 2021 00:29:34 +0300 Subject: [PATCH 17/33] popitaemsya snova --- crawler_config.json | 6 +++--- scrapper.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/crawler_config.json b/crawler_config.json index e2a71584..94524902 100644 --- a/crawler_config.json +++ b/crawler_config.json @@ -1,5 +1,5 @@ { - "base_urls": ["https://www.e1.ru/news/"], - "total_articles_to_find_and_parse": 5, - "max_number_articles_to_get_from_one_seed": 10 + "base_urls": ["https://www.e1.ru/news/"], + "total_articles_to_find_and_parse": 6, + "max_number_articles_to_get_from_one_seed": 10 } \ No newline at end of file diff --git a/scrapper.py b/scrapper.py index 74f9ef8d..b971a6ff 100644 --- a/scrapper.py +++ b/scrapper.py @@ -97,6 +97,7 @@ def _fill_article_with_meta_information(self, article_soup): self.article.title = article_soup.find('h2', {'itemprop': 'headline'}).find('span').text self.article.annotation = article_soup.find('p', class_="CLpj JZaj-").find('span').text self.article.author = 'NOT FOUND' + self.article.topics = article_soup.find('a', class_="CLpx CLrt JZak9").find('span').text self.article.date = article_soup.find('time', class_="G-k1").find('a').text.strip() @staticmethod From 9647f8274fba7b0e8b23c6116601364f71d6847a Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Mon, 15 Mar 2021 11:33:42 +0300 Subject: [PATCH 18/33] n --- scrapper.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scrapper.py b/scrapper.py index b971a6ff..d1eb1685 100644 --- a/scrapper.py +++ b/scrapper.py @@ -165,11 +165,9 @@ def validate_config(crawler_path): crawler = Crawler(seed_urls=seed_urls, max_articles=max_articles, max_articles_per_seed=max_articles_per_seed) - art = crawler.find_articles() - print(art) + crawler.find_articles() prepare_environment(ASSETS_PATH) for article_id, article_url in enumerate(crawler.urls): - parser = ArticleParser(article_url, article_id+1) + parser = ArticleParser(article_url, article_id + 1) article = parser.parse() article.save_raw() - sleep((random.randrange(2, 6))) From 1a9d86c3358a8ca4aa90380e62874a297b41e710 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Mon, 15 Mar 2021 11:36:33 +0300 Subject: [PATCH 19/33] g --- 
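Note (outside the diff that follows): this commit re-adds a randomized pause after each article is parsed and saved, so successive requests to the news site are spaced out instead of fired back-to-back. A minimal, standalone sketch of that throttling idea is given below; the URL list and user-agent string are placeholders for illustration, not values taken from the project.

    import random
    from time import sleep

    import requests

    HEADERS = {'user-agent': 'Mozilla/5.0'}  # placeholder UA string, not the project's real header

    def fetch_politely(urls):
        """Download pages one by one, pausing 2-5 seconds between requests."""
        pages = []
        for url in urls:
            response = requests.get(url, headers=HEADERS)
            if response.ok:  # keep only successful responses
                pages.append(response.text)
            sleep(random.randrange(2, 6))  # same delay range the commit uses
        return pages

    # Example: fetch_politely(['https://example.com/page1', 'https://example.com/page2'])
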
scrapper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scrapper.py b/scrapper.py index d1eb1685..2453b6ed 100644 --- a/scrapper.py +++ b/scrapper.py @@ -168,6 +168,7 @@ def validate_config(crawler_path): crawler.find_articles() prepare_environment(ASSETS_PATH) for article_id, article_url in enumerate(crawler.urls): - parser = ArticleParser(article_url, article_id + 1) + parser = ArticleParser(article_url, article_id+1) article = parser.parse() article.save_raw() + sleep((random.randrange(2, 6))) From a3ac3f1087bd301f9a1e6ced4b6935f5728a72b8 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Sun, 21 Mar 2021 19:38:16 +0300 Subject: [PATCH 20/33] p --- constants.py | 4 ++-- crawler_config.json | 2 +- scrapper.py | 14 ++++++-------- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/constants.py b/constants.py index 917cc470..a6b20ca9 100644 --- a/constants.py +++ b/constants.py @@ -8,6 +8,6 @@ ASSETS_PATH = os.path.join(PROJECT_ROOT, 'tmp', 'articles') CRAWLER_CONFIG_PATH = os.path.join(PROJECT_ROOT, 'crawler_config.json') HEADERS = { - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' - 'Chrome/89.0.4389.82 Safari/537.36' + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' + 'Chrome/88.0.4324.152 YaBrowser/21.2.3.100 Yowser/2.5 Safari/537.36' } diff --git a/crawler_config.json b/crawler_config.json index 94524902..6e532dee 100644 --- a/crawler_config.json +++ b/crawler_config.json @@ -1,5 +1,5 @@ { "base_urls": ["https://www.e1.ru/news/"], - "total_articles_to_find_and_parse": 6, + "total_articles_to_find_and_parse": 5, "max_number_articles_to_get_from_one_seed": 10 } \ No newline at end of file diff --git a/scrapper.py b/scrapper.py index 2453b6ed..ecf59c25 100644 --- a/scrapper.py +++ b/scrapper.py @@ -48,8 +48,7 @@ def __init__(self, seed_urls: list, max_articles: int, max_articles_per_seed: in @staticmethod def _extract_url(article_bs): - url_article = article_bs.find('h2', class_="G3ax").find('a') - article_link = url_article.attrs['href'] + article_link = article_bs.find('h2', class_="G3ax").find('a').get('href') return 'https://www.e1.ru' + article_link def find_articles(self): @@ -64,7 +63,7 @@ def find_articles(self): sleep(random.randrange(2, 6)) response.encoding = 'utf-8' page_soup = BeautifulSoup(response.content, features='lxml') - article_soup = page_soup.find_all('article', class_="G3aj7") + article_soup = page_soup.find_all('article', class_="G3ajx") for article in article_soup: seed_url = self._extract_url(article) self.urls.append(seed_url) @@ -94,11 +93,11 @@ def _fill_article_with_text(self, article_soup): self.article.text += par.text.strip() + '\n' def _fill_article_with_meta_information(self, article_soup): - self.article.title = article_soup.find('h2', {'itemprop': 'headline'}).find('span').text - self.article.annotation = article_soup.find('p', class_="CLpj JZaj-").find('span').text + self.article.title = article_soup.find('h2', class_="CVq3 CVtb KHax").find('span').text + self.article.annotation = article_soup.find('p', class_="CVq- KHaj1").find('span').text self.article.author = 'NOT FOUND' - self.article.topics = article_soup.find('a', class_="CLpx CLrt JZak9").find('span').text - self.article.date = article_soup.find('time', class_="G-k1").find('a').text.strip() + self.article.topics = article_soup.find('a', class_="CVrn CVtj KHak5").find('span').text + self.article.date = 
article_soup.find('time', class_="HDkz").find('a').text @staticmethod def unify_date_format(date_str): @@ -171,4 +170,3 @@ def validate_config(crawler_path): parser = ArticleParser(article_url, article_id+1) article = parser.parse() article.save_raw() - sleep((random.randrange(2, 6))) From ee1e49697cfb0c2b3a0589ddb907bea9c8be1d50 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Sun, 21 Mar 2021 19:56:38 +0300 Subject: [PATCH 21/33] pp --- scrapper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scrapper.py b/scrapper.py index ecf59c25..155c7296 100644 --- a/scrapper.py +++ b/scrapper.py @@ -48,7 +48,7 @@ def __init__(self, seed_urls: list, max_articles: int, max_articles_per_seed: in @staticmethod def _extract_url(article_bs): - article_link = article_bs.find('h2', class_="G3ax").find('a').get('href') + article_link = article_bs.find('h2', class_="G1ax").find('a').get('href') return 'https://www.e1.ru' + article_link def find_articles(self): @@ -63,7 +63,7 @@ def find_articles(self): sleep(random.randrange(2, 6)) response.encoding = 'utf-8' page_soup = BeautifulSoup(response.content, features='lxml') - article_soup = page_soup.find_all('article', class_="G3ajx") + article_soup = page_soup.find_all('article', class_="G1ajx") for article in article_soup: seed_url = self._extract_url(article) self.urls.append(seed_url) @@ -166,7 +166,7 @@ def validate_config(crawler_path): max_articles_per_seed=max_articles_per_seed) crawler.find_articles() prepare_environment(ASSETS_PATH) - for article_id, article_url in enumerate(crawler.urls): - parser = ArticleParser(article_url, article_id+1) + for article_id, article_url in enumerate(crawler.urls, 1): + parser = ArticleParser(article_url, article_id) article = parser.parse() article.save_raw() From e651fd02dcba9a489acac7a6299253a0085fbc2f Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Sun, 21 Mar 2021 21:09:03 +0300 Subject: [PATCH 22/33] goo --- scrapper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scrapper.py b/scrapper.py index 155c7296..e30e5643 100644 --- a/scrapper.py +++ b/scrapper.py @@ -64,12 +64,12 @@ def find_articles(self): response.encoding = 'utf-8' page_soup = BeautifulSoup(response.content, features='lxml') article_soup = page_soup.find_all('article', class_="G1ajx") - for article in article_soup: + for article in article_soup[:max_articles_per_seed]: seed_url = self._extract_url(article) self.urls.append(seed_url) - if len(self.urls) <= max_articles and article not in self.urls: - seed_url = self._extract_url(article) - self.urls.append(seed_url) + if len(self.urls) == max_articles: + return self.urls + def get_search_urls(self): """ From e4368959bffba5989d7d545d99955e6acf86f37e Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Thu, 1 Apr 2021 17:28:30 +0300 Subject: [PATCH 23/33] start pipe --- constants.py | 4 +- crawler_config.json | 2 +- pipeline.py | 114 +++++++++++++++++++++++++++++--------------- requirements.txt | 5 +- scrapper.py | 14 +++--- 5 files changed, 88 insertions(+), 51 deletions(-) diff --git a/constants.py b/constants.py index a6b20ca9..6d9ad653 100644 --- a/constants.py +++ b/constants.py @@ -8,6 +8,6 @@ ASSETS_PATH = os.path.join(PROJECT_ROOT, 'tmp', 'articles') CRAWLER_CONFIG_PATH = os.path.join(PROJECT_ROOT, 'crawler_config.json') HEADERS = { - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 
- 'Chrome/88.0.4324.152 YaBrowser/21.2.3.100 Yowser/2.5 Safari/537.36' + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' + 'Chrome/88.0.4324.182 YaBrowser/21.2.4.165 Yowser/2.5 Safari/537.36' } diff --git a/crawler_config.json b/crawler_config.json index 6e532dee..70338ee4 100644 --- a/crawler_config.json +++ b/crawler_config.json @@ -1,5 +1,5 @@ { "base_urls": ["https://www.e1.ru/news/"], "total_articles_to_find_and_parse": 5, - "max_number_articles_to_get_from_one_seed": 10 + "max_number_articles_to_get_from_one_seed": 5 } \ No newline at end of file diff --git a/pipeline.py b/pipeline.py index b6847326..69fd8bf6 100644 --- a/pipeline.py +++ b/pipeline.py @@ -1,15 +1,27 @@ """ Pipeline for text processing implementation """ +from pymystem3 import Mystem +from typing import List +from pathlib import Path +from article import Article +from constants import ASSETS_PATH -class ArticleNotFoundError(Exception): + +class EmptyDirectoryError(Exception): """ Custom error """ -class EmptyDirectoryError(Exception): +class InconsistentDatasetError(Exception): + """ + Custom error + """ + + +class UnknownDatasetError(Exception): """ Custom error """ @@ -19,80 +31,106 @@ class MorphologicalToken: """ Stores language params for each processed token """ - def __init__(self, normalized_form, tags, original_word): - pass - - def to_text(self): - """ - Converts instance to str format - """ - pass + def __init__(self, normalized_form, original_word): + self.original_word = original_word + self.normalized_form = normalized_form + self.mystem_tags = '' def __str__(self): - pass + return f"{self.normalized_form}<{self.mystem_tags}>" class CorpusManager: """ Works with articles and stores them """ + def __init__(self, path_to_raw_txt_data: str): - pass + self.path_to_raw_txt_date = path_to_raw_txt_data + self._storage = {} - def get_articles_meta(self): - """ - Gets article metadata - """ - pass + self._scan_dataset() - def get_raw_text(self, text_id): + def _scan_dataset(self): """ - Opens processed text + Register each dataset entry """ - pass + for file in Path(self.path_to_raw_txt_date).glob('*_raw.txt'): + id = str(file).split('\\')[-1].split('_')[0] + self._storage[id] = Article(url=None, article_id=id) - def write_processed_text(self, text_id, processed_text): + def get_articles(self): """ - Writes processed text + Returns storage params """ - pass + return self._storage class TextProcessingPipeline: """ Process articles from corpus manager """ + def __init__(self, corpus_manager: CorpusManager): - pass + self.corpus_manager = corpus_manager + self._text = '' def run(self): """ Runs pipeline process scenario """ - pass + for article in self.corpus_manager.get_articles().values(): + self.text_ = article.get_raw_text() + processed_text = list(map(str, self._process())) + article.save_processed(' '.join(processed_text)) - @staticmethod - def normalize_and_tag_text(text) -> str: + def _process(self) -> List[type(MorphologicalToken)]: """ - Processes each token and creates MorphToken class instance + Performs processing of each text """ - pass + text = self.text_ + result = Mystem().analyze(text) + tokens = [] - @staticmethod - def transform_tokens_to_text(tokens: list) -> str: - """ - Transforms given list of tokens to str - """ - pass + for word in result: + try: + token = MorphologicalToken(original_word=word['text'], normalized_form=word['analysis'][0]['lex']) + token.mystem_tags = word['analysis'][0]['gr'] + except (IndexError, KeyError): + if not 
word['text'].isnumeric(): + continue + token = MorphologicalToken(original_word=word['text'], normalized_form=word['text']) + tokens.append(token) -def validate_given_path(path_to_validate): + return tokens + + +def validate_dataset(path_to_validate): """ Validates folder with assets """ - pass + path = Path(path_to_validate) + + if not path.exists(): + raise FileNotFoundError + + if not path.is_dir(): + raise NotADirectoryError + + if not list(path.iterdir()): + raise EmptyDirectoryError + + +def main(): + validate_dataset(ASSETS_PATH) + + corpus_manager = CorpusManager(path_to_raw_txt_data=ASSETS_PATH) + pipeline = TextProcessingPipeline(corpus_manager=corpus_manager) + + pipeline.run() if __name__ == "__main__": # YOUR CODE HERE - pass + main() diff --git a/requirements.txt b/requirements.txt index 97b9fc20..7cc4f073 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ -requests==2.25.1 beautifulsoup4==4.9.3 -lxml==4.6.2 \ No newline at end of file +lxml==4.6.2 +pymystem3==0.2.0 +requests==2.25.1 \ No newline at end of file diff --git a/scrapper.py b/scrapper.py index e30e5643..f90e25db 100644 --- a/scrapper.py +++ b/scrapper.py @@ -48,7 +48,7 @@ def __init__(self, seed_urls: list, max_articles: int, max_articles_per_seed: in @staticmethod def _extract_url(article_bs): - article_link = article_bs.find('h2', class_="G1ax").find('a').get('href') + article_link = article_bs.find('h2', class_="G3ad").find('a').get('href') return 'https://www.e1.ru' + article_link def find_articles(self): @@ -63,14 +63,13 @@ def find_articles(self): sleep(random.randrange(2, 6)) response.encoding = 'utf-8' page_soup = BeautifulSoup(response.content, features='lxml') - article_soup = page_soup.find_all('article', class_="G1ajx") + article_soup = page_soup.find_all('article', class_="G3aj-") for article in article_soup[:max_articles_per_seed]: seed_url = self._extract_url(article) self.urls.append(seed_url) if len(self.urls) == max_articles: return self.urls - def get_search_urls(self): """ Returns seed_urls param @@ -88,16 +87,15 @@ def __init__(self, full_url: str, article_id: int): self.article = Article(full_url, article_id) def _fill_article_with_text(self, article_soup): - article_text = article_soup.find('div', class_="F-af3").find_all('p') + article_text = article_soup.find('div', class_="GDagz").find('div').find_all('p') for par in article_text: self.article.text += par.text.strip() + '\n' def _fill_article_with_meta_information(self, article_soup): - self.article.title = article_soup.find('h2', class_="CVq3 CVtb KHax").find('span').text - self.article.annotation = article_soup.find('p', class_="CVq- KHaj1").find('span').text + self.article.title = article_soup.find('h2', class_="C7r1 C7t- KBad").find('span').text self.article.author = 'NOT FOUND' - self.article.topics = article_soup.find('a', class_="CVrn CVtj KHak5").find('span').text - self.article.date = article_soup.find('time', class_="HDkz").find('a').text + self.article.topics = article_soup.find('a', class_="C7sl C7uh KBal9").find('span').text + self.article.date = article_soup.find('time', class_="HDk-").find('a').text @staticmethod def unify_date_format(date_str): From ea8cbaeb40f051686f1c5b87f053f3ab6ada218a Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Thu, 1 Apr 2021 17:59:20 +0300 Subject: [PATCH 24/33] target score 6 --- pipeline.py | 1 - target_score.txt | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index 69fd8bf6..4458a408 100644 
--- a/pipeline.py +++ b/pipeline.py @@ -4,7 +4,6 @@ from pymystem3 import Mystem from typing import List from pathlib import Path - from article import Article from constants import ASSETS_PATH diff --git a/target_score.txt b/target_score.txt index dd08e182..a7013d3a 100644 --- a/target_score.txt +++ b/target_score.txt @@ -2,4 +2,4 @@ 6 # Target score for pipeline.py: -0 +6 From 0411522f5d0bed25b6a4176f778d0a432c626379 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Fri, 2 Apr 2021 15:11:34 +0300 Subject: [PATCH 25/33] pymorphy try --- constants.py | 2 +- pipeline.py | 22 +++++++++++----------- requirements.txt | 1 + scrapper.py | 12 ++++++------ target_score.txt | 2 +- 5 files changed, 20 insertions(+), 19 deletions(-) diff --git a/constants.py b/constants.py index 6d9ad653..22208549 100644 --- a/constants.py +++ b/constants.py @@ -9,5 +9,5 @@ CRAWLER_CONFIG_PATH = os.path.join(PROJECT_ROOT, 'crawler_config.json') HEADERS = { 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' - 'Chrome/88.0.4324.182 YaBrowser/21.2.4.165 Yowser/2.5 Safari/537.36' + 'Chrome/89.0.4389.90 Safari/537.36' } diff --git a/pipeline.py b/pipeline.py index 4458a408..11be3ef1 100644 --- a/pipeline.py +++ b/pipeline.py @@ -1,6 +1,7 @@ """ Pipeline for text processing implementation """ +from pymorphy2 import MorphAnalyzer from pymystem3 import Mystem from typing import List from pathlib import Path @@ -34,9 +35,10 @@ def __init__(self, normalized_form, original_word): self.original_word = original_word self.normalized_form = normalized_form self.mystem_tags = '' + self.pymorphy_tags = '' def __str__(self): - return f"{self.normalized_form}<{self.mystem_tags}>" + return f"{self.normalized_form}<{self.mystem_tags}>({self.pymorphy_tags})" class CorpusManager: @@ -47,14 +49,13 @@ class CorpusManager: def __init__(self, path_to_raw_txt_data: str): self.path_to_raw_txt_date = path_to_raw_txt_data self._storage = {} - self._scan_dataset() def _scan_dataset(self): """ Register each dataset entry """ - for file in Path(self.path_to_raw_txt_date).glob('*_raw.txt'): + for file in Path(self.path_to_raw_txt_date).rglob('*_raw.txt'): id = str(file).split('\\')[-1].split('_')[0] self._storage[id] = Article(url=None, article_id=id) @@ -72,14 +73,14 @@ class TextProcessingPipeline: def __init__(self, corpus_manager: CorpusManager): self.corpus_manager = corpus_manager - self._text = '' + self.raw_text = '' def run(self): """ Runs pipeline process scenario """ for article in self.corpus_manager.get_articles().values(): - self.text_ = article.get_raw_text() + self.raw_text = article.get_raw_text() processed_text = list(map(str, self._process())) article.save_processed(' '.join(processed_text)) @@ -87,22 +88,21 @@ def _process(self) -> List[type(MorphologicalToken)]: """ Performs processing of each text """ - text = self.text_ - result = Mystem().analyze(text) + result = Mystem().analyze(self.raw_text) tokens = [] for word in result: try: token = MorphologicalToken(original_word=word['text'], normalized_form=word['analysis'][0]['lex']) token.mystem_tags = word['analysis'][0]['gr'] + tokens.append(token) except (IndexError, KeyError): if not word['text'].isnumeric(): continue - token = MorphologicalToken(original_word=word['text'], normalized_form=word['text']) - - tokens.append(token) + for token in tokens: + token.pymorphy_tags = MorphAnalyzer().parse(token.original_word)[0].tag - return tokens + return tokens def validate_dataset(path_to_validate): 
diff --git a/requirements.txt b/requirements.txt index 7cc4f073..06ccd482 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ beautifulsoup4==4.9.3 lxml==4.6.2 +pymorphy2==0.9.1 pymystem3==0.2.0 requests==2.25.1 \ No newline at end of file diff --git a/scrapper.py b/scrapper.py index f90e25db..436b02e2 100644 --- a/scrapper.py +++ b/scrapper.py @@ -48,7 +48,7 @@ def __init__(self, seed_urls: list, max_articles: int, max_articles_per_seed: in @staticmethod def _extract_url(article_bs): - article_link = article_bs.find('h2', class_="G3ad").find('a').get('href') + article_link = article_bs.find('h2', class_="G9ax").find('a').get('href') return 'https://www.e1.ru' + article_link def find_articles(self): @@ -63,7 +63,7 @@ def find_articles(self): sleep(random.randrange(2, 6)) response.encoding = 'utf-8' page_soup = BeautifulSoup(response.content, features='lxml') - article_soup = page_soup.find_all('article', class_="G3aj-") + article_soup = page_soup.find_all('article', class_="G9alp") for article in article_soup[:max_articles_per_seed]: seed_url = self._extract_url(article) self.urls.append(seed_url) @@ -87,15 +87,15 @@ def __init__(self, full_url: str, article_id: int): self.article = Article(full_url, article_id) def _fill_article_with_text(self, article_soup): - article_text = article_soup.find('div', class_="GDagz").find('div').find_all('p') + article_text = article_soup.find('div', class_="GFahz").find('div').find_all('p') for par in article_text: self.article.text += par.text.strip() + '\n' def _fill_article_with_meta_information(self, article_soup): - self.article.title = article_soup.find('h2', class_="C7r1 C7t- KBad").find('span').text + self.article.title = article_soup.find('h2', class_="CRqd CRsn JPax").find('span').text self.article.author = 'NOT FOUND' - self.article.topics = article_soup.find('a', class_="C7sl C7uh KBal9").find('span').text - self.article.date = article_soup.find('time', class_="HDk-").find('a').text + self.article.topics = article_soup.find('a', class_="CRqz CRsv JPall").find('span').text + self.article.date = article_soup.find('time', class_="HHkz").find('a').text @staticmethod def unify_date_format(date_str): diff --git a/target_score.txt b/target_score.txt index a7013d3a..72ecf9ea 100644 --- a/target_score.txt +++ b/target_score.txt @@ -2,4 +2,4 @@ 6 # Target score for pipeline.py: -6 +8 From d3b073ca322a2956165c2d1c9166a731b4fe5a1b Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Fri, 2 Apr 2021 15:22:15 +0300 Subject: [PATCH 26/33] fix lint --- pipeline.py | 12 ++++++++---- scrapper.py | 5 ++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pipeline.py b/pipeline.py index 11be3ef1..59e185c5 100644 --- a/pipeline.py +++ b/pipeline.py @@ -1,10 +1,11 @@ """ Pipeline for text processing implementation """ + +from pathlib import Path +from typing import List from pymorphy2 import MorphAnalyzer from pymystem3 import Mystem -from typing import List -from pathlib import Path from article import Article from constants import ASSETS_PATH @@ -56,8 +57,8 @@ def _scan_dataset(self): Register each dataset entry """ for file in Path(self.path_to_raw_txt_date).rglob('*_raw.txt'): - id = str(file).split('\\')[-1].split('_')[0] - self._storage[id] = Article(url=None, article_id=id) + id_each = str(file).split('\\')[-1].split('_')[0] + self._storage[id] = Article(url=None, article_id=id_each) def get_articles(self): """ @@ -104,6 +105,9 @@ def _process(self) -> List[type(MorphologicalToken)]: return 
tokens + def public_method(self): + pass + def validate_dataset(path_to_validate): """ diff --git a/scrapper.py b/scrapper.py index 436b02e2..36538745 100644 --- a/scrapper.py +++ b/scrapper.py @@ -1,15 +1,14 @@ """ Crawler implementation """ -import json import os +import json import random +from time import sleep import requests - from bs4 import BeautifulSoup from article import Article from constants import CRAWLER_CONFIG_PATH, HEADERS, ASSETS_PATH -from time import sleep class IncorrectURLError(Exception): From 70e0459ab62a796402281d637e8c7277d70210d6 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Fri, 2 Apr 2021 15:25:24 +0300 Subject: [PATCH 27/33] pls --- pipeline.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pipeline.py b/pipeline.py index 59e185c5..0bbe5fe9 100644 --- a/pipeline.py +++ b/pipeline.py @@ -41,6 +41,9 @@ def __init__(self, normalized_form, original_word): def __str__(self): return f"{self.normalized_form}<{self.mystem_tags}>({self.pymorphy_tags})" + def public_method(self): + pass + class CorpusManager: """ @@ -66,6 +69,9 @@ def get_articles(self): """ return self._storage + def public_method(self): + pass + class TextProcessingPipeline: """ From 0f7d4a9e1d24ad5f50f023809ca2c097fc15c512 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Fri, 2 Apr 2021 15:37:58 +0300 Subject: [PATCH 28/33] maybe --- scrapper.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/scrapper.py b/scrapper.py index 36538745..ba2bc933 100644 --- a/scrapper.py +++ b/scrapper.py @@ -63,10 +63,10 @@ def find_articles(self): response.encoding = 'utf-8' page_soup = BeautifulSoup(response.content, features='lxml') article_soup = page_soup.find_all('article', class_="G9alp") - for article in article_soup[:max_articles_per_seed]: - seed_url = self._extract_url(article) + for articles in article_soup[:max_num_per_seed]: + seed_url = self._extract_url(articles) self.urls.append(seed_url) - if len(self.urls) == max_articles: + if len(self.urls) == max_num_articles: return self.urls def get_search_urls(self): @@ -83,7 +83,7 @@ class ArticleParser: def __init__(self, full_url: str, article_id: int): self.full_url = full_url self.article_id = article_id - self.article = Article(full_url, article_id) + self.article = Article(url=full_url, article_id=article_id) def _fill_article_with_text(self, article_soup): article_text = article_soup.find('div', class_="GFahz").find('div').find_all('p') @@ -156,11 +156,11 @@ def validate_config(crawler_path): if __name__ == '__main__': - #YOUR CODE HERE - seed_urls, max_articles, max_articles_per_seed = validate_config(CRAWLER_CONFIG_PATH) - crawler = Crawler(seed_urls=seed_urls, - max_articles=max_articles, - max_articles_per_seed=max_articles_per_seed) + # YOUR CODE HERE + seed_urls_list, max_num_articles, max_num_per_seed = validate_config(CRAWLER_CONFIG_PATH) + crawler = Crawler(seed_urls=seed_urls_list, + max_articles=max_num_articles, + max_articles_per_seed=max_num_per_seed) crawler.find_articles() prepare_environment(ASSETS_PATH) for article_id, article_url in enumerate(crawler.urls, 1): From 77ec68052874a3577aec0dd30c43f496220de735 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Fri, 2 Apr 2021 15:40:10 +0300 Subject: [PATCH 29/33] pofig --- target_score.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target_score.txt b/target_score.txt index 72ecf9ea..a7013d3a 100644 --- 
a/target_score.txt +++ b/target_score.txt @@ -2,4 +2,4 @@ 6 # Target score for pipeline.py: -8 +6 From b225c89b8df1d0e7cc35d512e4f5d16b9674b136 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Fri, 2 Apr 2021 15:47:39 +0300 Subject: [PATCH 30/33] fix scrapper lint --- scrapper.py | 4 ++-- target_score.txt | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scrapper.py b/scrapper.py index ba2bc933..8b8eb905 100644 --- a/scrapper.py +++ b/scrapper.py @@ -163,7 +163,7 @@ def validate_config(crawler_path): max_articles_per_seed=max_num_per_seed) crawler.find_articles() prepare_environment(ASSETS_PATH) - for article_id, article_url in enumerate(crawler.urls, 1): - parser = ArticleParser(article_url, article_id) + for article_id_num, article_url in enumerate(crawler.urls, 1): + parser = ArticleParser(full_url=article_url, article_id=article_id_num) article = parser.parse() article.save_raw() diff --git a/target_score.txt b/target_score.txt index a7013d3a..6b09f939 100644 --- a/target_score.txt +++ b/target_score.txt @@ -1,5 +1,5 @@ # Target score for scrapper.py: -6 +8 # Target score for pipeline.py: -6 +8 From 5725b05eefda3d76ef97a0cb07ab260f0354ad28 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Fri, 2 Apr 2021 16:10:49 +0300 Subject: [PATCH 31/33] may be --- scrapper.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/scrapper.py b/scrapper.py index 8b8eb905..1dbb69b0 100644 --- a/scrapper.py +++ b/scrapper.py @@ -3,8 +3,6 @@ """ import os import json -import random -from time import sleep import requests from bs4 import BeautifulSoup from article import Article @@ -58,16 +56,19 @@ def find_articles(self): response = requests.get(url, headers=HEADERS) if not response: raise IncorrectURLError - if response.status_code == 200: - sleep(random.randrange(2, 6)) - response.encoding = 'utf-8' + page_soup = BeautifulSoup(response.content, features='lxml') article_soup = page_soup.find_all('article', class_="G9alp") + for articles in article_soup[:max_num_per_seed]: seed_url = self._extract_url(articles) self.urls.append(seed_url) + if len(self.urls) == max_num_articles: - return self.urls + break + + if len(self.urls) == max_num_articles: + break def get_search_urls(self): """ From c9f1c59e6fe1fd5b751222b5d2dd352414c07172 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Fri, 2 Apr 2021 16:26:21 +0300 Subject: [PATCH 32/33] pp --- target_score.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target_score.txt b/target_score.txt index 6b09f939..72ecf9ea 100644 --- a/target_score.txt +++ b/target_score.txt @@ -1,5 +1,5 @@ # Target score for scrapper.py: -8 +6 # Target score for pipeline.py: 8 From 3b4bdb851624305f2eb3ac72dc58f9a555b24655 Mon Sep 17 00:00:00 2001 From: ffmiil <65333072+ffmiil@users.noreply.github.com> Date: Fri, 2 Apr 2021 17:05:10 +0300 Subject: [PATCH 33/33] is this win? 
--- pipeline.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/pipeline.py b/pipeline.py index 0bbe5fe9..302f9805 100644 --- a/pipeline.py +++ b/pipeline.py @@ -60,8 +60,8 @@ def _scan_dataset(self): Register each dataset entry """ for file in Path(self.path_to_raw_txt_date).rglob('*_raw.txt'): - id_each = str(file).split('\\')[-1].split('_')[0] - self._storage[id] = Article(url=None, article_id=id_each) + id_each = int(file.parts[-1].split('_')[0]) + self._storage[id_each] = Article(url=None, article_id=id_each) def get_articles(self): """ @@ -95,19 +95,15 @@ def _process(self) -> List[type(MorphologicalToken)]: """ Performs processing of each text """ - result = Mystem().analyze(self.raw_text) + process = Mystem().analyze(self.raw_text) tokens = [] - for word in result: - try: - token = MorphologicalToken(original_word=word['text'], normalized_form=word['analysis'][0]['lex']) - token.mystem_tags = word['analysis'][0]['gr'] - tokens.append(token) - except (IndexError, KeyError): - if not word['text'].isnumeric(): - continue - for token in tokens: - token.pymorphy_tags = MorphAnalyzer().parse(token.original_word)[0].tag + for tok in process: + if tok.get('analysis') and tok.get('text'): + morph_token = MorphologicalToken(original_word=tok['text'], normalized_form=tok['analysis'][0]['lex']) + morph_token.mystem_tags = tok['analysis'][0]['gr'] + morph_token.pymorphy_tags = MorphAnalyzer().parse(word=morph_token.original_word)[0].tag + tokens.append(morph_token) return tokens
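
For reference, a minimal standalone sketch of the Mystem + pymorphy2 token processing that the series converges on in this last patch. It assumes pymystem3 (which fetches the mystem binary on first run) and pymorphy2 with its Russian dictionaries are installed; MorphologicalToken below is a simplified stand-in for the class in pipeline.py, and both analyzers are created once up front, since MorphAnalyzer loads its dictionaries at construction and re-creating it for every word, as the patched _process does, is comparatively expensive.

from pymorphy2 import MorphAnalyzer
from pymystem3 import Mystem


class MorphologicalToken:
    """Simplified stand-in for the token class defined in pipeline.py."""

    def __init__(self, original_word, normalized_form):
        self.original_word = original_word
        self.normalized_form = normalized_form
        self.mystem_tags = ''
        self.pymorphy_tags = ''

    def __str__(self):
        return f"{self.normalized_form}<{self.mystem_tags}>({self.pymorphy_tags})"


def process(raw_text):
    mystem = Mystem()           # instantiate once, not per token
    pymorphy = MorphAnalyzer()  # dictionary loading happens here
    tokens = []
    for tok in mystem.analyze(raw_text):
        # Mystem returns dicts; 'analysis' is present and non-empty only for words
        # it could lemmatize, so punctuation and digits are skipped by this check.
        if tok.get('analysis') and tok.get('text'):
            token = MorphologicalToken(original_word=tok['text'],
                                       normalized_form=tok['analysis'][0]['lex'])
            token.mystem_tags = tok['analysis'][0]['gr']
            token.pymorphy_tags = pymorphy.parse(token.original_word)[0].tag
            tokens.append(token)
    return tokens


if __name__ == '__main__':
    for token in process('Мама мыла раму.'):
        print(token)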