From 3261adb3a53ff7421cced147a0e2a1a687484914 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Sun, 28 Feb 2021 15:52:16 +0300 Subject: [PATCH 01/50] target score change --- target_score.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/target_score.txt b/target_score.txt index a404aa74..cf79a25c 100644 --- a/target_score.txt +++ b/target_score.txt @@ -1,8 +1,8 @@ # Target score for scrapper.py: -6 +10 # Target score for pipeline.py: -6 +10 # Skip pipeline checks: 1 From f471f4a267f948671f2b91ab737ed12d636b8384 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Mon, 8 Mar 2021 22:10:34 +0300 Subject: [PATCH 02/50] completed stages 1 and 2 --- crawler_config.json | 12 +++-- scrapper.py | 104 +++++++++++++++++++++++++++++++++++++++----- 2 files changed, 102 insertions(+), 14 deletions(-) diff --git a/crawler_config.json b/crawler_config.json index e60ce0f7..80f0c5d8 100644 --- a/crawler_config.json +++ b/crawler_config.json @@ -1,5 +1,11 @@ { - "base_urls": [], - "total_articles_to_find_and_parse": 0, - "max_number_articles_to_get_from_one_seed": 0 + "base_urls": ["https://burunen.ru/news/society/", + "https://burunen.ru/news/culture/", + "https://burunen.ru/news/economy/", + "https://burunen.ru/news/sports/", + "https://burunen.ru/news/incidents/", + "https://burunen.ru/news/politic/" + ], + "total_articles_to_find_and_parse": 20, + "max_number_articles_to_get_from_one_seed": 25 } \ No newline at end of file diff --git a/scrapper.py b/scrapper.py index 43aecef5..d4460dc1 100644 --- a/scrapper.py +++ b/scrapper.py @@ -2,6 +2,15 @@ Crawler implementation """ +import json +import requests +from requests.exceptions import RequestException +from bs4 import BeautifulSoup +from time import sleep as wait +from article import Article + + +CRAWLER_CONFIG_PATH = 'crawler_config.json' class IncorrectURLError(Exception): """ @@ -31,18 +40,53 @@ class Crawler: """ Crawler implementation """ - def __init__(self, seed_urls: list, max_articles: int): - pass + def __init__(self, seed_urls: list, total_max_articles: int, max_articles_per_seed: int): + self.seed_urls = seed_urls + self.total_max_articles = total_max_articles + self.max_articles_per_seed = max_articles_per_seed + self.urls = [] + + self.URL_START = 'https://burunen.ru' @staticmethod - def _extract_url(article_bs): - pass + def _extract_url(article_bs, seen): + extracted = list(set([link['href'] for link in article_bs.find_all('a', href=True)])) + # print(extracted) + # print(' ',extracted) + return list(filter(lambda x: True if x.startswith('/news/') + and x not in seen + # and any(map(lambda y: y.isdigit(), x)) + else False, extracted)) def find_articles(self): """ Finds articles """ - pass + for url in self.seed_urls: + article_bs = BeautifulSoup(requests.get(url, 'html.parser').text, 'html.parser') + newfound = self._extract_url(article_bs, self.urls) + self.urls.extend(newfound[:self.max_articles_per_seed]) + self.urls = [i for i in self.urls if len(i) > 20][:self.total_max_articles] + print('Scraped seed urls, overall number of urls is', len(self.urls)) + + old = len(self.urls) + while len(self.urls) < self.total_max_articles: + print('Due to insufficient number started further iteration') + print('current number', len(self.urls), ', required', self.total_max_articles) + for url in self.urls: + article_bs = BeautifulSoup(requests.get(self.URL_START + url, 'html.parser').text, 'html.parser') + newfound = filter(lambda x: len(x) > 20, self._extract_url(article_bs, self.urls)) + print(' checked new url, found', len(newfound), 'articles') + self.urls.extend(newfound[:self.max_articles_per_seed]) + # wait(10) + if len(self.urls) > self.total_max_articles: + break + if len(self.urls) == old: + print('Something is wrong with scraping parameters') + break + + self.urls = self.urls[:self.total_max_articles] + def get_search_urls(self): """ @@ -56,10 +100,14 @@ class ArticleParser: ArticleParser implementation """ def __init__(self, full_url: str, article_id: int): - pass + self.full_url = full_url + self.article_id = article_id + self.article = Article(self.full_url, self.article_id) def _fill_article_with_text(self, article_soup): - pass + all_text = article_soup.find('div', {'class' : 'text letter', 'itemprop' : 'articleBody'}).text + text = (all_text..split('Автор:')[0].strip()) + self.text = text def _fill_article_with_meta_information(self, article_soup): pass @@ -75,7 +123,11 @@ def parse(self): """ Parses each article """ - pass + html = requests.get(self.full_url, 'html.parser').text + article_bs = BeautifulSoup(html, 'html.parser') + self._fill_article_with_text(article_bs) + # self._fill_article_with_text(article_bs) + # self._fill_article_with_meta_information(article_bs) def prepare_environment(base_path): @@ -89,9 +141,39 @@ def validate_config(crawler_path): """ Validates given config """ - pass + with open(crawler_path) as crawler_config: + config = json.load(crawler_config) + try: + good_response = list(map(lambda link: True if requests.get(link).status_code == 200 else False, + config['base_urls'])) + except RequestException: + raise IncorrectURLError + except Exception: + raise UnknownConfigError + if not all(good_response): + raise IncorrectURLError + if not all((isinstance(config['total_articles_to_find_and_parse'], int), + isinstance(config['max_number_articles_to_get_from_one_seed'], int))): + raise IncorrectNumberOfArticlesError + if not config['total_articles_to_find_and_parse'] < config['max_number_articles_to_get_from_one_seed']\ + * len(good_response): + raise NumberOfArticlesOutOfRangeError + return config['base_urls'], config['total_articles_to_find_and_parse'], \ + config['max_number_articles_to_get_from_one_seed'] if __name__ == '__main__': - # YOUR CODE HERE - pass + seed_urls, max_articles, max_articles_per_seed = validate_config(CRAWLER_CONFIG_PATH) + crawler = Crawler(seed_urls=seed_urls, + total_max_articles=max_articles, + max_articles_per_seed=max_articles_per_seed) + + crawler.find_articles() + print('Scraped', len(crawler.urls), 'articles') + + print('onto parsing') + + for n, url in enumerate(crawler.urls[:1]): + full_url = crawler.URL_START + url + parser = ArticleParser(full_url, n) + article = parser.parse() From dbcaa4d6b532c6e6c05aea72a4b1e9f93c067993 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Tue, 9 Mar 2021 15:10:15 +0300 Subject: [PATCH 03/50] build for four --- article.py | 16 ++++++++-------- config/constants.py | 9 +++++++++ scrapper.py | 42 ++++++++++++++++++++++++++---------------- target_score.txt | 4 ++-- 4 files changed, 45 insertions(+), 26 deletions(-) create mode 100644 config/constants.py diff --git a/article.py b/article.py index 1d759cd2..d7547150 100644 --- a/article.py +++ b/article.py @@ -38,14 +38,14 @@ def save_raw(self): with open(self._get_raw_text_path(), 'w', encoding='utf-8') as file: file.write(self.text) - - with open(os.path.join(ASSETS_PATH, article_meta_name), "w", encoding='utf-8') as file: - json.dump(self._get_meta(), - file, - sort_keys=False, - indent=4, - ensure_ascii=False, - separators=(',', ': ')) + # + # with open(os.path.join(ASSETS_PATH, article_meta_name), "w", encoding='utf-8') as file: + # json.dump(self._get_meta(), + # file, + # sort_keys=False, + # indent=4, + # ensure_ascii=False, + # separators=(',', ': ')) @staticmethod def from_meta_json(json_path: str): diff --git a/config/constants.py b/config/constants.py new file mode 100644 index 00000000..12d85256 --- /dev/null +++ b/config/constants.py @@ -0,0 +1,9 @@ +""" +Useful constant variables +""" + +import os + +PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__)) +ASSETS_PATH = os.path.join(PROJECT_ROOT, 'tmp', 'articles') +CRAWLER_CONFIG_PATH = os.path.join(PROJECT_ROOT, 'crawler_config.json') diff --git a/scrapper.py b/scrapper.py index d4460dc1..e217300e 100644 --- a/scrapper.py +++ b/scrapper.py @@ -6,11 +6,14 @@ import requests from requests.exceptions import RequestException from bs4 import BeautifulSoup -from time import sleep as wait +# from time import sleep as wait from article import Article +import re CRAWLER_CONFIG_PATH = 'crawler_config.json' +NEWLINES_RE = re.compile(r"\n{2,}") # two or more "\n" characters + class IncorrectURLError(Exception): """ @@ -66,7 +69,8 @@ def find_articles(self): article_bs = BeautifulSoup(requests.get(url, 'html.parser').text, 'html.parser') newfound = self._extract_url(article_bs, self.urls) self.urls.extend(newfound[:self.max_articles_per_seed]) - self.urls = [i for i in self.urls if len(i) > 20][:self.total_max_articles] + self.urls = [i for i in self.urls if len(i) > 20 + and not any(map(lambda y: y.isupper(), i))][:self.total_max_articles] print('Scraped seed urls, overall number of urls is', len(self.urls)) old = len(self.urls) @@ -82,12 +86,11 @@ def find_articles(self): if len(self.urls) > self.total_max_articles: break if len(self.urls) == old: - print('Something is wrong with scraping parameters') + print(' Something is wrong with scraping parameters') break self.urls = self.urls[:self.total_max_articles] - def get_search_urls(self): """ Returns seed_urls param @@ -105,9 +108,12 @@ def __init__(self, full_url: str, article_id: int): self.article = Article(self.full_url, self.article_id) def _fill_article_with_text(self, article_soup): - all_text = article_soup.find('div', {'class' : 'text letter', 'itemprop' : 'articleBody'}).text - text = (all_text..split('Автор:')[0].strip()) - self.text = text + try: + text = article_soup.find('div', {'class': 'text letter', 'itemprop': 'articleBody'}).text.strip() + # text = NEWLINES_RE.split(all_text) # regex splitting + self.article.text = text + except AttributeError: + print(' unable to parse', self.full_url) def _fill_article_with_meta_information(self, article_soup): pass @@ -123,18 +129,21 @@ def parse(self): """ Parses each article """ + # print(self.full_url) + self.article.url = self.full_url + self.article.article_id = self.article_id html = requests.get(self.full_url, 'html.parser').text article_bs = BeautifulSoup(html, 'html.parser') self._fill_article_with_text(article_bs) - # self._fill_article_with_text(article_bs) # self._fill_article_with_meta_information(article_bs) + self.article.save_raw() -def prepare_environment(base_path): - """ - Creates ASSETS_PATH folder if not created and removes existing folder - """ - pass +# def prepare_environment(base_path): +# """ +# Creates ASSETS_PATH folder if not created and removes existing folder +# """ +# pass def validate_config(crawler_path): @@ -169,11 +178,12 @@ def validate_config(crawler_path): max_articles_per_seed=max_articles_per_seed) crawler.find_articles() - print('Scraped', len(crawler.urls), 'articles') + # print('Scraped', len(crawler.urls), 'articles') print('onto parsing') - for n, url in enumerate(crawler.urls[:1]): + for n, url in enumerate(crawler.urls): full_url = crawler.URL_START + url parser = ArticleParser(full_url, n) - article = parser.parse() + parser.parse() + print('parsing is finished') diff --git a/target_score.txt b/target_score.txt index 686883dc..ad4942b9 100644 --- a/target_score.txt +++ b/target_score.txt @@ -1,6 +1,6 @@ # Target score for scrapper.py: -6 +4 # Target score for pipeline.py: -0 \ No newline at end of file +10 \ No newline at end of file From 0c35e7729b07b73bc8b9f9991c291783867044d4 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Tue, 9 Mar 2021 15:50:36 +0300 Subject: [PATCH 04/50] build for four, fixed target score --- config/raw_metadata_score_four_test.py | 1 + scrapper.py | 21 ++++++++++++++++++--- target_score.txt | 1 - 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/config/raw_metadata_score_four_test.py b/config/raw_metadata_score_four_test.py index d8691879..e6900c05 100644 --- a/config/raw_metadata_score_four_test.py +++ b/config/raw_metadata_score_four_test.py @@ -16,6 +16,7 @@ def setUp(self) -> None: def test_validate_sort(self): list_ids = [pair[0] for pair in self.texts] + print(list_ids) for i in range(1, len(list_ids)+1): self.assertTrue(i in list_ids, msg="""Articles ids are not homogeneous. E.g. numbers are not from 1 to N""") diff --git a/scrapper.py b/scrapper.py index e217300e..b643e044 100644 --- a/scrapper.py +++ b/scrapper.py @@ -116,7 +116,22 @@ def _fill_article_with_text(self, article_soup): print(' unable to parse', self.full_url) def _fill_article_with_meta_information(self, article_soup): - pass + try: + title = article_soup.title.text + self.article.title = title + + credits = article_soup.find('div', {'class': 'credits t-caption'}).text.strip().split('\n')[0] + if 'Автор:' in credits: + author = article_soup.find('div', {'class': 'credits t-caption'}).text.strip().split('\n')[0][7:] + elif 'Источник:' in credits: + author = article_soup.find('div', {'class': 'credits t-caption'}).text.strip().split('\n')[0][9:].strip() + else: + author = '' + self.article.author = author + except AttributeError: + print(' something is off with', self.full_url) + # print(title) + # self.article.title = title @staticmethod def unify_date_format(date_str): @@ -135,7 +150,7 @@ def parse(self): html = requests.get(self.full_url, 'html.parser').text article_bs = BeautifulSoup(html, 'html.parser') self._fill_article_with_text(article_bs) - # self._fill_article_with_meta_information(article_bs) + self._fill_article_with_meta_information(article_bs) self.article.save_raw() @@ -184,6 +199,6 @@ def validate_config(crawler_path): for n, url in enumerate(crawler.urls): full_url = crawler.URL_START + url - parser = ArticleParser(full_url, n) + parser = ArticleParser(full_url, n + 1) parser.parse() print('parsing is finished') diff --git a/target_score.txt b/target_score.txt index ad4942b9..221e98ce 100644 --- a/target_score.txt +++ b/target_score.txt @@ -1,4 +1,3 @@ - # Target score for scrapper.py: 4 From 8a09da7d8e34f01d2ec2632c3fa7a461ba40c9e4 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Tue, 9 Mar 2021 21:09:46 +0300 Subject: [PATCH 05/50] build for idk what score --- article.py | 18 +++++++++--------- scrapper.py | 37 ++++++++++++++++++++++++++++++------- target_score.txt | 4 ++-- 3 files changed, 41 insertions(+), 18 deletions(-) diff --git a/article.py b/article.py index d7547150..907f4f30 100644 --- a/article.py +++ b/article.py @@ -38,14 +38,14 @@ def save_raw(self): with open(self._get_raw_text_path(), 'w', encoding='utf-8') as file: file.write(self.text) - # - # with open(os.path.join(ASSETS_PATH, article_meta_name), "w", encoding='utf-8') as file: - # json.dump(self._get_meta(), - # file, - # sort_keys=False, - # indent=4, - # ensure_ascii=False, - # separators=(',', ': ')) + + with open(os.path.join(ASSETS_PATH, article_meta_name), "w", encoding='utf-8') as file: + json.dump(self._get_meta(), + file, + sort_keys=False, + indent=4, + ensure_ascii=False, + separators=(',', ': ')) @staticmethod def from_meta_json(json_path: str): @@ -99,7 +99,7 @@ def _date_to_text(self): """ Converts datetime object to text """ - return self.date.strftime("%Y-%m-%d %H:%M:%S") + return self.date.strftime("%Y-%m-%d") def _get_raw_text_path(self): """ diff --git a/scrapper.py b/scrapper.py index b643e044..f149dd2a 100644 --- a/scrapper.py +++ b/scrapper.py @@ -9,7 +9,8 @@ # from time import sleep as wait from article import Article import re - +from datetime import date +import os CRAWLER_CONFIG_PATH = 'crawler_config.json' NEWLINES_RE = re.compile(r"\n{2,}") # two or more "\n" characters @@ -128,6 +129,11 @@ def _fill_article_with_meta_information(self, article_soup): else: author = '' self.article.author = author + date = article_soup.find('div', {'class': 'b-caption'}).text.strip().split('\n')[1] + self.article.date = self.unify_date_format(date) + + topic = article_soup.find('div', {'class': 'b-caption'}).text.strip().split('\n')[0] + self.article.topics = topic except AttributeError: print(' something is off with', self.full_url) # print(title) @@ -138,7 +144,22 @@ def unify_date_format(date_str): """ Unifies date format """ - pass + day, month, year = date_str.split() + if len(day) == 1: + day = '0' + day + match = {'янв': '01', + 'фев': '02', + 'мар': '03', + 'апр': '04', + 'май': '05', + 'июн': '06', + 'июл': '07', + 'авг': '08', + 'сен': '09', + 'окт': '10', + 'ноя': '11', + 'дек': '12'} + return date.fromisoformat(year + '-' + match[month] + '-' + day) def parse(self): """ @@ -154,11 +175,13 @@ def parse(self): self.article.save_raw() -# def prepare_environment(base_path): -# """ -# Creates ASSETS_PATH folder if not created and removes existing folder -# """ -# pass +def prepare_environment(base_path): + """ + Creates ASSETS_PATH folder if not created and removes existing folder + """ + newpath = r'{}/ASSETS_PATH'.format(base_path) + if not os.path.exists(newpath): + os.makedirs(newpath) def validate_config(crawler_path): diff --git a/target_score.txt b/target_score.txt index 221e98ce..dde8d696 100644 --- a/target_score.txt +++ b/target_score.txt @@ -1,5 +1,5 @@ # Target score for scrapper.py: -4 +8 # Target score for pipeline.py: -10 \ No newline at end of file +8 \ No newline at end of file From 6fee8b34923b49beafb2c94337ecdbed5c14df20 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Tue, 9 Mar 2021 21:27:57 +0300 Subject: [PATCH 06/50] fixed linting a little --- article.py | 6 +++--- scrapper.py | 49 +++++++++++++++++++++++++------------------------ 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/article.py b/article.py index 907f4f30..f471cb6e 100644 --- a/article.py +++ b/article.py @@ -46,7 +46,7 @@ def save_raw(self): indent=4, ensure_ascii=False, separators=(',', ': ')) - + @staticmethod def from_meta_json(json_path: str): """ @@ -94,13 +94,13 @@ def _get_meta(self): 'author': self.author, 'topics': self.topics } - + def _date_to_text(self): """ Converts datetime object to text """ return self.date.strftime("%Y-%m-%d") - + def _get_raw_text_path(self): """ Returns path for requested raw article diff --git a/scrapper.py b/scrapper.py index f149dd2a..6efd8294 100644 --- a/scrapper.py +++ b/scrapper.py @@ -2,15 +2,15 @@ Crawler implementation """ +import os +import re import json +from datetime import date import requests from requests.exceptions import RequestException from bs4 import BeautifulSoup # from time import sleep as wait from article import Article -import re -from datetime import date -import os CRAWLER_CONFIG_PATH = 'crawler_config.json' NEWLINES_RE = re.compile(r"\n{2,}") # two or more "\n" characters @@ -50,7 +50,7 @@ def __init__(self, seed_urls: list, total_max_articles: int, max_articles_per_se self.max_articles_per_seed = max_articles_per_seed self.urls = [] - self.URL_START = 'https://burunen.ru' + self.URLSTART = 'https://burunen.ru' @staticmethod def _extract_url(article_bs, seen): @@ -66,8 +66,8 @@ def find_articles(self): """ Finds articles """ - for url in self.seed_urls: - article_bs = BeautifulSoup(requests.get(url, 'html.parser').text, 'html.parser') + for link in self.seed_urls: + article_bs = BeautifulSoup(requests.get(link, 'html.parser').text, 'html.parser') newfound = self._extract_url(article_bs, self.urls) self.urls.extend(newfound[:self.max_articles_per_seed]) self.urls = [i for i in self.urls if len(i) > 20 @@ -78,8 +78,8 @@ def find_articles(self): while len(self.urls) < self.total_max_articles: print('Due to insufficient number started further iteration') print('current number', len(self.urls), ', required', self.total_max_articles) - for url in self.urls: - article_bs = BeautifulSoup(requests.get(self.URL_START + url, 'html.parser').text, 'html.parser') + for link in self.urls: + article_bs = BeautifulSoup(requests.get(self.URLSTART + link, 'html.parser').text, 'html.parser') newfound = filter(lambda x: len(x) > 20, self._extract_url(article_bs, self.urls)) print(' checked new url, found', len(newfound), 'articles') self.urls.extend(newfound[:self.max_articles_per_seed]) @@ -103,8 +103,8 @@ class ArticleParser: """ ArticleParser implementation """ - def __init__(self, full_url: str, article_id: int): - self.full_url = full_url + def __init__(self, full__url: str, article_id: int): + self.full_url = full__url self.article_id = article_id self.article = Article(self.full_url, self.article_id) @@ -121,16 +121,17 @@ def _fill_article_with_meta_information(self, article_soup): title = article_soup.title.text self.article.title = title - credits = article_soup.find('div', {'class': 'credits t-caption'}).text.strip().split('\n')[0] - if 'Автор:' in credits: + credit = article_soup.find('div', {'class': 'credits t-caption'}).text.strip().split('\n')[0] + if 'Автор:' in credit: author = article_soup.find('div', {'class': 'credits t-caption'}).text.strip().split('\n')[0][7:] - elif 'Источник:' in credits: - author = article_soup.find('div', {'class': 'credits t-caption'}).text.strip().split('\n')[0][9:].strip() + elif 'Источник:' in credit: + author = article_soup.find('div', {'class': 'credits t-caption'}).text.strip() + author = author.split('\n')[0][9:].strip() else: author = '' self.article.author = author - date = article_soup.find('div', {'class': 'b-caption'}).text.strip().split('\n')[1] - self.article.date = self.unify_date_format(date) + when = article_soup.find('div', {'class': 'b-caption'}).text.strip().split('\n')[1] + self.article.date = self.unify_date_format(when) topic = article_soup.find('div', {'class': 'b-caption'}).text.strip().split('\n')[0] self.article.topics = topic @@ -193,10 +194,10 @@ def validate_config(crawler_path): try: good_response = list(map(lambda link: True if requests.get(link).status_code == 200 else False, config['base_urls'])) - except RequestException: - raise IncorrectURLError - except Exception: - raise UnknownConfigError + except RequestException as e: + raise IncorrectURLError from e + except Exception as e: + raise UnknownConfigError from e if not all(good_response): raise IncorrectURLError if not all((isinstance(config['total_articles_to_find_and_parse'], int), @@ -210,10 +211,10 @@ def validate_config(crawler_path): if __name__ == '__main__': - seed_urls, max_articles, max_articles_per_seed = validate_config(CRAWLER_CONFIG_PATH) - crawler = Crawler(seed_urls=seed_urls, + seedurls, max_articles, max_arts_per_seed = validate_config(CRAWLER_CONFIG_PATH) + crawler = Crawler(seed_urls=seedurls, total_max_articles=max_articles, - max_articles_per_seed=max_articles_per_seed) + max_articles_per_seed=max_arts_per_seed) crawler.find_articles() # print('Scraped', len(crawler.urls), 'articles') @@ -221,7 +222,7 @@ def validate_config(crawler_path): print('onto parsing') for n, url in enumerate(crawler.urls): - full_url = crawler.URL_START + url + full_url = crawler.URLSTART + url parser = ArticleParser(full_url, n + 1) parser.parse() print('parsing is finished') From aafdefd921956ac4c5a4294d6a2ac4444c4a4b72 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Tue, 9 Mar 2021 21:37:37 +0300 Subject: [PATCH 07/50] major link work --- scrapper.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/scrapper.py b/scrapper.py index 6efd8294..9f924bec 100644 --- a/scrapper.py +++ b/scrapper.py @@ -50,17 +50,17 @@ def __init__(self, seed_urls: list, total_max_articles: int, max_articles_per_se self.max_articles_per_seed = max_articles_per_seed self.urls = [] - self.URLSTART = 'https://burunen.ru' + self.url_start = 'https://burunen.ru' @staticmethod def _extract_url(article_bs, seen): - extracted = list(set([link['href'] for link in article_bs.find_all('a', href=True)])) + extracted = list({link['href'] for link in article_bs.find_all('a', href=True)}) # print(extracted) # print(' ',extracted) - return list(filter(lambda x: True if x.startswith('/news/') + return list(filter(lambda x: x.startswith('/news/') and x not in seen # and any(map(lambda y: y.isdigit(), x)) - else False, extracted)) + , extracted)) def find_articles(self): """ @@ -79,8 +79,8 @@ def find_articles(self): print('Due to insufficient number started further iteration') print('current number', len(self.urls), ', required', self.total_max_articles) for link in self.urls: - article_bs = BeautifulSoup(requests.get(self.URLSTART + link, 'html.parser').text, 'html.parser') - newfound = filter(lambda x: len(x) > 20, self._extract_url(article_bs, self.urls)) + article_bs = BeautifulSoup(requests.get(self.url_start + link, 'html.parser').text, 'html.parser') + newfound = list(filter(lambda x: len(x) > 20, self._extract_url(article_bs, self.urls))) print(' checked new url, found', len(newfound), 'articles') self.urls.extend(newfound[:self.max_articles_per_seed]) # wait(10) @@ -192,12 +192,12 @@ def validate_config(crawler_path): with open(crawler_path) as crawler_config: config = json.load(crawler_config) try: - good_response = list(map(lambda link: True if requests.get(link).status_code == 200 else False, + good_response = list(map(lambda link: requests.get(link).status_code == 200, config['base_urls'])) - except RequestException as e: - raise IncorrectURLError from e - except Exception as e: - raise UnknownConfigError from e + except RequestException as exception: + raise IncorrectURLError from exception + except Exception as exception: + raise UnknownConfigError from exception if not all(good_response): raise IncorrectURLError if not all((isinstance(config['total_articles_to_find_and_parse'], int), @@ -222,7 +222,7 @@ def validate_config(crawler_path): print('onto parsing') for n, url in enumerate(crawler.urls): - full_url = crawler.URLSTART + url + full_url = crawler.url_start + url parser = ArticleParser(full_url, n + 1) parser.parse() print('parsing is finished') From 0cc3a91d5a1bc1bd2f2423f2c80c1e232cfa4251 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Tue, 9 Mar 2021 21:49:53 +0300 Subject: [PATCH 08/50] added requirements --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index e69de29b..327297ca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -0,0 +1,2 @@ +beautifulsoup4==4.9.0 +requests==2.23.0 From e3e8eda67d91d7d6155b94493a7173737756e0d1 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Tue, 9 Mar 2021 22:02:02 +0300 Subject: [PATCH 09/50] changed target score --- target_score.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/target_score.txt b/target_score.txt index dde8d696..e91213a7 100644 --- a/target_score.txt +++ b/target_score.txt @@ -1,5 +1,5 @@ # Target score for scrapper.py: -8 +10 # Target score for pipeline.py: -8 \ No newline at end of file +10 \ No newline at end of file From 1355f14609e295e0403bf234ab539ee99c59f3d2 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Thu, 11 Mar 2021 17:49:13 +0300 Subject: [PATCH 10/50] i dont understand why test fails --- scrapper.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scrapper.py b/scrapper.py index 9f924bec..ac063fb4 100644 --- a/scrapper.py +++ b/scrapper.py @@ -200,8 +200,11 @@ def validate_config(crawler_path): raise UnknownConfigError from exception if not all(good_response): raise IncorrectURLError - if not all((isinstance(config['total_articles_to_find_and_parse'], int), - isinstance(config['max_number_articles_to_get_from_one_seed'], int))): + # if not all((isinstance(config['total_articles_to_find_and_parse'], int), + # isinstance(config['max_number_articles_to_get_from_one_seed'], int))): + if not isinstance(config['total_articles_to_find_and_parse'], int): + raise IncorrectNumberOfArticlesError + if not isinstance(config['max_number_articles_to_get_from_one_seed'], int): raise IncorrectNumberOfArticlesError if not config['total_articles_to_find_and_parse'] < config['max_number_articles_to_get_from_one_seed']\ * len(good_response): From 56fbb389cf39f00cebdf32b8949f44c2e038f497 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Thu, 11 Mar 2021 18:01:43 +0300 Subject: [PATCH 11/50] oooohh --- scrapper.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/scrapper.py b/scrapper.py index ac063fb4..19cbb595 100644 --- a/scrapper.py +++ b/scrapper.py @@ -58,9 +58,7 @@ def _extract_url(article_bs, seen): # print(extracted) # print(' ',extracted) return list(filter(lambda x: x.startswith('/news/') - and x not in seen - # and any(map(lambda y: y.isdigit(), x)) - , extracted)) + and x not in seen, extracted)) def find_articles(self): """ @@ -200,15 +198,18 @@ def validate_config(crawler_path): raise UnknownConfigError from exception if not all(good_response): raise IncorrectURLError - # if not all((isinstance(config['total_articles_to_find_and_parse'], int), - # isinstance(config['max_number_articles_to_get_from_one_seed'], int))): - if not isinstance(config['total_articles_to_find_and_parse'], int): - raise IncorrectNumberOfArticlesError - if not isinstance(config['max_number_articles_to_get_from_one_seed'], int): - raise IncorrectNumberOfArticlesError - if not config['total_articles_to_find_and_parse'] < config['max_number_articles_to_get_from_one_seed']\ - * len(good_response): - raise NumberOfArticlesOutOfRangeError + try: + if not isinstance(config['total_articles_to_find_and_parse'], int): + raise IncorrectNumberOfArticlesError + if not config['total_articles_to_find_and_parse'] > 1000: + raise NumberOfArticlesOutOfRangeError + if not isinstance(config['max_number_articles_to_get_from_one_seed'], int): + raise IncorrectNumberOfArticlesError + if not config['total_articles_to_find_and_parse'] < config['max_number_articles_to_get_from_one_seed']\ + * len(good_response): + raise NumberOfArticlesOutOfRangeError + except KeyError as exception: + raise IncorrectNumberOfArticlesError from exception return config['base_urls'], config['total_articles_to_find_and_parse'], \ config['max_number_articles_to_get_from_one_seed'] From cfeae30b6034925b90e2dc53152b427fd1f40d47 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Thu, 11 Mar 2021 18:14:53 +0300 Subject: [PATCH 12/50] i have questions --- scrapper.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/scrapper.py b/scrapper.py index 19cbb595..ae60601f 100644 --- a/scrapper.py +++ b/scrapper.py @@ -201,21 +201,29 @@ def validate_config(crawler_path): try: if not isinstance(config['total_articles_to_find_and_parse'], int): raise IncorrectNumberOfArticlesError - if not config['total_articles_to_find_and_parse'] > 1000: - raise NumberOfArticlesOutOfRangeError - if not isinstance(config['max_number_articles_to_get_from_one_seed'], int): - raise IncorrectNumberOfArticlesError - if not config['total_articles_to_find_and_parse'] < config['max_number_articles_to_get_from_one_seed']\ - * len(good_response): + if config['total_articles_to_find_and_parse'] > 1000: raise NumberOfArticlesOutOfRangeError + # if not isinstance(config['max_number_articles_to_get_from_one_seed'], int): + # raise IncorrectNumberOfArticlesError + # if not config['total_articles_to_find_and_parse'] < config['max_number_articles_to_get_from_one_seed']\ + # * len(good_response): + # raise NumberOfArticlesOutOfRangeError except KeyError as exception: raise IncorrectNumberOfArticlesError from exception - return config['base_urls'], config['total_articles_to_find_and_parse'], \ - config['max_number_articles_to_get_from_one_seed'] + try: + return config['base_urls'], config['total_articles_to_find_and_parse'], \ + config['max_number_articles_to_get_from_one_seed'] + except KeyError: + return config['base_urls'], config['total_articles_to_find_and_parse'] if __name__ == '__main__': - seedurls, max_articles, max_arts_per_seed = validate_config(CRAWLER_CONFIG_PATH) + params = validate_config(CRAWLER_CONFIG_PATH) + if len(params) == 3: + seedurls, max_articles, max_arts_per_seed = params + else: + seedurls, max_articles = params + max_arts_per_seed = max_articles crawler = Crawler(seed_urls=seedurls, total_max_articles=max_articles, max_articles_per_seed=max_arts_per_seed) From 39e892355a93b4f8c02d33cd27df9ab137963d7c Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Thu, 11 Mar 2021 18:19:36 +0300 Subject: [PATCH 13/50] didn't work --- scrapper.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/scrapper.py b/scrapper.py index ae60601f..c608d87a 100644 --- a/scrapper.py +++ b/scrapper.py @@ -214,15 +214,12 @@ def validate_config(crawler_path): return config['base_urls'], config['total_articles_to_find_and_parse'], \ config['max_number_articles_to_get_from_one_seed'] except KeyError: - return config['base_urls'], config['total_articles_to_find_and_parse'] + return config['base_urls'], config['total_articles_to_find_and_parse'], None if __name__ == '__main__': - params = validate_config(CRAWLER_CONFIG_PATH) - if len(params) == 3: - seedurls, max_articles, max_arts_per_seed = params - else: - seedurls, max_articles = params + seedurls, max_articles, max_arts_per_seed = validate_config(CRAWLER_CONFIG_PATH) + if not max_arts_per_seed: max_arts_per_seed = max_articles crawler = Crawler(seed_urls=seedurls, total_max_articles=max_articles, From 6e5ec03b80db40932fbbc25ae8c952f30684837d Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Thu, 11 Mar 2021 18:27:23 +0300 Subject: [PATCH 14/50] working on --- scrapper.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scrapper.py b/scrapper.py index c608d87a..a9d49748 100644 --- a/scrapper.py +++ b/scrapper.py @@ -11,9 +11,7 @@ from bs4 import BeautifulSoup # from time import sleep as wait from article import Article - -CRAWLER_CONFIG_PATH = 'crawler_config.json' -NEWLINES_RE = re.compile(r"\n{2,}") # two or more "\n" characters +from constants import CRAWLER_CONFIG_PATH, ASSETS_PATH class IncorrectURLError(Exception): @@ -109,7 +107,6 @@ def __init__(self, full__url: str, article_id: int): def _fill_article_with_text(self, article_soup): try: text = article_soup.find('div', {'class': 'text letter', 'itemprop': 'articleBody'}).text.strip() - # text = NEWLINES_RE.split(all_text) # regex splitting self.article.text = text except AttributeError: print(' unable to parse', self.full_url) @@ -218,6 +215,7 @@ def validate_config(crawler_path): if __name__ == '__main__': + prepare_environment(ASSETS_PATH) seedurls, max_articles, max_arts_per_seed = validate_config(CRAWLER_CONFIG_PATH) if not max_arts_per_seed: max_arts_per_seed = max_articles From 4733964aa55e33dcd433258a314e87cd2c3a6b2a Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Thu, 11 Mar 2021 18:29:35 +0300 Subject: [PATCH 15/50] there is no pleasing you --- scrapper.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scrapper.py b/scrapper.py index a9d49748..7058e47d 100644 --- a/scrapper.py +++ b/scrapper.py @@ -3,7 +3,6 @@ """ import os -import re import json from datetime import date import requests From ead1f177aecf94154d2f4b9a533a0b7bd8bbfdd6 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Thu, 11 Mar 2021 18:42:43 +0300 Subject: [PATCH 16/50] i'm experimenting --- target_score.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target_score.txt b/target_score.txt index e91213a7..3de837b8 100644 --- a/target_score.txt +++ b/target_score.txt @@ -2,4 +2,4 @@ 10 # Target score for pipeline.py: -10 \ No newline at end of file +0 \ No newline at end of file From cf7e97ba541a4c1e2ca92d5ff3e3e9dcb169e182 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Thu, 11 Mar 2021 21:06:11 +0300 Subject: [PATCH 17/50] added stuff for 10, tests will fail? --- config/constants.py | 1 + constants.py | 2 + links/url_backup.txt | 320 +++++++++++++++++++++++++++++++++++++++++++ scrapper.py | 94 +++++++++++-- 4 files changed, 403 insertions(+), 14 deletions(-) create mode 100644 links/url_backup.txt diff --git a/config/constants.py b/config/constants.py index 12d85256..28a84b06 100644 --- a/config/constants.py +++ b/config/constants.py @@ -7,3 +7,4 @@ PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__)) ASSETS_PATH = os.path.join(PROJECT_ROOT, 'tmp', 'articles') CRAWLER_CONFIG_PATH = os.path.join(PROJECT_ROOT, 'crawler_config.json') +LINKS_STORAGE = os.path.join(PROJECT_ROOT, 'links') diff --git a/constants.py b/constants.py index 12d85256..3dc98002 100644 --- a/constants.py +++ b/constants.py @@ -7,3 +7,5 @@ PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__)) ASSETS_PATH = os.path.join(PROJECT_ROOT, 'tmp', 'articles') CRAWLER_CONFIG_PATH = os.path.join(PROJECT_ROOT, 'crawler_config.json') +LINKS_STORAGE = os.path.join(PROJECT_ROOT, 'links') +URL_START = 'https://burunen.ru' \ No newline at end of file diff --git a/links/url_backup.txt b/links/url_backup.txt new file mode 100644 index 00000000..d7bb92f5 --- /dev/null +++ b/links/url_backup.txt @@ -0,0 +1,320 @@ +/news/society/80553-pionery-buryatskogo-biznesa/ +/news/society/80527-zurkhay-na-10-marta-27-lunnyy-den/ +/news/society/80543-nakanune-10-marta-v-buryatii-ushel-iz-zhizni-eks-rektor-bgskha-aleksandr-popov/ +/news/society/80575-v-buryatii-startoval-federalnyy-proekt-chistaya-voda/ +/news/society/80548-v-stolitsu-buryatii-pribyl-arsen-fadzaev/ +/news/culture/80594-polozhenie-o-konkurse-rasskaza-2021/ +/news/society/80583-vystavka-okhotnichikh-laek-proshla-v-zakamenskom-rayone-buryatii/ +/news/society/80574-v-vuzakh-buryatii-poyavyatsya-prorektory-po-tsifrovizatsii/ +/news/society/80552-poyushchaya-garga-kak-zhivyet-poyushchee-selskoe-poselenie-v-kurumkane/ +/news/society/80572-zurkhay-na-11-marta-28-lunnyy-den/ +/news/society/80555-buryatiya-voshla-v-chislo-regionov-kotorye-podderzhali-pevitsu-manizhu/ +/news/society/80581-zhivaya-legenda-volnoy-borby-rossii-pribyl-v-buryatiyu-v-svoy-den-rozhdeniya/ +/news/society/80540-v-stolitsu-buryatii-nachali-pribyvat-uchastniki-i-gosti-chempionata-rossii-po-volnoy-borbe/ +/news/society/80542-stalo-izvestno-komu-iz-zhiteley-buryatii-udastsya-vyyti-na-pensiyu-dosrochno/ +/news/society/80602-v-buryatskom-sele-vydrino-otremontiruyut-detskuyu-shkolu-iskusstv/ +/news/society/80573-preimushchestvenno-bez-osadkov-dnyem-do-5-tepla-ozhidaetsya-v-buryatii-segodnya-11-marta-/ +/news/culture/79137-pamyati-geroev-rabotniki-sudebnoy-sistemy-zabaykalya-vypustili-knigu-vospominaniy-ob-uchastnikakh-vo/ +/news/society/80558-irkutskuyu-oblast-i-buryatiyu-vozmozhno-svyazhet-noveyshaya-elektrichka/ +/news/society/80535-v-buryatii-podveli-itogi-baykalskoy-mili/ +/news/society/80565-v-zaigraevskom-rayone-buryatii-otkrylas-novaya-shkola-na-450-mest/ +/news/society/80577-na-buryatiyu-obrushitsya-anomalnoe-poteplenie/ +/news/society/80584-v-buryatii-nachalis-meropriyatiya-po-profilaktike-ledyanykh-zatorov/ +/news/culture/80559-proverit-svoe-znanie-buryatskoy-grammatiki-smogut-zhiteli-buryatii/ +/news/culture/80475-dusha-v-tantse-studenty-kolledzha-iskusstv-dali-uroki-yekhora-dlya-zhurnalistov/ +/news/culture/80511-v-buryatii-otkrylas-vystavka-eksperimentalnogo-iskusstva-ii-sulde/ +/news/culture/80515-reper-iz-buryatii-zapisal-klip-s-pevitsey-kotoraya-vystupit-na-evrovidenii/ +/news/culture/80387-smysly-zurkhaya-geshe-tsyren-lama-o-tonkostyakh-buddiyskoy-astrologii-/ +/news/culture/80388-v-ulan-ude-sostoyalas-premera-opery-knyazya-igorya/ +/news/culture/80384-v-ulan-ude-vozvrashchaetsya-festival-uu-sound-/ +/news/culture/80521-v-ulan-ude-predstavili-dokumentalnyy-film-o-buryatskom-kostyume/ +/news/culture/80390-yunye-tsirkovye-artisty-buryatii-vpervye-vystupyat-v-tyumenskom-tsirke/ +/news/culture/80416-buryatskiy-ansambl-bulzhamuur-stal-laureatom-vserossiyskogo-festivalya/ +/news/culture/80592-v-ulan-ude-proshyel-kontsert-etnicheskoy-muzyki/ +/news/culture/80560-v-ulan-ude-nachinaet-svoyu-rabotu-etnokovorking/ +/news/culture/80536-knyaz-igor-v-buryatii-otkuda-v-stepi-drevnerusskaya-grust/ +/news/culture/80509-sezon-dozhdey-dadut-kontsert-v-ulan-ude/ +/news/culture/80557-koster-na-glavnoy-ploshchadi-buryatiya-gotovitsya-k-maslenitse/ +/news/culture/80522-v-ulan-ude-proshla-vstrecha-vesny-kak-zavershenie-vostochnogo-novogo-goda/ +/news/culture/80397-muzhchina-iz-severobaykalska-poluchil-realnyy-srok-za-povtornuyu-ezdu-v-pyanom-vide/ +/news/culture/80420-v-buryatii-izdali-knigu-minii-nyutag/ +/news/culture/80389-artisty-teatra-baykal-vyydut-v-efir-radio-mayak/ +/news/economy/80551-v-buryatii-rastet-spros-na-novye-avtomobili/ +/news/economy/80400-agrarii-buryatii-vernuli-chast-zemel-merii-ulan-ude/ +/news/economy/80462-v-buryatii-otremontiruyut-dorogu-v-posele/ +/news/economy/80447-biznes-buryatii-poluchit-vozmozhnosti-opravitsya-ot-posledstviy-pandemii/ +/news/economy/80549-prognoznyy-poleznyy-otpusk-elektroenergii-i-moshchnosti-po-tarifnym-gruppam/ +/news/economy/80427-v-buryatii-proydet-zasedanie-soveta-direktorov-kholdinga-vertolyety-rossii/ +/news/economy/80506-v-buryatii-otremontiruyut-odnu-iz-samykh-populyarnykh-avtodorog/ +/news/economy/80429-buryatiya-poluchit-okolo-3-milliardov-rubley-na-dorogi/ +/news/economy/80415-v-ulan-ude-startoval-akselerator-proektov-upravlentsev-buryatii/ +/news/economy/80432-buryatiya-poluchit-bolee-28-mln-rubley-na-podderzhku-proektov-sotsialno-orientirovannykh-organizatsi/ +/news/economy/80413-usloviya-dalnevostochnaya-ipoteka-namereny-uluchshit-v-buryatii-i-po-vsemu-dfo/ +/news/economy/80340-tunkinskiy-rayon-buryatii-poluchit-sredstva-na-razvitie-turizma/ +/news/economy/80435-ivan-alkheev-naznachen-zampredom-pravitelstva-buryatii/ +/news/economy/80433-na-ulan-udenskom-aviazavode-vpervye-proshlo-zasedanie-soveta-direktorov-vertolety-rossii/ +/news/economy/80317-bolee-2-tysyach-zhiteley-buryatii-prinyali-uchastie-v-biznes-vstreche-s-alekseem-tsydenovym/ +/news/economy/80478-produktsiya-iz-buryatii-vyshla-na-rynok-germanii/ +/news/economy/80412-buryatiya-priobretet-paket-aktsiy-avrora/ +/news/economy/80488-v-buryatii-razdeli-mestorozhdenie-urana/ +/news/economy/80344-pereezd-na-dalniy-vostok-pozvolit-buryatii-reanimirovat-dva-krupnykh-proekta/ +/news/economy/80491-buryatiya-voshla-v-zonu-modernizatsii-zheleznykh-dorog/ +/news/sports/80463-vdokhnovlyayushchie-rezultaty-novye-vzlyety-khudozhestvennoy-gimnastiki-buryatii/ +/news/sports/80600-v-kabanskom-rayone-buryatii-otkryli-sportivnuyu-ploshchadku/ +/news/sports/80546-studentki-iz-buryatii-na-pedestale-rossiyskogo-urovnya/ +/news/sports/80331-boksery-iz-buryatii-zavoevali-shest-medaley-na-vserossiyskom-turnire-klassa-a/ +/news/sports/80520-final-kubka-buryatii-ne-doigrali-no-pobeditelya-opredelili/ +/news/sports/80578-na-chempionate-rossii-v-ulan-ude-103-bortsa-opredelyat-pervykh-finalistov-obnovlyaetsya/ +/news/sports/80477-borets-iz-buryatii-rasskazal-kak-borolsya-za-ameriku/ +/news/sports/80351-boytsy-federatsii-pankrationa-buryatii-vystupili-na-urovne-dfo/ +/news/sports/80601-ministr-sporta-buryatii-chempionat-rossii-dast-novyy-impuls-razvitiyu-volnoy-borby-v-respublike/ +/news/sports/80507-buryatiya-utverdila-okonchatelnyy-sostav-sbornoy-dlya-uchastiya-v-chempionate-rossii-po-volnoy-borbe/ +/news/sports/80494-borits-iz-buryatii-zakryli-na-karantin-v-rime-/ +/news/sports/80472-na-lyzhakh-s-lukom-buryatskaya-sportsmenka-vyigrala-chetyre-medali-na-pervenstve-rossii/ +/news/sports/80547-beskompromissnye-igry-buryatskikh-shakhmatistov-na-dalnem-vostoke/ +/news/sports/80354-pervyy-bortsovskiy-internat-buryatii-vzyal-na-vooruzhenie-igru-go/ +/news/sports/80568-v-stolitse-buryatii-proshla-pervaya-zherebevka-chempionata-rossii-po-volnoy-borbe/ +/news/sports/80525-glavnyy-favorit-gryadushchego-chempionata-rossii-v-buryatii-ozvuchil-svoy-sostav/ +/news/sports/80470-v-buryatii-gryadet-bitva-titanov/ +/news/sports/80452-sportsmeny-iz-eravninskogo-rayona-buryatii-oderzhali-dve-pobedy/ +/news/sports/80345-znamenityy-rossiyskiy-futbolist-vyigral-baykalskiy-marafon-v-buryatii/ +/news/incidents/80181-v-buryatii-stali-bolshe-pit/ +/news/incidents/80467-v-buryatii-vyyavili-pyat-narusheniy-protivoepidemicheskikh-mer/ +/news/incidents/80408-byvshemu-rukovoditelyu-energeticheskoy-kompanii-iz-buryatii-inkriminiruyut-sozdanie-opg/ +/news/incidents/80517-kommunisty-buryatii-obvinili-organizatorov-baykalskoy-mili/ +/news/incidents/80453-v-kabanskom-rayone-buryatii-zhiteli-zamerzayut-v-svoikh-domakh-/ +/news/incidents/80461-v-bichure-podtopilo-pushkina-/ +/news/incidents/80226-v-buryatii-vynesli-verdikt-po-delu-o-napadenii-sobak-na-pervoklassnika/ +/news/incidents/80471-v-belorussii-izbili-futbolnogo-trenera-iz-buryatii/ +/news/incidents/80530-delo-o-zaderzhanii-brakonerov-v-buryatii-prokommentiroval-skr-/ +/news/incidents/80411-na-baykale-zamorozili-demontazh-zavoda-po-rozlivu-baykalskoy-vody/ +/news/incidents/80399-vrachi-rasskazali-o-sostoyanii-rebenka-ranennogo-v-buryatii/ +/news/incidents/80545-v-buryatii-pri-pozhare-v-zhilom-dome-spaslis-mat-i-dvoe-detey/ +/news/incidents/80582-v-kurumkanskom-rayone-buryatii-zaderzhan-pokhititel-myasa/ +/news/incidents/80349-musornaya-reforma-v-buryatii-stala-povodom-dlya-politicheskikh-sporov/ +/news/incidents/80380-v-buryatii-snova-otlichilsya-voditel-leksusa/ +/news/incidents/80422-sledstvennyy-komitet-po-buryatii-dal-kommentariy-po-povodu-zaderzhaniya-sergeya-ivanova/ +/news/incidents/80191-intsident-na-baykale-pod-kontrolem-pravitelstva-buryatii/ +/news/incidents/80326-v-buryatii-obyasnili-prichiny-tramvaynogo-kollapsa-v-ulan-ude/ +/news/incidents/80529-materialnyy-ushcherb-po-faktu-gibeli-podrostkov-v-buryatii-vzyshchut-s-vladeltsa-avto/ +/news/incidents/80492-v-buryatii-obvinyayut-inspektora-kotoryy-zaderzhal-vliyatelnykh-brakonerov-/ +/news/politics/80591-ministr-sporta-buryatii-otmetil-naplyv-imenitykh-sportsmenov-v-dni-chempionata-rossii-po-volnoy-borb/ +/news/politics/80588-glava-buryatii-prinyal-uchastie-v-soveshchanii-generalnogo-direktora-oao-rzhd//news/society/80553-pionery-buryatskogo-biznesa/ +/news/society/80527-zurkhay-na-10-marta-27-lunnyy-den/ +/news/society/80543-nakanune-10-marta-v-buryatii-ushel-iz-zhizni-eks-rektor-bgskha-aleksandr-popov/ +/news/society/80575-v-buryatii-startoval-federalnyy-proekt-chistaya-voda/ +/news/society/80548-v-stolitsu-buryatii-pribyl-arsen-fadzaev/ +/news/culture/80594-polozhenie-o-konkurse-rasskaza-2021/ +/news/society/80583-vystavka-okhotnichikh-laek-proshla-v-zakamenskom-rayone-buryatii/ +/news/society/80574-v-vuzakh-buryatii-poyavyatsya-prorektory-po-tsifrovizatsii/ +/news/society/80552-poyushchaya-garga-kak-zhivyet-poyushchee-selskoe-poselenie-v-kurumkane/ +/news/society/80572-zurkhay-na-11-marta-28-lunnyy-den/ +/news/society/80555-buryatiya-voshla-v-chislo-regionov-kotorye-podderzhali-pevitsu-manizhu/ +/news/society/80581-zhivaya-legenda-volnoy-borby-rossii-pribyl-v-buryatiyu-v-svoy-den-rozhdeniya/ +/news/society/80540-v-stolitsu-buryatii-nachali-pribyvat-uchastniki-i-gosti-chempionata-rossii-po-volnoy-borbe/ +/news/society/80542-stalo-izvestno-komu-iz-zhiteley-buryatii-udastsya-vyyti-na-pensiyu-dosrochno/ +/news/society/80602-v-buryatskom-sele-vydrino-otremontiruyut-detskuyu-shkolu-iskusstv/ +/news/society/80573-preimushchestvenno-bez-osadkov-dnyem-do-5-tepla-ozhidaetsya-v-buryatii-segodnya-11-marta-/ +/news/culture/79137-pamyati-geroev-rabotniki-sudebnoy-sistemy-zabaykalya-vypustili-knigu-vospominaniy-ob-uchastnikakh-vo/ +/news/society/80558-irkutskuyu-oblast-i-buryatiyu-vozmozhno-svyazhet-noveyshaya-elektrichka/ +/news/society/80535-v-buryatii-podveli-itogi-baykalskoy-mili/ +/news/society/80565-v-zaigraevskom-rayone-buryatii-otkrylas-novaya-shkola-na-450-mest/ +/news/society/80577-na-buryatiyu-obrushitsya-anomalnoe-poteplenie/ +/news/society/80584-v-buryatii-nachalis-meropriyatiya-po-profilaktike-ledyanykh-zatorov/ +/news/culture/80559-proverit-svoe-znanie-buryatskoy-grammatiki-smogut-zhiteli-buryatii/ +/news/culture/80475-dusha-v-tantse-studenty-kolledzha-iskusstv-dali-uroki-yekhora-dlya-zhurnalistov/ +/news/culture/80511-v-buryatii-otkrylas-vystavka-eksperimentalnogo-iskusstva-ii-sulde/ +/news/culture/80515-reper-iz-buryatii-zapisal-klip-s-pevitsey-kotoraya-vystupit-na-evrovidenii/ +/news/culture/80387-smysly-zurkhaya-geshe-tsyren-lama-o-tonkostyakh-buddiyskoy-astrologii-/ +/news/culture/80388-v-ulan-ude-sostoyalas-premera-opery-knyazya-igorya/ +/news/culture/80384-v-ulan-ude-vozvrashchaetsya-festival-uu-sound-/ +/news/culture/80521-v-ulan-ude-predstavili-dokumentalnyy-film-o-buryatskom-kostyume/ +/news/culture/80390-yunye-tsirkovye-artisty-buryatii-vpervye-vystupyat-v-tyumenskom-tsirke/ +/news/culture/80416-buryatskiy-ansambl-bulzhamuur-stal-laureatom-vserossiyskogo-festivalya/ +/news/culture/80592-v-ulan-ude-proshyel-kontsert-etnicheskoy-muzyki/ +/news/culture/80560-v-ulan-ude-nachinaet-svoyu-rabotu-etnokovorking/ +/news/culture/80536-knyaz-igor-v-buryatii-otkuda-v-stepi-drevnerusskaya-grust/ +/news/culture/80509-sezon-dozhdey-dadut-kontsert-v-ulan-ude/ +/news/culture/80557-koster-na-glavnoy-ploshchadi-buryatiya-gotovitsya-k-maslenitse/ +/news/culture/80522-v-ulan-ude-proshla-vstrecha-vesny-kak-zavershenie-vostochnogo-novogo-goda/ +/news/culture/80397-muzhchina-iz-severobaykalska-poluchil-realnyy-srok-za-povtornuyu-ezdu-v-pyanom-vide/ +/news/culture/80420-v-buryatii-izdali-knigu-minii-nyutag/ +/news/culture/80389-artisty-teatra-baykal-vyydut-v-efir-radio-mayak/ +/news/economy/80551-v-buryatii-rastet-spros-na-novye-avtomobili/ +/news/economy/80400-agrarii-buryatii-vernuli-chast-zemel-merii-ulan-ude/ +/news/economy/80462-v-buryatii-otremontiruyut-dorogu-v-posele/ +/news/economy/80447-biznes-buryatii-poluchit-vozmozhnosti-opravitsya-ot-posledstviy-pandemii/ +/news/economy/80549-prognoznyy-poleznyy-otpusk-elektroenergii-i-moshchnosti-po-tarifnym-gruppam/ +/news/economy/80427-v-buryatii-proydet-zasedanie-soveta-direktorov-kholdinga-vertolyety-rossii/ +/news/economy/80506-v-buryatii-otremontiruyut-odnu-iz-samykh-populyarnykh-avtodorog/ +/news/economy/80429-buryatiya-poluchit-okolo-3-milliardov-rubley-na-dorogi/ +/news/economy/80415-v-ulan-ude-startoval-akselerator-proektov-upravlentsev-buryatii/ +/news/economy/80432-buryatiya-poluchit-bolee-28-mln-rubley-na-podderzhku-proektov-sotsialno-orientirovannykh-organizatsi/ +/news/economy/80413-usloviya-dalnevostochnaya-ipoteka-namereny-uluchshit-v-buryatii-i-po-vsemu-dfo/ +/news/economy/80340-tunkinskiy-rayon-buryatii-poluchit-sredstva-na-razvitie-turizma/ +/news/economy/80435-ivan-alkheev-naznachen-zampredom-pravitelstva-buryatii/ +/news/economy/80433-na-ulan-udenskom-aviazavode-vpervye-proshlo-zasedanie-soveta-direktorov-vertolety-rossii/ +/news/economy/80317-bolee-2-tysyach-zhiteley-buryatii-prinyali-uchastie-v-biznes-vstreche-s-alekseem-tsydenovym/ +/news/economy/80478-produktsiya-iz-buryatii-vyshla-na-rynok-germanii/ +/news/economy/80412-buryatiya-priobretet-paket-aktsiy-avrora/ +/news/economy/80488-v-buryatii-razdeli-mestorozhdenie-urana/ +/news/economy/80344-pereezd-na-dalniy-vostok-pozvolit-buryatii-reanimirovat-dva-krupnykh-proekta/ +/news/economy/80491-buryatiya-voshla-v-zonu-modernizatsii-zheleznykh-dorog/ +/news/sports/80463-vdokhnovlyayushchie-rezultaty-novye-vzlyety-khudozhestvennoy-gimnastiki-buryatii/ +/news/sports/80600-v-kabanskom-rayone-buryatii-otkryli-sportivnuyu-ploshchadku/ +/news/sports/80546-studentki-iz-buryatii-na-pedestale-rossiyskogo-urovnya/ +/news/sports/80331-boksery-iz-buryatii-zavoevali-shest-medaley-na-vserossiyskom-turnire-klassa-a/ +/news/sports/80520-final-kubka-buryatii-ne-doigrali-no-pobeditelya-opredelili/ +/news/sports/80578-na-chempionate-rossii-v-ulan-ude-103-bortsa-opredelyat-pervykh-finalistov-obnovlyaetsya/ +/news/sports/80477-borets-iz-buryatii-rasskazal-kak-borolsya-za-ameriku/ +/news/sports/80351-boytsy-federatsii-pankrationa-buryatii-vystupili-na-urovne-dfo/ +/news/sports/80601-ministr-sporta-buryatii-chempionat-rossii-dast-novyy-impuls-razvitiyu-volnoy-borby-v-respublike/ +/news/sports/80507-buryatiya-utverdila-okonchatelnyy-sostav-sbornoy-dlya-uchastiya-v-chempionate-rossii-po-volnoy-borbe/ +/news/sports/80494-borits-iz-buryatii-zakryli-na-karantin-v-rime-/ +/news/sports/80472-na-lyzhakh-s-lukom-buryatskaya-sportsmenka-vyigrala-chetyre-medali-na-pervenstve-rossii/ +/news/sports/80547-beskompromissnye-igry-buryatskikh-shakhmatistov-na-dalnem-vostoke/ +/news/sports/80354-pervyy-bortsovskiy-internat-buryatii-vzyal-na-vooruzhenie-igru-go/ +/news/sports/80568-v-stolitse-buryatii-proshla-pervaya-zherebevka-chempionata-rossii-po-volnoy-borbe/ +/news/sports/80525-glavnyy-favorit-gryadushchego-chempionata-rossii-v-buryatii-ozvuchil-svoy-sostav/ +/news/sports/80470-v-buryatii-gryadet-bitva-titanov/ +/news/sports/80452-sportsmeny-iz-eravninskogo-rayona-buryatii-oderzhali-dve-pobedy/ +/news/sports/80345-znamenityy-rossiyskiy-futbolist-vyigral-baykalskiy-marafon-v-buryatii/ +/news/incidents/80181-v-buryatii-stali-bolshe-pit/ +/news/incidents/80467-v-buryatii-vyyavili-pyat-narusheniy-protivoepidemicheskikh-mer/ +/news/incidents/80408-byvshemu-rukovoditelyu-energeticheskoy-kompanii-iz-buryatii-inkriminiruyut-sozdanie-opg/ +/news/incidents/80517-kommunisty-buryatii-obvinili-organizatorov-baykalskoy-mili/ +/news/incidents/80453-v-kabanskom-rayone-buryatii-zhiteli-zamerzayut-v-svoikh-domakh-/ +/news/incidents/80461-v-bichure-podtopilo-pushkina-/ +/news/incidents/80226-v-buryatii-vynesli-verdikt-po-delu-o-napadenii-sobak-na-pervoklassnika/ +/news/incidents/80471-v-belorussii-izbili-futbolnogo-trenera-iz-buryatii/ +/news/incidents/80530-delo-o-zaderzhanii-brakonerov-v-buryatii-prokommentiroval-skr-/ +/news/incidents/80411-na-baykale-zamorozili-demontazh-zavoda-po-rozlivu-baykalskoy-vody/ +/news/incidents/80399-vrachi-rasskazali-o-sostoyanii-rebenka-ranennogo-v-buryatii/ +/news/incidents/80545-v-buryatii-pri-pozhare-v-zhilom-dome-spaslis-mat-i-dvoe-detey/ +/news/incidents/80582-v-kurumkanskom-rayone-buryatii-zaderzhan-pokhititel-myasa/ +/news/incidents/80349-musornaya-reforma-v-buryatii-stala-povodom-dlya-politicheskikh-sporov/ +/news/incidents/80380-v-buryatii-snova-otlichilsya-voditel-leksusa/ +/news/incidents/80422-sledstvennyy-komitet-po-buryatii-dal-kommentariy-po-povodu-zaderzhaniya-sergeya-ivanova/ +/news/incidents/80191-intsident-na-baykale-pod-kontrolem-pravitelstva-buryatii/ +/news/incidents/80326-v-buryatii-obyasnili-prichiny-tramvaynogo-kollapsa-v-ulan-ude/ +/news/incidents/80529-materialnyy-ushcherb-po-faktu-gibeli-podrostkov-v-buryatii-vzyshchut-s-vladeltsa-avto/ +/news/incidents/80492-v-buryatii-obvinyayut-inspektora-kotoryy-zaderzhal-vliyatelnykh-brakonerov-/ +/news/politics/80591-ministr-sporta-buryatii-otmetil-naplyv-imenitykh-sportsmenov-v-dni-chempionata-rossii-po-volnoy-borb/ +/news/politics/80588-glava-buryatii-prinyal-uchastie-v-soveshchanii-generalnogo-direktora-oao-rzhd/ +/news/politics/80590-glava-buryatii-vyekhal-s-rabochey-poezdkoy-v-mukhorshibirskiy-rayon/ +/news/politics/80498-aleksey-tsydenov-pozdravil-vsekh-zhenshchin-buryatii-s-8-marta-/ +/news/politics/80538-delo-o-zaderzhanii-brakonerov-v-buryatii-poruchil-vzyat-pod-kontrol-yuriy-trutnev/ +/news/politics/80480-glava-buryatii-vmeste-s-poslom-izrailya-v-rossii-pochtili-pamyat-geroev-vov/ +/news/politics/80482-glava-buryatii-predlozhil-poslu-izrailya-sotrudnichestvo-v-sfere-turizma-i-meditsiny/ +/news/politics/80436-glava-buryatii-i-prezident-tatarstana-obsudili-sotrudnichestvo-mezhdu-regionami/ +/news/politics/80456-glavnyy-kommunist-buryatii-ne-voshyel-v-short-list-ot-kprf/ +/news/politics/80508-glava-buryatii-nagradil-laureatov-gosudarstvennykh-premiy-respubliki-v-sfere-kultury-i-iskusstva//news/society/80553-pionery-buryatskogo-biznesa/ +/news/society/80527-zurkhay-na-10-marta-27-lunnyy-den/ +/news/society/80543-nakanune-10-marta-v-buryatii-ushel-iz-zhizni-eks-rektor-bgskha-aleksandr-popov/ +/news/society/80575-v-buryatii-startoval-federalnyy-proekt-chistaya-voda/ +/news/society/80548-v-stolitsu-buryatii-pribyl-arsen-fadzaev/ +/news/culture/80594-polozhenie-o-konkurse-rasskaza-2021/ +/news/society/80583-vystavka-okhotnichikh-laek-proshla-v-zakamenskom-rayone-buryatii/ +/news/society/80574-v-vuzakh-buryatii-poyavyatsya-prorektory-po-tsifrovizatsii/ +/news/society/80552-poyushchaya-garga-kak-zhivyet-poyushchee-selskoe-poselenie-v-kurumkane/ +/news/society/80572-zurkhay-na-11-marta-28-lunnyy-den/ +/news/society/80555-buryatiya-voshla-v-chislo-regionov-kotorye-podderzhali-pevitsu-manizhu/ +/news/society/80581-zhivaya-legenda-volnoy-borby-rossii-pribyl-v-buryatiyu-v-svoy-den-rozhdeniya/ +/news/society/80540-v-stolitsu-buryatii-nachali-pribyvat-uchastniki-i-gosti-chempionata-rossii-po-volnoy-borbe/ +/news/society/80542-stalo-izvestno-komu-iz-zhiteley-buryatii-udastsya-vyyti-na-pensiyu-dosrochno/ +/news/society/80602-v-buryatskom-sele-vydrino-otremontiruyut-detskuyu-shkolu-iskusstv/ +/news/society/80573-preimushchestvenno-bez-osadkov-dnyem-do-5-tepla-ozhidaetsya-v-buryatii-segodnya-11-marta-/ +/news/culture/79137-pamyati-geroev-rabotniki-sudebnoy-sistemy-zabaykalya-vypustili-knigu-vospominaniy-ob-uchastnikakh-vo/ +/news/society/80558-irkutskuyu-oblast-i-buryatiyu-vozmozhno-svyazhet-noveyshaya-elektrichka/ +/news/society/80535-v-buryatii-podveli-itogi-baykalskoy-mili/ +/news/society/80565-v-zaigraevskom-rayone-buryatii-otkrylas-novaya-shkola-na-450-mest/ +/news/society/80577-na-buryatiyu-obrushitsya-anomalnoe-poteplenie/ +/news/society/80584-v-buryatii-nachalis-meropriyatiya-po-profilaktike-ledyanykh-zatorov/ +/news/culture/80559-proverit-svoe-znanie-buryatskoy-grammatiki-smogut-zhiteli-buryatii/ +/news/culture/80475-dusha-v-tantse-studenty-kolledzha-iskusstv-dali-uroki-yekhora-dlya-zhurnalistov/ +/news/culture/80511-v-buryatii-otkrylas-vystavka-eksperimentalnogo-iskusstva-ii-sulde/ +/news/culture/80515-reper-iz-buryatii-zapisal-klip-s-pevitsey-kotoraya-vystupit-na-evrovidenii/ +/news/culture/80387-smysly-zurkhaya-geshe-tsyren-lama-o-tonkostyakh-buddiyskoy-astrologii-/ +/news/culture/80388-v-ulan-ude-sostoyalas-premera-opery-knyazya-igorya/ +/news/culture/80384-v-ulan-ude-vozvrashchaetsya-festival-uu-sound-/ +/news/culture/80521-v-ulan-ude-predstavili-dokumentalnyy-film-o-buryatskom-kostyume/ +/news/culture/80390-yunye-tsirkovye-artisty-buryatii-vpervye-vystupyat-v-tyumenskom-tsirke/ +/news/culture/80416-buryatskiy-ansambl-bulzhamuur-stal-laureatom-vserossiyskogo-festivalya/ +/news/culture/80592-v-ulan-ude-proshyel-kontsert-etnicheskoy-muzyki/ +/news/culture/80560-v-ulan-ude-nachinaet-svoyu-rabotu-etnokovorking/ +/news/culture/80536-knyaz-igor-v-buryatii-otkuda-v-stepi-drevnerusskaya-grust/ +/news/culture/80509-sezon-dozhdey-dadut-kontsert-v-ulan-ude/ +/news/culture/80557-koster-na-glavnoy-ploshchadi-buryatiya-gotovitsya-k-maslenitse/ +/news/culture/80522-v-ulan-ude-proshla-vstrecha-vesny-kak-zavershenie-vostochnogo-novogo-goda/ +/news/culture/80397-muzhchina-iz-severobaykalska-poluchil-realnyy-srok-za-povtornuyu-ezdu-v-pyanom-vide/ +/news/culture/80420-v-buryatii-izdali-knigu-minii-nyutag/ +/news/culture/80389-artisty-teatra-baykal-vyydut-v-efir-radio-mayak/ +/news/economy/80551-v-buryatii-rastet-spros-na-novye-avtomobili/ +/news/economy/80400-agrarii-buryatii-vernuli-chast-zemel-merii-ulan-ude/ +/news/economy/80462-v-buryatii-otremontiruyut-dorogu-v-posele/ +/news/economy/80447-biznes-buryatii-poluchit-vozmozhnosti-opravitsya-ot-posledstviy-pandemii/ +/news/economy/80549-prognoznyy-poleznyy-otpusk-elektroenergii-i-moshchnosti-po-tarifnym-gruppam/ +/news/economy/80427-v-buryatii-proydet-zasedanie-soveta-direktorov-kholdinga-vertolyety-rossii/ +/news/economy/80506-v-buryatii-otremontiruyut-odnu-iz-samykh-populyarnykh-avtodorog/ +/news/economy/80429-buryatiya-poluchit-okolo-3-milliardov-rubley-na-dorogi/ +/news/economy/80415-v-ulan-ude-startoval-akselerator-proektov-upravlentsev-buryatii/ +/news/economy/80432-buryatiya-poluchit-bolee-28-mln-rubley-na-podderzhku-proektov-sotsialno-orientirovannykh-organizatsi/ +/news/economy/80413-usloviya-dalnevostochnaya-ipoteka-namereny-uluchshit-v-buryatii-i-po-vsemu-dfo/ +/news/economy/80340-tunkinskiy-rayon-buryatii-poluchit-sredstva-na-razvitie-turizma/ +/news/economy/80435-ivan-alkheev-naznachen-zampredom-pravitelstva-buryatii/ +/news/economy/80433-na-ulan-udenskom-aviazavode-vpervye-proshlo-zasedanie-soveta-direktorov-vertolety-rossii/ +/news/economy/80317-bolee-2-tysyach-zhiteley-buryatii-prinyali-uchastie-v-biznes-vstreche-s-alekseem-tsydenovym/ +/news/economy/80478-produktsiya-iz-buryatii-vyshla-na-rynok-germanii/ +/news/economy/80412-buryatiya-priobretet-paket-aktsiy-avrora/ +/news/economy/80488-v-buryatii-razdeli-mestorozhdenie-urana/ +/news/economy/80344-pereezd-na-dalniy-vostok-pozvolit-buryatii-reanimirovat-dva-krupnykh-proekta/ +/news/economy/80491-buryatiya-voshla-v-zonu-modernizatsii-zheleznykh-dorog/ +/news/sports/80463-vdokhnovlyayushchie-rezultaty-novye-vzlyety-khudozhestvennoy-gimnastiki-buryatii/ +/news/sports/80600-v-kabanskom-rayone-buryatii-otkryli-sportivnuyu-ploshchadku/ +/news/sports/80546-studentki-iz-buryatii-na-pedestale-rossiyskogo-urovnya/ +/news/sports/80331-boksery-iz-buryatii-zavoevali-shest-medaley-na-vserossiyskom-turnire-klassa-a/ +/news/sports/80520-final-kubka-buryatii-ne-doigrali-no-pobeditelya-opredelili/ +/news/sports/80578-na-chempionate-rossii-v-ulan-ude-103-bortsa-opredelyat-pervykh-finalistov-obnovlyaetsya/ +/news/sports/80477-borets-iz-buryatii-rasskazal-kak-borolsya-za-ameriku/ +/news/sports/80351-boytsy-federatsii-pankrationa-buryatii-vystupili-na-urovne-dfo/ +/news/sports/80601-ministr-sporta-buryatii-chempionat-rossii-dast-novyy-impuls-razvitiyu-volnoy-borby-v-respublike/ +/news/sports/80507-buryatiya-utverdila-okonchatelnyy-sostav-sbornoy-dlya-uchastiya-v-chempionate-rossii-po-volnoy-borbe/ +/news/sports/80494-borits-iz-buryatii-zakryli-na-karantin-v-rime-/ +/news/sports/80472-na-lyzhakh-s-lukom-buryatskaya-sportsmenka-vyigrala-chetyre-medali-na-pervenstve-rossii/ +/news/sports/80547-beskompromissnye-igry-buryatskikh-shakhmatistov-na-dalnem-vostoke/ +/news/sports/80354-pervyy-bortsovskiy-internat-buryatii-vzyal-na-vooruzhenie-igru-go/ +/news/sports/80568-v-stolitse-buryatii-proshla-pervaya-zherebevka-chempionata-rossii-po-volnoy-borbe/ +/news/sports/80525-glavnyy-favorit-gryadushchego-chempionata-rossii-v-buryatii-ozvuchil-svoy-sostav/ +/news/sports/80470-v-buryatii-gryadet-bitva-titanov/ +/news/sports/80452-sportsmeny-iz-eravninskogo-rayona-buryatii-oderzhali-dve-pobedy/ +/news/sports/80345-znamenityy-rossiyskiy-futbolist-vyigral-baykalskiy-marafon-v-buryatii/ +/news/incidents/80181-v-buryatii-stali-bolshe-pit/ +/news/incidents/80467-v-buryatii-vyyavili-pyat-narusheniy-protivoepidemicheskikh-mer/ +/news/incidents/80408-byvshemu-rukovoditelyu-energeticheskoy-kompanii-iz-buryatii-inkriminiruyut-sozdanie-opg/ +/news/incidents/80517-kommunisty-buryatii-obvinili-organizatorov-baykalskoy-mili/ +/news/incidents/80453-v-kabanskom-rayone-buryatii-zhiteli-zamerzayut-v-svoikh-domakh-/ +/news/incidents/80461-v-bichure-podtopilo-pushkina-/ +/news/incidents/80226-v-buryatii-vynesli-verdikt-po-delu-o-napadenii-sobak-na-pervoklassnika/ +/news/incidents/80471-v-belorussii-izbili-futbolnogo-trenera-iz-buryatii/ +/news/incidents/80530-delo-o-zaderzhanii-brakonerov-v-buryatii-prokommentiroval-skr-/ +/news/incidents/80411-na-baykale-zamorozili-demontazh-zavoda-po-rozlivu-baykalskoy-vody/ +/news/incidents/80399-vrachi-rasskazali-o-sostoyanii-rebenka-ranennogo-v-buryatii/ +/news/incidents/80545-v-buryatii-pri-pozhare-v-zhilom-dome-spaslis-mat-i-dvoe-detey/ +/news/incidents/80582-v-kurumkanskom-rayone-buryatii-zaderzhan-pokhititel-myasa/ +/news/incidents/80349-musornaya-reforma-v-buryatii-stala-povodom-dlya-politicheskikh-sporov/ +/news/incidents/80380-v-buryatii-snova-otlichilsya-voditel-leksusa/ +/news/incidents/80422-sledstvennyy-komitet-po-buryatii-dal-kommentariy-po-povodu-zaderzhaniya-sergeya-ivanova/ +/news/incidents/80191-intsident-na-baykale-pod-kontrolem-pravitelstva-buryatii/ +/news/incidents/80326-v-buryatii-obyasnili-prichiny-tramvaynogo-kollapsa-v-ulan-ude/ +/news/incidents/80529-materialnyy-ushcherb-po-faktu-gibeli-podrostkov-v-buryatii-vzyshchut-s-vladeltsa-avto/ +/news/incidents/80492-v-buryatii-obvinyayut-inspektora-kotoryy-zaderzhal-vliyatelnykh-brakonerov-/ +/news/politics/80591-ministr-sporta-buryatii-otmetil-naplyv-imenitykh-sportsmenov-v-dni-chempionata-rossii-po-volnoy-borb/ +/news/politics/80588-glava-buryatii-prinyal-uchastie-v-soveshchanii-generalnogo-direktora-oao-rzhd/ +/news/politics/80590-glava-buryatii-vyekhal-s-rabochey-poezdkoy-v-mukhorshibirskiy-rayon/ +/news/politics/80498-aleksey-tsydenov-pozdravil-vsekh-zhenshchin-buryatii-s-8-marta-/ +/news/politics/80538-delo-o-zaderzhanii-brakonerov-v-buryatii-poruchil-vzyat-pod-kontrol-yuriy-trutnev/ +/news/politics/80480-glava-buryatii-vmeste-s-poslom-izrailya-v-rossii-pochtili-pamyat-geroev-vov/ +/news/politics/80482-glava-buryatii-predlozhil-poslu-izrailya-sotrudnichestvo-v-sfere-turizma-i-meditsiny/ +/news/politics/80436-glava-buryatii-i-prezident-tatarstana-obsudili-sotrudnichestvo-mezhdu-regionami/ +/news/politics/80456-glavnyy-kommunist-buryatii-ne-voshyel-v-short-list-ot-kprf/ +/news/politics/80508-glava-buryatii-nagradil-laureatov-gosudarstvennykh-premiy-respubliki-v-sfere-kultury-i-iskusstva/ \ No newline at end of file diff --git a/scrapper.py b/scrapper.py index 7058e47d..a2d86e41 100644 --- a/scrapper.py +++ b/scrapper.py @@ -8,9 +8,10 @@ import requests from requests.exceptions import RequestException from bs4 import BeautifulSoup -# from time import sleep as wait +from time import sleep as wait +from random import randint from article import Article -from constants import CRAWLER_CONFIG_PATH, ASSETS_PATH +from constants import CRAWLER_CONFIG_PATH, ASSETS_PATH, LINKS_STORAGE, URL_START class IncorrectURLError(Exception): @@ -47,8 +48,6 @@ def __init__(self, seed_urls: list, total_max_articles: int, max_articles_per_se self.max_articles_per_seed = max_articles_per_seed self.urls = [] - self.url_start = 'https://burunen.ru' - @staticmethod def _extract_url(article_bs, seen): extracted = list({link['href'] for link in article_bs.find_all('a', href=True)}) @@ -68,13 +67,12 @@ def find_articles(self): self.urls = [i for i in self.urls if len(i) > 20 and not any(map(lambda y: y.isupper(), i))][:self.total_max_articles] print('Scraped seed urls, overall number of urls is', len(self.urls)) - old = len(self.urls) while len(self.urls) < self.total_max_articles: print('Due to insufficient number started further iteration') print('current number', len(self.urls), ', required', self.total_max_articles) for link in self.urls: - article_bs = BeautifulSoup(requests.get(self.url_start + link, 'html.parser').text, 'html.parser') + article_bs = BeautifulSoup(requests.get(URL_START + link, 'html.parser').text, 'html.parser') newfound = list(filter(lambda x: len(x) > 20, self._extract_url(article_bs, self.urls))) print(' checked new url, found', len(newfound), 'articles') self.urls.extend(newfound[:self.max_articles_per_seed]) @@ -87,11 +85,71 @@ def find_articles(self): self.urls = self.urls[:self.total_max_articles] + +class CrawlerRecursive(Crawler): + + def __init__(self, seed_urls: list, total_max_articles: int, max_articles_per_seed: int): + super().__init__(seed_urls, total_max_articles, max_articles_per_seed) + + def find_articles(self): + if self.get_backedup(): + print('backed up urls found, starting iteration') + if not self.urls: + for link in self.seed_urls: + # wait(randint(0, 10)) + article_bs = BeautifulSoup(requests.get(link, 'html.parser').text, 'html.parser') + newfound = self._extract_url(article_bs, self.urls) + self.urls.extend(newfound) + self.urls = [i for i in self.urls if len(i) > 20 + and not any(map(lambda y: y.isupper(), i))] + with open('links/url_backup.txt', 'w', encoding='utf-8') as file: + file.write('\n'.join(self.urls)) + print(f'Scraped {len(self.urls)} from seed') + if self.verify_proceed(): + print('starting recursive scraping') + self.find_articles() + else: + print(f'recursive crawling finished with {len(self.urls)} urls.') + else: + old = len(self.urls) + for link in self.urls: + # wait(randint(0, 10)) + article_bs = BeautifulSoup(requests.get(URL_START + link, 'html.parser').text, 'html.parser') + newfound = self._extract_url(article_bs, self.urls) + newfound = [i for i in newfound if len(i) > 20 + and not any(map(lambda y: y.isupper(), i))] + self.urls.extend(newfound) + with open('links/url_backup.txt', 'a', encoding='utf-8') as file: + file.write('\n'.join(self.urls)) + if len(self.urls) == old: + print(f'there are no unseen links found\nrecursive crawling finished with {len(self.urls)} urls.') + else: + print(f'found {len(self.urls) - old} new urls') + if self.verify_proceed(): + print('starting new iteration') + self.find_articles() + else: + print(f'recursive crawling finished with {len(self.urls)} urls.') + + @staticmethod + def verify_proceed(): + answer = input('Would you like to proceed? yes or no: ').strip() + return True if answer == 'yes' else False + + def get_backedup(self): + try: + with open('links/url_backup.txt', 'r', encoding='utf-8') as file: + sources = file.read().split('\n') + self.urls = sources + return True + except FileNotFoundError: + return False + def get_search_urls(self): """ Returns seed_urls param """ - pass + return self.urls class ArticleParser: @@ -174,9 +232,16 @@ def prepare_environment(base_path): """ Creates ASSETS_PATH folder if not created and removes existing folder """ - newpath = r'{}/ASSETS_PATH'.format(base_path) - if not os.path.exists(newpath): - os.makedirs(newpath) + if not os.path.exists(base_path): + os.makedirs(base_path) + + +def enable_backup(base_path): + """ + Creates folder for backup links if not created + """ + if not os.path.exists(base_path): + os.makedirs(base_path) def validate_config(crawler_path): @@ -215,12 +280,13 @@ def validate_config(crawler_path): if __name__ == '__main__': prepare_environment(ASSETS_PATH) + enable_backup(LINKS_STORAGE) seedurls, max_articles, max_arts_per_seed = validate_config(CRAWLER_CONFIG_PATH) if not max_arts_per_seed: max_arts_per_seed = max_articles - crawler = Crawler(seed_urls=seedurls, - total_max_articles=max_articles, - max_articles_per_seed=max_arts_per_seed) + crawler = CrawlerRecursive(seed_urls=seedurls, + total_max_articles=max_articles, + max_articles_per_seed=max_arts_per_seed) crawler.find_articles() # print('Scraped', len(crawler.urls), 'articles') @@ -228,7 +294,7 @@ def validate_config(crawler_path): print('onto parsing') for n, url in enumerate(crawler.urls): - full_url = crawler.url_start + url + full_url = URL_START + url parser = ArticleParser(full_url, n + 1) parser.parse() print('parsing is finished') From 54a49ed564588d6714170139ca0e994a6e7e70b9 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Thu, 11 Mar 2021 21:13:53 +0300 Subject: [PATCH 18/50] fixed my favorite lint --- constants.py | 2 +- scrapper.py | 28 +++++++++++++++------------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/constants.py b/constants.py index 3dc98002..3a7976d0 100644 --- a/constants.py +++ b/constants.py @@ -8,4 +8,4 @@ ASSETS_PATH = os.path.join(PROJECT_ROOT, 'tmp', 'articles') CRAWLER_CONFIG_PATH = os.path.join(PROJECT_ROOT, 'crawler_config.json') LINKS_STORAGE = os.path.join(PROJECT_ROOT, 'links') -URL_START = 'https://burunen.ru' \ No newline at end of file +URL_START = 'https://burunen.ru' diff --git a/scrapper.py b/scrapper.py index a2d86e41..65676a05 100644 --- a/scrapper.py +++ b/scrapper.py @@ -6,10 +6,10 @@ import json from datetime import date import requests +from random import randint +from time import sleep as wait from requests.exceptions import RequestException from bs4 import BeautifulSoup -from time import sleep as wait -from random import randint from article import Article from constants import CRAWLER_CONFIG_PATH, ASSETS_PATH, LINKS_STORAGE, URL_START @@ -76,7 +76,6 @@ def find_articles(self): newfound = list(filter(lambda x: len(x) > 20, self._extract_url(article_bs, self.urls))) print(' checked new url, found', len(newfound), 'articles') self.urls.extend(newfound[:self.max_articles_per_seed]) - # wait(10) if len(self.urls) > self.total_max_articles: break if len(self.urls) == old: @@ -85,18 +84,26 @@ def find_articles(self): self.urls = self.urls[:self.total_max_articles] + def get_search_urls(self): + """ + Returns seed_urls param + """ + return self.urls + class CrawlerRecursive(Crawler): - def __init__(self, seed_urls: list, total_max_articles: int, max_articles_per_seed: int): + def __init__(self, seed_urls: list, total_max_articles: int, max_articles_per_seed: int, to_wait=False): super().__init__(seed_urls, total_max_articles, max_articles_per_seed) + self.is_waiting = to_wait def find_articles(self): if self.get_backedup(): print('backed up urls found, starting iteration') if not self.urls: for link in self.seed_urls: - # wait(randint(0, 10)) + if self.is_waiting: + wait(randint(0, 10)) article_bs = BeautifulSoup(requests.get(link, 'html.parser').text, 'html.parser') newfound = self._extract_url(article_bs, self.urls) self.urls.extend(newfound) @@ -113,7 +120,8 @@ def find_articles(self): else: old = len(self.urls) for link in self.urls: - # wait(randint(0, 10)) + if self.is_waiting: + wait(randint(0, 10)) article_bs = BeautifulSoup(requests.get(URL_START + link, 'html.parser').text, 'html.parser') newfound = self._extract_url(article_bs, self.urls) newfound = [i for i in newfound if len(i) > 20 @@ -134,7 +142,7 @@ def find_articles(self): @staticmethod def verify_proceed(): answer = input('Would you like to proceed? yes or no: ').strip() - return True if answer == 'yes' else False + return answer == 'yes' def get_backedup(self): try: @@ -145,12 +153,6 @@ def get_backedup(self): except FileNotFoundError: return False - def get_search_urls(self): - """ - Returns seed_urls param - """ - return self.urls - class ArticleParser: """ From 2854e58003ec91eabff24dc44f0db41b42b9a627 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Thu, 11 Mar 2021 21:21:49 +0300 Subject: [PATCH 19/50] optimized a few things --- scrapper.py | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/scrapper.py b/scrapper.py index 65676a05..eb1295f2 100644 --- a/scrapper.py +++ b/scrapper.py @@ -5,9 +5,9 @@ import os import json from datetime import date -import requests from random import randint from time import sleep as wait +import requests from requests.exceptions import RequestException from bs4 import BeautifulSoup from article import Article @@ -97,20 +97,23 @@ def __init__(self, seed_urls: list, total_max_articles: int, max_articles_per_se super().__init__(seed_urls, total_max_articles, max_articles_per_seed) self.is_waiting = to_wait + def _crawl(self, pool: list): + for link in pool: + if self.is_waiting: + wait(randint(0, 10)) + article_bs = BeautifulSoup(requests.get(URL_START + link, 'html.parser').text, 'html.parser') + newfound = self._extract_url(article_bs, self.urls) + newfound = [i for i in newfound if len(i) > 20 + and not any(map(lambda y: y.isupper(), i))] + return newfound + def find_articles(self): if self.get_backedup(): print('backed up urls found, starting iteration') if not self.urls: - for link in self.seed_urls: - if self.is_waiting: - wait(randint(0, 10)) - article_bs = BeautifulSoup(requests.get(link, 'html.parser').text, 'html.parser') - newfound = self._extract_url(article_bs, self.urls) - self.urls.extend(newfound) - self.urls = [i for i in self.urls if len(i) > 20 - and not any(map(lambda y: y.isupper(), i))] - with open('links/url_backup.txt', 'w', encoding='utf-8') as file: - file.write('\n'.join(self.urls)) + self.urls = self._crawl(self.seed_urls) + with open('links/url_backup.txt', 'w', encoding='utf-8') as file: + file.write('\n'.join(self.urls)) print(f'Scraped {len(self.urls)} from seed') if self.verify_proceed(): print('starting recursive scraping') @@ -118,21 +121,14 @@ def find_articles(self): else: print(f'recursive crawling finished with {len(self.urls)} urls.') else: - old = len(self.urls) - for link in self.urls: - if self.is_waiting: - wait(randint(0, 10)) - article_bs = BeautifulSoup(requests.get(URL_START + link, 'html.parser').text, 'html.parser') - newfound = self._extract_url(article_bs, self.urls) - newfound = [i for i in newfound if len(i) > 20 - and not any(map(lambda y: y.isupper(), i))] - self.urls.extend(newfound) - with open('links/url_backup.txt', 'a', encoding='utf-8') as file: - file.write('\n'.join(self.urls)) - if len(self.urls) == old: + newfound = self._crawl(self.urls) + if not newfound: print(f'there are no unseen links found\nrecursive crawling finished with {len(self.urls)} urls.') else: - print(f'found {len(self.urls) - old} new urls') + self.urls.extend(newfound) + with open('links/url_backup.txt', 'a', encoding='utf-8') as file: + file.write('\n'.join(newfound)) + print(f'found {len(newfound)} new urls') if self.verify_proceed(): print('starting new iteration') self.find_articles() From 358f74ba8f1128f862e904ddf1eff20a284a256d Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Thu, 11 Mar 2021 21:55:52 +0300 Subject: [PATCH 20/50] improved text formation --- links/url_backup.txt | 320 ------------------------------------------- scrapper.py | 13 +- 2 files changed, 8 insertions(+), 325 deletions(-) delete mode 100644 links/url_backup.txt diff --git a/links/url_backup.txt b/links/url_backup.txt deleted file mode 100644 index d7bb92f5..00000000 --- a/links/url_backup.txt +++ /dev/null @@ -1,320 +0,0 @@ -/news/society/80553-pionery-buryatskogo-biznesa/ -/news/society/80527-zurkhay-na-10-marta-27-lunnyy-den/ -/news/society/80543-nakanune-10-marta-v-buryatii-ushel-iz-zhizni-eks-rektor-bgskha-aleksandr-popov/ -/news/society/80575-v-buryatii-startoval-federalnyy-proekt-chistaya-voda/ -/news/society/80548-v-stolitsu-buryatii-pribyl-arsen-fadzaev/ -/news/culture/80594-polozhenie-o-konkurse-rasskaza-2021/ -/news/society/80583-vystavka-okhotnichikh-laek-proshla-v-zakamenskom-rayone-buryatii/ -/news/society/80574-v-vuzakh-buryatii-poyavyatsya-prorektory-po-tsifrovizatsii/ -/news/society/80552-poyushchaya-garga-kak-zhivyet-poyushchee-selskoe-poselenie-v-kurumkane/ -/news/society/80572-zurkhay-na-11-marta-28-lunnyy-den/ -/news/society/80555-buryatiya-voshla-v-chislo-regionov-kotorye-podderzhali-pevitsu-manizhu/ -/news/society/80581-zhivaya-legenda-volnoy-borby-rossii-pribyl-v-buryatiyu-v-svoy-den-rozhdeniya/ -/news/society/80540-v-stolitsu-buryatii-nachali-pribyvat-uchastniki-i-gosti-chempionata-rossii-po-volnoy-borbe/ -/news/society/80542-stalo-izvestno-komu-iz-zhiteley-buryatii-udastsya-vyyti-na-pensiyu-dosrochno/ -/news/society/80602-v-buryatskom-sele-vydrino-otremontiruyut-detskuyu-shkolu-iskusstv/ -/news/society/80573-preimushchestvenno-bez-osadkov-dnyem-do-5-tepla-ozhidaetsya-v-buryatii-segodnya-11-marta-/ -/news/culture/79137-pamyati-geroev-rabotniki-sudebnoy-sistemy-zabaykalya-vypustili-knigu-vospominaniy-ob-uchastnikakh-vo/ -/news/society/80558-irkutskuyu-oblast-i-buryatiyu-vozmozhno-svyazhet-noveyshaya-elektrichka/ -/news/society/80535-v-buryatii-podveli-itogi-baykalskoy-mili/ -/news/society/80565-v-zaigraevskom-rayone-buryatii-otkrylas-novaya-shkola-na-450-mest/ -/news/society/80577-na-buryatiyu-obrushitsya-anomalnoe-poteplenie/ -/news/society/80584-v-buryatii-nachalis-meropriyatiya-po-profilaktike-ledyanykh-zatorov/ -/news/culture/80559-proverit-svoe-znanie-buryatskoy-grammatiki-smogut-zhiteli-buryatii/ -/news/culture/80475-dusha-v-tantse-studenty-kolledzha-iskusstv-dali-uroki-yekhora-dlya-zhurnalistov/ -/news/culture/80511-v-buryatii-otkrylas-vystavka-eksperimentalnogo-iskusstva-ii-sulde/ -/news/culture/80515-reper-iz-buryatii-zapisal-klip-s-pevitsey-kotoraya-vystupit-na-evrovidenii/ -/news/culture/80387-smysly-zurkhaya-geshe-tsyren-lama-o-tonkostyakh-buddiyskoy-astrologii-/ -/news/culture/80388-v-ulan-ude-sostoyalas-premera-opery-knyazya-igorya/ -/news/culture/80384-v-ulan-ude-vozvrashchaetsya-festival-uu-sound-/ -/news/culture/80521-v-ulan-ude-predstavili-dokumentalnyy-film-o-buryatskom-kostyume/ -/news/culture/80390-yunye-tsirkovye-artisty-buryatii-vpervye-vystupyat-v-tyumenskom-tsirke/ -/news/culture/80416-buryatskiy-ansambl-bulzhamuur-stal-laureatom-vserossiyskogo-festivalya/ -/news/culture/80592-v-ulan-ude-proshyel-kontsert-etnicheskoy-muzyki/ -/news/culture/80560-v-ulan-ude-nachinaet-svoyu-rabotu-etnokovorking/ -/news/culture/80536-knyaz-igor-v-buryatii-otkuda-v-stepi-drevnerusskaya-grust/ -/news/culture/80509-sezon-dozhdey-dadut-kontsert-v-ulan-ude/ -/news/culture/80557-koster-na-glavnoy-ploshchadi-buryatiya-gotovitsya-k-maslenitse/ -/news/culture/80522-v-ulan-ude-proshla-vstrecha-vesny-kak-zavershenie-vostochnogo-novogo-goda/ -/news/culture/80397-muzhchina-iz-severobaykalska-poluchil-realnyy-srok-za-povtornuyu-ezdu-v-pyanom-vide/ -/news/culture/80420-v-buryatii-izdali-knigu-minii-nyutag/ -/news/culture/80389-artisty-teatra-baykal-vyydut-v-efir-radio-mayak/ -/news/economy/80551-v-buryatii-rastet-spros-na-novye-avtomobili/ -/news/economy/80400-agrarii-buryatii-vernuli-chast-zemel-merii-ulan-ude/ -/news/economy/80462-v-buryatii-otremontiruyut-dorogu-v-posele/ -/news/economy/80447-biznes-buryatii-poluchit-vozmozhnosti-opravitsya-ot-posledstviy-pandemii/ -/news/economy/80549-prognoznyy-poleznyy-otpusk-elektroenergii-i-moshchnosti-po-tarifnym-gruppam/ -/news/economy/80427-v-buryatii-proydet-zasedanie-soveta-direktorov-kholdinga-vertolyety-rossii/ -/news/economy/80506-v-buryatii-otremontiruyut-odnu-iz-samykh-populyarnykh-avtodorog/ -/news/economy/80429-buryatiya-poluchit-okolo-3-milliardov-rubley-na-dorogi/ -/news/economy/80415-v-ulan-ude-startoval-akselerator-proektov-upravlentsev-buryatii/ -/news/economy/80432-buryatiya-poluchit-bolee-28-mln-rubley-na-podderzhku-proektov-sotsialno-orientirovannykh-organizatsi/ -/news/economy/80413-usloviya-dalnevostochnaya-ipoteka-namereny-uluchshit-v-buryatii-i-po-vsemu-dfo/ -/news/economy/80340-tunkinskiy-rayon-buryatii-poluchit-sredstva-na-razvitie-turizma/ -/news/economy/80435-ivan-alkheev-naznachen-zampredom-pravitelstva-buryatii/ -/news/economy/80433-na-ulan-udenskom-aviazavode-vpervye-proshlo-zasedanie-soveta-direktorov-vertolety-rossii/ -/news/economy/80317-bolee-2-tysyach-zhiteley-buryatii-prinyali-uchastie-v-biznes-vstreche-s-alekseem-tsydenovym/ -/news/economy/80478-produktsiya-iz-buryatii-vyshla-na-rynok-germanii/ -/news/economy/80412-buryatiya-priobretet-paket-aktsiy-avrora/ -/news/economy/80488-v-buryatii-razdeli-mestorozhdenie-urana/ -/news/economy/80344-pereezd-na-dalniy-vostok-pozvolit-buryatii-reanimirovat-dva-krupnykh-proekta/ -/news/economy/80491-buryatiya-voshla-v-zonu-modernizatsii-zheleznykh-dorog/ -/news/sports/80463-vdokhnovlyayushchie-rezultaty-novye-vzlyety-khudozhestvennoy-gimnastiki-buryatii/ -/news/sports/80600-v-kabanskom-rayone-buryatii-otkryli-sportivnuyu-ploshchadku/ -/news/sports/80546-studentki-iz-buryatii-na-pedestale-rossiyskogo-urovnya/ -/news/sports/80331-boksery-iz-buryatii-zavoevali-shest-medaley-na-vserossiyskom-turnire-klassa-a/ -/news/sports/80520-final-kubka-buryatii-ne-doigrali-no-pobeditelya-opredelili/ -/news/sports/80578-na-chempionate-rossii-v-ulan-ude-103-bortsa-opredelyat-pervykh-finalistov-obnovlyaetsya/ -/news/sports/80477-borets-iz-buryatii-rasskazal-kak-borolsya-za-ameriku/ -/news/sports/80351-boytsy-federatsii-pankrationa-buryatii-vystupili-na-urovne-dfo/ -/news/sports/80601-ministr-sporta-buryatii-chempionat-rossii-dast-novyy-impuls-razvitiyu-volnoy-borby-v-respublike/ -/news/sports/80507-buryatiya-utverdila-okonchatelnyy-sostav-sbornoy-dlya-uchastiya-v-chempionate-rossii-po-volnoy-borbe/ -/news/sports/80494-borits-iz-buryatii-zakryli-na-karantin-v-rime-/ -/news/sports/80472-na-lyzhakh-s-lukom-buryatskaya-sportsmenka-vyigrala-chetyre-medali-na-pervenstve-rossii/ -/news/sports/80547-beskompromissnye-igry-buryatskikh-shakhmatistov-na-dalnem-vostoke/ -/news/sports/80354-pervyy-bortsovskiy-internat-buryatii-vzyal-na-vooruzhenie-igru-go/ -/news/sports/80568-v-stolitse-buryatii-proshla-pervaya-zherebevka-chempionata-rossii-po-volnoy-borbe/ -/news/sports/80525-glavnyy-favorit-gryadushchego-chempionata-rossii-v-buryatii-ozvuchil-svoy-sostav/ -/news/sports/80470-v-buryatii-gryadet-bitva-titanov/ -/news/sports/80452-sportsmeny-iz-eravninskogo-rayona-buryatii-oderzhali-dve-pobedy/ -/news/sports/80345-znamenityy-rossiyskiy-futbolist-vyigral-baykalskiy-marafon-v-buryatii/ -/news/incidents/80181-v-buryatii-stali-bolshe-pit/ -/news/incidents/80467-v-buryatii-vyyavili-pyat-narusheniy-protivoepidemicheskikh-mer/ -/news/incidents/80408-byvshemu-rukovoditelyu-energeticheskoy-kompanii-iz-buryatii-inkriminiruyut-sozdanie-opg/ -/news/incidents/80517-kommunisty-buryatii-obvinili-organizatorov-baykalskoy-mili/ -/news/incidents/80453-v-kabanskom-rayone-buryatii-zhiteli-zamerzayut-v-svoikh-domakh-/ -/news/incidents/80461-v-bichure-podtopilo-pushkina-/ -/news/incidents/80226-v-buryatii-vynesli-verdikt-po-delu-o-napadenii-sobak-na-pervoklassnika/ -/news/incidents/80471-v-belorussii-izbili-futbolnogo-trenera-iz-buryatii/ -/news/incidents/80530-delo-o-zaderzhanii-brakonerov-v-buryatii-prokommentiroval-skr-/ -/news/incidents/80411-na-baykale-zamorozili-demontazh-zavoda-po-rozlivu-baykalskoy-vody/ -/news/incidents/80399-vrachi-rasskazali-o-sostoyanii-rebenka-ranennogo-v-buryatii/ -/news/incidents/80545-v-buryatii-pri-pozhare-v-zhilom-dome-spaslis-mat-i-dvoe-detey/ -/news/incidents/80582-v-kurumkanskom-rayone-buryatii-zaderzhan-pokhititel-myasa/ -/news/incidents/80349-musornaya-reforma-v-buryatii-stala-povodom-dlya-politicheskikh-sporov/ -/news/incidents/80380-v-buryatii-snova-otlichilsya-voditel-leksusa/ -/news/incidents/80422-sledstvennyy-komitet-po-buryatii-dal-kommentariy-po-povodu-zaderzhaniya-sergeya-ivanova/ -/news/incidents/80191-intsident-na-baykale-pod-kontrolem-pravitelstva-buryatii/ -/news/incidents/80326-v-buryatii-obyasnili-prichiny-tramvaynogo-kollapsa-v-ulan-ude/ -/news/incidents/80529-materialnyy-ushcherb-po-faktu-gibeli-podrostkov-v-buryatii-vzyshchut-s-vladeltsa-avto/ -/news/incidents/80492-v-buryatii-obvinyayut-inspektora-kotoryy-zaderzhal-vliyatelnykh-brakonerov-/ -/news/politics/80591-ministr-sporta-buryatii-otmetil-naplyv-imenitykh-sportsmenov-v-dni-chempionata-rossii-po-volnoy-borb/ -/news/politics/80588-glava-buryatii-prinyal-uchastie-v-soveshchanii-generalnogo-direktora-oao-rzhd//news/society/80553-pionery-buryatskogo-biznesa/ -/news/society/80527-zurkhay-na-10-marta-27-lunnyy-den/ -/news/society/80543-nakanune-10-marta-v-buryatii-ushel-iz-zhizni-eks-rektor-bgskha-aleksandr-popov/ -/news/society/80575-v-buryatii-startoval-federalnyy-proekt-chistaya-voda/ -/news/society/80548-v-stolitsu-buryatii-pribyl-arsen-fadzaev/ -/news/culture/80594-polozhenie-o-konkurse-rasskaza-2021/ -/news/society/80583-vystavka-okhotnichikh-laek-proshla-v-zakamenskom-rayone-buryatii/ -/news/society/80574-v-vuzakh-buryatii-poyavyatsya-prorektory-po-tsifrovizatsii/ -/news/society/80552-poyushchaya-garga-kak-zhivyet-poyushchee-selskoe-poselenie-v-kurumkane/ -/news/society/80572-zurkhay-na-11-marta-28-lunnyy-den/ -/news/society/80555-buryatiya-voshla-v-chislo-regionov-kotorye-podderzhali-pevitsu-manizhu/ -/news/society/80581-zhivaya-legenda-volnoy-borby-rossii-pribyl-v-buryatiyu-v-svoy-den-rozhdeniya/ -/news/society/80540-v-stolitsu-buryatii-nachali-pribyvat-uchastniki-i-gosti-chempionata-rossii-po-volnoy-borbe/ -/news/society/80542-stalo-izvestno-komu-iz-zhiteley-buryatii-udastsya-vyyti-na-pensiyu-dosrochno/ -/news/society/80602-v-buryatskom-sele-vydrino-otremontiruyut-detskuyu-shkolu-iskusstv/ -/news/society/80573-preimushchestvenno-bez-osadkov-dnyem-do-5-tepla-ozhidaetsya-v-buryatii-segodnya-11-marta-/ -/news/culture/79137-pamyati-geroev-rabotniki-sudebnoy-sistemy-zabaykalya-vypustili-knigu-vospominaniy-ob-uchastnikakh-vo/ -/news/society/80558-irkutskuyu-oblast-i-buryatiyu-vozmozhno-svyazhet-noveyshaya-elektrichka/ -/news/society/80535-v-buryatii-podveli-itogi-baykalskoy-mili/ -/news/society/80565-v-zaigraevskom-rayone-buryatii-otkrylas-novaya-shkola-na-450-mest/ -/news/society/80577-na-buryatiyu-obrushitsya-anomalnoe-poteplenie/ -/news/society/80584-v-buryatii-nachalis-meropriyatiya-po-profilaktike-ledyanykh-zatorov/ -/news/culture/80559-proverit-svoe-znanie-buryatskoy-grammatiki-smogut-zhiteli-buryatii/ -/news/culture/80475-dusha-v-tantse-studenty-kolledzha-iskusstv-dali-uroki-yekhora-dlya-zhurnalistov/ -/news/culture/80511-v-buryatii-otkrylas-vystavka-eksperimentalnogo-iskusstva-ii-sulde/ -/news/culture/80515-reper-iz-buryatii-zapisal-klip-s-pevitsey-kotoraya-vystupit-na-evrovidenii/ -/news/culture/80387-smysly-zurkhaya-geshe-tsyren-lama-o-tonkostyakh-buddiyskoy-astrologii-/ -/news/culture/80388-v-ulan-ude-sostoyalas-premera-opery-knyazya-igorya/ -/news/culture/80384-v-ulan-ude-vozvrashchaetsya-festival-uu-sound-/ -/news/culture/80521-v-ulan-ude-predstavili-dokumentalnyy-film-o-buryatskom-kostyume/ -/news/culture/80390-yunye-tsirkovye-artisty-buryatii-vpervye-vystupyat-v-tyumenskom-tsirke/ -/news/culture/80416-buryatskiy-ansambl-bulzhamuur-stal-laureatom-vserossiyskogo-festivalya/ -/news/culture/80592-v-ulan-ude-proshyel-kontsert-etnicheskoy-muzyki/ -/news/culture/80560-v-ulan-ude-nachinaet-svoyu-rabotu-etnokovorking/ -/news/culture/80536-knyaz-igor-v-buryatii-otkuda-v-stepi-drevnerusskaya-grust/ -/news/culture/80509-sezon-dozhdey-dadut-kontsert-v-ulan-ude/ -/news/culture/80557-koster-na-glavnoy-ploshchadi-buryatiya-gotovitsya-k-maslenitse/ -/news/culture/80522-v-ulan-ude-proshla-vstrecha-vesny-kak-zavershenie-vostochnogo-novogo-goda/ -/news/culture/80397-muzhchina-iz-severobaykalska-poluchil-realnyy-srok-za-povtornuyu-ezdu-v-pyanom-vide/ -/news/culture/80420-v-buryatii-izdali-knigu-minii-nyutag/ -/news/culture/80389-artisty-teatra-baykal-vyydut-v-efir-radio-mayak/ -/news/economy/80551-v-buryatii-rastet-spros-na-novye-avtomobili/ -/news/economy/80400-agrarii-buryatii-vernuli-chast-zemel-merii-ulan-ude/ -/news/economy/80462-v-buryatii-otremontiruyut-dorogu-v-posele/ -/news/economy/80447-biznes-buryatii-poluchit-vozmozhnosti-opravitsya-ot-posledstviy-pandemii/ -/news/economy/80549-prognoznyy-poleznyy-otpusk-elektroenergii-i-moshchnosti-po-tarifnym-gruppam/ -/news/economy/80427-v-buryatii-proydet-zasedanie-soveta-direktorov-kholdinga-vertolyety-rossii/ -/news/economy/80506-v-buryatii-otremontiruyut-odnu-iz-samykh-populyarnykh-avtodorog/ -/news/economy/80429-buryatiya-poluchit-okolo-3-milliardov-rubley-na-dorogi/ -/news/economy/80415-v-ulan-ude-startoval-akselerator-proektov-upravlentsev-buryatii/ -/news/economy/80432-buryatiya-poluchit-bolee-28-mln-rubley-na-podderzhku-proektov-sotsialno-orientirovannykh-organizatsi/ -/news/economy/80413-usloviya-dalnevostochnaya-ipoteka-namereny-uluchshit-v-buryatii-i-po-vsemu-dfo/ -/news/economy/80340-tunkinskiy-rayon-buryatii-poluchit-sredstva-na-razvitie-turizma/ -/news/economy/80435-ivan-alkheev-naznachen-zampredom-pravitelstva-buryatii/ -/news/economy/80433-na-ulan-udenskom-aviazavode-vpervye-proshlo-zasedanie-soveta-direktorov-vertolety-rossii/ -/news/economy/80317-bolee-2-tysyach-zhiteley-buryatii-prinyali-uchastie-v-biznes-vstreche-s-alekseem-tsydenovym/ -/news/economy/80478-produktsiya-iz-buryatii-vyshla-na-rynok-germanii/ -/news/economy/80412-buryatiya-priobretet-paket-aktsiy-avrora/ -/news/economy/80488-v-buryatii-razdeli-mestorozhdenie-urana/ -/news/economy/80344-pereezd-na-dalniy-vostok-pozvolit-buryatii-reanimirovat-dva-krupnykh-proekta/ -/news/economy/80491-buryatiya-voshla-v-zonu-modernizatsii-zheleznykh-dorog/ -/news/sports/80463-vdokhnovlyayushchie-rezultaty-novye-vzlyety-khudozhestvennoy-gimnastiki-buryatii/ -/news/sports/80600-v-kabanskom-rayone-buryatii-otkryli-sportivnuyu-ploshchadku/ -/news/sports/80546-studentki-iz-buryatii-na-pedestale-rossiyskogo-urovnya/ -/news/sports/80331-boksery-iz-buryatii-zavoevali-shest-medaley-na-vserossiyskom-turnire-klassa-a/ -/news/sports/80520-final-kubka-buryatii-ne-doigrali-no-pobeditelya-opredelili/ -/news/sports/80578-na-chempionate-rossii-v-ulan-ude-103-bortsa-opredelyat-pervykh-finalistov-obnovlyaetsya/ -/news/sports/80477-borets-iz-buryatii-rasskazal-kak-borolsya-za-ameriku/ -/news/sports/80351-boytsy-federatsii-pankrationa-buryatii-vystupili-na-urovne-dfo/ -/news/sports/80601-ministr-sporta-buryatii-chempionat-rossii-dast-novyy-impuls-razvitiyu-volnoy-borby-v-respublike/ -/news/sports/80507-buryatiya-utverdila-okonchatelnyy-sostav-sbornoy-dlya-uchastiya-v-chempionate-rossii-po-volnoy-borbe/ -/news/sports/80494-borits-iz-buryatii-zakryli-na-karantin-v-rime-/ -/news/sports/80472-na-lyzhakh-s-lukom-buryatskaya-sportsmenka-vyigrala-chetyre-medali-na-pervenstve-rossii/ -/news/sports/80547-beskompromissnye-igry-buryatskikh-shakhmatistov-na-dalnem-vostoke/ -/news/sports/80354-pervyy-bortsovskiy-internat-buryatii-vzyal-na-vooruzhenie-igru-go/ -/news/sports/80568-v-stolitse-buryatii-proshla-pervaya-zherebevka-chempionata-rossii-po-volnoy-borbe/ -/news/sports/80525-glavnyy-favorit-gryadushchego-chempionata-rossii-v-buryatii-ozvuchil-svoy-sostav/ -/news/sports/80470-v-buryatii-gryadet-bitva-titanov/ -/news/sports/80452-sportsmeny-iz-eravninskogo-rayona-buryatii-oderzhali-dve-pobedy/ -/news/sports/80345-znamenityy-rossiyskiy-futbolist-vyigral-baykalskiy-marafon-v-buryatii/ -/news/incidents/80181-v-buryatii-stali-bolshe-pit/ -/news/incidents/80467-v-buryatii-vyyavili-pyat-narusheniy-protivoepidemicheskikh-mer/ -/news/incidents/80408-byvshemu-rukovoditelyu-energeticheskoy-kompanii-iz-buryatii-inkriminiruyut-sozdanie-opg/ -/news/incidents/80517-kommunisty-buryatii-obvinili-organizatorov-baykalskoy-mili/ -/news/incidents/80453-v-kabanskom-rayone-buryatii-zhiteli-zamerzayut-v-svoikh-domakh-/ -/news/incidents/80461-v-bichure-podtopilo-pushkina-/ -/news/incidents/80226-v-buryatii-vynesli-verdikt-po-delu-o-napadenii-sobak-na-pervoklassnika/ -/news/incidents/80471-v-belorussii-izbili-futbolnogo-trenera-iz-buryatii/ -/news/incidents/80530-delo-o-zaderzhanii-brakonerov-v-buryatii-prokommentiroval-skr-/ -/news/incidents/80411-na-baykale-zamorozili-demontazh-zavoda-po-rozlivu-baykalskoy-vody/ -/news/incidents/80399-vrachi-rasskazali-o-sostoyanii-rebenka-ranennogo-v-buryatii/ -/news/incidents/80545-v-buryatii-pri-pozhare-v-zhilom-dome-spaslis-mat-i-dvoe-detey/ -/news/incidents/80582-v-kurumkanskom-rayone-buryatii-zaderzhan-pokhititel-myasa/ -/news/incidents/80349-musornaya-reforma-v-buryatii-stala-povodom-dlya-politicheskikh-sporov/ -/news/incidents/80380-v-buryatii-snova-otlichilsya-voditel-leksusa/ -/news/incidents/80422-sledstvennyy-komitet-po-buryatii-dal-kommentariy-po-povodu-zaderzhaniya-sergeya-ivanova/ -/news/incidents/80191-intsident-na-baykale-pod-kontrolem-pravitelstva-buryatii/ -/news/incidents/80326-v-buryatii-obyasnili-prichiny-tramvaynogo-kollapsa-v-ulan-ude/ -/news/incidents/80529-materialnyy-ushcherb-po-faktu-gibeli-podrostkov-v-buryatii-vzyshchut-s-vladeltsa-avto/ -/news/incidents/80492-v-buryatii-obvinyayut-inspektora-kotoryy-zaderzhal-vliyatelnykh-brakonerov-/ -/news/politics/80591-ministr-sporta-buryatii-otmetil-naplyv-imenitykh-sportsmenov-v-dni-chempionata-rossii-po-volnoy-borb/ -/news/politics/80588-glava-buryatii-prinyal-uchastie-v-soveshchanii-generalnogo-direktora-oao-rzhd/ -/news/politics/80590-glava-buryatii-vyekhal-s-rabochey-poezdkoy-v-mukhorshibirskiy-rayon/ -/news/politics/80498-aleksey-tsydenov-pozdravil-vsekh-zhenshchin-buryatii-s-8-marta-/ -/news/politics/80538-delo-o-zaderzhanii-brakonerov-v-buryatii-poruchil-vzyat-pod-kontrol-yuriy-trutnev/ -/news/politics/80480-glava-buryatii-vmeste-s-poslom-izrailya-v-rossii-pochtili-pamyat-geroev-vov/ -/news/politics/80482-glava-buryatii-predlozhil-poslu-izrailya-sotrudnichestvo-v-sfere-turizma-i-meditsiny/ -/news/politics/80436-glava-buryatii-i-prezident-tatarstana-obsudili-sotrudnichestvo-mezhdu-regionami/ -/news/politics/80456-glavnyy-kommunist-buryatii-ne-voshyel-v-short-list-ot-kprf/ -/news/politics/80508-glava-buryatii-nagradil-laureatov-gosudarstvennykh-premiy-respubliki-v-sfere-kultury-i-iskusstva//news/society/80553-pionery-buryatskogo-biznesa/ -/news/society/80527-zurkhay-na-10-marta-27-lunnyy-den/ -/news/society/80543-nakanune-10-marta-v-buryatii-ushel-iz-zhizni-eks-rektor-bgskha-aleksandr-popov/ -/news/society/80575-v-buryatii-startoval-federalnyy-proekt-chistaya-voda/ -/news/society/80548-v-stolitsu-buryatii-pribyl-arsen-fadzaev/ -/news/culture/80594-polozhenie-o-konkurse-rasskaza-2021/ -/news/society/80583-vystavka-okhotnichikh-laek-proshla-v-zakamenskom-rayone-buryatii/ -/news/society/80574-v-vuzakh-buryatii-poyavyatsya-prorektory-po-tsifrovizatsii/ -/news/society/80552-poyushchaya-garga-kak-zhivyet-poyushchee-selskoe-poselenie-v-kurumkane/ -/news/society/80572-zurkhay-na-11-marta-28-lunnyy-den/ -/news/society/80555-buryatiya-voshla-v-chislo-regionov-kotorye-podderzhali-pevitsu-manizhu/ -/news/society/80581-zhivaya-legenda-volnoy-borby-rossii-pribyl-v-buryatiyu-v-svoy-den-rozhdeniya/ -/news/society/80540-v-stolitsu-buryatii-nachali-pribyvat-uchastniki-i-gosti-chempionata-rossii-po-volnoy-borbe/ -/news/society/80542-stalo-izvestno-komu-iz-zhiteley-buryatii-udastsya-vyyti-na-pensiyu-dosrochno/ -/news/society/80602-v-buryatskom-sele-vydrino-otremontiruyut-detskuyu-shkolu-iskusstv/ -/news/society/80573-preimushchestvenno-bez-osadkov-dnyem-do-5-tepla-ozhidaetsya-v-buryatii-segodnya-11-marta-/ -/news/culture/79137-pamyati-geroev-rabotniki-sudebnoy-sistemy-zabaykalya-vypustili-knigu-vospominaniy-ob-uchastnikakh-vo/ -/news/society/80558-irkutskuyu-oblast-i-buryatiyu-vozmozhno-svyazhet-noveyshaya-elektrichka/ -/news/society/80535-v-buryatii-podveli-itogi-baykalskoy-mili/ -/news/society/80565-v-zaigraevskom-rayone-buryatii-otkrylas-novaya-shkola-na-450-mest/ -/news/society/80577-na-buryatiyu-obrushitsya-anomalnoe-poteplenie/ -/news/society/80584-v-buryatii-nachalis-meropriyatiya-po-profilaktike-ledyanykh-zatorov/ -/news/culture/80559-proverit-svoe-znanie-buryatskoy-grammatiki-smogut-zhiteli-buryatii/ -/news/culture/80475-dusha-v-tantse-studenty-kolledzha-iskusstv-dali-uroki-yekhora-dlya-zhurnalistov/ -/news/culture/80511-v-buryatii-otkrylas-vystavka-eksperimentalnogo-iskusstva-ii-sulde/ -/news/culture/80515-reper-iz-buryatii-zapisal-klip-s-pevitsey-kotoraya-vystupit-na-evrovidenii/ -/news/culture/80387-smysly-zurkhaya-geshe-tsyren-lama-o-tonkostyakh-buddiyskoy-astrologii-/ -/news/culture/80388-v-ulan-ude-sostoyalas-premera-opery-knyazya-igorya/ -/news/culture/80384-v-ulan-ude-vozvrashchaetsya-festival-uu-sound-/ -/news/culture/80521-v-ulan-ude-predstavili-dokumentalnyy-film-o-buryatskom-kostyume/ -/news/culture/80390-yunye-tsirkovye-artisty-buryatii-vpervye-vystupyat-v-tyumenskom-tsirke/ -/news/culture/80416-buryatskiy-ansambl-bulzhamuur-stal-laureatom-vserossiyskogo-festivalya/ -/news/culture/80592-v-ulan-ude-proshyel-kontsert-etnicheskoy-muzyki/ -/news/culture/80560-v-ulan-ude-nachinaet-svoyu-rabotu-etnokovorking/ -/news/culture/80536-knyaz-igor-v-buryatii-otkuda-v-stepi-drevnerusskaya-grust/ -/news/culture/80509-sezon-dozhdey-dadut-kontsert-v-ulan-ude/ -/news/culture/80557-koster-na-glavnoy-ploshchadi-buryatiya-gotovitsya-k-maslenitse/ -/news/culture/80522-v-ulan-ude-proshla-vstrecha-vesny-kak-zavershenie-vostochnogo-novogo-goda/ -/news/culture/80397-muzhchina-iz-severobaykalska-poluchil-realnyy-srok-za-povtornuyu-ezdu-v-pyanom-vide/ -/news/culture/80420-v-buryatii-izdali-knigu-minii-nyutag/ -/news/culture/80389-artisty-teatra-baykal-vyydut-v-efir-radio-mayak/ -/news/economy/80551-v-buryatii-rastet-spros-na-novye-avtomobili/ -/news/economy/80400-agrarii-buryatii-vernuli-chast-zemel-merii-ulan-ude/ -/news/economy/80462-v-buryatii-otremontiruyut-dorogu-v-posele/ -/news/economy/80447-biznes-buryatii-poluchit-vozmozhnosti-opravitsya-ot-posledstviy-pandemii/ -/news/economy/80549-prognoznyy-poleznyy-otpusk-elektroenergii-i-moshchnosti-po-tarifnym-gruppam/ -/news/economy/80427-v-buryatii-proydet-zasedanie-soveta-direktorov-kholdinga-vertolyety-rossii/ -/news/economy/80506-v-buryatii-otremontiruyut-odnu-iz-samykh-populyarnykh-avtodorog/ -/news/economy/80429-buryatiya-poluchit-okolo-3-milliardov-rubley-na-dorogi/ -/news/economy/80415-v-ulan-ude-startoval-akselerator-proektov-upravlentsev-buryatii/ -/news/economy/80432-buryatiya-poluchit-bolee-28-mln-rubley-na-podderzhku-proektov-sotsialno-orientirovannykh-organizatsi/ -/news/economy/80413-usloviya-dalnevostochnaya-ipoteka-namereny-uluchshit-v-buryatii-i-po-vsemu-dfo/ -/news/economy/80340-tunkinskiy-rayon-buryatii-poluchit-sredstva-na-razvitie-turizma/ -/news/economy/80435-ivan-alkheev-naznachen-zampredom-pravitelstva-buryatii/ -/news/economy/80433-na-ulan-udenskom-aviazavode-vpervye-proshlo-zasedanie-soveta-direktorov-vertolety-rossii/ -/news/economy/80317-bolee-2-tysyach-zhiteley-buryatii-prinyali-uchastie-v-biznes-vstreche-s-alekseem-tsydenovym/ -/news/economy/80478-produktsiya-iz-buryatii-vyshla-na-rynok-germanii/ -/news/economy/80412-buryatiya-priobretet-paket-aktsiy-avrora/ -/news/economy/80488-v-buryatii-razdeli-mestorozhdenie-urana/ -/news/economy/80344-pereezd-na-dalniy-vostok-pozvolit-buryatii-reanimirovat-dva-krupnykh-proekta/ -/news/economy/80491-buryatiya-voshla-v-zonu-modernizatsii-zheleznykh-dorog/ -/news/sports/80463-vdokhnovlyayushchie-rezultaty-novye-vzlyety-khudozhestvennoy-gimnastiki-buryatii/ -/news/sports/80600-v-kabanskom-rayone-buryatii-otkryli-sportivnuyu-ploshchadku/ -/news/sports/80546-studentki-iz-buryatii-na-pedestale-rossiyskogo-urovnya/ -/news/sports/80331-boksery-iz-buryatii-zavoevali-shest-medaley-na-vserossiyskom-turnire-klassa-a/ -/news/sports/80520-final-kubka-buryatii-ne-doigrali-no-pobeditelya-opredelili/ -/news/sports/80578-na-chempionate-rossii-v-ulan-ude-103-bortsa-opredelyat-pervykh-finalistov-obnovlyaetsya/ -/news/sports/80477-borets-iz-buryatii-rasskazal-kak-borolsya-za-ameriku/ -/news/sports/80351-boytsy-federatsii-pankrationa-buryatii-vystupili-na-urovne-dfo/ -/news/sports/80601-ministr-sporta-buryatii-chempionat-rossii-dast-novyy-impuls-razvitiyu-volnoy-borby-v-respublike/ -/news/sports/80507-buryatiya-utverdila-okonchatelnyy-sostav-sbornoy-dlya-uchastiya-v-chempionate-rossii-po-volnoy-borbe/ -/news/sports/80494-borits-iz-buryatii-zakryli-na-karantin-v-rime-/ -/news/sports/80472-na-lyzhakh-s-lukom-buryatskaya-sportsmenka-vyigrala-chetyre-medali-na-pervenstve-rossii/ -/news/sports/80547-beskompromissnye-igry-buryatskikh-shakhmatistov-na-dalnem-vostoke/ -/news/sports/80354-pervyy-bortsovskiy-internat-buryatii-vzyal-na-vooruzhenie-igru-go/ -/news/sports/80568-v-stolitse-buryatii-proshla-pervaya-zherebevka-chempionata-rossii-po-volnoy-borbe/ -/news/sports/80525-glavnyy-favorit-gryadushchego-chempionata-rossii-v-buryatii-ozvuchil-svoy-sostav/ -/news/sports/80470-v-buryatii-gryadet-bitva-titanov/ -/news/sports/80452-sportsmeny-iz-eravninskogo-rayona-buryatii-oderzhali-dve-pobedy/ -/news/sports/80345-znamenityy-rossiyskiy-futbolist-vyigral-baykalskiy-marafon-v-buryatii/ -/news/incidents/80181-v-buryatii-stali-bolshe-pit/ -/news/incidents/80467-v-buryatii-vyyavili-pyat-narusheniy-protivoepidemicheskikh-mer/ -/news/incidents/80408-byvshemu-rukovoditelyu-energeticheskoy-kompanii-iz-buryatii-inkriminiruyut-sozdanie-opg/ -/news/incidents/80517-kommunisty-buryatii-obvinili-organizatorov-baykalskoy-mili/ -/news/incidents/80453-v-kabanskom-rayone-buryatii-zhiteli-zamerzayut-v-svoikh-domakh-/ -/news/incidents/80461-v-bichure-podtopilo-pushkina-/ -/news/incidents/80226-v-buryatii-vynesli-verdikt-po-delu-o-napadenii-sobak-na-pervoklassnika/ -/news/incidents/80471-v-belorussii-izbili-futbolnogo-trenera-iz-buryatii/ -/news/incidents/80530-delo-o-zaderzhanii-brakonerov-v-buryatii-prokommentiroval-skr-/ -/news/incidents/80411-na-baykale-zamorozili-demontazh-zavoda-po-rozlivu-baykalskoy-vody/ -/news/incidents/80399-vrachi-rasskazali-o-sostoyanii-rebenka-ranennogo-v-buryatii/ -/news/incidents/80545-v-buryatii-pri-pozhare-v-zhilom-dome-spaslis-mat-i-dvoe-detey/ -/news/incidents/80582-v-kurumkanskom-rayone-buryatii-zaderzhan-pokhititel-myasa/ -/news/incidents/80349-musornaya-reforma-v-buryatii-stala-povodom-dlya-politicheskikh-sporov/ -/news/incidents/80380-v-buryatii-snova-otlichilsya-voditel-leksusa/ -/news/incidents/80422-sledstvennyy-komitet-po-buryatii-dal-kommentariy-po-povodu-zaderzhaniya-sergeya-ivanova/ -/news/incidents/80191-intsident-na-baykale-pod-kontrolem-pravitelstva-buryatii/ -/news/incidents/80326-v-buryatii-obyasnili-prichiny-tramvaynogo-kollapsa-v-ulan-ude/ -/news/incidents/80529-materialnyy-ushcherb-po-faktu-gibeli-podrostkov-v-buryatii-vzyshchut-s-vladeltsa-avto/ -/news/incidents/80492-v-buryatii-obvinyayut-inspektora-kotoryy-zaderzhal-vliyatelnykh-brakonerov-/ -/news/politics/80591-ministr-sporta-buryatii-otmetil-naplyv-imenitykh-sportsmenov-v-dni-chempionata-rossii-po-volnoy-borb/ -/news/politics/80588-glava-buryatii-prinyal-uchastie-v-soveshchanii-generalnogo-direktora-oao-rzhd/ -/news/politics/80590-glava-buryatii-vyekhal-s-rabochey-poezdkoy-v-mukhorshibirskiy-rayon/ -/news/politics/80498-aleksey-tsydenov-pozdravil-vsekh-zhenshchin-buryatii-s-8-marta-/ -/news/politics/80538-delo-o-zaderzhanii-brakonerov-v-buryatii-poruchil-vzyat-pod-kontrol-yuriy-trutnev/ -/news/politics/80480-glava-buryatii-vmeste-s-poslom-izrailya-v-rossii-pochtili-pamyat-geroev-vov/ -/news/politics/80482-glava-buryatii-predlozhil-poslu-izrailya-sotrudnichestvo-v-sfere-turizma-i-meditsiny/ -/news/politics/80436-glava-buryatii-i-prezident-tatarstana-obsudili-sotrudnichestvo-mezhdu-regionami/ -/news/politics/80456-glavnyy-kommunist-buryatii-ne-voshyel-v-short-list-ot-kprf/ -/news/politics/80508-glava-buryatii-nagradil-laureatov-gosudarstvennykh-premiy-respubliki-v-sfere-kultury-i-iskusstva/ \ No newline at end of file diff --git a/scrapper.py b/scrapper.py index eb1295f2..bf64ac64 100644 --- a/scrapper.py +++ b/scrapper.py @@ -51,8 +51,6 @@ def __init__(self, seed_urls: list, total_max_articles: int, max_articles_per_se @staticmethod def _extract_url(article_bs, seen): extracted = list({link['href'] for link in article_bs.find_all('a', href=True)}) - # print(extracted) - # print(' ',extracted) return list(filter(lambda x: x.startswith('/news/') and x not in seen, extracted)) @@ -101,7 +99,11 @@ def _crawl(self, pool: list): for link in pool: if self.is_waiting: wait(randint(0, 10)) - article_bs = BeautifulSoup(requests.get(URL_START + link, 'html.parser').text, 'html.parser') + try: + article_bs = BeautifulSoup(requests.get(URL_START + link, 'html.parser').text, 'html.parser') + except requests.exceptions.ConnectionError: + wait(10) + article_bs = BeautifulSoup(requests.get(URL_START + link, 'html.parser').text, 'html.parser') newfound = self._extract_url(article_bs, self.urls) newfound = [i for i in newfound if len(i) > 20 and not any(map(lambda y: y.isupper(), i))] @@ -162,7 +164,9 @@ def __init__(self, full__url: str, article_id: int): def _fill_article_with_text(self, article_soup): try: text = article_soup.find('div', {'class': 'text letter', 'itemprop': 'articleBody'}).text.strip() - self.article.text = text + text = [i for i in text.split('\n') if 'Фото:' not in i and 'Автор:' not in i + and '© фото:' not in i and 'Источник:' not in i] + self.article.text = '\n'.join(text).strip() except AttributeError: print(' unable to parse', self.full_url) @@ -216,7 +220,6 @@ def parse(self): """ Parses each article """ - # print(self.full_url) self.article.url = self.full_url self.article.article_id = self.article_id html = requests.get(self.full_url, 'html.parser').text From f7aa7f6e18c57acf17958b1e04091618c8f67aa8 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Thu, 11 Mar 2021 22:05:57 +0300 Subject: [PATCH 21/50] uhm --- scrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapper.py b/scrapper.py index bf64ac64..062b493e 100644 --- a/scrapper.py +++ b/scrapper.py @@ -182,7 +182,7 @@ def _fill_article_with_meta_information(self, article_soup): author = article_soup.find('div', {'class': 'credits t-caption'}).text.strip() author = author.split('\n')[0][9:].strip() else: - author = '' + author = 'Not found' self.article.author = author when = article_soup.find('div', {'class': 'b-caption'}).text.strip().split('\n')[1] self.article.date = self.unify_date_format(when) From e75381c3c0e1d8aab8dc5b6fc3eb5c7d06f50b5d Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Sat, 13 Mar 2021 11:51:05 +0300 Subject: [PATCH 22/50] fixed some problems from review --- config/constants.py | 10 -- config/raw_metadata_score_four_test.py | 1 - constants.py | 6 +- links/links.txt | 0 scrapper.py | 162 +++++++++++-------------- 5 files changed, 75 insertions(+), 104 deletions(-) delete mode 100644 config/constants.py create mode 100644 links/links.txt diff --git a/config/constants.py b/config/constants.py deleted file mode 100644 index 28a84b06..00000000 --- a/config/constants.py +++ /dev/null @@ -1,10 +0,0 @@ -""" -Useful constant variables -""" - -import os - -PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__)) -ASSETS_PATH = os.path.join(PROJECT_ROOT, 'tmp', 'articles') -CRAWLER_CONFIG_PATH = os.path.join(PROJECT_ROOT, 'crawler_config.json') -LINKS_STORAGE = os.path.join(PROJECT_ROOT, 'links') diff --git a/config/raw_metadata_score_four_test.py b/config/raw_metadata_score_four_test.py index e6900c05..d8691879 100644 --- a/config/raw_metadata_score_four_test.py +++ b/config/raw_metadata_score_four_test.py @@ -16,7 +16,6 @@ def setUp(self) -> None: def test_validate_sort(self): list_ids = [pair[0] for pair in self.texts] - print(list_ids) for i in range(1, len(list_ids)+1): self.assertTrue(i in list_ids, msg="""Articles ids are not homogeneous. E.g. numbers are not from 1 to N""") diff --git a/constants.py b/constants.py index 3a7976d0..938510b9 100644 --- a/constants.py +++ b/constants.py @@ -7,5 +7,9 @@ PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__)) ASSETS_PATH = os.path.join(PROJECT_ROOT, 'tmp', 'articles') CRAWLER_CONFIG_PATH = os.path.join(PROJECT_ROOT, 'crawler_config.json') -LINKS_STORAGE = os.path.join(PROJECT_ROOT, 'links') +LINKS_STORAGE_DIR = os.path.join(PROJECT_ROOT, 'links') +LINKS_STORAGE_FILE = os.path.join(LINKS_STORAGE_DIR, 'links.txt') + URL_START = 'https://burunen.ru' +HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)' + ' AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'} \ No newline at end of file diff --git a/links/links.txt b/links/links.txt new file mode 100644 index 00000000..e69de29b diff --git a/scrapper.py b/scrapper.py index 062b493e..372973e7 100644 --- a/scrapper.py +++ b/scrapper.py @@ -2,16 +2,18 @@ Crawler implementation """ -import os -import json from datetime import date +import json +import os from random import randint from time import sleep as wait + +from bs4 import BeautifulSoup import requests from requests.exceptions import RequestException -from bs4 import BeautifulSoup + from article import Article -from constants import CRAWLER_CONFIG_PATH, ASSETS_PATH, LINKS_STORAGE, URL_START +from constants import CRAWLER_CONFIG_PATH, ASSETS_PATH, LINKS_STORAGE_DIR, URL_START, HEADERS, LINKS_STORAGE_FILE class IncorrectURLError(Exception): @@ -38,6 +40,12 @@ class UnknownConfigError(Exception): """ +class NoBackUpEnabled(Exception): + """ + Custom Error + """ + + class Crawler: """ Crawler implementation @@ -59,27 +67,28 @@ def find_articles(self): Finds articles """ for link in self.seed_urls: - article_bs = BeautifulSoup(requests.get(link, 'html.parser').text, 'html.parser') + article_bs = BeautifulSoup(requests.get(link, 'html.parser', headers=HEADERS).text, 'html.parser') newfound = self._extract_url(article_bs, self.urls) self.urls.extend(newfound[:self.max_articles_per_seed]) self.urls = [i for i in self.urls if len(i) > 20 and not any(map(lambda y: y.isupper(), i))][:self.total_max_articles] print('Scraped seed urls, overall number of urls is', len(self.urls)) - old = len(self.urls) while len(self.urls) < self.total_max_articles: print('Due to insufficient number started further iteration') print('current number', len(self.urls), ', required', self.total_max_articles) + old = len(self.urls) for link in self.urls: - article_bs = BeautifulSoup(requests.get(URL_START + link, 'html.parser').text, 'html.parser') + article_bs = BeautifulSoup(requests.get(URL_START + link, + 'html.parser', headers=HEADERS).text, 'html.parser') newfound = list(filter(lambda x: len(x) > 20, self._extract_url(article_bs, self.urls))) - print(' checked new url, found', len(newfound), 'articles') + print('checked new url, found', len(newfound), 'articles') self.urls.extend(newfound[:self.max_articles_per_seed]) if len(self.urls) > self.total_max_articles: break if len(self.urls) == old: - print(' Something is wrong with scraping parameters') + print('There are no unseen urls found in all of the available addresses') + print(f'crawling finished with {len(self.urls)}') break - self.urls = self.urls[:self.total_max_articles] def get_search_urls(self): @@ -96,60 +105,54 @@ def __init__(self, seed_urls: list, total_max_articles: int, max_articles_per_se self.is_waiting = to_wait def _crawl(self, pool: list): + found = [] for link in pool: if self.is_waiting: wait(randint(0, 10)) - try: - article_bs = BeautifulSoup(requests.get(URL_START + link, 'html.parser').text, 'html.parser') - except requests.exceptions.ConnectionError: - wait(10) - article_bs = BeautifulSoup(requests.get(URL_START + link, 'html.parser').text, 'html.parser') + article_bs = BeautifulSoup(requests.get(URL_START + link, headers=HEADERS).text, 'html.parser') newfound = self._extract_url(article_bs, self.urls) newfound = [i for i in newfound if len(i) > 20 and not any(map(lambda y: y.isupper(), i))] - return newfound + found.extend(newfound) + return list(set(found)) def find_articles(self): - if self.get_backedup(): + if self.read_backedup(): print('backed up urls found, starting iteration') if not self.urls: - self.urls = self._crawl(self.seed_urls) - with open('links/url_backup.txt', 'w', encoding='utf-8') as file: - file.write('\n'.join(self.urls)) - print(f'Scraped {len(self.urls)} from seed') + pool = self.seed_urls + else: + pool = self.urls + newfound = self._crawl(pool) + if not newfound: + print(f'there are no unseen links found\nrecursive crawling finished with {len(self.urls)} urls.') + else: + self.urls.extend(newfound) + with open(LINKS_STORAGE_FILE, 'a', encoding='utf-8') as file: + file.write('\n'.join(newfound)) + print(f'found {len(newfound)} new urls') if self.verify_proceed(): - print('starting recursive scraping') + print('starting new iteration') self.find_articles() else: print(f'recursive crawling finished with {len(self.urls)} urls.') - else: - newfound = self._crawl(self.urls) - if not newfound: - print(f'there are no unseen links found\nrecursive crawling finished with {len(self.urls)} urls.') - else: - self.urls.extend(newfound) - with open('links/url_backup.txt', 'a', encoding='utf-8') as file: - file.write('\n'.join(newfound)) - print(f'found {len(newfound)} new urls') - if self.verify_proceed(): - print('starting new iteration') - self.find_articles() - else: - print(f'recursive crawling finished with {len(self.urls)} urls.') @staticmethod def verify_proceed(): answer = input('Would you like to proceed? yes or no: ').strip() return answer == 'yes' - def get_backedup(self): + def read_backedup(self): try: - with open('links/url_backup.txt', 'r', encoding='utf-8') as file: + with open(LINKS_STORAGE_FILE, 'r', encoding='utf-8') as file: sources = file.read().split('\n') self.urls = sources - return True + if self.urls: + print('backed up urls found') except FileNotFoundError: - return False + print('no backed up files found') + with open(LINKS_STORAGE_FILE, 'w', encoding='utf-8'): + pass class ArticleParser: @@ -163,36 +166,28 @@ def __init__(self, full__url: str, article_id: int): def _fill_article_with_text(self, article_soup): try: - text = article_soup.find('div', {'class': 'text letter', 'itemprop': 'articleBody'}).text.strip() - text = [i for i in text.split('\n') if 'Фото:' not in i and 'Автор:' not in i - and '© фото:' not in i and 'Источник:' not in i] - self.article.text = '\n'.join(text).strip() + text = article_soup.find('div', {'class': 'text letter', + 'itemprop': 'articleBody'}).text.strip().split('\n') + stopws = ['Фото:', 'Автор:', 'Источник:', '© фото:'] + self.article.text = ' '.join(filter(lambda line: + all(map(lambda stopw: stopw not in line, stopws)), text)).strip() except AttributeError: - print(' unable to parse', self.full_url) + print('unable to parse', self.full_url) + self.article.text = 'ERROR' def _fill_article_with_meta_information(self, article_soup): try: title = article_soup.title.text self.article.title = title - - credit = article_soup.find('div', {'class': 'credits t-caption'}).text.strip().split('\n')[0] - if 'Автор:' in credit: - author = article_soup.find('div', {'class': 'credits t-caption'}).text.strip().split('\n')[0][7:] - elif 'Источник:' in credit: - author = article_soup.find('div', {'class': 'credits t-caption'}).text.strip() - author = author.split('\n')[0][9:].strip() - else: - author = 'Not found' - self.article.author = author + author = article_soup.find('div', + {'class': 'credits t-caption'}).text.strip().split('\n')[0].split(': ')[-1] + self.article.author = author.strip() when = article_soup.find('div', {'class': 'b-caption'}).text.strip().split('\n')[1] self.article.date = self.unify_date_format(when) - topic = article_soup.find('div', {'class': 'b-caption'}).text.strip().split('\n')[0] self.article.topics = topic except AttributeError: - print(' something is off with', self.full_url) - # print(title) - # self.article.title = title + print('something is off with', self.full_url) @staticmethod def unify_date_format(date_str): @@ -220,29 +215,22 @@ def parse(self): """ Parses each article """ - self.article.url = self.full_url - self.article.article_id = self.article_id - html = requests.get(self.full_url, 'html.parser').text + html = requests.get(self.full_url, 'html.parser', headers=HEADERS).text article_bs = BeautifulSoup(html, 'html.parser') self._fill_article_with_text(article_bs) self._fill_article_with_meta_information(article_bs) - self.article.save_raw() + return self.article -def prepare_environment(base_path): +def prepare_environment(base_path, backup_path_dir): """ Creates ASSETS_PATH folder if not created and removes existing folder """ if not os.path.exists(base_path): os.makedirs(base_path) - - -def enable_backup(base_path): - """ - Creates folder for backup links if not created - """ - if not os.path.exists(base_path): - os.makedirs(base_path) + if not os.path.exists(backup_path_dir): + print('GOT HERE WITH PATH', backup_path_dir) + os.makedirs(backup_path_dir) def validate_config(crawler_path): @@ -251,13 +239,8 @@ def validate_config(crawler_path): """ with open(crawler_path) as crawler_config: config = json.load(crawler_config) - try: - good_response = list(map(lambda link: requests.get(link).status_code == 200, - config['base_urls'])) - except RequestException as exception: - raise IncorrectURLError from exception - except Exception as exception: - raise UnknownConfigError from exception + good_response = list(map(lambda link: link.startswith('https://'), + config['base_urls'])) if not all(good_response): raise IncorrectURLError try: @@ -265,11 +248,6 @@ def validate_config(crawler_path): raise IncorrectNumberOfArticlesError if config['total_articles_to_find_and_parse'] > 1000: raise NumberOfArticlesOutOfRangeError - # if not isinstance(config['max_number_articles_to_get_from_one_seed'], int): - # raise IncorrectNumberOfArticlesError - # if not config['total_articles_to_find_and_parse'] < config['max_number_articles_to_get_from_one_seed']\ - # * len(good_response): - # raise NumberOfArticlesOutOfRangeError except KeyError as exception: raise IncorrectNumberOfArticlesError from exception try: @@ -280,22 +258,22 @@ def validate_config(crawler_path): if __name__ == '__main__': - prepare_environment(ASSETS_PATH) - enable_backup(LINKS_STORAGE) + prepare_environment(ASSETS_PATH, LINKS_STORAGE_DIR) seedurls, max_articles, max_arts_per_seed = validate_config(CRAWLER_CONFIG_PATH) if not max_arts_per_seed: max_arts_per_seed = max_articles - crawler = CrawlerRecursive(seed_urls=seedurls, - total_max_articles=max_articles, - max_articles_per_seed=max_arts_per_seed) + crawler = Crawler(seed_urls=seedurls, + total_max_articles=max_articles, + max_articles_per_seed=max_arts_per_seed) crawler.find_articles() - # print('Scraped', len(crawler.urls), 'articles') print('onto parsing') - for n, url in enumerate(crawler.urls): + for n, url in enumerate(crawler.urls[:4]): full_url = URL_START + url parser = ArticleParser(full_url, n + 1) - parser.parse() + article = parser.parse() + article.save_raw() print('parsing is finished') +# From 225e72a1fcbbcce7b082863ed3c161f6c5fcd5d4 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Sat, 13 Mar 2021 11:58:30 +0300 Subject: [PATCH 23/50] fixed lint --- constants.py | 2 +- scrapper.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/constants.py b/constants.py index 938510b9..21dcf665 100644 --- a/constants.py +++ b/constants.py @@ -12,4 +12,4 @@ URL_START = 'https://burunen.ru' HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)' - ' AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'} \ No newline at end of file + ' AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'} diff --git a/scrapper.py b/scrapper.py index 372973e7..165bea21 100644 --- a/scrapper.py +++ b/scrapper.py @@ -10,7 +10,6 @@ from bs4 import BeautifulSoup import requests -from requests.exceptions import RequestException from article import Article from constants import CRAWLER_CONFIG_PATH, ASSETS_PATH, LINKS_STORAGE_DIR, URL_START, HEADERS, LINKS_STORAGE_FILE From dc7c08a46f00fa07b34a1c810ee6dd8bbe968663 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Sat, 13 Mar 2021 12:01:47 +0300 Subject: [PATCH 24/50] fixed config valid --- scrapper.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scrapper.py b/scrapper.py index 165bea21..6cff2894 100644 --- a/scrapper.py +++ b/scrapper.py @@ -238,8 +238,11 @@ def validate_config(crawler_path): """ with open(crawler_path) as crawler_config: config = json.load(crawler_config) - good_response = list(map(lambda link: link.startswith('https://'), + try: + good_response = list(map(lambda link: link.startswith('https://'), config['base_urls'])) + except AttributeError as exception: + raise IncorrectURLError from exception if not all(good_response): raise IncorrectURLError try: From b147b47deb712b91e221b1224862358f318e08b2 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Sat, 13 Mar 2021 14:21:57 +0300 Subject: [PATCH 25/50] optimized --- scrapper.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/scrapper.py b/scrapper.py index 6cff2894..212b6988 100644 --- a/scrapper.py +++ b/scrapper.py @@ -61,14 +61,22 @@ def _extract_url(article_bs, seen): return list(filter(lambda x: x.startswith('/news/') and x not in seen, extracted)) + def _crawl(self, pool: iter): + found = [] + for link in pool: + article_bs = BeautifulSoup(requests.get(link, headers=HEADERS).text, 'html.parser') + newfound = self._extract_url(article_bs, self.urls) + newfound = [i for i in newfound if len(i) > 20 + and not any(map(lambda y: y.isupper(), i))] + found.extend(newfound) + return list(set(found))[:self.max_articles_per_seed] + def find_articles(self): """ Finds articles """ - for link in self.seed_urls: - article_bs = BeautifulSoup(requests.get(link, 'html.parser', headers=HEADERS).text, 'html.parser') - newfound = self._extract_url(article_bs, self.urls) - self.urls.extend(newfound[:self.max_articles_per_seed]) + found = self._crawl(self.seed_urls) + self.urls.extend(found) self.urls = [i for i in self.urls if len(i) > 20 and not any(map(lambda y: y.isupper(), i))][:self.total_max_articles] print('Scraped seed urls, overall number of urls is', len(self.urls)) @@ -76,14 +84,9 @@ def find_articles(self): print('Due to insufficient number started further iteration') print('current number', len(self.urls), ', required', self.total_max_articles) old = len(self.urls) - for link in self.urls: - article_bs = BeautifulSoup(requests.get(URL_START + link, - 'html.parser', headers=HEADERS).text, 'html.parser') - newfound = list(filter(lambda x: len(x) > 20, self._extract_url(article_bs, self.urls))) - print('checked new url, found', len(newfound), 'articles') - self.urls.extend(newfound[:self.max_articles_per_seed]) - if len(self.urls) > self.total_max_articles: - break + pool = tuple(map(lambda x: URL_START + x, self.urls)) + found = self._crawl(pool) + self.urls.extend(found) if len(self.urls) == old: print('There are no unseen urls found in all of the available addresses') print(f'crawling finished with {len(self.urls)}') @@ -272,7 +275,7 @@ def validate_config(crawler_path): print('onto parsing') - for n, url in enumerate(crawler.urls[:4]): + for n, url in enumerate(crawler.urls): full_url = URL_START + url parser = ArticleParser(full_url, n + 1) article = parser.parse() From d09c9f3b2ccdba61f7dced1578efa425c212d9c4 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Sat, 13 Mar 2021 14:35:44 +0300 Subject: [PATCH 26/50] fixed date format --- article.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article.py b/article.py index f471cb6e..718a3b3b 100644 --- a/article.py +++ b/article.py @@ -99,7 +99,7 @@ def _date_to_text(self): """ Converts datetime object to text """ - return self.date.strftime("%Y-%m-%d") + return self.date.strftime("%Y-%m-%d %H:%M:%S") def _get_raw_text_path(self): """ From 731448ce9b0b4f332a4b1791e434208f315849a9 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Mon, 15 Mar 2021 16:04:23 +0300 Subject: [PATCH 27/50] removed user interaction and links folder --- crawler_config.json | 2 +- links/links.txt | 0 scrapper.py | 54 ++++++++++++++++++++++----------------------- 3 files changed, 28 insertions(+), 28 deletions(-) delete mode 100644 links/links.txt diff --git a/crawler_config.json b/crawler_config.json index 80f0c5d8..01928e35 100644 --- a/crawler_config.json +++ b/crawler_config.json @@ -6,6 +6,6 @@ "https://burunen.ru/news/incidents/", "https://burunen.ru/news/politic/" ], - "total_articles_to_find_and_parse": 20, + "total_articles_to_find_and_parse": 100, "max_number_articles_to_get_from_one_seed": 25 } \ No newline at end of file diff --git a/links/links.txt b/links/links.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/scrapper.py b/scrapper.py index 212b6988..932588dd 100644 --- a/scrapper.py +++ b/scrapper.py @@ -12,7 +12,7 @@ import requests from article import Article -from constants import CRAWLER_CONFIG_PATH, ASSETS_PATH, LINKS_STORAGE_DIR, URL_START, HEADERS, LINKS_STORAGE_FILE +from constants import CRAWLER_CONFIG_PATH, ASSETS_PATH, LINKS_STORAGE_DIR, HEADERS, LINKS_STORAGE_FILE, URL_START class IncorrectURLError(Exception): @@ -64,6 +64,8 @@ def _extract_url(article_bs, seen): def _crawl(self, pool: iter): found = [] for link in pool: + if not link.startswith('https'): + link = URL_START + link article_bs = BeautifulSoup(requests.get(link, headers=HEADERS).text, 'html.parser') newfound = self._extract_url(article_bs, self.urls) newfound = [i for i in newfound if len(i) > 20 @@ -81,15 +83,15 @@ def find_articles(self): and not any(map(lambda y: y.isupper(), i))][:self.total_max_articles] print('Scraped seed urls, overall number of urls is', len(self.urls)) while len(self.urls) < self.total_max_articles: - print('Due to insufficient number started further iteration') - print('current number', len(self.urls), ', required', self.total_max_articles) + print('Due to insufficient number of urls started further iteration') + print('current number is', len(self.urls), ', required', self.total_max_articles) old = len(self.urls) pool = tuple(map(lambda x: URL_START + x, self.urls)) found = self._crawl(pool) self.urls.extend(found) if len(self.urls) == old: print('There are no unseen urls found in all of the available addresses') - print(f'crawling finished with {len(self.urls)}') + print(f'crawling finished with {len(self.urls)} urls') break self.urls = self.urls[:self.total_max_articles] @@ -101,17 +103,24 @@ def get_search_urls(self): class CrawlerRecursive(Crawler): - - def __init__(self, seed_urls: list, total_max_articles: int, max_articles_per_seed: int, to_wait=False): + """ + Crawler implementation + Scrapes all the articles from the source + Uses advanced user imitation with fake headers and random waiting time + """ + def __init__(self, seed_urls: list, total_max_articles: int, + max_articles_per_seed: int, to_wait=False): super().__init__(seed_urls, total_max_articles, max_articles_per_seed) self.is_waiting = to_wait - def _crawl(self, pool: list): + def _crawl(self, pool: iter): found = [] for link in pool: if self.is_waiting: - wait(randint(0, 10)) - article_bs = BeautifulSoup(requests.get(URL_START + link, headers=HEADERS).text, 'html.parser') + wait(randint(1, 10)) + if not link.startswith('https'): + link = URL_START + link + article_bs = BeautifulSoup(requests.get(link, headers=HEADERS).text, 'html.parser') newfound = self._extract_url(article_bs, self.urls) newfound = [i for i in newfound if len(i) > 20 and not any(map(lambda y: y.isupper(), i))] @@ -119,8 +128,9 @@ def _crawl(self, pool: list): return list(set(found)) def find_articles(self): - if self.read_backedup(): - print('backed up urls found, starting iteration') + if not self.urls: + if self.read_backedup(): + print('backed up urls found, starting iteration') if not self.urls: pool = self.seed_urls else: @@ -132,17 +142,9 @@ def find_articles(self): self.urls.extend(newfound) with open(LINKS_STORAGE_FILE, 'a', encoding='utf-8') as file: file.write('\n'.join(newfound)) - print(f'found {len(newfound)} new urls') - if self.verify_proceed(): - print('starting new iteration') - self.find_articles() - else: - print(f'recursive crawling finished with {len(self.urls)} urls.') - - @staticmethod - def verify_proceed(): - answer = input('Would you like to proceed? yes or no: ').strip() - return answer == 'yes' + print(f'found {len(newfound)} new urls, overall number is {len(self.urls)}') + print('starting new iteration') + self.find_articles() def read_backedup(self): try: @@ -231,7 +233,6 @@ def prepare_environment(base_path, backup_path_dir): if not os.path.exists(base_path): os.makedirs(base_path) if not os.path.exists(backup_path_dir): - print('GOT HERE WITH PATH', backup_path_dir) os.makedirs(backup_path_dir) @@ -267,9 +268,9 @@ def validate_config(crawler_path): seedurls, max_articles, max_arts_per_seed = validate_config(CRAWLER_CONFIG_PATH) if not max_arts_per_seed: max_arts_per_seed = max_articles - crawler = Crawler(seed_urls=seedurls, - total_max_articles=max_articles, - max_articles_per_seed=max_arts_per_seed) + crawler = CrawlerRecursive(seed_urls=seedurls, + total_max_articles=max_articles, + max_articles_per_seed=max_arts_per_seed) crawler.find_articles() @@ -281,4 +282,3 @@ def validate_config(crawler_path): article = parser.parse() article.save_raw() print('parsing is finished') -# From fd304d74ca27f7e7d914bcbd0fa807e8affd6ede Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Mon, 15 Mar 2021 16:11:02 +0300 Subject: [PATCH 28/50] removed user interaction and links folder[2] --- crawler_config.json | 4 ++-- scrapper.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/crawler_config.json b/crawler_config.json index 01928e35..c765cac3 100644 --- a/crawler_config.json +++ b/crawler_config.json @@ -6,6 +6,6 @@ "https://burunen.ru/news/incidents/", "https://burunen.ru/news/politic/" ], - "total_articles_to_find_and_parse": 100, - "max_number_articles_to_get_from_one_seed": 25 + "total_articles_to_find_and_parse": 20, + "max_number_articles_to_get_from_one_seed": 10 } \ No newline at end of file diff --git a/scrapper.py b/scrapper.py index 932588dd..8ac37d99 100644 --- a/scrapper.py +++ b/scrapper.py @@ -268,7 +268,7 @@ def validate_config(crawler_path): seedurls, max_articles, max_arts_per_seed = validate_config(CRAWLER_CONFIG_PATH) if not max_arts_per_seed: max_arts_per_seed = max_articles - crawler = CrawlerRecursive(seed_urls=seedurls, + crawler = Crawler(seed_urls=seedurls, total_max_articles=max_articles, max_articles_per_seed=max_arts_per_seed) From 23182bb1d0499ed48cf8a4867c421716fa6309af Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Fri, 26 Mar 2021 16:21:11 +0300 Subject: [PATCH 29/50] initial commit with the build for 8 and bad lintering --- crawler_config.json | 2 +- pipeline.py | 78 +++++++++++++++++++++++++++++++++++++-------- requirements.txt | 2 ++ target_score.txt | 2 +- 4 files changed, 69 insertions(+), 15 deletions(-) diff --git a/crawler_config.json b/crawler_config.json index c765cac3..d65a60dd 100644 --- a/crawler_config.json +++ b/crawler_config.json @@ -6,6 +6,6 @@ "https://burunen.ru/news/incidents/", "https://burunen.ru/news/politic/" ], - "total_articles_to_find_and_parse": 20, + "total_articles_to_find_and_parse": 10, "max_number_articles_to_get_from_one_seed": 10 } \ No newline at end of file diff --git a/pipeline.py b/pipeline.py index d603e2e1..11a4b911 100644 --- a/pipeline.py +++ b/pipeline.py @@ -1,9 +1,15 @@ """ Pipeline for text processing implementation """ - +import os from typing import List +from pymorphy2 import MorphAnalyzer +from pymystem3 import Mystem + +from article import Article +from constants import ASSETS_PATH + class EmptyDirectoryError(Exception): """ @@ -34,10 +40,13 @@ class MorphologicalToken: Stores language params for each processed token """ def __init__(self, original_word, normalized_form): - pass + self.original = original_word + self.normalized = normalized_form + self.mystem_tags = '' + self.pymorphy_tags = '' def __str__(self): - return "MorphologicalToken instance here" + return self.normalized + '<' + self.mystem_tags + '>' + '(' + str(self.pymorphy_tags) + ')' class CorpusManager: @@ -45,19 +54,24 @@ class CorpusManager: Works with articles and stores them """ def __init__(self, path_to_raw_txt_data: str): - pass + self.path_to_raw = path_to_raw_txt_data + self._storage = {} def _scan_dataset(self): """ Register each dataset entry """ - pass + for file in os.listdir(ASSETS_PATH): + if file.endswith('_raw.txt'): + index = file.split('_raw.txt')[0] + self._storage[index] = Article(url=None, article_id=index) def get_articles(self): """ Returns storage params """ - pass + self._scan_dataset() + return self._storage class TextProcessingPipeline: @@ -65,30 +79,68 @@ class TextProcessingPipeline: Process articles from corpus manager """ def __init__(self, corpus_manager: CorpusManager): - pass + self.corpus = corpus_manager def run(self): """ Runs pipeline process scenario """ - pass - - def _process(self) -> List[type(MorphologicalToken)]: + print(f'there are {self.corpus.get_articles()} articles to process') + for index, article in self.corpus.get_articles().items(): + raw_text = article.get_raw_text() + tokens = self._process(raw_text) + processed = ' '.join(map(lambda token: str(token), tokens)) + article.save_processed(processed) + + @staticmethod + def _process(text) -> List[type(MorphologicalToken)]: """ Performs processing of each text """ - pass + mystem = Mystem() + pymorphy = MorphAnalyzer() + words = mystem.analyze(text) + tokens = [] + for word in words: + orig = word['text'].strip() + if orig.isalpha(): + try: + token = MorphologicalToken(original_word=orig, normalized_form=word['analysis'][0]['lex']) + token.mystem_tags = word['analysis'][0]['gr'] + token.pymorphy_tags = pymorphy.parse(orig)[0].tag + tokens.append(token) + except IndexError: + token = MorphologicalToken(original_word=orig, normalized_form=orig) + tokens.append(token) + return tokens def validate_dataset(path_to_validate): """ Validates folder with assets """ - pass + if not os.path.exists(path_to_validate): + raise UnknownDatasetError + if not os.path.isdir(path_to_validate): + raise NotADirectoryError + if not os.listdir(path_to_validate): + raise EmptyDirectoryError + metas, raws = 0, 0 + for file in os.listdir(ASSETS_PATH): + if file.endswith("_raw.txt"): + raws += 1 + if file.endswith("_meta.json"): + metas += 1 + if not metas == raws: + raise InconsistentDatasetError def main(): - print('Your code goes here') + validate_dataset(ASSETS_PATH) + print('validated dataset') + corpus_manager = CorpusManager(ASSETS_PATH) + pipeline = TextProcessingPipeline(corpus_manager=corpus_manager) + pipeline.run() if __name__ == "__main__": diff --git a/requirements.txt b/requirements.txt index 327297ca..cb49a8af 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,4 @@ beautifulsoup4==4.9.0 +pymorphy2==0.9.1 +pymystem3==0.2.0 requests==2.23.0 diff --git a/target_score.txt b/target_score.txt index 3de837b8..4bb41dfa 100644 --- a/target_score.txt +++ b/target_score.txt @@ -2,4 +2,4 @@ 10 # Target score for pipeline.py: -0 \ No newline at end of file +8 \ No newline at end of file From 0ee4d5e9eef2dda4142f78029a4bf4f5d18ac911 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Fri, 26 Mar 2021 16:34:57 +0300 Subject: [PATCH 30/50] build for 8 with a lot of lintering interface kostyly --- pipeline.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/pipeline.py b/pipeline.py index 11a4b911..24a13638 100644 --- a/pipeline.py +++ b/pipeline.py @@ -17,10 +17,10 @@ class EmptyDirectoryError(Exception): """ -class NotADirectoryError(Exception): - """ - Custom error - """ +# class NotADirectoryError(Exception): +# """ +# Custom error +# """ class InconsistentDatasetError(Exception): @@ -48,6 +48,9 @@ def __init__(self, original_word, normalized_form): def __str__(self): return self.normalized + '<' + self.mystem_tags + '>' + '(' + str(self.pymorphy_tags) + ')' + def placeholder_public_method(self): + pass + class CorpusManager: """ @@ -73,6 +76,9 @@ def get_articles(self): self._scan_dataset() return self._storage + def placeholder_public_method(self): + pass + class TextProcessingPipeline: """ @@ -85,13 +91,16 @@ def run(self): """ Runs pipeline process scenario """ - print(f'there are {self.corpus.get_articles()} articles to process') - for index, article in self.corpus.get_articles().items(): + print(f'there are {len(self.corpus.get_articles())} articles to process') + for article in self.corpus.get_articles().values(): raw_text = article.get_raw_text() tokens = self._process(raw_text) - processed = ' '.join(map(lambda token: str(token), tokens)) + processed = ' '.join(map(str, tokens)) article.save_processed(processed) + def placeholder_public_method(self): + pass + @staticmethod def _process(text) -> List[type(MorphologicalToken)]: """ From f1a59fa88bf8f29861167cac0a97e073249ed83a Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Fri, 26 Mar 2021 21:31:14 +0300 Subject: [PATCH 31/50] added features for 10, did not upd tests --- pipeline.py | 24 ++++++++++++++++++++++-- pos_frequency_pipeline.py | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/pipeline.py b/pipeline.py index 24a13638..ea97d22b 100644 --- a/pipeline.py +++ b/pipeline.py @@ -9,6 +9,7 @@ from article import Article from constants import ASSETS_PATH +from pos_frequency_pipeline import POSFrequencyPipeline class EmptyDirectoryError(Exception): @@ -49,6 +50,11 @@ def __str__(self): return self.normalized + '<' + self.mystem_tags + '>' + '(' + str(self.pymorphy_tags) + ')' def placeholder_public_method(self): + """ + In order to pass lint check, + class must contain at least + two public methods + """ pass @@ -77,6 +83,11 @@ def get_articles(self): return self._storage def placeholder_public_method(self): + """ + In order to pass lint check, + class must contain at least + two public methods + """ pass @@ -99,6 +110,11 @@ def run(self): article.save_processed(processed) def placeholder_public_method(self): + """ + In order to pass lint check, + class must contain at least + two public methods + """ pass @staticmethod @@ -115,7 +131,7 @@ def _process(text) -> List[type(MorphologicalToken)]: if orig.isalpha(): try: token = MorphologicalToken(original_word=orig, normalized_form=word['analysis'][0]['lex']) - token.mystem_tags = word['analysis'][0]['gr'] + token.mystem_tags = word['analysis'][0]['gr'].strip() token.pymorphy_tags = pymorphy.parse(orig)[0].tag tokens.append(token) except IndexError: @@ -129,7 +145,7 @@ def validate_dataset(path_to_validate): Validates folder with assets """ if not os.path.exists(path_to_validate): - raise UnknownDatasetError + raise FileNotFoundError if not os.path.isdir(path_to_validate): raise NotADirectoryError if not os.listdir(path_to_validate): @@ -148,8 +164,12 @@ def main(): validate_dataset(ASSETS_PATH) print('validated dataset') corpus_manager = CorpusManager(ASSETS_PATH) + print('onto processing') pipeline = TextProcessingPipeline(corpus_manager=corpus_manager) pipeline.run() + print('onto analytics') + visualizer = POSFrequencyPipeline(corpus_manager) + visualizer.run() if __name__ == "__main__": diff --git a/pos_frequency_pipeline.py b/pos_frequency_pipeline.py index 232c4d07..f8e887d1 100644 --- a/pos_frequency_pipeline.py +++ b/pos_frequency_pipeline.py @@ -1,8 +1,36 @@ """ Implementation of POSFrequencyPipeline for score ten only. """ +import os +import re + +# from pipeline import CorpusManager +from visualizer import visualize + +from constants import ASSETS_PATH class POSFrequencyPipeline: - def __init__(self, assets, destination): - pass + def __init__(self, corpus_manager): + self.corpus = corpus_manager + + def run(self): + frequencies = self._count_frequencies() + visualize(frequencies, os.path.join(ASSETS_PATH, 'pos_frequencies.png')) + + def _count_frequencies(self): + articles = self.corpus.get_articles() + tags_found = [] + for index, article in articles.items(): + article_path = os.path.join(ASSETS_PATH, f'{index}_processed.txt') + with open(article_path, encoding='utf-8') as file: + contents = file.read() + tags_found.extend(re.findall(r"<([A-Z]*)[,=]?", contents)) + frequencies = {} + for tag in tags_found: + frequencies[tag] = tags_found.count(tag) + return frequencies + + +if __name__ == "__main__": + pass From 21e4fd8009804f16d94f3a4a0e696167c0d1844b Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Fri, 26 Mar 2021 21:52:48 +0300 Subject: [PATCH 32/50] build for 10 except i did not refactor --- requirements.txt | 2 ++ target_score.txt | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index cb49a8af..46e89c07 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,5 @@ beautifulsoup4==4.9.0 pymorphy2==0.9.1 pymystem3==0.2.0 requests==2.23.0 +numpy==1.20.1 +matplotlib==3.4.0 \ No newline at end of file diff --git a/target_score.txt b/target_score.txt index 4bb41dfa..e91213a7 100644 --- a/target_score.txt +++ b/target_score.txt @@ -2,4 +2,4 @@ 10 # Target score for pipeline.py: -8 \ No newline at end of file +10 \ No newline at end of file From 1b73cb306b5146b80cb55ef94db7da9450a4ad7c Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Fri, 26 Mar 2021 22:11:37 +0300 Subject: [PATCH 33/50] refactor from os to pathlib --- pipeline.py | 22 ++++++++++++---------- pos_frequency_pipeline.py | 12 ++++++------ 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/pipeline.py b/pipeline.py index 276a85c2..dc82fe42 100644 --- a/pipeline.py +++ b/pipeline.py @@ -1,9 +1,9 @@ """ Pipeline for text processing implementation """ -import os from typing import List +from pathlib import Path from pymorphy2 import MorphAnalyzer from pymystem3 import Mystem @@ -64,9 +64,10 @@ def _scan_dataset(self): """ Register each dataset entry """ - for file in os.listdir(ASSETS_PATH): - if file.endswith('_raw.txt'): - index = file.split('_raw.txt')[0] + path = Path(ASSETS_PATH) + for file in path.iterdir(): + if str(file).endswith('_raw.txt'): + index = str(file).split('_raw.txt')[0] self._storage[index] = Article(url=None, article_id=index) def get_articles(self): @@ -138,17 +139,18 @@ def validate_dataset(path_to_validate): """ Validates folder with assets """ - if not os.path.exists(path_to_validate): + path = Path(path_to_validate) + if not path.exists(): raise FileNotFoundError - if not os.path.isdir(path_to_validate): + if not path.is_dir(): raise NotADirectoryError - if not os.listdir(path_to_validate): + if not path.iterdir(): raise EmptyDirectoryError metas, raws = 0, 0 - for file in os.listdir(ASSETS_PATH): - if file.endswith("_raw.txt"): + for file in path.iterdir(): + if str(file).endswith("_raw.txt"): raws += 1 - if file.endswith("_meta.json"): + if str(file).endswith("_meta.json"): metas += 1 if not metas == raws: raise InconsistentDatasetError diff --git a/pos_frequency_pipeline.py b/pos_frequency_pipeline.py index 71e0d498..48ca938d 100644 --- a/pos_frequency_pipeline.py +++ b/pos_frequency_pipeline.py @@ -1,10 +1,10 @@ """ Implementation of POSFrequencyPipeline for score ten only. """ -import os import re -# from pipeline import CorpusManager +from pathlib import Path + from visualizer import visualize from constants import ASSETS_PATH @@ -16,14 +16,15 @@ def __init__(self, corpus_manager): def run(self): frequencies = self._count_frequencies() - visualize(frequencies, os.path.join(ASSETS_PATH, 'pos_frequencies.png')) + path = Path(ASSETS_PATH) / 'pos_frequencies.png' + visualize(frequencies, path) def _count_frequencies(self): articles = self.corpus.get_articles() tags_found = [] for index, article in articles.items(): - article_path = os.path.join(ASSETS_PATH, f'{index}_processed.txt') - with open(article_path, encoding='utf-8') as file: + path = Path(ASSETS_PATH) / f'{index}_processed.txt' + with open(path, encoding='utf-8') as file: contents = file.read() tags_found.extend(re.findall(r"<([A-Z]*)[,=]?", contents)) frequencies = {} @@ -34,4 +35,3 @@ def _count_frequencies(self): if __name__ == "__main__": pass - From 19ff11d234cc6ae5be032ece10c629c85f99b2be Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Tue, 30 Mar 2021 15:22:07 +0300 Subject: [PATCH 34/50] fixed everything --- pipeline.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index dc82fe42..754df485 100644 --- a/pipeline.py +++ b/pipeline.py @@ -144,7 +144,8 @@ def validate_dataset(path_to_validate): raise FileNotFoundError if not path.is_dir(): raise NotADirectoryError - if not path.iterdir(): + files = [i for i in path.iterdir()] + if not files: raise EmptyDirectoryError metas, raws = 0, 0 for file in path.iterdir(): From c30d85a1cc989c379cdce2436059be9ce328c285 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Tue, 30 Mar 2021 15:25:05 +0300 Subject: [PATCH 35/50] now everything --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 754df485..8772fecb 100644 --- a/pipeline.py +++ b/pipeline.py @@ -144,7 +144,7 @@ def validate_dataset(path_to_validate): raise FileNotFoundError if not path.is_dir(): raise NotADirectoryError - files = [i for i in path.iterdir()] + files = list(path.iterdir()) if not files: raise EmptyDirectoryError metas, raws = 0, 0 From c58526b8ea8e4150ec026737f4789eef9287bb44 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Tue, 30 Mar 2021 15:40:36 +0300 Subject: [PATCH 36/50] i don't understand --- pipeline.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pipeline.py b/pipeline.py index 8772fecb..28cd745a 100644 --- a/pipeline.py +++ b/pipeline.py @@ -144,8 +144,7 @@ def validate_dataset(path_to_validate): raise FileNotFoundError if not path.is_dir(): raise NotADirectoryError - files = list(path.iterdir()) - if not files: + if not list(path.iterdir()): raise EmptyDirectoryError metas, raws = 0, 0 for file in path.iterdir(): From 0a9c9fe65080a7232c894b0311d8e6a24b8da0de Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Tue, 30 Mar 2021 15:45:37 +0300 Subject: [PATCH 37/50] fixed regular expression --- pos_frequency_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pos_frequency_pipeline.py b/pos_frequency_pipeline.py index 48ca938d..d292c605 100644 --- a/pos_frequency_pipeline.py +++ b/pos_frequency_pipeline.py @@ -26,7 +26,7 @@ def _count_frequencies(self): path = Path(ASSETS_PATH) / f'{index}_processed.txt' with open(path, encoding='utf-8') as file: contents = file.read() - tags_found.extend(re.findall(r"<([A-Z]*)[,=]?", contents)) + tags_found.extend(re.findall(r"<([A-Z]+)[,=]?", contents)) frequencies = {} for tag in tags_found: frequencies[tag] = tags_found.count(tag) From db17bf6b8d9e873e0c8551a40ab29b61db458ccb Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Tue, 30 Mar 2021 16:19:01 +0300 Subject: [PATCH 38/50] added COM to tags --- config/student_text_preprocess_score_eight_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/student_text_preprocess_score_eight_test.py b/config/student_text_preprocess_score_eight_test.py index 0926e57a..2352fbd7 100644 --- a/config/student_text_preprocess_score_eight_test.py +++ b/config/student_text_preprocess_score_eight_test.py @@ -4,7 +4,7 @@ from constants import ASSETS_PATH -TAGS = ["A", "ADV", "S", "V", "PR", "ANUM", "CONJ", "SPRO", "APRO", "PART", "NUM", "ADVPRO"] +TAGS = ["A", "ADV", "S", "V", "PR", "ANUM", "CONJ", "SPRO", "APRO", "PART", "NUM", "ADVPRO", "COM"] class StudentTextPreprocessTest(unittest.TestCase): From 7ea4755aee6231831f4e045d8fa13413f8bdad16 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Sun, 4 Apr 2021 14:40:54 +0300 Subject: [PATCH 39/50] added requested changes, ready to fight linter --- article.py | 5 ++- pipeline.py | 70 ++++++++++++--------------------------- pos_frequency_pipeline.py | 42 +++++++++++++++-------- 3 files changed, 55 insertions(+), 62 deletions(-) diff --git a/article.py b/article.py index 718a3b3b..e59d6d63 100644 --- a/article.py +++ b/article.py @@ -29,6 +29,7 @@ def __init__(self, url, article_id): self.author = '' self.topics = [] self.text = '' + self.pos_frequencies = {} def save_raw(self): """ @@ -62,6 +63,7 @@ def from_meta_json(json_path: str): article.date = date_from_meta(meta.get('date', None)) article.author = meta.get('author', None) article.topics = meta.get('topics', None) + article.pos_frequencies = meta.get('pos_frequencies', None) # intentionally leave it empty article.text = None @@ -92,7 +94,8 @@ def _get_meta(self): 'title': self.title, 'date': self._date_to_text(), 'author': self.author, - 'topics': self.topics + 'topics': self.topics, + 'pos_frequencies': self.pos_frequencies } def _date_to_text(self): diff --git a/pipeline.py b/pipeline.py index 28cd745a..576bdcdb 100644 --- a/pipeline.py +++ b/pipeline.py @@ -1,3 +1,5 @@ +# pylint: disable=R0903 + """ Pipeline for text processing implementation """ @@ -9,7 +11,6 @@ from article import Article from constants import ASSETS_PATH -from pos_frequency_pipeline import POSFrequencyPipeline class EmptyDirectoryError(Exception): @@ -41,15 +42,7 @@ def __init__(self, original_word, normalized_form): self.pymorphy_tags = '' def __str__(self): - return self.normalized + '<' + self.mystem_tags + '>' + '(' + str(self.pymorphy_tags) + ')' - - def placeholder_public_method(self): - """ - In order to pass lint check, - class must contain at least - two public methods - """ - pass + return f'{self.normalized}<{self.mystem_tags}>({str(self.pymorphy_tags)})' class CorpusManager: @@ -66,9 +59,10 @@ def _scan_dataset(self): """ path = Path(ASSETS_PATH) for file in path.iterdir(): - if str(file).endswith('_raw.txt'): - index = str(file).split('_raw.txt')[0] - self._storage[index] = Article(url=None, article_id=index) + file_name = file.relative_to(path) + if str(file_name).endswith('_raw.txt'): + index = str(file_name).split('_raw.txt')[0] + self._storage[index] = Article(url=None, article_id=int(index)) def get_articles(self): """ @@ -77,14 +71,6 @@ def get_articles(self): self._scan_dataset() return self._storage - def placeholder_public_method(self): - """ - In order to pass lint check, - class must contain at least - two public methods - """ - pass - class TextProcessingPipeline: """ @@ -92,34 +78,22 @@ class TextProcessingPipeline: """ def __init__(self, corpus_manager: CorpusManager): self.corpus = corpus_manager + self.current_raw_text = '' def run(self): """ Runs pipeline process scenario """ - print(f'there are {len(self.corpus.get_articles())} articles to process') for article in self.corpus.get_articles().values(): - raw_text = article.get_raw_text() - tokens = self._process(raw_text) + self.current_raw_text = article.get_raw_text() + tokens = self._process() processed = ' '.join(map(str, tokens)) article.save_processed(processed) - def placeholder_public_method(self): - """ - In order to pass lint check, - class must contain at least - two public methods - """ - pass - - @staticmethod - def _process(text) -> List[type(MorphologicalToken)]: - """ - Performs processing of each text - """ + def _process(self) -> List[type(MorphologicalToken)]: mystem = Mystem() pymorphy = MorphAnalyzer() - words = mystem.analyze(text) + words = mystem.analyze(self.current_raw_text) tokens = [] for word in words: orig = word['text'].strip() @@ -131,6 +105,8 @@ def _process(text) -> List[type(MorphologicalToken)]: tokens.append(token) except IndexError: token = MorphologicalToken(original_word=orig, normalized_form=orig) + if not str(pymorphy.parse(orig)[0].tag) == 'UNKN': + token.pymorphy_tags = pymorphy.parse(orig)[0].tag tokens.append(token) return tokens @@ -146,13 +122,14 @@ def validate_dataset(path_to_validate): raise NotADirectoryError if not list(path.iterdir()): raise EmptyDirectoryError - metas, raws = 0, 0 - for file in path.iterdir(): - if str(file).endswith("_raw.txt"): - raws += 1 - if str(file).endswith("_meta.json"): - metas += 1 - if not metas == raws: + files = [str(file.relative_to(path)) for file in path.iterdir()] + metas = list(filter(lambda x: x.endswith('_raw.txt'), files)) + raws = list(filter(lambda x: x.endswith('_meta.json'), files)) + if not len(metas) == len(raws): + raise InconsistentDatasetError + meta_indices = sorted(list(map(lambda x: int(x.split('_')[0]), metas))) + raw_indices = sorted(list(map(lambda x: int(x.split('_')[0]), raws))) + if not meta_indices == raw_indices or not meta_indices == [i + 1 for i in range(len(meta_indices))]: raise InconsistentDatasetError @@ -163,9 +140,6 @@ def main(): print('onto processing') pipeline = TextProcessingPipeline(corpus_manager=corpus_manager) pipeline.run() - print('onto analytics') - visualizer = POSFrequencyPipeline(corpus_manager) - visualizer.run() if __name__ == "__main__": diff --git a/pos_frequency_pipeline.py b/pos_frequency_pipeline.py index d292c605..3c226622 100644 --- a/pos_frequency_pipeline.py +++ b/pos_frequency_pipeline.py @@ -6,32 +6,48 @@ from pathlib import Path from visualizer import visualize +from pipeline import CorpusManager from constants import ASSETS_PATH class POSFrequencyPipeline: - def __init__(self, corpus_manager): - self.corpus = corpus_manager + def __init__(self, corpus: CorpusManager): + self.corpus = corpus + self.current_article = None def run(self): - frequencies = self._count_frequencies() - path = Path(ASSETS_PATH) / 'pos_frequencies.png' - visualize(frequencies, path) + articles = self.corpus.get_articles() + for article in articles.values(): + self.current_article = article + frequencies = self._count_frequencies() + self._update_meta(frequencies) + path = Path(ASSETS_PATH) / f'{article.article_id}_image.png' + visualize(frequencies, path) def _count_frequencies(self): - articles = self.corpus.get_articles() - tags_found = [] - for index, article in articles.items(): - path = Path(ASSETS_PATH) / f'{index}_processed.txt' - with open(path, encoding='utf-8') as file: - contents = file.read() - tags_found.extend(re.findall(r"<([A-Z]+)[,=]?", contents)) + path = Path(ASSETS_PATH) / f'{self.current_article.article_id}_processed.txt' + with open(path, encoding='utf-8') as file: + contents = file.read() + tags_found = re.findall(r"<([A-Z]+)[,=]?", contents) frequencies = {} for tag in tags_found: frequencies[tag] = tags_found.count(tag) return frequencies + def _update_meta(self, frequencies): + meta_path = Path(ASSETS_PATH) / f'{self.current_article.article_id}_meta.json' + article = self.current_article.from_meta_json(meta_path) + article.pos_frequencies = frequencies + article.text = article.get_raw_text() + article.save_raw() + + +def main(): + corpus_manager = CorpusManager(ASSETS_PATH) + visualizer = POSFrequencyPipeline(corpus_manager) + visualizer.run() + if __name__ == "__main__": - pass + main() From 7e859825febd7698dbcae3802103cdbefedef9fa Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Sun, 4 Apr 2021 14:44:28 +0300 Subject: [PATCH 40/50] fought linter --- article.py | 1 + pipeline.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/article.py b/article.py index e59d6d63..b62ad9dd 100644 --- a/article.py +++ b/article.py @@ -1,3 +1,4 @@ +# pylint: disable=R0902 """ Article implementation """ diff --git a/pipeline.py b/pipeline.py index 576bdcdb..ef6ae50a 100644 --- a/pipeline.py +++ b/pipeline.py @@ -105,7 +105,7 @@ def _process(self) -> List[type(MorphologicalToken)]: tokens.append(token) except IndexError: token = MorphologicalToken(original_word=orig, normalized_form=orig) - if not str(pymorphy.parse(orig)[0].tag) == 'UNKN': + if str(pymorphy.parse(orig)[0].tag) != 'UNKN': token.pymorphy_tags = pymorphy.parse(orig)[0].tag tokens.append(token) return tokens From a83469eab66961f10ee134b9619f49e730bc1f27 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Sun, 4 Apr 2021 14:53:06 +0300 Subject: [PATCH 41/50] fixed dataset validation --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index ef6ae50a..466a33de 100644 --- a/pipeline.py +++ b/pipeline.py @@ -129,7 +129,7 @@ def validate_dataset(path_to_validate): raise InconsistentDatasetError meta_indices = sorted(list(map(lambda x: int(x.split('_')[0]), metas))) raw_indices = sorted(list(map(lambda x: int(x.split('_')[0]), raws))) - if not meta_indices == raw_indices or not meta_indices == [i + 1 for i in range(len(meta_indices))]: + if not meta_indices == raw_indices or not meta_indices == list(range(len(metas))): raise InconsistentDatasetError From 27280a055861240812e1db565a922ee444881ecf Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Sun, 4 Apr 2021 14:58:18 +0300 Subject: [PATCH 42/50] adjusted ds validator --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 466a33de..3ae0f01d 100644 --- a/pipeline.py +++ b/pipeline.py @@ -129,7 +129,7 @@ def validate_dataset(path_to_validate): raise InconsistentDatasetError meta_indices = sorted(list(map(lambda x: int(x.split('_')[0]), metas))) raw_indices = sorted(list(map(lambda x: int(x.split('_')[0]), raws))) - if not meta_indices == raw_indices or not meta_indices == list(range(len(metas))): + if not meta_indices == raw_indices: raise InconsistentDatasetError From 4efc05d3e9608bcf9da39da34f661b52c8219b7f Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Sun, 4 Apr 2021 15:01:42 +0300 Subject: [PATCH 43/50] i am sorry for my terrible commit history --- pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index 3ae0f01d..50463986 100644 --- a/pipeline.py +++ b/pipeline.py @@ -129,7 +129,7 @@ def validate_dataset(path_to_validate): raise InconsistentDatasetError meta_indices = sorted(list(map(lambda x: int(x.split('_')[0]), metas))) raw_indices = sorted(list(map(lambda x: int(x.split('_')[0]), raws))) - if not meta_indices == raw_indices: + if not raw_indices == meta_indices: raise InconsistentDatasetError From b45ee4022a531a2266295e5156113d3dd55acf08 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Sun, 4 Apr 2021 15:05:34 +0300 Subject: [PATCH 44/50] adjusted ds validator --- crawler_config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawler_config.json b/crawler_config.json index d65a60dd..542d6847 100644 --- a/crawler_config.json +++ b/crawler_config.json @@ -7,5 +7,5 @@ "https://burunen.ru/news/politic/" ], "total_articles_to_find_and_parse": 10, - "max_number_articles_to_get_from_one_seed": 10 + "max_number_articles_to_get_from_one_seed": 5 } \ No newline at end of file From 5c3a245601797f21e805518abd7ffa65fc7b74b2 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Mon, 5 Apr 2021 15:59:14 +0300 Subject: [PATCH 45/50] fixed several drawbacks --- crawler_config.json | 4 ++-- pipeline.py | 24 +++++++++++------------- pos_frequency_pipeline.py | 18 +++++++++++++++--- scrapper.py | 4 ++++ 4 files changed, 32 insertions(+), 18 deletions(-) diff --git a/crawler_config.json b/crawler_config.json index 542d6847..36f2c523 100644 --- a/crawler_config.json +++ b/crawler_config.json @@ -6,6 +6,6 @@ "https://burunen.ru/news/incidents/", "https://burunen.ru/news/politic/" ], - "total_articles_to_find_and_parse": 10, - "max_number_articles_to_get_from_one_seed": 5 + "total_articles_to_find_and_parse": 2, + "max_number_articles_to_get_from_one_seed": 2 } \ No newline at end of file diff --git a/pipeline.py b/pipeline.py index 50463986..239ad3c8 100644 --- a/pipeline.py +++ b/pipeline.py @@ -96,17 +96,16 @@ def _process(self) -> List[type(MorphologicalToken)]: words = mystem.analyze(self.current_raw_text) tokens = [] for word in words: - orig = word['text'].strip() - if orig.isalpha(): + if word.get('analysis') and word.get('text'): try: - token = MorphologicalToken(original_word=orig, normalized_form=word['analysis'][0]['lex']) + token = MorphologicalToken(original_word=word['text'], normalized_form=word['analysis'][0]['lex']) token.mystem_tags = word['analysis'][0]['gr'].strip() - token.pymorphy_tags = pymorphy.parse(orig)[0].tag + token.pymorphy_tags = pymorphy.parse(word['text'])[0].tag tokens.append(token) - except IndexError: - token = MorphologicalToken(original_word=orig, normalized_form=orig) - if str(pymorphy.parse(orig)[0].tag) != 'UNKN': - token.pymorphy_tags = pymorphy.parse(orig)[0].tag + except (IndexError, KeyError): + token = MorphologicalToken(original_word=word['text'], normalized_form=word['text']) + if str(pymorphy.parse(word['text'])[0].tag) != 'UNKN': + token.pymorphy_tags = pymorphy.parse(word['text'])[0].tag tokens.append(token) return tokens @@ -122,13 +121,12 @@ def validate_dataset(path_to_validate): raise NotADirectoryError if not list(path.iterdir()): raise EmptyDirectoryError - files = [str(file.relative_to(path)) for file in path.iterdir()] - metas = list(filter(lambda x: x.endswith('_raw.txt'), files)) - raws = list(filter(lambda x: x.endswith('_meta.json'), files)) + raws = list(path.glob('*_raw.txt')) + metas = list(path.glob('*_meta.json')) if not len(metas) == len(raws): raise InconsistentDatasetError - meta_indices = sorted(list(map(lambda x: int(x.split('_')[0]), metas))) - raw_indices = sorted(list(map(lambda x: int(x.split('_')[0]), raws))) + meta_indices = sorted(list(map(lambda x: int(x.name.split('_')[0]), metas))) + raw_indices = sorted(list(map(lambda x: int(x.name.split('_')[0]), raws))) if not raw_indices == meta_indices: raise InconsistentDatasetError diff --git a/pos_frequency_pipeline.py b/pos_frequency_pipeline.py index 3c226622..f1c2b6cf 100644 --- a/pos_frequency_pipeline.py +++ b/pos_frequency_pipeline.py @@ -11,6 +11,11 @@ from constants import ASSETS_PATH +class EmptyFileError(Exception): + """ + Custom error + """ + class POSFrequencyPipeline: def __init__(self, corpus: CorpusManager): self.corpus = corpus @@ -21,9 +26,12 @@ def run(self): for article in articles.values(): self.current_article = article frequencies = self._count_frequencies() - self._update_meta(frequencies) - path = Path(ASSETS_PATH) / f'{article.article_id}_image.png' - visualize(frequencies, path) + if frequencies: + self._update_meta(frequencies) + path = Path(ASSETS_PATH) / f'{article.article_id}_image.png' + visualize(frequencies, path) + else: + raise EmptyFileError def _count_frequencies(self): path = Path(ASSETS_PATH) / f'{self.current_article.article_id}_processed.txt' @@ -33,6 +41,10 @@ def _count_frequencies(self): frequencies = {} for tag in tags_found: frequencies[tag] = tags_found.count(tag) + if not frequencies: + print('THERE IS AM EMPTY FILE, CHECK ') + print(self.current_article.article_id) + return frequencies def _update_meta(self, frequencies): diff --git a/scrapper.py b/scrapper.py index 8ac37d99..d9a3d5cb 100644 --- a/scrapper.py +++ b/scrapper.py @@ -6,6 +6,7 @@ import json import os from random import randint +import shutil from time import sleep as wait from bs4 import BeautifulSoup @@ -232,6 +233,9 @@ def prepare_environment(base_path, backup_path_dir): """ if not os.path.exists(base_path): os.makedirs(base_path) + else: + shutil.rmtree(os.path.dirname(base_path)) + os.makedirs(base_path) if not os.path.exists(backup_path_dir): os.makedirs(backup_path_dir) From f6c703445842a566ae49ff78f3541cd0e2be983d Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Mon, 5 Apr 2021 16:01:47 +0300 Subject: [PATCH 46/50] fixed several drawbacks --- crawler_config.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crawler_config.json b/crawler_config.json index 36f2c523..0dd527c8 100644 --- a/crawler_config.json +++ b/crawler_config.json @@ -6,6 +6,6 @@ "https://burunen.ru/news/incidents/", "https://burunen.ru/news/politic/" ], - "total_articles_to_find_and_parse": 2, - "max_number_articles_to_get_from_one_seed": 2 + "total_articles_to_find_and_parse": 5, + "max_number_articles_to_get_from_one_seed": 5 } \ No newline at end of file From b8399cc31effe4692c766f5e1a5e401227648e6e Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Mon, 5 Apr 2021 16:05:18 +0300 Subject: [PATCH 47/50] fixed several drawbacks --- crawler_config.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crawler_config.json b/crawler_config.json index 0dd527c8..36f2c523 100644 --- a/crawler_config.json +++ b/crawler_config.json @@ -6,6 +6,6 @@ "https://burunen.ru/news/incidents/", "https://burunen.ru/news/politic/" ], - "total_articles_to_find_and_parse": 5, - "max_number_articles_to_get_from_one_seed": 5 + "total_articles_to_find_and_parse": 2, + "max_number_articles_to_get_from_one_seed": 2 } \ No newline at end of file From b8dc298233335c3c29f28b550950bdd3fd478337 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Mon, 5 Apr 2021 16:17:23 +0300 Subject: [PATCH 48/50] oh well I noticed smt else --- pos_frequency_pipeline.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pos_frequency_pipeline.py b/pos_frequency_pipeline.py index f1c2b6cf..6752454e 100644 --- a/pos_frequency_pipeline.py +++ b/pos_frequency_pipeline.py @@ -16,6 +16,7 @@ class EmptyFileError(Exception): Custom error """ + class POSFrequencyPipeline: def __init__(self, corpus: CorpusManager): self.corpus = corpus @@ -41,10 +42,6 @@ def _count_frequencies(self): frequencies = {} for tag in tags_found: frequencies[tag] = tags_found.count(tag) - if not frequencies: - print('THERE IS AM EMPTY FILE, CHECK ') - print(self.current_article.article_id) - return frequencies def _update_meta(self, frequencies): From 96f7e8e3f707c06be35e16d0d46d1854aae755d2 Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Mon, 5 Apr 2021 16:58:33 +0300 Subject: [PATCH 49/50] turned get articles into authentic getter --- pipeline.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pipeline.py b/pipeline.py index 239ad3c8..e3cf4755 100644 --- a/pipeline.py +++ b/pipeline.py @@ -51,24 +51,25 @@ class CorpusManager: """ def __init__(self, path_to_raw_txt_data: str): self.path_to_raw = path_to_raw_txt_data - self._storage = {} + self._storage = self._scan_dataset() def _scan_dataset(self): """ Register each dataset entry """ path = Path(ASSETS_PATH) + arts = {} for file in path.iterdir(): file_name = file.relative_to(path) if str(file_name).endswith('_raw.txt'): index = str(file_name).split('_raw.txt')[0] - self._storage[index] = Article(url=None, article_id=int(index)) + arts[index] = Article(url=None, article_id=int(index)) + return arts def get_articles(self): """ Returns storage params """ - self._scan_dataset() return self._storage From ff570e9d3b75d0d68f4c743751344ba8a3337e1f Mon Sep 17 00:00:00 2001 From: marina-kaz Date: Mon, 5 Apr 2021 17:03:12 +0300 Subject: [PATCH 50/50] fixed lintering --- pipeline.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index e3cf4755..ccf60eb0 100644 --- a/pipeline.py +++ b/pipeline.py @@ -53,7 +53,8 @@ def __init__(self, path_to_raw_txt_data: str): self.path_to_raw = path_to_raw_txt_data self._storage = self._scan_dataset() - def _scan_dataset(self): + @staticmethod + def _scan_dataset(): """ Register each dataset entry """