Dataset Collector #1, Bykova Ekaterina - 19FPL2 #38

Open: wants to merge 36 commits into base: main

Changes from all commits
8 changes: 4 additions & 4 deletions article.py
@@ -46,7 +46,7 @@ def save_raw(self):
indent=4,
ensure_ascii=False,
separators=(',', ': '))

@staticmethod
def from_meta_json(json_path: str):
"""
@@ -90,17 +90,17 @@ def _get_meta(self):
'id': self.article_id,
'url': self.url,
'title': self.title,
'date': self._date_to_text(),
'date': self.date,
'author': self.author,
'topics': self.topics
}

def _date_to_text(self):
"""
Converts datetime object to text
"""
return self.date.strftime("%Y-%m-%d %H:%M:%S")

def _get_raw_text_path(self):
"""
Returns path for requested raw article
4 changes: 4 additions & 0 deletions constants.py
@@ -7,3 +7,7 @@
PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__))
ASSETS_PATH = os.path.join(PROJECT_ROOT, 'tmp', 'articles')
CRAWLER_CONFIG_PATH = os.path.join(PROJECT_ROOT, 'crawler_config.json')
HEADERS = {
[Contributor review comment: good]
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
' Chrome/89.0.4389.90 Safari/537.36'
}
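A quick note on the header value: adjacent string literals in Python are concatenated at compile time, so the two lines above form a single user-agent string. A minimal sanity check, assuming the constants module above is importable:

from constants import HEADERS

# The two adjacent literals merge into a single user-agent value at import time.
print(HEADERS['user-agent'])   # Mozilla/5.0 (...) ... Chrome/89.0.4389.90 Safari/537.36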
6 changes: 3 additions & 3 deletions crawler_config.json
@@ -1,5 +1,5 @@
{
"base_urls": [],
"total_articles_to_find_and_parse": 0,
"max_number_articles_to_get_from_one_seed": 0
"base_urls": ["https://www.e1.ru/news/"],
"total_articles_to_find_and_parse": 5,
"max_number_articles_to_get_from_one_seed": 5
}
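For illustration, a minimal sketch of how these three keys are consumed; they correspond to the values that validate_config() in scrapper.py returns and the Crawler constructor takes. The sketch assumes the file sits in the project root:

import json

with open('crawler_config.json', encoding='utf-8') as file:
    config = json.load(file)

seed_urls = config['base_urls']                                # seed pages to crawl
max_articles = config['total_articles_to_find_and_parse']      # overall cap on collected articles
per_seed = config['max_number_articles_to_get_from_one_seed']  # cap taken from each seed page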
70 changes: 61 additions & 9 deletions pipeline.py
@@ -2,7 +2,12 @@
Pipeline for text processing implementation
"""

from pathlib import Path
from typing import List
from pymorphy2 import MorphAnalyzer
from pymystem3 import Mystem
from article import Article
from constants import ASSETS_PATH


class EmptyDirectoryError(Exception):
@@ -27,63 +32,110 @@ class MorphologicalToken:
"""
Stores language params for each processed token
"""
def __init__(self, original_word, normalized_form):
pass
def __init__(self, normalized_form, original_word):
self.original_word = original_word
self.normalized_form = normalized_form
self.mystem_tags = ''
self.pymorphy_tags = ''

def __str__(self):
return "MorphologicalToken instance here"
return f"{self.normalized_form}<{self.mystem_tags}>({self.pymorphy_tags})"

def public_method(self):
pass


class CorpusManager:
"""
Works with articles and stores them
"""

def __init__(self, path_to_raw_txt_data: str):
pass
self.path_to_raw_txt_data = path_to_raw_txt_data
self._storage = {}
self._scan_dataset()

def _scan_dataset(self):
"""
Register each dataset entry
"""
pass
for file in Path(self.path_to_raw_txt_data).rglob('*_raw.txt'):
id_each = int(file.parts[-1].split('_')[0])
self._storage[id_each] = Article(url=None, article_id=id_each)

def get_articles(self):
"""
Returns storage params
"""
return self._storage

def public_method(self):
pass


class TextProcessingPipeline:
"""
Process articles from corpus manager
"""

def __init__(self, corpus_manager: CorpusManager):
pass
self.corpus_manager = corpus_manager
self.raw_text = ''

def run(self):
"""
Runs pipeline process scenario
"""
pass
for article in self.corpus_manager.get_articles().values():
self.raw_text = article.get_raw_text()
processed_text = list(map(str, self._process()))
article.save_processed(' '.join(processed_text))

def _process(self) -> List[MorphologicalToken]:
"""
Performs processing of each text
"""
process = Mystem().analyze(self.raw_text)
morph_analyzer = MorphAnalyzer()  # build once; creating an analyzer per token is expensive
tokens = []

for tok in process:
if tok.get('analysis') and tok.get('text'):
morph_token = MorphologicalToken(original_word=tok['text'], normalized_form=tok['analysis'][0]['lex'])
morph_token.mystem_tags = tok['analysis'][0]['gr']
morph_token.pymorphy_tags = morph_analyzer.parse(word=morph_token.original_word)[0].tag
tokens.append(morph_token)

return tokens

def public_method(self):
pass


def validate_dataset(path_to_validate):
"""
Validates folder with assets
"""
pass
path = Path(path_to_validate)

if not path.exists():
raise FileNotFoundError

if not path.is_dir():
raise NotADirectoryError

if not list(path.iterdir()):
raise EmptyDirectoryError


def main():
print('Your code goes here')
validate_dataset(ASSETS_PATH)

corpus_manager = CorpusManager(path_to_raw_txt_data=ASSETS_PATH)
pipeline = TextProcessingPipeline(corpus_manager=corpus_manager)

pipeline.run()


if __name__ == "__main__":
# YOUR CODE HERE
main()
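For illustration, a minimal sketch of what the _process() step does for a single analyzed word, assuming pymystem3 and pymorphy2 are installed; the sample word and the exact tag strings are illustrative and depend on the analyzers' dictionaries:

from pymystem3 import Mystem
from pymorphy2 import MorphAnalyzer

result = Mystem().analyze("кошками")       # list of dicts; analyzed words carry an 'analysis' list
item = result[0]
lemma = item['analysis'][0]['lex']         # normalized form, e.g. 'кошка'
mystem_tags = item['analysis'][0]['gr']    # mystem grammar string
pymorphy_tags = MorphAnalyzer().parse(item['text'])[0].tag

# MorphologicalToken.__str__ would render roughly: кошка<...mystem tags...>(...pymorphy tags...)
print(f"{lemma}<{mystem_tags}>({pymorphy_tags})")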
5 changes: 5 additions & 0 deletions requirements.txt
@@ -0,0 +1,5 @@
beautifulsoup4==4.9.3
lxml==4.6.2
pymorphy2==0.9.1
pymystem3==0.2.0
requests==2.25.1
97 changes: 85 additions & 12 deletions scrapper.py
@@ -1,6 +1,12 @@
"""
Crawler implementation
"""
import os
import json
import shutil
import requests
from bs4 import BeautifulSoup
from article import Article
from constants import CRAWLER_CONFIG_PATH, HEADERS, ASSETS_PATH


class IncorrectURLError(Exception):
@@ -31,38 +37,65 @@ class Crawler:
"""
Crawler implementation
"""
def __init__(self, seed_urls: list, max_articles: int):
pass
def __init__(self, seed_urls: list, max_articles: int, max_articles_per_seed: int):
self.seed_urls = seed_urls
self.total_max_articles = max_articles
self.max_articles_per_seed = max_articles_per_seed
self.urls = []

@staticmethod
def _extract_url(article_bs):
pass
article_link = article_bs.find('h2', class_="G9ax").find('a').get('href')
return 'https://www.e1.ru' + article_link

def find_articles(self):
"""
Finds articles
"""
pass
for url in self.seed_urls:
response = requests.get(url, headers=HEADERS)
if not response:
raise IncorrectURLError

page_soup = BeautifulSoup(response.content, features='lxml')
article_soup = page_soup.find_all('article', class_="G9alp")

for article_bs in article_soup[:self.max_articles_per_seed]:
article_url = self._extract_url(article_bs)
self.urls.append(article_url)

if len(self.urls) == self.total_max_articles:
break

if len(self.urls) == self.total_max_articles:
break

def get_search_urls(self):
"""
Returns seed_urls param
"""
pass
return self.seed_urls


class ArticleParser:
"""
ArticleParser implementation
"""
def __init__(self, full_url: str, article_id: int):
pass
self.full_url = full_url
self.article_id = article_id
self.article = Article(url=full_url, article_id=article_id)

def _fill_article_with_text(self, article_soup):
pass
article_text = article_soup.find('div', class_="GFahz").find('div').find_all('p')
for par in article_text:
self.article.text += par.text.strip() + '\n'

def _fill_article_with_meta_information(self, article_soup):
pass
self.article.title = article_soup.find('h2', class_="CRqd CRsn JPax").find('span').text
self.article.author = 'NOT FOUND'
self.article.topics = article_soup.find('a', class_="CRqz CRsv JPall").find('span').text
self.article.date = article_soup.find('time', class_="HHkz").find('a').text

@staticmethod
def unify_date_format(date_str):
@@ -75,23 +108,63 @@ def parse(self):
"""
Parses each article
"""
pass
response = requests.get(self.full_url, headers=HEADERS)
if not response:
raise IncorrectURLError

article_soup = BeautifulSoup(response.text, 'lxml')
self._fill_article_with_text(article_soup)
self._fill_article_with_meta_information(article_soup)
return self.article


def prepare_environment(base_path):
"""
Creates ASSETS_PATH folder if not created and removes existing folder
"""
pass
if os.path.exists(base_path):
shutil.rmtree(base_path)  # drop any previous dataset, as the docstring describes
os.makedirs(base_path)


def validate_config(crawler_path):
"""
Validates given config
"""
pass
with open(crawler_path, 'r', encoding='utf-8') as config:
params = json.load(config)

seed_urls = params.get('base_urls')
max_articles = params.get('total_articles_to_find_and_parse')
max_articles_per_seed = params.get('max_number_articles_to_get_from_one_seed')

if not isinstance(seed_urls, list):
raise IncorrectURLError
for url in seed_urls:
if not isinstance(url, str) or not url.startswith('http'):
raise IncorrectURLError

if not isinstance(max_articles, int) or max_articles < 0:
raise IncorrectNumberOfArticlesError

if not isinstance(max_articles_per_seed, int) or max_articles_per_seed > 100:
raise NumberOfArticlesOutOfRangeError

return seed_urls, max_articles, max_articles_per_seed


if __name__ == '__main__':
# YOUR CODE HERE
pass
seed_urls_list, max_num_articles, max_num_per_seed = validate_config(CRAWLER_CONFIG_PATH)
crawler = Crawler(seed_urls=seed_urls_list,
max_articles=max_num_articles,
max_articles_per_seed=max_num_per_seed)
crawler.find_articles()
prepare_environment(ASSETS_PATH)
for article_id_num, article_url in enumerate(crawler.urls, 1):
parser = ArticleParser(full_url=article_url, article_id=article_id_num)
article = parser.parse()
article.save_raw()
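For illustration, a small sketch of the naming contract that ties the two scripts together: scrapper.py saves each article under its id via Article.save_raw(), and pipeline.py's CorpusManager._scan_dataset() recovers that id from the file name. The exact file name format is an assumption (N_raw.txt, as the *_raw.txt glob suggests):

from pathlib import Path

article_id = 1
raw_name = f"{article_id}_raw.txt"                     # name Article.save_raw() is assumed to produce
recovered_id = int(Path(raw_name).name.split('_')[0])  # what _scan_dataset() extracts
assert recovered_id == article_id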
2 changes: 1 addition & 1 deletion target_score.txt
@@ -2,4 +2,4 @@
6

# Target score for pipeline.py:
0
8