Dataset Collector #1, Chudinova Alla - 19FPL2 #37

Open · wants to merge 14 commits into main
3 changes: 2 additions & 1 deletion .pylintrc
@@ -143,7 +143,8 @@ disable=print-statement,
        bad-continuation,
        unused-argument,
        unnecessary-pass,
-       import-error
+       import-error,
+       too-few-public-methods

# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
6 changes: 3 additions & 3 deletions article.py
@@ -46,7 +46,7 @@ def save_raw(self):
                      indent=4,
                      ensure_ascii=False,
                      separators=(',', ': '))

    @staticmethod
    def from_meta_json(json_path: str):
        """
@@ -94,13 +94,13 @@ def _get_meta(self):
            'author': self.author,
            'topics': self.topics
        }

    def _date_to_text(self):
        """
        Converts datetime object to text
        """
        return self.date.strftime("%Y-%m-%d %H:%M:%S")

    def _get_raw_text_path(self):
        """
        Returns path for requested raw article
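For reference, a minimal sketch (not part of the diff) of the timestamp string that `_date_to_text` produces with the `"%Y-%m-%d %H:%M:%S"` format; the date value below is made up:

```python
# Sketch: output format of Article._date_to_text; the datetime value is illustrative.
from datetime import datetime

date = datetime(2021, 3, 1, 0, 0)
print(date.strftime("%Y-%m-%d %H:%M:%S"))  # 2021-03-01 00:00:00
```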
3 changes: 2 additions & 1 deletion config/raw_metadata_test.py
@@ -4,6 +4,7 @@
import unittest
import requests
from constants import ASSETS_PATH, CRAWLER_CONFIG_PATH
+from bs4 import BeautifulSoup


class RawDataValidator(unittest.TestCase):
@@ -48,7 +49,7 @@ def test_validate_metadata(self):
                        msg="Can not open URL: <{}>. Check how you collect URLs".format(
                            metadata[1]['url']))

-        html_source = requests.get(metadata[1]['url']).text
+        html_source = BeautifulSoup(requests.get(metadata[1]['url']).content, features='lxml').text

        self.assertTrue(metadata[1]['title'] in
                        html_source,
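Side note on the change above: comparing the title against `BeautifulSoup(...).text` checks the visible page text rather than raw markup, so a title interrupted by tags still passes the substring test. A tiny sketch with an invented HTML fragment:

```python
# Sketch: why the substring check is done against the soup's text, not the raw HTML.
# The HTML fragment is invented for illustration.
from bs4 import BeautifulSoup

html = '<dd class="title">Пример <b>заголовка</b></dd>'
print('Пример заголовка' in html)                              # False: the <b> tag splits the phrase
print('Пример заголовка' in BeautifulSoup(html, 'lxml').text)  # True: markup is stripped
```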
2 changes: 1 addition & 1 deletion config/student_text_preprocess_score_eight_test.py
@@ -4,7 +4,7 @@
from constants import ASSETS_PATH


TAGS = ["A", "ADV", "S", "V", "PR", "ANUM", "CONJ", "SPRO", "APRO", "PART", "NUM", "ADVPRO"]
TAGS = ["A", "ADV", "S", "V", "PR", "ANUM", "CONJ", "SPRO", "APRO", "PART", "NUM", "ADVPRO", "INTJ"]


class StudentTextPreprocessTest(unittest.TestCase):
6 changes: 3 additions & 3 deletions crawler_config.json
@@ -1,5 +1,5 @@
{
-  "base_urls": [],
-  "total_articles_to_find_and_parse": 0,
-  "max_number_articles_to_get_from_one_seed": 0
+  "base_urls": ["https://mordovia-news.ru/"],
+  "total_articles_to_find_and_parse": 10,
+  "max_number_articles_to_get_from_one_seed": 3
}
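For orientation, a minimal sketch (not part of the PR) of how these fields are consumed: `validate_config` in scrapper.py loads the JSON and hands the seed URLs and the article limit to the `Crawler`. The literal file name below is an assumption; in the project it is resolved via `CRAWLER_CONFIG_PATH`.

```python
# Sketch: reading the crawler config; mirrors what validate_config returns.
import json

with open('crawler_config.json', 'r', encoding='utf-8') as file:
    settings = json.load(file)

seed_urls = settings['base_urls']                            # ['https://mordovia-news.ru/']
max_articles = settings['total_articles_to_find_and_parse']  # 10
```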
60 changes: 49 additions & 11 deletions pipeline.py
@@ -2,8 +2,15 @@
Pipeline for text processing implementation
"""

+import os
from typing import List

+from pymystem3 import Mystem
+from pymorphy2 import MorphAnalyzer
+
+from constants import ASSETS_PATH
+from article import Article


class EmptyDirectoryError(Exception):
    """
@@ -28,61 +35,92 @@ class MorphologicalToken:
    Stores language params for each processed token
    """
    def __init__(self, original_word, normalized_form):
-        pass
+        self.normalized_form = normalized_form
+        self.original_word = original_word
+        self.tags = []
+        self.morphy_tags = []

    def __str__(self):
-        return "MorphologicalToken instance here"
+        return f'{self.normalized_form}<{self.tags}>({self.morphy_tags})'


class CorpusManager:
    """
    Works with articles and stores them
    """
    def __init__(self, path_to_raw_txt_data: str):
-        pass
+        self._storage = dict()
+        self._path = path_to_raw_txt_data
+        self._scan_dataset()

    def _scan_dataset(self):
        """
        Register each dataset entry
        """
-        pass
+        for file in os.listdir(self._path):
+            if file.endswith('_raw.txt'):
+                self._storage[int(file[:-8])] = Article(url=None, article_id=int(file[:-8]))

    def get_articles(self):
        """
        Returns storage params
        """
-        pass
+        return self._storage


class TextProcessingPipeline:
    """
    Process articles from corpus manager
    """
    def __init__(self, corpus_manager: CorpusManager):
-        pass
+        self.corpus_manager = corpus_manager

    def run(self):
        """
        Runs pipeline process scenario
        """
-        pass
+        for article in self.corpus_manager.get_articles().values():
+            original_text = article.get_raw_text().lower()
+            processed_text = self._process(original_text)
+            article.save_processed(' '.join([str(token) for token in processed_text]))

-    def _process(self) -> List[type(MorphologicalToken)]:
+    @staticmethod
+    def _process(text) -> List[type(MorphologicalToken)]:
        """
        Performs processing of each text
        """
-        pass
+        analyze = Mystem().analyze(text)
+        morph = MorphAnalyzer()
+        tokens = []
+        for feature in analyze:
+            if 'analysis' not in feature or not feature['analysis']:
+                continue
+            token = MorphologicalToken(feature['text'],
+                                       feature['analysis'][0]['lex'])
+            token.tags = feature['analysis'][0]['gr']
+            token.morphy_tags = morph.parse(token.original_word)[0].tag
+            tokens.append(token)
+        return tokens


def validate_dataset(path_to_validate):
    """
    Validates folder with assets
    """
-    pass
+    if not os.path.exists(path_to_validate):
+        raise FileNotFoundError
+    if not os.path.isdir(path_to_validate):
+        raise NotADirectoryError
+    if not os.listdir(path_to_validate):
+        raise EmptyDirectoryError


def main():
-    print('Your code goes here')
+    validate_dataset(ASSETS_PATH)
+    corpus_manager = CorpusManager(path_to_raw_txt_data=ASSETS_PATH)
+    pipeline = TextProcessingPipeline(corpus_manager)
+    pipeline.run()
+    print('Text processing pipeline has just finished')


if __name__ == "__main__":
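For context, `_process` above indexes into the structure returned by `pymystem3`'s `Mystem.analyze`, where each recognized word carries an `analysis` list with a lemma (`lex`) and a grammar string (`gr`). A minimal sketch of that shape and of the string each `MorphologicalToken` is saved as; the sample word and the exact tag strings are illustrative, not taken from the dataset:

```python
# Sketch: the Mystem output shape that _process relies on, and the token's saved form.
from pymystem3 import Mystem
from pymorphy2 import MorphAnalyzer

result = Mystem().analyze("мама")
# result is roughly:
# [{'analysis': [{'lex': 'мама', 'gr': 'S,жен,од=им,ед'}], 'text': 'мама'}, {'text': '\n'}]

lemma = result[0]['analysis'][0]['lex']    # 'мама'
tags = result[0]['analysis'][0]['gr']      # e.g. 'S,жен,од=им,ед'
morphy_tags = MorphAnalyzer().parse('мама')[0].tag

# MorphologicalToken.__str__ would render this token as:
print(f'{lemma}<{tags}>({morphy_tags})')   # e.g. мама<S,жен,од=им,ед>(NOUN,anim,femn sing,nomn)
```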
5 changes: 5 additions & 0 deletions requirements.txt
@@ -0,0 +1,5 @@
requests == 2.25.1
beautifulsoup4 == 4.9.3
lxml == 4.6.2
pymystem3
pymorphy2
84 changes: 68 additions & 16 deletions scrapper.py
@@ -2,6 +2,15 @@
Crawler implementation
"""

+import re
+import os
+import json
+import datetime
+import requests
+from bs4 import BeautifulSoup
+from article import Article
+from constants import CRAWLER_CONFIG_PATH, ASSETS_PATH


class IncorrectURLError(Exception):
    """
@@ -32,66 +41,109 @@ class Crawler:
    Crawler implementation
    """
    def __init__(self, seed_urls: list, max_articles: int):
-        pass

-    @staticmethod
-    def _extract_url(article_bs):
-        pass
+        self.search_urls = seed_urls
+        self.max_articles = max_articles
+        self.found_urls = []
+        self.link_pattern = r'/?news-\d+-\d+\.htm'

+    def _extract_url(self, article_bs):
+        links = []
+        for link in article_bs.find_all('a', href=True):
+            potential_link = re.match(self.link_pattern, link['href'])
+            if potential_link:
+                links.append(potential_link.group(0))
+        return links

    def find_articles(self):
        """
        Finds articles
        """
-        pass
+        for url in self.search_urls:
+            request = requests.get(url).content
+            soup = BeautifulSoup(request,
+                                 features='lxml')
+            for article_url in self._extract_url(soup):
+                if len(self.found_urls) != self.max_articles \
+                        and url+article_url not in self.found_urls:
+                    self.found_urls.append(url+article_url)
+        print(f'Found {len(self.found_urls)} links to articles to process')

    def get_search_urls(self):
        """
        Returns seed_urls param
        """
-        pass
+        return self.found_urls


class ArticleParser:
    """
    ArticleParser implementation
    """
    def __init__(self, full_url: str, article_id: int):
-        pass
+        self.article = Article(url=full_url, article_id=article_id)

    def _fill_article_with_text(self, article_soup):
-        pass
+        self.article.text = article_soup.find('dd', class_='text').text

    def _fill_article_with_meta_information(self, article_soup):
-        pass
+        self.article.title = article_soup.find('dd', class_='title').text.strip()
+        self.article.author = 'NOT FOUND'
+        self.article.topics = article_soup.find('span', class_='title_text').find_all('a')[1].text
+        self.article.date = self.unify_date_format(article_soup.find('span', class_='title_data').text[-10:])

    @staticmethod
    def unify_date_format(date_str):
        """
        Unifies date format
        """
-        pass
+        return datetime.datetime.strptime(date_str, "%d.%m.%Y")

    def parse(self):
        """
        Parses each article
        """
-        pass
+        request = requests.get(self.article.url).content
+        soup = BeautifulSoup(request, features='lxml')
+        self._fill_article_with_meta_information(soup)
+        self._fill_article_with_text(soup)
+        self.article.save_raw()


def prepare_environment(base_path):
    """
    Creates ASSETS_PATH folder if not created and removes existing folder
    """
-    pass
+    if not os.path.exists(base_path):
+        os.makedirs(base_path)


def validate_config(crawler_path):
    """
    Validates given config
    """
-    pass
+    with open(crawler_path, 'r', encoding='utf-8') as data:
+        settings = json.load(data)
+    url_pattern = 'https://'

+    for url in settings['base_urls']:
+        if url_pattern not in url:
+            raise IncorrectURLError

+    if not isinstance(settings['total_articles_to_find_and_parse'], int):
+        raise IncorrectNumberOfArticlesError

+    if settings['total_articles_to_find_and_parse'] > 100:
+        raise NumberOfArticlesOutOfRangeError
+    return settings['base_urls'], settings['total_articles_to_find_and_parse']


if __name__ == '__main__':
-    # YOUR CODE HERE
-    pass
+    urls, num_articles = validate_config(CRAWLER_CONFIG_PATH)
+    prepare_environment(ASSETS_PATH)

+    crawler = Crawler(seed_urls=urls, max_articles=num_articles)
+    crawler.find_articles()

+    for _article_id, _article_link in enumerate(crawler.get_search_urls()):
+        parser = ArticleParser(_article_link, _article_id+1)
+        parser.parse()
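As an illustration (not part of the PR) of how `Crawler.link_pattern` filters links: `re.match` anchors at the start of the href, so only relative links of the `news-<id>-<id>.htm` form survive and get prefixed with the seed URL in `find_articles`. The hrefs below are invented:

```python
# Sketch: how the link_pattern filters hrefs; the example hrefs are made up.
import re

link_pattern = r'/?news-\d+-\d+\.htm'
seed = 'https://mordovia-news.ru/'

hrefs = ['news-12-3456.htm', '/news-7-89.htm', 'about.htm', 'https://example.com/news-1-2.htm']
found = [seed + match.group(0)
         for match in (re.match(link_pattern, href) for href in hrefs)
         if match]
print(found)
# ['https://mordovia-news.ru/news-12-3456.htm', 'https://mordovia-news.ru//news-7-89.htm']
# Note the double slash in the second entry: find_articles concatenates url + article_url as-is.
```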
4 changes: 2 additions & 2 deletions target_score.txt
@@ -1,5 +1,5 @@
# Target score for scrapper.py:
-6
+8
Contributor:

Yes, that's a good switch.

Author:

No, I do not think so. It is evident I'd better switch to 4 or 6 later.


# Target score for pipeline.py:
-0
+8