Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dataset Collector #1, Kazyulina Marina - 19FPL1 #42

Open
wants to merge 55 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 49 commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
3261adb
target score change
marina-kaz Feb 28, 2021
f471f4a
completed stages 1 and 2
marina-kaz Mar 8, 2021
742558c
uhm whatever i'm doing here i want to get upded branch that'all i ask…
marina-kaz Mar 9, 2021
dbcaa4d
build for four
marina-kaz Mar 9, 2021
0c35e77
build for four, fixed target score
marina-kaz Mar 9, 2021
8a09da7
build for idk what score
marina-kaz Mar 9, 2021
6fee8b3
fixed linting a little
marina-kaz Mar 9, 2021
aafdefd
major link work
marina-kaz Mar 9, 2021
0cc3a91
added requirements
marina-kaz Mar 9, 2021
e3e8eda
changed target score
marina-kaz Mar 9, 2021
1355f14
i dont understand why test fails
marina-kaz Mar 11, 2021
56fbb38
oooohh
marina-kaz Mar 11, 2021
cfeae30
i have questions
marina-kaz Mar 11, 2021
39e8923
didn't work
marina-kaz Mar 11, 2021
6e5ec03
working on
marina-kaz Mar 11, 2021
4733964
there is no pleasing you
marina-kaz Mar 11, 2021
ead1f17
i'm experimenting
marina-kaz Mar 11, 2021
cf7e97b
added stuff for 10, tests will fail?
marina-kaz Mar 11, 2021
54a49ed
fixed my favorite lint
marina-kaz Mar 11, 2021
2854e58
optimized a few things
marina-kaz Mar 11, 2021
358f74b
improved text formation
marina-kaz Mar 11, 2021
f7aa7f6
uhm
marina-kaz Mar 11, 2021
9f45627
Merge remote-tracking branch 'upstream/main' into HEAD
dmitry-uraev Mar 12, 2021
e75381c
fixed some problems from review
marina-kaz Mar 13, 2021
225e72a
fixed lint
marina-kaz Mar 13, 2021
dc7c08a
fixed config valid
marina-kaz Mar 13, 2021
b147b47
optimized
marina-kaz Mar 13, 2021
d09c9f3
fixed date format
marina-kaz Mar 13, 2021
731448c
removed user interaction and links folder
marina-kaz Mar 15, 2021
fd304d7
removed user interaction and links folder[2]
marina-kaz Mar 15, 2021
d7afc62
Merge remote-tracking branch 'upstream/main' into main
marina-kaz Mar 26, 2021
23182bb
initial commit with the build for 8 and bad lintering
marina-kaz Mar 26, 2021
0ee4d5e
build for 8 with a lot of lintering interface kostyly
marina-kaz Mar 26, 2021
f1a59fa
added features for 10, did not upd tests
marina-kaz Mar 26, 2021
631ec92
updated tests
marina-kaz Mar 26, 2021
21e4fd8
build for 10 except i did not refactor
marina-kaz Mar 26, 2021
1b73cb3
refactor from os to pathlib
marina-kaz Mar 26, 2021
7a7a01f
Merge remote-tracking branch 'upstream/main' into main
marina-kaz Mar 30, 2021
19ff11d
fixed everything
marina-kaz Mar 30, 2021
c30d85a
now everything
marina-kaz Mar 30, 2021
c58526b
i don't understand
marina-kaz Mar 30, 2021
0a9c9fe
fixed regular expression
marina-kaz Mar 30, 2021
db17bf6
added COM to tags
marina-kaz Mar 30, 2021
7ea4755
added requested changes, ready to fight linter
marina-kaz Apr 4, 2021
7e85982
fought linter
marina-kaz Apr 4, 2021
a83469e
fixed dataset validation
marina-kaz Apr 4, 2021
27280a0
adjusted ds validator
marina-kaz Apr 4, 2021
4efc05d
i am sorry for my terrible commit history
marina-kaz Apr 4, 2021
b45ee40
adjusted ds validator
marina-kaz Apr 4, 2021
5c3a245
fixed several drawbacks
marina-kaz Apr 5, 2021
f6c7034
fixed several drawbacks
marina-kaz Apr 5, 2021
b8399cc
fixed several drawbacks
marina-kaz Apr 5, 2021
b8dc298
oh well I noticed smt else
marina-kaz Apr 5, 2021
96f7e8e
turned get articles into authentic getter
marina-kaz Apr 5, 2021
ff570e9
fixed lintering
marina-kaz Apr 5, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions article.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# pylint: disable=R0902
"""
Article implementation
"""
Expand Down Expand Up @@ -29,6 +30,7 @@ def __init__(self, url, article_id):
self.author = ''
self.topics = []
self.text = ''
self.pos_frequencies = {}

def save_raw(self):
"""
Expand All @@ -46,7 +48,7 @@ def save_raw(self):
indent=4,
ensure_ascii=False,
separators=(',', ': '))

demid5111 marked this conversation as resolved.
Show resolved Hide resolved
@staticmethod
def from_meta_json(json_path: str):
"""
Expand All @@ -62,6 +64,7 @@ def from_meta_json(json_path: str):
article.date = date_from_meta(meta.get('date', None))
article.author = meta.get('author', None)
article.topics = meta.get('topics', None)
article.pos_frequencies = meta.get('pos_frequencies', None)

# intentionally leave it empty
article.text = None
Expand Down Expand Up @@ -92,15 +95,16 @@ def _get_meta(self):
'title': self.title,
'date': self._date_to_text(),
'author': self.author,
'topics': self.topics
'topics': self.topics,
'pos_frequencies': self.pos_frequencies
}

def _date_to_text(self):
    """Render the article's publication datetime as 'YYYY-MM-DD HH:MM:SS' text."""
    # format() delegates to datetime.__format__, which applies strftime rules.
    return format(self.date, "%Y-%m-%d %H:%M:%S")

def _get_raw_text_path(self):
"""
Returns path for requested raw article
Expand Down
2 changes: 1 addition & 1 deletion config/student_text_preprocess_score_eight_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from constants import ASSETS_PATH


TAGS = ["A", "ADV", "S", "V", "PR", "ANUM", "CONJ", "SPRO", "APRO", "PART", "NUM", "ADVPRO"]
TAGS = ["A", "ADV", "S", "V", "PR", "ANUM", "CONJ", "SPRO", "APRO", "PART", "NUM", "ADVPRO", "COM"]


class StudentTextPreprocessTest(unittest.TestCase):
Expand Down
6 changes: 6 additions & 0 deletions constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,9 @@
PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__))
ASSETS_PATH = os.path.join(PROJECT_ROOT, 'tmp', 'articles')
demid5111 marked this conversation as resolved.
Show resolved Hide resolved
# Crawler configuration file (seed URLs, article limits) lives at the project root.
CRAWLER_CONFIG_PATH = os.path.join(PROJECT_ROOT, 'crawler_config.json')
# Directory and file where the crawler persists the article links it discovers.
LINKS_STORAGE_DIR = os.path.join(PROJECT_ROOT, 'links')
LINKS_STORAGE_FILE = os.path.join(LINKS_STORAGE_DIR, 'links.txt')

# Site root prepended to the relative hrefs found on seed pages
# (the source site omits the scheme+host in its <a href> attributes).
URL_START = 'https://burunen.ru'
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if it is seed url or any other configuration of the crawler, then use configuration file

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nope, it is the beginning of every url in my source

you see, as in many webpages, the href tags in my source are filled with the continuation of the url, omitting the 'https://burunen.ru', so each and every time i try to request the link i found automatically i have to concatenate it with this beginning first.

i thought about making an additional attribute to the Crawler class (something like self.url_start = ''), but then i decided that if it is constant, it should be placed among other constants... am i wrong?

# Desktop-browser User-Agent sent with every request — presumably to avoid the
# site refusing or altering responses for the default python-requests agent
# (NOTE(review): confirm the site actually requires this).
HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)'
                         ' AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
12 changes: 9 additions & 3 deletions crawler_config.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
{
"base_urls": [],
"total_articles_to_find_and_parse": 0,
"max_number_articles_to_get_from_one_seed": 0
"base_urls": ["https://burunen.ru/news/society/",
"https://burunen.ru/news/culture/",
"https://burunen.ru/news/economy/",
"https://burunen.ru/news/sports/",
"https://burunen.ru/news/incidents/",
"https://burunen.ru/news/politic/"
],
"total_articles_to_find_and_parse": 10,
"max_number_articles_to_get_from_one_seed": 5
}
85 changes: 71 additions & 14 deletions pipeline.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
# pylint: disable=R0903

"""
Pipeline for text processing implementation
"""

from typing import List

from pathlib import Path
from pymorphy2 import MorphAnalyzer
from pymystem3 import Mystem

from article import Article
from constants import ASSETS_PATH


class EmptyDirectoryError(Exception):
"""
Expand All @@ -28,61 +36,110 @@ class MorphologicalToken:
Stores language params for each processed token
"""
def __init__(self, original_word, normalized_form):
    """Keep the surface form, its normal form, and (initially empty) tag slots."""
    self.original = original_word
    self.normalized = normalized_form
    # Tag attributes are filled in later by the pipeline; '' means "not tagged yet".
    self.mystem_tags = ''
    self.pymorphy_tags = ''

def __str__(self):
    """Serialize the token as 'normalized<mystem tags>(pymorphy tags)'."""
    # str.format stringifies pymorphy's tag object the same way str() does.
    return '{}<{}>({})'.format(self.normalized, self.mystem_tags, self.pymorphy_tags)


class CorpusManager:
    """
    Works with articles and stores them.

    The dataset directory is scanned lazily: the first call to get_articles()
    registers every '<id>_raw.txt' entry, subsequent calls reuse the cached
    storage. (The original rescanned on every call — flagged in review as
    "it cannot be in the plain getter".)
    """
    def __init__(self, path_to_raw_txt_data: str):
        self.path_to_raw = path_to_raw_txt_data
        self._storage = {}

    def _scan_dataset(self):
        """
        Register each dataset entry: one Article per '<id>_raw.txt' file.
        """
        # glob() matches the file-name template directly instead of
        # filtering iterdir() output by suffix (reviewer suggestion).
        # NOTE(review): scans ASSETS_PATH rather than self.path_to_raw,
        # as in the original — confirm these are always the same directory.
        for file in Path(ASSETS_PATH).glob('*_raw.txt'):
            index = file.name.split('_raw.txt')[0]
            self._storage[index] = Article(url=None, article_id=int(index))

    def get_articles(self):
        """
        Returns the article storage, scanning the dataset on first access only.
        """
        if not self._storage:
            self._scan_dataset()
        return self._storage


class TextProcessingPipeline:
    """
    Process articles from corpus manager: tokenize, normalize and tag each text.
    """
    def __init__(self, corpus_manager: CorpusManager):
        self.corpus = corpus_manager
        self.current_raw_text = ''

    def run(self):
        """
        Runs pipeline process scenario: processes every article's raw text
        and saves the space-joined token representation.
        """
        for article in self.corpus.get_articles().values():
            self.current_raw_text = article.get_raw_text()
            tokens = self._process()
            article.save_processed(' '.join(map(str, tokens)))

    def _process(self) -> List[MorphologicalToken]:
        """
        Performs processing of the current text.

        Returns a MorphologicalToken per alphabetic word. Instead of the
        original broad try/except IndexError (flagged in review), mystem's
        'analysis' list is checked explicitly: unknown words fall back to the
        surface form, taking pymorphy tags only when pymorphy recognises them.
        """
        mystem = Mystem()
        pymorphy = MorphAnalyzer()
        tokens = []
        for word in mystem.analyze(self.current_raw_text):
            original = word['text'].strip()
            if not original.isalpha():
                continue
            analyses = word.get('analysis')
            if analyses:
                token = MorphologicalToken(original_word=original,
                                           normalized_form=analyses[0]['lex'])
                token.mystem_tags = analyses[0]['gr'].strip()
                token.pymorphy_tags = pymorphy.parse(original)[0].tag
            else:
                token = MorphologicalToken(original_word=original,
                                           normalized_form=original)
                # Parse once and reuse (the original parsed twice in this branch).
                parsed_tag = pymorphy.parse(original)[0].tag
                if str(parsed_tag) != 'UNKN':
                    token.pymorphy_tags = parsed_tag
            tokens.append(token)
        return tokens


def validate_dataset(path_to_validate):
    """
    Validates folder with assets.

    Raises:
        FileNotFoundError: the path does not exist.
        NotADirectoryError: the path exists but is not a directory.
        EmptyDirectoryError: the directory contains no files at all.
        InconsistentDatasetError: raw texts and meta files do not match 1:1.
    """
    path = Path(path_to_validate)
    if not path.exists():
        raise FileNotFoundError
    if not path.is_dir():
        raise NotADirectoryError
    if not list(path.iterdir()):
        raise EmptyDirectoryError
    # BUGFIX: in the original the names were swapped ('metas' held *_raw.txt
    # files and vice versa) — harmless for the symmetric check, but misleading.
    # Comparing the sorted id lists covers both the count and the id match.
    raw_ids = sorted(int(file.name.split('_')[0]) for file in path.glob('*_raw.txt'))
    meta_ids = sorted(int(file.name.split('_')[0]) for file in path.glob('*_meta.json'))
    if raw_ids != meta_ids:
        raise InconsistentDatasetError


def main():
    """Entry point: validate the dataset, then run the text processing pipeline."""
    # Removed the stale placeholder print('Your code goes here') left over
    # from the pre-implementation stub.
    validate_dataset(ASSETS_PATH)
    print('validated dataset')
    corpus_manager = CorpusManager(ASSETS_PATH)
    print('onto processing')
    pipeline = TextProcessingPipeline(corpus_manager=corpus_manager)
    pipeline.run()


if __name__ == "__main__":
Expand Down
49 changes: 47 additions & 2 deletions pos_frequency_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,53 @@
"""
Implementation of POSFrequencyPipeline for score ten only.
"""
import re

from pathlib import Path

from visualizer import visualize
from pipeline import CorpusManager
demid5111 marked this conversation as resolved.
Show resolved Hide resolved

from constants import ASSETS_PATH


class POSFrequencyPipeline:
    """
    Counts POS-tag frequencies for each processed article, saves them into the
    article's meta file and renders a frequency chart image.
    """
    def __init__(self, corpus: CorpusManager):
        self.corpus = corpus
        self.current_article = None

    def run(self):
        """Process every article: count tags, update meta, draw the chart."""
        for article in self.corpus.get_articles().values():
            self.current_article = article
            frequencies = self._count_frequencies()
            self._update_meta(frequencies)
            image_path = Path(ASSETS_PATH) / f'{article.article_id}_image.png'
            visualize(frequencies, image_path)

    def _count_frequencies(self):
        """Return a {tag: count} mapping built from the article's processed text."""
        path = Path(ASSETS_PATH) / f'{self.current_article.article_id}_processed.txt'
        with open(path, encoding='utf-8') as file:
            contents = file.read()
        # Tags are serialized as '<S,...' / '<V=...': grab the leading uppercase run.
        frequencies = {}
        # Single pass instead of the original list.count per tag (O(n^2) -> O(n)).
        for tag in re.findall(r"<([A-Z]+)[,=]?", contents):
            frequencies[tag] = frequencies.get(tag, 0) + 1
        return frequencies

    def _update_meta(self, frequencies):
        """Re-save the article's meta file with the computed POS frequencies."""
        meta_path = Path(ASSETS_PATH) / f'{self.current_article.article_id}_meta.json'
        # from_meta_json is a @staticmethod; it is invoked via the instance
        # so this module does not need to import Article directly.
        article = self.current_article.from_meta_json(meta_path)
        article.pos_frequencies = frequencies
        article.text = article.get_raw_text()
        article.save_raw()


def main():
    """Entry point: build POS-frequency statistics over the whole corpus."""
    visualizer = POSFrequencyPipeline(CorpusManager(ASSETS_PATH))
    visualizer.run()


if __name__ == "__main__":
marina-kaz marked this conversation as resolved.
Show resolved Hide resolved
main()
6 changes: 6 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
beautifulsoup4==4.9.0
pymorphy2==0.9.1
pymystem3==0.2.0
requests==2.23.0
numpy==1.20.1
matplotlib==3.4.0
Loading