Skip to content

Commit

Permalink
added realisation for lab6
Browse files Browse the repository at this point in the history
  • Loading branch information
shoodeen committed Jun 3, 2024
1 parent e105240 commit 88d4c4b
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 7 deletions.
85 changes: 79 additions & 6 deletions lab_6_pipeline/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,34 @@
# pylint: disable=too-few-public-methods, unused-import, undefined-variable, too-many-nested-blocks
import pathlib

try:
from networkx import DiGraph
except ImportError: # pragma: no cover
DiGraph = None # type: ignore
print('No libraries installed. Failed to import.')
import spacy_udpipe
from networkx import DiGraph

from core_utils.article.article import Article
from core_utils.article.article import (Article, ArtifactType, get_article_id_from_filepath)
from core_utils.article.io import from_raw, to_cleaned
from core_utils.constants import ASSETS_PATH, UDPIPE_MODEL_PATH
from core_utils.pipeline import (AbstractCoNLLUAnalyzer, CoNLLUDocument, LibraryWrapper,
PipelineProtocol, StanzaDocument, TreeNode)


class EmptyDirectoryError(Exception):
    """
    Raised when the dataset directory exists but contains no entries.
    """


class InconsistentDatasetError(Exception):
    """
    Raised when the dataset is malformed: article IDs have gaps,
    raw/meta file counts disagree, or a file is empty.
    """


class EmptyFileError(Exception):
    """
    Raised when a dataset file exists but has no content.

    NOTE(review): not raised anywhere in the visible code — presumably
    reserved for later pipeline stages; confirm before removing.
    """


class CorpusManager:
"""
Work with articles and store them.
Expand All @@ -27,16 +44,44 @@ def __init__(self, path_to_raw_txt_data: pathlib.Path) -> None:
Args:
path_to_raw_txt_data (pathlib.Path): Path to raw txt data
"""
self.path_to_raw_txt_data = path_to_raw_txt_data
self._storage = {}

self._validate_dataset()
self._scan_dataset()

def _validate_dataset(self) -> None:
    """
    Validate folder with assets.

    Checks that the dataset path exists, is a non-empty directory, and
    that raw/meta files form a consistent, contiguous 1..N ID sequence
    with no empty files.

    Raises:
        FileNotFoundError: The dataset path does not exist.
        NotADirectoryError: The dataset path is not a directory.
        EmptyDirectoryError: The directory contains no entries.
        InconsistentDatasetError: Raw and meta file counts differ, IDs
            have gaps or do not start at 1, or a raw/meta file is empty.
    """
    if not self.path_to_raw_txt_data.exists():
        raise FileNotFoundError(f'{self.path_to_raw_txt_data} does not exist')
    if not self.path_to_raw_txt_data.is_dir():
        raise NotADirectoryError(f'{self.path_to_raw_txt_data} is not a directory')
    if not any(self.path_to_raw_txt_data.iterdir()):
        raise EmptyDirectoryError(f'{self.path_to_raw_txt_data} is empty')

    raw_files = sorted(self.path_to_raw_txt_data.glob("*_raw.txt"),
                       key=get_article_id_from_filepath)
    meta_files = sorted(self.path_to_raw_txt_data.glob("*_meta.json"),
                        key=get_article_id_from_filepath)
    if len(meta_files) != len(raw_files):
        raise InconsistentDatasetError('Numbers of raw and meta files differ')

    # IDs must be exactly 1..N in lockstep for raw and meta,
    # and every file must be non-empty.
    for expected_id, (raw, meta) in enumerate(zip(raw_files, meta_files), start=1):
        if (get_article_id_from_filepath(raw) != expected_id
                or get_article_id_from_filepath(meta) != expected_id
                or not raw.stat().st_size
                or not meta.stat().st_size):
            raise InconsistentDatasetError('Dataset has ID slips or empty files')

def _scan_dataset(self) -> None:
    """
    Register each dataset entry.

    Fills internal storage with articles loaded from raw files, keyed by
    article ID. Files are processed in ascending ID order so that storage
    insertion order is deterministic (glob order is filesystem-dependent).
    """
    for file in sorted(self.path_to_raw_txt_data.glob("*_raw.txt"),
                       key=get_article_id_from_filepath):
        art_id = get_article_id_from_filepath(file)
        self._storage[art_id] = from_raw(file)

def get_articles(self) -> dict:
    """
    Get storage params.

    Returns:
        dict: Storage params (article ID -> article instance)
    """
    articles = self._storage
    return articles


class TextProcessingPipeline(PipelineProtocol):
Expand All @@ -62,11 +108,20 @@ def __init__(
corpus_manager (CorpusManager): CorpusManager instance
analyzer (LibraryWrapper | None): Analyzer instance
"""
self._corpus = corpus_manager
self._analyzer = analyzer

def run(self) -> None:
    """
    Perform basic preprocessing and write processed text to files.

    Cleans every article; when an analyzer is configured, also annotates
    each article and saves the annotation in CoNLL-U format.
    """
    # Fix: the previous version indexed docs with `article ID - 1`, which
    # silently misaligns (or raises IndexError) when storage IDs are not a
    # contiguous 1..N sequence in insertion order. Pair docs with articles
    # by position instead — texts are submitted in storage iteration order.
    articles = list(self._corpus.get_articles().values())

    docs = None
    if self._analyzer is not None:
        docs = self._analyzer.analyze([article.text for article in articles])

    for index, article in enumerate(articles):
        to_cleaned(article)
        if docs is not None:
            article.set_conllu_info(docs[index])
            self._analyzer.to_conllu(article)


class UDPipeAnalyzer(LibraryWrapper):
Expand All @@ -80,6 +135,7 @@ def __init__(self) -> None:
"""
Initialize an instance of the UDPipeAnalyzer class.
"""
self._analyzer = self._bootstrap()

def _bootstrap(self) -> AbstractCoNLLUAnalyzer:
    """
    Load the UDPipe model and prepare it for CoNLL-U output.

    Returns:
        AbstractCoNLLUAnalyzer: Analyzer instance
    """
    analyzer = spacy_udpipe.load_from_path(
        lang="ru",
        path=str(UDPIPE_MODEL_PATH)
    )
    # The formatter makes processed docs expose their CoNLL-U markup.
    formatter_config = {
        "conversion_maps": {"XPOS": {"": "_"}},
        "include_headers": True,
    }
    analyzer.add_pipe("conll_formatter", last=True, config=formatter_config)
    return analyzer

def analyze(self, texts: list[str]) -> list[StanzaDocument | str]:
    """
    Process texts into CoNLL-U formatted markup.

    Args:
        texts (list[str]): Collection of texts

    Returns:
        list[StanzaDocument | str]: List of documents
    """
    annotations = []
    for text in texts:
        document = self._analyzer(text)
        annotations.append(f'{document._.conll_str}\n')
    return annotations

def to_conllu(self, article: Article) -> None:
    """
    Save content to ConLLU format.

    Args:
        article (Article): Article containing information to save
    """
    conllu_path = article.get_file_path(ArtifactType.UDPIPE_CONLLU)
    with open(conllu_path, 'w', encoding='utf-8') as annotation_file:
        # writelines accepts the annotation string and writes it verbatim.
        annotation_file.writelines(article.get_conllu_info())


class StanzaAnalyzer(LibraryWrapper):
def main() -> None:
    """
    Entrypoint for pipeline module.
    """
    manager = CorpusManager(path_to_raw_txt_data=ASSETS_PATH)
    analyzer = UDPipeAnalyzer()
    TextProcessingPipeline(manager, analyzer).run()


if __name__ == "__main__":
Expand Down
10 changes: 9 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,10 @@
beautifulsoup4==4.12.2
beautifulsoup4==4.12.0
digraph==0.1.4
lxml==4.9.2
networkx==3.3
requests==2.31.0
spacy-conll==3.4.0
spacy-udpipe==1.0.0
spacy==3.7.4
stanza==1.8.2

0 comments on commit 88d4c4b

Please sign in to comment.