diff --git a/lab_5_scrapper/scrapper.py b/lab_5_scrapper/scrapper.py
index c060232b..2e84ee94 100644
--- a/lab_5_scrapper/scrapper.py
+++ b/lab_5_scrapper/scrapper.py
@@ -2,9 +2,65 @@
 Crawler implementation.
 """
 # pylint: disable=too-many-arguments, too-many-instance-attributes, unused-import, undefined-variable
+import json
 import pathlib
+import re
+import shutil
+from datetime import datetime
+from random import randrange
+from time import sleep
 from typing import Pattern, Union
 
+import requests
+from bs4 import BeautifulSoup
+
+from core_utils import constants
+from core_utils.article.article import Article
+from core_utils.article.io import to_meta, to_raw
+from core_utils.config_dto import ConfigDTO
+
+
+class IncorrectSeedURLError(Exception):
+    """
+    The seed URL does not match the expected pattern.
+    """
+
+
+class NumberOfArticlesOutOfRangeError(Exception):
+    """
+    The number of articles is outside the range from 1 to 150.
+    """
+
+
+class IncorrectNumberOfArticlesError(Exception):
+    """
+    The total number of articles is not a positive integer.
+    """
+
+
+class IncorrectHeadersError(Exception):
+    """
+    The headers are not stored in a dictionary.
+    """
+
+
+class IncorrectEncodingError(Exception):
+    """
+    The encoding is not a string.
+    """
+
+
+class IncorrectTimeoutError(Exception):
+    """
+    The timeout is not an integer or is out of the allowed range.
+    """
+
+
+class IncorrectVerifyError(Exception):
+    """
+    The certificate verification flag or the headless mode flag is not a boolean.
+    """
+
 
 class Config:
     """
@@ -18,6 +74,17 @@ def __init__(self, path_to_config: pathlib.Path) -> None:
         Args:
             path_to_config (pathlib.Path): Path to configuration.
         """
+        self.path_to_config = path_to_config
+        self.config = self._extract_config_content()
+        self._validate_config_content()
+
+        self._encoding = self.config.encoding
+        self._headers = self.config.headers
+        self._headless_mode = self.config.headless_mode
+        self._num_articles = self.config.total_articles
+        self._seed_urls = self.config.seed_urls
+        self._should_verify_certificate = self.config.should_verify_certificate
+        self._timeout = self.config.timeout
 
     def _extract_config_content(self) -> ConfigDTO:
         """
@@ -26,11 +93,42 @@ def _extract_config_content(self) -> ConfigDTO:
         Returns:
             ConfigDTO: Config values
         """
+        with open(self.path_to_config, 'r', encoding='utf-8') as f:
+            config = json.load(f)
+
+        return ConfigDTO(**config)
 
     def _validate_config_content(self) -> None:
         """
         Ensure configuration parameters are not corrupt.
""" + config = self._extract_config_content() + + if not isinstance(config.seed_urls, list): + raise IncorrectSeedURLError + + for seed_url in config.seed_urls: + if not re.match(r"https?://(www.)?vtomske\.ru", seed_url): + raise IncorrectSeedURLError + + if not isinstance(config.total_articles, int) or config.total_articles <= 0: + raise IncorrectNumberOfArticlesError + + if config.total_articles > 150: + raise NumberOfArticlesOutOfRangeError + + if not isinstance(config.headers, dict): + raise IncorrectHeadersError + + if not isinstance(config.encoding, str): + raise IncorrectEncodingError + + if not isinstance(config.timeout, int) or not 0 <= config.timeout < 60: + raise IncorrectTimeoutError + + if (not isinstance(config.should_verify_certificate, bool) + or not isinstance(config.headless_mode, bool)): + raise IncorrectVerifyError def get_seed_urls(self) -> list[str]: """ @@ -39,6 +137,7 @@ def get_seed_urls(self) -> list[str]: Returns: list[str]: Seed urls """ + return self._seed_urls def get_num_articles(self) -> int: """ @@ -47,6 +146,7 @@ def get_num_articles(self) -> int: Returns: int: Total number of articles to scrape """ + return self._num_articles def get_headers(self) -> dict[str, str]: """ @@ -55,6 +155,7 @@ def get_headers(self) -> dict[str, str]: Returns: dict[str, str]: Headers """ + return self._headers def get_encoding(self) -> str: """ @@ -63,6 +164,7 @@ def get_encoding(self) -> str: Returns: str: Encoding """ + return self._encoding def get_timeout(self) -> int: """ @@ -71,6 +173,7 @@ def get_timeout(self) -> int: Returns: int: Number of seconds to wait for response """ + return self._timeout def get_verify_certificate(self) -> bool: """ @@ -79,6 +182,7 @@ def get_verify_certificate(self) -> bool: Returns: bool: Whether to verify certificate or not """ + return self._should_verify_certificate def get_headless_mode(self) -> bool: """ @@ -87,6 +191,7 @@ def get_headless_mode(self) -> bool: Returns: bool: Whether to use headless mode or not """ + return self._headless_mode def make_request(url: str, config: Config) -> requests.models.Response: @@ -100,6 +205,14 @@ def make_request(url: str, config: Config) -> requests.models.Response: Returns: requests.models.Response: A response from a request """ + sleep(randrange(3)) + + return requests.get( + url=url, + timeout=config.get_timeout(), + headers=config.get_headers(), + verify=config.get_verify_certificate() + ) class Crawler: @@ -116,6 +229,9 @@ def __init__(self, config: Config) -> None: Args: config (Config): Configuration """ + self.config = config + self.urls = [] + self.base_url = "https://vtomske.ru" def _extract_url(self, article_bs: BeautifulSoup) -> str: """ @@ -128,10 +244,38 @@ def _extract_url(self, article_bs: BeautifulSoup) -> str: str: Url from HTML """ + link = article_bs.find(class_='mainbar') + if link: + links = link.find_all('a') + for link in links: + href = link.get('href') + if href: + url = self.base_url + href + if url not in self.get_search_urls() and url not in self.urls: + return url + return '' + def find_articles(self) -> None: """ Find articles. 
""" + seed_urls = self.get_search_urls() + + for seed_url in seed_urls: + response = make_request(seed_url, self.config) + if not response.ok: + continue + + article_soup = BeautifulSoup(response.text, features='lxml') + new_url = self._extract_url(article_soup) + while new_url: + if len(self.urls) == self.config.get_num_articles(): + break + self.urls.append(new_url) + new_url = self._extract_url(article_soup) + + if len(self.urls) == self.config.get_num_articles(): + break def get_search_urls(self) -> list: """ @@ -140,6 +284,7 @@ def get_search_urls(self) -> list: Returns: list: seed_urls param """ + return self.config.get_seed_urls() # 10 @@ -160,6 +305,10 @@ def __init__(self, full_url: str, article_id: int, config: Config) -> None: article_id (int): Article id config (Config): Configuration """ + self.full_url = full_url + self.article_id = article_id + self.config = config + self.article = Article(full_url, article_id) def _fill_article_with_text(self, article_soup: BeautifulSoup) -> None: """ @@ -168,6 +317,10 @@ def _fill_article_with_text(self, article_soup: BeautifulSoup) -> None: Args: article_soup (bs4.BeautifulSoup): BeautifulSoup instance """ + body = article_soup.find('div', class_='material-content') + if body: + content = body.find_all('p') + self.article.text = '\n'.join([p_tag.text for p_tag in content]) def _fill_article_with_meta_information(self, article_soup: BeautifulSoup) -> None: """ @@ -176,8 +329,29 @@ def _fill_article_with_meta_information(self, article_soup: BeautifulSoup) -> No Args: article_soup (bs4.BeautifulSoup): BeautifulSoup instance """ + cont = article_soup.find('div', class_='material-content') + if cont: + title = cont.find('h1') + if title: + self.article.title = title.text + + author = article_soup.find('a', class_='material-author') + if not author: + self.article.author.append('NOT FOUND') + else: + self.article.author.append(author.text.strip()) + + date = article_soup.find('time', class_='material-date') + if date: + date_str = date.attrs.get('datetime') + if isinstance(date_str, str): + self.article.date = self.unify_date_format(date_str) + + tags = article_soup.find_all(class_='material-tags') + for tag in tags: + self.article.topics.append(tag.text) - def unify_date_format(self, date_str: str) -> datetime.datetime: + def unify_date_format(self, date_str: str) -> datetime: """ Unify date format. @@ -187,6 +361,7 @@ def unify_date_format(self, date_str: str) -> datetime.datetime: Returns: datetime.datetime: Datetime object """ + return datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S%z') def parse(self) -> Union[Article, bool, list]: """ @@ -195,6 +370,12 @@ def parse(self) -> Union[Article, bool, list]: Returns: Union[Article, bool, list]: Article instance """ + response = make_request(self.full_url, self.config) + if response.ok: + article_bs = BeautifulSoup(response.text, features='html.parser') + self._fill_article_with_text(article_bs) + self._fill_article_with_meta_information(article_bs) + return self.article def prepare_environment(base_path: Union[pathlib.Path, str]) -> None: @@ -204,12 +385,28 @@ def prepare_environment(base_path: Union[pathlib.Path, str]) -> None: Args: base_path (Union[pathlib.Path, str]): Path where articles stores """ + if base_path.exists(): + shutil.rmtree(base_path) + base_path.mkdir(parents=True) def main() -> None: """ Entrypoint for scrapper module. 
""" + configuration = Config(constants.CRAWLER_CONFIG_PATH) + crawler = Crawler(configuration) + prepare_environment(constants.ASSETS_PATH) + + crawler.find_articles() + i = 1 + for url in crawler.urls: + parser = HTMLParser(full_url=url, article_id=i, config=configuration) + article = parser.parse() + if isinstance(article, Article): + to_raw(article) + to_meta(article) + i += 1 if __name__ == "__main__": diff --git a/lab_5_scrapper/scrapper_config.json b/lab_5_scrapper/scrapper_config.json index 771fe42b..424fc53b 100644 --- a/lab_5_scrapper/scrapper_config.json +++ b/lab_5_scrapper/scrapper_config.json @@ -1,9 +1,9 @@ { - "seed_urls": [], - "headers": {}, - "total_articles_to_find_and_parse": 0, - "encoding": "", - "timeout": 0, + "seed_urls": ["https://vtomske.ru/"], + "headers": {"Accept":"*/*", "User-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 YaBrowser/24.1.0.0 Safari/537.36"}, + "total_articles_to_find_and_parse": 3, + "encoding": "utf-8", + "timeout": 5, "should_verify_certificate": true, "headless_mode": true } diff --git a/lab_5_scrapper/settings.json b/lab_5_scrapper/settings.json index ee7a97c3..9ba94145 100644 --- a/lab_5_scrapper/settings.json +++ b/lab_5_scrapper/settings.json @@ -1,3 +1,3 @@ { - "target_score": 0 + "target_score": 8 } diff --git a/lab_6_pipeline/pipeline.py b/lab_6_pipeline/pipeline.py index f70f27de..9e311874 100644 --- a/lab_6_pipeline/pipeline.py +++ b/lab_6_pipeline/pipeline.py @@ -3,16 +3,41 @@ """ # pylint: disable=too-few-public-methods, unused-import, undefined-variable, too-many-nested-blocks import pathlib - -try: - from networkx import DiGraph -except ImportError: # pragma: no cover - DiGraph = None # type: ignore - print('No libraries installed. Failed to import.') - -from core_utils.article.article import Article +from dataclasses import asdict + +import spacy_udpipe +import stanza +from networkx import to_dict_of_lists +from networkx.algorithms.isomorphism.vf2userfunc import GraphMatcher +from networkx.classes.digraph import DiGraph +from stanza.models.common.doc import Document +from stanza.pipeline.core import Pipeline +from stanza.utils.conll import CoNLL + +from core_utils.article.article import Article, ArtifactType, get_article_id_from_filepath +from core_utils.article.io import from_meta, from_raw, to_cleaned, to_meta +from core_utils.constants import ASSETS_PATH, UDPIPE_MODEL_PATH from core_utils.pipeline import (AbstractCoNLLUAnalyzer, CoNLLUDocument, LibraryWrapper, PipelineProtocol, StanzaDocument, TreeNode) +from core_utils.visualizer import visualize + + +class EmptyDirectoryError(Exception): + """ + Directory is empty + """ + + +class InconsistentDatasetError(Exception): + """ + Dataset contains slips in IDs of raw files or files are empty + """ + + +class EmptyFileError(Exception): + """ + File is empty + """ class CorpusManager: @@ -27,16 +52,46 @@ def __init__(self, path_to_raw_txt_data: pathlib.Path) -> None: Args: path_to_raw_txt_data (pathlib.Path): Path to raw txt data """ + self.path_to_raw_txt_data = path_to_raw_txt_data + self._storage = {} + self._validate_dataset() + self._scan_dataset() def _validate_dataset(self) -> None: """ Validate folder with assets. 
""" + if not self.path_to_raw_txt_data.exists(): + raise FileNotFoundError + + if not self.path_to_raw_txt_data.is_dir(): + raise NotADirectoryError + + if not any(self.path_to_raw_txt_data.iterdir()): + raise EmptyDirectoryError + + raw_f = list(self.path_to_raw_txt_data.glob("*_raw.txt")) + meta_f = list(self.path_to_raw_txt_data.glob("*_meta.json")) + if len(raw_f) != len(meta_f): + raise InconsistentDatasetError + sorted_raw_files = sorted(raw_f, key=get_article_id_from_filepath) + sorted_meta_files = sorted(meta_f, key=get_article_id_from_filepath) + + for index, (meta, raw) in enumerate(zip(sorted_meta_files, sorted_raw_files), 1): + if index != get_article_id_from_filepath(meta) \ + or index != get_article_id_from_filepath(raw) \ + or not meta.stat().st_size or not raw.stat().st_size: + raise InconsistentDatasetError def _scan_dataset(self) -> None: """ Register each dataset entry. """ + self._storage = { + get_article_id_from_filepath(file): + from_raw(file, Article(url=None, article_id=get_article_id_from_filepath(file))) + for file in list(self.path_to_raw_txt_data.glob("*_raw.txt")) + } def get_articles(self) -> dict: """ @@ -45,6 +100,7 @@ def get_articles(self) -> dict: Returns: dict: Storage params """ + return self._storage class TextProcessingPipeline(PipelineProtocol): @@ -62,11 +118,23 @@ def __init__( corpus_manager (CorpusManager): CorpusManager instance analyzer (LibraryWrapper | None): Analyzer instance """ + self._corpus = corpus_manager + self.analyzer = analyzer def run(self) -> None: """ Perform basic preprocessing and write processed text to files. """ + documents = [] + if self.analyzer: + documents = self.analyzer.analyze([article.text for article + in self._corpus.get_articles().values()]) + + for num, article in enumerate(self._corpus.get_articles().values()): + to_cleaned(article) + if self.analyzer and documents: + article.set_conllu_info(documents[num]) + self.analyzer.to_conllu(article) class UDPipeAnalyzer(LibraryWrapper): @@ -80,6 +148,7 @@ def __init__(self) -> None: """ Initialize an instance of the UDPipeAnalyzer class. """ + self._analyzer = self._bootstrap() def _bootstrap(self) -> AbstractCoNLLUAnalyzer: """ @@ -88,6 +157,16 @@ def _bootstrap(self) -> AbstractCoNLLUAnalyzer: Returns: AbstractCoNLLUAnalyzer: Analyzer instance """ + model = spacy_udpipe.load_from_path( + lang="ru", + path=str(UDPIPE_MODEL_PATH) + ) + model.add_pipe( + "conll_formatter", + last=True, + config={"conversion_maps": {"XPOS": {"": "_"}}, "include_headers": True}, + ) + return model def analyze(self, texts: list[str]) -> list[StanzaDocument | str]: """ @@ -99,6 +178,7 @@ def analyze(self, texts: list[str]) -> list[StanzaDocument | str]: Returns: list[StanzaDocument | str]: List of documents """ + return [self._analyzer(text)._.conll_str for text in texts] def to_conllu(self, article: Article) -> None: """ @@ -107,7 +187,10 @@ def to_conllu(self, article: Article) -> None: Args: article (Article): Article containing information to save """ - + with open(article.get_file_path(kind=ArtifactType.UDPIPE_CONLLU), + 'w', encoding='utf-8') as annotation_file: + annotation_file.writelines(article.get_conllu_info()) + annotation_file.write('\n') class StanzaAnalyzer(LibraryWrapper): """ @@ -120,6 +203,7 @@ def __init__(self) -> None: """ Initialize an instance of the StanzaAnalyzer class. 
""" + self._analyzer = self._bootstrap() def _bootstrap(self) -> AbstractCoNLLUAnalyzer: """ @@ -128,6 +212,13 @@ def _bootstrap(self) -> AbstractCoNLLUAnalyzer: Returns: AbstractCoNLLUAnalyzer: Analyzer instance """ + stanza.download(lang="ru", processors="tokenize,pos,lemma,depparse", logging_level="INFO") + return Pipeline( + lang="ru", + processors="tokenize,pos,lemma,depparse", + logging_level="INFO", + download_method=None + ) def analyze(self, texts: list[str]) -> list[StanzaDocument]: """ @@ -139,6 +230,7 @@ def analyze(self, texts: list[str]) -> list[StanzaDocument]: Returns: list[StanzaDocument]: List of documents """ + return self._analyzer.process([Document([], text=text) for text in texts]) def to_conllu(self, article: Article) -> None: """ @@ -147,6 +239,10 @@ def to_conllu(self, article: Article) -> None: Args: article (Article): Article containing information to save """ + CoNLL.write_doc2conll( + doc=article.get_conllu_info(), + filename=article.get_file_path(kind=ArtifactType.STANZA_CONLLU), + ) def from_conllu(self, article: Article) -> CoNLLUDocument: """ @@ -158,6 +254,7 @@ def from_conllu(self, article: Article) -> CoNLLUDocument: Returns: CoNLLUDocument: Document ready for parsing """ + return CoNLL.conll2doc(input_file=article.get_file_path(kind=ArtifactType.STANZA_CONLLU)) class POSFrequencyPipeline: @@ -173,11 +270,23 @@ def __init__(self, corpus_manager: CorpusManager, analyzer: LibraryWrapper) -> N corpus_manager (CorpusManager): CorpusManager instance analyzer (LibraryWrapper): Analyzer instance """ + self._corpus = corpus_manager + self._analyzer = analyzer def run(self) -> None: """ Visualize the frequencies of each part of speech. """ + for article_id, article in self._corpus.get_articles().items(): + if not article.get_file_path(kind=ArtifactType.STANZA_CONLLU).stat().st_size: + raise EmptyFileError + + from_meta(article.get_meta_file_path(), article) + article.set_pos_info(self._count_frequencies(article)) + to_meta(article) + visualize(article=article, + path_to_save=self._corpus.path_to_raw_txt_data / + f'{article_id}_image.png') def _count_frequencies(self, article: Article) -> dict[str, int]: """ @@ -189,6 +298,14 @@ def _count_frequencies(self, article: Article) -> dict[str, int]: Returns: dict[str, int]: POS frequencies """ + pos_freq = {} + for conllu_sentence in self._analyzer.from_conllu(article).sentences: + words = [word.to_dict().get('upos') for word in conllu_sentence.words] + pos_freq.update({ + word: pos_freq.get(word, 0) + words.count(word) + for word in set(words) + }) + return pos_freq class PatternSearchPipeline(PipelineProtocol): @@ -207,6 +324,17 @@ def __init__( analyzer (LibraryWrapper): Analyzer instance pos (tuple[str, ...]): Root, Dependency, Child part of speech """ + self._corpus = corpus_manager + self._analyzer = analyzer + self._node_labels = pos + + self.ideal_graph = DiGraph() + self.ideal_graph.add_nodes_from( + (index, {'label': label}) + for index, label in enumerate(self._node_labels) + ) + self.ideal_graph.add_edges_from((index, index + 1) + for index in range(len(self._node_labels) - 1)) def _make_graphs(self, doc: CoNLLUDocument) -> list[DiGraph]: """ @@ -218,6 +346,25 @@ def _make_graphs(self, doc: CoNLLUDocument) -> list[DiGraph]: Returns: list[DiGraph]: Graphs for the sentences in the document """ + graphs = [] + for conllu_sent in doc.sentences: + digraph = DiGraph() + for word in conllu_sent.words: + word = word.to_dict() + digraph.add_node( + word['id'], + label=word['upos'], + text=word['text'], + ) + + 
+                digraph.add_edge(
+                    word['head'],
+                    word['id'],
+                    label=word["deprel"]
+                )
+
+            graphs.append(digraph)
+        return graphs
 
     def _add_children(
         self, graph: DiGraph, subgraph_to_graph: dict, node_id: int, tree_node: TreeNode
@@ -231,6 +378,21 @@ def _add_children(
             node_id (int): ID of root node of the match
             tree_node (TreeNode): Root node of the match
         """
+        children = tuple(graph.neighbors(node_id))
+        if not children or tree_node.children or node_id not in subgraph_to_graph:
+            return
+        for child_num in children:
+            if child_num not in [node_match[0]
+                                 for node_match in subgraph_to_graph.values()
+                                 if node_match]:
+                continue
+            child_info = dict(graph.nodes(data=True))[child_num]
+            child_node = TreeNode(child_info['label'],
+                                  child_info['text'],
+                                  [])
+            tree_node.children.append(child_node)
+            self._add_children(graph, subgraph_to_graph, child_num, child_node)
+        return
 
     def _find_pattern(self, doc_graphs: list) -> dict[int, list[TreeNode]]:
         """
@@ -242,17 +404,61 @@ def _find_pattern(self, doc_graphs: list) -> dict[int, list[TreeNode]]:
         Returns:
             dict[int, list[TreeNode]]: A dictionary with pattern matches
         """
+        found_patterns = {}
+        for (sentence_id, graph) in enumerate(doc_graphs):
+            matcher = GraphMatcher(graph, self.ideal_graph,
+                                   node_match=lambda n1, n2:
+                                   n1.get('label', '') == n2['label'])
+            pattern_nodes = []
+            for isograph in matcher.subgraph_isomorphisms_iter():
+                digraph = graph.subgraph(isograph.keys()).copy()
+                base_nodes = [node for node in digraph.nodes
+                              if not tuple(digraph.predecessors(node))]
+
+                for node_id in base_nodes:
+                    tree_node = TreeNode(graph.nodes[node_id].get('label'),
+                                         graph.nodes[node_id].get('text'),
+                                         [])
+                    self._add_children(graph, to_dict_of_lists(digraph), node_id, tree_node)
+                    pattern_nodes.append(tree_node)
+
+            if pattern_nodes:
+                found_patterns[int(sentence_id)] = pattern_nodes
+        return found_patterns
 
     def run(self) -> None:
         """
         Search for a pattern in documents and writes found information to JSON file.
         """
+        for article in self._corpus.get_articles().values():
+            conllu_doc = self._analyzer.from_conllu(article)
+            graphs = self._make_graphs(conllu_doc)
+            pattern_matches = self._find_pattern(graphs)
+            dict_matches = {sentence_id: [asdict(match) for match in matches]
+                            for sentence_id, matches in pattern_matches.items()}
+            article.set_patterns_info(dict_matches)
+            to_meta(article)
 
 
 def main() -> None:
     """
     Entrypoint for pipeline module.
     """
+    corpus_manager = CorpusManager(path_to_raw_txt_data=ASSETS_PATH)
+
+    pipeline = TextProcessingPipeline(corpus_manager, UDPipeAnalyzer())
+    pipeline.run()
+
+    stanza_analyzer = StanzaAnalyzer()
+    pipeline = TextProcessingPipeline(corpus_manager, stanza_analyzer)
+    pipeline.run()
+
+    visualizer_pos = POSFrequencyPipeline(corpus_manager, stanza_analyzer)
+    visualizer_pos.run()
+
+    visualizer_patterns = PatternSearchPipeline(corpus_manager, stanza_analyzer,
+                                                ("VERB", "NOUN", "ADP"))
+    visualizer_patterns.run()
 
 
 if __name__ == "__main__":
diff --git a/lab_6_pipeline/settings.json b/lab_6_pipeline/settings.json
index ee7a97c3..07e6a65b 100644
--- a/lab_6_pipeline/settings.json
+++ b/lab_6_pipeline/settings.json
@@ -1,3 +1,3 @@
 {
-    "target_score": 0
+    "target_score": 10
 }
diff --git a/requirements.txt b/requirements.txt
index 8b137891..7830d937 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,9 @@
-
+beautifulsoup4==4.12.0
+lxml==5.2.1
+matplotlib==3.8.4
+networkx==3.3
+requests==2.31.0
+spacy-conll==3.4.0
+spacy-udpipe==1.0.0
+spacy==3.7.4
+stanza==1.8.2
\ No newline at end of file