From 2da7caaf5c75dd5b18289194a46f4e77ddb43a7d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Julian=20Kl=C3=BCber?=
Date: Wed, 10 Apr 2024 15:42:55 +0200
Subject: [PATCH] added direct answers

---
Review notes (the diff below differs from the originally committed change,
so the post-image blob ids on the "index" lines are no longer exact):
- cli: import add_warc_direct_answers_parser from parsers.warc_direct_answers
  instead of parsers.warc_snippets.
- cli: drop the --priority and --title-xpath options of the "add" command;
  they had no matching callback parameter, so click raised a TypeError on
  every invocation.
- yaml import: iterate results_parsers directly instead of enumerate(...),
  whose (index, dict) tuples cannot be indexed with string keys.

 archive_query_log/cli/parsers.py  | 68 ++++++++++++++++++++++++++++++
 archive_query_log/imports/yaml.py | 81 ++++++++++++++++++++++++++++++++++++
 2 files changed, 149 insertions(+)

diff --git a/archive_query_log/cli/parsers.py b/archive_query_log/cli/parsers.py
index c6e371e..8637eaf 100644
--- a/archive_query_log/cli/parsers.py
+++ b/archive_query_log/cli/parsers.py
@@ -9,6 +9,7 @@
     UrlQueryParser, UrlPageParserType, UrlPageParser, \
     UrlOffsetParser, UrlOffsetParserType, WarcQueryParserType, \
     WarcQueryParser, WarcSnippetsParserType, WarcSnippetsParser, \
+    WarcDirectAnswersParserType, WarcDirectAnswersParser, \
     WarcMainContentParserType, WarcMainContentParser
 
 
@@ -380,6 +381,73 @@ def warc_snippets_import(config: Config, services_path: Path) -> None:
     import_warc_snippets_parsers(config, services_path)
 
 
+@parsers.group()
+def warc_direct_answers() -> None:
+    pass
+
+
+CHOICES_WARC_DIRECT_ANSWERS_PARSER_TYPE = [
+    "xpath",
+]
+
+
+# NOTE: every option declared here needs a same-named parameter on the
+# function below, because click passes all declared options to the callback
+# as keyword arguments.
+@warc_direct_answers.command("add")
+@option("--provider-id", type=str)
+@option("--url-pattern-regex", type=str)
+@option("--parser-type",
+        type=Choice(CHOICES_WARC_DIRECT_ANSWERS_PARSER_TYPE), required=True)
+@option("--xpath", type=str)
+@option("--url-xpath", type=str)
+@option("--text-xpath", type=str)
+@pass_config
+def warc_direct_answers_add(
+        config: Config,
+        provider_id: str | None,
+        url_pattern_regex: str | None,
+        parser_type: str,
+        xpath: str | None,
+        url_xpath: str | None,
+        text_xpath: str | None,
+) -> None:
+    # Lazy import from the direct answers parser module, the same module
+    # that archive_query_log/imports/yaml.py imports this function from.
+    from archive_query_log.parsers.warc_direct_answers import \
+        add_warc_direct_answers_parser
+    parser_type_strict: WarcDirectAnswersParserType
+    if parser_type == "xpath":
+        parser_type_strict = "xpath"
+        if xpath is None:
+            raise UsageError("No XPath given.")
+    else:
+        raise ValueError(f"Invalid parser type: {parser_type}")
+    WarcDirectAnswersParser.init(using=config.es.client)
+    add_warc_direct_answers_parser(
+        config=config,
+        provider_id=provider_id,
+        url_pattern_regex=url_pattern_regex,
+        parser_type=parser_type_strict,
+        xpath=xpath,
+        url_xpath=url_xpath,
+        text_xpath=text_xpath,
+    )
+
+
+@warc_direct_answers.command("import")
+@option("-s", "--services-file", "services_path",
+        type=PathType(path_type=Path, exists=True, file_okay=True,
+                      dir_okay=False, readable=True, resolve_path=True,
+                      allow_dash=False),
+        default=Path("data") / "selected-services.yaml")
+@pass_config
+def warc_direct_answers_import(config: Config, services_path: Path) -> None:
+    from archive_query_log.imports.yaml import import_warc_direct_answers_parsers
+    WarcDirectAnswersParser.init(using=config.es.client)
+    import_warc_direct_answers_parsers(config, services_path)
+
+
 @parsers.group()
 def warc_main_content() -> None:
     pass
diff --git a/archive_query_log/imports/yaml.py b/archive_query_log/imports/yaml.py
index a3ffd0b..caceacc 100644
--- a/archive_query_log/imports/yaml.py
+++ b/archive_query_log/imports/yaml.py
@@ -19,6 +19,7 @@
 from archive_query_log.parsers.url_query import add_url_query_parser
 from archive_query_log.parsers.warc_query import add_warc_query_parser
 from archive_query_log.parsers.warc_snippets import add_warc_snippets_parser
+from archive_query_log.parsers.warc_direct_answers import add_warc_direct_answers_parser
 from archive_query_log.parsers.xml import xpaths_from_css_selector, \
     text_xpath, merge_xpaths
 from archive_query_log.providers import add_provider
@@ -479,3 +480,83 @@ def import_warc_snippets_parsers(config: Config, services_path: Path) -> None:
                     title_xpath=title_xpath,
                     text_xpath=snippet_xpath,
                 )
+
+
+def import_warc_direct_answers_parsers(config: Config, services_path: Path) -> None:
+    """
+    Import direct answers parsers from a services YAML file.
+
+    For each service that defines both ``domains`` and ``results_parsers``,
+    every ``html_selector`` results parser is converted to XPaths and
+    registered for each provider whose domains match the service's domains.
+    """
+    echo("Load providers from services file.")
+    with services_path.open("r") as file:
+        services_list: Sequence[dict] = safe_load(file)
+    echo(f"Found {len(services_list)} service definitions.")
+
+    services: Iterable[dict] = services_list
+    # noinspection PyTypeChecker
+    services = tqdm(
+        services,
+        desc="Import parsers for providers",
+        unit="provider",
+    )
+    for service in services:
+        if ("domains" not in service or "results_parsers" not in service):
+            continue
+
+        results_parsers = service["results_parsers"]
+
+        providers = (
+            Provider.search(using=config.es.client)
+            .query(Terms(domains=service["domains"]))
+            .scan()
+        )
+        providers = safe_iter_scan(providers)
+        for provider in providers:
+            # Each entry is a mapping of parser settings; iterate the
+            # entries themselves so they can be indexed by key below.
+            for results_parser in results_parsers:
+                if results_parser["type"] != "html_selector":
+                    continue
+                results_selector = results_parser["results_selector"]
+                url_selector = results_parser.get("url_selector")
+                direct_answer_selector = results_parser.get("direct_answer_selector")
+
+                results_xpaths = xpaths_from_css_selector(results_selector)
+                results_xpaths = [
+                    "//" + result_xpath
+                    for result_xpath in results_xpaths
+                ]
+                results_xpath = merge_xpaths(results_xpaths)
+
+                if url_selector is not None:
+                    url_xpaths = xpaths_from_css_selector(url_selector)
+                    url_xpaths = [
+                        text_xpath(xpath, attribute="href")
+                        for xpath in url_xpaths
+                    ]
+                    url_xpath = merge_xpaths(url_xpaths)
+                else:
+                    url_xpath = None
+
+                if direct_answer_selector is not None:
+                    direct_answer_xpaths = xpaths_from_css_selector(direct_answer_selector)
+                    direct_answer_xpaths = [
+                        text_xpath(xpath, text=True)
+                        for xpath in direct_answer_xpaths
+                    ]
+                    direct_answer_xpath = merge_xpaths(direct_answer_xpaths)
+                else:
+                    direct_answer_xpath = None
+
+                add_warc_direct_answers_parser(
+                    config=config,
+                    provider_id=provider.meta.id,
+                    url_pattern_regex=results_parser.get("url_pattern"),
+                    parser_type="xpath",
+                    xpath=results_xpath,
+                    url_xpath=url_xpath,
+                    text_xpath=direct_answer_xpath,
+                )