From 2da7caaf5c75dd5b18289194a46f4e77ddb43a7d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Julian=20Kl=C3=BCber?= <julian.klueber@uni-jena.de>
Date: Wed, 10 Apr 2024 15:42:55 +0200
Subject: [PATCH] added direct answers

---
 archive_query_log/cli/parsers.py  | 65 ++++++++++++++++++++++++++++
 archive_query_log/imports/yaml.py | 72 +++++++++++++++++++++++++++++++
 2 files changed, 137 insertions(+)

diff --git a/archive_query_log/cli/parsers.py b/archive_query_log/cli/parsers.py
index c6e371e..8637eaf 100644
--- a/archive_query_log/cli/parsers.py
+++ b/archive_query_log/cli/parsers.py
@@ -9,6 +9,7 @@
     UrlQueryParser, UrlPageParserType, UrlPageParser, \
     UrlOffsetParser, UrlOffsetParserType, WarcQueryParserType, \
     WarcQueryParser, WarcSnippetsParserType, WarcSnippetsParser, \
+    WarcDirectAnswersParserType, WarcDirectAnswersParser, \
     WarcMainContentParserType, WarcMainContentParser
 
 
@@ -380,6 +381,70 @@ def warc_snippets_import(config: Config, services_path: Path) -> None:
     import_warc_snippets_parsers(config, services_path)
 
 
+@parsers.group()
+def warc_direct_answers() -> None:
+    pass  # Command group only; the subcommands below attach to it.
+
+
+CHOICES_WARC_DIRECT_ANSWERS_PARSER_TYPE = [
+    "xpath",  # Only value accepted by the --parser-type option of "add".
+]
+
+
+@warc_direct_answers.command("add")
+@option("--provider-id", type=str)
+@option("--url-pattern-regex", type=str)
+@option("--parser-type",
+        type=Choice(CHOICES_WARC_DIRECT_ANSWERS_PARSER_TYPE), required=True)
+@option("--xpath", type=str)
+@option("--url-xpath", type=str)
+@option("--text-xpath", type=str)
+@pass_config
+def warc_direct_answers_add(
+        config: Config,
+        provider_id: str | None,
+        url_pattern_regex: str | None,
+        parser_type: str,
+        xpath: str | None,
+        url_xpath: str | None,
+        text_xpath: str | None,
+) -> None:
+    """Add a WARC direct answers parser."""
+    # Every @option above must match a parameter: click passes them all.
+    from archive_query_log.parsers.warc_direct_answers import \
+        add_warc_direct_answers_parser
+    parser_type_strict: WarcDirectAnswersParserType
+    if parser_type == "xpath":
+        parser_type_strict = "xpath"
+        if xpath is None:
+            raise UsageError("No XPath given.")
+    else:
+        raise ValueError(f"Invalid parser type: {parser_type}")
+    WarcDirectAnswersParser.init(using=config.es.client)
+    add_warc_direct_answers_parser(
+        config=config,
+        provider_id=provider_id,
+        url_pattern_regex=url_pattern_regex,
+        parser_type=parser_type_strict,
+        xpath=xpath,
+        url_xpath=url_xpath,
+        text_xpath=text_xpath,
+    )
+
+
+@warc_direct_answers.command("import")
+@option("-s", "--services-file", "services_path",
+        type=PathType(path_type=Path, exists=True, file_okay=True,
+                      dir_okay=False, readable=True, resolve_path=True,
+                      allow_dash=False),
+        default=Path("data") / "selected-services.yaml")  # relative path
+@pass_config
+def warc_direct_answers_import(config: Config, services_path: Path) -> None:
+    from archive_query_log.imports.yaml import import_warc_direct_answers_parsers
+    WarcDirectAnswersParser.init(using=config.es.client)
+    import_warc_direct_answers_parsers(config, services_path)
+
+
 @parsers.group()
 def warc_main_content() -> None:
     pass
diff --git a/archive_query_log/imports/yaml.py b/archive_query_log/imports/yaml.py
index a3ffd0b..caceacc 100644
--- a/archive_query_log/imports/yaml.py
+++ b/archive_query_log/imports/yaml.py
@@ -19,6 +19,7 @@
 from archive_query_log.parsers.url_query import add_url_query_parser
 from archive_query_log.parsers.warc_query import add_warc_query_parser
 from archive_query_log.parsers.warc_snippets import add_warc_snippets_parser
+from archive_query_log.parsers.warc_direct_answers import add_warc_direct_answers_parser
 from archive_query_log.parsers.xml import xpaths_from_css_selector, \
     text_xpath, merge_xpaths
 from archive_query_log.providers import add_provider
@@ -479,3 +480,74 @@ def import_warc_snippets_parsers(config: Config, services_path: Path) -> None:
                     title_xpath=title_xpath,
                     text_xpath=snippet_xpath,
                 )
+
+
+def import_warc_direct_answers_parsers(config: Config, services_path: Path) -> None:
+    """Add XPath direct answers parsers from the services YAML file."""
+    echo("Load providers from services file.")
+    with services_path.open("r") as file:
+        services_list: Sequence[dict] = safe_load(file)
+    echo(f"Found {len(services_list)} service definitions.")
+
+    # noinspection PyTypeChecker
+    services: Iterable[dict] = tqdm(
+        services_list,
+        desc="Import parsers for providers",
+        unit="provider",
+    )
+    for service in services:
+        # Skip services that define no domains or no results parsers.
+        if "domains" not in service or "results_parsers" not in service:
+            continue
+        results_parsers = service["results_parsers"]
+        providers = safe_iter_scan(
+            Provider.search(using=config.es.client)
+            .query(Terms(domains=service["domains"]))
+            .scan()
+        )
+        for provider in providers:
+            # Iterate the parser dicts directly: enumerate() would yield
+            # (index, dict) tuples and break the key lookups below.
+            for results_parser in results_parsers:
+                if results_parser["type"] != "html_selector":
+                    continue
+                results_selector = results_parser["results_selector"]
+                url_selector = results_parser.get("url_selector")
+                direct_answer_selector = results_parser.get("direct_answer_selector")
+
+                results_xpaths = xpaths_from_css_selector(results_selector)
+                results_xpaths = [
+                    "//" + result_xpath
+                    for result_xpath in results_xpaths
+                ]
+                results_xpath = merge_xpaths(results_xpaths)
+
+                if url_selector is not None:
+                    url_xpaths = xpaths_from_css_selector(url_selector)
+                    url_xpaths = [
+                        text_xpath(xpath, attribute="href")
+                        for xpath in url_xpaths
+                    ]
+                    url_xpath = merge_xpaths(url_xpaths)
+                else:
+                    url_xpath = None
+
+                if direct_answer_selector is not None:
+                    direct_answer_xpaths = xpaths_from_css_selector(direct_answer_selector)
+                    direct_answer_xpaths = [
+                        text_xpath(xpath, text=True)
+                        for xpath in direct_answer_xpaths
+                    ]
+                    direct_answer_xpath = merge_xpaths(direct_answer_xpaths)
+                else:
+                    direct_answer_xpath = None
+
+                add_warc_direct_answers_parser(
+                    config=config,
+                    provider_id=provider.meta.id,
+                    url_pattern_regex=results_parser.get("url_pattern"),
+                    parser_type="xpath",
+                    xpath=results_xpath,
+                    url_xpath=url_xpath,
+                    text_xpath=direct_answer_xpath,
+                )