Skip to content

Commit

Permalink
Merge branch 'elastic' of github.com:webis-de/archive-query-log into …
Browse files Browse the repository at this point in the history
…elastic
  • Loading branch information
janheinrichmerker committed Apr 15, 2024
2 parents d98e59b + 218ae44 commit 948a55e
Show file tree
Hide file tree
Showing 4 changed files with 366 additions and 2 deletions.
61 changes: 61 additions & 0 deletions archive_query_log/cli/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
UrlQueryParser, UrlPageParserType, UrlPageParser, \
UrlOffsetParser, UrlOffsetParserType, WarcQueryParserType, \
WarcQueryParser, WarcSnippetsParserType, WarcSnippetsParser, \
WarcDirectAnswersParserType, WarcDirectAnswersParser, \
WarcMainContentParserType, WarcMainContentParser


Expand Down Expand Up @@ -380,6 +381,66 @@ def warc_snippets_import(config: Config, services_path: Path) -> None:
import_warc_snippets_parsers(config, services_path)


# Command group registered on `parsers`. The body is intentionally empty:
# the function only serves as the target of the group decorator, and the
# `add` and `import` commands are attached to it via
# `@warc_direct_answers.command(...)`.
@parsers.group()
def warc_direct_answers() -> None:
    pass


# Values accepted by the `--parser-type` option of the
# `warc_direct_answers add` command (passed to `Choice(...)`).
CHOICES_WARC_DIRECT_ANSWERS_PARSER_TYPE = ["xpath"]


# CLI command `add`: validates the command-line options and registers one
# WARC direct answers parser via `add_warc_direct_answers_parser`.
# (Kept as `#` comments rather than a docstring so the generated `--help`
# text stays unchanged.)
@warc_direct_answers.command("add")
@option("--provider-id", type=str)
@option("--url-pattern-regex", type=str)
@option("--priority", type=FloatRange(min=0, min_open=False))
@option(
    "--parser-type",
    type=Choice(CHOICES_WARC_DIRECT_ANSWERS_PARSER_TYPE),
    required=True,
)
@option("--xpath", type=str)
@option("--url-xpath", type=str)
@option("--text-xpath", type=str)
@pass_config
def warc_direct_answers_add(
        config: Config,
        provider_id: str | None,
        url_pattern_regex: str | None,
        priority: float | None,
        parser_type: str,
        xpath: str | None,
        url_xpath: str | None,
        text_xpath: str | None,
) -> None:
    # Function-local import, executed first (before any validation).
    from archive_query_log.parsers.warc_direct_answers import (
        add_warc_direct_answers_parser,
    )

    # Guard clauses: reject anything that is not the "xpath" parser type,
    # then require the XPath expression that this parser type needs.
    if parser_type != "xpath":
        raise ValueError(f"Invalid parser type: {parser_type}")
    if xpath is None:
        raise UsageError("No XPath given.")

    # Narrow the plain `str` option value to the literal parser type.
    strict_parser_type: WarcDirectAnswersParserType = "xpath"

    # Initialise the parser document via the configured Elasticsearch
    # client before adding the new entry.
    WarcDirectAnswersParser.init(using=config.es.client)
    add_warc_direct_answers_parser(
        config=config,
        provider_id=provider_id,
        url_pattern_regex=url_pattern_regex,
        priority=priority,
        parser_type=strict_parser_type,
        xpath=xpath,
        url_xpath=url_xpath,
        text_xpath=text_xpath,
    )


# CLI command `import`: loads WARC direct answers parsers from a services
# YAML file (default: data/selected-services.yaml).
#
# The decorator stack previously had no function underneath it, so the
# decorators attached to the next definition in the file instead of to an
# `import` command. The function below gives them their intended target.
# (Kept as `#` comments rather than a docstring so the generated `--help`
# text is not affected.)
@warc_direct_answers.command("import")
@option("-s", "--services-file", "services_path",
        type=PathType(path_type=Path, exists=True, file_okay=True,
                      dir_okay=False, readable=True, resolve_path=True,
                      allow_dash=False),
        default=Path("data") / "selected-services.yaml")
@pass_config
def warc_direct_answers_import(config: Config, services_path: Path) -> None:
    # NOTE(review): assumes `import_warc_direct_answers_parsers` exists in
    # `archive_query_log.imports.yaml`, mirroring the
    # `import_warc_snippets_parsers(config, services_path)` call used by
    # the snippets import command -- confirm the helper name and module
    # path. The import is function-local so that a missing helper only
    # affects this command, not loading of the whole CLI module.
    from archive_query_log.imports.yaml import \
        import_warc_direct_answers_parsers
    import_warc_direct_answers_parsers(config, services_path)


# Command group registered on `parsers`. The body is intentionally empty:
# the function only serves as the target of the group decorator.
@parsers.group()
def warc_main_content() -> None:
    pass
Expand Down
4 changes: 2 additions & 2 deletions archive_query_log/namespaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,6 @@
NAMESPACE_AQL, "warc_snippets_parser")
# Namespace UUIDs, each derived with uuid5 from NAMESPACE_AQL and a fixed
# name string. The name strings determine the resulting UUIDs, so they
# must not be altered.
NAMESPACE_WARC_MAIN_CONTENT_PARSER = uuid5(
    NAMESPACE_AQL, "warc_main_content_parser")
# NOTE(review): both the singular (`..._DIRECT_ANSWER_PARSER`) and the
# plural (`..._DIRECT_ANSWERS_PARSER`) constants appear here with different
# name strings, hence different UUIDs. This looks like the before/after of
# a rename -- confirm which one is meant to remain.
NAMESPACE_WARC_DIRECT_ANSWER_PARSER = uuid5(
    NAMESPACE_AQL, "warc_direct_answer_parser")
NAMESPACE_WARC_DIRECT_ANSWERS_PARSER = uuid5(
    NAMESPACE_AQL, "warc_direct_answers_parser")
NAMESPACE_WARC_DOWNLOADER = uuid5(NAMESPACE_AQL, "warc_downloader")
40 changes: 40 additions & 0 deletions archive_query_log/orm.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,16 @@ class Snippet(SnippetId):
text: str | None = Text()


# Inner document holding only the identifier of a direct answer.
# `Serp.warc_direct_answers` is declared as a nested list of this type.
class DirectAnswerId(InnerDocument):
    # Identifier, mapped as a keyword field.
    id: str = Keyword()


# Direct answer document: inherits `id` from DirectAnswerId and adds the
# fields below.
class DirectAnswer(DirectAnswerId):
    # Required content, mapped as a full-text field.
    content: str = Text()
    # Optional URL, mapped as a keyword field.
    url: str | None = Keyword()
    # Optional text, mapped as a full-text field.
    text: str | None = Text()


class Serp(BaseDocument):
archive: InnerArchive = Object(InnerArchive)
provider: InnerProvider = Object(InnerProvider)
Expand All @@ -208,6 +218,8 @@ class Serp(BaseDocument):
warc_query_parser: InnerParser | None = Object(InnerParser)
warc_snippets: list[SnippetId] | None = Nested(SnippetId)
warc_snippets_parser: InnerParser | None = Object(InnerParser)
warc_direct_answers: list[DirectAnswerId] | None = Nested(DirectAnswerId)
warc_direct_answers_parser: InnerParser | None = Object(InnerParser)

# rendered_warc_location: WarcLocation | None = Object(WarcLocation)
# rendered_warc_downloader: InnerDownloader | None = (
Expand Down Expand Up @@ -437,6 +449,34 @@ class Index:
}


# Closed set of parser type names for WARC direct answers parsers;
# used as the type of `WarcDirectAnswersParser.parser_type`.
WarcDirectAnswersParserType = Literal["xpath"]


class WarcDirectAnswersParser(BaseDocument):
    """Stored configuration of one WARC direct answers parser.

    Documents of this type live in the ``aql_warc_direct_answers_parsers``
    index (see the nested ``Index`` class).
    """

    # Optional provider reference.
    provider: InnerProviderId | None = Object(InnerProviderId)
    # Optional regular expression source, compiled by `url_pattern`.
    url_pattern_regex: str | None = Keyword()
    # Optional priority, mapped as a rank feature with positive score impact.
    priority: float | None = RankFeature(positive_score_impact=True)
    # Parser type name; "xpath" is the only declared value.
    parser_type: WarcDirectAnswersParserType = Keyword()
    # XPath expressions, all optional at the mapping level.
    xpath: str | None = Keyword()
    url_xpath: str | None = Keyword()
    text_xpath: str | None = Keyword()

    @cached_property
    def url_pattern(self) -> Pattern | None:
        """Return the compiled form of ``url_pattern_regex`` (cached).

        Raises:
            ValueError: If ``url_pattern_regex`` is ``None``.
        """
        # NOTE(review): the annotation allows ``None``, but a missing regex
        # raises instead of returning ``None`` -- confirm which is intended.
        regex = self.url_pattern_regex
        if regex is None:
            raise ValueError("No URL pattern regex.")
        return pattern(regex)

    class Index:
        name = "aql_warc_direct_answers_parsers"
        settings = {"number_of_shards": 1, "number_of_replicas": 2}


# Closed set of parser type names for WARC main content parsers.
WarcMainContentParserType = Literal["resiliparse"]
Expand Down
Loading

0 comments on commit 948a55e

Please sign in to comment.