Skip to content

Commit

Permalink
Fix code
Browse files Browse the repository at this point in the history
  • Loading branch information
janheinrichmerker committed Nov 24, 2023
1 parent 34e0cf0 commit 27ffc43
Show file tree
Hide file tree
Showing 7 changed files with 20 additions and 14 deletions.
22 changes: 13 additions & 9 deletions archive_query_log/downloaders/warc.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from datetime import datetime
from itertools import chain
from typing import Iterable, Iterator, Final, TypeVar, Generic, Type
from typing import Iterable, Iterator, TypeVar, Generic, Type, Callable
from uuid import uuid5

from click import echo
Expand All @@ -23,7 +24,7 @@


class _WrapperArcWarcRecord(ArcWarcRecord, Generic[_T]):
wrapped: Final[_T]
wrapped: _T

def __init__(self, wrapped: _T, record: ArcWarcRecord):
super().__init__(
Expand Down Expand Up @@ -153,6 +154,14 @@ class _ResultArcWarcRecord(_WrapperArcWarcRecord[Result]):
pass


def _capture_timestamp_distance(
        timestamp: datetime) -> Callable[[CdxCapture], float]:
    """Build a function that measures a capture's distance to ``timestamp``.

    The returned callable accepts a capture and yields the absolute
    difference, in seconds, between the capture's ``timestamp`` attribute
    and the reference ``timestamp`` given here. It is used in this module
    as the ``key`` argument of ``min`` to pick the nearest capture.
    """

    def _seconds_apart(candidate: CdxCapture) -> float:
        # The sign of the difference is irrelevant; only the gap matters.
        delta = timestamp - candidate.timestamp
        return abs(delta).total_seconds()

    return _seconds_apart


def _download_result_warc(
config: Config,
result: Result,
Expand All @@ -176,7 +185,7 @@ def _download_result_warc(
match_type=CdxMatchType.EXACT,
to_timestamp=capture_timestamp,
),
key=lambda capture: abs(capture_timestamp - capture.timestamp),
key=_capture_timestamp_distance(capture_timestamp),
default=None,
)
nearest_result_capture_after_serp: CdxCapture | None = min(
Expand All @@ -185,7 +194,7 @@ def _download_result_warc(
match_type=CdxMatchType.EXACT,
from_timestamp=capture_timestamp,
),
key=lambda capture: abs(capture_timestamp - capture.timestamp),
key=_capture_timestamp_distance(capture_timestamp),
default=None,
)
if nearest_result_capture_before_serp is None:
Expand Down Expand Up @@ -251,11 +260,6 @@ def download_results_warc(config: Config) -> None:
for result in changed_results
)

for record in result_records:
print(record.rec_headers["WARC-Record-Id"])

return

# Write to S3.
stored_records: Iterator[WarcS3Record] = (
config.s3.warc_store.write(result_records))
Expand Down
2 changes: 2 additions & 0 deletions archive_query_log/orm.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ class BaseDocument(Document):
# We could use a different field for that and use this one for the last
# modified date.

# pylint: disable=redefined-builtin
# noinspection PyShadowingBuiltins
def __init__(self, id: str | None = None, **kwargs):
if id is not None:
if "meta" not in kwargs:
Expand Down
2 changes: 1 addition & 1 deletion archive_query_log/parsers/url_offset.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def add_url_offset_parser(
config: Config,
provider_id: str,
url_pattern_regex: str | None,
priority: int | None,
priority: float | None,
parser_type: UrlOffsetParserType,
parameter: str | None,
segment: int | None,
Expand Down
2 changes: 1 addition & 1 deletion archive_query_log/parsers/url_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def add_url_page_parser(
config: Config,
provider_id: str,
url_pattern_regex: str | None,
priority: int | None,
priority: float | None,
parser_type: UrlPageParserType,
parameter: str | None,
segment: int | None,
Expand Down
2 changes: 1 addition & 1 deletion archive_query_log/parsers/url_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def add_url_query_parser(
config: Config,
provider_id: str,
url_pattern_regex: str | None,
priority: int | None,
priority: float | None,
parser_type: UrlQueryParserType,
parameter: str | None,
segment: int | None,
Expand Down
2 changes: 1 addition & 1 deletion archive_query_log/parsers/warc_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def add_warc_query_parser(
config: Config,
provider_id: str,
url_pattern_regex: str | None,
priority: int | None,
priority: float | None,
parser_type: WarcQueryParserType,
xpath: str | None,
remove_pattern_regex: str | None,
Expand Down
2 changes: 1 addition & 1 deletion archive_query_log/parsers/warc_snippets.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def add_warc_snippets_parser(
config: Config,
provider_id: str,
url_pattern_regex: str | None,
priority: int | None,
priority: float | None,
parser_type: WarcSnippetsParserType,
xpath: str | None,
url_xpath: str | None,
Expand Down

0 comments on commit 27ffc43

Please sign in to comment.