diff --git a/archive_query_log/cli/archives.py b/archive_query_log/cli/archives.py index 735fcbd3..3467e719 100644 --- a/archive_query_log/cli/archives.py +++ b/archive_query_log/cli/archives.py @@ -148,7 +148,7 @@ def archive_it( echo("Load Archive-It collections.") collections_api_url = urljoin(api_url, "/api/collection") - response = config.http_session.get( + response = config.http.session.get( collections_api_url, params={"limit": 0, "format": "json"}) num_collections = int(response.headers["Total-Row-Count"]) @@ -159,7 +159,7 @@ def archive_it( unit="archives", disable=not auto_merge and not no_merge) offset_range = range(0, num_collections, page_size) for offset in offset_range: - response = config.http_session.get( + response = config.http.session.get( collections_api_url, params={"limit": page_size, "offset": offset, "format": "json"}) response_list = response.json() diff --git a/archive_query_log/cli/captures.py b/archive_query_log/cli/captures.py index ed9b5f2b..1dede4d7 100644 --- a/archive_query_log/cli/captures.py +++ b/archive_query_log/cli/captures.py @@ -33,7 +33,7 @@ def _iter_captures( ) -> Iterator[Capture]: cdx_api = CdxApi( api_url=source.archive.cdx_api_url, - session=config.http_session, + session=config.http.session, ) url = f"https://{source.provider.domain}" url = urljoin(url, source.provider.url_path_prefix) diff --git a/archive_query_log/config.py b/archive_query_log/config.py index ab59c1f6..f875e865 100644 --- a/archive_query_log/config.py +++ b/archive_query_log/config.py @@ -12,13 +12,6 @@ from archive_query_log import __version__ as version -@dataclass(frozen=True) -class EsIndex(DataClassJsonMixin): - name: str - mapping: dict - settings: dict - - @dataclass(frozen=True) class EsConfig(DataClassJsonMixin): host: str @@ -74,20 +67,11 @@ def parallel_bulk(self, actions, *args, **kwargs): @dataclass(frozen=True) -class Config(DataClassJsonMixin): - es: EsConfig - es_index_serps: EsIndex - es_index_results: EsIndex - es_index_url_query_parsers: EsIndex - es_index_url_page_parsers: EsIndex - es_index_url_offset_parsers: EsIndex - es_index_url_language_parsers: EsIndex - es_index_serp_query_parsers: EsIndex - es_index_serp_snippets_parsers: EsIndex - es_index_serp_direct_answer_parsers: EsIndex +class HttpConfig(DataClassJsonMixin): + max_retries: int = 5 @cached_property - def http_session(self) -> Session: + def session(self) -> Session: session = Session() session.headers.update({ "User-Agent": f"AQL/{version} (Webis group)", @@ -108,3 +92,9 @@ def http_session(self) -> Session: session.mount("http://", _adapter) session.mount("https://", _adapter) return session + + +@dataclass(frozen=True) +class Config(DataClassJsonMixin): + es: EsConfig + http: HttpConfig = HttpConfig() diff --git a/config.yml b/config.yml index 9b889c4e..df1d3c5f 100644 --- a/config.yml +++ b/config.yml @@ -3,161 +3,3 @@ es: port: 9200 username: null password: null -es_index_serps: - name: aql_serps - settings: - number_of_shards: 10 - number_of_replicas: 2 - mapping: {} # TODO -es_index_results: - name: aql_results - settings: - number_of_shards: 20 - number_of_replicas: 2 - mapping: {} # TODO -es_index_url_query_parsers: - name: aql_url_query_parsers - settings: - number_of_shards: 1 - number_of_replicas: 2 - mapping: - properties: - url_filter_regex: - type: keyword - parser_type: - type: keyword - parameter: - type: keyword - segment: - type: integer - replacements: - type: object - properties: - search: - type: keyword - replace: - type: keyword -es_index_url_page_parsers: - name: aql_url_page_parsers - settings: - number_of_shards: 1 - number_of_replicas: 2 - mapping: - properties: - url_filter_regex: - type: keyword - parser_type: - type: keyword - parameter: - type: keyword - segment: - type: integer - replacements: - type: object - properties: - search: - type: keyword - replace: - type: keyword -es_index_url_offset_parsers: - name: aql_url_offset_parsers - settings: - number_of_shards: 1 - number_of_replicas: 2 - mapping: - properties: - url_filter_regex: - type: keyword - parser_type: - type: keyword - parameter: - type: keyword - segment: - type: integer - replacements: - type: object - properties: - search: - type: keyword - replace: - type: keyword -es_index_url_language_parsers: - name: aql_url_language_parsers - settings: - number_of_shards: 1 - number_of_replicas: 2 - mapping: - properties: - url_filter_regex: - type: keyword - parser_type: - type: keyword - parameter: - type: keyword - segment: - type: integer - replacements: - type: object - properties: - search: - type: keyword - replace: - type: keyword -es_index_serp_query_parsers: - name: aql_serp_query_parsers - settings: - number_of_shards: 1 - number_of_replicas: 2 - mapping: - properties: - url_filter_regex: - type: keyword - parser_type: - type: keyword - html_selector: - type: keyword - html_attribute: - type: keyword - html_text: - type: boolean - replacements: - type: object - properties: - search: - type: keyword - replace: - type: keyword -es_index_serp_snippets_parsers: - name: aql_serp_snippets_parsers - settings: - number_of_shards: 1 - number_of_replicas: 2 - mapping: - properties: - url_filter_regex: - type: keyword - parser_type: - type: keyword - html_selector: - type: keyword - html_selector_url: - type: keyword - html_attribute_url: - type: keyword - html_selector_title: - type: keyword - html_selector_text: - type: keyword -es_index_serp_direct_answer_parsers: - name: aql_serp_direct_answer_parsers - settings: - number_of_shards: 1 - number_of_replicas: 2 - mapping: - properties: - url_filter_regex: - type: keyword - parser_type: - type: keyword - html_selector: - type: keyword