diff --git a/kubernetes/loculus/templates/loculus-preprocessing-config.yaml b/kubernetes/loculus/templates/loculus-preprocessing-config.yaml index 41df4adfe1..cd3101d64d 100644 --- a/kubernetes/loculus/templates/loculus-preprocessing-config.yaml +++ b/kubernetes/loculus/templates/loculus-preprocessing-config.yaml @@ -1,12 +1,14 @@ -{{- range $key, $values := (.Values.organisms | default .Values.defaultOrganisms) }} -{{- if $values.preprocessing.configFile }} +{{- range $organism, $organismConfig := (.Values.organisms | default .Values.defaultOrganisms) }} +{{- range $processingIndex, $processingConfig := $organismConfig.preprocessing }} +{{- if $processingConfig.configFile }} --- apiVersion: v1 kind: ConfigMap metadata: - name: loculus-preprocessing-config-{{ $key }} + name: loculus-preprocessing-config-{{ $organism }}-v{{ $processingConfig.version }}-{{ $processingIndex }} data: preprocessing-config.yaml: | - {{- $values.preprocessing.configFile | toYaml | nindent 4 -}} + {{- $processingConfig.configFile | toYaml | nindent 4 -}} +{{- end }} {{- end }} {{- end }} \ No newline at end of file diff --git a/kubernetes/loculus/templates/loculus-preprocessing-deployment.yaml b/kubernetes/loculus/templates/loculus-preprocessing-deployment.yaml index 36595e7d51..9cfde8e7ec 100644 --- a/kubernetes/loculus/templates/loculus-preprocessing-deployment.yaml +++ b/kubernetes/loculus/templates/loculus-preprocessing-deployment.yaml @@ -3,17 +3,18 @@ "http://host.k3d.internal:8079" "http://loculus-backend-service:8079" }} -{{- $keycloakHost := .Values.environment | eq "server" | ternary +{{- $organismcloakHost := .Values.environment | eq "server" | ternary (printf "https://authentication-%s" $.Values.host) "http://loculus-keycloak-service:8083" }} {{- if not .Values.disablePreprocessing }} -{{- range $key, $value := (.Values.organisms | default .Values.defaultOrganisms) }} +{{- range $organism, $organismConfig := (.Values.organisms | default .Values.defaultOrganisms) }} +{{- range $processingIndex, $processingConfig := $organismConfig.preprocessing }} --- apiVersion: apps/v1 kind: Deployment metadata: - name: loculus-preprocessing-{{ $key }} + name: loculus-preprocessing-{{ $organism }}-v{{ $processingConfig.version }}-{{ $processingIndex }} annotations: argocd.argoproj.io/sync-options: Replace=true reloader.stakater.com/auto: "true" @@ -22,43 +23,36 @@ spec: selector: matchLabels: app: loculus - component: loculus-preprocessing-{{ $key }} + component: loculus-preprocessing-{{ $organism }}-v{{ $processingConfig.version }}-{{ $processingIndex }} template: metadata: labels: app: loculus - component: loculus-preprocessing-{{ $key }} + component: loculus-preprocessing-{{ $organism }}-v{{ $processingConfig.version }}-{{ $processingIndex }} spec: containers: - - name: preprocessing-{{ $key }} - image: {{ $value.preprocessing.image}}:{{ $dockerTag }} + - name: preprocessing-{{ $organism }} + image: {{ $processingConfig.image}}:{{ $dockerTag }} imagePullPolicy: Always args: - {{- range $arg := $value.preprocessing.args }} + {{- range $arg := $processingConfig.args }} - "{{ $arg }}" {{- end }} - - "--backend-host={{ $backendHost }}/{{ $key }}" - - "--keycloak-host={{ $keycloakHost }}" - {{- if $value.preprocessing.warnings }} - - "--withWarnings" - {{- end }} - {{- if $value.preprocessing.errors }} - - "--withErrors" - {{- end }} - {{- if $value.preprocessing.randomWarnError }} - - "--randomWarnError" - {{- end }} - {{- if $value.preprocessing.configFile }} + - "--backend-host={{ $backendHost }}/{{ $organism }}" + - "--keycloak-host={{ $organismcloakHost }}" + - "--pipeline-version={{ $processingConfig.version }}" + {{- if $processingConfig.configFile }} - "--config=/etc/config/preprocessing-config.yaml" volumeMounts: - - name: loculus-preprocessing-config-volume-{{ $key }} + - name: preprocessing-config-volume-{{ $organism }}-v{{ $processingConfig.version }}-{{ $processingIndex }} mountPath: /etc/config volumes: - - name: loculus-preprocessing-config-volume-{{ $key }} + - name: preprocessing-config-volume-{{ $organism }}-v{{ $processingConfig.version }}-{{ $processingIndex }} configMap: - name: loculus-preprocessing-config-{{ $key }} + name: loculus-preprocessing-config-{{ $organism }}-v{{ $processingConfig.version }}-{{ $processingIndex }} {{- end }} imagePullSecrets: - name: ghcr-secret {{- end }} {{- end }} +{{- end }} diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index 0fb3ca96ea..082812cdc1 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -62,12 +62,20 @@ defaultOrganisms: dateToSortBy: date partitionBy: pango_lineage preprocessing: - image: ghcr.io/loculus-project/preprocessing-dummy - args: - - "--watch" - warnings: true - errors: true - randomWarnError: true + - version: 1 + image: ghcr.io/loculus-project/preprocessing-dummy + args: + - "--watch" + - "--withWarnings" + - "--withErrors" + - "--randomWarnError" + - version: 2 + image: ghcr.io/loculus-project/preprocessing-dummy + args: + - "--watch" + - "--withWarnings" + - "--withErrors" + - "--randomWarnError" referenceGenomes: nucleotideSequences: - name: "main" @@ -146,57 +154,108 @@ defaultOrganisms: silo: dateToSortBy: collection_date preprocessing: - image: ghcr.io/loculus-project/preprocessing-nextclade - args: - - "prepro" - configFile: - version: 1 - log_level: DEBUG - nextclade_dataset_name: nextstrain/mpox/all-clades - nextclade_dataset_tag: 2024-01-16--20-31-02Z - genes: - OPG001: 247 - batch_size: 100 - processing_spec: - collection_date: - function: process_date - inputs: - date: collection_date - release_date: ncbi_release_date - required: true - ncbi_release_date: - function: parse_timestamp - inputs: - timestamp: ncbi_release_date - clade: - function: identity - inputs: - input: nextclade.clade - outbreak: - function: identity - inputs: - input: nextclade.customNodeAttributes.outbreak - lineage: - function: identity - inputs: - input: nextclade.customNodeAttributes.lineage - country: - function: identity - inputs: - input: country - required: true - author_affiliation: - function: identity - inputs: - input: author_affiliation - authors: - function: identity - inputs: - input: authors - isolate_name: - function: identity - inputs: - input: isolate_name + - version: 1 + image: ghcr.io/loculus-project/preprocessing-nextclade + args: + - "prepro" + configFile: + log_level: DEBUG + nextclade_dataset_name: nextstrain/mpox/all-clades + nextclade_dataset_tag: 2024-01-16--20-31-02Z + genes: + OPG001: 247 + batch_size: 100 + processing_spec: + collection_date: + function: process_date + inputs: + date: collection_date + release_date: ncbi_release_date + required: true + ncbi_release_date: + function: parse_timestamp + inputs: + timestamp: ncbi_release_date + clade: + function: identity + inputs: + input: nextclade.clade + outbreak: + function: identity + inputs: + input: nextclade.customNodeAttributes.outbreak + lineage: + function: identity + inputs: + input: nextclade.customNodeAttributes.lineage + country: + function: identity + inputs: + input: country + required: true + author_affiliation: + function: identity + inputs: + input: author_affiliation + authors: + function: identity + inputs: + input: authors + isolate_name: + function: identity + inputs: + input: isolate_name + - version: 2 + image: ghcr.io/loculus-project/preprocessing-nextclade + args: + - "prepro" + configFile: + log_level: DEBUG + nextclade_dataset_name: nextstrain/mpox/all-clades + nextclade_dataset_tag: 2024-01-16--20-31-02Z + genes: + OPG001: 247 + batch_size: 100 + processing_spec: + collection_date: + function: process_date + inputs: + date: collection_date + release_date: ncbi_release_date + required: true + ncbi_release_date: + function: parse_timestamp + inputs: + timestamp: ncbi_release_date + clade: + function: identity + inputs: + input: nextclade.clade + outbreak: + function: identity + inputs: + input: nextclade.customNodeAttributes.outbreak + lineage: + function: identity + inputs: + input: nextclade.customNodeAttributes.lineage + country: + function: identity + inputs: + input: country + required: true + author_affiliation: + function: identity + inputs: + input: author_affiliation + authors: + function: identity + inputs: + input: authors + isolate_name: + function: identity + inputs: + input: isolate_name referenceGenomes: nucleotideSequences: - name: "main" @@ -335,173 +394,340 @@ defaultOrganisms: silo: dateToSortBy: collection_date preprocessing: - image: ghcr.io/loculus-project/preprocessing-nextclade - args: - - "prepro" - configFile: - version: 1 - log_level: DEBUG - nextclade_dataset_name: nextstrain/ebola/zaire - nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/ebola/data_output - genes: - - NP - - VP35 - - VP40 - - GP - - sGP - - ssGP - - VP30 - - VP24 - - L - batch_size: 100 - processing_spec: - total_snps: - function: identity - inputs: - input: nextclade.totalSubstitutions - total_inserted_nucs: - function: identity - inputs: - input: nextclade.totalInsertions - total_deleted_nucs: - function: identity - inputs: - input: nextclade.totalDeletions - total_ambiguous_nucs: - function: identity - inputs: - input: nextclade.totalNonACGTNs - total_unknown_nucs: - function: identity - inputs: - input: nextclade.totalMissing - total_frame_shifts: - function: identity - inputs: - input: nextclade.totalFrameShifts - frame_shifts: - function: identity - inputs: - input: nextclade.frameShifts - completeness: - function: identity - inputs: - input: nextclade.coverage - total_stop_codons: - function: identity - inputs: - input: nextclade.qc.stopCodons.totalStopCodons - stop_codons: - function: identity - inputs: - input: nextclade.qc.stopCodons.stopCodons - collection_date: - function: process_date - inputs: - date: collection_date - release_date: ncbi_release_date - required: true - ncbi_release_date: - function: parse_timestamp - inputs: - timestamp: ncbi_release_date - country: - function: identity - inputs: - input: country - required: true - author_affiliation: - function: identity - inputs: - input: author_affiliation - authors: - function: identity - inputs: - input: authors - isolate_name: - function: identity - inputs: - input: isolate_name - submitter_country: - function: identity - inputs: - input: submitter_country - division: - function: identity - inputs: - input: division - insdc_accession_base: - function: identity - inputs: - input: insdc_accession_base - insdc_version: - function: identity - inputs: - input: insdc_version - insdc_accession_full: - function: identity - inputs: - input: insdc_accession_full - bioprojects: - function: identity - inputs: - input: bioprojects - biosample_accession: - function: identity - inputs: - input: biosample_accession - ncbi_completeness: - function: identity - inputs: - input: ncbi_completeness - ncbi_host_name: - function: identity - inputs: - input: ncbi_host_name - ncbi_host_tax_id: - function: identity - inputs: - input: ncbi_host_tax_id - ncbi_is_lab_host: - function: identity - inputs: - input: ncbi_is_lab_host - ncbi_length: - function: identity - inputs: - input: ncbi_length - ncbi_protein_count: - function: identity - inputs: - input: ncbi_protein_count - ncbi_update_date: - function: parse_timestamp - inputs: - timestamp: ncbi_update_date - ncbi_sourcedb: - function: identity - inputs: - input: ncbi_sourcedb - ncbi_virus_name: - function: identity - inputs: - input: ncbi_virus_name - ncbi_virus_tax_id: - function: identity - inputs: - input: ncbi_virus_tax_id - isolate_source: - function: identity - inputs: - input: isolate_source - sra_accessions: - function: identity - inputs: - input: sra_accessions - metadata_hash: - function: identity - inputs: - input: metadata_hash + - version: 1 + image: ghcr.io/loculus-project/preprocessing-nextclade + args: + - "prepro" + configFile: + log_level: DEBUG + nextclade_dataset_name: nextstrain/ebola/zaire + nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/ebola/data_output + genes: + - NP + - VP35 + - VP40 + - GP + - sGP + - ssGP + - VP30 + - VP24 + - L + batch_size: 100 + processing_spec: + total_snps: + function: identity + inputs: + input: nextclade.totalSubstitutions + total_inserted_nucs: + function: identity + inputs: + input: nextclade.totalInsertions + total_deleted_nucs: + function: identity + inputs: + input: nextclade.totalDeletions + total_ambiguous_nucs: + function: identity + inputs: + input: nextclade.totalNonACGTNs + total_unknown_nucs: + function: identity + inputs: + input: nextclade.totalMissing + total_frame_shifts: + function: identity + inputs: + input: nextclade.totalFrameShifts + frame_shifts: + function: identity + inputs: + input: nextclade.frameShifts + completeness: + function: identity + inputs: + input: nextclade.coverage + total_stop_codons: + function: identity + inputs: + input: nextclade.qc.stopCodons.totalStopCodons + stop_codons: + function: identity + inputs: + input: nextclade.qc.stopCodons.stopCodons + collection_date: + function: process_date + inputs: + date: collection_date + release_date: ncbi_release_date + required: true + ncbi_release_date: + function: parse_timestamp + inputs: + timestamp: ncbi_release_date + country: + function: identity + inputs: + input: country + required: true + author_affiliation: + function: identity + inputs: + input: author_affiliation + authors: + function: identity + inputs: + input: authors + isolate_name: + function: identity + inputs: + input: isolate_name + submitter_country: + function: identity + inputs: + input: submitter_country + division: + function: identity + inputs: + input: division + insdc_accession_base: + function: identity + inputs: + input: insdc_accession_base + insdc_version: + function: identity + inputs: + input: insdc_version + insdc_accession_full: + function: identity + inputs: + input: insdc_accession_full + bioprojects: + function: identity + inputs: + input: bioprojects + biosample_accession: + function: identity + inputs: + input: biosample_accession + ncbi_completeness: + function: identity + inputs: + input: ncbi_completeness + ncbi_host_name: + function: identity + inputs: + input: ncbi_host_name + ncbi_host_tax_id: + function: identity + inputs: + input: ncbi_host_tax_id + ncbi_is_lab_host: + function: identity + inputs: + input: ncbi_is_lab_host + ncbi_length: + function: identity + inputs: + input: ncbi_length + ncbi_protein_count: + function: identity + inputs: + input: ncbi_protein_count + ncbi_update_date: + function: parse_timestamp + inputs: + timestamp: ncbi_update_date + ncbi_sourcedb: + function: identity + inputs: + input: ncbi_sourcedb + ncbi_virus_name: + function: identity + inputs: + input: ncbi_virus_name + ncbi_virus_tax_id: + function: identity + inputs: + input: ncbi_virus_tax_id + isolate_source: + function: identity + inputs: + input: isolate_source + sra_accessions: + function: identity + inputs: + input: sra_accessions + metadata_hash: + function: identity + inputs: + input: metadata_hash + - version: 2 + image: ghcr.io/loculus-project/preprocessing-nextclade + args: + - "prepro" + configFile: + log_level: DEBUG + nextclade_dataset_name: nextstrain/ebola/zaire + nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/ebola/data_output + genes: + - NP + - VP35 + - VP40 + - GP + - sGP + - ssGP + - VP30 + - VP24 + - L + batch_size: 100 + processing_spec: + total_snps: + function: identity + inputs: + input: nextclade.totalSubstitutions + total_inserted_nucs: + function: identity + inputs: + input: nextclade.totalInsertions + total_deleted_nucs: + function: identity + inputs: + input: nextclade.totalDeletions + total_ambiguous_nucs: + function: identity + inputs: + input: nextclade.totalNonACGTNs + total_unknown_nucs: + function: identity + inputs: + input: nextclade.totalMissing + total_frame_shifts: + function: identity + inputs: + input: nextclade.totalFrameShifts + frame_shifts: + function: identity + inputs: + input: nextclade.frameShifts + completeness: + function: identity + inputs: + input: nextclade.coverage + total_stop_codons: + function: identity + inputs: + input: nextclade.qc.stopCodons.totalStopCodons + stop_codons: + function: identity + inputs: + input: nextclade.qc.stopCodons.stopCodons + collection_date: + function: process_date + inputs: + date: collection_date + release_date: ncbi_release_date + required: true + ncbi_release_date: + function: parse_timestamp + inputs: + timestamp: ncbi_release_date + country: + function: identity + inputs: + input: country + required: true + author_affiliation: + function: identity + inputs: + input: author_affiliation + authors: + function: identity + inputs: + input: authors + isolate_name: + function: identity + inputs: + input: isolate_name + submitter_country: + function: identity + inputs: + input: submitter_country + division: + function: identity + inputs: + input: division + insdc_accession_base: + function: identity + inputs: + input: insdc_accession_base + insdc_version: + function: identity + inputs: + input: insdc_version + insdc_accession_full: + function: identity + inputs: + input: insdc_accession_full + bioprojects: + function: identity + inputs: + input: bioprojects + biosample_accession: + function: identity + inputs: + input: biosample_accession + ncbi_completeness: + function: identity + inputs: + input: ncbi_completeness + ncbi_host_name: + function: identity + inputs: + input: ncbi_host_name + ncbi_host_tax_id: + function: identity + inputs: + input: ncbi_host_tax_id + ncbi_is_lab_host: + function: identity + inputs: + input: ncbi_is_lab_host + ncbi_length: + function: identity + inputs: + input: ncbi_length + ncbi_protein_count: + function: identity + inputs: + input: ncbi_protein_count + ncbi_update_date: + function: parse_timestamp + inputs: + timestamp: ncbi_update_date + ncbi_sourcedb: + function: identity + inputs: + input: ncbi_sourcedb + ncbi_virus_name: + function: identity + inputs: + input: ncbi_virus_name + ncbi_virus_tax_id: + function: identity + inputs: + input: ncbi_virus_tax_id + isolate_source: + function: identity + inputs: + input: isolate_source + sra_accessions: + function: identity + inputs: + input: sra_accessions + metadata_hash: + function: identity + inputs: + input: metadata_hash ingest: args: - snakemake diff --git a/preprocessing/dummy/main.py b/preprocessing/dummy/main.py index 127cec4680..1ba5dca117 100644 --- a/preprocessing/dummy/main.py +++ b/preprocessing/dummy/main.py @@ -22,7 +22,7 @@ parser.add_argument("--keycloak-password", type=str, default="dummy_preprocessing_pipeline", help="Keycloak password to use for authentication") parser.add_argument("--keycloak-token-path", type=str, default="/realms/loculus/protocol/openid-connect/token", help="Path to Keycloak token endpoint") -parser.add_argument("--pipelineVersion", type=int, default=1) +parser.add_argument("--pipeline-version", type=int, default=1) args = parser.parse_args() backendHost = args.backend_host @@ -34,7 +34,7 @@ keycloakUser = args.keycloak_user keycloakPassword = args.keycloak_password keycloakTokenPath = args.keycloak_token_path -pipelineVersion = args.pipelineVersion +pipeline_version = args.pipeline_version @dataclass @@ -62,7 +62,7 @@ def fetch_unprocessed_sequences(n: int) -> List[Sequence]: url = backendHost + "/extract-unprocessed-data" params = { "numberOfSequenceEntries": n, - "pipelineVersion": pipelineVersion + "pipelineVersion": pipeline_version } headers = {'Authorization': 'Bearer ' + get_jwt()} response = requests.post(url, data=params, headers=headers) @@ -141,7 +141,7 @@ def process(unprocessed: List[Sequence]) -> List[Sequence]: def submit_processed_sequences(processed: List[Sequence]): json_strings = [json.dumps(dataclasses.asdict(sequence)) for sequence in processed] ndjson_string = '\n'.join(json_strings) - url = backendHost + "/submit-processed-data?pipelineVersion=" + str(pipelineVersion) + url = backendHost + "/submit-processed-data?pipelineVersion=" + str(pipeline_version) headers = {'Content-Type': 'application/x-ndjson', 'Authorization': 'Bearer ' + get_jwt()} response = requests.post(url, data=ndjson_string, headers=headers) if not response.ok: diff --git a/preprocessing/nextclade/src/loculus_preprocessing/config.py b/preprocessing/nextclade/src/loculus_preprocessing/config.py index 531b2d3ac9..6eb9fe1024 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/config.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/config.py @@ -32,7 +32,7 @@ class Config: reference_length: int = 197209 batch_size: int = 5 processing_spec: dict[str, dict[str, Any]] = dataclasses.field(default_factory=dict) - version: int = 1 + pipeline_version: int = 1 def load_config_from_yaml(config_file: str, config: Config) -> Config: diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 1af6b383d6..d3b9e347f3 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -38,7 +38,7 @@ def fetch_unprocessed_sequences(n: int, config: Config) -> Sequence[UnprocessedEntry]: url = config.backend_host.rstrip("/") + "/extract-unprocessed-data" logging.debug(f"Fetching {n} unprocessed sequences from {url}") - params = {"numberOfSequenceEntries": n, "pipelineVersion": config.version} + params = {"numberOfSequenceEntries": n, "pipelineVersion": config.pipeline_version} headers = {"Authorization": "Bearer " + get_jwt(config)} response = requests.post(url, data=params, headers=headers, timeout=10) if not response.ok: @@ -309,7 +309,7 @@ def submit_processed_sequences(processed: Sequence[ProcessedEntry], config: Conf "Content-Type": "application/x-ndjson", "Authorization": "Bearer " + get_jwt(config), } - params = {"pipelineVersion": config.version} + params = {"pipelineVersion": config.pipeline_version} response = requests.post(url, data=ndjson_string, headers=headers, params=params, timeout=10) if not response.ok: with open("failed_submission.json", "w", encoding="utf-8") as f: