Commit cbef803

Remove nextclade.qc data from preprocessing as not included in dataset. Update docs.

anna-parker committed May 16, 2024 · 1 parent e18eb1e
Showing 3 changed files with 19 additions and 21 deletions.
16 changes: 0 additions & 16 deletions kubernetes/loculus/values.yaml

```diff
@@ -1705,12 +1705,6 @@ defaultOrganisms:
         - name: completeness
           type: float
           header: "Alignment states and QC metrics"
-        - name: total_stop_codons
-          type: int
-          header: "Alignment states and QC metrics"
-        - name: stop_codons
-          type: string
-          header: "Alignment states and QC metrics"
     website:
       tableColumns:
         - collection_date
@@ -1844,16 +1838,6 @@ defaultOrganisms:
           type: float
           inputs:
             input: nextclade.coverage
-        total_stop_codons:
-          function: identity
-          args:
-            type: int
-          inputs:
-            input: nextclade.qc.stopCodons.totalStopCodons
-        stop_codons:
-          function: identity
-          inputs:
-            input: nextclade.qc.stopCodons.stopCodons
         collection_date:
           function: process_date
           inputs:
```
8 changes: 8 additions & 0 deletions preprocessing/nextclade/README.md

````diff
@@ -60,3 +60,11 @@ docker run -it --platform=linux/amd64 --network host --rm nextclade_processing p
 
 - Install Ruff to lint/format
 - Use `mypy` to check types: `mypy -p src --python-version 3.12`
+
+When deployed on Kubernetes, the preprocessing pipeline reads config files generated from `loculus/kubernetes/loculus/templates/loculus-preprocessing-config.yaml`. When run locally, the pipeline uses only the default values defined in `preprocessing/nextclade/src/loculus_preprocessing/config.py`, so it makes sense to create a local config file and pass it to the pipeline as follows:
+
+```
+prepro --config-file=preprocessing-config.yaml --keep-tmp-dir
+```
+
+Additionally, the `--keep-tmp-dir` flag is useful for debugging issues.
````
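A local `preprocessing-config.yaml` would typically override only a handful of the defaults from `config.py`. Every key below is illustrative; the authoritative field names are the attributes of the config class in `preprocessing/nextclade/src/loculus_preprocessing/config.py`:

```yaml
# Hypothetical example only — check config.py for the real field names.
organism: example-organism
backend_host: http://localhost:8079
keycloak_host: http://localhost:8083
```

Keeping the file next to where you run `prepro` means the `--config-file` flag needs no path prefix.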
16 changes: 11 additions & 5 deletions preprocessing/nextclade/src/loculus_preprocessing/prepro.py

```diff
@@ -123,7 +123,7 @@ def enrich_with_nextclade(
     os.makedirs(os.path.dirname(input_file), exist_ok=True)
     with open(input_file, "w", encoding="utf-8") as f:
         for id, seg_dict in unaligned_nucleotide_sequences.items():
-            if segment in seg_dict:
+            if segment in seg_dict and seg_dict[segment] is not None:
                 f.write(f">{id}\n")
                 f.write(f"{seg_dict[segment]}\n")
@@ -133,7 +133,7 @@
             f"--output-all={result_dir_seg}",
             f"--input-dataset={dataset_dir_seg}",
             f"--output-translations={
-                result_dir}/nextclade.cds_translation.{{cds}}.fasta",
+                result_dir_seg}/nextclade.cds_translation.{{cds}}.fasta",
             "--jobs=1",
             "--",
             f"{input_file}",
```
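The `is not None` guard in the first hunk matters because a sequence entry may carry a segment key with a null value; without the check, the literal string `None` would be written into the FASTA handed to nextclade. A minimal sketch of that filtering logic (function and variable names here are illustrative, not the pipeline's actual API):

```python
from io import StringIO


def write_segment_fasta(unaligned_nucleotide_sequences: dict, segment: str, out) -> int:
    """Write one FASTA record per sequence that has a non-null entry
    for the given segment; return the number of records written."""
    written = 0
    for seq_id, seg_dict in unaligned_nucleotide_sequences.items():
        # Skip sequences that lack the segment or carry an explicit null,
        # mirroring the `is not None` guard added in this commit.
        if segment in seg_dict and seg_dict[segment] is not None:
            out.write(f">{seq_id}\n{seg_dict[segment]}\n")
            written += 1
    return written
```

The second hunk is a plain bug fix: each segment's translations now land in that segment's own result directory (`result_dir_seg`) instead of the shared `result_dir`, so parallel segments no longer overwrite each other's output.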
```diff
@@ -315,9 +315,15 @@ def process_single(
                 )
                 continue
             input_data[arg_name] = unprocessed.inputMetadata[input_path]
-        processing_result = ProcessingFunctions.call_function(
-            spec.function, spec.args, input_data, output_field
-        )
+        try:
+            processing_result = ProcessingFunctions.call_function(
+                spec.function, spec.args, input_data, output_field
+            )
+        except:
+            print(spec)
+            print(input_data)
+            raise Exception("processing failed")
+
         errors.extend(processing_result.errors)
         warnings.extend(processing_result.warnings)
         output_metadata[output_field] = processing_result.datum
```
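The new `try`/`except` surfaces the failing spec and inputs, but as committed it uses a bare `except`, prints to stdout, and discards the original traceback by raising a fresh `Exception`. A more idiomatic variant (a sketch only; the helper name and log messages are assumptions, not the pipeline's code) would log the context and chain the original error with `raise ... from`:

```python
import logging

logger = logging.getLogger(__name__)


def call_with_context(func, spec, args, input_data, output_field):
    """Call a processing function, attaching the spec and inputs to any failure."""
    try:
        return func(spec, args, input_data, output_field)
    except Exception as e:
        # Log the failing spec and inputs, then re-raise with the original
        # exception chained so the full traceback survives for debugging.
        logger.error("Processing failed for %s with inputs %s", spec, input_data)
        raise RuntimeError(f"Processing failed for field {output_field}") from e
```

Chaining via `from e` keeps the underlying `ValueError`/`KeyError` visible in the stack trace, which the committed `raise Exception("processing failed")` loses.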
