PIP-1639-docker-to-input (#74)
ottojolanki authored Nov 15, 2021
1 parent e3f42fd commit 087baba
Showing 33 changed files with 304 additions and 36 deletions.
86 changes: 63 additions & 23 deletions .circleci/config.yml
@@ -17,7 +17,7 @@ python_defaults: &python_defaults

machine_defaults: &machine_defaults
machine:
image: ubuntu-1604:202007-01
image: ubuntu-2004:202107-02
working_directory: ~/long-read-rna-pipeline

commands:
@@ -27,15 +27,16 @@ commands:
- run:
command: |
echo "export TAG=encodedcc/${CIRCLE_PROJECT_REPONAME}:${CIRCLE_BRANCH}_${CIRCLE_WORKFLOW_ID}" >> ${BASH_ENV}
echo "export CROMWELL=./cromwell-49.jar" >> ${BASH_ENV}
echo "export WOMTOOL=./womtool-49.jar" >> ${BASH_ENV}
echo "export CROMWELL=./cromwell-70.jar" >> ${BASH_ENV}
echo "export WOMTOOL=./womtool-70.jar" >> ${BASH_ENV}
source ${BASH_ENV}
install_python_requirements:
description: "Install pytest workflow requirements"
steps:
- run: pyenv install 3.7.8
- run: pyenv global 3.7.8
- run: pip install --upgrade pip
- run: pip install --upgrade pip==21.3.1
- run: pip install -r requirements.txt

download_file:
@@ -52,9 +53,9 @@ commands:
- set_env_variables
- install_python_requirements
- download_file:
file_url: "https://github.com/broadinstitute/cromwell/releases/download/49/cromwell-49.jar"
file_url: "https://github.com/broadinstitute/cromwell/releases/download/70/cromwell-70.jar"
- download_file:
file_url: "https://github.com/broadinstitute/cromwell/releases/download/49/womtool-49.jar"
file_url: "https://github.com/broadinstitute/cromwell/releases/download/70/womtool-70.jar"

run_test_tag:
description: "Run tagged test"
@@ -68,6 +69,39 @@
command: pytest -v --tag <<parameters.tag>> --wt <<parameters.wt>> --symlink --kwd
no_output_timeout: 60m

add_docker_image_tag_to_inputs:
description: "Add docker tag"
parameters:
path_to_inputs:
type: string
steps:
- run:
command: |
for json in <<parameters.path_to_inputs>>
do
prefix=$(basename $json .json)
cat $json | jq ".+{\"${prefix}.docker\": \"${TAG}\"}" > tmp.json
cp tmp.json $json
rm tmp.json
done
add_runtime_environment_to_input:
description: "Add runtime environment"
parameters:
path_to_input:
type: string
prefix:
type: string
steps:
- run:
command: |
json=<<parameters.path_to_input>>
prefix=<<parameters.prefix>>
cat $json | jq ".+{\"${prefix}.runtime_environment\": {\"docker\": \"${TAG}\", \"singularity\": \"\"}}" > tmp.json
cp tmp.json $json
rm tmp.json
# Jobs
jobs:
build:
@@ -111,6 +145,8 @@ jobs:
steps:
- checkout
- prepare_pytest_environment
- add_docker_image_tag_to_inputs:
path_to_inputs: "test/unit/json/*.json"
- run_test_tag:
tag: "unit"
wt: "4"
@@ -120,6 +156,15 @@
steps:
- checkout
- prepare_pytest_environment
- add_runtime_environment_to_input:
path_to_input: "test/integration/json/test_concatenate_files.json"
prefix: "test_concatenate_files.concatenate_files"
- add_runtime_environment_to_input:
path_to_input: "test/integration/json/test_crop_reference_fasta_headers.json"
prefix: "test_crop_reference_fasta_headers.crop_reference_fasta_headers"
- add_runtime_environment_to_input:
path_to_input: "test/integration/json/test_make_gtf_from_spikein_fasta.json"
prefix: "test_make_gtf_from_spikein_fasta.make_gtf_from_spikein_fasta"
- run_test_tag:
tag: "integration"
wt: "4"
@@ -129,6 +174,8 @@
steps:
- checkout
- prepare_pytest_environment
- run: test/add_docker_image_tag_to_input.sh test/functional/json/test_pipeline_with_spikeins.json long_read_rna_pipeline
- run: test/add_docker_image_tag_to_input.sh test/functional/json/test_pipeline_with_two_spikeins.json long_read_rna_pipeline
- run_test_tag:
tag: "functional"
wt: "4"
@@ -138,6 +185,7 @@
steps:
- checkout
- prepare_pytest_environment
- run: test/add_docker_image_tag_to_input.sh test/test_workflow/json/test_workflow_input.json long_read_rna_pipeline
- run_test_tag:
tag: "workflow_one_replicate"
wt: "4"
@@ -147,6 +195,7 @@
steps:
- checkout
- prepare_pytest_environment
- run: test/add_docker_image_tag_to_input.sh test/test_workflow/json/test_workflow_2reps_input.json long_read_rna_pipeline
- run_test_tag:
tag: "workflow_two_replicates"
wt: "4"
@@ -158,6 +207,7 @@
- prepare_pytest_environment
- run:
command: |
test/add_docker_image_tag_to_input.sh test/test_task/test_minimap2_input.json test_minimap2
test/caper_run.sh test/test_task/test_minimap2.wdl test/test_task/test_minimap2_input.json
python3 src/compare_md5.py --keys_to_inspect test_minimap2.skipNfirstlines.output_file test_minimap2.minimap2.mapping_qc \
--metadata_json metadata.json \
@@ -174,6 +224,7 @@
- prepare_pytest_environment
- run:
command: |
test/add_docker_image_tag_to_input.sh test/test_task/test_transcriptclean_input.json test_transcriptclean
test/caper_run.sh test/test_task/test_transcriptclean.wdl test/test_task/test_transcriptclean_input.json
python3 src/compare_md5.py \
--keys_to_inspect test_transcriptclean.transcriptclean.transcript_log \
@@ -187,30 +238,14 @@
python3 -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data['match_overall']))" < test_transcriptclean_input.result.json
no_output_timeout: 30m

test_filter_transcriptclean:
<<: *machine_defaults
steps:
- checkout
- prepare_pytest_environment
- run:
command: |
test/caper_run.sh test/test_task/test_filter_transcriptclean.wdl test/test_task/test_filter_transcriptclean_input.json
python3 src/compare_md5.py \
--keys_to_inspect test_filter_transcriptclean.skipNfirstlines.output_file \
--metadata_json metadata.json \
--reference_json test/test_task/test_filter_transcriptclean_reference_md5.json \
--outfile test_filter_transcriptclean_input.result.json
cat test_filter_transcriptclean_input.result.json
python3 -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data['match_overall']))" < test_filter_transcriptclean_input.result.json
no_output_timeout: 30m

test_init_talon_database:
<<: *machine_defaults
steps:
- checkout
- prepare_pytest_environment
- run:
command: |
test/add_docker_image_tag_to_input.sh test/test_task/test_init_talon_db_input.json test_init_talon_db
test/caper_run.sh test/test_task/test_init_talon_db.wdl test/test_task/test_init_talon_db_input.json
python3 src/compare_md5.py \
--keys_to_inspect test_init_talon_db.init_talon_db.database test_init_talon_db.init_talon_db.talon_inputs \
@@ -228,6 +263,7 @@
- prepare_pytest_environment
- run:
command: |
test/add_docker_image_tag_to_input.sh test/test_task/test_init_talon_db_idxprefix_input.json test_init_talon_db
test/caper_run.sh test/test_task/test_init_talon_db.wdl test/test_task/test_init_talon_db_idxprefix_input.json
python3 src/compare_md5.py \
--keys_to_inspect test_init_talon_db.init_talon_db.database test_init_talon_db.init_talon_db.talon_inputs \
@@ -245,6 +281,7 @@
- prepare_pytest_environment
- run:
command: |
test/add_docker_image_tag_to_input.sh test/test_task/test_talon_input.json test_talon
test/caper_run.sh test/test_task/test_talon.wdl test/test_task/test_talon_input.json
python3 src/compare_md5.py \
--keys_to_inspect test_talon.talon.talon_log \
@@ -262,6 +299,7 @@
- prepare_pytest_environment
- run:
command: |
test/add_docker_image_tag_to_input.sh test/test_task/test_create_abundance_from_talon_db_input.json test_create_abundance_from_talon_db
test/caper_run.sh test/test_task/test_create_abundance_from_talon_db.wdl test/test_task/test_create_abundance_from_talon_db_input.json
python3 src/compare_md5.py \
--keys_to_inspect test_create_abundance_from_talon_db.create_abundance_from_talon_db.talon_abundance \
@@ -280,6 +318,7 @@
- prepare_pytest_environment
- run:
command: |
test/add_docker_image_tag_to_input.sh test/test_task/test_create_gtf_from_talon_db_input.json test_create_gtf_from_talon_db
test/caper_run.sh test/test_task/test_create_gtf_from_talon_db.wdl test/test_task/test_create_gtf_from_talon_db_input.json
python3 src/compare_md5.py \
--keys_to_inspect test_create_gtf_from_talon_db.create_gtf_from_talon_db.gtf \
@@ -297,6 +336,7 @@
- prepare_pytest_environment
- run:
command: |
test/add_docker_image_tag_to_input.sh test/test_task/test_calculate_spearman_input.json test_calculate_spearman
test/caper_run.sh test/test_task/test_calculate_spearman.wdl test/test_task/test_calculate_spearman_input.json
python3 src/compare_md5.py \
--keys_to_inspect test_calculate_spearman.calculate_spearman.spearman \
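
For orientation, here is a minimal sketch of what the new `add_docker_image_tag_to_inputs` and `add_runtime_environment_to_input` commands do to a test input JSON. The file name `test_example.json` and the tag value are placeholders, not taken from the repository:

```sh
# Placeholder tag; in CI this is exported by set_env_variables as $TAG.
TAG="encodedcc/long-read-rna-pipeline:mybranch_workflow-id"

# add_docker_image_tag_to_inputs: merge a "<prefix>.docker" key into the input JSON,
# where <prefix> is the file name without the .json extension.
prefix=$(basename test_example.json .json)   # -> "test_example"
jq ".+{\"${prefix}.docker\": \"${TAG}\"}" test_example.json > tmp.json && mv tmp.json test_example.json

# add_runtime_environment_to_input: merge a "<prefix>.runtime_environment" object instead,
# with the docker image filled in and singularity left empty.
jq ".+{\"${prefix}.runtime_environment\": {\"docker\": \"${TAG}\", \"singularity\": \"\"}}" \
  test_example.json > tmp.json && mv tmp.json test_example.json

# If test_example.json started out as {}, it now contains:
# {
#   "test_example.docker": "encodedcc/long-read-rna-pipeline:mybranch_workflow-id",
#   "test_example.runtime_environment": {
#     "docker": "encodedcc/long-read-rna-pipeline:mybranch_workflow-id",
#     "singularity": ""
#   }
# }
```

The task-level jobs instead call `test/add_docker_image_tag_to_input.sh <input.json> <prefix>`; that script is not shown in this diff, but judging from its arguments it presumably performs the same kind of jq merge with an explicitly supplied prefix.
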
5 changes: 2 additions & 3 deletions Dockerfile
@@ -60,14 +60,13 @@ RUN echo "r <- getOption('repos'); r['CRAN'] <- 'https://cloud.r-project.org'; o
Rscript -e "install.packages('reshape2')"

# Install TC dependencies
RUN python3.7 -m pip install --upgrade pip
RUN python3.7 -m pip install --upgrade pip==21.3.1
RUN python3.7 -m pip install cython
RUN python3.7 -m pip install pybedtools==0.8.0 pyfasta==0.5.2 numpy pandas

# splice junction finding accessory script from TC still runs in python2 and requires pyfasta, which in turn requires numpy

RUN python -m pip install --upgrade pip
RUN python -m pip install pyfasta==0.5.2 numpy
RUN python -m pip install pyfasta==0.5.2 numpy==1.16.6

# Install qc-utils to python 3.7

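As a quick sanity check of the pinned versions, one could run something like the following against the built image (the image tag is a placeholder for whatever `$TAG` the CI build produced):

```sh
# Placeholder image tag; substitute the tag produced by the CI build.
# Expect pip 21.3.1 for python3.7 and numpy 1.16.6 for the python2 interpreter.
docker run --rm encodedcc/long-read-rna-pipeline:placeholder_tag sh -c \
  "python3.7 -m pip --version && python -c 'import numpy; print(numpy.__version__)'"
```
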
1 change: 1 addition & 0 deletions docs/reference.md
@@ -84,6 +84,7 @@ The following elaborates on the meaning of each line in the input file.
* `long_read_rna_pipeline.annotation_name` Annotation name in the initial TALON database. This is an internal metadata variable that you typically do not need to touch.
* `long_read_rna_pipeline.talon_prefixes` This is a list of strings that, if provided, will be used as prefixes for the transcript names in the gtf generated by `create_gtf_from_talon_db`. If this is not defined, "TALON" will be the default prefix. Note that if this list is defined, its length must be equal to the number of replicates.
* `long_read_rna_pipeline.canonical_only` If this option is set to true, TranscriptClean will only output transcripts that are either canonical or that contain annotated noncanonical junctions to the clean SAM and Fasta files at the end of the run. Set this parameter to false to output all transcripts.
* `long_read_rna_pipeline.docker` and `long_read_rna_pipeline.singularity` These should not need to be touched, unless you are hosting your own images, in which case you probably know what to do.
In addition, if you have used spikeins in your experiment, you can define them as an array of gzipped fasta files in the workflow-level variable `long_read_rna_pipeline.spikeins`.
The rest of the variables are for adjusting the computational resources of the pipeline tasks. See [notes about resources](reference.md#note-about-resources) below for more details.

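For illustration, a hypothetical way to override the image-related input described above; the image name is a placeholder, and normally these keys are left untouched:

```sh
# Hypothetical excerpt of a pipeline input JSON overriding the docker image.
cat << 'EOF' > image_override_excerpt.json
{
  "long_read_rna_pipeline.docker": "myregistry/long-read-rna-pipeline:custom"
}
EOF
# A corresponding "long_read_rna_pipeline.singularity" key exists for Singularity runs;
# the new CI command above simply sets it to an empty string.
```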