From 6ceb82b1c09febef8391669f405515f8b8884eb2 Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Wed, 15 Nov 2023 21:29:56 -0800
Subject: [PATCH 01/28] [GEN-973] Fix bed file-clinical file cross-validation
 (#540)

* fix bed/clinical file validation

* update bed file case for msg formatting purposes
---
 genie_registry/clinical.py |  40 ++++++++---
 tests/test_clinical.py     | 135 ++++++++++++++++++++++++-------
 2 files changed, 120 insertions(+), 55 deletions(-)

diff --git a/genie_registry/clinical.py b/genie_registry/clinical.py
index 014b0b55..e20dac75 100644
--- a/genie_registry/clinical.py
+++ b/genie_registry/clinical.py
@@ -1012,28 +1012,45 @@ def _get_dataframe(self, filePathList):
 
         return clinicaldf
 
-    def _cross_validate_bed_files_exist(self, clinicaldf) -> tuple:
+    def _cross_validate_bed_files_exist(self, clinicaldf) -> list:
         """Check that a bed file exists per SEQ_ASSAY_ID value in the clinical file"""
-        errors = ""
-        warnings = ""
         missing_files = []
-        seq_assay_ids = clinicaldf["SEQ_ASSAY_ID"].unique().tolist()
+        exception_params = {"ignore_case": True, "allow_underscore": True}
+
+        # standardize and get unique seq assay ids before searching bed files
+        seq_assay_ids = set(
+            [
+                validate.standardize_string_for_validation(sq_id, **exception_params)
+                for sq_id in clinicaldf["SEQ_ASSAY_ID"].unique()
+            ]
+        )
         for seq_assay_id in seq_assay_ids:
             bed_files = validate.parse_file_info_in_nested_list(
                 nested_list=self.ancillary_files,
                 search_str=f"{seq_assay_id}.bed",  # type: ignore[arg-type]
-                ignore_case=True,
-                allow_underscore=True,
+                **exception_params,
             )
             if not bed_files["files"]:
-                missing_files.append(f"{seq_assay_id}.bed")
+                missing_files.append(f"{seq_assay_id.upper()}.bed")
+        return missing_files
+
+    def _cross_validate_bed_files_exist_message(self, missing_bed_files: list) -> tuple:
+        """Gets the warning/error messages given the missing bed files list
 
-        if missing_files:
+        Args:
+            missing_bed_files (list): list of missing bed files
+
+        Returns:
+            tuple: error + warning
+        """
+        errors = ""
+        warnings = ""
+        if missing_bed_files:
             errors = (
                 "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. "
                 "Please update your file(s) to be consistent.\n"
-                f"Missing BED files: {', '.join(missing_files)}\n"
+                f"Missing BED files: {', '.join(missing_bed_files)}\n"
             )
         return errors, warnings
 
@@ -1087,7 +1104,10 @@ def _cross_validate(self, clinicaldf) -> tuple:
         errors_assay, warnings_assay = self._cross_validate_assay_info_has_seq(
             clinicaldf
         )
-        errors_bed, warnings_bed = self._cross_validate_bed_files_exist(clinicaldf)
+        missing_bed_files = self._cross_validate_bed_files_exist(clinicaldf)
+        errors_bed, warnings_bed = self._cross_validate_bed_files_exist_message(
+            missing_bed_files
+        )
 
         errors = errors_assay + errors_bed
         warnings = warnings_assay + warnings_bed
diff --git a/tests/test_clinical.py b/tests/test_clinical.py
index 92c4ec30..91815c97 100644
--- a/tests/test_clinical.py
+++ b/tests/test_clinical.py
@@ -1,3 +1,4 @@
+from collections import Counter
 import datetime
 import json
 from unittest import mock
@@ -1061,65 +1062,76 @@ def test__check_int_dead_consistency_inconsistent(inconsistent_df):
     )
 
 
-@pytest.mark.parametrize(
-    "test_clinical_df,test_ancillary_files,expected_error,expected_warning",
-    [
-        (
-            pd.DataFrame(
-                {"SEQ_ASSAY_ID": ["SAGE-1-1", "SAGE-SAGE-1", "SAGE-1", "SAGE-1"]}
+def get_cross_validate_bed_files_test_cases():
+    return [
+        {
+            "name": "all_match",
+            "test_clinical_df": pd.DataFrame(
+                {
+                    "SEQ_ASSAY_ID": [
+                        "SAGE-1-1",
+                        "SAGE-SAGE-1",
+                        "SAGE-1",
+                        "SAGE-1",
+                        "SaGe-1",
+                    ]
+                }
             ),
-            [
+            "test_ancillary_files": [
                 [{"name": "SAGE-SAGE-1.bed", "path": ""}],
                 [{"name": "SAGE-1-1.bed", "path": ""}],
                 [{"name": "SAGE-1.bed", "path": ""}],
             ],
-            "",
-            "",
-        ),
-        (
-            pd.DataFrame({"SEQ_ASSAY_ID": ["SAGE-1-1", "SAGE-1-2"]}),
-            [
+            "expected_missing_files": [],
+        },
+        {
+            "name": "partial_match",
+            "test_clinical_df": pd.DataFrame(
+                {"SEQ_ASSAY_ID": ["SAGE-1-1", "SAGE-1-2", "SaGe-1_1"]}
+            ),
+            "test_ancillary_files": [
                 [{"name": "SAGE-SAGE-1.bed", "path": ""}],
                 [{"name": "SAGE-1-1.bed", "path": ""}],
                 [{"name": "SAGE-1.bed", "path": ""}],
             ],
-            "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. "
-            "Please update your file(s) to be consistent.\n"
-            "Missing BED files: SAGE-1-2.bed\n",
-            "",
-        ),
-        (
-            pd.DataFrame({"SEQ_ASSAY_ID": ["SAGE-1-2", "SAGE-1-3"]}),
-            [
+            "expected_missing_files": ["SAGE-1-2.bed"],
+        },
+        {
+            "name": "no_match",
+            "test_clinical_df": pd.DataFrame(
+                {"SEQ_ASSAY_ID": ["SAGE-1-2", "SAGE-1-3", "SaGe_1_2"]}
+            ),
+            "test_ancillary_files": [
                 [{"name": "SAGE-SAGE-1.bed", "path": ""}],
                 [{"name": "SAGE-1-1.bed", "path": ""}],
                 [{"name": "SAGE-1.bed", "path": ""}],
             ],
-            "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. "
-            "Please update your file(s) to be consistent.\n"
-            "Missing BED files: SAGE-1-2.bed, SAGE-1-3.bed\n",
-            "",
-        ),
-        (
-            pd.DataFrame({"SEQ_ASSAY_ID": ["SAGE-1-2", "SAGE-1-3"]}),
-            [
+            "expected_missing_files": ["SAGE-1-2.bed", "SAGE-1-3.bed"],
+        },
+        {
+            "name": "no_bed_files",
+            "test_clinical_df": pd.DataFrame(
+                {"SEQ_ASSAY_ID": ["SAGE-1-2", "SAGE-1-3", "SAge-1_2"]}
+            ),
+            "test_ancillary_files": [
                 [{"name": "SAGE-1.txt", "path": ""}],
             ],
-            "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. "
-            "Please update your file(s) to be consistent.\n"
-            "Missing BED files: SAGE-1-2.bed, SAGE-1-3.bed\n",
-            "",
-        ),
-    ],
-    ids=["all_match", "partial_match", "no_match", "no_bed_files"],
+            "expected_missing_files": ["SAGE-1-2.bed", "SAGE-1-3.bed"],
+        },
+    ]
+
+
+@pytest.mark.parametrize(
+    "test_cases", get_cross_validate_bed_files_test_cases(), ids=lambda x: x["name"]
 )
 def test_that_cross_validate_bed_files_exist_returns_correct_msgs(
-    clin_class, test_clinical_df, test_ancillary_files, expected_error, expected_warning
+    clin_class, test_cases
 ):
-    clin_class.ancillary_files = test_ancillary_files
-    errors, warnings = clin_class._cross_validate_bed_files_exist(test_clinical_df)
-    assert errors == expected_error
-    assert warnings == expected_warning
+    clin_class.ancillary_files = test_cases["test_ancillary_files"]
+    missing_files = clin_class._cross_validate_bed_files_exist(
+        test_cases["test_clinical_df"]
+    )
+    assert Counter(test_cases["expected_missing_files"]) == Counter(missing_files)
 
 
 def test_that_cross_validate_bed_files_exist_calls_expected_methods(clin_class):
@@ -1138,29 +1150,62 @@ def test_that_cross_validate_bed_files_exist_calls_expected_methods(clin_class):
     clin_class._cross_validate_bed_files_exist(test_clinical_df)
     patch_parse_file_info.assert_called_once_with(
         nested_list=clin_class.ancillary_files,
-        search_str="SAGE-SAGE-1.bed",
+        search_str="sage-sage-1.bed",
         ignore_case=True,
         allow_underscore=True,
     )
 
 
+@pytest.mark.parametrize(
+    "missing_files,expected_error,expected_warning",
+    [
+        (
+            [],
+            "",
+            "",
+        ),
+        (
+            ["test1.bed", "test2.bed"],
+            "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. "
+            "Please update your file(s) to be consistent.\n"
+            "Missing BED files: test1.bed, test2.bed\n",
+            "",
+        ),
+    ],
+    ids=["no_missing_files", "missing_files"],
+)
+def test_that_cross_validate_bed_files_exist_message_returns_correct_msgs(
+    clin_class, missing_files, expected_error, expected_warning
+):
+    errors, warnings = clin_class._cross_validate_bed_files_exist_message(missing_files)
+    assert errors == expected_error
+    assert warnings == expected_warning
+
+
 def test_that__cross_validate_calls_expected_methods(clin_class):
     with mock.patch.object(
         Clinical, "_cross_validate_assay_info_has_seq", return_value=("", "")
     ) as patch__cross_validate_assay, mock.patch.object(
         Clinical, "_cross_validate_bed_files_exist", return_value=("", "")
-    ) as patch__cross_validate_bed:
+    ) as patch__cross_validate_bed, mock.patch.object(
+        Clinical, "_cross_validate_bed_files_exist_message", return_value=("", "")
+    ) as patch__cross_validate_bed_msg:
         clin_class._cross_validate(clinicaldf=pd.DataFrame({"something": [1]}))
         patch__cross_validate_assay.assert_called_once()
         patch__cross_validate_bed.assert_called_once()
+        patch__cross_validate_bed_msg.assert_called_once()
 
 
 def test_that__cross_validate_returns_correct_format_for_errors_warnings(clin_class):
     with mock.patch.object(
         Clinical, "_cross_validate_assay_info_has_seq", return_value=("test1", "")
     ) as patch__cross_validate_assay, mock.patch.object(
-        Clinical, "_cross_validate_bed_files_exist", return_value=("test3\n", "")
-    ) as patch__cross_validate_bed:
+        Clinical, "_cross_validate_bed_files_exist", return_value=["something_missing"]
+    ) as patch__cross_validate_bed, mock.patch.object(
+        Clinical,
+        "_cross_validate_bed_files_exist_message",
+        return_value=("test3\n", ""),
+    ) as patch__cross_validate_bed_msg:
        errors, warnings = clin_class._cross_validate(
            clinicaldf=pd.DataFrame({"something": [1]})
        )
From 246e462d7483d0d99b5e78dc26e3189258b7fc35 Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Tue, 28 Nov 2023 15:05:58 -0800
Subject: [PATCH 02/28] update annotation-tools version

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index f171542a..60e741d3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -83,6 +83,6 @@ WORKDIR /root/
 # Must move this git clone to after the install of Genie,
 # because must update cbioportal
 RUN git clone https://github.com/cBioPortal/cbioportal.git -b v5.3.19
-RUN git clone https://github.com/Sage-Bionetworks/annotation-tools.git -b 0.0.2
+RUN git clone https://github.com/Sage-Bionetworks/annotation-tools.git -b 0.0.3
 
 WORKDIR /root/Genie

From b0ee034c8d5214178f2dc259c323b2390f1c77f2 Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Mon, 11 Dec 2023 15:46:39 -0800
Subject: [PATCH 03/28] initial doc update

---
 CONTRIBUTING.md | 17 ++++++++++++++++-
 README.md       | 28 +++++++++++++++++++++++-----
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 96475f19..a08f9740 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -94,6 +94,10 @@ This package uses [semantic versioning](https://semver.org/) for releasing new v
 
 ### Testing
 
+#### Running test pipeline
+
+Make sure to run each of the [pipeline steps here](README.md#developing-locally) on the test pipeline and verify that your pipeline runs as expected. This is __not__ automatically run by Github Actions and has to be run manually.
+
 #### Running tests
 
 This package uses [`pytest`](https://pytest.org/en/latest/) to run tests. The test code is located in the [tests](./tests) subdirectory.
@@ -134,6 +138,17 @@ Follow gitflow best practices as linked above.
 1. Merge `main` back into `develop`
 1. Push `develop`
 
-### DockerHub
+### Modifying Docker
+
+Follow this section when modifying the [Dockerfile](https://github.com/Sage-Bionetworks/Genie/blob/develop/Dockerfile)
+
+1. Make sure you have your synapse config setup in your working directory
+1. ```docker build -f Dockerfile -t genie-docker .```
+1. ```docker run --rm -it -e DISABLE_SSL=true -p 4040:4040 -p 18080:18080 -v ~/.synapseConfig:/root/.synapseConfig genie-docker```
+1. Run [test code](README.md#developing-locally) relevant to the dockerfile changes to make sure changes are present and working
+1. Once changes are tested, follow [genie contributing guidelines](#developing) for adding them to the repo
+1. Once deployed to main, make sure the docker image was successfully deployed remotely (our docker image gets automatically deployed) [here](https://hub.docker.com/repository/docker/sagebionetworks/genie/builds)
+
+#### DockerHub
 
 This repository does not use github actions to push docker images. By adding the `sagebiodockerhub` github user as an Admin to this GitHub repository, we can configure an automated build in DockerHub. You can view the builds [here](https://hub.docker.com/repository/docker/sagebionetworks/genie/builds). To get admin access to the DockerHub repository, ask Sage IT to be added to the `genieadmin` DockerHub team.
diff --git a/README.md b/README.md
index 736da68a..bd285e4e 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,6 @@ genie validate data_clinical_supp_SAGE.txt SAGE
 ```
 
-
 ## Contributing
 
 Please view [contributing guide](CONTRIBUTING.md) to learn how to contribute to the GENIE package.
@@ -65,6 +64,18 @@ These are instructions on how you would develop and test the pipeline locally.
    pip install -r requirements-dev.txt
    ```
 
+If you are having trouble with the above, try installing via `pipenv`
+
+1. Specify a python version that is supported by this repo:
+
+```pipenv --python <python_version>```
+
+1. [pipenv install from requirements file](https://docs.pipenv.org/en/latest/advanced.html#importing-from-requirements-txt)
+
+1. Activate your `pipenv`:
+
+```pipenv shell```
+
 1. Configure the Synapse client to authenticate to Synapse.
    1. Create a Synapse [Personal Access token (PAT)](https://help.synapse.org/docs/Managing-Your-Account.2055405596.html#ManagingYourAccount-PersonalAccessTokens).
    1. Add a `~/.synapseConfig` file
@@ -83,33 +94,40 @@ These are instructions on how you would develop and test the pipeline locally.
 
 1. Run the different pipelines on the test project. The `--project_id syn7208886` points to the test project.
 
-   1. Validate all the files.
+   1. Validate all the files **excluding vcf files**:
 
       ```
      python bin/input_to_database.py main --project_id syn7208886 --onlyValidate
      ```
 
+   1. Validate **all** the files:
+
+      ```
+      python bin/input_to_database.py mutation --project_id syn7208886 --onlyValidate --genie_annotation_pkg ../annotation-tools
+      ```
+
    1. Process all the files aside from the mutation (maf, vcf) files. The mutation processing was split because it takes at least 2 days to process all the production mutation data. Ideally, there is a parameter to exclude or include file types to process/validate, but that is not implemented.
 
      ```
     python bin/input_to_database.py main --project_id syn7208886 --deleteOld
     ```
 
   1. Process the mutation data. Be sure to clone this repo: https://github.com/Sage-Bionetworks/annotation-tools and `git checkout` the version of the repo pinned to the [Dockerfile](https://github.com/Sage-Bionetworks/Genie/blob/main/Dockerfile). This repo houses the code that re-annotates the mutation data with genome nexus. The `--createNewMafDatabase` will create a new mutation table in the test project. This flag is necessary for production data for two main reasons:
      * During processing of mutation data, the data is appended to the data, so without creating an empty table, there will be duplicated data uploaded.
      * By design, Synapse Tables were meant to be appended to. When a Synapse Table is updated, it takes time to index the table and return results. This can cause problems for the pipeline when trying to query the mutation table. It is actually faster to create an entire new table than updating or deleting all rows and appending new rows when dealing with millions of rows.
+      * If you run this more than once on the same day, you'll run into an issue with overwriting the narrow maf table as it already exists. Be sure to rename the current narrow maf database under `Tables` in the test synapse project and try again.
 
     ```
     python bin/input_to_database.py mutation --project_id syn7208886 --deleteOld --genie_annotation_pkg ../annotation-tools --createNewMafDatabase
     ```
 
-   1. Create a consortium release. Be sure to add the `--test` parameter. Be sure to clone the cbioportal repo: https://github.com/cBioPortal/cbioportal
+   1. Create a consortium release. Be sure to add the `--test` parameter. Be sure to clone the cbioportal repo: https://github.com/cBioPortal/cbioportal and `git checkout` the version of the repo pinned to the [Dockerfile](https://github.com/Sage-Bionetworks/Genie/blob/main/Dockerfile)
 
     ```
     python bin/database_to_staging.py Jan-2017 ../cbioportal TEST --test
     ```
 
-   1. Create a public release. Be sure to add the `--test` parameter. Be sure to clone the cbioportal repo: https://github.com/cBioPortal/cbioportal
+   1. Create a public release. Be sure to add the `--test` parameter. Be sure to clone the cbioportal repo: https://github.com/cBioPortal/cbioportal and `git checkout` the version of the repo pinned to the [Dockerfile](https://github.com/Sage-Bionetworks/Genie/blob/main/Dockerfile)
 
     ```
     python bin/consortium_to_public.py Jan-2017 ../cbioportal TEST --test
     ```

From 4cb7cc1eaa1af2101991398e09bca5c54f5e5446 Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Mon, 11 Dec 2023 15:49:27 -0800
Subject: [PATCH 04/28] adjust bullet pts

---
 README.md | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index bd285e4e..1002e492 100644
--- a/README.md
+++ b/README.md
@@ -64,17 +64,15 @@ These are instructions on how you would develop and test the pipeline locally.
    pip install -r requirements-dev.txt
    ```
 
-If you are having trouble with the above, try installing via `pipenv`
+   If you are having trouble with the above, try installing via `pipenv`
 
-1. Specify a python version that is supported by this repo:
+   1. Specify a python version that is supported by this repo:
+      ```pipenv --python <python_version>```
 
-```pipenv --python <python_version>```
+   1. [pipenv install from requirements file](https://docs.pipenv.org/en/latest/advanced.html#importing-from-requirements-txt)
 
-1. [pipenv install from requirements file](https://docs.pipenv.org/en/latest/advanced.html#importing-from-requirements-txt)
-
-1. Activate your `pipenv`:
-
-```pipenv shell```
+   1. Activate your `pipenv`:
+      ```pipenv shell```
 
 1. Configure the Synapse client to authenticate to Synapse.
    1. Create a Synapse [Personal Access token (PAT)](https://help.synapse.org/docs/Managing-Your-Account.2055405596.html#ManagingYourAccount-PersonalAccessTokens).

From 83aaff1f0fd484debab6a8288117f6c70a6bffd8 Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Mon, 11 Dec 2023 15:52:04 -0800
Subject: [PATCH 05/28] standardize dockerfile url

---
 CONTRIBUTING.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index a08f9740..8be9b44d 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -140,7 +140,7 @@ Follow gitflow best practices as linked above.
 
 ### Modifying Docker
 
-Follow this section when modifying the [Dockerfile](https://github.com/Sage-Bionetworks/Genie/blob/develop/Dockerfile)
+Follow this section when modifying the [Dockerfile](https://github.com/Sage-Bionetworks/Genie/blob/main/Dockerfile):
 
 1. Make sure you have your synapse config setup in your working directory
 1. ```docker build -f Dockerfile -t genie-docker .```
From de207d65aa4e1220de114716fc02f5bec16cfae6 Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Tue, 12 Dec 2023 10:01:09 -0800
Subject: [PATCH 06/28] adjust docker cmd with relevant params

---
 CONTRIBUTING.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 8be9b44d..2520bc40 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -142,9 +142,9 @@ Follow gitflow best practices as linked above.
 
 Follow this section when modifying the [Dockerfile](https://github.com/Sage-Bionetworks/Genie/blob/main/Dockerfile):
 
-1. Make sure you have your synapse config setup in your working directory
-1. ```docker build -f Dockerfile -t genie-docker .```
-1. ```docker run --rm -it -e DISABLE_SSL=true -p 4040:4040 -p 18080:18080 -v ~/.synapseConfig:/root/.synapseConfig genie-docker```
+1. Have your synapse authentication token handy
+1. ```docker build -f Dockerfile -t <docker_image_name> .```
+1. ```docker run --rm -it -e SYNAPSE_AUTH_TOKEN=$YOUR_SYNAPSE_TOKEN <docker_image_name>```
 1. Run [test code](README.md#developing-locally) relevant to the dockerfile changes to make sure changes are present and working
 1. Once changes are tested, follow [genie contributing guidelines](#developing) for adding them to the repo
 1. Once deployed to main, make sure the docker image was successfully deployed remotely (our docker image gets automatically deployed) [here](https://hub.docker.com/repository/docker/sagebionetworks/genie/builds)

From cc081a146c05b3840fdeb755ed848c27a4766bac Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Thu, 4 Jan 2024 21:52:49 -0800
Subject: [PATCH 07/28] update java version to match gn annotator.jar file,
 change tag version on annotation-tools

---
 Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 60e741d3..0df6fe30 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -41,7 +41,7 @@ RUN apt-get update && apt-get install -y --allow-unauthenticated --no-install-re
     # texlive-generic-recommended \
     texlive-latex-extra \
     # genome nexus
-    openjdk-8-jre \
+    openjdk-11-jre \
     # This is for reticulate
     python3.8-venv && \
     apt-get clean && \
@@ -83,6 +83,6 @@ WORKDIR /root/
 # Must move this git clone to after the install of Genie,
 # because must update cbioportal
 RUN git clone https://github.com/cBioPortal/cbioportal.git -b v5.3.19
-RUN git clone https://github.com/Sage-Bionetworks/annotation-tools.git -b 0.0.3
+RUN git clone https://github.com/Sage-Bionetworks/annotation-tools.git -b 0.0.4
 
 WORKDIR /root/Genie

From 00f527e5628040418b345f65f9cb3976e366564c Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Sun, 7 Jan 2024 18:28:34 -0800
Subject: [PATCH 08/28] use newer version of annotator.jar

---
 genie/process_mutation.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/genie/process_mutation.py b/genie/process_mutation.py
index f4aa334d..d6d31d29 100644
--- a/genie/process_mutation.py
+++ b/genie/process_mutation.py
@@ -202,12 +202,14 @@ def process_mutation_workflow(
         "syn22053204",
         ifcollision="overwrite.local",
         downloadLocation=genie_config["genie_annotation_pkg"],
+        version=1, #TODO: This should pull from a config file in the future
     )
     # Genome Nexus Jar file
     syn.get(
         "syn22084320",
         ifcollision="overwrite.local",
         downloadLocation=genie_config["genie_annotation_pkg"],
+        version=13, #TODO: This should pull from a config file in the future
     )
 
     annotated_maf_path = annotate_mutation(
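The TODO comments added here point at eventually pulling these pinned entity versions from a config file instead of hard-coding them at each `syn.get` call. A hedged sketch of that direction, assuming only the `synapseclient` API shown in the patch; the config mapping itself is hypothetical, while the synapse IDs and versions are the ones pinned above.

```python
# Hypothetical config-driven version pinning; not the repo's actual approach.
import synapseclient

PINNED_ANNOTATION_RESOURCES = {
    "genome_nexus_certificate": {"synid": "syn22053204", "version": 1},
    "genome_nexus_jar": {"synid": "syn22084320", "version": 13},
}


def download_pinned_resources(syn: synapseclient.Synapse, download_dir: str) -> None:
    """Download every pinned resource at its pinned version."""
    for resource in PINNED_ANNOTATION_RESOURCES.values():
        syn.get(
            resource["synid"],
            version=resource["version"],
            ifcollision="overwrite.local",
            downloadLocation=download_dir,
        )
```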
From 5c955cd392aae9b41361a243f0d7775718133989 Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Mon, 8 Jan 2024 13:18:00 -0800
Subject: [PATCH 09/28] define error report dir

---
 genie/process_mutation.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/genie/process_mutation.py b/genie/process_mutation.py
index d6d31d29..6021b944 100644
--- a/genie/process_mutation.py
+++ b/genie/process_mutation.py
@@ -250,6 +250,7 @@ def annotate_mutation(
     """
     input_files_dir = tempfile.mkdtemp(dir=workdir)
     output_files_dir = tempfile.mkdtemp(dir=workdir)
+    error_dir = os.path.join(output_files_dir, f"{center}_error_reports")
     for mutation_file in mutation_files:
         move_mutation(mutation_file, input_files_dir)
 
@@ -262,6 +263,7 @@ def annotate_mutation(
         os.path.join(genie_annotation_pkg, "annotation_suite_wrapper.sh"),
         f"-i={input_files_dir}",
         f"-o={output_files_dir}",
+        f"-e={error_dir}",
         f"-m={merged_maf_path}",
         f"-c={center}",
         "-s=WXS",

From d7b3bd0a39c3c15fc4f3c870c070519dda509686 Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Mon, 8 Jan 2024 13:47:36 -0800
Subject: [PATCH 10/28] add error dir changes to tests

---
 tests/test_process_mutation.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/test_process_mutation.py b/tests/test_process_mutation.py
index 1e945e10..44239bca 100644
--- a/tests/test_process_mutation.py
+++ b/tests/test_process_mutation.py
@@ -104,11 +104,13 @@ def test_process_mutation_workflow(syn, genie_config):
             "syn22053204",
             ifcollision="overwrite.local",
             downloadLocation=genie_annotation_pkg,
+            version=1, #TODO: This should pull from a config file in the future
         ),
         call(
             "syn22084320",
             ifcollision="overwrite.local",
             downloadLocation=genie_annotation_pkg,
+            version=13, #TODO: This should pull from a config file in the future
         ),
     ]
     center = "SAGE"
@@ -148,6 +150,7 @@ def test_annotate_mutation():
     workdir = "working/dir/path"
     mktemp_calls = [call(dir=workdir)] * 2
     input_dir = "input/dir"
+    error_dir = "input/dir/SAGE_error_reports"
     with patch.object(
         tempfile, "mkdtemp", return_value=input_dir
     ) as patch_mktemp, patch.object(
@@ -169,6 +172,7 @@ def test_annotate_mutation():
         "annotation/pkg/path/annotation_suite_wrapper.sh",
         f"-i={input_dir}",
         f"-o={input_dir}",
+        f"-e={error_dir}",
         f"-m=input/dir/data_mutations_extended_{center}.txt",
         f"-c={center}",
         "-s=WXS",

From 2267c4a95e21a9c1b2b1588110c979e4f01b20c7 Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Mon, 8 Jan 2024 14:13:27 -0800
Subject: [PATCH 11/28] lint

---
 genie/process_mutation.py      | 4 ++--
 tests/test_process_mutation.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/genie/process_mutation.py b/genie/process_mutation.py
index 6021b944..f60e8b22 100644
--- a/genie/process_mutation.py
+++ b/genie/process_mutation.py
@@ -202,14 +202,14 @@ def process_mutation_workflow(
         "syn22053204",
         ifcollision="overwrite.local",
         downloadLocation=genie_config["genie_annotation_pkg"],
-        version=1, #TODO: This should pull from a config file in the future
+        version=1,  # TODO: This should pull from a config file in the future
     )
     # Genome Nexus Jar file
     syn.get(
         "syn22084320",
         ifcollision="overwrite.local",
         downloadLocation=genie_config["genie_annotation_pkg"],
-        version=13, #TODO: This should pull from a config file in the future
+        version=13,  # TODO: This should pull from a config file in the future
     )
 
     annotated_maf_path = annotate_mutation(
diff --git a/tests/test_process_mutation.py b/tests/test_process_mutation.py
index 44239bca..8041af03 100644
--- a/tests/test_process_mutation.py
+++ b/tests/test_process_mutation.py
@@ -104,13 +104,13 @@ def test_process_mutation_workflow(syn, genie_config):
             "syn22053204",
             ifcollision="overwrite.local",
             downloadLocation=genie_annotation_pkg,
-            version=1, #TODO: This should pull from a config file in the future
+            version=1,  # TODO: This should pull from a config file in the future
         ),
         call(
             "syn22084320",
             ifcollision="overwrite.local",
             downloadLocation=genie_annotation_pkg,
-            version=13, #TODO: This should pull from a config file in the future
+            version=13,  # TODO: This should pull from a config file in the future
         ),
     ]
     center = "SAGE"
From 94112888a2b1fa186d8e3cc35135098fb7922138 Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Tue, 9 Jan 2024 12:36:12 -0800
Subject: [PATCH 12/28] initial commit to concat and store failed annotations
 report

---
 genie/process_mutation.py      | 126 ++++++++++++++++++++-----
 tests/conftest.py              |  10 +++
 tests/test_process_mutation.py | 155 +++++++++++++++++++++++++------
 3 files changed, 244 insertions(+), 47 deletions(-)

diff --git a/genie/process_mutation.py b/genie/process_mutation.py
index f60e8b22..4ff07763 100644
--- a/genie/process_mutation.py
+++ b/genie/process_mutation.py
@@ -1,5 +1,6 @@
 """Process mutation files
 TODO deprecate this module and spread functions around"""
+from collections import namedtuple
 import logging
 import os
 import shutil
@@ -211,12 +212,12 @@ def process_mutation_workflow(
         downloadLocation=genie_config["genie_annotation_pkg"],
         version=13,  # TODO: This should pull from a config file in the future
     )
-
-    annotated_maf_path = annotate_mutation(
+    annotation_paths = create_annotation_paths(center=center, workdir=workdir)
+    annotate_mutation(
+        annotation_paths=annotation_paths,
         center=center,
         mutation_files=valid_mutation_files,
         genie_annotation_pkg=genie_config["genie_annotation_pkg"],
-        workdir=workdir,
     )
 
     maf_tableid = genie_config["vcf2maf"]
@@ -226,18 +227,110 @@ def process_mutation_workflow(
         syn=syn,
         center=center,
         maf_tableid=maf_tableid,
-        annotated_maf_path=annotated_maf_path,
+        annotated_maf_path=annotation_paths.merged_maf_path,
         flatfiles_synid=flatfiles_synid,
         workdir=workdir,
     )
 
-    return annotated_maf_path
+    full_error_report = concat_annotation_error_reports(
+        center=center,
+        input_dir=annotation_paths.error_dir,
+    )
+
+    store_annotation_error_reports(
+        full_error_report=full_error_report,
+        syn=syn,
+        errors_folder_synid=genie_config["center_config"][center]["errorsSynId"],
+        release="test",
+    )
+    return annotation_paths.merged_maf_path
+
+
+def create_annotation_paths(center: str, workdir: str) -> namedtuple:
+    """Creates the filepaths required in the annotation process
+
+    Args:
+        center (str): name of the center
+        workdir (str): work directory to create paths in
+
+    Returns:
+        namedtuple: tuple with all the paths
+    """
+    input_files_dir = tempfile.mkdtemp(dir=workdir)
+    output_files_dir = tempfile.mkdtemp(dir=workdir)
+    Filepaths = namedtuple(
+        "Filepaths",
+        ["input_files_dir", "output_files_dir", "error_dir", "merged_maf_path"],
+    )
+    annotation_paths = Filepaths(
+        input_files_dir=input_files_dir,
+        output_files_dir=output_files_dir,
+        error_dir=os.path.join(output_files_dir, f"{center}_error_reports"),
+        merged_maf_path=os.path.join(
+            output_files_dir, f"data_mutations_extended_{center}.txt"
+        ),
+    )
+    return annotation_paths
+
+
+def concat_annotation_error_reports(
+    center: str,
+    input_dir: str,
+) -> pd.DataFrame:
+    """Concatenates the annotation error reports
+
+    Args:
+        center (str): name of center associated with error report
+        input_dir (str): directory where error reports are
+
+    Returns:
+        pd.DataFrame: full annotation error report
+    """
+    error_files = os.listdir(input_dir)
+    chunk_size = 10000
+    error_reports = []
+
+    # Read and concatenate TSV files in chunks
+    for file in error_files:
+        for chunk in pd.read_csv(
+            os.path.join(input_dir, file), sep="\t", chunksize=chunk_size
+        ):
+            error_reports.append(chunk)
+    full_error_report = pd.concat(error_reports)
+    full_error_report["Center"] = center
+    return full_error_report
+
+
+def store_annotation_error_reports(
+    full_error_report: pd.DataFrame,
+    syn: Synapse,
+    errors_folder_synid: str,
+    release: str,
+) -> None:
+    """Stores the annotation error reports to synapse
+
+    Args:
+        full_error_report (pd.DataFrame): full error report to store
+        syn (synapseclient.Synapse): synapse client object
+        errors_folder_synid (str): synapse id of error report folder
+            to store reports in
+        release (str): name of the release for this processing run
+    """
+    full_error_report.to_csv("failed_annotations_report.tsv", sep="\t", index=False)
+    load.store_file(
+        syn=syn,
+        filepath="failed_annotations_report.tsv",
+        parentid=errors_folder_synid,
+        version_comment=release,
+    )
 
 
 # TODO: add to transform
 def annotate_mutation(
-    center: str, mutation_files: list, genie_annotation_pkg: str, workdir: str
-) -> str:
+    annotation_paths: namedtuple,
+    mutation_files: list,
+    genie_annotation_pkg: str,
+    center: str,
+) -> None:
     """Process vcf/maf files
 
     Args:
@@ -248,23 +341,16 @@ def annotate_mutation(
     Returns:
         Path to final maf
     """
-    input_files_dir = tempfile.mkdtemp(dir=workdir)
-    output_files_dir = tempfile.mkdtemp(dir=workdir)
-    error_dir = os.path.join(output_files_dir, f"{center}_error_reports")
-
     for mutation_file in mutation_files:
-        move_mutation(mutation_file, input_files_dir)
+        move_mutation(mutation_file, annotation_paths.input_files_dir)
 
-    merged_maf_path = os.path.join(
-        output_files_dir, f"data_mutations_extended_{center}.txt"
-    )
     annotater_cmd = [
         "bash",
         os.path.join(genie_annotation_pkg, "annotation_suite_wrapper.sh"),
-        f"-i={input_files_dir}",
-        f"-o={output_files_dir}",
-        f"-e={error_dir}",
-        f"-m={merged_maf_path}",
+        f"-i={annotation_paths.input_files_dir}",
+        f"-o={annotation_paths.output_files_dir}",
+        f"-e={annotation_paths.error_dir}",
+        f"-m={annotation_paths.merged_maf_path}",
         f"-c={center}",
         "-s=WXS",
         f"-p={genie_annotation_pkg}",
     ]
 
     subprocess.check_call(annotater_cmd)
 
-    return merged_maf_path
-
 
 # TODO: add to transform
 def append_or_createdf(dataframe: pd.DataFrame, filepath: str):
diff --git a/tests/conftest.py b/tests/conftest.py
index 4027fa3b..90c47769 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -51,6 +51,7 @@ def genie_config():
             "center": "SAGE",
             "inputSynId": "syn11601335",
             "stagingSynId": "syn11601337",
+            "errorsSynId": "syn53239079",
             "release": True,
             "mutationInCisFilter": "ON",
         },
@@ -58,6 +59,15 @@ def genie_config():
             "center": "TEST",
             "inputSynId": "syn11601340",
             "stagingSynId": "syn11601342",
+            "errorsSynId": "syn53239081",
+            "release": True,
+            "mutationInCisFilter": "ON",
+        },
+        "GOLD": {
+            "center": "GOLD",
+            "inputSynId": "syn52950860",
+            "stagingSynId": "syn52950861",
+            "errorsSynId": "syn53239077",
             "release": True,
             "mutationInCisFilter": "ON",
         },
diff --git a/tests/test_process_mutation.py b/tests/test_process_mutation.py
index 8041af03..358e6450 100644
--- a/tests/test_process_mutation.py
+++ b/tests/test_process_mutation.py
@@ -1,10 +1,15 @@
 """Test process mutation functions"""
+import os
+from collections import namedtuple
+from pandas.testing import assert_frame_equal
 import shutil
 import subprocess
 import tempfile
 from unittest.mock import patch, call
 
 import pandas as pd
+import pytest
+import synapseclient
 
 from genie import load, process_mutation
 
@@ -93,7 +98,21 @@ class TestMoveMutation:
     def test_move_mutation_maf(self):
         patch_move.assert_called_once_with(self.mutation_path, self.input_dir)
 
 
-def test_process_mutation_workflow(syn, genie_config):
+@pytest.fixture
+def annotation_paths():
+    Filepaths = namedtuple(
+        "Filepaths",
+        ["input_files_dir", "output_files_dir", "error_dir", "merged_maf_path"],
+    )
+    yield Filepaths(
+        input_files_dir="input/dir",
+        output_files_dir="input/dir",
+        error_dir="input/dir/SAGE_error_reports",
+        merged_maf_path="input/dir/data_mutations_extended_SAGE.txt",
+    )
+
+
+def test_process_mutation_workflow(syn, genie_config, annotation_paths):
     """Integration test to make sure workflow runs"""
     validfiles = pd.DataFrame(
         {"fileType": ["vcf", "maf"], "path": ["path/to/vcf", "path/to/maf"]}
@@ -115,71 +134,155 @@ def test_process_mutation_workflow(syn, genie_config, annotation_paths):
     ]
     center = "SAGE"
     workdir = "working/dir/path"
-    maf_path = "path/to/maf"
-    with patch.object(syn, "get") as patch_synget, patch.object(
-        process_mutation, "annotate_mutation", return_value=maf_path
+    sample_error_report = pd.DataFrame({"col1": [1, 2, 3], "col2": [2, 3, 4]})
+    with patch.object(
+        process_mutation, "create_annotation_paths", return_value=annotation_paths
+    ) as patch_annotation_paths, patch.object(syn, "get") as patch_synget, patch.object(
+        process_mutation, "annotate_mutation"
     ) as patch_annotation, patch.object(
         process_mutation, "split_and_store_maf"
-    ) as patch_split:
+    ) as patch_split, patch.object(
+        process_mutation,
+        "concat_annotation_error_reports",
+        return_value=sample_error_report,
+    ) as patch_concat_error, patch.object(
+        process_mutation, "store_annotation_error_reports"
+    ) as patch_store_error:
         maf = process_mutation.process_mutation_workflow(
             syn, center, validfiles, genie_config, workdir
         )
+        patch_annotation_paths.assert_called_once_with(center=center, workdir=workdir)
         patch_synget.assert_has_calls(syn_get_calls)
         patch_annotation.assert_called_once_with(
+            annotation_paths=annotation_paths,
             center=center,
             mutation_files=["path/to/vcf", "path/to/maf"],
             genie_annotation_pkg=genie_annotation_pkg,
-            workdir=workdir,
         )
         patch_split.assert_called_once_with(
             syn=syn,
             center=center,
             maf_tableid="syn22493903",
-            annotated_maf_path=maf_path,
+            annotated_maf_path=annotation_paths.merged_maf_path,
             flatfiles_synid="syn12279903",
             workdir=workdir,
         )
-        assert maf == maf_path
+        patch_concat_error.assert_called_once_with(
+            center=center,
+            input_dir=annotation_paths.error_dir,
+        )
+        patch_store_error.assert_called_once_with(
+            full_error_report=sample_error_report,
+            syn=syn,
+            errors_folder_synid="syn53239079",
+            release="test",
+        )
+        assert maf == annotation_paths.merged_maf_path
+
+
+def test_that_create_annotation_paths_returns_expected_paths(annotation_paths):
+    center = "SAGE"
+    input_dir = "input/dir"
+    workdir = "test/dir"
+
+    with patch.object(tempfile, "mkdtemp", return_value=input_dir) as patch_mktemp:
+        test_paths = process_mutation.create_annotation_paths(
+            center=center,
+            workdir=workdir,
+        )
+        mktemp_calls = [call(dir=workdir)] * 2
+        patch_mktemp.assert_has_calls(mktemp_calls)
+        assert test_paths.input_files_dir == annotation_paths.input_files_dir
+        assert test_paths.output_files_dir == annotation_paths.output_files_dir
+        assert test_paths.error_dir == annotation_paths.error_dir
+        assert test_paths.merged_maf_path == annotation_paths.merged_maf_path
+
+
+class TestAnnotationErrorReports:
+    @classmethod
+    def setup_class(cls):
+        cls.source_dir = "source_test_directory"
+        os.makedirs(cls.source_dir, exist_ok=True)
+
+        # Create sample files in the source directory
+        with open(os.path.join(cls.source_dir, "file1.tsv"), "w") as f1:
+            f1.write("col1\tcol2\tcol3\nval1\tval2\tval3\n")
+        with open(os.path.join(cls.source_dir, "file2.tsv"), "w") as f2:
+            f2.write("col1\tcol2\tcol3\nval4\tval5\tval6\n")
+
+    @classmethod
+    def teardown_class(cls):
+        shutil.rmtree(cls.source_dir)
+
+    @pytest.fixture
+    def test_error_report(self):
+        yield pd.DataFrame(
+            {
+                "col1": ["val1", "val4"],
+                "col2": ["val2", "val5"],
+                "col3": ["val3", "val6"],
+                "Center": ["SAGE", "SAGE"],
+            }
+        )
+
+    def test_concat_annotation_error_reports_returns_expected(self, test_error_report):
+        compiled_report = process_mutation.concat_annotation_error_reports(
+            input_dir="source_test_directory",
+            center="SAGE",
+        )
+        assert_frame_equal(
+            compiled_report.sort_values(by="col1").reset_index(drop=True),
+            test_error_report.sort_values(by="col1").reset_index(drop=True),
+        )
 
+    def test_store_annotation_error_reports(self, syn, test_error_report):
+        errors_folder_synid = "syn11111"
+        with patch.object(load, "store_file", return_value=None) as patch_store:
+            process_mutation.store_annotation_error_reports(
+                full_error_report=test_error_report,
+                syn=syn,
+                errors_folder_synid=errors_folder_synid,
+                release="test",
+            )
+            patch_store.assert_called_once_with(
+                syn=syn,
+                filepath="failed_annotations_report.tsv",
+                parentid=errors_folder_synid,
+                version_comment="test",
+            )
 
-def test_annotate_mutation():
+
+def test_annotate_mutation(annotation_paths):
     """Integration test, test that annotate mutation is called correctly"""
     center = "SAGE"
     mutation_files = ["path/to/vcf"]
     genie_annotation_pkg = "annotation/pkg/path"
-    workdir = "working/dir/path"
-    mktemp_calls = [call(dir=workdir)] * 2
-    input_dir = "input/dir"
-    error_dir = "input/dir/SAGE_error_reports"
-    with patch.object(
-        tempfile, "mkdtemp", return_value=input_dir
-    ) as patch_mktemp, patch.object(
-        process_mutation, "move_mutation"
-    ) as patch_move, patch.object(
+
+    with patch.object(process_mutation, "move_mutation") as patch_move, patch.object(
         subprocess, "check_call"
     ) as patch_call:
         maf_path = process_mutation.annotate_mutation(
+            annotation_paths=annotation_paths,
             center=center,
             mutation_files=mutation_files,
             genie_annotation_pkg=genie_annotation_pkg,
-            workdir=workdir,
         )
-        patch_mktemp.assert_has_calls(mktemp_calls)
-        patch_move.assert_called_once_with("path/to/vcf", input_dir)
+        patch_move.assert_called_once_with(
+            "path/to/vcf", annotation_paths.input_files_dir
+        )
         patch_call.assert_called_once_with(
             [
                 "bash",
                 "annotation/pkg/path/annotation_suite_wrapper.sh",
-                f"-i={input_dir}",
-                f"-o={input_dir}",
-                f"-e={error_dir}",
-                f"-m=input/dir/data_mutations_extended_{center}.txt",
+                f"-i={annotation_paths.input_files_dir}",
+                f"-o={annotation_paths.input_files_dir}",
+                f"-e={annotation_paths.error_dir}",
+                f"-m={annotation_paths.merged_maf_path}",
                 f"-c={center}",
                 "-s=WXS",
                 f"-p={genie_annotation_pkg}",
             ]
         )
-        assert maf_path == f"input/dir/data_mutations_extended_{center}.txt"
 
 
 def test_append_or_createdf_append():
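The new `concat_annotation_error_reports` leans on pandas' chunked reads so a large error report never has to be loaded in a single read. A tiny runnable demo of that pattern, with an illustrative file name: `read_csv(..., chunksize=N)` yields DataFrames of at most N rows each, which `pd.concat` stitches back together.

```python
import pandas as pd

# Write a small TSV to read back in chunks (file name is illustrative only).
pd.DataFrame({"id": range(5), "status": ["FAILED"] * 5}).to_csv(
    "demo_report.tsv", sep="\t", index=False
)

chunks = list(pd.read_csv("demo_report.tsv", sep="\t", chunksize=2))
print([len(chunk) for chunk in chunks])  # [2, 2, 1]

full = pd.concat(chunks)
full["Center"] = "SAGE"  # tag every row with the originating center, as the patch does
print(len(full))  # 5
```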
From 1a75eae3cf96c43598d32d2ba17f6e10ced353ba Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Tue, 9 Jan 2024 13:54:36 -0800
Subject: [PATCH 13/28] add GOLD center to test

---
 tests/test_validate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_validate.py b/tests/test_validate.py
index 4e16cfee..ef196704 100644
--- a/tests/test_validate.py
+++ b/tests/test_validate.py
@@ -455,7 +455,7 @@ def test_perform_validate(syn, genie_config):
         validate._perform_validate(syn, arg)
         patch_check_parentid.assert_called_once_with(syn=syn, parentid=arg.parentid)
         patch_get_config.assert_called_once_with(syn=syn, project_id=arg.project_id)
-        patch_check_center.assert_called_once_with(arg.center, ["SAGE", "TEST"])
+        patch_check_center.assert_called_once_with(arg.center, ["SAGE", "TEST", "GOLD"])
         patch_get_onco.assert_called_once()
         patch_validate.assert_called_once_with(
             nosymbol_check=arg.nosymbol_check, project_id=arg.project_id

From 63c2af45f0bba0f0fb54b669008f7533fa206dae Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Wed, 10 Jan 2024 15:21:48 -0800
Subject: [PATCH 14/28] remove release param based on pr comments

---
 genie/process_mutation.py      | 4 ----
 tests/test_process_mutation.py | 3 ---
 2 files changed, 7 deletions(-)

diff --git a/genie/process_mutation.py b/genie/process_mutation.py
index 4ff07763..8a1e08c2 100644
--- a/genie/process_mutation.py
+++ b/genie/process_mutation.py
@@ -241,7 +241,6 @@ def process_mutation_workflow(
         full_error_report=full_error_report,
         syn=syn,
         errors_folder_synid=genie_config["center_config"][center]["errorsSynId"],
-        release="test",
     )
     return annotation_paths.merged_maf_path
 
@@ -304,7 +303,6 @@ def store_annotation_error_reports(
     full_error_report: pd.DataFrame,
     syn: Synapse,
     errors_folder_synid: str,
-    release: str,
 ) -> None:
     """Stores the annotation error reports to synapse
 
@@ -313,14 +311,12 @@ def store_annotation_error_reports(
         syn (synapseclient.Synapse): synapse client object
         errors_folder_synid (str): synapse id of error report folder
             to store reports in
-        release (str): name of the release for this processing run
     """
     full_error_report.to_csv("failed_annotations_report.tsv", sep="\t", index=False)
     load.store_file(
         syn=syn,
         filepath="failed_annotations_report.tsv",
         parentid=errors_folder_synid,
-        version_comment=release,
     )
diff --git a/tests/test_process_mutation.py b/tests/test_process_mutation.py
index 358e6450..a1c9348f 100644
--- a/tests/test_process_mutation.py
+++ b/tests/test_process_mutation.py
@@ -175,7 +175,6 @@ def test_process_mutation_workflow(syn, genie_config, annotation_paths):
             full_error_report=sample_error_report,
             syn=syn,
             errors_folder_synid="syn53239079",
-            release="test",
         )
         assert maf == annotation_paths.merged_maf_path
 
@@ -242,13 +241,11 @@ def test_store_annotation_error_reports(self, syn, test_error_report):
                 full_error_report=test_error_report,
                 syn=syn,
                 errors_folder_synid=errors_folder_synid,
-                release="test",
             )
             patch_store.assert_called_once_with(
                 syn=syn,
                 filepath="failed_annotations_report.tsv",
                 parentid=errors_folder_synid,
-                version_comment="test",
             )

From b3d649164407ad4322ddab3842e27e491045bacd Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Thu, 11 Jan 2024 11:01:24 -0800
Subject: [PATCH 15/28] comment out version until we can pull from config file

---
 genie/process_mutation.py      | 4 ++--
 tests/test_process_mutation.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/genie/process_mutation.py b/genie/process_mutation.py
index f60e8b22..3631ef77 100644
--- a/genie/process_mutation.py
+++ b/genie/process_mutation.py
@@ -202,14 +202,14 @@ def process_mutation_workflow(
         "syn22053204",
         ifcollision="overwrite.local",
         downloadLocation=genie_config["genie_annotation_pkg"],
-        version=1,  # TODO: This should pull from a config file in the future
+        # version=1,  # TODO: This should pull from a config file in the future
     )
     # Genome Nexus Jar file
     syn.get(
         "syn22084320",
         ifcollision="overwrite.local",
         downloadLocation=genie_config["genie_annotation_pkg"],
-        version=13,  # TODO: This should pull from a config file in the future
+        # version=13,  # TODO: This should pull from a config file in the future
     )
 
     annotated_maf_path = annotate_mutation(
diff --git a/tests/test_process_mutation.py b/tests/test_process_mutation.py
index 8041af03..53c0c4c5 100644
--- a/tests/test_process_mutation.py
+++ b/tests/test_process_mutation.py
@@ -104,13 +104,13 @@ def test_process_mutation_workflow(syn, genie_config):
             "syn22053204",
             ifcollision="overwrite.local",
             downloadLocation=genie_annotation_pkg,
-            version=1,  # TODO: This should pull from a config file in the future
+            # version=1,  # TODO: This should pull from a config file in the future
         ),
         call(
             "syn22084320",
             ifcollision="overwrite.local",
             downloadLocation=genie_annotation_pkg,
-            version=13,  # TODO: This should pull from a config file in the future
+            # version=13,  # TODO: This should pull from a config file in the future
         ),
     ]
     center = "SAGE"
From 80367dbdfae4b9ce0ecb1219d9d2307f138fecae Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Thu, 11 Jan 2024 23:25:14 -0800
Subject: [PATCH 16/28] add check for narrow maf annotations vs full failed
 annotations report, move filepaths

---
 genie/process_mutation.py      | 100 +++++++++++++++++++++--------
 tests/test_process_mutation.py |  84 +++++++++++++++++++++++----
 2 files changed, 150 insertions(+), 34 deletions(-)

diff --git a/genie/process_mutation.py b/genie/process_mutation.py
index 9284cf30..b2846ffd 100644
--- a/genie/process_mutation.py
+++ b/genie/process_mutation.py
@@ -11,7 +11,7 @@
 import pandas as pd
 from synapseclient import Synapse
 
-from . import load, process_functions
+from . import extract, load, process_functions
 
 logger = logging.getLogger(__name__)
 
@@ -199,12 +199,14 @@ def process_mutation_workflow(
         logger.info("No mutation data")
         return None
     # Certificate to use GENIE Genome Nexus
+
     syn.get(
         "syn22053204",
         ifcollision="overwrite.local",
         downloadLocation=genie_config["genie_annotation_pkg"],
         # version=1,  # TODO: This should pull from a config file in the future
     )
+
     # Genome Nexus Jar file
     syn.get(
         "syn22084320",
         ifcollision="overwrite.local",
         downloadLocation=genie_config["genie_annotation_pkg"],
         # version=13,  # TODO: This should pull from a config file in the future
     )
+
     annotation_paths = create_annotation_paths(center=center, workdir=workdir)
     annotate_mutation(
         annotation_paths=annotation_paths,
@@ -227,18 +230,23 @@ def process_mutation_workflow(
         syn=syn,
         center=center,
         maf_tableid=maf_tableid,
-        annotated_maf_path=annotation_paths.merged_maf_path,
+        annotation_paths=annotation_paths,
         flatfiles_synid=flatfiles_synid,
-        workdir=workdir,
     )
 
     full_error_report = concat_annotation_error_reports(
         center=center,
         input_dir=annotation_paths.error_dir,
     )
+    check_annotation_error_reports(
+        syn=syn,
+        maf_table_synid=maf_tableid,
+        full_error_report=full_error_report,
+        center=center,
+    )
     store_annotation_error_reports(
         full_error_report=full_error_report,
+        full_error_report_path=annotation_paths.full_error_report_path,
         syn=syn,
         errors_folder_synid=genie_config["center_config"][center]["errorsSynId"],
     )
     return annotation_paths.merged_maf_path
@@ -259,7 +267,15 @@ def create_annotation_paths(center: str, workdir: str) -> namedtuple:
     output_files_dir = tempfile.mkdtemp(dir=workdir)
     Filepaths = namedtuple(
         "Filepaths",
-        ["input_files_dir", "output_files_dir", "error_dir", "merged_maf_path"],
+        [
+            "input_files_dir",
+            "output_files_dir",
+            "error_dir",
+            "merged_maf_path",
+            "full_maf_path",
+            "narrow_maf_path",
+            "full_error_report_path",
+        ],
     )
     annotation_paths = Filepaths(
         input_files_dir=input_files_dir,
@@ -268,6 +284,21 @@ def create_annotation_paths(center: str, workdir: str) -> namedtuple:
         merged_maf_path=os.path.join(
             output_files_dir, f"data_mutations_extended_{center}.txt"
         ),
+        full_maf_path=os.path.join(
+            workdir, center, "staging", f"data_mutations_extended_{center}.txt"
+        ),
+        narrow_maf_path=os.path.join(
+            workdir,
+            center,
+            "staging",
+            f"data_mutations_extended_{center}_MAF_narrow.txt",
+        ),
+        full_error_report_path=os.path.join(
+            workdir,
+            center,
+            "staging",
+            f"failed_annotations_error_report.txt",
+        ),
     )
     return annotation_paths
 
@@ -299,8 +330,36 @@ def concat_annotation_error_reports(
     return full_error_report
 
 
+def check_annotation_error_reports(
+    syn: Synapse, maf_table_synid: str, full_error_report: pd.DataFrame, center: str
+) -> None:
+    """A simple QC check to make sure our genome nexus error report's failed
+    annotations match our final processed maf table's failed annotations
+
+    Args:
+        syn (Synapse): synapse client
+        maf_table_synid (str): synapse_id of the narrow maf table
+        full_error_report (pd.DataFrame): the failed annotations error report
+        center (str): the center this is for
+    """
+    maf_table_df = extract.get_syntabledf(
+        syn=syn,
+        query_string=(
+            f"SELECT * FROM {maf_table_synid} "
+            f"WHERE Center = '{center}' AND "
+            "Annotation_Status = 'FAILED'"
+        ),
+    )
+    assert len(maf_table_df) == len(full_error_report), (
+        "Genome nexus's failed annotations error report rows don't match "
+        f"maf table's failed annotations for {center}"
+    )
+
+
 def store_annotation_error_reports(
     full_error_report: pd.DataFrame,
+    full_error_report_path: str,
     syn: Synapse,
     errors_folder_synid: str,
 ) -> None:
@@ -308,14 +367,15 @@ def store_annotation_error_reports(
 
     Args:
         full_error_report (pd.DataFrame): full error report to store
+        full_error_report_path (str): where to store the flat file of the full error report
         syn (synapseclient.Synapse): synapse client object
         errors_folder_synid (str): synapse id of error report folder
             to store reports in
     """
-    full_error_report.to_csv("failed_annotations_report.tsv", sep="\t", index=False)
+    full_error_report.to_csv(full_error_report_path, sep="\t", index=False)
     load.store_file(
         syn=syn,
-        filepath="failed_annotations_report.tsv",
+        filepath=full_error_report_path,
         parentid=errors_folder_synid,
     )
 
@@ -399,9 +459,8 @@ def split_and_store_maf(
     syn: Synapse,
     center: str,
     maf_tableid: str,
-    annotated_maf_path: str,
+    annotation_paths: namedtuple,
     flatfiles_synid: str,
-    workdir: str,
 ):
     """Separates annotated maf file into narrow and full maf and stores them
 
@@ -409,7 +468,7 @@ def split_and_store_maf(
         syn: Synapse connection
         center: Center
         maf_tableid: Mutation table synapse id
-        annotated_maf_path: Annotated maf
+        annotation_paths: filepaths in the annotation process
         flatfiles_synid: GENIE flat files folder
 
     """
@@ -418,22 +477,19 @@ def split_and_store_maf(
         for col in syn.getTableColumns(maf_tableid)
         if col["name"] != "inBED"
     ]
-    full_maf_path = os.path.join(
-        workdir, center, "staging", f"data_mutations_extended_{center}.txt"
-    )
-    narrow_maf_path = os.path.join(
-        workdir, center, "staging", f"data_mutations_extended_{center}_MAF_narrow.txt"
-    )
     maf_chunks = pd.read_csv(
-        annotated_maf_path, sep="\t", chunksize=100000, comment="#"
+        annotation_paths.merged_maf_path, sep="\t", chunksize=100000, comment="#"
     )
-
     for maf_chunk in maf_chunks:
         maf_chunk = format_maf(maf_chunk, center)
-        append_or_createdf(maf_chunk, full_maf_path)
+        append_or_createdf(maf_chunk, annotation_paths.full_maf_path)
         narrow_maf_chunk = maf_chunk[narrow_maf_cols]
-        append_or_createdf(narrow_maf_chunk, narrow_maf_path)
+        append_or_createdf(narrow_maf_chunk, annotation_paths.narrow_maf_path)
 
-    load.store_table(syn=syn, filepath=narrow_maf_path, tableid=maf_tableid)
+    load.store_table(
+        syn=syn, filepath=annotation_paths.narrow_maf_path, tableid=maf_tableid
+    )
     # Store MAF flat file into synapse
-    load.store_file(syn=syn, filepath=full_maf_path, parentid=flatfiles_synid)
+    load.store_file(
+        syn=syn, filepath=annotation_paths.full_maf_path, parentid=flatfiles_synid
+    )
diff --git a/tests/test_process_mutation.py b/tests/test_process_mutation.py
index 5fe83e4d..30049cdf 100644
--- a/tests/test_process_mutation.py
+++ b/tests/test_process_mutation.py
@@ -11,7 +11,7 @@
 import pytest
 import synapseclient
 
-from genie import load, process_mutation
+from genie import extract, load, process_mutation
 
 
 def test_format_maf():
@@ -102,13 +102,24 @@ class TestMoveMutation:
 def annotation_paths():
     Filepaths = namedtuple(
         "Filepaths",
-        ["input_files_dir", "output_files_dir", "error_dir", "merged_maf_path"],
+        [
+            "input_files_dir",
+            "output_files_dir",
+            "error_dir",
+            "merged_maf_path",
+            "narrow_maf_path",
+            "full_maf_path",
+            "full_error_report_path",
+        ],
     )
     yield Filepaths(
         input_files_dir="input/dir",
         output_files_dir="input/dir",
         error_dir="input/dir/SAGE_error_reports",
         merged_maf_path="input/dir/data_mutations_extended_SAGE.txt",
+        narrow_maf_path="input/SAGE/staging/data_mutations_extended_SAGE_MAF_narrow.txt",
+        full_maf_path="input/SAGE/staging/data_mutations_extended_SAGE.txt",
+        full_error_report_path="input/SAGE/staging/failed_annotations_report.txt",
     )
 
 
@@ -134,6 +145,7 @@ def test_process_mutation_workflow(syn, genie_config, annotation_paths):
     ]
     center = "SAGE"
     workdir = "working/dir/path"
+    maf_table_id = "syn22493903"
     sample_error_report = pd.DataFrame({"col1": [1, 2, 3], "col2": [2, 3, 4]})
     with patch.object(
         process_mutation, "create_annotation_paths", return_value=annotation_paths
@@ -146,6 +158,8 @@ def test_process_mutation_workflow(syn, genie_config, annotation_paths):
         "concat_annotation_error_reports",
         return_value=sample_error_report,
     ) as patch_concat_error, patch.object(
+        process_mutation, "check_annotation_error_reports"
+    ) as patch_check_error, patch.object(
         process_mutation, "store_annotation_error_reports"
     ) as patch_store_error:
         maf = process_mutation.process_mutation_workflow(
@@ -162,15 +176,20 @@ def test_process_mutation_workflow(syn, genie_config, annotation_paths):
         patch_split.assert_called_once_with(
             syn=syn,
             center=center,
-            maf_tableid="syn22493903",
-            annotated_maf_path=annotation_paths.merged_maf_path,
+            maf_tableid=maf_table_id,
+            annotated_maf_path=annotation_paths,
             flatfiles_synid="syn12279903",
-            workdir=workdir,
         )
         patch_concat_error.assert_called_once_with(
             center=center,
             input_dir=annotation_paths.error_dir,
         )
+        patch_check_error.assert_called_once_with(
+            syn=syn,
+            maf_table_synid=maf_table_id,
+            full_error_report=sample_error_report,
+            center=center,
+        )
         patch_store_error.assert_called_once_with(
             full_error_report=sample_error_report,
+            full_error_report_path=annotation_paths.full_error_report_path,
             syn=syn,
             errors_folder_synid="syn53239079",
         )
@@ -224,6 +243,46 @@ class TestAnnotationErrorReports:
             }
         )
 
+    def test_check_annotation_error_reports_passes_if_match(
+        self, syn, test_error_report
+    ):
+        maf_table_synid = "synZZZZ"
+        with patch.object(
+            extract, "get_syntabledf", return_value=test_error_report
+        ) as patch_get_syntabledf:
+            process_mutation.check_annotation_error_reports(
+                syn=syn,
+                maf_table_synid="synZZZZ",
+                full_error_report=test_error_report,
+                center="SAGE",
+            )
+            patch_get_syntabledf.assert_called_once_with(
+                syn=syn,
+                query_string=f"SELECT * FROM {maf_table_synid} "
+                "WHERE Center = 'SAGE' AND "
+                "Annotation_Status = 'FAILED'",
+            )
+
+    def test_check_annotation_error_reports_throws_assertion_if_no_match(
+        self, syn, test_error_report
+    ):
+        with patch.object(
+            extract,
+            "get_syntabledf",
+            return_value=pd.DataFrame({"test": [1], "test2": [2]}),
+        ):
+            with pytest.raises(
+                AssertionError,
+                match="Genome nexus's failed annotations error report rows don't match "
+                "maf table's failed annotations for SAGE",
+            ):
+                process_mutation.check_annotation_error_reports(
+                    syn=syn,
+                    maf_table_synid="synZZZZ",
+                    full_error_report=test_error_report,
+                    center="SAGE",
+                )
+
     def test_concat_annotation_error_reports_returns_expected(self, test_error_report):
         compiled_report = process_mutation.concat_annotation_error_reports(
             input_dir="source_test_directory",
@@ -236,15 +295,17 @@ class TestAnnotationErrorReports:
 
     def test_store_annotation_error_reports(self, syn, test_error_report):
         errors_folder_synid = "syn11111"
+        full_error_report_path = "test.tsv"
         with patch.object(load, "store_file", return_value=None) as patch_store:
             process_mutation.store_annotation_error_reports(
                 full_error_report=test_error_report,
+                full_error_report_path=full_error_report_path,
                 syn=syn,
                 errors_folder_synid=errors_folder_synid,
             )
             patch_store.assert_called_once_with(
                 syn=syn,
-                filepath="failed_annotations_report.tsv",
+                filepath=full_error_report_path,
                 parentid=errors_folder_synid,
             )
@@ -315,7 +376,7 @@ def test_append_or_createdf_create_file_0size():
         patch_tocsv.assert_called_once_with(temp_file.name, sep="\t", index=False)
 
 
-def test_split_and_store_maf(syn):
+def test_split_and_store_maf(syn, annotation_paths):
     """Integration test, check splitting and storing of maf functions are called"""
 
     # getTableColumns
@@ -352,21 +413,20 @@ def test_split_and_store_maf(syn):
             syn=syn,
             center=center,
             maf_tableid="sy12345",
-            annotated_maf_path=annotated_maf_path,
+            annotation_paths=annotation_paths,
             flatfiles_synid="syn2345",
-            workdir="workdir/path",
         )
         patch_getcols.assert_called_once_with("sy12345")
         patch_readcsv.assert_called_once_with(
-            annotated_maf_path, sep="\t", chunksize=100000, comment="#"
+            annotation_paths.merged_maf_path, sep="\t", chunksize=100000, comment="#"
        )
        patch_format.assert_called_once_with(exampledf, center)
        assert patch_append.call_count == 2
        patch_store_table.assert_called_once_with(
-            syn=syn, filepath=narrow_maf_path, tableid="sy12345"
+            syn=syn, filepath=annotation_paths.narrow_maf_path, tableid="sy12345"
        )
        patch_store_file.assert_called_once_with(
-            syn=syn, filepath=full_maf_path, parentid="syn2345"
+            syn=syn, filepath=annotation_paths.full_maf_path, parentid="syn2345"
        )
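A stripped-down sketch of the QC rule the new `check_annotation_error_reports` enforces: the number of FAILED rows in the narrow maf table for a center should equal the row count of the concatenated Genome Nexus error report. The DataFrames below are stand-ins for the Synapse table query result and the report; in the patch itself the table rows come from `extract.get_syntabledf`.

```python
import pandas as pd

# Stand-in for the narrow maf table filtered to Center = 'SAGE'
# and Annotation_Status = 'FAILED'.
maf_table_df = pd.DataFrame(
    {"Center": ["SAGE", "SAGE"], "Annotation_Status": ["FAILED", "FAILED"]}
)
# Stand-in for the concatenated Genome Nexus error report.
full_error_report = pd.DataFrame({"failed_variant": ["var1", "var2"]})

# Same row-count assertion the patch adds; raises AssertionError on mismatch.
assert len(maf_table_df) == len(full_error_report), (
    "Genome nexus's failed annotations error report rows don't match "
    "maf table's failed annotations for SAGE"
)
```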
From 503b21d4088b06e90b9c7c1e61474a7453f9160d Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Fri, 12 Jan 2024 00:26:43 -0800
Subject: [PATCH 17/28] fix tests

---
 tests/test_process_mutation.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_process_mutation.py b/tests/test_process_mutation.py
index 30049cdf..5895dee3 100644
--- a/tests/test_process_mutation.py
+++ b/tests/test_process_mutation.py
@@ -177,7 +177,7 @@ def test_process_mutation_workflow(syn, genie_config, annotation_paths):
             syn=syn,
             center=center,
             maf_tableid=maf_table_id,
-            annotated_maf_path=annotation_paths,
+            annotation_paths=annotation_paths,
             flatfiles_synid="syn12279903",
         )
         patch_concat_error.assert_called_once_with(
@@ -192,6 +192,7 @@ def test_process_mutation_workflow(syn, genie_config, annotation_paths):
         )
         patch_store_error.assert_called_once_with(
             full_error_report=sample_error_report,
+            full_error_report_path=annotation_paths.full_error_report_path,
             syn=syn,
             errors_folder_synid="syn53239079",
         )

From 446de5cd3516708b9ea728765aa5c5c6b861d680 Mon Sep 17 00:00:00 2001
From: Chelsea-Na <109613735+Chelsea-Na@users.noreply.github.com>
Date: Fri, 12 Jan 2024 19:47:18 -0800
Subject: [PATCH 18/28] [GEN-1018] Update __main__.py and validate.py (#543)

* Update __main__.py

Updating the -h advice for filetype, oncotree code, and other minor updates.

* Updating validate.py and __main__.py

* lint

* Update tests

---------

Co-authored-by: Thomas Yu
---
 README.md              |  2 +-
 genie/__main__.py      | 63 +++++++++++++++++++++++-------------------
 genie/validate.py      |  7 ++++-
 tests/test_validate.py |  6 ++--
 4 files changed, 46 insertions(+), 32 deletions(-)

diff --git a/README.md b/README.md
index 1002e492..555bb925 100644
--- a/README.md
+++ b/README.md
@@ -66,7 +66,7 @@ These are instructions on how you would develop and test the pipeline locally.
 
    If you are having trouble with the above, try installing via `pipenv`
 
-   1. Specify a python version that is supported by this repo:
+   1. Specify a python version that is supported by this repo:
      ```pipenv --python <python_version>```
[pipenv install from requirements file](https://docs.pipenv.org/en/latest/advanced.html#importing-from-requirements-txt)

diff --git a/genie/__main__.py b/genie/__main__.py
index 3e5ad0ec..f9d21d17 100644
--- a/genie/__main__.py
+++ b/genie/__main__.py
@@ -44,49 +44,35 @@ def build_parser():
 
     subparsers = parser.add_subparsers(
         title="commands",
-        description="The following commands are available:",
-        help='For additional help: "genie -h"',
+        description="The following commands are available: ",
+        help='For additional help use: "genie -h"',
     )
 
     parser_validate = subparsers.add_parser(
-        "validate", help="Validates GENIE file formats"
+        "validate", help="Validates GENIE file formats. "
     )
 
     parser_validate.add_argument(
         "filepath",
         type=str,
         nargs="+",
-        help="File(s) that you are validating."
-        "If you validation your clinical files and you have both sample and "
-        "patient files, you must provide both",
+        help="File(s) that you are validating. "
+        "If you have separate clinical sample and patient files, "
+        "you must provide both files when validating.",
     )
 
     parser_validate.add_argument("center", type=str, help="Contributing Centers")
 
-    parser_validate.add_argument(
-        "--format_registry_packages",
-        type=str,
-        nargs="+",
-        default=["genie_registry"],
-        help="Python package name(s) to get valid file formats from (default: %(default)s).",
-    )
-
-    parser_validate.add_argument(
-        "--oncotree_link", type=str, help="Link to oncotree code"
-    )
-
     validate_group = parser_validate.add_mutually_exclusive_group()
 
     validate_group.add_argument(
         "--filetype",
         type=str,
-        help="By default, the validator uses the filename to match "
+        help="Use the --filetype {FILETYPE} parameter to ignore filename validation. "
+        "By default, the validator uses the filename to match "
         "the file format. If your filename is incorrectly named, "
-        "it will be invalid. If you know the file format you are "
-        "validating, you can ignore the filename validation and skip "
-        "to file content validation. "
-        "Note, the filetypes with SP at "
-        "the end are for special sponsored projects.",
+        "it will be invalid. "
+        "Options: [maf, vcf, clinical, assayinfo, bed, cna, sv, seg, mutationsInCis]",
    )
 
     validate_group.add_argument(
@@ -98,18 +84,39 @@
         "to this directory.",
     )
 
+    parser_validate.add_argument(
+        "--oncotree_link",
+        type=str,
+        help="Specify an oncotree URL when validating your clinical "
+        "file "
+        "(e.g.: https://oncotree.info/api/tumorTypes/tree?version=oncotree_2021_11_02). "
+        "By default, the oncotree version used is the one specified in this entity: "
+        "syn13890902",
+    )
+
+    parser_validate.add_argument(
+        "--nosymbol-check",
+        action="store_true",
+        help="Ignores specific post-processing validation criteria related to HUGO symbols "
+        "in the structural variant and cna files.",
+    )
+
     # TODO: remove this default when private genie project is ready
     parser_validate.add_argument(
         "--project_id",
         type=str,
         default="syn3380222",
-        help="Synapse Project ID where data is stored. (default: %(default)s).",
+        help="FOR DEVELOPER USE ONLY: Synapse Project ID where data is stored. "
+        "(default: %(default)s).",
     )
 
     parser_validate.add_argument(
-        "--nosymbol-check",
-        action="store_true",
-        help="Do not check hugo symbols of fusion and cna file",
+        "--format_registry_packages",
+        type=str,
+        nargs="+",
+        default=["genie_registry"],
+        help="FOR DEVELOPER USE ONLY: Python package name(s) to get valid file formats "
+        "from (default: %(default)s).",
     )
 
     parser_validate.set_defaults(func=validate._perform_validate)
diff --git a/genie/validate.py b/genie/validate.py
index 5bf0f17a..af6de306 100644
--- a/genie/validate.py
+++ b/genie/validate.py
@@ -95,8 +95,13 @@ def validate_single_file(self, **kwargs):
             valid: Boolean value of validation status
         """
         if self.file_type not in self._format_registry:
+            allowed_filetypes = list(self._format_registry.keys())
+            error_message = (
+                "Your filename is incorrect! Please change your filename before you run the validator or specify --filetype if you are running the validator locally. "
+                f"If specifying filetype, options are: [{', '.join(allowed_filetypes)}]\n"
+            )
             valid_result_cls = example_filetype_format.ValidationResults(
-                errors="Your filename is incorrect! Please change your filename before you run the validator or specify --filetype if you are running the validator locally",
+                errors=error_message,
                 warnings="",
             )
         else:
diff --git a/tests/test_validate.py b/tests/test_validate.py
index 4e16cfee..85b9610b 100644
--- a/tests/test_validate.py
+++ b/tests/test_validate.py
@@ -162,7 +162,8 @@ def test_filetype_validate_single_file(syn):
         "----------------ERRORS----------------\n"
         "Your filename is incorrect! Please change your "
         "filename before you run the validator or specify "
-        "--filetype if you are running the validator locally"
+        "--filetype if you are running the validator locally. "
+        "If specifying filetype, options are: [wrong]\n"
     )
 
     with patch.object(FileFormat, "validateFilename", side_effect=AssertionError):
@@ -185,7 +186,8 @@ def test_wrongfiletype_validate_single_file(syn):
         "----------------ERRORS----------------\n"
         "Your filename is incorrect! Please change your "
         "filename before you run the validator or specify "
-        "--filetype if you are running the validator locally"
+        "--filetype if you are running the validator locally. "
+        "If specifying filetype, options are: [wrong]\n"
     )
 
     with patch.object(

From b27d134f912eb01788a309bbdfaee57dbad7f2bc Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Tue, 16 Jan 2024 09:32:02 -0800
Subject: [PATCH 19/28] convert to warning

---
 genie/process_mutation.py      | 10 ++++++----
 tests/test_process_mutation.py | 27 ++++++++++++++-------------
 2 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/genie/process_mutation.py b/genie/process_mutation.py
index b2846ffd..fe27d647 100644
--- a/genie/process_mutation.py
+++ b/genie/process_mutation.py
@@ -342,6 +342,7 @@ def check_annotation_error_reports(
         maf_table_synid (str): synapse_id of the narrow maf table
         full_error_report (pd.DataFrame): the failed annotations error report
         center (str): the center this is for
+
     """
     maf_table_df = extract.get_syntabledf(
         syn=syn,
@@ -351,10 +352,11 @@ def check_annotation_error_reports(
             "Annotation_Status = 'FAILED'"
         ),
     )
-    assert len(maf_table_df) == len(full_error_report), (
-        "Genome nexus's failed annotations error report rows doesn't match"
-        f"maf table's failed annotations for {center}"
-    )
+    if len(maf_table_df) != len(full_error_report):
+        logger.warning(
+            "Genome nexus's failed annotations error report rows do not match "
+            f"maf table's failed annotations for {center}"
+        )
 
 
 def store_annotation_error_reports(
diff --git a/tests/test_process_mutation.py b/tests/test_process_mutation.py
index 5895dee3..94e03d3f 100644
--- a/tests/test_process_mutation.py
+++ b/tests/test_process_mutation.py
@@ -264,25 +264,26 @@ def test_check_annotation_error_reports_passes_if_match(
             "Annotation_Status = 'FAILED'",
         )
 
-    def test_check_annotation_error_reports_throws_assertion_if_no_match(
-        self, syn, test_error_report
+    def test_check_annotation_error_reports_raises_warning_if_no_match(
+        self, syn, test_error_report, caplog
     ):
         with patch.object(
             extract,
             "get_syntabledf",
             return_value=pd.DataFrame({"test": [1], "test2": [2]}),
         ):
-            with pytest.raises(
-                AssertionError,
-                match="Genome nexus's failed annotations error report rows doesn't match"
-                "maf table's failed annotations for SAGE",
-            ):
-                process_mutation.check_annotation_error_reports(
-                    syn=syn,
-                    maf_table_synid="synZZZZ",
-                    full_error_report=test_error_report,
-                    center="SAGE",
-                )
+            process_mutation.check_annotation_error_reports(
+                syn=syn,
+                maf_table_synid="synZZZZ",
+                full_error_report=test_error_report,
+                center="SAGE",
+            )
+            assert (
+                "Genome nexus's failed annotations error report rows do not match "
+                "maf table's failed annotations for SAGE" in caplog.text
+            )
+            # check that at least one is a warning
+            assert any(record.levelname == "WARNING" for record in caplog.records)
 
     def test_concat_annotation_error_reports_returns_expected(self, test_error_report):
         compiled_report = process_mutation.concat_annotation_error_reports(

From 4c924198a4b5584e867be25e305aef1459d7ef47 Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Wed, 24 Jan 2024 10:25:58 -0800
Subject: [PATCH 20/28] add release schema and constraints for full maf file

---
 genie/database_to_staging.py    |  74 +++++++++++++++-
 genie/process_functions.py      |  29 ++++++
 tests/test_process_functions.py | 150 ++++++++++++++++++++++++++++++++
 3 files changed, 249 insertions(+), 4 deletions(-)

diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py
index 264b0937..7d0580db 100644
--- a/genie/database_to_staging.py
+++ b/genie/database_to_staging.py
@@ -35,6 +35,73 @@
 SV_CENTER_PATH = os.path.join(GENIE_RELEASE_DIR, "data_sv_%s.txt")
 BED_DIFFS_SEQASSAY_PATH = os.path.join(GENIE_RELEASE_DIR, "diff_%s.csv")
 
+FULL_MAF_RELEASE_SCHEMA = {
+    "Hugo_Symbol": "string",
+    "Entrez_Gene_Id": "float",
+    "Center": "string",
+    "NCBI_Build": "string",
+    "Chromosome": "integer",
+    "Start_Position": "integer",
+    "End_Position": "integer",
+    "Strand": "string",
+    "Consequence": "string",
+    "Variant_Classification": "string",
+    "Variant_Type": "string",
+    "Reference_Allele": "string",
+    "Tumor_Seq_Allele1": "string",
+    "Tumor_Seq_Allele2": "string",
+    "dbSNP_RS": "string",
+    "dbSNP_Val_Status": "float",
+    "Tumor_Sample_Barcode": "string",
+    "Matched_Norm_Sample_Barcode": "string",
+    "Match_Norm_Seq_Allele1": "string",
+    "Match_Norm_Seq_Allele2": "string",
+    "Tumor_Validation_Allele1": "float",
+    "Tumor_Validation_Allele2": "float",
+    "Match_Norm_Validation_Allele1": "float",
+    "Match_Norm_Validation_Allele2": "float",
+    "Verification_Status": "string",
+    "Validation_Status": "float",
+    "Mutation_Status": "string",
+    "Sequencing_Phase": "float",
+    "Sequence_Source": "float",
+    "Validation_Method": "float",
+    "Score": "float",
+    "BAM_File": "float",
+    "Sequencer": "float",
+    "t_ref_count": "float",
+    "t_alt_count": "float",
+    "n_ref_count": "float",
+    "n_alt_count": "float",
+    "HGVSc": "string",
+    "HGVSp": "string",
+    "HGVSp_Short": "string",
+    "Transcript_ID": "string",
+    "RefSeq": "string",
+    "Protein_position": "float",
+    "Codons": "string",
+    "Exon_Number": "string",
+    "gnomAD_AF": "float",
+    "gnomAD_AFR_AF": "float",
+    "gnomAD_AMR_AF": "float",
+    "gnomAD_ASJ_AF": "float",
+    "gnomAD_EAS_AF": "float",
+    "gnomAD_FIN_AF": "float",
+    "gnomAD_NFE_AF": "float",
+    "gnomAD_OTH_AF": "float",
+    "gnomAD_SAS_AF": "float",
+    "FILTER": "string",
+    "Polyphen_Prediction": "string",
+    "Polyphen_Score": "float",
+    "SIFT_Prediction": "string",
+    "SIFT_Score": "float",
+    "SWISSPROT": "float",
+    # "genomic_location_explanation": "string",
+    "n_depth": "float",
+    "t_depth": "float",
+    # "Annotation_Status": "string"
+}
+
 
 # TODO: Add to transform.py
 def _to_redact_interval(df_col):
@@ -757,8 +824,6 @@ def store_maf_files(
     used_entities = []
     # Must get the headers (because can't assume headers are the same order)
     maf_ent = syn.get(centerMafSynIdsDf.id[0])
-    headerdf = pd.read_csv(maf_ent.path, sep="\t", comment="#", nrows=0)
-    column_order = headerdf.columns
     for _, mafSynId in enumerate(centerMafSynIdsDf.id):
         maf_ent = syn.get(mafSynId)
         logger.info(maf_ent.path)
@@ -771,8 +836,9 @@ def store_maf_files(
         )
 
         for mafchunk in mafchunks:
-            # Reorder column headers
-            mafchunk = mafchunk[column_order]
+            mafchunk = process_functions.create_missing_columns(
+                dataset=mafchunk, schema=FULL_MAF_RELEASE_SCHEMA
+            )
             # Get center for center staging maf
             # Configure maf
             configured_mafdf = configure_maf(
diff --git a/genie/process_functions.py b/genie/process_functions.py
index e45d3c90..e4d8a42e 100644
--- a/genie/process_functions.py
+++ b/genie/process_functions.py
@@ -948,3 +948,32 @@ def create_new_fileformat_table(
         "newdb_mappingdf": newdb_mappingdf,
         "moved_ent": moved_ent,
     }
+
+
+def create_missing_columns(dataset: pd.DataFrame, schema: dict) -> pd.DataFrame:
+    """Creates and fills missing columns with the relevant NA value for the
+    given data type. Note that special handling is needed to allow NAs in
+    integer-based columns in pandas: those columns are converted to Int64
+    (the pandas nullable integer data type)
+
+    Args:
+        dataset (pd.DataFrame): input dataset to fill missing columns for
+        schema (dict): the expected schema {column_name(str): data_type(str)}
+        for the input dataset
+
+    Returns:
+        pd.DataFrame: updated dataset
+    """
+    missing_values = {
+        "string": "",
+        "integer": None,
+        "float": float("nan"),
+    }
+    for column, data_type in schema.items():
+        if column not in dataset.columns:
+            dataset = dataset.assign(**{column: missing_values[data_type]})
+
+        # only way to preserve NAs for integer based columns in pandas
+        if data_type == "integer":
+            dataset[column] = dataset[column].astype("Int64")
+    return dataset[list(schema.keys())]
diff --git a/tests/test_process_functions.py b/tests/test_process_functions.py
index a0365e54..91f0cb00 100644
--- a/tests/test_process_functions.py
+++ b/tests/test_process_functions.py
@@ -2,6 +2,8 @@
 import uuid
 
 import pandas as pd
+from pandas.api.types import is_float_dtype, is_integer_dtype, is_string_dtype
+from pandas.testing import assert_frame_equal
 import pytest
 import synapseclient
@@ -500,3 +502,151 @@ def test_that_func_returns_correct_error_warning_if_input_col_has_na_and_nas_is_
     )
     assert error == ""
     assert warning == ""
+
+
+def get_create_missing_columns_test_cases():
+    return [
+        {
+            "name": "str_no_na",
+            "test_input": pd.DataFrame({"col1": ["str1", "str2", ""]}),
+            "test_schema": {"col1": "string"},
+            "expected_output": pd.DataFrame({"col1": ["str1", "str2", ""]}),
+            "expected_dtype": is_string_dtype,
+            "expected_na_count": 0,
+        },
+        {
+            "name": "str_na",
+            "test_input": pd.DataFrame({"col1": ["str1", "str2", ""]}),
+            "test_schema": {"col2": "string"},
+            "expected_output": pd.DataFrame({"col2": ["", "", ""]}),
+            "expected_dtype": is_string_dtype,
+            "expected_na_count": 0,
+        },
+        {
+            "name": "float_na",
+            "test_input": pd.DataFrame({"col1": ["str1", "str2", ""]}),
+            "test_schema": {"col2": "float"},
+            "expected_output": pd.DataFrame(
+                {"col2": [float("nan"), float("nan"), float("nan")]}
+            ),
+            "expected_dtype": is_float_dtype,
+            "expected_na_count": 3,
+        },
+        {
+            "name": "float_no_na",
+            "test_input": pd.DataFrame({"col1": [1.0, 2.0, float("nan")]}),
+            "test_schema": {"col1": "float"},
+            "expected_output": pd.DataFrame({"col1": [1.0, 2.0, float("nan")]}),
+            "expected_dtype": is_float_dtype,
+            "expected_na_count": 1,
+        },
+        {
+            "name": "int_na",
+            "test_input": pd.DataFrame({"col1": [2, 3, 4]}),
+            "test_schema": {"col2": "integer"},
+            "expected_output": pd.DataFrame(
+                {"col2": [None, None, None]}, dtype=pd.Int64Dtype()
+            ),
+            "expected_dtype": is_integer_dtype,
+            "expected_na_count": 3,
+        },
+        {
+            "name": "int_no_na",
+            "test_input": pd.DataFrame({"col1": [2, 3, 4]}),
+            "test_schema": {"col1": "integer"},
+            "expected_output": pd.DataFrame({"col1": [2, 3, 4]}, dtype=pd.Int64Dtype()),
+            "expected_dtype": is_integer_dtype,
+            "expected_na_count": 0,
+        },
+        {
+            "name": "empty_col",
+            "test_input": pd.DataFrame({"col1": []}),
+            "test_schema": {"col2": "string"},
+            "expected_output": pd.DataFrame({"col2": []}, dtype=str),
+            "expected_dtype": is_string_dtype,
+            "expected_na_count": 0,
+        },
+        {
+            "name": "empty_df",
+            "test_input": pd.DataFrame({}),
+            "test_schema": {"col1": "float"},
+            "expected_output": pd.DataFrame({"col1": []}, index=[]),
+            "expected_dtype": is_float_dtype,
+            "expected_na_count": 0,
+        },
+        {
+            "name": "empty_col_int",
+            "test_input": pd.DataFrame({"col1":
[]}), + "test_schema": {"col2": "integer"}, + "expected_output": pd.DataFrame({"col2": []}, dtype=pd.Int64Dtype()), + "expected_dtype": is_integer_dtype, + "expected_na_count": 0, + }, + { + "name": "empty_df_int", + "test_input": pd.DataFrame({"col1": []}), + "test_schema": {"col2": "integer"}, + "expected_output": pd.DataFrame({"col2": []}, dtype=pd.Int64Dtype()), + "expected_dtype": is_integer_dtype, + "expected_na_count": 0, + }, + ] + + +@pytest.mark.parametrize( + "test_cases", + get_create_missing_columns_test_cases(), + ids=lambda x: x["name"], +) +def test_that_create_missing_columns_gets_expected_output_with_single_col_df( + test_cases, +): + result = process_functions.create_missing_columns( + dataset=test_cases["test_input"], schema=test_cases["test_schema"] + ) + assert_frame_equal(result, test_cases["expected_output"], check_exact=True) + assert test_cases["expected_dtype"](result.iloc[:, 0].dtype) + assert result.isna().sum().sum() == test_cases["expected_na_count"] + + +def test_that_create_missing_columns_returns_expected_output_with_multi_col_df(): + test_input = pd.DataFrame( + { + "col2": ["str1", "str2", "str3"], + "col1": [2, 3, 4], + "col3": [2.0, 3.0, float("nan")], + } + ) + test_schema = { + "col1": "integer", + "col2": "string", + "col3": "float", + "col4": "integer", + "col5": "string", + "col6": "float", + } + result = process_functions.create_missing_columns( + dataset=test_input, schema=test_schema + ) + expected_output = pd.DataFrame( + { + "col1": [2, 3, 4], + "col2": ["str1", "str2", "str3"], + "col3": [2.0, 3.0, float("nan")], + "col4": [None, None, None], + "col5": ["", "", ""], + "col6": [float("nan"), float("nan"), float("nan")], + } + ) + expected_output["col1"] = expected_output["col1"].astype("Int64") + expected_output["col4"] = expected_output["col4"].astype("Int64") + + assert result["col1"].dtype == pd.Int64Dtype() + assert is_string_dtype(result["col2"]) + assert is_float_dtype(result["col3"]) + assert result["col4"].dtype == pd.Int64Dtype() + assert is_string_dtype(result["col5"]) + assert is_float_dtype(result["col6"]) + assert result.isna().sum().sum() == 7 + + assert_frame_equal(result, expected_output, check_exact=True) From 30c39e045d8d1a280c2c603c37ea699b9ac6a21d Mon Sep 17 00:00:00 2001 From: rxu17 <26471741+rxu17@users.noreply.github.com> Date: Wed, 24 Jan 2024 11:41:31 -0800 Subject: [PATCH 21/28] add support for boolean in schema --- genie/database_to_staging.py | 8 ++++--- genie/process_functions.py | 5 ++++- tests/test_process_functions.py | 40 ++++++++++++++++++++++++++++++--- 3 files changed, 46 insertions(+), 7 deletions(-) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 7d0580db..55f046bf 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -99,6 +99,7 @@ # "genomic_location_explanation": "string", "n_depth": "float", "t_depth": "float", + "mutationInCis_Flag": "boolean" # "Annotation_Status": "string" } @@ -836,18 +837,19 @@ def store_maf_files( ) for mafchunk in mafchunks: - mafchunk = process_functions.create_missing_columns( - dataset=mafchunk, schema=FULL_MAF_RELEASE_SCHEMA - ) # Get center for center staging maf # Configure maf configured_mafdf = configure_maf( mafchunk, remove_mafinbed_variants, flagged_mutationInCis_variants ) + configured_mafdf = process_functions.create_missing_columns( + dataset=configured_mafdf, schema=FULL_MAF_RELEASE_SCHEMA + ) # Create maf for release merged_mafdf = remove_maf_samples( configured_mafdf, keep_for_merged_consortium_samples ) 
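            # Illustrative sketch (hypothetical example, not lines added by this
            # patch): the create_missing_columns() call above fills any schema
            # column absent from the chunk with a type-appropriate NA, and
            # integer columns are cast to pandas' nullable Int64 dtype so the
            # NAs survive (assumes pandas >= 1.0):
            #
            #     import pandas as pd
            #     from genie import process_functions
            #
            #     chunk = pd.DataFrame({"Hugo_Symbol": ["TP53"]})
            #     out = process_functions.create_missing_columns(
            #         dataset=chunk,
            #         schema={"Hugo_Symbol": "string", "Start_Position": "integer"},
            #     )
            #     out["Start_Position"].dtype  # Int64Dtype(), with all values <NA>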
+ append_or_create_release_maf(merged_mafdf, mutations_path) # Create maf for center staging center_mafdf = remove_maf_samples( diff --git a/genie/process_functions.py b/genie/process_functions.py index e4d8a42e..3f0fc3e4 100644 --- a/genie/process_functions.py +++ b/genie/process_functions.py @@ -968,12 +968,15 @@ def create_missing_columns(dataset: pd.DataFrame, schema: dict) -> pd.Series: "string": "", "integer": None, "float": float("nan"), + "boolean": None, } for column, data_type in schema.items(): if column not in dataset.columns: dataset = dataset.assign(**{column: missing_values[data_type]}) - # only way to preserve NAs for integer based columns in pandas + # only way to preserve NAs for these specific dtype columns if data_type == "integer": dataset[column] = dataset[column].astype("Int64") + elif data_type == "boolean": + dataset[column] = dataset[column].astype(pd.BooleanDtype()) return dataset[list(schema.keys())] diff --git a/tests/test_process_functions.py b/tests/test_process_functions.py index 91f0cb00..e4a95d56 100644 --- a/tests/test_process_functions.py +++ b/tests/test_process_functions.py @@ -2,7 +2,12 @@ import uuid import pandas as pd -from pandas.api.types import is_float_dtype, is_integer_dtype, is_string_dtype +from pandas.api.types import ( + is_bool_dtype, + is_float_dtype, + is_integer_dtype, + is_string_dtype, +) from pandas.testing import assert_frame_equal import pytest import synapseclient @@ -558,6 +563,26 @@ def get_create_missing_columns_test_cases(): "expected_dtype": is_integer_dtype, "expected_na_count": 0, }, + { + "name": "bool_na", + "test_input": pd.DataFrame({"col1": [True, False, None]}), + "test_schema": {"col2": "boolean"}, + "expected_output": pd.DataFrame( + {"col2": [None, None, None]}, dtype=pd.BooleanDtype() + ), + "expected_dtype": is_bool_dtype, + "expected_na_count": 3, + }, + { + "name": "bool_no_na", + "test_input": pd.DataFrame({"col1": [True, False, None]}), + "test_schema": {"col1": "boolean"}, + "expected_output": pd.DataFrame( + {"col1": [True, False, None]}, dtype=pd.BooleanDtype() + ), + "expected_dtype": is_bool_dtype, + "expected_na_count": 1, + }, { "name": "empty_col", "test_input": pd.DataFrame({"col1": []}), @@ -605,7 +630,7 @@ def test_that_create_missing_columns_gets_expected_output_with_single_col_df( dataset=test_cases["test_input"], schema=test_cases["test_schema"] ) assert_frame_equal(result, test_cases["expected_output"], check_exact=True) - assert test_cases["expected_dtype"](result.iloc[:, 0].dtype) + assert test_cases["expected_dtype"](result.iloc[:, 0]) assert result.isna().sum().sum() == test_cases["expected_na_count"] @@ -615,6 +640,7 @@ def test_that_create_missing_columns_returns_expected_output_with_multi_col_df() "col2": ["str1", "str2", "str3"], "col1": [2, 3, 4], "col3": [2.0, 3.0, float("nan")], + "col7": [True, False, None], } ) test_schema = { @@ -624,6 +650,8 @@ def test_that_create_missing_columns_returns_expected_output_with_multi_col_df() "col4": "integer", "col5": "string", "col6": "float", + "col7": "boolean", + "col8": "boolean", } result = process_functions.create_missing_columns( dataset=test_input, schema=test_schema @@ -636,10 +664,14 @@ def test_that_create_missing_columns_returns_expected_output_with_multi_col_df() "col4": [None, None, None], "col5": ["", "", ""], "col6": [float("nan"), float("nan"), float("nan")], + "col7": [True, False, None], + "col8": [None, None, None], } ) expected_output["col1"] = expected_output["col1"].astype("Int64") expected_output["col4"] = 
expected_output["col4"].astype("Int64") + expected_output["col7"] = expected_output["col7"].astype(pd.BooleanDtype()) + expected_output["col8"] = expected_output["col8"].astype(pd.BooleanDtype()) assert result["col1"].dtype == pd.Int64Dtype() assert is_string_dtype(result["col2"]) @@ -647,6 +679,8 @@ def test_that_create_missing_columns_returns_expected_output_with_multi_col_df() assert result["col4"].dtype == pd.Int64Dtype() assert is_string_dtype(result["col5"]) assert is_float_dtype(result["col6"]) - assert result.isna().sum().sum() == 7 + assert result["col7"].dtype == pd.BooleanDtype() + assert result["col8"].dtype == pd.BooleanDtype() + assert result.isna().sum().sum() == 11 assert_frame_equal(result, expected_output, check_exact=True) From 7b6c2b337061710eecd33af99340344495ccc08b Mon Sep 17 00:00:00 2001 From: rxu17 <26471741+rxu17@users.noreply.github.com> Date: Wed, 24 Jan 2024 15:39:51 -0800 Subject: [PATCH 22/28] remove use of create_missing_column function, just subset on required release cols --- genie/database_to_staging.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 55f046bf..0b76b6b6 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -96,11 +96,10 @@ "SIFT_Prediction": "string", "SIFT_Score": "float", "SWISSPROT": "float", - # "genomic_location_explanation": "string", "n_depth": "float", "t_depth": "float", - "mutationInCis_Flag": "boolean" - # "Annotation_Status": "string" + "mutationInCis_Flag": "boolean", + "Annotation_Status": "string" } @@ -823,7 +822,6 @@ def store_maf_files( with open(MUTATIONS_CENTER_PATH % center, "w"): pass used_entities = [] - # Must get the headers (because can't assume headers are the same order) maf_ent = syn.get(centerMafSynIdsDf.id[0]) for _, mafSynId in enumerate(centerMafSynIdsDf.id): maf_ent = syn.get(mafSynId) @@ -842,9 +840,7 @@ def store_maf_files( configured_mafdf = configure_maf( mafchunk, remove_mafinbed_variants, flagged_mutationInCis_variants ) - configured_mafdf = process_functions.create_missing_columns( - dataset=configured_mafdf, schema=FULL_MAF_RELEASE_SCHEMA - ) + configured_mafdf = configured_mafdf[list(FULL_MAF_RELEASE_SCHEMA.keys())] # Create maf for release merged_mafdf = remove_maf_samples( configured_mafdf, keep_for_merged_consortium_samples From e0d741783e59a9d6268c32b184c508efb38a4ff2 Mon Sep 17 00:00:00 2001 From: rxu17 <26471741+rxu17@users.noreply.github.com> Date: Wed, 24 Jan 2024 15:43:04 -0800 Subject: [PATCH 23/28] lint --- genie/database_to_staging.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 0b76b6b6..3520eb97 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -99,7 +99,7 @@ "n_depth": "float", "t_depth": "float", "mutationInCis_Flag": "boolean", - "Annotation_Status": "string" + "Annotation_Status": "string", } @@ -840,12 +840,13 @@ def store_maf_files( configured_mafdf = configure_maf( mafchunk, remove_mafinbed_variants, flagged_mutationInCis_variants ) - configured_mafdf = configured_mafdf[list(FULL_MAF_RELEASE_SCHEMA.keys())] + configured_mafdf = configured_mafdf[ + list(FULL_MAF_RELEASE_SCHEMA.keys()) + ] # Create maf for release merged_mafdf = remove_maf_samples( configured_mafdf, keep_for_merged_consortium_samples ) - append_or_create_release_maf(merged_mafdf, mutations_path) # Create maf for center staging center_mafdf = remove_maf_samples( From 
339eabed5483bc54437babab495c8511cea4a25d Mon Sep 17 00:00:00 2001 From: rxu17 <26471741+rxu17@users.noreply.github.com> Date: Wed, 24 Jan 2024 16:08:00 -0800 Subject: [PATCH 24/28] convert to list --- genie/database_to_staging.py | 136 +++++++++++++++++------------------ 1 file changed, 67 insertions(+), 69 deletions(-) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 3520eb97..e563dc88 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -35,72 +35,72 @@ SV_CENTER_PATH = os.path.join(GENIE_RELEASE_DIR, "data_sv_%s.txt") BED_DIFFS_SEQASSAY_PATH = os.path.join(GENIE_RELEASE_DIR, "diff_%s.csv") -FULL_MAF_RELEASE_SCHEMA = { - "Hugo_Symbol": "string", - "Entrez_Gene_Id": "float", - "Center": "string", - "NCBI_Build": "string", - "Chromosome": "integer", - "Start_Position": "integer", - "End_Position": "integer", - "Strand": "string", - "Consequence": "string", - "Variant_Classification": "string", - "Variant_Type": "string", - "Reference_Allele": "string", - "Tumor_Seq_Allele1": "string", - "Tumor_Seq_Allele2": "string", - "dbSNP_RS": "string", - "dbSNP_Val_Status": "float", - "Tumor_Sample_Barcode": "string", - "Matched_Norm_Sample_Barcode": "string", - "Match_Norm_Seq_Allele1": "string", - "Match_Norm_Seq_Allele2": "string", - "Tumor_Validation_Allele1": "float", - "Tumor_Validation_Allele2": "float", - "Match_Norm_Validation_Allele1": "float", - "Match_Norm_Validation_Allele2": "float", - "Verification_Status": "string", - "Validation_Status": "float", - "Mutation_Status": "string", - "Sequencing_Phase": "float", - "Sequence_Source": "float", - "Validation_Method": "float", - "Score": "float", - "BAM_File": "float", - "Sequencer": "float", - "t_ref_count": "float", - "t_alt_count": "float", - "n_ref_count": "float", - "n_alt_count": "float", - "HGVSc": "string", - "HGVSp": "string", - "HGVSp_Short": "string", - "Transcript_ID": "string", - "RefSeq": "string", - "Protein_position": "float", - "Codons": "string", - "Exon_Number": "string", - "gnomAD_AF": "float", - "gnomAD_AFR_AF": "float", - "gnomAD_AMR_AF": "float", - "gnomAD_ASJ_AF": "float", - "gnomAD_EAS_AF": "float", - "gnomAD_FIN_AF": "float", - "gnomAD_NFE_AF": "float", - "gnomAD_OTH_AF": "float", - "gnomAD_SAS_AF": "float", - "FILTER": "string", - "Polyphen_Prediction": "string", - "Polyphen_Score": "float", - "SIFT_Prediction": "string", - "SIFT_Score": "float", - "SWISSPROT": "float", - "n_depth": "float", - "t_depth": "float", - "mutationInCis_Flag": "boolean", - "Annotation_Status": "string", -} +FULL_MAF_RELEASE_COLUMNS = [ + "Hugo_Symbol", + "Entrez_Gene_Id", + "Center", + "NCBI_Build", + "Chromosome", + "Start_Position", + "End_Position", + "Strand", + "Consequence", + "Variant_Classification", + "Variant_Type", + "Reference_Allele", + "Tumor_Seq_Allele1", + "Tumor_Seq_Allele2", + "dbSNP_RS", + "dbSNP_Val_Status", + "Tumor_Sample_Barcode", + "Matched_Norm_Sample_Barcode", + "Match_Norm_Seq_Allele1", + "Match_Norm_Seq_Allele2", + "Tumor_Validation_Allele1", + "Tumor_Validation_Allele2", + "Match_Norm_Validation_Allele1", + "Match_Norm_Validation_Allele2", + "Verification_Status", + "Validation_Status", + "Mutation_Status", + "Sequencing_Phase", + "Sequence_Source", + "Validation_Method", + "Score", + "BAM_File", + "Sequencer", + "t_ref_count", + "t_alt_count", + "n_ref_count", + "n_alt_count", + "HGVSc", + "HGVSp", + "HGVSp_Short", + "Transcript_ID", + "RefSeq", + "Protein_position", + "Codons", + "Exon_Number", + "gnomAD_AF", + "gnomAD_AFR_AF", + "gnomAD_AMR_AF", + 
"gnomAD_ASJ_AF", + "gnomAD_EAS_AF", + "gnomAD_FIN_AF", + "gnomAD_NFE_AF", + "gnomAD_OTH_AF", + "gnomAD_SAS_AF", + "FILTER", + "Polyphen_Prediction", + "Polyphen_Score", + "SIFT_Prediction", + "SIFT_Score", + "SWISSPROT", + "n_depth", + "t_depth", + "Annotation_Status", + "mutationInCis_Flag", +] # TODO: Add to transform.py @@ -840,9 +840,7 @@ def store_maf_files( configured_mafdf = configure_maf( mafchunk, remove_mafinbed_variants, flagged_mutationInCis_variants ) - configured_mafdf = configured_mafdf[ - list(FULL_MAF_RELEASE_SCHEMA.keys()) - ] + configured_mafdf = configured_mafdf[FULL_MAF_RELEASE_COLUMNS] # Create maf for release merged_mafdf = remove_maf_samples( configured_mafdf, keep_for_merged_consortium_samples From 45e54b381f27ce5de118705f84080b1eb3a10875 Mon Sep 17 00:00:00 2001 From: rxu17 <26471741+rxu17@users.noreply.github.com> Date: Thu, 25 Jan 2024 17:10:26 -0800 Subject: [PATCH 25/28] [GEN-1028] Generate failed annotations report dashboard (#546) * initial commit to add failed annotations section to release dashboard * correct source file path * fix test to non-changing db table --- CONTRIBUTING.md | 14 +++- R/dashboard_template_functions.R | 41 +++++++++++ .../test_dashboard_template_functions.R | 69 +++++++++++++++++++ templates/dashboardTemplate.Rmd | 36 ++++++++++ 4 files changed, 159 insertions(+), 1 deletion(-) create mode 100644 R/dashboard_template_functions.R create mode 100644 R/tests/testthat/test_dashboard_template_functions.R diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2520bc40..d13c99d5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -100,6 +100,8 @@ Make sure to run each of the [pipeline steps here](README.md#developing-locally) #### Running tests +##### Tests in Python + This package uses [`pytest`](https://pytest.org/en/latest/) to run tests. The test code is located in the [tests](./tests) subdirectory. Here's how to run the test suite: @@ -108,7 +110,17 @@ Here's how to run the test suite: pytest -vs tests/ ``` -Tests are also run automatically by Github Actions on any pull request and are required to pass before merging. +Tests in Python are also run automatically by Github Actions on any pull request and are required to pass before merging. + +##### Tests in R + +This package uses [`testthat`](https://testthat.r-lib.org/) to run tests in R. The test code is located in the [testthat](./R/tests/testthat) subdirectory. 
+ +Here's how to run the test suite: + +```shell +Rscript -e "testthat::test_dir('R/tests/testthat/')" +``` #### Test Development diff --git a/R/dashboard_template_functions.R b/R/dashboard_template_functions.R new file mode 100644 index 00000000..0fe44323 --- /dev/null +++ b/R/dashboard_template_functions.R @@ -0,0 +1,41 @@ +# --------------------------------------------------------------------------- +# Title: dashboard_template_functions.R +# Description: This script contains helper functions used in +# templates/dashboardTemplate.Rmd +# --------------------------------------------------------------------------- + +#' This function gets the database to synapse id mapping table, +#' maps the provided database_name to its synapse id and returns it +#' +#' @param database_name (str) database name in database +#' to synapse id mapping table +#' @param database_synid_mappingid (str) synapse id of the database +#' to synapse id mapping table +#' +#' @return (str) synapse id of the mapped database name +get_syn_id_from_mapped_database <- function(database_name, database_synid_mappingid){ + database_synid_mapping = synTableQuery(sprintf('select * from %s', + database_synid_mappingid)) + database_synid_mappingdf = as.data.frame(database_synid_mapping) + table_synid = database_synid_mappingdf$Id[database_synid_mappingdf$Database == database_name] + return(table_synid) +} + +#' This function creates a table of failed annotation counts by grouped columns +#' @param maf_data (data.frame) input maf data frame +#' @param group_by_cols (str vector) list of columns to create counts by +#' @param counts_col_name (str) name to give to the counts column +#' +#' @return (data.frame) counts table +get_failed_annotation_table_counts <- function(maf_data, group_by_cols, counts_col_name){ + table_counts <- table(maf_data[maf_data$Annotation_Status == "FAILED", group_by_cols]) + + if (nrow(table_counts) == 0){ + counts_table <- data.frame(matrix(ncol = length(group_by_cols) + 1, nrow = 0)) + } else{ + counts_table <- as.data.frame(table_counts) + } + colnames(counts_table) <- c(group_by_cols, counts_col_name) + counts_table <- counts_table[do.call(order, counts_table[group_by_cols]), ] + return(counts_table) +} \ No newline at end of file diff --git a/R/tests/testthat/test_dashboard_template_functions.R b/R/tests/testthat/test_dashboard_template_functions.R new file mode 100644 index 00000000..ceb6aa6e --- /dev/null +++ b/R/tests/testthat/test_dashboard_template_functions.R @@ -0,0 +1,69 @@ +# tests for dashboard_template_functions.R + +source("../../dashboard_template_functions.R") + +library(synapser) +library(testthat) + +sample_counts_table <- function() { + data <- data.frame( + Center = factor(c("GOLD","SAGE", "TEST"), + levels = c("GOLD", "SAGE", "TEST")), + Counts = c(1, 2, 1) + ) + return(data) +} + +empty_counts_table <- function() { + data <- data.frame( + Center = logical(), + Counts = logical() + ) + return(data) +} + +sample_maf_table <- function() { + data <- data.frame( + Center = c("TEST", "TEST", "SAGE", "SAGE", "GOLD", "BRONZE"), + Tumor_Sample_Barcode = c("SAGE1", "SAGE2", "SAGE3", "SAGE4", "SAGE5", "SAGE6"), + Annotation_Status = c("SUCCESS", "FAILED", "FAILED", "FAILED", "FAILED", "SUCCESS") + ) + return(data) +} + +sample_maf_table_no_failed_annotations <- function() { + data <- data.frame( + Center = c("TEST", "SAGE", "GOLD"), + Tumor_Sample_Barcode = c("SAGE1", "SAGE2", "SAGE3"), + Annotation_Status = c("SUCCESS", "SUCCESS", "SUCCESS") + ) + return(data) +} + + 
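# A usage sketch (hypothetical example, not part of the committed test file):
# get_failed_annotation_table_counts() tallies rows with
# Annotation_Status == "FAILED" per center, and returns an empty counts
# table, rather than erroring, when nothing failed. With the fixtures above:
#
#     counts <- get_failed_annotation_table_counts(
#         maf_data = sample_maf_table(),
#         group_by_cols = "Center",
#         counts_col_name = "Counts"
#     )
#     counts  # GOLD 1, SAGE 2, TEST 1 (BRONZE has no FAILED rows)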
+test_that("get_syn_id_from_mapped_database_gets_correct_value", { + synLogin() + result <- get_syn_id_from_mapped_database( + database_name = "main", + database_synid_mappingid = "syn11600968" + ) + expect_equal(result, "syn7208886") +}) + + +test_that("get_failed_annotation_table_counts_returns_expected_output", { + result <- get_failed_annotation_table_counts( + maf_data=sample_maf_table(), + group_by_cols="Center", + counts_col_name="Counts") + expect_equal(result, sample_counts_table()) +}) + +test_that("get_failed_annotation_table_counts_returns_empty_table_with_no_failed_annotations", { + result <- get_failed_annotation_table_counts( + maf_data=sample_maf_table_no_failed_annotations(), + group_by_cols="Center", + counts_col_name="Counts") + expect_equal(result, empty_counts_table()) +}) + diff --git a/templates/dashboardTemplate.Rmd b/templates/dashboardTemplate.Rmd index 2f5558df..bb984c21 100644 --- a/templates/dashboardTemplate.Rmd +++ b/templates/dashboardTemplate.Rmd @@ -30,6 +30,8 @@ suppressMessages(library(RColorBrewer)) suppressMessages(library(jsonlite)) suppressMessages(library(knitr)) +source("R/dashboard_template_functions.R") + createCenterColumn <- function(clinicalDf) { if (is.null(clinicalDf$CENTER)) { centers = unlist( @@ -401,6 +403,40 @@ For patient retractions submitted between these months, the records will be remo * April-June * October-December +--- + +### Genome nexus failed annotations summary + +The table below displays the number of failed annotations per center. If a center has no failed annotations, no record will appear in the table for that center. + +```{r} +maf_table_synid = get_syn_id_from_mapped_database( + database_name="vcf2maf", + database_synid_mappingid = database_synid_mappingid +) + +# get narrow maf table +maf_table <- as.data.frame(synTableQuery(sprintf('select * from %s', maf_table_synid))) + +counts_table <- get_failed_annotation_table_counts( + maf_table, + group_by_cols = "Center", + counts_col_name = "Number of failed annotations" + ) +knitr::kable(counts_table, col.names = c("Center", "Number of failed annotations")) + +#get project page on synapse +main_syn_id <- get_syn_id_from_mapped_database( + database_name="main", + database_synid_mappingid = database_synid_mappingid +) +``` + +Follow this navigation guide from the `r paste0("[Synapse project files page](https://www.synapse.org/#!Synapse:", main_syn_id, "/files/){target='_blank'}")` to find your center's detailed failed annotations error report. + +Files → Centers → [Center name] → Errors → failed_annotations_error_report.txt + +View the version comment column in Synapse for your report to find the version associated with this release. 
--- From 5e69a7349fdec493c6437a5ecabb9ee1b9bd0c38 Mon Sep 17 00:00:00 2001 From: rxu17 <26471741+rxu17@users.noreply.github.com> Date: Wed, 7 Feb 2024 15:47:39 -0800 Subject: [PATCH 26/28] [GEN-974] Allow NaN, nan and NA strings for mutation data (#549) * initial commit * deprecated setup, replace with setup_method * update black version and re-lint * update _get_dataframe docstring for vcf and maf * add additional tests * fix relevant code smells * remove unused conversion code * add None into valid vals in test --- genie/config.py | 1 + genie/create_case_lists.py | 1 + genie/dashboard_table_updater.py | 15 +++--- genie/database_to_staging.py | 40 ++++++++++------ genie/example_filetype_format.py | 1 + genie/load.py | 1 + genie/process_functions.py | 1 + genie/process_mutation.py | 1 + genie/transform.py | 23 +++++++++ genie/write_invalid_reasons.py | 1 + genie_registry/__init__.py | 1 + genie_registry/assay.py | 1 + genie_registry/bed.py | 1 + genie_registry/clinical.py | 1 + genie_registry/maf.py | 63 +++++++++++++++++------- genie_registry/vcf.py | 52 ++++++++++++++++++-- setup.py | 1 + tests/test_assay.py | 1 + tests/test_bed.py | 1 + tests/test_database_to_staging.py | 1 + tests/test_extract.py | 1 + tests/test_filters.py | 1 + tests/test_input_to_database.py | 2 +- tests/test_maf.py | 74 ++++++++++++++++++++++++----- tests/test_process_mutation.py | 3 +- tests/test_transform.py | 58 ++++++++++++++++++++++ tests/test_validate.py | 1 + tests/test_vcf.py | 74 +++++++++++++++++++++++++++++ tests/test_write_invalid_reasons.py | 1 + 29 files changed, 369 insertions(+), 54 deletions(-) diff --git a/genie/config.py b/genie/config.py index 51a09825..343ac990 100644 --- a/genie/config.py +++ b/genie/config.py @@ -1,4 +1,5 @@ """Configuration to obtain registry classes""" + import importlib import logging diff --git a/genie/create_case_lists.py b/genie/create_case_lists.py index da18a319..547bfe87 100644 --- a/genie/create_case_lists.py +++ b/genie/create_case_lists.py @@ -1,6 +1,7 @@ """ Creates case lists per cancer type """ + from collections import defaultdict import csv import os diff --git a/genie/dashboard_table_updater.py b/genie/dashboard_table_updater.py index 9dca7fa0..33a44d5f 100644 --- a/genie/dashboard_table_updater.py +++ b/genie/dashboard_table_updater.py @@ -1,4 +1,5 @@ """Updates dashboard tables""" + import argparse import datetime import logging @@ -347,9 +348,11 @@ def update_oncotree_code_tables(syn, database_mappingdf): oncotree_mapping = process_functions.get_oncotree_code_mappings(oncotree_link) clinicaldf["PRIMARY_CODES"] = [ - oncotree_mapping[i.upper()]["ONCOTREE_PRIMARY_NODE"] - if i.upper() in oncotree_mapping.keys() - else "DEPRECATED_CODE" + ( + oncotree_mapping[i.upper()]["ONCOTREE_PRIMARY_NODE"] + if i.upper() in oncotree_mapping.keys() + else "DEPRECATED_CODE" + ) for i in clinicaldf.ONCOTREE_CODE ] @@ -457,9 +460,9 @@ def update_sample_difference_table(syn, database_mappingdf): .applymap(int) ) - diff_between_releasesdf[ - ["Clinical", "Mutation", "CNV", "SEG", "Fusions"] - ] = new_values + diff_between_releasesdf[["Clinical", "Mutation", "CNV", "SEG", "Fusions"]] = ( + new_values + ) load._update_table( syn, diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index e563dc88..c7e1b7c3 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -1052,30 +1052,38 @@ def store_clinical_files( } clinicaldf["CANCER_TYPE"] = [ - oncotree_dict[code.upper()]["CANCER_TYPE"] - if code.upper() in oncotree_dict.keys() - else 
float("nan") + ( + oncotree_dict[code.upper()]["CANCER_TYPE"] + if code.upper() in oncotree_dict.keys() + else float("nan") + ) for code in clinicaldf["ONCOTREE_CODE"] ] clinicaldf["CANCER_TYPE_DETAILED"] = [ - oncotree_dict[code.upper()]["CANCER_TYPE_DETAILED"] - if code.upper() in oncotree_dict.keys() - else float("nan") + ( + oncotree_dict[code.upper()]["CANCER_TYPE_DETAILED"] + if code.upper() in oncotree_dict.keys() + else float("nan") + ) for code in clinicaldf["ONCOTREE_CODE"] ] clinicaldf["ONCOTREE_PRIMARY_NODE"] = [ - oncotree_dict[code.upper()]["ONCOTREE_PRIMARY_NODE"] - if code.upper() in oncotree_dict.keys() - else float("nan") + ( + oncotree_dict[code.upper()]["ONCOTREE_PRIMARY_NODE"] + if code.upper() in oncotree_dict.keys() + else float("nan") + ) for code in clinicaldf["ONCOTREE_CODE"] ] clinicaldf["ONCOTREE_SECONDARY_NODE"] = [ - oncotree_dict[code.upper()]["ONCOTREE_SECONDARY_NODE"] - if code.upper() in oncotree_dict.keys() - else float("nan") + ( + oncotree_dict[code.upper()]["ONCOTREE_SECONDARY_NODE"] + if code.upper() in oncotree_dict.keys() + else float("nan") + ) for code in clinicaldf["ONCOTREE_CODE"] ] @@ -1086,9 +1094,11 @@ def store_clinical_files( # descriptions can match clinicaldf["AGE_AT_SEQ_REPORT_DAYS"] = clinicaldf["AGE_AT_SEQ_REPORT"] clinicaldf["AGE_AT_SEQ_REPORT"] = [ - int(math.floor(int(float(age)) / 365.25)) - if process_functions.checkInt(age) - else age + ( + int(math.floor(int(float(age)) / 365.25)) + if process_functions.checkInt(age) + else age + ) for age in clinicaldf["AGE_AT_SEQ_REPORT"] ] clinicaldf["AGE_AT_SEQ_REPORT"][clinicaldf["AGE_AT_SEQ_REPORT"] == ">32485"] = ">89" diff --git a/genie/example_filetype_format.py b/genie/example_filetype_format.py index b02bd983..a568f87c 100644 --- a/genie/example_filetype_format.py +++ b/genie/example_filetype_format.py @@ -1,6 +1,7 @@ """TODO: Rename this to model.py This contains the GENIE model objects """ + from abc import ABCMeta from dataclasses import dataclass import logging diff --git a/genie/load.py b/genie/load.py index b9ceea5d..8b9fe26c 100644 --- a/genie/load.py +++ b/genie/load.py @@ -2,6 +2,7 @@ This module contains all the functions that stores data to Synapse """ + import logging import os import time diff --git a/genie/process_functions.py b/genie/process_functions.py index 3f0fc3e4..d07d6b8d 100644 --- a/genie/process_functions.py +++ b/genie/process_functions.py @@ -1,4 +1,5 @@ """Processing functions that are used in the GENIE pipeline""" + import datetime import json import logging diff --git a/genie/process_mutation.py b/genie/process_mutation.py index fe27d647..bc5436ab 100644 --- a/genie/process_mutation.py +++ b/genie/process_mutation.py @@ -1,5 +1,6 @@ """Process mutation files TODO deprecate this module and spread functions around""" + from collections import namedtuple import logging import os diff --git a/genie/transform.py b/genie/transform.py index 6145115d..faed45ee 100644 --- a/genie/transform.py +++ b/genie/transform.py @@ -1,5 +1,7 @@ """This module contains all the transformation functions used throughout the GENIE package""" + +from typing import List import warnings import pandas as pd @@ -64,3 +66,24 @@ def _convert_df_with_mixed_dtypes(read_csv_params: dict) -> pd.DataFrame: df = pd.read_csv(**read_csv_params, low_memory=False, engine="c") warnings.resetwarnings() return df + + +def _convert_values_to_na( + input_df: pd.DataFrame, values_to_replace: List[str], columns_to_convert: List[str] +) -> pd.DataFrame: + """Converts given values to NA in an input dataset 
+
+    Args:
+        input_df (pd.DataFrame): input dataset
+        values_to_replace (List[str]): string values to replace with NA
+        columns_to_convert (List[str]): subset of columns in which to convert
+        the given values to NA
+
+    Returns:
+        pd.DataFrame: dataset with specified values replaced with NAs
+    """
+    if not input_df.empty:
+        replace_mapping = {value: None for value in values_to_replace}
+        input_df[columns_to_convert] = input_df[columns_to_convert].replace(
+            replace_mapping
+        )
+    return input_df
diff --git a/genie/write_invalid_reasons.py b/genie/write_invalid_reasons.py
index 704697b9..f16f1ad9 100644
--- a/genie/write_invalid_reasons.py
+++ b/genie/write_invalid_reasons.py
@@ -1,4 +1,5 @@
 """Write invalid reasons"""
+
 import logging
 import os
diff --git a/genie_registry/__init__.py b/genie_registry/__init__.py
index c4a6595f..e730a9da 100644
--- a/genie_registry/__init__.py
+++ b/genie_registry/__init__.py
@@ -1,4 +1,5 @@
 """Initialize GENIE registry"""
+
 # Import logging last to not take in synapseclient logging
 import logging
diff --git a/genie_registry/assay.py b/genie_registry/assay.py
index bdfea502..33362ddc 100644
--- a/genie_registry/assay.py
+++ b/genie_registry/assay.py
@@ -1,4 +1,5 @@
 """Assay information class"""
+
 import os
 
 import yaml
diff --git a/genie_registry/bed.py b/genie_registry/bed.py
index c1f7680f..7b7294c4 100644
--- a/genie_registry/bed.py
+++ b/genie_registry/bed.py
@@ -1,4 +1,5 @@
 """GENIE bed class and functions"""
+
 import os
 import logging
 import subprocess
diff --git a/genie_registry/clinical.py b/genie_registry/clinical.py
index e20dac75..e5a4244c 100644
--- a/genie_registry/clinical.py
+++ b/genie_registry/clinical.py
@@ -1,4 +1,5 @@
 """Clinical file format validation and processing"""
+
 # from __future__ import annotations
 import datetime
 from io import StringIO
diff --git a/genie_registry/maf.py b/genie_registry/maf.py
index 3adcc2ce..aeb8246a 100644
--- a/genie_registry/maf.py
+++ b/genie_registry/maf.py
@@ -1,6 +1,7 @@
 from io import StringIO
-import os
 import logging
+import os
+from typing import List
 
 import pandas as pd
 
@@ -198,10 +199,6 @@ def _validate(self, mutationDF):
         for col in numerical_cols:
             col_exists = process_functions.checkColExist(mutationDF, col)
             if col_exists:
-                # Since NA is an allowed value, when reading in the dataframe
-                # the 'NA' string is not converted. This will convert all
-                # 'NA' values in the numerical columns into actual float('nan')
-                mutationDF.loc[mutationDF[col] == "NA", col] = float("nan")
                 # Attempt to convert column to float
                 try:
                     mutationDF[col] = mutationDF[col].astype(float)
@@ -352,13 +349,38 @@ def _cross_validate(self, mutationDF: pd.DataFrame) -> tuple:
         )
         return errors, warnings
 
-    def _get_dataframe(self, filePathList):
-        """Get mutation dataframe"""
-        # Must do this because pandas.read_csv will allow for a file to
-        # have more column headers than content. E.g.
-        # A,B,C,D,E
-        # 1,2
-        # 2,3
+    def _get_dataframe(self, filePathList: List[str]) -> pd.DataFrame:
+        """Get mutation dataframe
+
+        1) Starts reading the first line in the file
+        2) Skips lines that start with #
+        3) Reads in second line
+        4) Checks that the first line's fields match the second line's. Must do
+        this because pandas.read_csv will allow for a file to have more column
+        headers than content, e.g.:
+        A,B,C,D,E
+        1,2
+        2,3
+
+        5) We keep 'NA', 'nan', and 'NaN' as strings in the data because
+        these are valid allele values, and then convert the ones in the
+        non-allele columns back to actual NAs
+
+        NOTE: Because allele columns are case-insensitive in maf data, we must
+        standardize the case of the columns when checking for the non-allele columns
+        to convert the NA strings to NAs
+
+        NOTE: This code allows empty dataframes to pass through
+        without errors
+
+        Args:
+            filePathList (List[str]): list of filepath(s)
+
+        Raises:
+            ValueError: First line fields do not match second line fields in file
+
+        Returns:
+            pd.DataFrame: mutation data
+        """
         with open(filePathList[0], "r") as maf_f:
             firstline = maf_f.readline()
             if firstline.startswith("#"):
@@ -370,28 +392,26 @@ def _get_dataframe(self, filePathList):
                     "Number of fields in a line do not match the "
                     "expected number of columns"
                 )
+
         read_csv_params = {
             "filepath_or_buffer": filePathList[0],
             "sep": "\t",
             "comment": "#",
-            # Keep the value 'NA'
+            "keep_default_na": False,
             "na_values": [
                 "-1.#IND",
                 "1.#QNAN",
                 "1.#IND",
                 "-1.#QNAN",
                 "#N/A N/A",
-                "NaN",
                 "#N/A",
                 "N/A",
                 "#NA",
                 "NULL",
                 "-NaN",
-                "nan",
                 "-nan",
                 "",
             ],
-            "keep_default_na": False,
             # This is to check if people write files
             # with R, quote=T
             "quoting": 3,
             # validator will cause the file to fail
             "skip_blank_lines": False,
         }
+
         mutationdf = transform._convert_df_with_mixed_dtypes(read_csv_params)
+
+        mutationdf = transform._convert_values_to_na(
+            input_df=mutationdf,
+            values_to_replace=["NA", "nan", "NaN"],
+            columns_to_convert=[
+                col
+                for col in mutationdf.columns
+                if col.upper() not in self._allele_cols
+            ],
+        )
         return mutationdf
diff --git a/genie_registry/vcf.py b/genie_registry/vcf.py
index cf381086..eb21a36b 100644
--- a/genie_registry/vcf.py
+++ b/genie_registry/vcf.py
@@ -1,10 +1,11 @@
 import logging
 import os
+from typing import List
 
 import pandas as pd
 
 from genie.example_filetype_format import FileTypeFormat
-from genie import process_functions, validate
+from genie import process_functions, transform, validate
 
 logger = logging.getLogger(__name__)
@@ -28,7 +29,25 @@ def _validateFilename(self, filePath):
         endswith_vcf = basename.endswith(".vcf")
         assert startswith_genie and endswith_vcf
 
-    def _get_dataframe(self, filePathList):
+    def _get_dataframe(self, filePathList: List[str]) -> pd.DataFrame:
+        """Get mutation dataframe
+
+        1) Looks for the line in the file starting with #CHROM; that will be
+        the header line (columns).
+ + 2) When reading in the data, we keep the 'NA', 'nan', and 'NaN' + as strings in the data because these are valid allele values + then convert the ones in the non-allele columns back to actual NAs + + Args: + filePathList (List[str]): list of filepath(s) + + Raises: + ValueError: when line with #CHROM doesn't exist in file + + Returns: + pd.DataFrame: mutation data + """ headers = None filepath = filePathList[0] with open(filepath, "r") as vcffile: @@ -38,10 +57,37 @@ def _get_dataframe(self, filePathList): break if headers is not None: vcfdf = pd.read_csv( - filepath, sep="\t", comment="#", header=None, names=headers + filepath, + sep="\t", + comment="#", + header=None, + names=headers, + keep_default_na=False, + na_values=[ + "-1.#IND", + "1.#QNAN", + "1.#IND", + "-1.#QNAN", + "#N/A N/A", + "#N/A", + "N/A", + "#NA", + "NULL", + "-NaN", + "-nan", + "", + ], ) else: raise ValueError("Your vcf must start with the header #CHROM") + + vcfdf = transform._convert_values_to_na( + input_df=vcfdf, + values_to_replace=["NA", "nan", "NaN"], + columns_to_convert=[ + col for col in vcfdf.columns if col not in self._allele_cols + ], + ) return vcfdf def process_steps(self, df): diff --git a/setup.py b/setup.py index ed8415b3..4c8d7f3c 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,5 @@ """genie package setup""" + from setuptools import setup setup() diff --git a/tests/test_assay.py b/tests/test_assay.py index 20d594b7..691c693e 100644 --- a/tests/test_assay.py +++ b/tests/test_assay.py @@ -1,4 +1,5 @@ """Test assay information validation and processing""" + import copy from unittest.mock import patch diff --git a/tests/test_bed.py b/tests/test_bed.py index b720faa8..973c7ea3 100644 --- a/tests/test_bed.py +++ b/tests/test_bed.py @@ -1,4 +1,5 @@ """Test GENIE Bed class""" + import tempfile import shutil from unittest import mock diff --git a/tests/test_database_to_staging.py b/tests/test_database_to_staging.py index 4f2acede..01e8f4a0 100644 --- a/tests/test_database_to_staging.py +++ b/tests/test_database_to_staging.py @@ -1,4 +1,5 @@ """Tests database to staging functions""" + import os from unittest import mock from unittest.mock import patch diff --git a/tests/test_extract.py b/tests/test_extract.py index 5b94a109..2d8c5ac2 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -1,4 +1,5 @@ """Test genie.extract module""" + from unittest.mock import patch import pandas as pd diff --git a/tests/test_filters.py b/tests/test_filters.py index e0191c17..e9af50d2 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -1,6 +1,7 @@ """ Test GENIE filters """ + import datetime import os import sys diff --git a/tests/test_input_to_database.py b/tests/test_input_to_database.py index bd3bbf9f..3d52d4f6 100644 --- a/tests/test_input_to_database.py +++ b/tests/test_input_to_database.py @@ -668,7 +668,7 @@ def asDataFrame(self): class TestValidation: - def setup(self): + def setup_method(self): valid = [ [ sample_clinical_entity.id, diff --git a/tests/test_maf.py b/tests/test_maf.py index 71d61e64..b86b202b 100644 --- a/tests/test_maf.py +++ b/tests/test_maf.py @@ -3,10 +3,12 @@ import pandas as pd import pytest -from genie import process_functions, validate +from genie import process_functions, transform, validate import genie_registry.maf from genie_registry.maf import maf +OPEN_BUILTIN = "builtins.open" + @pytest.fixture def maf_class(syn): @@ -38,7 +40,7 @@ def valid_maf_df(): T_REF_COUNT=[1, 2, 3, 4, 3], N_DEPTH=[1, 2, 3, float("nan"), 3], N_REF_COUNT=[1, 2, 3, 4, 3], - 
N_ALT_COUNT=[1, 2, 3, 4, 3], + N_ALT_COUNT=[1, None, 3, 4, 3], TUMOR_SEQ_ALLELE2=["A", "A", "A", "A", "A"], ) ) @@ -428,22 +430,70 @@ def test_that__cross_validate_returns_expected_msg_if_valid( ids=["no_pound_sign", "pound_sign"], ) def test_that__get_dataframe_returns_expected_result(maf_class, test_input, expected): - with patch("builtins.open", mock_open(read_data=test_input)): + with patch(OPEN_BUILTIN, mock_open(read_data=test_input)): test = maf_class._get_dataframe(["some_path"]) pd.testing.assert_frame_equal(test, expected) -def test_that__get_dataframe_throws_value_error(maf_class): +def test_that__get_dataframe_reads_in_correct_nas(maf_class): file = ( - "#Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome\n" - "TEST 3845 TEST GRCh37 12" + "Hugo_Symbol\tEntrez_Gene_Id\tReference_Allele\n" + "TEST\t3845\tNA\n" + "TEST\tNA\tnan\n" + "TEST\t3846\tN/A\n" + "NA\tnan\tNaN" ) - with patch("builtins.open", mock_open(read_data=file)): - with pytest.raises( - ValueError, - match="Number of fields in a line do not match the expected number of columns", - ): - maf_class._get_dataframe(["some_path"]) + with patch(OPEN_BUILTIN, mock_open(read_data=file)): + expected = pd.DataFrame( + { + "Hugo_Symbol": ["TEST", "TEST", "TEST", None], + "Entrez_Gene_Id": ["3845", None, "3846", None], + "Reference_Allele": ["NA", "nan", None, "NaN"], + } + ) + maf_df = maf_class._get_dataframe(["some_path"]) + pd.testing.assert_frame_equal(maf_df, expected) + + +@pytest.mark.parametrize( + "input,expected_columns", + [ + ( + pd.DataFrame( + { + "Hugo_Symbol": ["TEST"], + "Entrez_Gene_Id": ["3845"], + "RefErence_Allele": ["NA"], + } + ), + ["Hugo_Symbol", "Entrez_Gene_Id"], + ), + ( + pd.DataFrame( + { + "#CHROM": ["TEST"], + "ALT": ["3845"], + "Reference_a": ["NA"], + } + ), + ["#CHROM", "ALT", "Reference_a"], + ), + ], + ids=["with_allele_col", "no_allele_col"], +) +def test_that__get_dataframe_uses_correct_columns_to_replace( + maf_class, input, expected_columns +): + file = "Hugo_Symbol\tEntrez_Gene_Id\tReference_Allele\n" "TEST\t3845\tNA" + with patch(OPEN_BUILTIN, mock_open(read_data=file)), patch.object( + pd, "read_csv", return_value=input + ), patch.object(transform, "_convert_values_to_na") as patch_convert_to_na: + maf_class._get_dataframe(["some_path"]) + patch_convert_to_na.assert_called_once_with( + input_df=input, + values_to_replace=["NA", "nan", "NaN"], + columns_to_convert=expected_columns, + ) @pytest.mark.parametrize( diff --git a/tests/test_process_mutation.py b/tests/test_process_mutation.py index 94e03d3f..b24e6209 100644 --- a/tests/test_process_mutation.py +++ b/tests/test_process_mutation.py @@ -1,4 +1,5 @@ """Test process mutation functions""" + import os from collections import namedtuple from pandas.testing import assert_frame_equal @@ -40,7 +41,7 @@ def test_format_maf(): class TestDtype: - def setup(self): + def setup_method(self): self.testdf = pd.DataFrame({"foo": [1], "bar": ["baz"]}) self.column_types = {"foo": "int64", "bar": "object"} self.mutation_path = "/path/test.csv" diff --git a/tests/test_transform.py b/tests/test_transform.py index 7e0d1ca7..12142163 100644 --- a/tests/test_transform.py +++ b/tests/test_transform.py @@ -1,4 +1,5 @@ """Test genie.transform module""" + import os from io import BytesIO from unittest import mock @@ -186,3 +187,60 @@ def test_that__convert_df_with_mixed_dtypes_catches_mixed_dtype_exception( ], any_order=False, ) + + +def get__convert_values_to_na_test_cases(): + return [ + { + "name": "single_col", + "input_df": pd.DataFrame({"col1": 
["N/A", "SAGE-SAGE-1", "-nan"]}), + "values_to_replace": ["N/A", "-nan"], + "columns_to_convert": ["col1"], + "expected_df": pd.DataFrame({"col1": [None, "SAGE-SAGE-1", None]}), + }, + { + "name": "multi_col", + "input_df": pd.DataFrame( + {"col1": ["N/A", "-nan", "1"], "col2": ["-nan", "N/A", "2"]} + ), + "values_to_replace": ["N/A", "-nan"], + "columns_to_convert": ["col1", "col2"], + "expected_df": pd.DataFrame( + {"col1": [None, None, "1"], "col2": [None, None, "2"]} + ), + }, + { + "name": "empty_cols", + "input_df": pd.DataFrame(columns=["col1", "col2"]), + "values_to_replace": ["NaN", "NA", "nan"], + "columns_to_convert": ["col1", "col2"], + "expected_df": pd.DataFrame(columns=["col1", "col2"]), + }, + { + "name": "empty df", + "input_df": pd.DataFrame(), + "values_to_replace": ["NaN"], + "columns_to_convert": ["col3"], + "expected_df": pd.DataFrame(), + }, + ] + + +@pytest.mark.parametrize( + "test_cases", get__convert_values_to_na_test_cases(), ids=lambda x: x["name"] +) +def test_that__convert_values_to_na_returns_expected(test_cases): + result = transform._convert_values_to_na( + test_cases["input_df"], + test_cases["values_to_replace"], + test_cases["columns_to_convert"], + ) + pd.testing.assert_frame_equal(result, test_cases["expected_df"]) + + +def test_that__convert_values_to_na_throws_key_error_if_nonexistent_cols(): + input_df = pd.DataFrame({"col1": ["N/A", "SAGE-SAGE-1", "-nan"]}) + values_to_replace = "NaN" + columns_to_convert = "col3" + with pytest.raises(KeyError): + transform._convert_values_to_na(input_df, values_to_replace, columns_to_convert) diff --git a/tests/test_validate.py b/tests/test_validate.py index 833c5ecb..b50b0e7e 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -1,4 +1,5 @@ """Tests validate.py""" + from unittest.mock import Mock, patch import pandas as pd diff --git a/tests/test_vcf.py b/tests/test_vcf.py index d8bbabf5..f508d778 100644 --- a/tests/test_vcf.py +++ b/tests/test_vcf.py @@ -1,8 +1,12 @@ import pandas as pd import pytest +from unittest.mock import mock_open, patch +from genie import transform from genie_registry.vcf import vcf +OPEN_BUILTIN = "builtins.open" + @pytest.fixture def vcf_class(syn): @@ -251,3 +255,73 @@ def test_validation_more_than_11_cols(vcf_class): "Only single sample or matched tumor normal vcf files are accepted.\n" ) assert warning == "" + + +def test_that__get_dataframe_throws_value_error_if_no_headers(vcf_class): + file = "CHROM\tALT\tREF\n" "TEST\t3845\tNA" + with patch(OPEN_BUILTIN, mock_open(read_data=file)): + with pytest.raises( + ValueError, match="Your vcf must start with the header #CHROM" + ): + vcf_class._get_dataframe(["some_path"]) + + +def test_that__get_dataframe_reads_in_correct_nas(vcf_class): + file = ( + "#CHROM\tALT\tREF\n" + "TEST\t3845\tNA\n" + "TEST\tNA\tnan\n" + "TEST\t3846\tN/A\n" + "NA\tnan\tNaN" + ) + with patch(OPEN_BUILTIN, mock_open(read_data=file)): + expected = pd.DataFrame( + { + "#CHROM": ["TEST", "TEST", "TEST", None], + "ALT": ["3845", None, "3846", None], + "REF": ["NA", "nan", None, "NaN"], + } + ) + vcf_df = vcf_class._get_dataframe(["some_path"]) + pd.testing.assert_frame_equal(vcf_df, expected) + + +@pytest.mark.parametrize( + "input,expected_columns", + [ + ( + pd.DataFrame( + { + "#CHROM": ["TEST"], + "ALT": ["3845"], + "REF": ["NA"], + } + ), + ["#CHROM", "ALT"], + ), + ( + pd.DataFrame( + { + "#CHROM": ["TEST"], + "ALT": ["3845"], + "rEf": ["NA"], + } + ), + ["#CHROM", "ALT", "rEf"], + ), + ], + ids=["with_allele_col", "no_allele_col"], +) +def 
From b281c5416196bb4d1cf10a3613f9e7033e841557 Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Tue, 13 Feb 2024 19:46:09 -0800
Subject: [PATCH 27/28] [GEN-1065] Use OIDC with pypi in genie package publishing (#550)

* test upload with pypi and oidc
* update to use build instead of setup.py
* check dist dir
* try merging build and publish
* add back in condition to deploy on release
* remove listing of dist dir
---
 .github/workflows/ci.yml | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 292f51d2..2dfd3558 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -61,6 +61,8 @@ jobs:
     needs: [test, lint]
     runs-on: ubuntu-latest
     if: github.event_name == 'release'
+    permissions:
+      id-token: write
     steps:
       - uses: actions/checkout@v2
       - name: Set up Python
@@ -70,11 +72,8 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install setuptools wheel twine
-      - name: Build and publish
-        env:
-          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
-          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
-        run: |
-          python setup.py sdist bdist_wheel
-          twine upload dist/*
+          pip install setuptools wheel build
+      - name: Build distributions
+        run: python -m build
+      - name: Publish to pypi
+        uses: pypa/gh-action-pypi-publish@release/v1

From 012e48c1e12292745d826c54595c4f479878663f Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Tue, 13 Feb 2024 20:02:38 -0800
Subject: [PATCH 28/28] update to release 16.2.0

---
 genie/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/genie/__init__.py b/genie/__init__.py
index 01049da6..e991f2c2 100644
--- a/genie/__init__.py
+++ b/genie/__init__.py
@@ -7,6 +7,6 @@
 # create version in __init__.py
 # https://packaging.python.org/en/latest/guides/single-sourcing-package-version/
 
-__version__ = "16.1.0"
+__version__ = "16.2.0"
 
 __all__ = ["__version__"]
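
Since the publish job above now relies on PyPI's OIDC trusted publishing, no long-lived token is needed to rehearse the build half of the release locally. Below is a hypothetical dry run, assuming the build and twine packages are installed; the upload itself only happens in CI via pypa/gh-action-pypi-publish.

    """Hypothetical local rehearsal of the release build; not part of the patch."""
    import glob
    import subprocess

    # Build sdist and wheel into dist/, like the new "Build distributions" step
    subprocess.run(["python", "-m", "build"], check=True)

    # Sanity-check the metadata of whatever was built; CI handles the actual upload
    artifacts = sorted(glob.glob("dist/*"))
    subprocess.run(["python", "-m", "twine", "check", *artifacts], check=True)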