diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 292f51d2..2dfd3558 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -61,6 +61,8 @@ jobs: needs: [test, lint] runs-on: ubuntu-latest if: github.event_name == 'release' + permissions: + id-token: write steps: - uses: actions/checkout@v2 - name: Set up Python @@ -70,11 +72,8 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install setuptools wheel twine - - name: Build and publish - env: - TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} - TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} - run: | - python setup.py sdist bdist_wheel - twine upload dist/* + pip install setuptools wheel build + - name: Build distributions + run: python -m build + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 96475f19..d13c99d5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -94,8 +94,14 @@ This package uses [semantic versioning](https://semver.org/) for releasing new v ### Testing +#### Running the test pipeline + +Make sure to run each of the [pipeline steps here](README.md#developing-locally) on the test pipeline and verify that your pipeline runs as expected. These steps are __not__ run automatically by GitHub Actions and have to be run manually. + #### Running tests +##### Tests in Python + This package uses [`pytest`](https://pytest.org/en/latest/) to run tests. The test code is located in the [tests](./tests) subdirectory. Here's how to run the test suite: @@ -104,7 +110,17 @@ Here's how to run the test suite: pytest -vs tests/ ``` -Tests are also run automatically by Github Actions on any pull request and are required to pass before merging. +Tests in Python are also run automatically by GitHub Actions on any pull request and are required to pass before merging. + +##### Tests in R + +This package uses [`testthat`](https://testthat.r-lib.org/) to run tests in R. The test code is located in the [testthat](./R/tests/testthat) subdirectory. + +Here's how to run the test suite: + +```shell +Rscript -e "testthat::test_dir('R/tests/testthat/')" +``` #### Test Development @@ -134,6 +150,17 @@ Follow gitflow best practices as linked above. 1. Merge `main` back into `develop` 1. Push `develop` -### DockerHub +### Modifying Docker + +Follow this section when modifying the [Dockerfile](https://github.com/Sage-Bionetworks/Genie/blob/main/Dockerfile) (a worked example follows this file's diff): + +1. Have your Synapse authentication token handy +1. ```docker build -f Dockerfile -t <image_name> .``` +1. ```docker run --rm -it -e SYNAPSE_AUTH_TOKEN=$YOUR_SYNAPSE_TOKEN <image_name>``` +1. Run the [test code](README.md#developing-locally) relevant to the Dockerfile changes to make sure your changes are present and working +1. Once the changes are tested, follow the [genie contributing guidelines](#developing) for adding them to the repo +1. Once merged to main, make sure the docker image was successfully deployed remotely (our docker image gets deployed automatically) [here](https://hub.docker.com/repository/docker/sagebionetworks/genie/builds) + +#### DockerHub This repository does not use github actions to push docker images. By adding the `sagebiodockerhub` github user as an Admin to this GitHub repository, we can configure an automated build in DockerHub. You can view the builds [here](https://hub.docker.com/repository/docker/sagebionetworks/genie/builds). To get admin access to the DockerHub repository, ask Sage IT to be added to the `genieadmin` DockerHub team.
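For reference, a concrete end-to-end pass through the "Modifying Docker" steps above might look like the sketch below. The image name `genie-test` is an illustrative placeholder (any local tag works), `$YOUR_SYNAPSE_TOKEN` is assumed to hold a valid Synapse personal access token, and the validation command is the first pipeline step from the README's "Developing locally" section:

```shell
# Build the image from the repo root, tagging it with a throwaway name
docker build -f Dockerfile -t genie-test .

# Start an interactive container with the Synapse token injected as an env var
docker run --rm -it -e SYNAPSE_AUTH_TOKEN=$YOUR_SYNAPSE_TOKEN genie-test

# Inside the container, run a pipeline step relevant to the Dockerfile change,
# e.g. validate files against the test project
python bin/input_to_database.py main --project_id syn7208886 --onlyValidate
```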
diff --git a/Dockerfile b/Dockerfile index f171542a..0df6fe30 100644 --- a/Dockerfile +++ b/Dockerfile @@ -41,7 +41,7 @@ RUN apt-get update && apt-get install -y --allow-unauthenticated --no-install-re # texlive-generic-recommended \ texlive-latex-extra \ # genome nexus - openjdk-8-jre \ + openjdk-11-jre \ # This is for reticulate python3.8-venv && \ apt-get clean && \ @@ -83,6 +83,6 @@ WORKDIR /root/ # Must move this git clone to after the install of Genie, # because must update cbioportal RUN git clone https://github.com/cBioPortal/cbioportal.git -b v5.3.19 -RUN git clone https://github.com/Sage-Bionetworks/annotation-tools.git -b 0.0.2 +RUN git clone https://github.com/Sage-Bionetworks/annotation-tools.git -b 0.0.4 WORKDIR /root/Genie diff --git a/R/dashboard_template_functions.R b/R/dashboard_template_functions.R new file mode 100644 index 00000000..0fe44323 --- /dev/null +++ b/R/dashboard_template_functions.R @@ -0,0 +1,41 @@ +# --------------------------------------------------------------------------- +# Title: dashboard_template_functions.R +# Description: This script contains helper functions used in +# templates/dashboardTemplate.Rmd +# --------------------------------------------------------------------------- + +#' This function gets the database to synapse id mapping table, +#' maps the provided database_name to its synapse id and returns it +#' +#' @param database_name (str) database name in database +#' to synapse id mapping table +#' @param database_synid_mappingid (str) synapse id of the database +#' to synapse id mapping table +#' +#' @return (str) synapse id of the mapped database name +get_syn_id_from_mapped_database <- function(database_name, database_synid_mappingid){ + database_synid_mapping = synTableQuery(sprintf('select * from %s', + database_synid_mappingid)) + database_synid_mappingdf = as.data.frame(database_synid_mapping) + table_synid = database_synid_mappingdf$Id[database_synid_mappingdf$Database == database_name] + return(table_synid) +} + +#' This function creates a table of failed annotation counts by grouped columns +#' @param maf_data (data.frame) input maf data frame +#' @param group_by_cols (str vector) list of columns to create counts by +#' @param counts_col_name (str) name to give to the counts column +#' +#' @return (data.frame) counts table +get_failed_annotation_table_counts <- function(maf_data, group_by_cols, counts_col_name){ + table_counts <- table(maf_data[maf_data$Annotation_Status == "FAILED", group_by_cols]) + + if (nrow(table_counts) == 0){ + counts_table <- data.frame(matrix(ncol = length(group_by_cols) + 1, nrow = 0)) + } else{ + counts_table <- as.data.frame(table_counts) + } + colnames(counts_table) <- c(group_by_cols, counts_col_name) + counts_table <- counts_table[do.call(order, counts_table[group_by_cols]), ] + return(counts_table) +} \ No newline at end of file diff --git a/R/tests/testthat/test_dashboard_template_functions.R b/R/tests/testthat/test_dashboard_template_functions.R new file mode 100644 index 00000000..ceb6aa6e --- /dev/null +++ b/R/tests/testthat/test_dashboard_template_functions.R @@ -0,0 +1,69 @@ +# tests for dashboard_template_functions.R + +source("../../dashboard_template_functions.R") + +library(synapser) +library(testthat) + +sample_counts_table <- function() { + data <- data.frame( + Center = factor(c("GOLD","SAGE", "TEST"), + levels = c("GOLD", "SAGE", "TEST")), + Counts = c(1, 2, 1) + ) + return(data) +} + +empty_counts_table <- function() { + data <- data.frame( + Center = logical(), + Counts = 
logical() + ) + return(data) +} + +sample_maf_table <- function() { + data <- data.frame( + Center = c("TEST", "TEST", "SAGE", "SAGE", "GOLD", "BRONZE"), + Tumor_Sample_Barcode = c("SAGE1", "SAGE2", "SAGE3", "SAGE4", "SAGE5", "SAGE6"), + Annotation_Status = c("SUCCESS", "FAILED", "FAILED", "FAILED", "FAILED", "SUCCESS") + ) + return(data) +} + +sample_maf_table_no_failed_annotations <- function() { + data <- data.frame( + Center = c("TEST", "SAGE", "GOLD"), + Tumor_Sample_Barcode = c("SAGE1", "SAGE2", "SAGE3"), + Annotation_Status = c("SUCCESS", "SUCCESS", "SUCCESS") + ) + return(data) +} + + +test_that("get_syn_id_from_mapped_database_gets_correct_value", { + synLogin() + result <- get_syn_id_from_mapped_database( + database_name = "main", + database_synid_mappingid = "syn11600968" + ) + expect_equal(result, "syn7208886") +}) + + +test_that("get_failed_annotation_table_counts_returns_expected_output", { + result <- get_failed_annotation_table_counts( + maf_data=sample_maf_table(), + group_by_cols="Center", + counts_col_name="Counts") + expect_equal(result, sample_counts_table()) +}) + +test_that("get_failed_annotation_table_counts_returns_empty_table_with_no_failed_annotations", { + result <- get_failed_annotation_table_counts( + maf_data=sample_maf_table_no_failed_annotations(), + group_by_cols="Center", + counts_col_name="Counts") + expect_equal(result, empty_counts_table()) +}) + diff --git a/README.md b/README.md index 736da68a..555bb925 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,6 @@ genie validate data_clinical_supp_SAGE.txt SAGE ``` - ## Contributing Please view [contributing guide](CONTRIBUTING.md) to learn how to contribute to the GENIE package. @@ -65,6 +64,16 @@ These are instructions on how you would develop and test the pipeline locally. pip install -r requirements-dev.txt ``` + If you are having trouble with the above, try installing via `pipenv`: + + 1. Specify a Python version that is supported by this repo: + ```pipenv --python <python_version>``` + + 1. [pipenv install from requirements file](https://docs.pipenv.org/en/latest/advanced.html#importing-from-requirements-txt) + + 1. Activate your `pipenv`: + ```pipenv shell``` + 1. Configure the Synapse client to authenticate to Synapse. 1. Create a Synapse [Personal Access token (PAT)](https://help.synapse.org/docs/Managing-Your-Account.2055405596.html#ManagingYourAccount-PersonalAccessTokens). 1. Add a `~/.synapseConfig` file @@ -83,33 +92,40 @@ These are instructions on how you would develop and test the pipeline locally. 1. Run the different pipelines on the test project. The `--project_id syn7208886` points to the test project. - 1. Validate all the files. + 1. Validate all the files **excluding vcf files**: ``` python bin/input_to_database.py main --project_id syn7208886 --onlyValidate ``` + 1. Validate **all** the files: + + ``` + python bin/input_to_database.py mutation --project_id syn7208886 --onlyValidate --genie_annotation_pkg ../annotation-tools + ``` + 1. Process all the files aside from the mutation (maf, vcf) files. The mutation processing was split because it takes at least 2 days to process all the production mutation data. Ideally, there would be a parameter to exclude or include file types to process/validate, but that is not implemented. ``` python bin/input_to_database.py main --project_id syn7208886 --deleteOld ``` - 1. Process the mutation data. Be sure to clone this repo: https://github.com/Sage-Bionetworks/annotation-tools. This repo houses the code that re-annotates the mutation data with genome nexus.
The `--createNewMafDatabase` will create a new mutation tables in the test project. This flag is necessary for production data for two main reasons: + 1. Process the mutation data. Be sure to clone this repo: https://github.com/Sage-Bionetworks/annotation-tools and `git checkout` the version of the repo pinned to the [Dockerfile](https://github.com/Sage-Bionetworks/Genie/blob/main/Dockerfile). This repo houses the code that re-annotates the mutation data with genome nexus. The `--createNewMafDatabase` flag will create a new mutation table in the test project. This flag is necessary for production data for two main reasons: * During processing of mutation data, the data is appended to the existing table, so without creating an empty table, duplicated data will be uploaded. * By design, Synapse Tables were meant to be appended to. When a Synapse Table is updated, it takes time to index the table and return results. This can cause problems for the pipeline when trying to query the mutation table. It is actually faster to create an entirely new table than to update or delete all rows and append new rows when dealing with millions of rows. + * If you run this more than once on the same day, you'll run into an issue with overwriting the narrow maf table as it already exists. Be sure to rename the current narrow maf database under `Tables` in the test Synapse project and try again. ``` python bin/input_to_database.py mutation --project_id syn7208886 --deleteOld --genie_annotation_pkg ../annotation-tools --createNewMafDatabase ``` - 1. Create a consortium release. Be sure to add the `--test` parameter. Be sure to clone the cbioportal repo: https://github.com/cBioPortal/cbioportal + 1. Create a consortium release. Be sure to add the `--test` parameter. Be sure to clone the cbioportal repo: https://github.com/cBioPortal/cbioportal and `git checkout` the version of the repo pinned to the [Dockerfile](https://github.com/Sage-Bionetworks/Genie/blob/main/Dockerfile) ``` python bin/database_to_staging.py Jan-2017 ../cbioportal TEST --test ``` - 1. Create a public release. Be sure to add the `--test` parameter. Be sure to clone the cbioportal repo: https://github.com/cBioPortal/cbioportal + 1. Create a public release. Be sure to add the `--test` parameter. Be sure to clone the cbioportal repo: https://github.com/cBioPortal/cbioportal and `git checkout` the version of the repo pinned to the [Dockerfile](https://github.com/Sage-Bionetworks/Genie/blob/main/Dockerfile) ``` python bin/consortium_to_public.py Jan-2017 ../cbioportal TEST --test diff --git a/genie/__init__.py b/genie/__init__.py index 01049da6..e991f2c2 100644 --- a/genie/__init__.py +++ b/genie/__init__.py @@ -7,6 +7,6 @@ # create version in __init__.py # https://packaging.python.org/en/latest/guides/single-sourcing-package-version/ -__version__ = "16.1.0" +__version__ = "16.2.0" __all__ = ["__version__"] diff --git a/genie/__main__.py b/genie/__main__.py index 3e5ad0ec..f9d21d17 100644 --- a/genie/__main__.py +++ b/genie/__main__.py @@ -44,49 +44,35 @@ def build_parser(): subparsers = parser.add_subparsers( title="commands", - description="The following commands are available:", - help='For additional help: "genie -h"', + description="The following commands are available: ", + help='For additional help use: "genie -h"', ) parser_validate = subparsers.add_parser( - "validate", help="Validates GENIE file formats" + "validate", help="Validates GENIE file formats. 
" ) parser_validate.add_argument( "filepath", type=str, nargs="+", - help="File(s) that you are validating." - "If you validation your clinical files and you have both sample and " - "patient files, you must provide both", + help="File(s) that you are validating. " + "If you have separate clinical sample and patient files, " + "you must provide both files when validating.", ) parser_validate.add_argument("center", type=str, help="Contributing Centers") - parser_validate.add_argument( - "--format_registry_packages", - type=str, - nargs="+", - default=["genie_registry"], - help="Python package name(s) to get valid file formats from (default: %(default)s).", - ) - - parser_validate.add_argument( - "--oncotree_link", type=str, help="Link to oncotree code" - ) - validate_group = parser_validate.add_mutually_exclusive_group() validate_group.add_argument( "--filetype", type=str, - help="By default, the validator uses the filename to match " + help="Use the --filetype {FILETYPE} parameter to ignore filename validation. " + "By default, the validator uses the filename to match " "the file format. If your filename is incorrectly named, " - "it will be invalid. If you know the file format you are " - "validating, you can ignore the filename validation and skip " - "to file content validation. " - "Note, the filetypes with SP at " - "the end are for special sponsored projects.", + "it will be invalid. " + "Options: [maf, vcf, clinical, assayinfo, bed, cna, sv, seg, mutationsInCis]", ) validate_group.add_argument( @@ -98,18 +84,39 @@ def build_parser(): "to this directory.", ) + parser_validate.add_argument( + "--oncotree_link", + type=str, + help="Specify an oncotree url when validating your clinical " + "file " + "(e.g: https://oncotree.info/api/tumorTypes/tree?version=oncotree_2021_11_02). " + "By default the oncotree version used will be specified in this entity: " + "syn13890902", + ) + + parser_validate.add_argument( + "--nosymbol-check", + action="store_true", + help="Ignores specific post-processing validation criteria related to HUGO symbols " + "in the structural variant and cna files.", + ) + # TODO: remove this default when private genie project is ready parser_validate.add_argument( "--project_id", type=str, default="syn3380222", - help="Synapse Project ID where data is stored. (default: %(default)s).", + help="FOR DEVELOPER USE ONLY: Synapse Project ID where data is stored. 
" + "(default: %(default)s).", ) parser_validate.add_argument( - "--nosymbol-check", - action="store_true", - help="Do not check hugo symbols of fusion and cna file", + "--format_registry_packages", + type=str, + nargs="+", + default=["genie_registry"], + help="FOR DEVELOPER USE ONLY: Python package name(s) to get valid file formats " + "from (default: %(default)s).", ) parser_validate.set_defaults(func=validate._perform_validate) diff --git a/genie/config.py b/genie/config.py index 51a09825..343ac990 100644 --- a/genie/config.py +++ b/genie/config.py @@ -1,4 +1,5 @@ """Configuration to obtain registry classes""" + import importlib import logging diff --git a/genie/create_case_lists.py b/genie/create_case_lists.py index da18a319..547bfe87 100644 --- a/genie/create_case_lists.py +++ b/genie/create_case_lists.py @@ -1,6 +1,7 @@ """ Creates case lists per cancer type """ + from collections import defaultdict import csv import os diff --git a/genie/dashboard_table_updater.py b/genie/dashboard_table_updater.py index 9dca7fa0..33a44d5f 100644 --- a/genie/dashboard_table_updater.py +++ b/genie/dashboard_table_updater.py @@ -1,4 +1,5 @@ """Updates dashboard tables""" + import argparse import datetime import logging @@ -347,9 +348,11 @@ def update_oncotree_code_tables(syn, database_mappingdf): oncotree_mapping = process_functions.get_oncotree_code_mappings(oncotree_link) clinicaldf["PRIMARY_CODES"] = [ - oncotree_mapping[i.upper()]["ONCOTREE_PRIMARY_NODE"] - if i.upper() in oncotree_mapping.keys() - else "DEPRECATED_CODE" + ( + oncotree_mapping[i.upper()]["ONCOTREE_PRIMARY_NODE"] + if i.upper() in oncotree_mapping.keys() + else "DEPRECATED_CODE" + ) for i in clinicaldf.ONCOTREE_CODE ] @@ -457,9 +460,9 @@ def update_sample_difference_table(syn, database_mappingdf): .applymap(int) ) - diff_between_releasesdf[ - ["Clinical", "Mutation", "CNV", "SEG", "Fusions"] - ] = new_values + diff_between_releasesdf[["Clinical", "Mutation", "CNV", "SEG", "Fusions"]] = ( + new_values + ) load._update_table( syn, diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 264b0937..c7e1b7c3 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -35,6 +35,73 @@ SV_CENTER_PATH = os.path.join(GENIE_RELEASE_DIR, "data_sv_%s.txt") BED_DIFFS_SEQASSAY_PATH = os.path.join(GENIE_RELEASE_DIR, "diff_%s.csv") +FULL_MAF_RELEASE_COLUMNS = [ + "Hugo_Symbol", + "Entrez_Gene_Id", + "Center", + "NCBI_Build", + "Chromosome", + "Start_Position", + "End_Position", + "Strand", + "Consequence", + "Variant_Classification", + "Variant_Type", + "Reference_Allele", + "Tumor_Seq_Allele1", + "Tumor_Seq_Allele2", + "dbSNP_RS", + "dbSNP_Val_Status", + "Tumor_Sample_Barcode", + "Matched_Norm_Sample_Barcode", + "Match_Norm_Seq_Allele1", + "Match_Norm_Seq_Allele2", + "Tumor_Validation_Allele1", + "Tumor_Validation_Allele2", + "Match_Norm_Validation_Allele1", + "Match_Norm_Validation_Allele2", + "Verification_Status", + "Validation_Status", + "Mutation_Status", + "Sequencing_Phase", + "Sequence_Source", + "Validation_Method", + "Score", + "BAM_File", + "Sequencer", + "t_ref_count", + "t_alt_count", + "n_ref_count", + "n_alt_count", + "HGVSc", + "HGVSp", + "HGVSp_Short", + "Transcript_ID", + "RefSeq", + "Protein_position", + "Codons", + "Exon_Number", + "gnomAD_AF", + "gnomAD_AFR_AF", + "gnomAD_AMR_AF", + "gnomAD_ASJ_AF", + "gnomAD_EAS_AF", + "gnomAD_FIN_AF", + "gnomAD_NFE_AF", + "gnomAD_OTH_AF", + "gnomAD_SAS_AF", + "FILTER", + "Polyphen_Prediction", + "Polyphen_Score", + "SIFT_Prediction", + "SIFT_Score", 
+ "SWISSPROT", + "n_depth", + "t_depth", + "Annotation_Status", + "mutationInCis_Flag", +] + # TODO: Add to transform.py def _to_redact_interval(df_col): @@ -755,10 +822,7 @@ def store_maf_files( with open(MUTATIONS_CENTER_PATH % center, "w"): pass used_entities = [] - # Must get the headers (because can't assume headers are the same order) maf_ent = syn.get(centerMafSynIdsDf.id[0]) - headerdf = pd.read_csv(maf_ent.path, sep="\t", comment="#", nrows=0) - column_order = headerdf.columns for _, mafSynId in enumerate(centerMafSynIdsDf.id): maf_ent = syn.get(mafSynId) logger.info(maf_ent.path) @@ -771,13 +835,12 @@ def store_maf_files( ) for mafchunk in mafchunks: - # Reorder column headers - mafchunk = mafchunk[column_order] # Get center for center staging maf # Configure maf configured_mafdf = configure_maf( mafchunk, remove_mafinbed_variants, flagged_mutationInCis_variants ) + configured_mafdf = configured_mafdf[FULL_MAF_RELEASE_COLUMNS] # Create maf for release merged_mafdf = remove_maf_samples( configured_mafdf, keep_for_merged_consortium_samples @@ -989,30 +1052,38 @@ def store_clinical_files( } clinicaldf["CANCER_TYPE"] = [ - oncotree_dict[code.upper()]["CANCER_TYPE"] - if code.upper() in oncotree_dict.keys() - else float("nan") + ( + oncotree_dict[code.upper()]["CANCER_TYPE"] + if code.upper() in oncotree_dict.keys() + else float("nan") + ) for code in clinicaldf["ONCOTREE_CODE"] ] clinicaldf["CANCER_TYPE_DETAILED"] = [ - oncotree_dict[code.upper()]["CANCER_TYPE_DETAILED"] - if code.upper() in oncotree_dict.keys() - else float("nan") + ( + oncotree_dict[code.upper()]["CANCER_TYPE_DETAILED"] + if code.upper() in oncotree_dict.keys() + else float("nan") + ) for code in clinicaldf["ONCOTREE_CODE"] ] clinicaldf["ONCOTREE_PRIMARY_NODE"] = [ - oncotree_dict[code.upper()]["ONCOTREE_PRIMARY_NODE"] - if code.upper() in oncotree_dict.keys() - else float("nan") + ( + oncotree_dict[code.upper()]["ONCOTREE_PRIMARY_NODE"] + if code.upper() in oncotree_dict.keys() + else float("nan") + ) for code in clinicaldf["ONCOTREE_CODE"] ] clinicaldf["ONCOTREE_SECONDARY_NODE"] = [ - oncotree_dict[code.upper()]["ONCOTREE_SECONDARY_NODE"] - if code.upper() in oncotree_dict.keys() - else float("nan") + ( + oncotree_dict[code.upper()]["ONCOTREE_SECONDARY_NODE"] + if code.upper() in oncotree_dict.keys() + else float("nan") + ) for code in clinicaldf["ONCOTREE_CODE"] ] @@ -1023,9 +1094,11 @@ def store_clinical_files( # descriptions can match clinicaldf["AGE_AT_SEQ_REPORT_DAYS"] = clinicaldf["AGE_AT_SEQ_REPORT"] clinicaldf["AGE_AT_SEQ_REPORT"] = [ - int(math.floor(int(float(age)) / 365.25)) - if process_functions.checkInt(age) - else age + ( + int(math.floor(int(float(age)) / 365.25)) + if process_functions.checkInt(age) + else age + ) for age in clinicaldf["AGE_AT_SEQ_REPORT"] ] clinicaldf["AGE_AT_SEQ_REPORT"][clinicaldf["AGE_AT_SEQ_REPORT"] == ">32485"] = ">89" diff --git a/genie/example_filetype_format.py b/genie/example_filetype_format.py index b02bd983..a568f87c 100644 --- a/genie/example_filetype_format.py +++ b/genie/example_filetype_format.py @@ -1,6 +1,7 @@ """TODO: Rename this to model.py This contains the GENIE model objects """ + from abc import ABCMeta from dataclasses import dataclass import logging diff --git a/genie/load.py b/genie/load.py index b9ceea5d..8b9fe26c 100644 --- a/genie/load.py +++ b/genie/load.py @@ -2,6 +2,7 @@ This module contains all the functions that stores data to Synapse """ + import logging import os import time diff --git a/genie/process_functions.py b/genie/process_functions.py 
index e45d3c90..d07d6b8d 100644 --- a/genie/process_functions.py +++ b/genie/process_functions.py @@ -1,4 +1,5 @@ """Processing functions that are used in the GENIE pipeline""" + import datetime import json import logging @@ -948,3 +949,35 @@ def create_new_fileformat_table( "newdb_mappingdf": newdb_mappingdf, "moved_ent": moved_ent, } + + +def create_missing_columns(dataset: pd.DataFrame, schema: dict) -> pd.DataFrame: + """Creates and fills missing columns with the relevant NA value for the + given data type. Note that special handling is needed to allow NAs in + integer-based columns in pandas: the column is converted to Int64 + (the pandas nullable integer data type) + + Args: + dataset (pd.DataFrame): input dataset to fill missing columns for + schema (dict): the expected schema {column_name(str): data_type(str)} + for the input dataset + + Returns: + pd.DataFrame: updated dataset + """ + missing_values = { + "string": "", + "integer": None, + "float": float("nan"), + "boolean": None, + } + for column, data_type in schema.items(): + if column not in dataset.columns: + dataset = dataset.assign(**{column: missing_values[data_type]}) + + # only way to preserve NAs for these specific dtype columns + if data_type == "integer": + dataset[column] = dataset[column].astype("Int64") + elif data_type == "boolean": + dataset[column] = dataset[column].astype(pd.BooleanDtype()) + return dataset[list(schema.keys())] diff --git a/genie/process_mutation.py b/genie/process_mutation.py index 2c747057..2d54617d 100644 --- a/genie/process_mutation.py +++ b/genie/process_mutation.py @@ -1,5 +1,7 @@ """Process mutation files TODO deprecate this module and spread functions around""" + +from collections import namedtuple import logging import os import shutil @@ -10,7 +12,7 @@ import pandas as pd from synapseclient import Synapse -from . import load, process_functions +from . 
import extract, load, process_functions logger = logging.getLogger(__name__) @@ -201,23 +203,28 @@ def process_mutation_workflow( logger.info("No mutation data") return None # Certificate to use GENIE Genome Nexus + syn.get( "syn22053204", ifcollision="overwrite.local", downloadLocation=genie_config["genie_annotation_pkg"], + # version=1, # TODO: This should pull from a config file in the future ) + # Genome Nexus Jar file syn.get( "syn22084320", ifcollision="overwrite.local", downloadLocation=genie_config["genie_annotation_pkg"], + # version=13, # TODO: This should pull from a config file in the future ) - annotated_maf_path = annotate_mutation( + annotation_paths = create_annotation_paths(center=center, workdir=workdir) + annotate_mutation( + annotation_paths=annotation_paths, center=center, mutation_files=valid_mutation_files, genie_annotation_pkg=genie_config["genie_annotation_pkg"], - workdir=workdir, ) maf_tableid = genie_config["vcf2maf"] @@ -227,18 +234,165 @@ def process_mutation_workflow( syn=syn, center=center, maf_tableid=maf_tableid, - annotated_maf_path=annotated_maf_path, + annotation_paths=annotation_paths, flatfiles_synid=flatfiles_synid, - workdir=workdir, ) - return annotated_maf_path + full_error_report = concat_annotation_error_reports( + center=center, + input_dir=annotation_paths.error_dir, + ) + check_annotation_error_reports( + syn=syn, + maf_table_synid=maf_tableid, + full_error_report=full_error_report, + center=center, + ) + store_annotation_error_reports( + full_error_report=full_error_report, + full_error_report_path=annotation_paths.full_error_report_path, + syn=syn, + errors_folder_synid=genie_config["center_config"][center]["errorsSynId"], + ) + return annotation_paths.merged_maf_path + + +def create_annotation_paths(center: str, workdir: str) -> namedtuple: + """Creates the filepaths required in the annotation process + + Args: + center (str): name of the center + workdir (str): work directory to create paths in + + Returns: + namedtuple: tuple with all the paths + """ + input_files_dir = tempfile.mkdtemp(dir=workdir) + output_files_dir = tempfile.mkdtemp(dir=workdir) + Filepaths = namedtuple( + "Filepaths", + [ + "input_files_dir", + "output_files_dir", + "error_dir", + "merged_maf_path", + "full_maf_path", + "narrow_maf_path", + "full_error_report_path", + ], + ) + annotation_paths = Filepaths( + input_files_dir=input_files_dir, + output_files_dir=output_files_dir, + error_dir=os.path.join(output_files_dir, f"{center}_error_reports"), + merged_maf_path=os.path.join( + output_files_dir, f"data_mutations_extended_{center}.txt" + ), + full_maf_path=os.path.join( + workdir, center, "staging", f"data_mutations_extended_{center}.txt" + ), + narrow_maf_path=os.path.join( + workdir, + center, + "staging", + f"data_mutations_extended_{center}_MAF_narrow.txt", + ), + full_error_report_path=os.path.join( + workdir, + center, + "staging", + f"failed_annotations_error_report.txt", + ), + ) + return annotation_paths + + +def concat_annotation_error_reports( + center: str, + input_dir: str, +) -> pd.DataFrame: + """Concatenates the annotation error reports + + Args: + center (str): name of center associated with error report + input_dir (str): directory where error reports are + Returns: + pd.DataFrame: full annotation error report + """ + error_files = os.listdir(input_dir) + chunk_size = 10000 + error_reports = [] + + # Read and concatenate TSV files in chunks + for file in error_files: + for chunk in pd.read_csv( + os.path.join(input_dir, file), sep="\t", 
chunksize=chunk_size + ): + error_reports.append(chunk) + full_error_report = pd.concat(error_reports) + full_error_report["Center"] = center + return full_error_report + + +def check_annotation_error_reports( + syn: Synapse, maf_table_synid: str, full_error_report: pd.DataFrame, center: str +) -> None: + """A simple QC check to make sure our genome nexus error report's + failed annotations match our final processed maf table's failed + annotations + + Args: + syn (Synapse): synapse client + maf_table_synid (str): synapse_id of the narrow maf table + full_error_report (pd.DataFrame): the failed annotations error report + center (str): the center this is for + + """ + maf_table_df = extract.get_syntabledf( + syn=syn, + query_string=( + f"SELECT * FROM {maf_table_synid} " + f"WHERE Center = '{center}' AND " + "Annotation_Status = 'FAILED'" + ), + ) + if len(maf_table_df) != len(full_error_report): + logger.warning( + "Genome nexus's failed annotations error report rows don't match " + f"maf table's failed annotations for {center}" + ) + + +def store_annotation_error_reports( + full_error_report: pd.DataFrame, + full_error_report_path: str, + syn: Synapse, + errors_folder_synid: str, +) -> None: + """Stores the annotation error reports to synapse + + Args: + full_error_report (pd.DataFrame): full error report to store + full_error_report_path (str): where to store the flat file of the full error report + syn (synapseclient.Synapse): synapse client object + errors_folder_synid (str): synapse id of error report folder + to store reports in + """ + full_error_report.to_csv(full_error_report_path, sep="\t", index=False) + load.store_file( + syn=syn, + filepath=full_error_report_path, + parentid=errors_folder_synid, + ) # TODO: add to transform def annotate_mutation( - center: str, mutation_files: list, genie_annotation_pkg: str, workdir: str -) -> str: + annotation_paths: namedtuple, + mutation_files: list, + genie_annotation_pkg: str, + center: str, +) -> None: """Process vcf/maf files Args: @@ -249,21 +403,16 @@ def annotate_mutation( Returns: Path to final maf """ - input_files_dir = tempfile.mkdtemp(dir=workdir) - output_files_dir = tempfile.mkdtemp(dir=workdir) - for mutation_file in mutation_files: - move_mutation(mutation_file, input_files_dir) + move_mutation(mutation_file, annotation_paths.input_files_dir) - merged_maf_path = os.path.join( - output_files_dir, f"data_mutations_extended_{center}.txt" - ) annotater_cmd = [ "bash", os.path.join(genie_annotation_pkg, "annotation_suite_wrapper.sh"), - f"-i={input_files_dir}", - f"-o={output_files_dir}", - f"-m={merged_maf_path}", + f"-i={annotation_paths.input_files_dir}", + f"-o={annotation_paths.output_files_dir}", + f"-e={annotation_paths.error_dir}", + f"-m={annotation_paths.merged_maf_path}", f"-c={center}", "-s=WXS", f"-p={genie_annotation_pkg}", ] subprocess.check_call(annotater_cmd) - return merged_maf_path - # TODO: add to transform def append_or_createdf(dataframe: pd.DataFrame, filepath: str): @@ -318,9 +465,8 @@ def split_and_store_maf( syn: Synapse, center: str, maf_tableid: str, - annotated_maf_path: str, + annotation_paths: namedtuple, flatfiles_synid: str, - workdir: str, ): """Separates annotated maf file into narrow and full maf and stores them Args: syn: Synapse connection center: Center maf_tableid: Mutation table synapse id - annotated_maf_path: Annotated maf + annotation_paths: filepaths in the annotation process flatfiles_synid: GENIE flat 
files folder """ @@ -337,22 +483,19 @@ def split_and_store_maf( for col in syn.getTableColumns(maf_tableid) if col["name"] != "inBED" ] - full_maf_path = os.path.join( - workdir, center, "staging", f"data_mutations_extended_{center}.txt" - ) - narrow_maf_path = os.path.join( - workdir, center, "staging", f"data_mutations_extended_{center}_MAF_narrow.txt" - ) maf_chunks = pd.read_csv( - annotated_maf_path, sep="\t", chunksize=100000, comment="#" + annotation_paths.merged_maf_path, sep="\t", chunksize=100000, comment="#" ) - for maf_chunk in maf_chunks: maf_chunk = format_maf(maf_chunk, center) - append_or_createdf(maf_chunk, full_maf_path) + append_or_createdf(maf_chunk, annotation_paths.full_maf_path) narrow_maf_chunk = maf_chunk[narrow_maf_cols] - append_or_createdf(narrow_maf_chunk, narrow_maf_path) + append_or_createdf(narrow_maf_chunk, annotation_paths.narrow_maf_path) - load.store_table(syn=syn, filepath=narrow_maf_path, tableid=maf_tableid) + load.store_table( + syn=syn, filepath=annotation_paths.narrow_maf_path, tableid=maf_tableid + ) # Store MAF flat file into synapse - load.store_file(syn=syn, filepath=full_maf_path, parentid=flatfiles_synid) + load.store_file( + syn=syn, filepath=annotation_paths.full_maf_path, parentid=flatfiles_synid + ) diff --git a/genie/transform.py b/genie/transform.py index 6145115d..faed45ee 100644 --- a/genie/transform.py +++ b/genie/transform.py @@ -1,5 +1,7 @@ """This module contains all the transformation functions used throughout the GENIE package""" + +from typing import List import warnings import pandas as pd @@ -64,3 +66,24 @@ def _convert_df_with_mixed_dtypes(read_csv_params: dict) -> pd.DataFrame: df = pd.read_csv(**read_csv_params, low_memory=False, engine="c") warnings.resetwarnings() return df + + +def _convert_values_to_na( + input_df: pd.DataFrame, values_to_replace: List[str], columns_to_convert: List[str] +) -> pd.DataFrame: + """Converts given values to NA in an input dataset + + Args: + input_df (pd.DataFrame): input dataset + values_to_replace (List[str]): string values to replace with na + columns_to_convert (List[str]): subset of columns to convert with na in + + Returns: + pd.DataFrame: dataset with specified values replaced with NAs + """ + if not input_df.empty: + replace_mapping = {value: None for value in values_to_replace} + input_df[columns_to_convert] = input_df[columns_to_convert].replace( + replace_mapping + ) + return input_df diff --git a/genie/validate.py b/genie/validate.py index 5bf0f17a..af6de306 100644 --- a/genie/validate.py +++ b/genie/validate.py @@ -95,8 +95,13 @@ def validate_single_file(self, **kwargs): valid: Boolean value of validation status """ if self.file_type not in self._format_registry: + allowed_filetypes = list(self._format_registry.keys()) + error_message = ( + f"Your filename is incorrect! Please change your filename before you run the validator or specify --filetype if you are running the validator locally. " + f"If specifying filetype, options are: [{', '.join(allowed_filetypes)}]\n" + ) valid_result_cls = example_filetype_format.ValidationResults( - errors="Your filename is incorrect! 
Please change your filename before you run the validator or specify --filetype if you are running the validator locally", + errors=error_message, warnings="", ) else: diff --git a/genie/write_invalid_reasons.py b/genie/write_invalid_reasons.py index 704697b9..f16f1ad9 100644 --- a/genie/write_invalid_reasons.py +++ b/genie/write_invalid_reasons.py @@ -1,4 +1,5 @@ """Write invalid reasons""" + import logging import os diff --git a/genie_registry/__init__.py b/genie_registry/__init__.py index c4a6595f..e730a9da 100644 --- a/genie_registry/__init__.py +++ b/genie_registry/__init__.py @@ -1,4 +1,5 @@ """Initialize GENIE registry""" + # Import logging last to not take in synapseclient logging import logging diff --git a/genie_registry/assay.py b/genie_registry/assay.py index bdfea502..33362ddc 100644 --- a/genie_registry/assay.py +++ b/genie_registry/assay.py @@ -1,4 +1,5 @@ """Assay information class""" + import os import yaml diff --git a/genie_registry/bed.py b/genie_registry/bed.py index c1f7680f..7b7294c4 100644 --- a/genie_registry/bed.py +++ b/genie_registry/bed.py @@ -1,4 +1,5 @@ """GENIE bed class and functions""" + import os import logging import subprocess diff --git a/genie_registry/clinical.py b/genie_registry/clinical.py index 014b0b55..e5a4244c 100644 --- a/genie_registry/clinical.py +++ b/genie_registry/clinical.py @@ -1,4 +1,5 @@ """Clinical file format validation and processing""" + # from __future__ import annotations import datetime from io import StringIO @@ -1012,28 +1013,45 @@ def _get_dataframe(self, filePathList): return clinicaldf - def _cross_validate_bed_files_exist(self, clinicaldf) -> tuple: + def _cross_validate_bed_files_exist(self, clinicaldf) -> list: """Check that a bed file exist per SEQ_ASSAY_ID value in clinical file""" - errors = "" - warnings = "" missing_files = [] - seq_assay_ids = clinicaldf["SEQ_ASSAY_ID"].unique().tolist() + exception_params = {"ignore_case": True, "allow_underscore": True} + + # standardize and get unique seq assay ids before searching bed files + seq_assay_ids = set( + [ + validate.standardize_string_for_validation(sq_id, **exception_params) + for sq_id in clinicaldf["SEQ_ASSAY_ID"].unique() + ] + ) for seq_assay_id in seq_assay_ids: bed_files = validate.parse_file_info_in_nested_list( nested_list=self.ancillary_files, search_str=f"{seq_assay_id}.bed", # type: ignore[arg-type] - ignore_case=True, - allow_underscore=True, + **exception_params, ) if not bed_files["files"]: - missing_files.append(f"{seq_assay_id}.bed") + missing_files.append(f"{seq_assay_id.upper()}.bed") + return missing_files + + def _cross_validate_bed_files_exist_message(self, missing_bed_files: list) -> tuple: + """Gets the warning/error messages given the missing bed files list - if missing_files: + Args: + missing_bed_files (list): list of missing bed files + + Returns: + tuple: error + warning + """ + errors = "" + warnings = "" + if missing_bed_files: errors = ( "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. 
" "Please update your file(s) to be consistent.\n" - f"Missing BED files: {', '.join(missing_files)}\n" + f"Missing BED files: {', '.join(missing_bed_files)}\n" ) return errors, warnings @@ -1087,7 +1105,10 @@ def _cross_validate(self, clinicaldf) -> tuple: errors_assay, warnings_assay = self._cross_validate_assay_info_has_seq( clinicaldf ) - errors_bed, warnings_bed = self._cross_validate_bed_files_exist(clinicaldf) + missing_bed_files = self._cross_validate_bed_files_exist(clinicaldf) + errors_bed, warnings_bed = self._cross_validate_bed_files_exist_message( + missing_bed_files + ) errors = errors_assay + errors_bed warnings = warnings_assay + warnings_bed diff --git a/genie_registry/maf.py b/genie_registry/maf.py index 3adcc2ce..aeb8246a 100644 --- a/genie_registry/maf.py +++ b/genie_registry/maf.py @@ -1,6 +1,7 @@ from io import StringIO -import os import logging +import os +from typing import List import pandas as pd @@ -198,10 +199,6 @@ def _validate(self, mutationDF): for col in numerical_cols: col_exists = process_functions.checkColExist(mutationDF, col) if col_exists: - # Since NA is an allowed value, when reading in the dataframe - # the 'NA' string is not converted. This will convert all - # 'NA' values in the numerical columns into actual float('nan') - mutationDF.loc[mutationDF[col] == "NA", col] = float("nan") # Attempt to convert column to float try: mutationDF[col] = mutationDF[col].astype(float) @@ -352,13 +349,38 @@ def _cross_validate(self, mutationDF: pd.DataFrame) -> tuple: ) return errors, warnings - def _get_dataframe(self, filePathList): - """Get mutation dataframe""" - # Must do this because pandas.read_csv will allow for a file to - # have more column headers than content. E.g. - # A,B,C,D,E - # 1,2 - # 2,3 + def _get_dataframe(self, filePathList: List[str]) -> pd.DataFrame: + """Get mutation dataframe + + 1) Starts reading the first line in the file + 2) Skips lines that starts with # + 3) Reads in second line + 4) Checks that first line fields matches second line. Must do this because + pandas.read_csv will allow for a file to have more column headers than content. 
+ E.g.: A,B,C,D,E + 1,2 + 2,3 + + 5) We keep the 'NA', 'nan', and 'NaN' as strings in the data because + these are valid allele values, + and then convert the ones in the non-allele columns back to actual NAs + + NOTE: Because allele columns are case-insensitive in maf data, we must + standardize the case of the columns when checking for the non-allele columns + to convert the NA strings to NAs + + NOTE: This code allows empty dataframes to pass through + without errors + + Args: + filePathList (List[str]): list of filepath(s) + + Raises: + ValueError: first line's fields don't match the second line's fields in the file + + Returns: + pd.DataFrame: mutation data + """ with open(filePathList[0], "r") as maf_f: firstline = maf_f.readline() if firstline.startswith("#"): @@ -370,28 +392,26 @@ def _get_dataframe(self, filePathList): "Number of fields in a line do not match the " "expected number of columns" ) + read_csv_params = { "filepath_or_buffer": filePathList[0], "sep": "\t", "comment": "#", - # Keep the value 'NA' + "keep_default_na": False, "na_values": [ "-1.#IND", "1.#QNAN", "1.#IND", "-1.#QNAN", "#N/A N/A", - "NaN", "#N/A", "N/A", "#NA", "NULL", "-NaN", - "nan", "-nan", "", ], - "keep_default_na": False, # This is to check if people write files # with R, quote=T "quoting": 3, # Skipping blank lines since last line in the # validator will cause the file to fail "skip_blank_lines": False, } + mutationdf = transform._convert_df_with_mixed_dtypes(read_csv_params) + + mutationdf = transform._convert_values_to_na( + input_df=mutationdf, + values_to_replace=["NA", "nan", "NaN"], + columns_to_convert=[ + col + for col in mutationdf.columns + if col.upper() not in self._allele_cols + ], + ) return mutationdf diff --git a/genie_registry/vcf.py b/genie_registry/vcf.py index cf381086..eb21a36b 100644 --- a/genie_registry/vcf.py +++ b/genie_registry/vcf.py @@ -1,10 +1,11 @@ import logging import os +from typing import List import pandas as pd from genie.example_filetype_format import FileTypeFormat -from genie import process_functions, validate +from genie import process_functions, transform, validate logger = logging.getLogger(__name__) @@ -28,7 +29,25 @@ def _validateFilename(self, filePath): endswith_vcf = basename.endswith(".vcf") assert startswith_genie and endswith_vcf - def _get_dataframe(self, filePathList): + def _get_dataframe(self, filePathList: List[str]) -> pd.DataFrame: + """Get mutation dataframe + + 1) Looks for the line in the file starting with #CHROM, which will be + the header line (columns). 
+ + 2) When reading in the data, we keep the 'NA', 'nan', and 'NaN' + as strings in the data because these are valid allele values, + and then convert the ones in the non-allele columns back to actual NAs + + Args: + filePathList (List[str]): list of filepath(s) + + Raises: + ValueError: when no line starting with #CHROM exists in the file + + Returns: + pd.DataFrame: mutation data + """ headers = None filepath = filePathList[0] with open(filepath, "r") as vcffile: @@ -38,10 +57,37 @@ def _get_dataframe(self, filePathList): break if headers is not None: vcfdf = pd.read_csv( - filepath, sep="\t", comment="#", header=None, names=headers + filepath, + sep="\t", + comment="#", + header=None, + names=headers, + keep_default_na=False, + na_values=[ + "-1.#IND", + "1.#QNAN", + "1.#IND", + "-1.#QNAN", + "#N/A N/A", + "#N/A", + "N/A", + "#NA", + "NULL", + "-NaN", + "-nan", + "", + ], ) else: raise ValueError("Your vcf must start with the header #CHROM") + + vcfdf = transform._convert_values_to_na( + input_df=vcfdf, + values_to_replace=["NA", "nan", "NaN"], + columns_to_convert=[ + col for col in vcfdf.columns if col not in self._allele_cols + ], + ) return vcfdf def process_steps(self, df): diff --git a/setup.py b/setup.py index ed8415b3..4c8d7f3c 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,5 @@ """genie package setup""" + from setuptools import setup setup() diff --git a/templates/dashboardTemplate.Rmd b/templates/dashboardTemplate.Rmd index 2f5558df..bb984c21 100644 --- a/templates/dashboardTemplate.Rmd +++ b/templates/dashboardTemplate.Rmd @@ -30,6 +30,8 @@ suppressMessages(library(RColorBrewer)) suppressMessages(library(jsonlite)) suppressMessages(library(knitr)) +source("R/dashboard_template_functions.R") + createCenterColumn <- function(clinicalDf) { if (is.null(clinicalDf$CENTER)) { centers = unlist( @@ -401,6 +403,40 @@ For patient retractions submitted between these months, the records will be remo * April-June * October-December +--- + +### Genome Nexus failed annotations summary + +The table below displays the number of failed annotations per center. If a center has no failed annotations, no record will appear in the table for that center. + +```{r} +maf_table_synid = get_syn_id_from_mapped_database( + database_name="vcf2maf", + database_synid_mappingid = database_synid_mappingid +) + +# get narrow maf table +maf_table <- as.data.frame(synTableQuery(sprintf('select * from %s', maf_table_synid))) + +counts_table <- get_failed_annotation_table_counts( + maf_table, + group_by_cols = "Center", + counts_col_name = "Number of failed annotations" + ) +knitr::kable(counts_table, col.names = c("Center", "Number of failed annotations")) + +# get project page on synapse +main_syn_id <- get_syn_id_from_mapped_database( + database_name="main", + database_synid_mappingid = database_synid_mappingid +) +``` + +Follow this navigation guide from the `r paste0("[Synapse project files page](https://www.synapse.org/#!Synapse:", main_syn_id, "/files/){target='_blank'}")` to find your center's detailed failed annotations error report. + +Files → Centers → [Center name] → Errors → failed_annotations_error_report.txt + +View the version comment column in Synapse for your report to find the version associated with this release. 
--- diff --git a/tests/conftest.py b/tests/conftest.py index 4027fa3b..90c47769 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -51,6 +51,7 @@ def genie_config(): "center": "SAGE", "inputSynId": "syn11601335", "stagingSynId": "syn11601337", + "errorsSynId": "syn53239079", "release": True, "mutationInCisFilter": "ON", }, @@ -58,6 +59,15 @@ def genie_config(): "center": "TEST", "inputSynId": "syn11601340", "stagingSynId": "syn11601342", + "errorsSynId": "syn53239081", + "release": True, + "mutationInCisFilter": "ON", + }, + "GOLD": { + "center": "GOLD", + "inputSynId": "syn52950860", + "stagingSynId": "syn52950861", + "errorsSynId": "syn53239077", "release": True, "mutationInCisFilter": "ON", }, diff --git a/tests/test_assay.py b/tests/test_assay.py index 20d594b7..691c693e 100644 --- a/tests/test_assay.py +++ b/tests/test_assay.py @@ -1,4 +1,5 @@ """Test assay information validation and processing""" + import copy from unittest.mock import patch diff --git a/tests/test_bed.py b/tests/test_bed.py index b720faa8..973c7ea3 100644 --- a/tests/test_bed.py +++ b/tests/test_bed.py @@ -1,4 +1,5 @@ """Test GENIE Bed class""" + import tempfile import shutil from unittest import mock diff --git a/tests/test_clinical.py b/tests/test_clinical.py index 92c4ec30..91815c97 100644 --- a/tests/test_clinical.py +++ b/tests/test_clinical.py @@ -1,3 +1,4 @@ +from collections import Counter import datetime import json from unittest import mock @@ -1061,65 +1062,76 @@ def test__check_int_dead_consistency_inconsistent(inconsistent_df): ) -@pytest.mark.parametrize( - "test_clinical_df,test_ancillary_files,expected_error,expected_warning", - [ - ( - pd.DataFrame( - {"SEQ_ASSAY_ID": ["SAGE-1-1", "SAGE-SAGE-1", "SAGE-1", "SAGE-1"]} +def get_cross_validate_bed_files_test_cases(): + return [ + { + "name": "all_match", + "test_clinical_df": pd.DataFrame( + { + "SEQ_ASSAY_ID": [ + "SAGE-1-1", + "SAGE-SAGE-1", + "SAGE-1", + "SAGE-1", + "SaGe-1", + ] + } ), - [ + "test_ancillary_files": [ [{"name": "SAGE-SAGE-1.bed", "path": ""}], [{"name": "SAGE-1-1.bed", "path": ""}], [{"name": "SAGE-1.bed", "path": ""}], ], - "", - "", - ), - ( - pd.DataFrame({"SEQ_ASSAY_ID": ["SAGE-1-1", "SAGE-1-2"]}), - [ + "expected_missing_files": [], + }, + { + "name": "partial_match", + "test_clinical_df": pd.DataFrame( + {"SEQ_ASSAY_ID": ["SAGE-1-1", "SAGE-1-2", "SaGe-1_1"]} + ), + "test_ancillary_files": [ [{"name": "SAGE-SAGE-1.bed", "path": ""}], [{"name": "SAGE-1-1.bed", "path": ""}], [{"name": "SAGE-1.bed", "path": ""}], ], - "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. " - "Please update your file(s) to be consistent.\n" - "Missing BED files: SAGE-1-2.bed\n", - "", - ), - ( - pd.DataFrame({"SEQ_ASSAY_ID": ["SAGE-1-2", "SAGE-1-3"]}), - [ + "expected_missing_files": ["SAGE-1-2.bed"], + }, + { + "name": "no_match", + "test_clinical_df": pd.DataFrame( + {"SEQ_ASSAY_ID": ["SAGE-1-2", "SAGE-1-3", "SaGe_1_2"]} + ), + "test_ancillary_files": [ [{"name": "SAGE-SAGE-1.bed", "path": ""}], [{"name": "SAGE-1-1.bed", "path": ""}], [{"name": "SAGE-1.bed", "path": ""}], ], - "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. 
" - "Please update your file(s) to be consistent.\n" - "Missing BED files: SAGE-1-2.bed, SAGE-1-3.bed\n", - "", - ), - ( - pd.DataFrame({"SEQ_ASSAY_ID": ["SAGE-1-2", "SAGE-1-3"]}), - [ + "expected_missing_files": ["SAGE-1-2.bed", "SAGE-1-3.bed"], + }, + { + "name": "no_bed_files", + "test_clinical_df": pd.DataFrame( + {"SEQ_ASSAY_ID": ["SAGE-1-2", "SAGE-1-3", "SAge-1_2"]} + ), + "test_ancillary_files": [ [{"name": "SAGE-1.txt", "path": ""}], ], - "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. " - "Please update your file(s) to be consistent.\n" - "Missing BED files: SAGE-1-2.bed, SAGE-1-3.bed\n", - "", - ), - ], - ids=["all_match", "partial_match", "no_match", "no_bed_files"], + "expected_missing_files": ["SAGE-1-2.bed", "SAGE-1-3.bed"], + }, + ] + + +@pytest.mark.parametrize( + "test_cases", get_cross_validate_bed_files_test_cases(), ids=lambda x: x["name"] ) def test_that_cross_validate_bed_files_exist_returns_correct_msgs( - clin_class, test_clinical_df, test_ancillary_files, expected_error, expected_warning + clin_class, test_cases ): - clin_class.ancillary_files = test_ancillary_files - errors, warnings = clin_class._cross_validate_bed_files_exist(test_clinical_df) - assert errors == expected_error - assert warnings == expected_warning + clin_class.ancillary_files = test_cases["test_ancillary_files"] + missing_files = clin_class._cross_validate_bed_files_exist( + test_cases["test_clinical_df"] + ) + assert Counter(test_cases["expected_missing_files"]) == Counter(missing_files) def test_that_cross_validate_bed_files_exist_calls_expected_methods(clin_class): @@ -1138,29 +1150,62 @@ def test_that_cross_validate_bed_files_exist_calls_expected_methods(clin_class): clin_class._cross_validate_bed_files_exist(test_clinical_df) patch_parse_file_info.assert_called_once_with( nested_list=clin_class.ancillary_files, - search_str="SAGE-SAGE-1.bed", + search_str="sage-sage-1.bed", ignore_case=True, allow_underscore=True, ) +@pytest.mark.parametrize( + "missing_files,expected_error,expected_warning", + [ + ( + [], + "", + "", + ), + ( + ["test1.bed", "test2.bed"], + "At least one SEQ_ASSAY_ID in your clinical file does not have an associated BED file. 
" + "Please update your file(s) to be consistent.\n" + "Missing BED files: test1.bed, test2.bed\n", + "", + ), + ], + ids=["no_missing_files", "missing_files"], +) +def test_that_cross_validate_bed_files_exist_message_returns_correct_msgs( + clin_class, missing_files, expected_error, expected_warning +): + errors, warnings = clin_class._cross_validate_bed_files_exist_message(missing_files) + assert errors == expected_error + assert warnings == expected_warning + + def test_that__cross_validate_calls_expected_methods(clin_class): with mock.patch.object( Clinical, "_cross_validate_assay_info_has_seq", return_value=("", "") ) as patch__cross_validate_assay, mock.patch.object( Clinical, "_cross_validate_bed_files_exist", return_value=("", "") - ) as patch__cross_validate_bed: + ) as patch__cross_validate_bed, mock.patch.object( + Clinical, "_cross_validate_bed_files_exist_message", return_value=("", "") + ) as patch__cross_validate_bed_msg: clin_class._cross_validate(clinicaldf=pd.DataFrame({"something": [1]})) patch__cross_validate_assay.assert_called_once() patch__cross_validate_bed.assert_called_once() + patch__cross_validate_bed_msg.assert_called_once() def test_that__cross_validate_returns_correct_format_for_errors_warnings(clin_class): with mock.patch.object( Clinical, "_cross_validate_assay_info_has_seq", return_value=("test1", "") ) as patch__cross_validate_assay, mock.patch.object( - Clinical, "_cross_validate_bed_files_exist", return_value=("test3\n", "") - ) as patch__cross_validate_bed: + Clinical, "_cross_validate_bed_files_exist", return_value=["something_missing"] + ) as patch__cross_validate_bed, mock.patch.object( + Clinical, + "_cross_validate_bed_files_exist_message", + return_value=("test3\n", ""), + ) as patch__cross_validate_bed_msg: errors, warnings = clin_class._cross_validate( clinicaldf=pd.DataFrame({"something": [1]}) ) diff --git a/tests/test_database_to_staging.py b/tests/test_database_to_staging.py index 4f2acede..01e8f4a0 100644 --- a/tests/test_database_to_staging.py +++ b/tests/test_database_to_staging.py @@ -1,4 +1,5 @@ """Tests database to staging functions""" + import os from unittest import mock from unittest.mock import patch diff --git a/tests/test_extract.py b/tests/test_extract.py index 5b94a109..2d8c5ac2 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -1,4 +1,5 @@ """Test genie.extract module""" + from unittest.mock import patch import pandas as pd diff --git a/tests/test_filters.py b/tests/test_filters.py index e0191c17..e9af50d2 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -1,6 +1,7 @@ """ Test GENIE filters """ + import datetime import os import sys diff --git a/tests/test_input_to_database.py b/tests/test_input_to_database.py index bd3bbf9f..3d52d4f6 100644 --- a/tests/test_input_to_database.py +++ b/tests/test_input_to_database.py @@ -668,7 +668,7 @@ def asDataFrame(self): class TestValidation: - def setup(self): + def setup_method(self): valid = [ [ sample_clinical_entity.id, diff --git a/tests/test_maf.py b/tests/test_maf.py index 71d61e64..b86b202b 100644 --- a/tests/test_maf.py +++ b/tests/test_maf.py @@ -3,10 +3,12 @@ import pandas as pd import pytest -from genie import process_functions, validate +from genie import process_functions, transform, validate import genie_registry.maf from genie_registry.maf import maf +OPEN_BUILTIN = "builtins.open" + @pytest.fixture def maf_class(syn): @@ -38,7 +40,7 @@ def valid_maf_df(): T_REF_COUNT=[1, 2, 3, 4, 3], N_DEPTH=[1, 2, 3, float("nan"), 3], N_REF_COUNT=[1, 2, 
3, 4, 3], - N_ALT_COUNT=[1, 2, 3, 4, 3], + N_ALT_COUNT=[1, None, 3, 4, 3], TUMOR_SEQ_ALLELE2=["A", "A", "A", "A", "A"], ) ) @@ -428,22 +430,70 @@ def test_that__cross_validate_returns_expected_msg_if_valid( ids=["no_pound_sign", "pound_sign"], ) def test_that__get_dataframe_returns_expected_result(maf_class, test_input, expected): - with patch("builtins.open", mock_open(read_data=test_input)): + with patch(OPEN_BUILTIN, mock_open(read_data=test_input)): test = maf_class._get_dataframe(["some_path"]) pd.testing.assert_frame_equal(test, expected) -def test_that__get_dataframe_throws_value_error(maf_class): +def test_that__get_dataframe_reads_in_correct_nas(maf_class): file = ( - "#Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome\n" - "TEST 3845 TEST GRCh37 12" + "Hugo_Symbol\tEntrez_Gene_Id\tReference_Allele\n" + "TEST\t3845\tNA\n" + "TEST\tNA\tnan\n" + "TEST\t3846\tN/A\n" + "NA\tnan\tNaN" ) - with patch("builtins.open", mock_open(read_data=file)): - with pytest.raises( - ValueError, - match="Number of fields in a line do not match the expected number of columns", - ): - maf_class._get_dataframe(["some_path"]) + with patch(OPEN_BUILTIN, mock_open(read_data=file)): + expected = pd.DataFrame( + { + "Hugo_Symbol": ["TEST", "TEST", "TEST", None], + "Entrez_Gene_Id": ["3845", None, "3846", None], + "Reference_Allele": ["NA", "nan", None, "NaN"], + } + ) + maf_df = maf_class._get_dataframe(["some_path"]) + pd.testing.assert_frame_equal(maf_df, expected) + + +@pytest.mark.parametrize( + "input,expected_columns", + [ + ( + pd.DataFrame( + { + "Hugo_Symbol": ["TEST"], + "Entrez_Gene_Id": ["3845"], + "RefErence_Allele": ["NA"], + } + ), + ["Hugo_Symbol", "Entrez_Gene_Id"], + ), + ( + pd.DataFrame( + { + "#CHROM": ["TEST"], + "ALT": ["3845"], + "Reference_a": ["NA"], + } + ), + ["#CHROM", "ALT", "Reference_a"], + ), + ], + ids=["with_allele_col", "no_allele_col"], +) +def test_that__get_dataframe_uses_correct_columns_to_replace( + maf_class, input, expected_columns +): + file = "Hugo_Symbol\tEntrez_Gene_Id\tReference_Allele\n" "TEST\t3845\tNA" + with patch(OPEN_BUILTIN, mock_open(read_data=file)), patch.object( + pd, "read_csv", return_value=input + ), patch.object(transform, "_convert_values_to_na") as patch_convert_to_na: + maf_class._get_dataframe(["some_path"]) + patch_convert_to_na.assert_called_once_with( + input_df=input, + values_to_replace=["NA", "nan", "NaN"], + columns_to_convert=expected_columns, + ) @pytest.mark.parametrize( diff --git a/tests/test_process_functions.py b/tests/test_process_functions.py index a0365e54..e4a95d56 100644 --- a/tests/test_process_functions.py +++ b/tests/test_process_functions.py @@ -2,6 +2,13 @@ import uuid import pandas as pd +from pandas.api.types import ( + is_bool_dtype, + is_float_dtype, + is_integer_dtype, + is_string_dtype, +) +from pandas.testing import assert_frame_equal import pytest import synapseclient @@ -500,3 +507,180 @@ def test_that_func_returns_correct_error_warning_if_input_col_has_na_and_nas_is_ ) assert error == "" assert warning == "" + + +def get_create_missing_columns_test_cases(): + return [ + { + "name": "str_no_na", + "test_input": pd.DataFrame({"col1": ["str1", "str2", ""]}), + "test_schema": {"col1": "string"}, + "expected_output": pd.DataFrame({"col1": ["str1", "str2", ""]}), + "expected_dtype": is_string_dtype, + "expected_na_count": 0, + }, + { + "name": "str_na", + "test_input": pd.DataFrame({"col1": ["str1", "str2", ""]}), + "test_schema": {"col2": "string"}, + "expected_output": pd.DataFrame({"col2": ["", "", ""]}), + 
"expected_dtype": is_string_dtype, + "expected_na_count": 0, + }, + { + "name": "float_na", + "test_input": pd.DataFrame({"col1": ["str1", "str2", ""]}), + "test_schema": {"col2": "float"}, + "expected_output": pd.DataFrame( + {"col2": [float("nan"), float("nan"), float("nan")]} + ), + "expected_dtype": is_float_dtype, + "expected_na_count": 3, + }, + { + "name": "float_no_na", + "test_input": pd.DataFrame({"col1": [1.0, 2.0, float("nan")]}), + "test_schema": {"col1": "float"}, + "expected_output": pd.DataFrame({"col1": [1.0, 2.0, float("nan")]}), + "expected_dtype": is_float_dtype, + "expected_na_count": 1, + }, + { + "name": "int_na", + "test_input": pd.DataFrame({"col1": [2, 3, 4]}), + "test_schema": {"col2": "integer"}, + "expected_output": pd.DataFrame( + {"col2": [None, None, None]}, dtype=pd.Int64Dtype() + ), + "expected_dtype": is_integer_dtype, + "expected_na_count": 3, + }, + { + "name": "int_no_na", + "test_input": pd.DataFrame({"col1": [2, 3, 4]}), + "test_schema": {"col1": "integer"}, + "expected_output": pd.DataFrame({"col1": [2, 3, 4]}, dtype=pd.Int64Dtype()), + "expected_dtype": is_integer_dtype, + "expected_na_count": 0, + }, + { + "name": "bool_na", + "test_input": pd.DataFrame({"col1": [True, False, None]}), + "test_schema": {"col2": "boolean"}, + "expected_output": pd.DataFrame( + {"col2": [None, None, None]}, dtype=pd.BooleanDtype() + ), + "expected_dtype": is_bool_dtype, + "expected_na_count": 3, + }, + { + "name": "bool_no_na", + "test_input": pd.DataFrame({"col1": [True, False, None]}), + "test_schema": {"col1": "boolean"}, + "expected_output": pd.DataFrame( + {"col1": [True, False, None]}, dtype=pd.BooleanDtype() + ), + "expected_dtype": is_bool_dtype, + "expected_na_count": 1, + }, + { + "name": "empty_col", + "test_input": pd.DataFrame({"col1": []}), + "test_schema": {"col2": "string"}, + "expected_output": pd.DataFrame({"col2": []}, dtype=str), + "expected_dtype": is_string_dtype, + "expected_na_count": 0, + }, + { + "name": "empty_df", + "test_input": pd.DataFrame({}), + "test_schema": {"col1": "float"}, + "expected_output": pd.DataFrame({"col1": []}, index=[]), + "expected_dtype": is_float_dtype, + "expected_na_count": 0, + }, + { + "name": "empty_col_int", + "test_input": pd.DataFrame({"col1": []}), + "test_schema": {"col2": "integer"}, + "expected_output": pd.DataFrame({"col2": []}, dtype=pd.Int64Dtype()), + "expected_dtype": is_integer_dtype, + "expected_na_count": 0, + }, + { + "name": "empty_df_int", + "test_input": pd.DataFrame({"col1": []}), + "test_schema": {"col2": "integer"}, + "expected_output": pd.DataFrame({"col2": []}, dtype=pd.Int64Dtype()), + "expected_dtype": is_integer_dtype, + "expected_na_count": 0, + }, + ] + + +@pytest.mark.parametrize( + "test_cases", + get_create_missing_columns_test_cases(), + ids=lambda x: x["name"], +) +def test_that_create_missing_columns_gets_expected_output_with_single_col_df( + test_cases, +): + result = process_functions.create_missing_columns( + dataset=test_cases["test_input"], schema=test_cases["test_schema"] + ) + assert_frame_equal(result, test_cases["expected_output"], check_exact=True) + assert test_cases["expected_dtype"](result.iloc[:, 0]) + assert result.isna().sum().sum() == test_cases["expected_na_count"] + + +def test_that_create_missing_columns_returns_expected_output_with_multi_col_df(): + test_input = pd.DataFrame( + { + "col2": ["str1", "str2", "str3"], + "col1": [2, 3, 4], + "col3": [2.0, 3.0, float("nan")], + "col7": [True, False, None], + } + ) + test_schema = { + "col1": "integer", + "col2": 
"string", + "col3": "float", + "col4": "integer", + "col5": "string", + "col6": "float", + "col7": "boolean", + "col8": "boolean", + } + result = process_functions.create_missing_columns( + dataset=test_input, schema=test_schema + ) + expected_output = pd.DataFrame( + { + "col1": [2, 3, 4], + "col2": ["str1", "str2", "str3"], + "col3": [2.0, 3.0, float("nan")], + "col4": [None, None, None], + "col5": ["", "", ""], + "col6": [float("nan"), float("nan"), float("nan")], + "col7": [True, False, None], + "col8": [None, None, None], + } + ) + expected_output["col1"] = expected_output["col1"].astype("Int64") + expected_output["col4"] = expected_output["col4"].astype("Int64") + expected_output["col7"] = expected_output["col7"].astype(pd.BooleanDtype()) + expected_output["col8"] = expected_output["col8"].astype(pd.BooleanDtype()) + + assert result["col1"].dtype == pd.Int64Dtype() + assert is_string_dtype(result["col2"]) + assert is_float_dtype(result["col3"]) + assert result["col4"].dtype == pd.Int64Dtype() + assert is_string_dtype(result["col5"]) + assert is_float_dtype(result["col6"]) + assert result["col7"].dtype == pd.BooleanDtype() + assert result["col8"].dtype == pd.BooleanDtype() + assert result.isna().sum().sum() == 11 + + assert_frame_equal(result, expected_output, check_exact=True) diff --git a/tests/test_process_mutation.py b/tests/test_process_mutation.py index 1e945e10..b24e6209 100644 --- a/tests/test_process_mutation.py +++ b/tests/test_process_mutation.py @@ -1,12 +1,18 @@ """Test process mutation functions""" + +import os +from collections import namedtuple +from pandas.testing import assert_frame_equal import shutil import subprocess import tempfile from unittest.mock import patch, call import pandas as pd +import pytest +import synapseclient -from genie import load, process_mutation +from genie import extract, load, process_mutation def test_format_maf(): @@ -35,7 +41,7 @@ def test_format_maf(): class TestDtype: - def setup(self): + def setup_method(self): self.testdf = pd.DataFrame({"foo": [1], "bar": ["baz"]}) self.column_types = {"foo": "int64", "bar": "object"} self.mutation_path = "/path/test.csv" @@ -93,7 +99,32 @@ def test_move_mutation_maf(self): patch_move.assert_called_once_with(self.mutation_path, self.input_dir) -def test_process_mutation_workflow(syn, genie_config): +@pytest.fixture +def annotation_paths(): + Filepaths = namedtuple( + "Filepaths", + [ + "input_files_dir", + "output_files_dir", + "error_dir", + "merged_maf_path", + "narrow_maf_path", + "full_maf_path", + "full_error_report_path", + ], + ) + yield Filepaths( + input_files_dir="input/dir", + output_files_dir="input/dir", + error_dir="input/dir/SAGE_error_reports", + merged_maf_path="input/dir/data_mutations_extended_SAGE.txt", + narrow_maf_path="input/SAGE/staging/data_mutations_extended_SAGE_MAF_narrow.txt", + full_maf_path="input/SAGE/staging/data_mutations_extended_SAGE.txt", + full_error_report_path="input/SAGE/staging/failed_annotations_report.txt", + ) + + +def test_process_mutation_workflow(syn, genie_config, annotation_paths): """Integration test to make sure workflow runs""" validfiles = pd.DataFrame( {"fileType": ["vcf", "maf"], "path": ["path/to/vcf", "path/to/maf"]} @@ -104,78 +135,215 @@ def test_process_mutation_workflow(syn, genie_config): "syn22053204", ifcollision="overwrite.local", downloadLocation=genie_annotation_pkg, + # version=1, # TODO: This should pull from a config file in the future ), call( "syn22084320", ifcollision="overwrite.local", downloadLocation=genie_annotation_pkg, + 
# version=13, # TODO: This should pull from a config file in the future ), ] center = "SAGE" workdir = "working/dir/path" - maf_path = "path/to/maf" - with patch.object(syn, "get") as patch_synget, patch.object( - process_mutation, "annotate_mutation", return_value=maf_path + maf_table_id = "syn22493903" + sample_error_report = pd.DataFrame({"col1": [1, 2, 3], "col2": [2, 3, 4]}) + with patch.object( + process_mutation, "create_annotation_paths", return_value=annotation_paths + ) as patch_annotation_paths, patch.object(syn, "get") as patch_synget, patch.object( + process_mutation, "annotate_mutation" ) as patch_annotation, patch.object( process_mutation, "split_and_store_maf" - ) as patch_split: + ) as patch_split, patch.object( + process_mutation, + "concat_annotation_error_reports", + return_value=sample_error_report, + ) as patch_concat_error, patch.object( + process_mutation, "check_annotation_error_reports" + ) as patch_check_error, patch.object( + process_mutation, "store_annotation_error_reports" + ) as patch_store_error: maf = process_mutation.process_mutation_workflow( syn, center, validfiles, genie_config, workdir ) + patch_annotation_paths.assert_called_once_with(center=center, workdir=workdir) patch_synget.assert_has_calls(syn_get_calls) patch_annotation.assert_called_once_with( + annotation_paths=annotation_paths, center=center, mutation_files=["path/to/vcf", "path/to/maf"], genie_annotation_pkg=genie_annotation_pkg, - workdir=workdir, ) patch_split.assert_called_once_with( syn=syn, center=center, - maf_tableid="syn22493903", - annotated_maf_path=maf_path, + maf_tableid=maf_table_id, + annotation_paths=annotation_paths, flatfiles_synid="syn12279903", + ) + patch_concat_error.assert_called_once_with( + center=center, + input_dir=annotation_paths.error_dir, + ) + patch_check_error.assert_called_once_with( + syn=syn, + maf_table_synid=maf_table_id, + full_error_report=sample_error_report, + center=center, + ) + patch_store_error.assert_called_once_with( + full_error_report=sample_error_report, + full_error_report_path=annotation_paths.full_error_report_path, + syn=syn, + errors_folder_synid="syn53239079", + ) + assert maf == annotation_paths.merged_maf_path + + +def test_that_create_annotation_paths_returns_expected_paths(annotation_paths): + center = "SAGE" + input_dir = "input/dir" + workdir = "test/dir" + + with patch.object(tempfile, "mkdtemp", return_value=input_dir) as patch_mktemp: + test_paths = process_mutation.create_annotation_paths( + center=center, workdir=workdir, ) - assert maf == maf_path + mktemp_calls = [call(dir=workdir)] * 2 + patch_mktemp.assert_has_calls(mktemp_calls) + assert test_paths.input_files_dir == annotation_paths.input_files_dir + assert test_paths.output_files_dir == annotation_paths.output_files_dir + assert test_paths.error_dir == annotation_paths.error_dir + assert test_paths.merged_maf_path == annotation_paths.merged_maf_path + + +class TestAnnotationErrorReports: + @classmethod + def setup_class(cls): + cls.source_dir = "source_test_directory" + os.makedirs(cls.source_dir, exist_ok=True) + # Create sample files in the source directory + with open(os.path.join(cls.source_dir, "file1.tsv"), "w") as f1: + f1.write("col1\tcol2\tcol3\nval1\tval2\tval3\n") + with open(os.path.join(cls.source_dir, "file2.tsv"), "w") as f2: + f2.write("col1\tcol2\tcol3\nval4\tval5\tval6\n") -def test_annotate_mutation(): + @classmethod + def teardown_class(cls): + shutil.rmtree(cls.source_dir) + + @pytest.fixture + def test_error_report(self): + yield pd.DataFrame( + 
{ + "col1": ["val1", "val4"], + "col2": ["val2", "val5"], + "col3": ["val3", "val6"], + "Center": ["SAGE", "SAGE"], + } + ) + + def test_check_annotation_error_reports_passes_if_match( + self, syn, test_error_report + ): + maf_table_synid = "synZZZZ" + with patch.object( + extract, "get_syntabledf", return_value=test_error_report + ) as patch_get_syntabledf: + process_mutation.check_annotation_error_reports( + syn=syn, + maf_table_synid="synZZZZ", + full_error_report=test_error_report, + center="SAGE", + ) + patch_get_syntabledf.assert_called_once_with( + syn=syn, + query_string=f"SELECT * FROM {maf_table_synid} " + "WHERE Center = 'SAGE' AND " + "Annotation_Status = 'FAILED'", + ) + + def test_check_annotation_error_reports_raises_warning_if_no_match( + self, syn, test_error_report, caplog + ): + with patch.object( + extract, + "get_syntabledf", + return_value=pd.DataFrame({"test": [1], "test2": [2]}), + ): + process_mutation.check_annotation_error_reports( + syn=syn, + maf_table_synid="synZZZZ", + full_error_report=test_error_report, + center="SAGE", + ) + assert ( + "Genome nexus's failed annotations error report rows doesn't match" + "maf table's failed annotations for SAGE" in caplog.text + ) + # check that at least one is a warning + assert any(record.levelname == "WARNING" for record in caplog.records) + + def test_concat_annotation_error_reports_returns_expected(self, test_error_report): + compiled_report = process_mutation.concat_annotation_error_reports( + input_dir="source_test_directory", + center="SAGE", + ) + assert_frame_equal( + compiled_report.sort_values(by="col1").reset_index(drop=True), + test_error_report.sort_values(by="col1").reset_index(drop=True), + ) + + def test_store_annotation_error_reports(self, syn, test_error_report): + errors_folder_synid = "syn11111" + full_error_report_path = "test.tsv" + with patch.object(load, "store_file", return_value=None) as patch_store: + process_mutation.store_annotation_error_reports( + full_error_report=test_error_report, + full_error_report_path=full_error_report_path, + syn=syn, + errors_folder_synid=errors_folder_synid, + ) + patch_store.assert_called_once_with( + syn=syn, + filepath=full_error_report_path, + parentid=errors_folder_synid, + ) + + +def test_annotate_mutation(annotation_paths): """Integration test, test that annotate mutation is called currectly""" center = "SAGE" mutation_files = ["path/to/vcf"] genie_annotation_pkg = "annotation/pkg/path" - workdir = "working/dir/path" - mktemp_calls = [call(dir=workdir)] * 2 - input_dir = "input/dir" - with patch.object( - tempfile, "mkdtemp", return_value=input_dir - ) as patch_mktemp, patch.object( - process_mutation, "move_mutation" - ) as patch_move, patch.object( + + with patch.object(process_mutation, "move_mutation") as patch_move, patch.object( subprocess, "check_call" ) as patch_call: maf_path = process_mutation.annotate_mutation( + annotation_paths=annotation_paths, center=center, mutation_files=mutation_files, genie_annotation_pkg=genie_annotation_pkg, - workdir=workdir, ) - patch_mktemp.assert_has_calls(mktemp_calls) - patch_move.assert_called_once_with("path/to/vcf", input_dir) + patch_move.assert_called_once_with( + "path/to/vcf", annotation_paths.input_files_dir + ) patch_call.assert_called_once_with( [ "bash", "annotation/pkg/path/annotation_suite_wrapper.sh", - f"-i={input_dir}", - f"-o={input_dir}", - f"-m=input/dir/data_mutations_extended_{center}.txt", + f"-i={annotation_paths.input_files_dir}", + f"-o={annotation_paths.input_files_dir}", + 
f"-e={annotation_paths.error_dir}", + f"-m={annotation_paths.merged_maf_path}", f"-c={center}", "-s=WXS", f"-p={genie_annotation_pkg}", ] ) - assert maf_path == f"input/dir/data_mutations_extended_{center}.txt" def test_append_or_createdf_append(): @@ -211,7 +379,7 @@ def test_append_or_createdf_create_file_0size(): patch_tocsv.assert_called_once_with(temp_file.name, sep="\t", index=False) -def test_split_and_store_maf(syn): +def test_split_and_store_maf(syn, annotation_paths): """Integration test, check splitting and storing of maf functions are called""" # getTableColumns @@ -248,21 +416,20 @@ def test_split_and_store_maf(syn): syn=syn, center=center, maf_tableid="sy12345", - annotated_maf_path=annotated_maf_path, + annotation_paths=annotation_paths, flatfiles_synid="syn2345", - workdir="workdir/path", ) patch_getcols.assert_called_once_with("sy12345") patch_readcsv.assert_called_once_with( - annotated_maf_path, sep="\t", chunksize=100000, comment="#" + annotation_paths.merged_maf_path, sep="\t", chunksize=100000, comment="#" ) patch_format.assert_called_once_with(exampledf, center) assert patch_append.call_count == 2 patch_store_table.assert_called_once_with( - syn=syn, filepath=narrow_maf_path, tableid="sy12345" + syn=syn, filepath=annotation_paths.narrow_maf_path, tableid="sy12345" ) patch_store_file.assert_called_once_with( - syn=syn, filepath=full_maf_path, parentid="syn2345" + syn=syn, filepath=annotation_paths.full_maf_path, parentid="syn2345" ) diff --git a/tests/test_transform.py b/tests/test_transform.py index 7e0d1ca7..12142163 100644 --- a/tests/test_transform.py +++ b/tests/test_transform.py @@ -1,4 +1,5 @@ """Test genie.transform module""" + import os from io import BytesIO from unittest import mock @@ -186,3 +187,60 @@ def test_that__convert_df_with_mixed_dtypes_catches_mixed_dtype_exception( ], any_order=False, ) + + +def get__convert_values_to_na_test_cases(): + return [ + { + "name": "single_col", + "input_df": pd.DataFrame({"col1": ["N/A", "SAGE-SAGE-1", "-nan"]}), + "values_to_replace": ["N/A", "-nan"], + "columns_to_convert": ["col1"], + "expected_df": pd.DataFrame({"col1": [None, "SAGE-SAGE-1", None]}), + }, + { + "name": "multi_col", + "input_df": pd.DataFrame( + {"col1": ["N/A", "-nan", "1"], "col2": ["-nan", "N/A", "2"]} + ), + "values_to_replace": ["N/A", "-nan"], + "columns_to_convert": ["col1", "col2"], + "expected_df": pd.DataFrame( + {"col1": [None, None, "1"], "col2": [None, None, "2"]} + ), + }, + { + "name": "empty_cols", + "input_df": pd.DataFrame(columns=["col1", "col2"]), + "values_to_replace": ["NaN", "NA", "nan"], + "columns_to_convert": ["col1", "col2"], + "expected_df": pd.DataFrame(columns=["col1", "col2"]), + }, + { + "name": "empty df", + "input_df": pd.DataFrame(), + "values_to_replace": ["NaN"], + "columns_to_convert": ["col3"], + "expected_df": pd.DataFrame(), + }, + ] + + +@pytest.mark.parametrize( + "test_cases", get__convert_values_to_na_test_cases(), ids=lambda x: x["name"] +) +def test_that__convert_values_to_na_returns_expected(test_cases): + result = transform._convert_values_to_na( + test_cases["input_df"], + test_cases["values_to_replace"], + test_cases["columns_to_convert"], + ) + pd.testing.assert_frame_equal(result, test_cases["expected_df"]) + + +def test_that__convert_values_to_na_throws_key_error_if_nonexistent_cols(): + input_df = pd.DataFrame({"col1": ["N/A", "SAGE-SAGE-1", "-nan"]}) + values_to_replace = "NaN" + columns_to_convert = "col3" + with pytest.raises(KeyError): + transform._convert_values_to_na(input_df, 
values_to_replace, columns_to_convert) diff --git a/tests/test_validate.py b/tests/test_validate.py index 4e16cfee..b50b0e7e 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -1,4 +1,5 @@ """Tests validate.py""" + from unittest.mock import Mock, patch import pandas as pd @@ -162,7 +163,8 @@ def test_filetype_validate_single_file(syn): "----------------ERRORS----------------\n" "Your filename is incorrect! Please change your " "filename before you run the validator or specify " - "--filetype if you are running the validator locally" + "--filetype if you are running the validator locally. " + "If specifying filetype, options are: [wrong]\n" ) with patch.object(FileFormat, "validateFilename", side_effect=AssertionError): @@ -185,7 +187,8 @@ def test_wrongfiletype_validate_single_file(syn): "----------------ERRORS----------------\n" "Your filename is incorrect! Please change your " "filename before you run the validator or specify " - "--filetype if you are running the validator locally" + "--filetype if you are running the validator locally. " + "If specifying filetype, options are: [wrong]\n" ) with patch.object( @@ -455,7 +458,7 @@ def test_perform_validate(syn, genie_config): validate._perform_validate(syn, arg) patch_check_parentid.assert_called_once_with(syn=syn, parentid=arg.parentid) patch_get_config.assert_called_once_with(syn=syn, project_id=arg.project_id) - patch_check_center.assert_called_once_with(arg.center, ["SAGE", "TEST"]) + patch_check_center.assert_called_once_with(arg.center, ["SAGE", "TEST", "GOLD"]) patch_get_onco.assert_called_once() patch_validate.assert_called_once_with( nosymbol_check=arg.nosymbol_check, project_id=arg.project_id diff --git a/tests/test_vcf.py b/tests/test_vcf.py index d8bbabf5..f508d778 100644 --- a/tests/test_vcf.py +++ b/tests/test_vcf.py @@ -1,8 +1,12 @@ import pandas as pd import pytest +from unittest.mock import mock_open, patch +from genie import transform from genie_registry.vcf import vcf +OPEN_BUILTIN = "builtins.open" + @pytest.fixture def vcf_class(syn): @@ -251,3 +255,73 @@ def test_validation_more_than_11_cols(vcf_class): "Only single sample or matched tumor normal vcf files are accepted.\n" ) assert warning == "" + + +def test_that__get_dataframe_throws_value_error_if_no_headers(vcf_class): + file = "CHROM\tALT\tREF\n" "TEST\t3845\tNA" + with patch(OPEN_BUILTIN, mock_open(read_data=file)): + with pytest.raises( + ValueError, match="Your vcf must start with the header #CHROM" + ): + vcf_class._get_dataframe(["some_path"]) + + +def test_that__get_dataframe_reads_in_correct_nas(vcf_class): + file = ( + "#CHROM\tALT\tREF\n" + "TEST\t3845\tNA\n" + "TEST\tNA\tnan\n" + "TEST\t3846\tN/A\n" + "NA\tnan\tNaN" + ) + with patch(OPEN_BUILTIN, mock_open(read_data=file)): + expected = pd.DataFrame( + { + "#CHROM": ["TEST", "TEST", "TEST", None], + "ALT": ["3845", None, "3846", None], + "REF": ["NA", "nan", None, "NaN"], + } + ) + vcf_df = vcf_class._get_dataframe(["some_path"]) + pd.testing.assert_frame_equal(vcf_df, expected) + + +@pytest.mark.parametrize( + "input,expected_columns", + [ + ( + pd.DataFrame( + { + "#CHROM": ["TEST"], + "ALT": ["3845"], + "REF": ["NA"], + } + ), + ["#CHROM", "ALT"], + ), + ( + pd.DataFrame( + { + "#CHROM": ["TEST"], + "ALT": ["3845"], + "rEf": ["NA"], + } + ), + ["#CHROM", "ALT", "rEf"], + ), + ], + ids=["with_allele_col", "no_allele_col"], +) +def test_that__get_dataframe_uses_correct_columns_to_replace( + vcf_class, input, expected_columns +): + file = "#CHROM\tALT\tref\n" "TEST\t3845\tNA" + with 
patch(OPEN_BUILTIN, mock_open(read_data=file)), patch.object( + pd, "read_csv", return_value=input + ), patch.object(transform, "_convert_values_to_na") as patch_convert_to_na: + vcf_class._get_dataframe(["some_path"]) + patch_convert_to_na.assert_called_once_with( + input_df=input, + values_to_replace=["NA", "nan", "NaN"], + columns_to_convert=expected_columns, + ) diff --git a/tests/test_write_invalid_reasons.py b/tests/test_write_invalid_reasons.py index c49a5e17..31a86b0b 100644 --- a/tests/test_write_invalid_reasons.py +++ b/tests/test_write_invalid_reasons.py @@ -1,4 +1,5 @@ """Test write invalid reasons module""" + from unittest import mock from unittest.mock import patch