diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 264b0937..e563dc88 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -35,6 +35,73 @@ SV_CENTER_PATH = os.path.join(GENIE_RELEASE_DIR, "data_sv_%s.txt") BED_DIFFS_SEQASSAY_PATH = os.path.join(GENIE_RELEASE_DIR, "diff_%s.csv") +FULL_MAF_RELEASE_COLUMNS = [ + "Hugo_Symbol", + "Entrez_Gene_Id", + "Center", + "NCBI_Build", + "Chromosome", + "Start_Position", + "End_Position", + "Strand", + "Consequence", + "Variant_Classification", + "Variant_Type", + "Reference_Allele", + "Tumor_Seq_Allele1", + "Tumor_Seq_Allele2", + "dbSNP_RS", + "dbSNP_Val_Status", + "Tumor_Sample_Barcode", + "Matched_Norm_Sample_Barcode", + "Match_Norm_Seq_Allele1", + "Match_Norm_Seq_Allele2", + "Tumor_Validation_Allele1", + "Tumor_Validation_Allele2", + "Match_Norm_Validation_Allele1", + "Match_Norm_Validation_Allele2", + "Verification_Status", + "Validation_Status", + "Mutation_Status", + "Sequencing_Phase", + "Sequence_Source", + "Validation_Method", + "Score", + "BAM_File", + "Sequencer", + "t_ref_count", + "t_alt_count", + "n_ref_count", + "n_alt_count", + "HGVSc", + "HGVSp", + "HGVSp_Short", + "Transcript_ID", + "RefSeq", + "Protein_position", + "Codons", + "Exon_Number", + "gnomAD_AF", + "gnomAD_AFR_AF", + "gnomAD_AMR_AF", + "gnomAD_ASJ_AF", + "gnomAD_EAS_AF", + "gnomAD_FIN_AF", + "gnomAD_NFE_AF", + "gnomAD_OTH_AF", + "gnomAD_SAS_AF", + "FILTER", + "Polyphen_Prediction", + "Polyphen_Score", + "SIFT_Prediction", + "SIFT_Score", + "SWISSPROT", + "n_depth", + "t_depth", + "Annotation_Status", + "mutationInCis_Flag", +] + # TODO: Add to transform.py def _to_redact_interval(df_col): @@ -755,10 +822,7 @@ def store_maf_files( with open(MUTATIONS_CENTER_PATH % center, "w"): pass used_entities = [] - # Must get the headers (because can't assume headers are the same order) maf_ent = syn.get(centerMafSynIdsDf.id[0]) - headerdf = pd.read_csv(maf_ent.path, sep="\t", comment="#", nrows=0) - 
column_order = headerdf.columns for _, mafSynId in enumerate(centerMafSynIdsDf.id): maf_ent = syn.get(mafSynId) logger.info(maf_ent.path) @@ -771,13 +835,12 @@ def store_maf_files( ) for mafchunk in mafchunks: - # Reorder column headers - mafchunk = mafchunk[column_order] # Get center for center staging maf # Configure maf configured_mafdf = configure_maf( mafchunk, remove_mafinbed_variants, flagged_mutationInCis_variants ) + configured_mafdf = configured_mafdf[FULL_MAF_RELEASE_COLUMNS] # Create maf for release merged_mafdf = remove_maf_samples( configured_mafdf, keep_for_merged_consortium_samples diff --git a/genie/process_functions.py b/genie/process_functions.py index e45d3c90..3f0fc3e4 100644 --- a/genie/process_functions.py +++ b/genie/process_functions.py @@ -948,3 +948,35 @@ def create_new_fileformat_table( "newdb_mappingdf": newdb_mappingdf, "moved_ent": moved_ent, } + + +def create_missing_columns(dataset: pd.DataFrame, schema: dict) -> pd.DataFrame: + """Creates and fills missing columns with the relevant NA value for the + given data type.
Note that special handling had to occur for + allowing NAs in integer based columns in pandas by converting + the integer column into the Int64 (pandas nullable integer data type) + + Args: + dataset (pd.DataFrame): input dataset to fill missing columns for + schema (dict): the expected schema {column_name(str): data_type(str)} + for the input dataset + + Returns: + pd.DataFrame: updated dataset + """ + missing_values = { + "string": "", + "integer": None, + "float": float("nan"), + "boolean": None, + } + for column, data_type in schema.items(): + if column not in dataset.columns: + dataset = dataset.assign(**{column: missing_values[data_type]}) + + # only way to preserve NAs for these specific dtype columns + if data_type == "integer": + dataset[column] = dataset[column].astype("Int64") + elif data_type == "boolean": + dataset[column] = dataset[column].astype(pd.BooleanDtype()) + return dataset[list(schema.keys())] diff --git a/tests/test_process_functions.py b/tests/test_process_functions.py index a0365e54..e4a95d56 100644 --- a/tests/test_process_functions.py +++ b/tests/test_process_functions.py @@ -2,6 +2,13 @@ import uuid import pandas as pd +from pandas.api.types import ( + is_bool_dtype, + is_float_dtype, + is_integer_dtype, + is_string_dtype, +) +from pandas.testing import assert_frame_equal import pytest import synapseclient @@ -500,3 +507,180 @@ def test_that_func_returns_correct_error_warning_if_input_col_has_na_and_nas_is_ ) assert error == "" assert warning == "" + + +def get_create_missing_columns_test_cases(): + return [ + { + "name": "str_no_na", + "test_input": pd.DataFrame({"col1": ["str1", "str2", ""]}), + "test_schema": {"col1": "string"}, + "expected_output": pd.DataFrame({"col1": ["str1", "str2", ""]}), + "expected_dtype": is_string_dtype, + "expected_na_count": 0, + }, + { + "name": "str_na", + "test_input": pd.DataFrame({"col1": ["str1", "str2", ""]}), + "test_schema": {"col2": "string"}, + "expected_output": pd.DataFrame({"col2": ["", "",
""]}), + "expected_dtype": is_string_dtype, + "expected_na_count": 0, + }, + { + "name": "float_na", + "test_input": pd.DataFrame({"col1": ["str1", "str2", ""]}), + "test_schema": {"col2": "float"}, + "expected_output": pd.DataFrame( + {"col2": [float("nan"), float("nan"), float("nan")]} + ), + "expected_dtype": is_float_dtype, + "expected_na_count": 3, + }, + { + "name": "float_no_na", + "test_input": pd.DataFrame({"col1": [1.0, 2.0, float("nan")]}), + "test_schema": {"col1": "float"}, + "expected_output": pd.DataFrame({"col1": [1.0, 2.0, float("nan")]}), + "expected_dtype": is_float_dtype, + "expected_na_count": 1, + }, + { + "name": "int_na", + "test_input": pd.DataFrame({"col1": [2, 3, 4]}), + "test_schema": {"col2": "integer"}, + "expected_output": pd.DataFrame( + {"col2": [None, None, None]}, dtype=pd.Int64Dtype() + ), + "expected_dtype": is_integer_dtype, + "expected_na_count": 3, + }, + { + "name": "int_no_na", + "test_input": pd.DataFrame({"col1": [2, 3, 4]}), + "test_schema": {"col1": "integer"}, + "expected_output": pd.DataFrame({"col1": [2, 3, 4]}, dtype=pd.Int64Dtype()), + "expected_dtype": is_integer_dtype, + "expected_na_count": 0, + }, + { + "name": "bool_na", + "test_input": pd.DataFrame({"col1": [True, False, None]}), + "test_schema": {"col2": "boolean"}, + "expected_output": pd.DataFrame( + {"col2": [None, None, None]}, dtype=pd.BooleanDtype() + ), + "expected_dtype": is_bool_dtype, + "expected_na_count": 3, + }, + { + "name": "bool_no_na", + "test_input": pd.DataFrame({"col1": [True, False, None]}), + "test_schema": {"col1": "boolean"}, + "expected_output": pd.DataFrame( + {"col1": [True, False, None]}, dtype=pd.BooleanDtype() + ), + "expected_dtype": is_bool_dtype, + "expected_na_count": 1, + }, + { + "name": "empty_col", + "test_input": pd.DataFrame({"col1": []}), + "test_schema": {"col2": "string"}, + "expected_output": pd.DataFrame({"col2": []}, dtype=str), + "expected_dtype": is_string_dtype, + "expected_na_count": 0, + }, + { + "name": 
"empty_df", + "test_input": pd.DataFrame({}), + "test_schema": {"col1": "float"}, + "expected_output": pd.DataFrame({"col1": []}, index=[]), + "expected_dtype": is_float_dtype, + "expected_na_count": 0, + }, + { + "name": "empty_col_int", + "test_input": pd.DataFrame({"col1": []}), + "test_schema": {"col2": "integer"}, + "expected_output": pd.DataFrame({"col2": []}, dtype=pd.Int64Dtype()), + "expected_dtype": is_integer_dtype, + "expected_na_count": 0, + }, + { + "name": "empty_df_int", + "test_input": pd.DataFrame({"col1": []}), + "test_schema": {"col2": "integer"}, + "expected_output": pd.DataFrame({"col2": []}, dtype=pd.Int64Dtype()), + "expected_dtype": is_integer_dtype, + "expected_na_count": 0, + }, + ] + + +@pytest.mark.parametrize( + "test_cases", + get_create_missing_columns_test_cases(), + ids=lambda x: x["name"], +) +def test_that_create_missing_columns_gets_expected_output_with_single_col_df( + test_cases, +): + result = process_functions.create_missing_columns( + dataset=test_cases["test_input"], schema=test_cases["test_schema"] + ) + assert_frame_equal(result, test_cases["expected_output"], check_exact=True) + assert test_cases["expected_dtype"](result.iloc[:, 0]) + assert result.isna().sum().sum() == test_cases["expected_na_count"] + + +def test_that_create_missing_columns_returns_expected_output_with_multi_col_df(): + test_input = pd.DataFrame( + { + "col2": ["str1", "str2", "str3"], + "col1": [2, 3, 4], + "col3": [2.0, 3.0, float("nan")], + "col7": [True, False, None], + } + ) + test_schema = { + "col1": "integer", + "col2": "string", + "col3": "float", + "col4": "integer", + "col5": "string", + "col6": "float", + "col7": "boolean", + "col8": "boolean", + } + result = process_functions.create_missing_columns( + dataset=test_input, schema=test_schema + ) + expected_output = pd.DataFrame( + { + "col1": [2, 3, 4], + "col2": ["str1", "str2", "str3"], + "col3": [2.0, 3.0, float("nan")], + "col4": [None, None, None], + "col5": ["", "", ""], + "col6": 
[float("nan"), float("nan"), float("nan")], + "col7": [True, False, None], + "col8": [None, None, None], + } + ) + expected_output["col1"] = expected_output["col1"].astype("Int64") + expected_output["col4"] = expected_output["col4"].astype("Int64") + expected_output["col7"] = expected_output["col7"].astype(pd.BooleanDtype()) + expected_output["col8"] = expected_output["col8"].astype(pd.BooleanDtype()) + + assert result["col1"].dtype == pd.Int64Dtype() + assert is_string_dtype(result["col2"]) + assert is_float_dtype(result["col3"]) + assert result["col4"].dtype == pd.Int64Dtype() + assert is_string_dtype(result["col5"]) + assert is_float_dtype(result["col6"]) + assert result["col7"].dtype == pd.BooleanDtype() + assert result["col8"].dtype == pd.BooleanDtype() + assert result.isna().sum().sum() == 11 + + assert_frame_equal(result, expected_output, check_exact=True)