Skip to content

Commit

Permalink
Merge pull request #548 from Sage-Bionetworks/gen-1076-exclude-new-gn-var
Browse files Browse the repository at this point in the history

[GEN-1076] Exclude genomic_location_explanation from release
  • Loading branch information
rxu17 authored Jan 25, 2024
2 parents 4f9be49 + 116a709 commit 889698d
Show file tree
Hide file tree
Showing 3 changed files with 284 additions and 5 deletions.
73 changes: 68 additions & 5 deletions genie/database_to_staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,73 @@
SV_CENTER_PATH = os.path.join(GENIE_RELEASE_DIR, "data_sv_%s.txt")
BED_DIFFS_SEQASSAY_PATH = os.path.join(GENIE_RELEASE_DIR, "diff_%s.csv")

# Exact set AND order of columns written to the released mutation (MAF) files.
# The release pipeline subsets each configured MAF chunk to this list, so any
# annotation column not named here is excluded from the release output.
# NOTE(review): per this change's intent, genomic_location_explanation is
# deliberately absent from this list — do not add it back without review.
# NOTE(review): the ordering here presumably defines the release file's column
# order; confirm downstream consumers before reordering.
FULL_MAF_RELEASE_COLUMNS = [
    "Hugo_Symbol",
    "Entrez_Gene_Id",
    "Center",
    "NCBI_Build",
    "Chromosome",
    "Start_Position",
    "End_Position",
    "Strand",
    "Consequence",
    "Variant_Classification",
    "Variant_Type",
    "Reference_Allele",
    "Tumor_Seq_Allele1",
    "Tumor_Seq_Allele2",
    "dbSNP_RS",
    "dbSNP_Val_Status",
    "Tumor_Sample_Barcode",
    "Matched_Norm_Sample_Barcode",
    "Match_Norm_Seq_Allele1",
    "Match_Norm_Seq_Allele2",
    "Tumor_Validation_Allele1",
    "Tumor_Validation_Allele2",
    "Match_Norm_Validation_Allele1",
    "Match_Norm_Validation_Allele2",
    "Verification_Status",
    "Validation_Status",
    "Mutation_Status",
    "Sequencing_Phase",
    "Sequence_Source",
    "Validation_Method",
    "Score",
    "BAM_File",
    "Sequencer",
    "t_ref_count",
    "t_alt_count",
    "n_ref_count",
    "n_alt_count",
    "HGVSc",
    "HGVSp",
    "HGVSp_Short",
    "Transcript_ID",
    "RefSeq",
    "Protein_position",
    "Codons",
    "Exon_Number",
    "gnomAD_AF",
    "gnomAD_AFR_AF",
    "gnomAD_AMR_AF",
    "gnomAD_ASJ_AF",
    "gnomAD_EAS_AF",
    "gnomAD_FIN_AF",
    "gnomAD_NFE_AF",
    "gnomAD_OTH_AF",
    "gnomAD_SAS_AF",
    "FILTER",
    "Polyphen_Prediction",
    "Polyphen_Score",
    "SIFT_Prediction",
    "SIFT_Score",
    "SWISSPROT",
    "n_depth",
    "t_depth",
    "Annotation_Status",
    "mutationInCis_Flag",
]


# TODO: Add to transform.py
def _to_redact_interval(df_col):
Expand Down Expand Up @@ -755,10 +822,7 @@ def store_maf_files(
with open(MUTATIONS_CENTER_PATH % center, "w"):
pass
used_entities = []
# Must get the headers (because can't assume headers are the same order)
maf_ent = syn.get(centerMafSynIdsDf.id[0])
headerdf = pd.read_csv(maf_ent.path, sep="\t", comment="#", nrows=0)
column_order = headerdf.columns
for _, mafSynId in enumerate(centerMafSynIdsDf.id):
maf_ent = syn.get(mafSynId)
logger.info(maf_ent.path)
Expand All @@ -771,13 +835,12 @@ def store_maf_files(
)

for mafchunk in mafchunks:
# Reorder column headers
mafchunk = mafchunk[column_order]
# Get center for center staging maf
# Configure maf
configured_mafdf = configure_maf(
mafchunk, remove_mafinbed_variants, flagged_mutationInCis_variants
)
configured_mafdf = configured_mafdf[FULL_MAF_RELEASE_COLUMNS]
# Create maf for release
merged_mafdf = remove_maf_samples(
configured_mafdf, keep_for_merged_consortium_samples
Expand Down
32 changes: 32 additions & 0 deletions genie/process_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -948,3 +948,35 @@ def create_new_fileformat_table(
"newdb_mappingdf": newdb_mappingdf,
"moved_ent": moved_ent,
}


def create_missing_columns(dataset: pd.DataFrame, schema: dict) -> pd.DataFrame:
    """Creates and fills missing columns with the relevant NA value for the
    given data type, then restricts/reorders the dataset to the schema's
    columns. Note that special handling had to occur for allowing NAs in
    integer based columns in pandas by converting the integer column into
    Int64 (pandas nullable integer data type); booleans are likewise cast to
    the nullable ``boolean`` dtype.

    The input dataset is not modified; a copy is returned.

    Args:
        dataset (pd.DataFrame): input dataset to fill missing columns for
        schema (dict): the expected schema {column_name(str): data_type(str)}
            for the input dataset; data_type is one of
            "string", "integer", "float", "boolean"

    Returns:
        pd.DataFrame: dataset with any missing schema columns added,
            restricted and reordered to the schema's columns

    Raises:
        KeyError: if a missing column's data_type is not a supported type
    """
    # NA placeholder per supported data type; strings use "" (not NaN).
    missing_values = {
        "string": "",
        "integer": None,
        "float": float("nan"),
        "boolean": None,
    }
    # Work on a copy so in-place dtype casts below never mutate the caller's
    # DataFrame (the original implementation leaked these casts to callers).
    dataset = dataset.copy()
    for column, data_type in schema.items():
        if column not in dataset.columns:
            dataset = dataset.assign(**{column: missing_values[data_type]})

        # Only way to preserve NAs for these specific dtype columns:
        # cast to the pandas nullable extension dtypes.
        if data_type == "integer":
            dataset[column] = dataset[column].astype("Int64")
        elif data_type == "boolean":
            dataset[column] = dataset[column].astype(pd.BooleanDtype())
    # Subset drops any column not in the schema and fixes the column order.
    return dataset[list(schema.keys())]
184 changes: 184 additions & 0 deletions tests/test_process_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@
import uuid

import pandas as pd
from pandas.api.types import (
is_bool_dtype,
is_float_dtype,
is_integer_dtype,
is_string_dtype,
)
from pandas.testing import assert_frame_equal
import pytest
import synapseclient

Expand Down Expand Up @@ -500,3 +507,180 @@ def test_that_func_returns_correct_error_warning_if_input_col_has_na_and_nas_is_
)
assert error == ""
assert warning == ""


def get_create_missing_columns_test_cases():
    """Scenarios for create_missing_columns: one per dtype / NA combination."""

    def _case(name, test_input, test_schema, expected_output, expected_dtype, expected_na_count):
        # Bundle one scenario into the dict shape the parametrized test expects.
        return {
            "name": name,
            "test_input": test_input,
            "test_schema": test_schema,
            "expected_output": expected_output,
            "expected_dtype": expected_dtype,
            "expected_na_count": expected_na_count,
        }

    nan = float("nan")
    # Each case builds fresh DataFrames so no state is shared between tests.
    return [
        _case(
            "str_no_na",
            pd.DataFrame({"col1": ["str1", "str2", ""]}),
            {"col1": "string"},
            pd.DataFrame({"col1": ["str1", "str2", ""]}),
            is_string_dtype,
            0,
        ),
        _case(
            "str_na",
            pd.DataFrame({"col1": ["str1", "str2", ""]}),
            {"col2": "string"},
            pd.DataFrame({"col2": ["", "", ""]}),
            is_string_dtype,
            0,
        ),
        _case(
            "float_na",
            pd.DataFrame({"col1": ["str1", "str2", ""]}),
            {"col2": "float"},
            pd.DataFrame({"col2": [nan, nan, nan]}),
            is_float_dtype,
            3,
        ),
        _case(
            "float_no_na",
            pd.DataFrame({"col1": [1.0, 2.0, nan]}),
            {"col1": "float"},
            pd.DataFrame({"col1": [1.0, 2.0, nan]}),
            is_float_dtype,
            1,
        ),
        _case(
            "int_na",
            pd.DataFrame({"col1": [2, 3, 4]}),
            {"col2": "integer"},
            pd.DataFrame({"col2": [None, None, None]}, dtype=pd.Int64Dtype()),
            is_integer_dtype,
            3,
        ),
        _case(
            "int_no_na",
            pd.DataFrame({"col1": [2, 3, 4]}),
            {"col1": "integer"},
            pd.DataFrame({"col1": [2, 3, 4]}, dtype=pd.Int64Dtype()),
            is_integer_dtype,
            0,
        ),
        _case(
            "bool_na",
            pd.DataFrame({"col1": [True, False, None]}),
            {"col2": "boolean"},
            pd.DataFrame({"col2": [None, None, None]}, dtype=pd.BooleanDtype()),
            is_bool_dtype,
            3,
        ),
        _case(
            "bool_no_na",
            pd.DataFrame({"col1": [True, False, None]}),
            {"col1": "boolean"},
            pd.DataFrame({"col1": [True, False, None]}, dtype=pd.BooleanDtype()),
            is_bool_dtype,
            1,
        ),
        _case(
            "empty_col",
            pd.DataFrame({"col1": []}),
            {"col2": "string"},
            pd.DataFrame({"col2": []}, dtype=str),
            is_string_dtype,
            0,
        ),
        _case(
            "empty_df",
            pd.DataFrame({}),
            {"col1": "float"},
            pd.DataFrame({"col1": []}, index=[]),
            is_float_dtype,
            0,
        ),
        _case(
            "empty_col_int",
            pd.DataFrame({"col1": []}),
            {"col2": "integer"},
            pd.DataFrame({"col2": []}, dtype=pd.Int64Dtype()),
            is_integer_dtype,
            0,
        ),
        _case(
            "empty_df_int",
            pd.DataFrame({"col1": []}),
            {"col2": "integer"},
            pd.DataFrame({"col2": []}, dtype=pd.Int64Dtype()),
            is_integer_dtype,
            0,
        ),
    ]


@pytest.mark.parametrize(
    "test_cases",
    get_create_missing_columns_test_cases(),
    ids=lambda case: case["name"],
)
def test_that_create_missing_columns_gets_expected_output_with_single_col_df(
    test_cases,
):
    """Each single-column scenario yields the expected frame, dtype, and NA count."""
    dataset = test_cases["test_input"]
    schema = test_cases["test_schema"]
    result = process_functions.create_missing_columns(dataset=dataset, schema=schema)
    # Exact comparison guards both the values and the column dtypes.
    assert_frame_equal(result, test_cases["expected_output"], check_exact=True)
    first_column = result.iloc[:, 0]
    assert test_cases["expected_dtype"](first_column)
    total_nas = result.isna().sum().sum()
    assert total_nas == test_cases["expected_na_count"]


def test_that_create_missing_columns_returns_expected_output_with_multi_col_df():
    """Missing columns of every supported type are created and ordered per schema."""
    test_input = pd.DataFrame(
        {
            "col2": ["str1", "str2", "str3"],
            "col1": [2, 3, 4],
            "col3": [2.0, 3.0, float("nan")],
            "col7": [True, False, None],
        }
    )
    test_schema = {
        "col1": "integer",
        "col2": "string",
        "col3": "float",
        "col4": "integer",
        "col5": "string",
        "col6": "float",
        "col7": "boolean",
        "col8": "boolean",
    }
    result = process_functions.create_missing_columns(
        dataset=test_input, schema=test_schema
    )

    nan = float("nan")
    # Build the expectation with nullable extension arrays up front instead of
    # converting dtypes after construction.
    expected_output = pd.DataFrame(
        {
            "col1": pd.array([2, 3, 4], dtype="Int64"),
            "col2": ["str1", "str2", "str3"],
            "col3": [2.0, 3.0, nan],
            "col4": pd.array([None, None, None], dtype="Int64"),
            "col5": ["", "", ""],
            "col6": [nan, nan, nan],
            "col7": pd.array([True, False, None], dtype="boolean"),
            "col8": pd.array([None, None, None], dtype="boolean"),
        }
    )

    assert result["col1"].dtype == pd.Int64Dtype()
    assert is_string_dtype(result["col2"])
    assert is_float_dtype(result["col3"])
    assert result["col4"].dtype == pd.Int64Dtype()
    assert is_string_dtype(result["col5"])
    assert is_float_dtype(result["col6"])
    assert result["col7"].dtype == pd.BooleanDtype()
    assert result["col8"].dtype == pd.BooleanDtype()
    # 1 NA in col3 + 3 each in col4/col6/col8 + 1 in col7 = 11 total.
    assert result.isna().sum().sum() == 11

    assert_frame_equal(result, expected_output, check_exact=True)

0 comments on commit 889698d

Please sign in to comment.