Skip to content

Commit

Permalink
Merge pull request #548 from Sage-Bionetworks/gen-1076-exclude-new-gn-var
Browse files Browse the repository at this point in the history

[GEN-1076] Exclude genomic_location_explanation from release
  • Loading branch information
rxu17 authored Jan 25, 2024
2 parents 4f9be49 + 116a709 commit 889698d
Show file tree
Hide file tree
Showing 3 changed files with 284 additions and 5 deletions.
73 changes: 68 additions & 5 deletions genie/database_to_staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,73 @@
SV_CENTER_PATH = os.path.join(GENIE_RELEASE_DIR, "data_sv_%s.txt")
BED_DIFFS_SEQASSAY_PATH = os.path.join(GENIE_RELEASE_DIR, "diff_%s.csv")

# Exact set AND order of columns written to the released mutation (MAF) files.
# The release pipeline subsets each configured MAF chunk to this list, so any
# annotation column not named here is excluded from the release output.
# NOTE(review): per this change's intent, genomic_location_explanation is
# deliberately absent from this list — do not add it back without review.
# NOTE(review): the ordering here presumably defines the release file's column
# order; confirm downstream consumers before reordering.
FULL_MAF_RELEASE_COLUMNS = [
    "Hugo_Symbol",
    "Entrez_Gene_Id",
    "Center",
    "NCBI_Build",
    "Chromosome",
    "Start_Position",
    "End_Position",
    "Strand",
    "Consequence",
    "Variant_Classification",
    "Variant_Type",
    "Reference_Allele",
    "Tumor_Seq_Allele1",
    "Tumor_Seq_Allele2",
    "dbSNP_RS",
    "dbSNP_Val_Status",
    "Tumor_Sample_Barcode",
    "Matched_Norm_Sample_Barcode",
    "Match_Norm_Seq_Allele1",
    "Match_Norm_Seq_Allele2",
    "Tumor_Validation_Allele1",
    "Tumor_Validation_Allele2",
    "Match_Norm_Validation_Allele1",
    "Match_Norm_Validation_Allele2",
    "Verification_Status",
    "Validation_Status",
    "Mutation_Status",
    "Sequencing_Phase",
    "Sequence_Source",
    "Validation_Method",
    "Score",
    "BAM_File",
    "Sequencer",
    "t_ref_count",
    "t_alt_count",
    "n_ref_count",
    "n_alt_count",
    "HGVSc",
    "HGVSp",
    "HGVSp_Short",
    "Transcript_ID",
    "RefSeq",
    "Protein_position",
    "Codons",
    "Exon_Number",
    "gnomAD_AF",
    "gnomAD_AFR_AF",
    "gnomAD_AMR_AF",
    "gnomAD_ASJ_AF",
    "gnomAD_EAS_AF",
    "gnomAD_FIN_AF",
    "gnomAD_NFE_AF",
    "gnomAD_OTH_AF",
    "gnomAD_SAS_AF",
    "FILTER",
    "Polyphen_Prediction",
    "Polyphen_Score",
    "SIFT_Prediction",
    "SIFT_Score",
    "SWISSPROT",
    "n_depth",
    "t_depth",
    "Annotation_Status",
    "mutationInCis_Flag",
]


# TODO: Add to transform.py
def _to_redact_interval(df_col):
Expand Down Expand Up @@ -755,10 +822,7 @@ def store_maf_files(
with open(MUTATIONS_CENTER_PATH % center, "w"):
pass
used_entities = []
# Must get the headers (because can't assume headers are the same order)
maf_ent = syn.get(centerMafSynIdsDf.id[0])
headerdf = pd.read_csv(maf_ent.path, sep="\t", comment="#", nrows=0)
column_order = headerdf.columns
for _, mafSynId in enumerate(centerMafSynIdsDf.id):
maf_ent = syn.get(mafSynId)
logger.info(maf_ent.path)
Expand All @@ -771,13 +835,12 @@ def store_maf_files(
)

for mafchunk in mafchunks:
# Reorder column headers
mafchunk = mafchunk[column_order]
# Get center for center staging maf
# Configure maf
configured_mafdf = configure_maf(
mafchunk, remove_mafinbed_variants, flagged_mutationInCis_variants
)
configured_mafdf = configured_mafdf[FULL_MAF_RELEASE_COLUMNS]
# Create maf for release
merged_mafdf = remove_maf_samples(
configured_mafdf, keep_for_merged_consortium_samples
Expand Down
32 changes: 32 additions & 0 deletions genie/process_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -948,3 +948,35 @@ def create_new_fileformat_table(
"newdb_mappingdf": newdb_mappingdf,
"moved_ent": moved_ent,
}


def create_missing_columns(dataset: pd.DataFrame, schema: dict) -> pd.DataFrame:
    """Creates and fills missing columns with the relevant NA value for the
    given data type, then restricts/reorders the dataset to the schema's
    columns. Note that special handling had to occur for allowing NAs in
    integer based columns in pandas by converting the integer column into
    Int64 (pandas nullable integer data type); booleans are likewise cast to
    the nullable ``boolean`` dtype.

    The input dataset is not modified; a copy is returned.

    Args:
        dataset (pd.DataFrame): input dataset to fill missing columns for
        schema (dict): the expected schema {column_name(str): data_type(str)}
            for the input dataset; data_type is one of
            "string", "integer", "float", "boolean"

    Returns:
        pd.DataFrame: dataset with any missing schema columns added,
            restricted and reordered to the schema's columns

    Raises:
        KeyError: if a missing column's data_type is not a supported type
    """
    # NA placeholder per supported data type; strings use "" (not NaN).
    missing_values = {
        "string": "",
        "integer": None,
        "float": float("nan"),
        "boolean": None,
    }
    # Work on a copy so in-place dtype casts below never mutate the caller's
    # DataFrame (the original implementation leaked these casts to callers).
    dataset = dataset.copy()
    for column, data_type in schema.items():
        if column not in dataset.columns:
            dataset = dataset.assign(**{column: missing_values[data_type]})

        # Only way to preserve NAs for these specific dtype columns:
        # cast to the pandas nullable extension dtypes.
        if data_type == "integer":
            dataset[column] = dataset[column].astype("Int64")
        elif data_type == "boolean":
            dataset[column] = dataset[column].astype(pd.BooleanDtype())
    # Subset drops any column not in the schema and fixes the column order.
    return dataset[list(schema.keys())]
184 changes: 184 additions & 0 deletions tests/test_process_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@
import uuid

import pandas as pd
from pandas.api.types import (
is_bool_dtype,
is_float_dtype,
is_integer_dtype,
is_string_dtype,
)
from pandas.testing import assert_frame_equal
import pytest
import synapseclient

Expand Down Expand Up @@ -500,3 +507,180 @@ def test_that_func_returns_correct_error_warning_if_input_col_has_na_and_nas_is_
)
assert error == ""
assert warning == ""


def get_create_missing_columns_test_cases():
    """Scenarios for create_missing_columns: one per dtype / NA combination."""

    def _case(name, test_input, test_schema, expected_output, expected_dtype, expected_na_count):
        # Bundle one scenario into the dict shape the parametrized test expects.
        return {
            "name": name,
            "test_input": test_input,
            "test_schema": test_schema,
            "expected_output": expected_output,
            "expected_dtype": expected_dtype,
            "expected_na_count": expected_na_count,
        }

    nan = float("nan")
    # Each case builds fresh DataFrames so no state is shared between tests.
    return [
        _case(
            "str_no_na",
            pd.DataFrame({"col1": ["str1", "str2", ""]}),
            {"col1": "string"},
            pd.DataFrame({"col1": ["str1", "str2", ""]}),
            is_string_dtype,
            0,
        ),
        _case(
            "str_na",
            pd.DataFrame({"col1": ["str1", "str2", ""]}),
            {"col2": "string"},
            pd.DataFrame({"col2": ["", "", ""]}),
            is_string_dtype,
            0,
        ),
        _case(
            "float_na",
            pd.DataFrame({"col1": ["str1", "str2", ""]}),
            {"col2": "float"},
            pd.DataFrame({"col2": [nan, nan, nan]}),
            is_float_dtype,
            3,
        ),
        _case(
            "float_no_na",
            pd.DataFrame({"col1": [1.0, 2.0, nan]}),
            {"col1": "float"},
            pd.DataFrame({"col1": [1.0, 2.0, nan]}),
            is_float_dtype,
            1,
        ),
        _case(
            "int_na",
            pd.DataFrame({"col1": [2, 3, 4]}),
            {"col2": "integer"},
            pd.DataFrame({"col2": [None, None, None]}, dtype=pd.Int64Dtype()),
            is_integer_dtype,
            3,
        ),
        _case(
            "int_no_na",
            pd.DataFrame({"col1": [2, 3, 4]}),
            {"col1": "integer"},
            pd.DataFrame({"col1": [2, 3, 4]}, dtype=pd.Int64Dtype()),
            is_integer_dtype,
            0,
        ),
        _case(
            "bool_na",
            pd.DataFrame({"col1": [True, False, None]}),
            {"col2": "boolean"},
            pd.DataFrame({"col2": [None, None, None]}, dtype=pd.BooleanDtype()),
            is_bool_dtype,
            3,
        ),
        _case(
            "bool_no_na",
            pd.DataFrame({"col1": [True, False, None]}),
            {"col1": "boolean"},
            pd.DataFrame({"col1": [True, False, None]}, dtype=pd.BooleanDtype()),
            is_bool_dtype,
            1,
        ),
        _case(
            "empty_col",
            pd.DataFrame({"col1": []}),
            {"col2": "string"},
            pd.DataFrame({"col2": []}, dtype=str),
            is_string_dtype,
            0,
        ),
        _case(
            "empty_df",
            pd.DataFrame({}),
            {"col1": "float"},
            pd.DataFrame({"col1": []}, index=[]),
            is_float_dtype,
            0,
        ),
        _case(
            "empty_col_int",
            pd.DataFrame({"col1": []}),
            {"col2": "integer"},
            pd.DataFrame({"col2": []}, dtype=pd.Int64Dtype()),
            is_integer_dtype,
            0,
        ),
        _case(
            "empty_df_int",
            pd.DataFrame({"col1": []}),
            {"col2": "integer"},
            pd.DataFrame({"col2": []}, dtype=pd.Int64Dtype()),
            is_integer_dtype,
            0,
        ),
    ]


@pytest.mark.parametrize(
    "test_cases",
    get_create_missing_columns_test_cases(),
    ids=lambda case: case["name"],
)
def test_that_create_missing_columns_gets_expected_output_with_single_col_df(
    test_cases,
):
    """Each single-column scenario yields the expected frame, dtype, and NA count."""
    dataset = test_cases["test_input"]
    schema = test_cases["test_schema"]
    result = process_functions.create_missing_columns(dataset=dataset, schema=schema)
    # Exact comparison guards both the values and the column dtypes.
    assert_frame_equal(result, test_cases["expected_output"], check_exact=True)
    first_column = result.iloc[:, 0]
    assert test_cases["expected_dtype"](first_column)
    total_nas = result.isna().sum().sum()
    assert total_nas == test_cases["expected_na_count"]


def test_that_create_missing_columns_returns_expected_output_with_multi_col_df():
    """Missing columns of every supported type are created and ordered per schema."""
    test_input = pd.DataFrame(
        {
            "col2": ["str1", "str2", "str3"],
            "col1": [2, 3, 4],
            "col3": [2.0, 3.0, float("nan")],
            "col7": [True, False, None],
        }
    )
    test_schema = {
        "col1": "integer",
        "col2": "string",
        "col3": "float",
        "col4": "integer",
        "col5": "string",
        "col6": "float",
        "col7": "boolean",
        "col8": "boolean",
    }
    result = process_functions.create_missing_columns(
        dataset=test_input, schema=test_schema
    )

    nan = float("nan")
    # Build the expectation with nullable extension arrays up front instead of
    # converting dtypes after construction.
    expected_output = pd.DataFrame(
        {
            "col1": pd.array([2, 3, 4], dtype="Int64"),
            "col2": ["str1", "str2", "str3"],
            "col3": [2.0, 3.0, nan],
            "col4": pd.array([None, None, None], dtype="Int64"),
            "col5": ["", "", ""],
            "col6": [nan, nan, nan],
            "col7": pd.array([True, False, None], dtype="boolean"),
            "col8": pd.array([None, None, None], dtype="boolean"),
        }
    )

    assert result["col1"].dtype == pd.Int64Dtype()
    assert is_string_dtype(result["col2"])
    assert is_float_dtype(result["col3"])
    assert result["col4"].dtype == pd.Int64Dtype()
    assert is_string_dtype(result["col5"])
    assert is_float_dtype(result["col6"])
    assert result["col7"].dtype == pd.BooleanDtype()
    assert result["col8"].dtype == pd.BooleanDtype()
    # 1 NA in col3 + 3 each in col4/col6/col8 + 1 in col7 = 11 total.
    assert result.isna().sum().sum() == 11

    assert_frame_equal(result, expected_output, check_exact=True)

0 comments on commit 889698d

Please sign in to comment.