[GEN-974] Allow NaN, nan and NA strings for mutation data (#549)

* initial commit
* deprecated setup, replace with setup_method
* update black version and re-lint
* update _get_dataframe docstring for vcf and maf
* add additional tests
* fix relevant code smells
* remove unused conversion code
* add None into valid vals in test
Sage-Bionetworks · Feb 7, 2024 · 5e69a73 · 5e69a73
1 parent 45e54b3
commit 5e69a73
Show file tree

Hide file tree

Showing 29 changed files with 369 additions and 54 deletions.
diff --git a/genie/config.py b/genie/config.py
@@ -1,4 +1,5 @@
 """Configuration to obtain registry classes"""
+
 import importlib
 import logging
 

diff --git a/genie/create_case_lists.py b/genie/create_case_lists.py
@@ -1,6 +1,7 @@
 """
 Creates case lists per cancer type
 """
+
 from collections import defaultdict
 import csv
 import os

diff --git a/genie/dashboard_table_updater.py b/genie/dashboard_table_updater.py
@@ -1,4 +1,5 @@
 """Updates dashboard tables"""
+
 import argparse
 import datetime
 import logging
@@ -347,9 +348,11 @@ def update_oncotree_code_tables(syn, database_mappingdf):
     oncotree_mapping = process_functions.get_oncotree_code_mappings(oncotree_link)
 
     clinicaldf["PRIMARY_CODES"] = [
-        oncotree_mapping[i.upper()]["ONCOTREE_PRIMARY_NODE"]
-        if i.upper() in oncotree_mapping.keys()
-        else "DEPRECATED_CODE"
+        (
+            oncotree_mapping[i.upper()]["ONCOTREE_PRIMARY_NODE"]
+            if i.upper() in oncotree_mapping.keys()
+            else "DEPRECATED_CODE"
+        )
         for i in clinicaldf.ONCOTREE_CODE
     ]
 
@@ -457,9 +460,9 @@ def update_sample_difference_table(syn, database_mappingdf):
         .applymap(int)
     )
 
-    diff_between_releasesdf[
-        ["Clinical", "Mutation", "CNV", "SEG", "Fusions"]
-    ] = new_values
+    diff_between_releasesdf[["Clinical", "Mutation", "CNV", "SEG", "Fusions"]] = (
+        new_values
+    )
 
     load._update_table(
         syn,

diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py
@@ -1052,30 +1052,38 @@ def store_clinical_files(
     }
 
     clinicaldf["CANCER_TYPE"] = [
-        oncotree_dict[code.upper()]["CANCER_TYPE"]
-        if code.upper() in oncotree_dict.keys()
-        else float("nan")
+        (
+            oncotree_dict[code.upper()]["CANCER_TYPE"]
+            if code.upper() in oncotree_dict.keys()
+            else float("nan")
+        )
         for code in clinicaldf["ONCOTREE_CODE"]
     ]
 
     clinicaldf["CANCER_TYPE_DETAILED"] = [
-        oncotree_dict[code.upper()]["CANCER_TYPE_DETAILED"]
-        if code.upper() in oncotree_dict.keys()
-        else float("nan")
+        (
+            oncotree_dict[code.upper()]["CANCER_TYPE_DETAILED"]
+            if code.upper() in oncotree_dict.keys()
+            else float("nan")
+        )
         for code in clinicaldf["ONCOTREE_CODE"]
     ]
 
     clinicaldf["ONCOTREE_PRIMARY_NODE"] = [
-        oncotree_dict[code.upper()]["ONCOTREE_PRIMARY_NODE"]
-        if code.upper() in oncotree_dict.keys()
-        else float("nan")
+        (
+            oncotree_dict[code.upper()]["ONCOTREE_PRIMARY_NODE"]
+            if code.upper() in oncotree_dict.keys()
+            else float("nan")
+        )
         for code in clinicaldf["ONCOTREE_CODE"]
     ]
 
     clinicaldf["ONCOTREE_SECONDARY_NODE"] = [
-        oncotree_dict[code.upper()]["ONCOTREE_SECONDARY_NODE"]
-        if code.upper() in oncotree_dict.keys()
-        else float("nan")
+        (
+            oncotree_dict[code.upper()]["ONCOTREE_SECONDARY_NODE"]
+            if code.upper() in oncotree_dict.keys()
+            else float("nan")
+        )
         for code in clinicaldf["ONCOTREE_CODE"]
     ]
 
@@ -1086,9 +1094,11 @@ def store_clinical_files(
     # descriptions can match
     clinicaldf["AGE_AT_SEQ_REPORT_DAYS"] = clinicaldf["AGE_AT_SEQ_REPORT"]
     clinicaldf["AGE_AT_SEQ_REPORT"] = [
-        int(math.floor(int(float(age)) / 365.25))
-        if process_functions.checkInt(age)
-        else age
+        (
+            int(math.floor(int(float(age)) / 365.25))
+            if process_functions.checkInt(age)
+            else age
+        )
         for age in clinicaldf["AGE_AT_SEQ_REPORT"]
     ]
     clinicaldf["AGE_AT_SEQ_REPORT"][clinicaldf["AGE_AT_SEQ_REPORT"] == ">32485"] = ">89"

diff --git a/genie/example_filetype_format.py b/genie/example_filetype_format.py
@@ -1,6 +1,7 @@
 """TODO: Rename this to model.py
 This contains the GENIE model objects
 """
+
 from abc import ABCMeta
 from dataclasses import dataclass
 import logging

diff --git a/genie/load.py b/genie/load.py
@@ -2,6 +2,7 @@
 This module contains all the functions that stores data
 to Synapse
 """
+
 import logging
 import os
 import time

diff --git a/genie/process_functions.py b/genie/process_functions.py
@@ -1,4 +1,5 @@
 """Processing functions that are used in the GENIE pipeline"""
+
 import datetime
 import json
 import logging

diff --git a/genie/process_mutation.py b/genie/process_mutation.py
@@ -1,5 +1,6 @@
 """Process mutation files
 TODO deprecate this module and spread functions around"""
+
 from collections import namedtuple
 import logging
 import os

diff --git a/genie/transform.py b/genie/transform.py
@@ -1,5 +1,7 @@
 """This module contains all the transformation functions used throughout the GENIE
 package"""
+
+from typing import List
 import warnings
 
 import pandas as pd
@@ -64,3 +66,24 @@ def _convert_df_with_mixed_dtypes(read_csv_params: dict) -> pd.DataFrame:
         df = pd.read_csv(**read_csv_params, low_memory=False, engine="c")
     warnings.resetwarnings()
     return df
+
+
+def _convert_values_to_na(
+    input_df: pd.DataFrame, values_to_replace: List[str], columns_to_convert: List[str]
+) -> pd.DataFrame:
+    """Converts given values to NA in an input dataset
+
+    Args:
+        input_df (pd.DataFrame): input dataset
+        values_to_replace (List[str]): string values to replace with na
+        columns_to_convert (List[str]): subset of columns to apply the replacement to
+
+    Returns:
+        pd.DataFrame: dataset with specified values replaced with NAs
+    """
+    if not input_df.empty:
+        replace_mapping = {value: None for value in values_to_replace}
+        input_df[columns_to_convert] = input_df[columns_to_convert].replace(
+            replace_mapping
+        )
+    return input_df
diff --git a/genie/write_invalid_reasons.py b/genie/write_invalid_reasons.py
@@ -1,4 +1,5 @@
 """Write invalid reasons"""
+
 import logging
 import os
 

diff --git a/genie_registry/__init__.py b/genie_registry/__init__.py
@@ -1,4 +1,5 @@
 """Initialize GENIE registry"""
+
 # Import logging last to not take in synapseclient logging
 import logging
 

diff --git a/genie_registry/assay.py b/genie_registry/assay.py
@@ -1,4 +1,5 @@
 """Assay information class"""
+
 import os
 import yaml
 

diff --git a/genie_registry/bed.py b/genie_registry/bed.py
@@ -1,4 +1,5 @@
 """GENIE bed class and functions"""
+
 import os
 import logging
 import subprocess

diff --git a/genie_registry/clinical.py b/genie_registry/clinical.py
@@ -1,4 +1,5 @@
 """Clinical file format validation and processing"""
+
 # from __future__ import annotations
 import datetime
 from io import StringIO

diff --git a/genie_registry/maf.py b/genie_registry/maf.py
@@ -1,6 +1,7 @@
 from io import StringIO
-import os
 import logging
+import os
+from typing import List
 
 import pandas as pd
 
@@ -198,10 +199,6 @@ def _validate(self, mutationDF):
         for col in numerical_cols:
             col_exists = process_functions.checkColExist(mutationDF, col)
             if col_exists:
-                # Since NA is an allowed value, when reading in the dataframe
-                # the 'NA' string is not converted.  This will convert all
-                # 'NA' values in the numerical columns into actual float('nan')
-                mutationDF.loc[mutationDF[col] == "NA", col] = float("nan")
                 # Attempt to convert column to float
                 try:
                     mutationDF[col] = mutationDF[col].astype(float)
@@ -352,13 +349,38 @@ def _cross_validate(self, mutationDF: pd.DataFrame) -> tuple:
                     )
         return errors, warnings
 
-    def _get_dataframe(self, filePathList):
-        """Get mutation dataframe"""
-        # Must do this because pandas.read_csv will allow for a file to
-        # have more column headers than content.  E.g.
-        # A,B,C,D,E
-        # 1,2
-        # 2,3
+    def _get_dataframe(self, filePathList: List[str]) -> pd.DataFrame:
+        """Get mutation dataframe
+
+        1) Starts reading the first line in the file
+        2) Skips lines that start with #
+        3) Reads in second line
+        4) Checks that first line fields match second line. Must do this because
+        pandas.read_csv will allow for a file to have more column headers than content.
+        E.g)  A,B,C,D,E
+              1,2
+              2,3
+
+        5) We keep the 'NA', 'nan', and 'NaN' as strings in the data because
+        these are valid allele values
+        then convert the ones in the non-allele columns back to actual NAs
+
+        NOTE: Because allele columns are case-insensitive in maf data, we must
+        standardize the case of the columns when checking for the non-allele columns
+        to convert the NA strings to NAs
+
+        NOTE: This code allows empty dataframes to pass through
+        without errors
+
+        Args:
+            filePathList (List[str]): list of filepath(s)
+
+        Raises:
+            ValueError: First line fields don't match second line fields in file
+
+        Returns:
+            pd.DataFrame: mutation data
+        """
         with open(filePathList[0], "r") as maf_f:
             firstline = maf_f.readline()
             if firstline.startswith("#"):
@@ -370,34 +392,43 @@ def _get_dataframe(self, filePathList):
                 "Number of fields in a line do not match the "
                 "expected number of columns"
             )
+
         read_csv_params = {
             "filepath_or_buffer": filePathList[0],
             "sep": "\t",
             "comment": "#",
-            # Keep the value 'NA'
+            "keep_default_na": False,
             "na_values": [
                 "-1.#IND",
                 "1.#QNAN",
                 "1.#IND",
                 "-1.#QNAN",
                 "#N/A N/A",
-                "NaN",
                 "#N/A",
                 "N/A",
                 "#NA",
                 "NULL",
                 "-NaN",
-                "nan",
                 "-nan",
                 "",
             ],
-            "keep_default_na": False,
             # This is to check if people write files
             # with R, quote=T
             "quoting": 3,
             # Retain completely blank lines so that
             # validator will cause the file to fail
             "skip_blank_lines": False,
         }
+
         mutationdf = transform._convert_df_with_mixed_dtypes(read_csv_params)
+
+        mutationdf = transform._convert_values_to_na(
+            input_df=mutationdf,
+            values_to_replace=["NA", "nan", "NaN"],
+            columns_to_convert=[
+                col
+                for col in mutationdf.columns
+                if col.upper() not in self._allele_cols
+            ],
+        )
         return mutationdf
diff --git a/genie_registry/vcf.py b/genie_registry/vcf.py
@@ -1,10 +1,11 @@
 import logging
 import os
+from typing import List
 
 import pandas as pd
 
 from genie.example_filetype_format import FileTypeFormat
-from genie import process_functions, validate
+from genie import process_functions, transform, validate
 
 logger = logging.getLogger(__name__)
 
@@ -28,7 +29,25 @@ def _validateFilename(self, filePath):
         endswith_vcf = basename.endswith(".vcf")
         assert startswith_genie and endswith_vcf
 
-    def _get_dataframe(self, filePathList):
+    def _get_dataframe(self, filePathList: List[str]) -> pd.DataFrame:
+        """Get mutation dataframe
+
+        1) Looks for the line in the file starting with #CHROM, that will be
+        the header line (columns).
+
+        2) When reading in the data, we keep the 'NA', 'nan', and 'NaN'
+        as strings in the data because these are valid allele values
+        then convert the ones in the non-allele columns back to actual NAs
+
+        Args:
+            filePathList (List[str]): list of filepath(s)
+
+        Raises:
+            ValueError: when line with #CHROM doesn't exist in file
+
+        Returns:
+            pd.DataFrame: mutation data
+        """
         headers = None
         filepath = filePathList[0]
         with open(filepath, "r") as vcffile:
@@ -38,10 +57,37 @@ def _get_dataframe(self, filePathList):
                     break
         if headers is not None:
             vcfdf = pd.read_csv(
-                filepath, sep="\t", comment="#", header=None, names=headers
+                filepath,
+                sep="\t",
+                comment="#",
+                header=None,
+                names=headers,
+                keep_default_na=False,
+                na_values=[
+                    "-1.#IND",
+                    "1.#QNAN",
+                    "1.#IND",
+                    "-1.#QNAN",
+                    "#N/A N/A",
+                    "#N/A",
+                    "N/A",
+                    "#NA",
+                    "NULL",
+                    "-NaN",
+                    "-nan",
+                    "",
+                ],
             )
         else:
             raise ValueError("Your vcf must start with the header #CHROM")
+
+        vcfdf = transform._convert_values_to_na(
+            input_df=vcfdf,
+            values_to_replace=["NA", "nan", "NaN"],
+            columns_to_convert=[
+                col for col in vcfdf.columns if col not in self._allele_cols
+            ],
+        )
         return vcfdf
 
     def process_steps(self, df):