From fb314f1d24e33623f469b46540e306068edf0be7 Mon Sep 17 00:00:00 2001
From: QianqiHuang <71415495+QianqiHuang@users.noreply.github.com>
Date: Sat, 5 Dec 2020 22:42:32 +0800
Subject: [PATCH 1/3] Update data_toolbox.py

---
 data_toolbox/data_toolbox.py | 946 +++++++++++++++++++----------------
 1 file changed, 505 insertions(+), 441 deletions(-)

diff --git a/data_toolbox/data_toolbox.py b/data_toolbox/data_toolbox.py
index 30fb16d..d70fdb0 100644
--- a/data_toolbox/data_toolbox.py
+++ b/data_toolbox/data_toolbox.py
@@ -1,441 +1,505 @@
-"""
-Prototype functions that are useful for data exploration
-"""
-import pandas as pd
-import numpy as np
-
-
-def find_patient(fname, patientunitstayid, cs=100000, verbose=True):
-    """
-    Retrieve single patient info. Also sorts by offsets
-    Parameters:
-        patientunitstayid: the patient whose data to pull
-        cs = the size of the chunk to iterate through
-        verbose = information on how many iterations through
-    Returns:
-        df: a dataframe (sorted by nursingchartoffset) with data from that patient
-    """
-
-    iteration = 0
-    for chunk in pd.read_csv(fname, chunksize=cs):
-        # for keeping track of where we are
-        if verbose:
-            if iteration % 100 == 0:
-                print("iter {0}".format(iteration))
-            iteration += 1
-
-        # pull patient information out
-        df = chunk.loc[chunk["patientunitstayid"] == patientunitstayid]
-
-        # when we find our dude
-        if df.empty == False:
-            print("Found patient")
-            break
-
-    if df.empty == True:
-        print("Error: no patient was found by that id")
-        return None
-
-    # sort for convenience
-    df = df.sort_values(by=["nursingchartoffset"])
-
-    return df
-
-
-def plot_vitals(vitals, patient):
-    """
-    Given a list of vitals and patient, form a nice timeseries plot
-    Args:
-        vitals: list of list in the form:
-                 [nursingchartcelltypevallabel, nursingchartcelltypevalname]
-        patient: dataframe from a single patient's data
-    """
-
-    plt.figure(figsize=[12, 8])
-    for v in vitals:
-        idx = (patient["nursingchartcelltypevallabel"] == v[0]) & (
-            patient["nursingchartcelltypevalname"] == v[1]
-        )
-        df_plot = patient.loc[idx, :]
-
-        if "Systolic" in v[1]:
-            marker = "^-"
-        elif "Diastolic" in v[1]:
-            marker = "v-"
-        else:
-            marker = "o-"
-        plt.plot(
-            df_plot["nursingchartoffset"],
-            pd.to_numeric(df_plot["nursingchartvalue"], errors="coerce"),
-            marker,
-            markersize=8,
-            lw=2,
-            label=v,
-        )
-
-    plt.xlabel("Time since ICU admission (minutes)")
-    plt.ylabel("Measurement value")
-    plt.legend(loc="upper right")
-    plt.show()
-
-
-def multi_patient_feature_plot(df, vitals):
-    """
-    For a dataframe of a given size, plot the time series feature of all patients
-    Parameters:
-        df = the pd dataframe to extract info from (don't use a big one!)
-        vitals = list of list in form [[nursingchartcelltypevallabel, nursingchartcelltypevalname]]
-    """
-
-    # sort df if not already done and find unique patients
-    df = df.sort_values(by=["nursingchartoffset"])
-    unique_patients = df.patientunitstayid.unique()
-
-    plt.figure()
-    # check out all unique patientunitstayid's
-    for patient_id in unique_patients:
-        patient = df.loc[df["patientunitstayid"] == patient_id]
-        idx = (patient["nursingchartcelltypevallabel"] == vitals[0][0]) & (
-            patient["nursingchartcelltypevalname"] == vitals[0][1]
-        )
-        df_plot = patient.loc[idx, :]
-
-        # style choice
-        marker = "o-"
-
-        plt.plot(
-            df_plot["nursingchartoffset"],
-            pd.to_numeric(df_plot["nursingchartvalue"], errors="coerce"),
-            marker,
-            markersize=8,
-            lw=2,
-            label=vitals,
-        )
-
-    plt.xlabel("Time Since ICU Admission (minutes)")
-    plt.ylabel("GCS Value")
-    plt.show()
-
-
-def get_next_patient(fname, idx_start=0, num_rows=1000000):
-    """
-    Function to read one patient at a time. This function is intended to be chained together
-    with repeated calls.
-    Note: this isn't the fastest way to do this for sure, but it works
-
-    Parameters:
-        fname: the filename to draw data from
-        idx_start: the row to skip to during data collection
-        num_rows: the amount of rows to read at a time
-    Returns:
-        patient: the dataframe belonging solely to the patient of interest
-                 the dataframe is returned SORTED by nursingchartoffset
-        next_idx: the row to skip to (aka where the next patient begins)
-    """
-
-    # extract headers
-    temp = pd.read_csv(fname, nrows=1)
-    header = temp.columns
-
-    # read in a chunk of the dataframe starting at a location
-    df = pd.read_csv(fname, skiprows=idx_start, nrows=num_rows)
-    df.columns = header
-
-    # get patientunitstayid - will be the patient of interest
-    patient_id = df["patientunitstayid"].iloc[0]
-
-    # extract all data of patient with this id
-    # and sort according to offsets
-    patient = df.loc[df["patientunitstayid"] == patient_id]
-    patient = patient.sort_values(by=["nursingchartoffset"])
-
-    # keep track of next patient index (i.e. how many rows to skip)
-    length, __ = patient.shape
-    next_idx = length + idx_start
-
-    return patient, next_idx
-
-
-# Caution! Potential problems with corrupted index!
-def shear_next_patient(df, next_idx):
-    """
-    Function to generate a dataframe for next patient and trim the original.
-
-    Parameters:
-        df: the original dataframe that contains all data
-        next_idx: the index to check
-    Returns:
-        df: the trimmed dataframe without the selected patient's data
-        p_df: the patient dataframe that was sheared from the original
-        p_id: the patient id
-        next_idx: the index to skip to (aka where the next patient begins)
-    """
-    # ASSUMES THE DF HAS BEEN SORTED BY PATIENTUNITSTAYID ALREADY
-
-    # Trim the dataframe
-    df = df.loc[next_idx:]
-
-    # Select the patient of interest
-    p_id = df["patientunitstayid"].loc[next_idx]
-    p_df = df.loc[df["patientunitstayid"] == p_id]
-
-    # Sort by the offset
-    p_df = p_df.sort_values(by=["nursingchartoffset"])
-
-    # Determine the index to skip to
-    length, __ = p_df.shape
-    next_idx = length + next_idx
-
-    return df, p_df, p_id, next_idx
-
-
-def lift_next_patient(df, next_idx):
-    """
-    Function to generate a dataframe for next patient.
-
-    Parameters:
-        df: the original dataframe that contains all data
-        next_idx: the index to check
-    Returns:
-        p_df: the patient dataframe that was lifted
-        p_id: the patient id
-        next_idx: the index to skip to (aka where the next patient begins)
-    """
-    # ASSUMES THE DF HAS BEEN SORTED BY PATIENTUNITSTAYID ALREADY
-
-    # index of patient
-    p_id = df["patientunitstayid"].iloc[next_idx]
-
-    # extract all data of patient with this id and sort according to offsets
-    p_df = df.loc[df["patientunitstayid"] == p_id]
-    p_df = p_df.sort_values(by=["nursingchartoffset"])
-
-    # keep track of next patient index (i.e. how many rows to skip)
-    length, __ = p_df.shape
-    next_idx = length + next_idx
-
-    return p_df, p_id, next_idx
-
-
-def filter_patients(df, patients):
-    """
-    Function to remove all rows that is not one of targeted patients
-
-    Parameters:
-        df: the original dataframe that contains all data
-        patients: the patient list interested
-    Return:
-        filtered df of target patients
-    """
-
-    return df[df["patientunitstayid"].isin(patients["patientunitstayid"].to_list())]
-
-
-def apply_exclusion_criteria(df, diagnoses, criteria):
-    """
-    Function to apply exclusion criteria.
-
-    Parameters:
-        df: the original dataframe that contains all data
-        diagnoses: the diagnosis dataframe
-        criteria: the criteria list
-    Return:
-        excluded_df: filtered df that excluded diagnosis criteria
-    """
-
-    # Apply a mask to find ids of patients to exclude
-    mask = diagnoses.diagnosisstring.apply(
-        lambda x: any(item for item in criteria if item in x)
-    )
-    temp_df = diagnoses[mask]
-
-    # recover just the ids
-    exclusion_ids = temp_df["patientunitstayid"]
-
-    # apply those ids
-    excluded_df = df[~df["patientunitstayid"].isin(exclusion_ids)]
-    return excluded_df
-
-
-def normal_temperature(num):
-    """
-    Function to normalize temperature values.
-
-    Parameters:
-        num: the originial input value
-    Return:
-        num: the normalized output value
-    """
-    # Return null values direcly
-    if num == np.nan:
-        return num
-    # Convert Fahrenheit to Celsius
-    # And apply the function again
-    elif num > 50:
-        return normal_temperature((num - 32) * 5 / 9)
-    # Remove values out of range
-    elif num < 15 or num > 45:
-        return np.nan
-    # Return normal values directly
-    else:
-        return num
-
-
-def normal_sao2(num):
-    """
-    Function to normalize O2 saturation values.
-
-    Parameters:
-        num: the originial input value
-    Return:
-        num: the normalized output value
-    """
-    # Return null values direcly
-    if num == np.nan:
-        return num
-    # Remove values out of range
-    elif num < 50 or num > 100:
-        return np.nan
-    # Return normal values directly
-    else:
-        return num
-
-
-def normal_heartrate(num):
-    """
-    Function to normalize heart rate values.
-
-    Parameters:
-        num: the originial input value
-    Return:
-        num: the normalized output value
-    """
-    # Return null values direcly
-    if num == np.nan:
-        return num
-    # Remove values out of range
-    elif num > 300 or num < 0:
-        return np.nan
-    # Return normal values directly
-    else:
-        return num
-
-
-def normal_respiration(num):
-    """
-    Function to normalize respiratory rate values.
-
-    Parameters:
-        num: the originial input value
-    Return:
-        num: the normalized output value
-    """
-    # Return null values direcly
-    if num == np.nan:
-        return num
-    # Remove values out of range
-    elif num > 100 or num < 0:
-        return np.nan
-    # Return normal values directly
-    else:
-        return num
-
-
-def normal_cvp(num):
-    """
-    Function to normalize central venous pressure values.
-
-    Parameters:
-        num: the originial input value
-    Return:
-        num: the normalized output value
-    """
-    # Return null values direcly
-    if num == np.nan:
-        return num
-    # Remove values out of range
-    elif num < -10 or num > 50:
-        return np.nan
-    # Return normal values directly
-    else:
-        return num
-
-
-def normal_etco2(num):
-    """
-    Function to normalize end tidal CO2 values.
-
-    Parameters:
-        num: the originial input value
-    Return:
-        num: the normalized output value
-    """
-    # Return null values direcly
-    if num == np.nan:
-        return num
-    # Remove values out of range
-    elif num < 0 or num > 100:
-        return np.nan
-    # Return normal values directly
-    else:
-        return num
-
-
-def normal_systemic(systolic, diastolic, mean_p):
-    """
-    Function to normalize systemic blood pressure values.
-
-    Parameters:
-        num: the originial input value
-    Return:
-        num: the normalized output value
-    """
-    # Return null values direcly
-    if systolic == np.nan or diastolic == np.nan or mean_p == np.nan:
-        return np.nan, np.nan, np.nan
-    # Remove values out of range
-    elif systolic < 0 or systolic > 300:
-        return np.nan, np.nan, np.nan
-    elif diastolic < 0 or diastolic > 200:
-        return np.nan, np.nan, np.nan
-    elif mean_p < 0 or mean_p > 190:
-        return np.nan, np.nan, np.nan
-    elif diastolic >= mean_p:
-        return np.nan, np.nan, np.nan
-    elif systolic < mean_p:
-        return np.nan, np.nan, np.nan
-    elif systolic - diastolic <= 4:
-        return np.nan, np.nan, np.nan
-    # Return normal values directly
-    else:
-        return systolic, diastolic, mean_p
-
-
-def normal_pa(systolic, diastolic, mean_p):
-    """
-    Function to normalize pulmonary artery blood pressure values.
-
-    Parameters:
-        num: the originial input value
-    Return:
-        num: the normalized output value
-    """
-    # Return null values direcly
-    if systolic == np.nan or diastolic == np.nan or mean_p == np.nan:
-        return np.nan, np.nan, np.nan
-    # Remove values out of range
-    elif systolic < 0 or systolic > 300:
-        return np.nan, np.nan, np.nan
-    elif diastolic < 0 or diastolic > 200:
-        return np.nan, np.nan, np.nan
-    elif mean_p < 0 or mean_p > 190:
-        return np.nan, np.nan, np.nan
-    elif diastolic >= mean_p:
-        return np.nan, np.nan, np.nan
-    elif systolic < mean_p:
-        return np.nan, np.nan, np.nan
-    elif systolic - diastolic <= 4:
-        return np.nan, np.nan, np.nan
-    # Return normal values directly
-    else:
-        return systolic, diastolic, mean_p
+"""
+Prototype functions that are useful for data exploration
+"""
+import pandas as pd
+import numpy as np
+
+
+def find_patient(fname, patientunitstayid, cs=100000, verbose=True):
+    """
+    Retrieve single patient info. Also sorts by offsets
+    Parameters:
+        patientunitstayid: the patient whose data to pull
+        cs = the size of the chunk to iterate through
+        verbose = information on how many iterations through
+    Returns:
+        df: a dataframe (sorted by nursingchartoffset) with data from that patient
+    """
+
+    iteration = 0
+    for chunk in pd.read_csv(fname, chunksize=cs):
+        # for keeping track of where we are
+        if verbose:
+            if iteration % 100 == 0:
+                print("iter {0}".format(iteration))
+            iteration += 1
+
+        # pull patient information out
+        df = chunk.loc[chunk["patientunitstayid"] == patientunitstayid]
+
+        # when we find our dude
+        if df.empty == False:
+            print("Found patient")
+            break
+
+    if df.empty == True:
+        print("Error: no patient was found by that id")
+        return None
+
+    # sort for convenience
+    df = df.sort_values(by=["nursingchartoffset"])
+
+    return df
+
+
+def plot_vitals(vitals, patient):
+    """
+    Given a list of vitals and patient, form a nice timeseries plot
+    Args:
+        vitals: list of list in the form:
+                 [nursingchartcelltypevallabel, nursingchartcelltypevalname]
+        patient: dataframe from a single patient's data
+    """
+
+    plt.figure(figsize=[12, 8])
+    for v in vitals:
+        idx = (patient["nursingchartcelltypevallabel"] == v[0]) & (
+            patient["nursingchartcelltypevalname"] == v[1]
+        )
+        df_plot = patient.loc[idx, :]
+
+        if "Systolic" in v[1]:
+            marker = "^-"
+        elif "Diastolic" in v[1]:
+            marker = "v-"
+        else:
+            marker = "o-"
+        plt.plot(
+            df_plot["nursingchartoffset"],
+            pd.to_numeric(df_plot["nursingchartvalue"], errors="coerce"),
+            marker,
+            markersize=8,
+            lw=2,
+            label=v,
+        )
+
+    plt.xlabel("Time since ICU admission (minutes)")
+    plt.ylabel("Measurement value")
+    plt.legend(loc="upper right")
+    plt.show()
+
+
+def multi_patient_feature_plot(df, vitals):
+    """
+    For a dataframe of a given size, plot the time series feature of all patients
+    Parameters:
+        df = the pd dataframe to extract info from (don't use a big one!)
+        vitals = list of list in form [[nursingchartcelltypevallabel, nursingchartcelltypevalname]]
+    """
+
+    # sort df if not already done and find unique patients
+    df = df.sort_values(by=["nursingchartoffset"])
+    unique_patients = df.patientunitstayid.unique()
+
+    plt.figure()
+    # check out all unique patientunitstayid's
+    for patient_id in unique_patients:
+        patient = df.loc[df["patientunitstayid"] == patient_id]
+        idx = (patient["nursingchartcelltypevallabel"] == vitals[0][0]) & (
+            patient["nursingchartcelltypevalname"] == vitals[0][1]
+        )
+        df_plot = patient.loc[idx, :]
+
+        # style choice
+        marker = "o-"
+
+        plt.plot(
+            df_plot["nursingchartoffset"],
+            pd.to_numeric(df_plot["nursingchartvalue"], errors="coerce"),
+            marker,
+            markersize=8,
+            lw=2,
+            label=vitals,
+        )
+
+    plt.xlabel("Time Since ICU Admission (minutes)")
+    plt.ylabel("GCS Value")
+    plt.show()
+
+
+def get_next_patient(fname, idx_start=0, num_rows=1000000):
+    """
+    Function to read one patient at a time. This function is intended to be chained together
+    with repeated calls.
+    Note: this isn't the fastest way to do this for sure, but it works
+
+    Parameters:
+        fname: the filename to draw data from
+        idx_start: the row to skip to during data collection
+        num_rows: the amount of rows to read at a time
+    Returns:
+        patient: the dataframe belonging solely to the patient of interest
+                 the dataframe is returned SORTED by nursingchartoffset
+        next_idx: the row to skip to (aka where the next patient begins)
+    """
+
+    # extract headers
+    temp = pd.read_csv(fname, nrows=1)
+    header = temp.columns
+
+    # read in a chunk of the dataframe starting at a location
+    df = pd.read_csv(fname, skiprows=idx_start, nrows=num_rows)
+    df.columns = header
+
+    # get patientunitstayid - will be the patient of interest
+    patient_id = df["patientunitstayid"].iloc[0]
+
+    # extract all data of patient with this id
+    # and sort according to offsets
+    patient = df.loc[df["patientunitstayid"] == patient_id]
+    patient = patient.sort_values(by=["nursingchartoffset"])
+
+    # keep track of next patient index (i.e. how many rows to skip)
+    length, __ = patient.shape
+    next_idx = length + idx_start
+
+    return patient, next_idx
+
+
+# Caution! Potential problems with corrupted index!
+def shear_next_patient(df, next_idx):
+    """
+    Function to generate a dataframe for next patient and trim the original.
+
+    Parameters:
+        df: the original dataframe that contains all data
+        next_idx: the index to check
+    Returns:
+        df: the trimmed dataframe without the selected patient's data
+        p_df: the patient dataframe that was sheared from the original
+        p_id: the patient id
+        next_idx: the index to skip to (aka where the next patient begins)
+    """
+    # ASSUMES THE DF HAS BEEN SORTED BY PATIENTUNITSTAYID ALREADY
+
+    # Trim the dataframe
+    df = df.loc[next_idx:]
+
+    # Select the patient of interest
+    p_id = df["patientunitstayid"].loc[next_idx]
+    p_df = df.loc[df["patientunitstayid"] == p_id]
+
+    # Sort by the offset
+    p_df = p_df.sort_values(by=["nursingchartoffset"])
+
+    # Determine the index to skip to
+    length, __ = p_df.shape
+    next_idx = length + next_idx
+
+    return df, p_df, p_id, next_idx
+
+
+def lift_next_patient(df, next_idx):
+    """
+    Function to generate a dataframe for next patient.
+
+    Parameters:
+        df: the original dataframe that contains all data
+        next_idx: the index to check
+    Returns:
+        p_df: the patient dataframe that was lifted
+        p_id: the patient id
+        next_idx: the index to skip to (aka where the next patient begins)
+    """
+    # ASSUMES THE DF HAS BEEN SORTED BY PATIENTUNITSTAYID ALREADY
+
+    # index of patient
+    p_id = df["patientunitstayid"].iloc[next_idx]
+
+    # extract all data of patient with this id and sort according to offsets
+    p_df = df.loc[df["patientunitstayid"] == p_id]
+    p_df = p_df.sort_values(by=["nursingchartoffset"])
+
+    # keep track of next patient index (i.e. how many rows to skip)
+    length, __ = p_df.shape
+    next_idx = length + next_idx
+
+    return p_df, p_id, next_idx
+
+
+def filter_patients(df, patients):
+    """
+    Function to keep only the rows that belong to one of the targeted patients
+
+    Parameters:
+        df: the original dataframe that contains all data
+        patients: dataframe whose patientunitstayid column lists the patients of interest
+    Return:
+        filtered df of target patients
+    """
+
+    return df[df["patientunitstayid"].isin(patients["patientunitstayid"].to_list())]
+
+
+def apply_exclusion_criteria(df, diagnoses, criteria):
+    """
+    Function to apply exclusion criteria.
+
+    Parameters:
+        df: the original dataframe that contains all data
+        diagnoses: the diagnosis dataframe
+        criteria: the criteria list
+    Return:
+        excluded_df: filtered df that excluded diagnosis criteria
+    """
+
+    # Apply a mask to find ids of patients to exclude
+    mask = diagnoses.diagnosisstring.apply(
+        lambda x: any(item for item in criteria if item in x)
+    )
+    temp_df = diagnoses[mask]
+
+    # recover just the ids
+    exclusion_ids = temp_df["patientunitstayid"]
+
+    # apply those ids
+    excluded_df = df[~df["patientunitstayid"].isin(exclusion_ids)]
+    return excluded_df
+
+
+def normal_temperature(num):
+    """
+    Function to normalize temperature values.
+
+    Parameters:
+        num: the original input value
+    Return:
+        num: the normalized output value
+    """
+    # Return null values directly (NOTE: "== np.nan" is never True; np.isnan is needed)
+    if num == np.nan:
+        return num
+    # Convert Fahrenheit to Celsius
+    # And apply the function again
+    elif num > 50:
+        return normal_temperature((num - 32) * 5 / 9)
+    # Remove values out of range
+    elif num < 15 or num > 45:
+        return np.nan
+    # Return normal values directly
+    else:
+        return num
+
+
+def normal_sao2(num):
+    """
+    Function to normalize O2 saturation values.
+
+    Parameters:
+        num: the original input value
+    Return:
+        num: the normalized output value
+    """
+    # Return null values directly (NOTE: "== np.nan" is never True; np.isnan is needed)
+    if num == np.nan:
+        return num
+    # Remove values out of range
+    elif num < 50 or num > 100:
+        return np.nan
+    # Return normal values directly
+    else:
+        return num
+
+
+def normal_heartrate(num):
+    """
+    Function to normalize heart rate values.
+
+    Parameters:
+        num: the original input value
+    Return:
+        num: the normalized output value
+    """
+    # Return null values directly (NOTE: "== np.nan" is never True; np.isnan is needed)
+    if num == np.nan:
+        return num
+    # Remove values out of range
+    elif num > 300 or num < 0:
+        return np.nan
+    # Return normal values directly
+    else:
+        return num
+
+
+def normal_respiration(num):
+    """
+    Function to normalize respiratory rate values.
+
+    Parameters:
+        num: the original input value
+    Return:
+        num: the normalized output value
+    """
+    # Return null values directly (NOTE: "== np.nan" is never True; np.isnan is needed)
+    if num == np.nan:
+        return num
+    # Remove values out of range
+    elif num > 100 or num < 0:
+        return np.nan
+    # Return normal values directly
+    else:
+        return num
+
+
+def normal_cvp(num):
+    """
+    Function to normalize central venous pressure values.
+
+    Parameters:
+        num: the original input value
+    Return:
+        num: the normalized output value
+    """
+    # Return null values directly (NOTE: "== np.nan" is never True; np.isnan is needed)
+    if num == np.nan:
+        return num
+    # Remove values out of range
+    elif num < -10 or num > 50:
+        return np.nan
+    # Return normal values directly
+    else:
+        return num
+
+
+def normal_etco2(num):
+    """
+    Function to normalize end tidal CO2 values.
+
+    Parameters:
+        num: the original input value
+    Return:
+        num: the normalized output value
+    """
+    # Return null values directly (NOTE: "== np.nan" is never True; np.isnan is needed)
+    if num == np.nan:
+        return num
+    # Remove values out of range
+    elif num < 0 or num > 100:
+        return np.nan
+    # Return normal values directly
+    else:
+        return num
+
+
+def normal_systemic(systolic, diastolic, mean_p):
+    """
+    Function to normalize systemic blood pressure values.
+
+    Parameters:
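+        systolic, diastolic, mean_p: the original input values
+    Return:
+        systolic, diastolic, mean_p: the inputs unchanged, or three NaNs if any check fails
+    """
+    # Return null values directly (NOTE: "== np.nan" is never True; np.isnan is needed)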
+    if systolic == np.nan or diastolic == np.nan or mean_p == np.nan:
+        return np.nan, np.nan, np.nan
+    # Remove values out of range
+    elif systolic < 0 or systolic > 300:
+        return np.nan, np.nan, np.nan
+    elif diastolic < 0 or diastolic > 200:
+        return np.nan, np.nan, np.nan
+    elif mean_p < 0 or mean_p > 190:
+        return np.nan, np.nan, np.nan
+    elif diastolic >= mean_p:
+        return np.nan, np.nan, np.nan
+    elif systolic < mean_p:
+        return np.nan, np.nan, np.nan
+    elif systolic - diastolic <= 4:
+        return np.nan, np.nan, np.nan
+    # Return normal values directly
+    else:
+        return systolic, diastolic, mean_p
+
+
+def normal_pa(systolic, diastolic, mean_p):
+    """
+    Function to normalize pulmonary artery blood pressure values.
+
+    Parameters:
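+        systolic, diastolic, mean_p: the original input values
+    Return:
+        systolic, diastolic, mean_p: the inputs unchanged, or three NaNs if any check fails
+    """
+    # Return null values directly (NOTE: "== np.nan" is never True; np.isnan is needed)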
+    if systolic == np.nan or diastolic == np.nan or mean_p == np.nan:
+        return np.nan, np.nan, np.nan
+    # Remove values out of range
+    elif systolic < 0 or systolic > 300:
+        return np.nan, np.nan, np.nan
+    elif diastolic < 0 or diastolic > 200:
+        return np.nan, np.nan, np.nan
+    elif mean_p < 0 or mean_p > 190:
+        return np.nan, np.nan, np.nan
+    elif diastolic >= mean_p:
+        return np.nan, np.nan, np.nan
+    elif systolic < mean_p:
+        return np.nan, np.nan, np.nan
+    elif systolic - diastolic <= 4:
+        return np.nan, np.nan, np.nan
+    # Return normal values directly
+    else:
+        return systolic, diastolic, mean_p
+
+    
+def normal_lab(labname, num):
+    """
+    Function to normalize lab values.
+    Parameters:
+        labname: the label name of lab test
+        num: the originial input value
+    Return:
+        num: the normalized output value
+    """
+    labmin = {'BUN': 0,
+     'Hct': 0,
+     'Hgb': 1,
+     'MCH': 10,
+     'MCHC': 15,
+     'MCV': 40,
+     'MPV': 3,
+     'RBC': 2,
+     'RDW': 5,
+     'WBC x 1000': 0,
+     'anion gap': 0,
+     'bedside glucose': 0,
+     'bicarbonate': 1,
+     'calcium': 3,
+     'chloride': 40,
+     'creatinine': 1,
+     'glucose': 0,
+     'platelets x 1000': 0,
+     'potassium': 1,
+     'sodium': 80}
+    
+    labmax = {'BUN': 200,
+     'Hct': 70,
+     'Hgb': 30,
+     'MCH': 50,
+     'MCHC': 60,
+     'MCV': 150,
+     'MPV': 20,
+     'RBC': 10,
+     'RDW': 25,
+     'WBC x 1000': 100,
+     'anion gap': 40,
+     'bedside glucose': 1000,
+     'bicarbonate': 50,
+     'calcium': 20,
+     'chloride': 160,
+     'creatinine': 20,
+     'glucose': 1000,
+     'platelets x 1000': 2000,
+     'potassium': 10,
+     'sodium': 200}
+    
+    rangemin = labmin[labname]
+    rangemax = labmax[labname]
+    
+    if num == np.nan:
+        return num
+    # Remove values out of range
+    elif num > rangemax or num <  rangemin:
+        return np.nan
+    # Return normal values directly
+    else:
+        return num
\ No newline at end of file

From 6e48b3c98734ae3d7f5161b1b91e20d92b8a1ff3 Mon Sep 17 00:00:00 2001
From: Haoyin Xu <haoyinxu@gmail.com>
Date: Sat, 5 Dec 2020 07:08:42 -0800
Subject: [PATCH 2/3] Fix format

---
 data_toolbox/data_toolbox.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/data_toolbox/data_toolbox.py b/data_toolbox/data_toolbox.py
index d70fdb0..be0ea68 100644
--- a/data_toolbox/data_toolbox.py
+++ b/data_toolbox/data_toolbox.py
@@ -444,6 +444,7 @@ def normal_pa(systolic, diastolic, mean_p):
 def normal_lab(labname, num):
     """
     Function to normalize lab values.
+    
     Parameters:
         labname: the label name of lab test
         num: the originial input value
@@ -502,4 +503,4 @@ def normal_lab(labname, num):
         return np.nan
     # Return normal values directly
     else:
-        return num
\ No newline at end of file
+        return num

From 9ffed0ed9d12a066d483471cc12092214deb803f Mon Sep 17 00:00:00 2001
From: Haoyin Xu <haoyinxu@gmail.com>
Date: Sat, 5 Dec 2020 07:15:37 -0800
Subject: [PATCH 3/3] Unify formats

---
 data_toolbox/data_toolbox.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/data_toolbox/data_toolbox.py b/data_toolbox/data_toolbox.py
index be0ea68..d8ca533 100644
--- a/data_toolbox/data_toolbox.py
+++ b/data_toolbox/data_toolbox.py
@@ -8,6 +8,7 @@
 def find_patient(fname, patientunitstayid, cs=100000, verbose=True):
     """
     Retrieve single patient info. Also sorts by offsets
+    
     Parameters:
         patientunitstayid: the patient whose data to pull
         cs = the size of the chunk to iterate through
@@ -45,7 +46,8 @@ def find_patient(fname, patientunitstayid, cs=100000, verbose=True):
 def plot_vitals(vitals, patient):
     """
     Given a list of vitals and patient, form a nice timeseries plot
-    Args:
+    
+    Parameters:
         vitals: list of list in the form:
                  [nursingchartcelltypevallabel, nursingchartcelltypevalname]
         patient: dataframe from a single patient's data
@@ -82,6 +84,7 @@ def plot_vitals(vitals, patient):
 def multi_patient_feature_plot(df, vitals):
     """
     For a dataframe of a given size, plot the time series feature of all patients
+    
     Parameters:
         df = the pd dataframe to extract info from (don't use a big one!)
         vitals = list of list in form [[nursingchartcelltypevallabel, nursingchartcelltypevalname]]