Rename data cleaning labeling to have _ instead of spaces, and resolve Colette's comment about verifying that 100.612 will work as a placeholder value in the data cleaning labeling notebook
BU-Spark · Jun 26, 2024 · 0986075 · 0986075
1 parent 5212eb2
commit 0986075
Showing 1 changed file with 18 additions and 2 deletions.
diff --git a/...ation_labels/data cleaning labeling.ipynb → ...ation_labels/data_cleaning_labeling.ipynb b/...ation_labels/data cleaning labeling.ipynb → ...ation_labels/data_cleaning_labeling.ipynb
@@ -73,6 +73,22 @@
     "# Labeling Incorrect -2 Values"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c8beca81",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Checking if 100.612 would work as a placeholder value\n",
+    "placeholder = 100.612 # 100.612 is an arbitrarily chosen decimal value that should not appear anywhere in the citizen data\n",
+    "df.isin([placeholder]).any() # Is this placeholder value in any column in the citizen data? \n",
+    "# Expected output: False for every column of df. That means the placeholder does not already appear\n",
+    "# in the citizen data, so it is safe to use as a placeholder. Incorrectly reported -2 values are replaced\n",
+    "# with the placeholder so that they can later be turned into NA values, but only after validation labels\n",
+    "# have been recorded for genuine NA values (i.e. not the NA values created from incorrectly reported -2 values)."
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "c7aa07d0-e0dd-4645-ab5d-43640a50edbd",
@@ -294,7 +310,7 @@
     "    for phenophase in phenophases:\n",
     "        if species_dict[phenophase] == 0: # Phenophase seen in species\n",
     "            false_positive_idx = species_df.index[species_df[phenophase] == -2] # Indices of reports that incorrectly assign -2 values (false positive) to phenophases SEEN in the species\n",
-    "            df.loc[false_positive_idx, phenophase] = np.full(len(false_positive_idx),100.612) # Turn all false positives into 100.612 (these observations will later be dropped but they cannot be Null/NaN/None values yet, so they will not be identified by .isna() check later)\n",
+    "            df.loc[false_positive_idx, phenophase] = np.full(len(false_positive_idx),placeholder) # Turn all false positives into the placeholder value (100.612). These observations will later be dropped, but they cannot be Null/NaN/None yet; otherwise the later .isna() check would flag them\n",
     "            val_df.loc[false_positive_idx, 'validation_label'] = np.full(len(false_positive_idx),1) # Label false positive records as 1 because they will be dropped (1 represents observations incorrectly assigned -2 values)\n",
     "        if species_dict[phenophase] == 1: # Phenophase NOT seen in species\n",
     "            false_negative_idx = species_df.index[species_df[phenophase] != -2] # Indices of reports that incorrectly assign values other than -2 (false negative) to phenophases NOT SEEN in the species\n",
@@ -418,7 +434,7 @@
     "na_indices = df[df.isna().any(axis=1)].index # Finds Null/NaN/None values in citizen data\n",
     "val_df.loc[na_indices, 'validation_label'] = np.full(len(na_indices), 2) # 2 represents observations with Null/NaN/None values\n",
     "df = df.drop(df.columns[0], axis=1) # Drop the index column called \"Unnamed: 0\"\n",
-    "df = df.replace(100.612,None) # Turn 100.612 to None so these values will be dropped. These values were kept as temp so that they will not be detected by na_indices\n",
+    "df = df.replace(placeholder,None) # Turn the placeholder (100.612) into None so these values will be dropped. They were kept as a placeholder until now so that they would not be detected by na_indices\n",
     "df = df.dropna(how='any') # Drop any observation with at least one NaN/Null/None value reported\n",
     "df = df.sort_values(by='Species_name') # Order data by species for organization"
    ]