Skip to content

Commit

Permalink
Rename data cleaning labeling to have _ instead of spaces, and resolv…
Browse files Browse the repository at this point in the history
…e Colette's comment about verifying 100.612 will work as a placeholder value in the data cleaning labeling notebook
  • Loading branch information
zacharymeurer committed Jun 26, 2024
1 parent 5212eb2 commit 0986075
Showing 1 changed file with 18 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,22 @@
"# Labeling Incorrect -2 Values"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8beca81",
"metadata": {},
"outputs": [],
"source": [
"# Verify that 100.612 is safe to use as a placeholder (sentinel) value.\n",
"# Incorrectly reported -2 values are replaced with the placeholder so they can later be turned into NA values,\n",
"# but only after validation labels are recorded for regular NA values (not NA values created by incorrectly\n",
"# reported -2 values). This only works if the placeholder never occurs in the original citizen data.\n",
"placeholder = 100.612 # Arbitrary decimal value which should not appear anywhere in the citizen data\n",
"placeholder_in_column = df.isin([placeholder]).any() # For each column: is the placeholder already present?\n",
"# Fail loudly on a collision instead of relying on a visual check of the displayed output\n",
"assert not placeholder_in_column.any(), f\"Placeholder {placeholder} already appears in columns: {list(placeholder_in_column[placeholder_in_column].index)}\"\n",
"placeholder_in_column # Expected output: False for every column, i.e. the placeholder is safe to use"
]
},
{
"cell_type": "markdown",
"id": "c7aa07d0-e0dd-4645-ab5d-43640a50edbd",
Expand Down Expand Up @@ -294,7 +310,7 @@
" for phenophase in phenophases:\n",
" if species_dict[phenophase] == 0: # Phenophase seen in species\n",
" false_positive_idx = species_df.index[species_df[phenophase] == -2] # Indices of reports that incorrectly assign -2 values (false positive) to phenophases SEEN in the species\n",
" df.loc[false_positive_idx, phenophase] = np.full(len(false_positive_idx),100.612) # Turn all false positives into 100.612 (these observations will later be dropped but they cannot be Null/NaN/None values yet, so they will not be identified by .isna() check later)\n",
" df.loc[false_positive_idx, phenophase] = np.full(len(false_positive_idx),placeholder) # Turn all false positives into 100.612 (these observations will later be dropped but they cannot be Null/NaN/None values yet, so they will not be identified by .isna() check later)\n",
" val_df.loc[false_positive_idx, 'validation_label'] = np.full(len(false_positive_idx),1) # Label false positive records as 1 because they will be dropped (1 represents observations incorrectly assigned -2 values)\n",
" if species_dict[phenophase] == 1: # Phenophase NOT seen in species\n",
" false_negative_idx = species_df.index[species_df[phenophase] != -2] # Indices of reports that incorrectly assign values other than -2 (false negative) to phenophases NOT SEEN in the species\n",
Expand Down Expand Up @@ -418,7 +434,7 @@
"na_indices = df[df.isna().any(axis=1)].index # Finds Null/NaN/None values in citizen data\n",
"val_df.loc[na_indices, 'validation_label'] = np.full(len(na_indices), 2) # 2 represents observations with Null/NaN/None values\n",
"df = df.drop(df.columns[0], axis=1) # Drop the index column called \"Unnamed: 0\"\n",
"df = df.replace(100.612,None) # Turn 100.612 to None so these values will be dropped. These values were kept as temp so that they will not be detected by na_indices\n",
"df = df.replace(placeholder, np.nan) # Turn the placeholder (100.612) into NaN so these rows are dropped by dropna below. NaN is used instead of None because pandas < 1.4 silently ignores an explicit value=None and pads from neighbouring rows instead. The placeholder was kept until now so that these rows would not be detected by na_indices\n",
"df = df.dropna(how='any') # Drop any observation with at least one NaN/Null/None value reported\n",
"df = df.sort_values(by='Species_name') # Order data by species for organization"
]
Expand Down

0 comments on commit 0986075

Please sign in to comment.