
Commit

Edited README_keys to be more understandable. Changed all files in code to have the correct file paths. Made some small bug fixes in the data cleaning notebook. Moved unimportant notebooks to the dev_code folder.
zacharymeurer committed Jun 28, 2024
1 parent 6a812e4 commit ef761eb
Showing 11 changed files with 67 additions and 97 deletions.
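Most of the path changes below follow one pattern: working-directory-relative paths (`alldata.csv`, `all data/citizen/...`) become paths relative to the `code/` folder (`../data/original_citizen_data/alldata.csv`, `../data/citizen_states_cleaned/...`). A minimal sketch of the layout this assumes (the `DATA_DIR` constant and the `__file__` anchoring are illustrative, not part of the commit):

```python
from pathlib import Path

import pandas as pd

# The commit assumes scripts run with code/ as the working directory, so the
# data lives one level up in ../data. Anchoring on this file's own location
# (rather than the working directory) keeps the same layout working no matter
# where the script is launched from.
DATA_DIR = Path(__file__).resolve().parent.parent / "data"

df = pd.read_csv(DATA_DIR / "original_citizen_data" / "alldata.csv")
```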
2 changes: 2 additions & 0 deletions code/-2_values_README_key.md
@@ -2,6 +2,8 @@

The possible values in the new columns are 0, 1, & 2.

+## `[Phenophase]_incorrect_-2` Column Key
+
| Label | Meaning |
| :----: | :----- |
| 0 | Valid |
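As a usage note for the key above, a hedged sketch of filtering on one of these columns. The column name `Flowers_bud_incorrect_-2` is illustrative (built from the `[Phenophase]_incorrect_-2` pattern), and only label 0 (Valid) is visible in this hunk, so the other labels are not assumed:

```python
import pandas as pd

# Assumes the cleaned output carries the new validity columns.
df = pd.read_csv("../data/cleaned_alldata.csv")

# Tally the -2 validity labels for one phenophase column.
print(df["Flowers_bud_incorrect_-2"].value_counts())

# Keep only observations whose -2 value was judged valid (label 0 per the key).
valid_df = df[df["Flowers_bud_incorrect_-2"] == 0]
```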
51 changes: 11 additions & 40 deletions code/data_cleaning.ipynb
@@ -274,7 +274,7 @@
}
],
"source": [
-"df = pd.read_csv(\"alldata.csv\") # Load raw citizen data in\n",
+"df = pd.read_csv(\"../data/original_citizen_data/alldata.csv\") # Load raw citizen data in\n",
"df.head() # Previewing raw (before cleaning) citizen data"
]
},
@@ -542,7 +542,7 @@
},
"outputs": [],
"source": [
-"states_shapefile = gpd.read_file(\"india_map/gadm41_IND_3.shp\") # Load map of India with states as a coordinate grid with labels"
+"states_shapefile = gpd.read_file(\"../data/india_map/gadm41_IND_3.shp\") # Load map of India with states as a coordinate grid with labels"
]
},
{
@@ -1407,7 +1407,7 @@
"outputs": [],
"source": [
"# Load species lookup dicts for id <-> name from species_codes.csv\n",
-"species_codes = pd.read_csv(\"species codes.csv\", encoding='unicode_escape')\n",
+"species_codes = pd.read_csv(\"../data/species codes.csv\", encoding='unicode_escape')\n",
"\n",
"species_id_to_name = {}\n",
"species_name_to_id = {}\n",
@@ -1577,7 +1577,7 @@
"outputs": [],
"source": [
"# Save updated_alldata.csv to disk\n",
-"df.to_csv('updated_alldata.csv', index=False)"
+"df.to_csv('../data/cleaned_alldata.csv', index=False)"
]
},
{
@@ -1596,8 +1596,8 @@
"outputs": [],
"source": [
"# Create directories for storing citizen and reference data.\n",
-"os.makedirs(\"all data/citizen\", exist_ok=True)\n",
-"os.makedirs(\"all data/reference\", exist_ok=True)"
+"os.makedirs(\"../data/citizen_states_cleaned\", exist_ok=True)\n",
+"os.makedirs(\"../data/reference_states_cleaned\", exist_ok=True)"
]
},
{
@@ -1622,7 +1622,7 @@
"    state_df = df[df[\"State_name\"] == state_name] # Only use observations from given state\n",
"    state_df = state_df.drop([\"State_name\"], axis = 1) # Drop State_name column because it is the same value throughout each CSV file\n",
"    state_name = state_name.replace(\" \",\"_\").lower() # Reformat state names to lowercase with _ instead of spaces\n",
-"    state_df.to_csv(f\"all data/citizen/{state_name}.csv\", index=False) # Save citizen observations in the given state to disk"
+"    state_df.to_csv(f\"../data/citizen_states_cleaned/{state_name}.csv\", index=False) # Save citizen observations in the given state to disk"
]
},
{
@@ -1679,7 +1679,7 @@
"    \n",
"    df = df.drop_duplicates() # Drop reference data for species with duplicate yearly observations\n",
"    \n",
-"    week_codes = [list(df.columns[3:-2]) # Names of week columns\n",
+"    week_codes = list(df.columns[3:-2]) # Names of week columns\n",
" base_cols = list(df.columns[:3]) + [df.columns[-1]] # Names of non-week columns excluding created_at (excluding this column acts as dropping it)\n",
" \n",
" new_cols = base_cols + ['week'] + list(acronym_to_phenophase_dict.values()) # Names of columns used in cleaned reference data\n",
@@ -2336,45 +2336,16 @@
"source": [
"# Clean all pvt tables & save them to disk in reference folder\n",
"\n",
-"for tablename in os.listdir('./pvttables_raw/'): # Iterate through pvt tables\n",
+"for tablename in os.listdir('../data/original_reference_data/pvttables_raw/'): # Iterate through pvt tables\n",
"    if tablename == '.DS_Store': # Ignore this file \n",
"        continue\n",
"    print(\"cleaning {}\".format(tablename))\n",
-"    ref_df = pd.read_csv('./pvttables_raw/{}'.format(tablename), sep=';') # Load the given pvt table\n",
+"    ref_df = pd.read_csv('../data/original_reference_data/pvttables_raw/{}'.format(tablename), sep=';') # Load the given pvt table\n",
"    tablename = tablename.replace(\"pvt_\",\"\") # Reformat table name\n",
"    if tablename == 'maharastra.csv': # Fix misspelling\n",
"        tablename = 'maharashtra.csv'\n",
"    new_df = clean_df(ref_df) # Clean reference data\n",
-"    new_df.to_csv('./all data/reference/{}'.format(tablename), index=False) # Save cleaned reference data to disk in reference folder"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 5,
-"id": "2980acaa",
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"array(['Kerala', 'Karnataka', 'Telangana', 'Puducherry', 'Maharashtra',\n",
-"       'Tamil Nadu', 'Goa', 'Odisha', 'Manipur', 'Andhra Pradesh',\n",
-"       'Rajasthan', 'West Bengal', 'Chhattisgarh', 'Madhya Pradesh',\n",
-"       'Punjab', 'Gujarat', 'Andaman and Nicobar Islands',\n",
-"       'Uttar Pradesh', 'Uttarakhand', 'Assam', 'Meghalaya', 'Nagaland',\n",
-"       'Tripura', 'Delhi', 'Jammu and Kashmir', 'Himachal Pradesh',\n",
-"       'Jharkhand', 'Haryana', 'Arunachal Pradesh', 'Bihar',\n",
-"       'Lakshadweep', 'Sikkim', 'Chandigarh', 'Dadra and Nagar Haveli'],\n",
-"       dtype=object)"
-]
-},
-"execution_count": 5,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"pd.read_csv(\"updated_alldata.csv\")[\"State_name\"].unique()"
+"    new_df.to_csv('../data/reference_states_cleaned/{}'.format(tablename), index=False) # Save cleaned reference data to disk in reference folder"
]
}
],
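The cell deleted at the end of this notebook was a one-off sanity check listing every `State_name` in the old `updated_alldata.csv`. If the same check is ever needed against the renamed output, an equivalent snippet (assuming the notebook runs from `code/`) would be:

```python
import pandas as pd

# Same sanity check as the deleted cell, pointed at the renamed output file.
print(pd.read_csv("../data/cleaned_alldata.csv")["State_name"].unique())
```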
22 changes: 11 additions & 11 deletions code/data_cleaning.py
@@ -24,7 +24,7 @@
# Previewing Original Citizen Data


-df = pd.read_csv("alldata.csv") # Load raw citizen data in
+df = pd.read_csv("../data/original_citizen_data/alldata.csv") # Load raw citizen data in


# # Handling Incorrect -2 Values
@@ -252,7 +252,7 @@ def create_species_dict(*absent_phenophases):
# Filling in Missing States


-states_shapefile = gpd.read_file("india_map/gadm41_IND_3.shp") # Load map of India with states as a coordinate grid with labels
+states_shapefile = gpd.read_file("../data/india_map/gadm41_IND_3.shp") # Load map of India with states as a coordinate grid with labels

# Function for filling state_name attribute based on coordinates for observations with NA state_name
def find_indian_state(latitude, longitude, gdf):
@@ -403,7 +403,7 @@ def anomaly_detection_overall(df, min_observations_for_outlier_detection):


# Load species lookup dicts for id <-> name from species_codes.csv
-species_codes = pd.read_csv("species codes.csv", encoding='unicode_escape')
+species_codes = pd.read_csv("../data/species codes.csv", encoding='unicode_escape')

species_id_to_name = {}
species_name_to_id = {}
@@ -551,15 +551,15 @@ def anomaly_detection_overall(df, min_observations_for_outlier_detection):


# Save updated_alldata.csv to disk
-df.to_csv('updated_alldata.csv', index=False)
+df.to_csv('../data/cleaned_alldata.csv', index=False)


# Make Directories for Citizen and Reference Data


# Create directories for storing citizen and reference data.
-os.makedirs("all data/citizen", exist_ok=True)
-os.makedirs("all data/reference", exist_ok=True)
+os.makedirs("../data/citizen_states_cleaned", exist_ok=True)
+os.makedirs("../data/reference_states_cleaned", exist_ok=True)


# State DFs to all data/citizen folder\
@@ -570,7 +570,7 @@
state_df = df[df["State_name"] == state_name] # Only use observations from given state
state_df = state_df.drop(["State_name"], axis = 1) # Drop State_name column because it is the same value throughout each CSV file
state_name = state_name.replace(" ","_").lower() # Reformat state names to lowercase with _ instead of spaces
-    state_df.to_csv(f"all data/citizen/{state_name}.csv", index=False) # Save citizen observations in the given state to disk
+    state_df.to_csv(f"../data/citizen_states_cleaned/{state_name}.csv", index=False) # Save citizen observations in the given state to disk


# Reference Data Cleaning
@@ -615,7 +615,7 @@ def clean_df(df):

df = df.drop_duplicates() # Drop reference data for species with duplicate yearly observations

-    week_codes = [list(df.columns[3:-2]) # Names of week columns
+    week_codes = list(df.columns[3:-2]) # Names of week columns
base_cols = list(df.columns[:3]) + [df.columns[-1]] # Names of non-week columns excluding created_at (excluding this column acts as dropping it)

new_cols = base_cols + ['week'] + list(acronym_to_phenophase_dict.values()) # Names of columns used in cleaned reference data
@@ -668,14 +668,14 @@ def clean_df(df):

# Clean all pvt tables & save them to disk in reference folder
print("Started Cleaning Reference Data")
-for tablename in os.listdir('./pvttables_raw/'): # Iterate through pvt tables
+for tablename in os.listdir('../data/original_reference_data/pvttables_raw/'): # Iterate through pvt tables
if tablename == '.DS_Store': # Ignore this file
continue
print("cleaning {}".format(tablename))
-    ref_df = pd.read_csv('./pvttables_raw/{}'.format(tablename), sep=';') # Load the given pvt table
+    ref_df = pd.read_csv('../data/original_reference_data/pvttables_raw/{}'.format(tablename), sep=';') # Load the given pvt table
tablename = tablename.replace("pvt_","") # Reformat table name
if tablename == 'maharastra.csv': # Fix misspelling
tablename = 'maharashtra.csv'
new_df = clean_df(ref_df) # Clean reference data
-    new_df.to_csv('./all data/reference/{}'.format(tablename), index=False) # Save cleaned reference data to disk in reference folder
+    new_df.to_csv('../data/reference_states_cleaned/{}'.format(tablename), index=False) # Save cleaned reference data to disk in reference folder
print("Finished Citizen and Reference Data Cleaning")
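Besides the path updates, the one real bug fix in this file (and in the notebook above) is the `week_codes` line: the stray `[` before `list(...)` left an unclosed bracket, so the module could not even be parsed. A minimal sketch of before and after, using a toy DataFrame in place of a real pvt table:

```python
import pandas as pd

# Toy frame standing in for a pvt table: 3 leading columns, then week
# columns, then 2 trailing columns, mirroring the df.columns[3:-2] slice.
df = pd.DataFrame(columns=["a", "b", "c", "w1", "w2", "w3", "x", "y"])

# Before the fix (never parses):
# week_codes = [list(df.columns[3:-2])  # SyntaxError: '[' was never closed

# After the fix: a plain list of the week column names.
week_codes = list(df.columns[3:-2])
print(week_codes)  # ['w1', 'w2', 'w3']
```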
14 changes: 7 additions & 7 deletions code/mean_transition_times_data_generation.ipynb
@@ -83,7 +83,7 @@
"outputs": [],
"source": [
"# load in name_to_id lookup table\n",
-"name_to_id_df = pd.read_csv('species codes.csv', encoding='unicode_escape')\n",
+"name_to_id_df = pd.read_csv('../data/species codes.csv', encoding='unicode_escape')\n",
"id_to_name_dict = {}\n",
"for _, row in name_to_id_df.iterrows():\n",
" name = \"{}-{}\".format(row['species_primary_common_name'], row['species_scientific_name']).lower().replace(' ', '')\n",
@@ -248,7 +248,7 @@
"'''\n",
"def plot_score_and_pcts(species_id, attr, L=5, M=15, w_1=0.25, w_2=1):\n",
"    # load state dataframe and percent of observations of attribute each week\n",
-"    state_df = pd.read_csv('all data/citizen/kerala.csv')\n",
+"    state_df = pd.read_csv('../data/citizen_states_cleaned/kerala.csv')\n",
" pcts = []\n",
" for year in range(2018, 2024):\n",
" pcts += get_percent_of_positive_observations_for_each_week(state_df, year, species_id, attr)\n",
@@ -5355,7 +5355,7 @@
"w_2 = 1\n",
"\n",
"# load in all data from kerala\n",
-"kerala_df = pd.read_csv('all data/citizen/kerala.csv')\n",
+"kerala_df = pd.read_csv('../data/citizen_states_cleaned/kerala.csv')\n",
"\n",
"# get 10 most common species ids\n",
"top_10_ids = kerala_df['Species_id'].value_counts().index.tolist()[:10]\n",
@@ -5464,7 +5464,7 @@
"    1161: ['Flowers_bud', 'Flowers_male', 'Flowers_Female', 'Fruits_unripe', 'Fruits_ripe']}\n",
"\n",
"# load all kerala observations\n",
-"kerala_df = pd.read_csv('all data/citizen/kerala.csv')\n",
+"kerala_df = pd.read_csv('../data/citizen_states_cleaned/kerala.csv')\n",
"\n",
"# get a list of all transition dataframes, for each species_id / attr pair in transition_dict\n",
"transition_dfs = []\n",
@@ -5474,7 +5474,7 @@
"\n",
"# join all dataframes together\n",
"transition_time_df_mango_jack = pd.concat(transition_dfs, ignore_index=True)\n",
-"transition_time_df_mango_jack.to_csv(\"average_transition_times.csv\", index=False)"
+"transition_time_df_mango_jack.to_csv(\"../data/average_transition_times.csv\", index=False)"
]
},
{
@@ -5937,7 +5937,7 @@
"    twin.legend(loc='upper right')\n",
"    plt.show()\n",
"\n",
-"kerala_df = pd.read_csv('all data/citizen/kerala.csv')\n",
+"kerala_df = pd.read_csv('../data/citizen_states_cleaned/kerala.csv')\n",
"for species_id in [1090, 1161]:\n",
" for attr_list in list(transition_dict.values()):\n",
" for attr in attr_list:\n",
@@ -5961,7 +5961,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
-"version": "3.10.13"
+"version": "3.12.2"
}
},
"nbformat": 4,
24 changes: 12 additions & 12 deletions code/selecting_reference_data.ipynb
@@ -38,18 +38,18 @@
"metadata": {},
"outputs": [],
"source": [
-"def reformat_state_name(state_filename):\n",
+"def reformat_state_name(state_name):\n",
"    \"\"\"\n",
-"    Reformats state_filename (e.g. andaman_and_nicobar_islands) to properly punctuated state names (e.g. Andaman and Nicobar Islands)\n",
+"    Reformats state_name (e.g. andaman_and_nicobar_islands) to properly punctuated state names (e.g. Andaman and Nicobar Islands)\n",
"\n",
"    Args:\n",
-"        state_filename (string): State name according to CSV files\n",
+"        state_name (string): State name according to CSV files\n",
"    Returns:\n",
"        reformatted_state_name (string): State name according to original citizen data\n",
"    \"\"\"\n",
"    \n",
"    # Replace underscores with spaces & capitalize all words except 'and'\n",
-"    reformatted_state_name = ' '.join([word.capitalize() if word != 'and' else word for word in state_filename.replace('_', ' ').split()])\n",
+"    reformatted_state_name = ' '.join([word.capitalize() if word != 'and' else word for word in state_name.replace('_', ' ').split()])\n",
" return reformatted_state_name\n",
"\n",
"def select_reference_data(state, species, year, k=1):\n",
@@ -65,7 +65,7 @@
"        selected_ref_df (DataFrame): Selected reference data in format of original reference data\n",
"    \"\"\"\n",
"    \n",
-"    df = pd.read_csv(f\"all data/citizen/{state}.csv\")\n",
+"    df = pd.read_csv(f\"../data/citizen_states_cleaned/{state}.csv\")\n",
" df = df[df[\"Species_name\"] == species]\n",
" species_id = df['Species_id'].iloc[0] # Get the id for the given species\n",
" df = df[df[\"Year\"] == int(year)]\n",
@@ -138,7 +138,7 @@
"        presence_pcts (Dict(int,float)): Percentage of observations indicating phenophase presence for each week\n",
"    \"\"\"\n",
"    \n",
-"    df = pd.read_csv(f\"all data/citizen/{state}.csv\")\n",
+"    df = pd.read_csv(f\"../data/citizen_states_cleaned/{state}.csv\")\n",
" df = df[df[\"Species_name\"] == species]\n",
" df = df[df[\"Year\"] == int(year)]\n",
" df = df.drop([\"Lat\", \"Long\", \"Date_of_observation\", \"Observation_ID\", \"User_id\", \"User_Tree_id\", \"Species_id\", \"Species_name\", \"Year\"], axis=1)\n",
@@ -166,7 +166,7 @@
"    \"\"\"\n",
"    \n",
"    for state in states:\n",
-"        species_in_state = pd.read_csv(f\"all data/citizen/{state}.csv\")['Species_name'].value_counts().index[:n_species] # Top n most prevalent species in the given state\n",
+"        species_in_state = pd.read_csv(f\"../data/citizen_states_cleaned/{state}.csv\")['Species_name'].value_counts().index[:n_species] # Top n most prevalent species in the given state\n",
" state_start_time = time.time()\n",
" for species in species_in_state:\n",
" species_start_time = time.time()\n",
@@ -292,8 +292,8 @@
}
],
"source": [
-"plot_path = \"plots/selected_ref_vs_cit\" # Path for where plots will be stored\n",
-"states = ['kerala'] # use the following instead for all states: [state.replace('.csv','') for state in os.listdir(\"all data/citizen\")]\n",
+"plot_path = \"../plots/selected_ref_vs_cit\" # Path for where plots will be stored\n",
+"states = ['kerala'] # use the following instead for all states: [state.replace('.csv','') for state in os.listdir(\"../data/citizen_states_cleaned\")]\n",
"n_species = 10 # Top n most prevalent species within the given state\n",
"years = [2018,2019,2020,2021,2022,2023]\n",
"k = 3\n",
@@ -329,7 +329,7 @@
"    \"\"\"\n",
"    ref_df_list = []\n",
"    for state in states:\n",
-"        species_in_state = pd.read_csv(f\"all data/citizen/{state}.csv\")['Species_name'].value_counts().index[:n_species] # Top n most prevalent species in the given state\n",
+"        species_in_state = pd.read_csv(f\"../data/citizen_states_cleaned/{state}.csv\")['Species_name'].value_counts().index[:n_species] # Top n most prevalent species in the given state\n",
" state_start_time = time.time()\n",
" for species in species_in_state:\n",
" species_start_time = time.time()\n",
@@ -430,11 +430,11 @@
}
],
"source": [
-"states = ['kerala'] # use the following instead for all states: [state.replace('.csv','') for state in os.listdir(\"all data/citizen\")]\n",
+"states = ['kerala'] # use the following instead for all states: [state.replace('.csv','') for state in os.listdir(\"../data/citizen_states_cleaned\")]\n",
"n_species = 10 # Top n most prevalent species within the given state\n",
"years = [2018,2019,2020,2021,2022,2023]\n",
"k = 3\n",
-"create_selected_ref_df(states, n_species, years, k).to_csv(\"selected_reference_data.csv\")"
+"create_selected_ref_df(states, n_species, years, k).to_csv(\"../data/selected_reference_data.csv\")"
]
}
],
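The rename of `state_filename` to `state_name` in `reformat_state_name` is purely cosmetic; behavior is unchanged. A condensed restatement with a usage example matching the docstring:

```python
def reformat_state_name(state_name):
    # Replace underscores with spaces & capitalize all words except 'and'.
    return " ".join(
        word.capitalize() if word != "and" else word
        for word in state_name.replace("_", " ").split()
    )

print(reformat_state_name("andaman_and_nicobar_islands"))
# -> Andaman and Nicobar Islands
```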
2 changes: 1 addition & 1 deletion code/validation_labels_README_key.md
@@ -1,4 +1,4 @@
-### `validation_labels_alldata.csv` is a copy of alldata.csv with a new column `validation_label` which labels the observations that were dropped from the citizen data in our team's data cleaning process. The reason for dropping each observation is given by the validation label's value. The meanings of these values are listed in the key below:
+`validation_labels_alldata.csv` is a copy of alldata.csv with a new column `validation_label` which labels the observations that were dropped from the citizen data in our team's data cleaning process. The reason for dropping each observation is given by the validation label's value. The meanings of these values are listed in the key below:

## Key for `validation_label` Column
