
Commit

Edited README_keys to be more understandable. Changed all files in code to have the correct file paths. Made some small bug fixes in the data cleaning notebook. Moved unimportant notebooks to the dev_code folder.
zacharymeurer committed Jun 28, 2024
1 parent 6a812e4 commit ef761eb
Showing 11 changed files with 67 additions and 97 deletions.
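Most of the path changes below follow one pattern: working-directory-relative paths (`alldata.csv`, `all data/citizen/...`) become paths relative to the `code/` folder (`../data/original_citizen_data/alldata.csv`, `../data/citizen_states_cleaned/...`). A minimal sketch of the layout this assumes (the `DATA_DIR` constant and the `__file__` anchoring are illustrative, not part of the commit):

```python
from pathlib import Path

import pandas as pd

# The commit assumes scripts run with code/ as the working directory, so the
# data lives one level up in ../data. Anchoring on this file's own location
# (rather than the working directory) keeps the same layout working no matter
# where the script is launched from.
DATA_DIR = Path(__file__).resolve().parent.parent / "data"

df = pd.read_csv(DATA_DIR / "original_citizen_data" / "alldata.csv")
```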
2 changes: 2 additions & 0 deletions code/-2_values_README_key.md
@@ -2,6 +2,8 @@

The possible values in the new columns are 0, 1, & 2.

+## `[Phenophase]_incorrect_-2` Column Key
+
| Label | Meaning |
| :----: | :----- |
| 0 | Valid |
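As a usage note for the key above, a hedged sketch of filtering on one of these columns. The column name `Flowers_bud_incorrect_-2` is illustrative (built from the `[Phenophase]_incorrect_-2` pattern), and only label 0 (Valid) is visible in this hunk, so the other labels are not assumed:

```python
import pandas as pd

# Assumes the cleaned output carries the new validity columns.
df = pd.read_csv("../data/cleaned_alldata.csv")

# Tally the -2 validity labels for one phenophase column.
print(df["Flowers_bud_incorrect_-2"].value_counts())

# Keep only observations whose -2 value was judged valid (label 0 per the key).
valid_df = df[df["Flowers_bud_incorrect_-2"] == 0]
```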
51 changes: 11 additions & 40 deletions code/data_cleaning.ipynb
@@ -274,7 +274,7 @@
}
],
"source": [
-"df = pd.read_csv(\"alldata.csv\") # Load raw citizen data in\n",
+"df = pd.read_csv(\"../data/original_citizen_data/alldata.csv\") # Load raw citizen data in\n",
"df.head() # Previewing raw (before cleaning) citizen data"
]
},
@@ -542,7 +542,7 @@
},
"outputs": [],
"source": [
-"states_shapefile = gpd.read_file(\"india_map/gadm41_IND_3.shp\") # Load map of India with states as a coordinate grid with labels"
+"states_shapefile = gpd.read_file(\"../data/india_map/gadm41_IND_3.shp\") # Load map of India with states as a coordinate grid with labels"
]
},
{
@@ -1407,7 +1407,7 @@
"outputs": [],
"source": [
"# Load species lookup dicts for id <-> name from species_codes.csv\n",
-"species_codes = pd.read_csv(\"species codes.csv\", encoding='unicode_escape')\n",
+"species_codes = pd.read_csv(\"../data/species codes.csv\", encoding='unicode_escape')\n",
"\n",
"species_id_to_name = {}\n",
"species_name_to_id = {}\n",
@@ -1577,7 +1577,7 @@
"outputs": [],
"source": [
"# Save updated_alldata.csv to disk\n",
-"df.to_csv('updated_alldata.csv', index=False)"
+"df.to_csv('../data/cleaned_alldata.csv', index=False)"
]
},
{
@@ -1596,8 +1596,8 @@
"outputs": [],
"source": [
"# Create directories for storing citizen and reference data.\n",
-"os.makedirs(\"all data/citizen\", exist_ok=True)\n",
-"os.makedirs(\"all data/reference\", exist_ok=True)"
+"os.makedirs(\"../data/citizen_states_cleaned\", exist_ok=True)\n",
+"os.makedirs(\"../data/reference_states_cleaned\", exist_ok=True)"
]
},
{
@@ -1622,7 +1622,7 @@
"    state_df = df[df[\"State_name\"] == state_name] # Only use observations from given state\n",
"    state_df = state_df.drop([\"State_name\"], axis = 1) # Drop State_name column because it is the same value throughout each CSV file\n",
"    state_name = state_name.replace(\" \",\"_\").lower() # Reformat state names to lowercase with _ instead of spaces\n",
-"    state_df.to_csv(f\"all data/citizen/{state_name}.csv\", index=False) # Save citizen observations in the given state to disk"
+"    state_df.to_csv(f\"../data/citizen_states_cleaned/{state_name}.csv\", index=False) # Save citizen observations in the given state to disk"
]
},
{
@@ -1679,7 +1679,7 @@
"    \n",
"    df = df.drop_duplicates() # Drop reference data for species with duplicate yearly observations\n",
"    \n",
-"    week_codes = [list(df.columns[3:-2]) # Names of week columns\n",
+"    week_codes = list(df.columns[3:-2]) # Names of week columns\n",
" base_cols = list(df.columns[:3]) + [df.columns[-1]] # Names of non-week columns excluding created_at (excluding this column acts as dropping it)\n",
" \n",
" new_cols = base_cols + ['week'] + list(acronym_to_phenophase_dict.values()) # Names of columns used in cleaned reference data\n",
@@ -2336,45 +2336,16 @@
"source": [
"# Clean all pvt tables & save them to disk in reference folder\n",
"\n",
-"for tablename in os.listdir('./pvttables_raw/'): # Iterate through pvt tables\n",
+"for tablename in os.listdir('../data/original_reference_data/pvttables_raw/'): # Iterate through pvt tables\n",
"    if tablename == '.DS_Store': # Ignore this file \n",
"        continue\n",
"    print(\"cleaning {}\".format(tablename))\n",
-"    ref_df = pd.read_csv('./pvttables_raw/{}'.format(tablename), sep=';') # Load the given pvt table\n",
+"    ref_df = pd.read_csv('../data/original_reference_data/pvttables_raw/{}'.format(tablename), sep=';') # Load the given pvt table\n",
"    tablename = tablename.replace(\"pvt_\",\"\") # Reformat table name\n",
"    if tablename == 'maharastra.csv': # Fix misspelling\n",
"        tablename = 'maharashtra.csv'\n",
"    new_df = clean_df(ref_df) # Clean reference data\n",
-"    new_df.to_csv('./all data/reference/{}'.format(tablename), index=False) # Save cleaned reference data to disk in reference folder"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 5,
-"id": "2980acaa",
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"array(['Kerala', 'Karnataka', 'Telangana', 'Puducherry', 'Maharashtra',\n",
-"       'Tamil Nadu', 'Goa', 'Odisha', 'Manipur', 'Andhra Pradesh',\n",
-"       'Rajasthan', 'West Bengal', 'Chhattisgarh', 'Madhya Pradesh',\n",
-"       'Punjab', 'Gujarat', 'Andaman and Nicobar Islands',\n",
-"       'Uttar Pradesh', 'Uttarakhand', 'Assam', 'Meghalaya', 'Nagaland',\n",
-"       'Tripura', 'Delhi', 'Jammu and Kashmir', 'Himachal Pradesh',\n",
-"       'Jharkhand', 'Haryana', 'Arunachal Pradesh', 'Bihar',\n",
-"       'Lakshadweep', 'Sikkim', 'Chandigarh', 'Dadra and Nagar Haveli'],\n",
-"       dtype=object)"
-]
-},
-"execution_count": 5,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"pd.read_csv(\"updated_alldata.csv\")[\"State_name\"].unique()"
+"    new_df.to_csv('../data/reference_states_cleaned/{}'.format(tablename), index=False) # Save cleaned reference data to disk in reference folder"
]
}
],
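The cell deleted at the end of this notebook was a one-off sanity check listing every `State_name` in the old `updated_alldata.csv`. If the same check is ever needed against the renamed output, an equivalent snippet (assuming the notebook runs from `code/`) would be:

```python
import pandas as pd

# Same sanity check as the deleted cell, pointed at the renamed output file.
print(pd.read_csv("../data/cleaned_alldata.csv")["State_name"].unique())
```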
22 changes: 11 additions & 11 deletions code/data_cleaning.py
@@ -24,7 +24,7 @@
# Previewing Original Citizen Data


-df = pd.read_csv("alldata.csv") # Load raw citizen data in
+df = pd.read_csv("../data/original_citizen_data/alldata.csv") # Load raw citizen data in


# # Handling Incorrect -2 Values
@@ -252,7 +252,7 @@ def create_species_dict(*absent_phenophases):
# Filling in Missing States


-states_shapefile = gpd.read_file("india_map/gadm41_IND_3.shp") # Load map of India with states as a coordinate grid with labels
+states_shapefile = gpd.read_file("../data/india_map/gadm41_IND_3.shp") # Load map of India with states as a coordinate grid with labels

# Function for filling state_name attribute based on coordinates for observations with NA state_name
def find_indian_state(latitude, longitude, gdf):
@@ -403,7 +403,7 @@ def anomaly_detection_overall(df, min_observations_for_outlier_detection):


# Load species lookup dicts for id <-> name from species_codes.csv
-species_codes = pd.read_csv("species codes.csv", encoding='unicode_escape')
+species_codes = pd.read_csv("../data/species codes.csv", encoding='unicode_escape')

species_id_to_name = {}
species_name_to_id = {}
@@ -551,15 +551,15 @@ def anomaly_detection_overall(df, min_observations_for_outlier_detection):


# Save updated_alldata.csv to disk
-df.to_csv('updated_alldata.csv', index=False)
+df.to_csv('../data/cleaned_alldata.csv', index=False)


# Make Directories for Citizen and Reference Data


# Create directories for storing citizen and reference data.
-os.makedirs("all data/citizen", exist_ok=True)
-os.makedirs("all data/reference", exist_ok=True)
+os.makedirs("../data/citizen_states_cleaned", exist_ok=True)
+os.makedirs("../data/reference_states_cleaned", exist_ok=True)


# State DFs to all data/citizen folder\
@@ -570,7 +570,7 @@
state_df = df[df["State_name"] == state_name] # Only use observations from given state
state_df = state_df.drop(["State_name"], axis = 1) # Drop State_name column because it is the same value throughout each CSV file
state_name = state_name.replace(" ","_").lower() # Reformat state names to lowercase with _ instead of spaces
-    state_df.to_csv(f"all data/citizen/{state_name}.csv", index=False) # Save citizen observations in the given state to disk
+    state_df.to_csv(f"../data/citizen_states_cleaned/{state_name}.csv", index=False) # Save citizen observations in the given state to disk


# Reference Data Cleaning
@@ -615,7 +615,7 @@ def clean_df(df):

df = df.drop_duplicates() # Drop reference data for species with duplicate yearly observations

-    week_codes = [list(df.columns[3:-2]) # Names of week columns
+    week_codes = list(df.columns[3:-2]) # Names of week columns
base_cols = list(df.columns[:3]) + [df.columns[-1]] # Names of non-week columns excluding created_at (excluding this column acts as dropping it)

new_cols = base_cols + ['week'] + list(acronym_to_phenophase_dict.values()) # Names of columns used in cleaned reference data
@@ -668,14 +668,14 @@ def clean_df(df):

# Clean all pvt tables & save them to disk in reference folder
print("Started Cleaning Reference Data")
-for tablename in os.listdir('./pvttables_raw/'): # Iterate through pvt tables
+for tablename in os.listdir('../data/original_reference_data/pvttables_raw/'): # Iterate through pvt tables
if tablename == '.DS_Store': # Ignore this file
continue
print("cleaning {}".format(tablename))
-    ref_df = pd.read_csv('./pvttables_raw/{}'.format(tablename), sep=';') # Load the given pvt table
+    ref_df = pd.read_csv('../data/original_reference_data/pvttables_raw/{}'.format(tablename), sep=';') # Load the given pvt table
tablename = tablename.replace("pvt_","") # Reformat table name
if tablename == 'maharastra.csv': # Fix misspelling
tablename = 'maharashtra.csv'
new_df = clean_df(ref_df) # Clean reference data
-    new_df.to_csv('./all data/reference/{}'.format(tablename), index=False) # Save cleaned reference data to disk in reference folder
+    new_df.to_csv('../data/reference_states_cleaned/{}'.format(tablename), index=False) # Save cleaned reference data to disk in reference folder
print("Finished Citizen and Reference Data Cleaning")
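Besides the path updates, the one real bug fix in this file (and in the notebook above) is the `week_codes` line: the stray `[` before `list(...)` left an unclosed bracket, so the module could not even be parsed. A minimal sketch of before and after, using a toy DataFrame in place of a real pvt table:

```python
import pandas as pd

# Toy frame standing in for a pvt table: 3 leading columns, then week
# columns, then 2 trailing columns, mirroring the df.columns[3:-2] slice.
df = pd.DataFrame(columns=["a", "b", "c", "w1", "w2", "w3", "x", "y"])

# Before the fix (never parses):
# week_codes = [list(df.columns[3:-2])  # SyntaxError: '[' was never closed

# After the fix: a plain list of the week column names.
week_codes = list(df.columns[3:-2])
print(week_codes)  # ['w1', 'w2', 'w3']
```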
14 changes: 7 additions & 7 deletions code/mean_transition_times_data_generation.ipynb
@@ -83,7 +83,7 @@
"outputs": [],
"source": [
"# load in name_to_id lookup table\n",
-"name_to_id_df = pd.read_csv('species codes.csv', encoding='unicode_escape')\n",
+"name_to_id_df = pd.read_csv('../data/species codes.csv', encoding='unicode_escape')\n",
"id_to_name_dict = {}\n",
"for _, row in name_to_id_df.iterrows():\n",
" name = \"{}-{}\".format(row['species_primary_common_name'], row['species_scientific_name']).lower().replace(' ', '')\n",
@@ -248,7 +248,7 @@
"'''\n",
"def plot_score_and_pcts(species_id, attr, L=5, M=15, w_1=0.25, w_2=1):\n",
"    # load state dataframe and percent of observations of attribute each week\n",
-"    state_df = pd.read_csv('all data/citizen/kerala.csv')\n",
+"    state_df = pd.read_csv('../data/citizen_states_cleaned/kerala.csv')\n",
" pcts = []\n",
" for year in range(2018, 2024):\n",
" pcts += get_percent_of_positive_observations_for_each_week(state_df, year, species_id, attr)\n",
@@ -5355,7 +5355,7 @@
"w_2 = 1\n",
"\n",
"# load in all data from kerala\n",
-"kerala_df = pd.read_csv('all data/citizen/kerala.csv')\n",
+"kerala_df = pd.read_csv('../data/citizen_states_cleaned/kerala.csv')\n",
"\n",
"# get 10 most common species ids\n",
"top_10_ids = kerala_df['Species_id'].value_counts().index.tolist()[:10]\n",
@@ -5464,7 +5464,7 @@
"    1161: ['Flowers_bud', 'Flowers_male', 'Flowers_Female', 'Fruits_unripe', 'Fruits_ripe']}\n",
"\n",
"# load all kerala observations\n",
-"kerala_df = pd.read_csv('all data/citizen/kerala.csv')\n",
+"kerala_df = pd.read_csv('../data/citizen_states_cleaned/kerala.csv')\n",
"\n",
"# get a list of all transition dataframes, for each species_id / attr pair in transition_dict\n",
"transition_dfs = []\n",
@@ -5474,7 +5474,7 @@
"\n",
"# join all dataframes together\n",
"transition_time_df_mango_jack = pd.concat(transition_dfs, ignore_index=True)\n",
-"transition_time_df_mango_jack.to_csv(\"average_transition_times.csv\", index=False)"
+"transition_time_df_mango_jack.to_csv(\"../data/average_transition_times.csv\", index=False)"
]
},
{
@@ -5937,7 +5937,7 @@
"    twin.legend(loc='upper right')\n",
"    plt.show()\n",
"\n",
-"kerala_df = pd.read_csv('all data/citizen/kerala.csv')\n",
+"kerala_df = pd.read_csv('../data/citizen_states_cleaned/kerala.csv')\n",
"for species_id in [1090, 1161]:\n",
" for attr_list in list(transition_dict.values()):\n",
" for attr in attr_list:\n",
@@ -5961,7 +5961,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
-"version": "3.10.13"
+"version": "3.12.2"
}
},
"nbformat": 4,
24 changes: 12 additions & 12 deletions code/selecting_reference_data.ipynb
@@ -38,18 +38,18 @@
"metadata": {},
"outputs": [],
"source": [
-"def reformat_state_name(state_filename):\n",
+"def reformat_state_name(state_name):\n",
"    \"\"\"\n",
-"    Reformats state_filename (e.g. andaman_and_nicobar_islands) to properly punctuated state names (e.g. Andaman and Nicobar Islands)\n",
+"    Reformats state_name (e.g. andaman_and_nicobar_islands) to properly punctuated state names (e.g. Andaman and Nicobar Islands)\n",
"\n",
"    Args:\n",
-"        state_filename (string): State name according to CSV files\n",
+"        state_name (string): State name according to CSV files\n",
"    Returns:\n",
"        reformatted_state_name (string): State name according to original citizen data\n",
"    \"\"\"\n",
"    \n",
"    # Replace underscores with spaces & capitalize all words except 'and'\n",
-"    reformatted_state_name = ' '.join([word.capitalize() if word != 'and' else word for word in state_filename.replace('_', ' ').split()])\n",
+"    reformatted_state_name = ' '.join([word.capitalize() if word != 'and' else word for word in state_name.replace('_', ' ').split()])\n",
" return reformatted_state_name\n",
"\n",
"def select_reference_data(state, species, year, k=1):\n",
@@ -65,7 +65,7 @@
"        selected_ref_df (DataFrame): Selected reference data in format of original reference data\n",
"    \"\"\"\n",
"    \n",
-"    df = pd.read_csv(f\"all data/citizen/{state}.csv\")\n",
+"    df = pd.read_csv(f\"../data/citizen_states_cleaned/{state}.csv\")\n",
" df = df[df[\"Species_name\"] == species]\n",
" species_id = df['Species_id'].iloc[0] # Get the id for the given species\n",
" df = df[df[\"Year\"] == int(year)]\n",
@@ -138,7 +138,7 @@
"        presence_pcts (Dict(int,float)): Percentage of observations indicating phenophase presence for each week\n",
"    \"\"\"\n",
"    \n",
-"    df = pd.read_csv(f\"all data/citizen/{state}.csv\")\n",
+"    df = pd.read_csv(f\"../data/citizen_states_cleaned/{state}.csv\")\n",
" df = df[df[\"Species_name\"] == species]\n",
" df = df[df[\"Year\"] == int(year)]\n",
" df = df.drop([\"Lat\", \"Long\", \"Date_of_observation\", \"Observation_ID\", \"User_id\", \"User_Tree_id\", \"Species_id\", \"Species_name\", \"Year\"], axis=1)\n",
@@ -166,7 +166,7 @@
"    \"\"\"\n",
"    \n",
"    for state in states:\n",
-"        species_in_state = pd.read_csv(f\"all data/citizen/{state}.csv\")['Species_name'].value_counts().index[:n_species] # Top n most prevalent species in the given state\n",
+"        species_in_state = pd.read_csv(f\"../data/citizen_states_cleaned/{state}.csv\")['Species_name'].value_counts().index[:n_species] # Top n most prevalent species in the given state\n",
" state_start_time = time.time()\n",
" for species in species_in_state:\n",
" species_start_time = time.time()\n",
@@ -292,8 +292,8 @@
}
],
"source": [
-"plot_path = \"plots/selected_ref_vs_cit\" # Path for where plots will be stored\n",
-"states = ['kerala'] # use the following instead for all states: [state.replace('.csv','') for state in os.listdir(\"all data/citizen\")]\n",
+"plot_path = \"../plots/selected_ref_vs_cit\" # Path for where plots will be stored\n",
+"states = ['kerala'] # use the following instead for all states: [state.replace('.csv','') for state in os.listdir(\"../data/citizen_states_cleaned\")]\n",
"n_species = 10 # Top n most prevalent species within the given state\n",
"years = [2018,2019,2020,2021,2022,2023]\n",
"k = 3\n",
@@ -329,7 +329,7 @@
"    \"\"\"\n",
"    ref_df_list = []\n",
"    for state in states:\n",
-"        species_in_state = pd.read_csv(f\"all data/citizen/{state}.csv\")['Species_name'].value_counts().index[:n_species] # Top n most prevalent species in the given state\n",
+"        species_in_state = pd.read_csv(f\"../data/citizen_states_cleaned/{state}.csv\")['Species_name'].value_counts().index[:n_species] # Top n most prevalent species in the given state\n",
" state_start_time = time.time()\n",
" for species in species_in_state:\n",
" species_start_time = time.time()\n",
@@ -430,11 +430,11 @@
}
],
"source": [
-"states = ['kerala'] # use the following instead for all states: [state.replace('.csv','') for state in os.listdir(\"all data/citizen\")]\n",
+"states = ['kerala'] # use the following instead for all states: [state.replace('.csv','') for state in os.listdir(\"../data/citizen_states_cleaned\")]\n",
"n_species = 10 # Top n most prevalent species within the given state\n",
"years = [2018,2019,2020,2021,2022,2023]\n",
"k = 3\n",
-"create_selected_ref_df(states, n_species, years, k).to_csv(\"selected_reference_data.csv\")"
+"create_selected_ref_df(states, n_species, years, k).to_csv(\"../data/selected_reference_data.csv\")"
]
}
],
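The rename of `state_filename` to `state_name` in `reformat_state_name` is purely cosmetic; behavior is unchanged. A condensed restatement with a usage example matching the docstring:

```python
def reformat_state_name(state_name):
    # Replace underscores with spaces & capitalize all words except 'and'.
    return " ".join(
        word.capitalize() if word != "and" else word
        for word in state_name.replace("_", " ").split()
    )

print(reformat_state_name("andaman_and_nicobar_islands"))
# -> Andaman and Nicobar Islands
```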
2 changes: 1 addition & 1 deletion code/validation_labels_README_key.md
@@ -1,4 +1,4 @@
-### `validation_labels_alldata.csv` is a copy of alldata.csv with a new column `validation_label` which labels the observations that were dropped from the citizen data in our team's data cleaning process. The reason for dropping each observation is given by the validation label's value. The meanings of these values are listed in the key below:
+`validation_labels_alldata.csv` is a copy of alldata.csv with a new column `validation_label` which labels the observations that were dropped from the citizen data in our team's data cleaning process. The reason for dropping each observation is given by the validation label's value. The meanings of these values are listed in the key below:

## Key for `validation_label` Column
