Skip to content

Commit

Permalink
Fixed minor bugs in Entity Bridge app
Browse files Browse the repository at this point in the history
  • Loading branch information
Cybonto committed Dec 12, 2024
1 parent da1eaed commit 2fe06c7
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 11 deletions.
14 changes: 10 additions & 4 deletions streamlit_app/app/entity_bridge/data_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,13 +116,17 @@ def normalize_ids(df, selected_fields):
else:
# Generate unique Parent IDs based on Parent Names
parent_name_field = selected_fields['parent_name']
if df[parent_name_field].isnull().any():
st.error(f"Missing values in '{parent_name_field}'. Cannot generate IDs for these entries.")
df['generated_parent_id'] = df[parent_name_field].apply(lambda x: generate_unique_identifier())
selected_fields['parent_id'] = 'generated_parent_id'
log_normalization_actions(actions_log, "Generated unique Parent IDs based on Parent Names.")

# Normalize Child IDs, if present
if selected_fields.get('child_name'):
child_name_field = selected_fields['child_name']
if df[child_name_field].isnull().any():
st.error(f"Missing values in '{child_name_field}'. Cannot generate IDs for these entries.")
if selected_fields.get('child_id'):
child_id_field = selected_fields['child_id']
if df[child_id_field].isnull().any():
Expand Down Expand Up @@ -155,7 +159,8 @@ def normalize_entity_names(df, selected_fields, parent_custom_stopwords=None, ch
selected_fields (dict): Dictionary containing field names:
- 'parent_name': Parent Name field name.
- 'child_name': Child Name field name (optional).
custom_stopwords (list, optional): List of custom stopwords to remove from names.
parent_custom_stopwords (list, optional): List of custom stopwords for parent names.
        child_custom_stopwords (list, optional): List of custom stopwords for child names.
Returns:
DataFrame: The DataFrame with normalized names.
Expand All @@ -172,7 +177,7 @@ def normalize_entity_names(df, selected_fields, parent_custom_stopwords=None, ch

# Normalize Parent Names
df[parent_name_field] = df[parent_name_field].apply(
lambda x: normalize_text(x, custom_stopwords=parent_custom_stopwords)
lambda x: normalize_text(x, custom_stopwords=parent_custom_stopwords) if pd.notnull(x) else x
)
log_normalization_actions(actions_log, f"Normalized Parent Names in '{parent_name_field}'.")

Expand All @@ -183,7 +188,7 @@ def normalize_entity_names(df, selected_fields, parent_custom_stopwords=None, ch
log_normalization_actions(actions_log, f"Copied '{child_name_field}' to '{child_name_field}_original'.")

df[child_name_field] = df[child_name_field].apply(
lambda x: normalize_text(x, custom_stopwords=child_custom_stopwords)
lambda x: normalize_text(x, custom_stopwords=child_custom_stopwords) if pd.notnull(x) else x
)
log_normalization_actions(actions_log, f"Normalized Child Names in '{child_name_field}'.")

Expand All @@ -201,7 +206,8 @@ def normalize_data_frames(data_frames, parent_custom_stopwords=None, child_custo
Args:
data_frames (list): List of tuples (DataFrame, selected_fields).
custom_stopwords (list, optional): List of custom stopwords to remove from names.
parent_custom_stopwords (list, optional): List of custom stopwords for parent names.
        child_custom_stopwords (list, optional): List of custom stopwords for child names.
Returns:
list: List of normalized DataFrames with updated selected_fields.
Expand Down
2 changes: 1 addition & 1 deletion streamlit_app/app/entity_bridge/duplicate_remover.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def remove_duplicates(df, selected_fields):
# Remove duplicates
df_no_duplicates = df.drop_duplicates(subset=columns_to_check, keep='first')

num_duplicates_removed = len(duplicates) // 2 # Since keep='first', half of the duplicates are removed
num_duplicates_removed = len(df) - len(df_no_duplicates)  # rows dropped by drop_duplicates(keep='first')
if num_duplicates_removed > 0:
log_normalization_actions(actions_log, f"Removed {num_duplicates_removed} duplicate rows based on fields {columns_to_check}")
st.write(f"Removed {num_duplicates_removed} duplicate rows.")
Expand Down
20 changes: 14 additions & 6 deletions streamlit_app/app/pages/Entity_Bridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@
from entity_bridge import ui_helper

if 'proceed1' not in st.session_state:
st.session_state['proceed1'] = None
st.session_state['proceed1'] = False
if 'proceed2' not in st.session_state:
st.session_state['proceed2'] = None
st.session_state['proceed2'] = False

def process_file(file, idx):
"""
Expand Down Expand Up @@ -92,20 +92,28 @@ def process_file(file, idx):
else:
st.error(f"Failed to process file {file.name}.")

if data_frames:
st.session_state['proceed1']=True

if st.button("Reset",key="reset1"):
st.session_state['proceed1']=False
st.session_state['proceed2']=False

if data_frames or st.session_state['proceed1']:
if st.session_state['proceed1']:
st.header("Normalizing Data and Checking for Similar Names")

# Get custom stopwords from the user
parent_custom_stopwords, child_custom_stopwords = ui_helper.get_custom_stopwords()

# Step 3: Normalize IDs and Names, check and merge similar names within data frames
normalized_data_frames = data_normalizer.normalize_data_frames(
data_frames,
parent_custom_stopwords=parent_custom_stopwords,
child_custom_stopwords=child_custom_stopwords
)

st.session_state['proceed2'] = st.button("Proceed with later steps")
if st.button("Proceed with later steps"):
st.session_state['proceed2'] = True
if st.button("Reset",key="reset2"):
st.session_state['proceed1']=False
st.session_state['proceed2']=False
Expand All @@ -130,5 +138,5 @@ def process_file(file, idx):
# Step 8: Display Enriched DataFrames
ui_helper.display_enriched_data(enriched_data_frames)

# Step 9: Download Enriched DataFrames
ui_helper.download_enriched_data(enriched_data_frames)
# Step 9: Download Enriched DataFrames
ui_helper.download_enriched_data(enriched_data_frames)

0 comments on commit 2fe06c7

Please sign in to comment.