Skip to content

Commit

Permalink
improved program flows - Entity Bridge
Browse files Browse the repository at this point in the history
  • Loading branch information
Cybonto committed Dec 4, 2024
1 parent 2395c8b commit a2e2c36
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 30 deletions.
19 changes: 13 additions & 6 deletions streamlit_app/app/entity_bridge/data_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def normalize_ids(df, selected_fields):
return df, selected_fields


def normalize_entity_names(df, selected_fields, custom_stopwords=None):
def normalize_entity_names(df, selected_fields, parent_custom_stopwords=None, child_custom_stopwords=None):
"""
Normalize entity names in the DataFrame by applying various text preprocessing steps.
Expand All @@ -171,7 +171,9 @@ def normalize_entity_names(df, selected_fields, custom_stopwords=None):
log_normalization_actions(actions_log, f"Copied '{parent_name_field}' to '{parent_name_field}_original'.")

# Normalize Parent Names
df[parent_name_field] = df[parent_name_field].apply(lambda x: normalize_text(x, custom_stopwords))
df[parent_name_field] = df[parent_name_field].apply(
lambda x: normalize_text(x, custom_stopwords=parent_custom_stopwords)
)
log_normalization_actions(actions_log, f"Normalized Parent Names in '{parent_name_field}'.")

# If Child Names are present, normalize them
Expand All @@ -180,7 +182,9 @@ def normalize_entity_names(df, selected_fields, custom_stopwords=None):
df[f'{child_name_field}_original'] = df[child_name_field]
log_normalization_actions(actions_log, f"Copied '{child_name_field}' to '{child_name_field}_original'.")

df[child_name_field] = df[child_name_field].apply(lambda x: normalize_text(x, custom_stopwords))
df[child_name_field] = df[child_name_field].apply(
lambda x: normalize_text(x, custom_stopwords=child_custom_stopwords)
)
log_normalization_actions(actions_log, f"Normalized Child Names in '{child_name_field}'.")

# Display the normalization actions log
Expand All @@ -191,8 +195,7 @@ def normalize_entity_names(df, selected_fields, custom_stopwords=None):

return df


def normalize_data_frames(data_frames, custom_stopwords=None):
def normalize_data_frames(data_frames, parent_custom_stopwords=None, child_custom_stopwords=None):
"""
Apply normalization to a list of DataFrames.
Expand All @@ -215,7 +218,11 @@ def normalize_data_frames(data_frames, custom_stopwords=None):
df, selected_fields = normalize_ids(df, selected_fields)

# Normalize Entity Names
df = normalize_entity_names(df, selected_fields, custom_stopwords)
df = normalize_entity_names(
df,
selected_fields,
parent_custom_stopwords=parent_custom_stopwords,
child_custom_stopwords=child_custom_stopwords)

# Check and merge similar parent names
df, parent_name_mapping = check_and_merge_similar_names(
Expand Down
31 changes: 31 additions & 0 deletions streamlit_app/app/entity_bridge/ui_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,37 @@ def display_file_upload():
st.info(f"{len(uploaded_files)} files were uploaded.")
return uploaded_files

def get_custom_stopwords():
    """
    Collect optional custom stop words for parent and child name normalization.

    Returns:
        tuple: Two lists of stop words — one for parent names, one for child names.

    Side Effects:
        Renders a subheader and two text inputs in the Streamlit UI.
    """
    st.subheader("Custom Stop Words for Name Normalization")

    raw_parent = st.text_input(
        "Enter stop words for Parent Names (comma-separated):",
        value='',
        key='parent_custom_stopwords'
    )

    raw_child = st.text_input(
        "Enter stop words for Child Names (comma-separated):",
        value='',
        key='child_custom_stopwords'
    )

    def _tokenize(raw):
        # Split on commas, trim surrounding whitespace, and drop empty tokens.
        tokens = []
        for piece in raw.split(','):
            piece = piece.strip()
            if piece:
                tokens.append(piece)
        return tokens

    return _tokenize(raw_parent), _tokenize(raw_child)


def display_missing_data_options(idx, file_name):
"""
Display options for handling missing data and return the user's choice.
Expand Down
72 changes: 48 additions & 24 deletions streamlit_app/app/pages/Entity_Bridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@
from entity_bridge import entity_matcher
from entity_bridge import ui_helper

# Seed the multi-step navigation flags once per session; Streamlit reruns the
# script top-to-bottom, so only initialize keys that are not already present.
for _flag in ('proceed1', 'proceed2'):
    if _flag not in st.session_state:
        st.session_state[_flag] = None

def process_file(file, idx):
"""
Process a single uploaded file, including loading, handling missing data,
Expand Down Expand Up @@ -86,34 +91,53 @@ def process_file(file, idx):
data_frames.append((df_selected, selected_fields))
else:
st.error(f"Failed to process file {file.name}.")
# Ask user to input custom stopwords (optional) for further processing
parent_custom_stopwords, child_custom_stopwords = ui_helper.get_custom_stopwords()
st.session_state['proceed1'] = st.button("Proceed with Normalizing Names")
else:
st.warning("Please upload at least two files to proceed.")

if data_frames:
if st.button("Reset",key="reset1"):
st.session_state['proceed1']=False
st.session_state['proceed2']=False

if data_frames or st.session_state['proceed1']:
st.header("Normalizing Data and Checking for Similar Names")
# Step 3: Normalize IDs and Names, check and merge similar names within data frames
normalized_data_frames = data_normalizer.normalize_data_frames(data_frames)

st.header("Removing Duplicates from Data Frames")
# Step 4: Remove Duplicates (now includes displaying duplicates and removed rows)
deduplicated_data_frames = duplicate_remover.remove_duplicates_from_data_frames(normalized_data_frames)

st.header("Matching Entities Across Data Frames and Assigning Unique Identifiers")
# Step 5: Construct Unique Parent List
unique_parents_df = entity_matcher.construct_unique_parent_list(deduplicated_data_frames)

# Step 6: Construct Unique Child List
unique_children_df = entity_matcher.construct_unique_child_list(deduplicated_data_frames)

# Step 7: Enrich DataFrames with Unique IDs
enriched_data_frames = entity_matcher.enrich_data_frames_with_unique_ids(
deduplicated_data_frames, unique_parents_df, unique_children_df
)

# Step 8: Display Enriched DataFrames
ui_helper.display_enriched_data(enriched_data_frames)

# Step 9: Download Enriched DataFrames
ui_helper.download_enriched_data(enriched_data_frames)
normalized_data_frames = data_normalizer.normalize_data_frames(
data_frames,
parent_custom_stopwords=parent_custom_stopwords,
child_custom_stopwords=child_custom_stopwords
)

st.session_state['proceed2'] = st.button("Proceed with later steps")
if st.button("Reset",key="reset2"):
st.session_state['proceed1']=False
st.session_state['proceed2']=False

if st.session_state['proceed2']:
st.header("Removing Duplicates from Data Frames")
# Step 4: Remove Duplicates (now includes displaying duplicates and removed rows)
deduplicated_data_frames = duplicate_remover.remove_duplicates_from_data_frames(normalized_data_frames)

st.header("Matching Entities Across Data Frames and Assigning Unique Identifiers")
# Step 5: Construct Unique Parent List
unique_parents_df = entity_matcher.construct_unique_parent_list(deduplicated_data_frames)

# Step 6: Construct Unique Child List
unique_children_df = entity_matcher.construct_unique_child_list(deduplicated_data_frames)

# Step 7: Enrich DataFrames with Unique IDs
enriched_data_frames = entity_matcher.enrich_data_frames_with_unique_ids(
deduplicated_data_frames, unique_parents_df, unique_children_df
)

# Step 8: Display Enriched DataFrames
ui_helper.display_enriched_data(enriched_data_frames)

# Step 9: Download Enriched DataFrames
ui_helper.download_enriched_data(enriched_data_frames)
else:
st.info("Click 'Proceed' to continue.")
else:
st.warning("Please upload at least two files to proceed.")

0 comments on commit a2e2c36

Please sign in to comment.