diff --git a/streamlit_app/app/entity_bridge/data_normalizer.py b/streamlit_app/app/entity_bridge/data_normalizer.py
index 5ba060e..623d3ab 100644
--- a/streamlit_app/app/entity_bridge/data_normalizer.py
+++ b/streamlit_app/app/entity_bridge/data_normalizer.py
@@ -146,7 +146,7 @@ def normalize_ids(df, selected_fields):
     return df, selected_fields
 
 
-def normalize_entity_names(df, selected_fields, custom_stopwords=None):
+def normalize_entity_names(df, selected_fields, parent_custom_stopwords=None, child_custom_stopwords=None):
     """
     Normalize entity names in the DataFrame by applying various text preprocessing steps.
 
@@ -171,7 +171,9 @@ def normalize_entity_names(df, selected_fields, custom_stopwords=None):
         log_normalization_actions(actions_log, f"Copied '{parent_name_field}' to '{parent_name_field}_original'.")
 
     # Normalize Parent Names
-    df[parent_name_field] = df[parent_name_field].apply(lambda x: normalize_text(x, custom_stopwords))
+    df[parent_name_field] = df[parent_name_field].apply(
+        lambda x: normalize_text(x, custom_stopwords=parent_custom_stopwords)
+    )
     log_normalization_actions(actions_log, f"Normalized Parent Names in '{parent_name_field}'.")
 
     # If Child Names are present, normalize them
@@ -180,7 +182,9 @@ def normalize_entity_names(df, selected_fields, custom_stopwords=None):
             df[f'{child_name_field}_original'] = df[child_name_field]
             log_normalization_actions(actions_log, f"Copied '{child_name_field}' to '{child_name_field}_original'.")
 
-        df[child_name_field] = df[child_name_field].apply(lambda x: normalize_text(x, custom_stopwords))
+        df[child_name_field] = df[child_name_field].apply(
+            lambda x: normalize_text(x, custom_stopwords=child_custom_stopwords)
+        )
         log_normalization_actions(actions_log, f"Normalized Child Names in '{child_name_field}'.")
 
     # Display the normalization actions log
@@ -191,8 +195,7 @@ def normalize_entity_names(df, selected_fields, custom_stopwords=None):
 
     return df
 
-
-def normalize_data_frames(data_frames, custom_stopwords=None):
+def normalize_data_frames(data_frames, parent_custom_stopwords=None, child_custom_stopwords=None):
     """
     Apply normalization to a list of DataFrames.
 
@@ -215,7 +218,11 @@ def normalize_data_frames(data_frames, custom_stopwords=None):
         df, selected_fields = normalize_ids(df, selected_fields)
 
         # Normalize Entity Names
-        df = normalize_entity_names(df, selected_fields, custom_stopwords)
+        df = normalize_entity_names(
+            df,
+            selected_fields,
+            parent_custom_stopwords=parent_custom_stopwords,
+            child_custom_stopwords=child_custom_stopwords)
 
         # Check and merge similar parent names
         df, parent_name_mapping = check_and_merge_similar_names(
diff --git a/streamlit_app/app/entity_bridge/ui_helper.py b/streamlit_app/app/entity_bridge/ui_helper.py
index 945c527..d7eeb92 100644
--- a/streamlit_app/app/entity_bridge/ui_helper.py
+++ b/streamlit_app/app/entity_bridge/ui_helper.py
@@ -33,6 +33,37 @@ def display_file_upload():
         st.info(f"{len(uploaded_files)} files were uploaded.")
     return uploaded_files
 
+def get_custom_stopwords():
+    """
+    Display input boxes for custom stop words for parent and child names.
+
+    Returns:
+        tuple: Two lists containing custom stop words for parent names and child names.
+
+    Side Effects:
+        Displays input boxes in the Streamlit UI
+    """
+    st.subheader("Custom Stop Words for Name Normalization")
+
+    parent_stopwords_input = st.text_input(
+        "Enter stop words for Parent Names (comma-separated):",
+        value='',
+        key='parent_custom_stopwords'
+    )
+
+    child_stopwords_input = st.text_input(
+        "Enter stop words for Child Names (comma-separated):",
+        value='',
+        key='child_custom_stopwords'
+    )
+
+    # Parse inputs to lists of stop words
+    parent_custom_stopwords = [word.strip() for word in parent_stopwords_input.split(',') if word.strip()]
+    child_custom_stopwords = [word.strip() for word in child_stopwords_input.split(',') if word.strip()]
+
+    return parent_custom_stopwords, child_custom_stopwords
+
+
 def display_missing_data_options(idx, file_name):
     """
     Display options for handling missing data and return the user's choice.
diff --git a/streamlit_app/app/pages/Entity_Bridge.py b/streamlit_app/app/pages/Entity_Bridge.py
index 86f4c6d..fcae27b 100644
--- a/streamlit_app/app/pages/Entity_Bridge.py
+++ b/streamlit_app/app/pages/Entity_Bridge.py
@@ -12,6 +12,11 @@
 from entity_bridge import entity_matcher
 from entity_bridge import ui_helper
 
+if 'proceed1' not in st.session_state:
+    st.session_state['proceed1'] = None
+if 'proceed2' not in st.session_state:
+    st.session_state['proceed2'] = None
+
 def process_file(file, idx):
     """
     Process a single uploaded file, including loading, handling missing data,
@@ -86,34 +91,53 @@ def process_file(file, idx):
             data_frames.append((df_selected, selected_fields))
         else:
             st.error(f"Failed to process file {file.name}.")
+    # Ask user to input custom stopwords (optional) for further processing
+    parent_custom_stopwords, child_custom_stopwords = ui_helper.get_custom_stopwords()
+    st.session_state['proceed1'] = st.button("Proceed with Normalizing Names")
 else:
     st.warning("Please upload at least two files to proceed.")
 
-if data_frames:
+if st.button("Reset",key="reset1"):
+    st.session_state['proceed1']=False
+    st.session_state['proceed2']=False
+
+if data_frames or st.session_state['proceed1']:
     st.header("Normalizing Data and Checking for Similar Names")
     # Step 3: Normalize IDs and Names, check and merge similar names within data frames
-    normalized_data_frames = data_normalizer.normalize_data_frames(data_frames)
-
-    st.header("Removing Duplicates from Data Frames")
-    # Step 4: Remove Duplicates (now includes displaying duplicates and removed rows)
-    deduplicated_data_frames = duplicate_remover.remove_duplicates_from_data_frames(normalized_data_frames)
-
-    st.header("Matching Entities Across Data Frames and Assigning Unique Identifiers")
-    # Step 5: Construct Unique Parent List
-    unique_parents_df = entity_matcher.construct_unique_parent_list(deduplicated_data_frames)
-
-    # Step 6: Construct Unique Child List
-    unique_children_df = entity_matcher.construct_unique_child_list(deduplicated_data_frames)
-
-    # Step 7: Enrich DataFrames with Unique IDs
-    enriched_data_frames = entity_matcher.enrich_data_frames_with_unique_ids(
-        deduplicated_data_frames, unique_parents_df, unique_children_df
-    )
-
-    # Step 8: Display Enriched DataFrames
-    ui_helper.display_enriched_data(enriched_data_frames)
-
-    # Step 9: Download Enriched DataFrames
-    ui_helper.download_enriched_data(enriched_data_frames)
+    normalized_data_frames = data_normalizer.normalize_data_frames(
+        data_frames,
+        parent_custom_stopwords=parent_custom_stopwords,
+        child_custom_stopwords=child_custom_stopwords
+    )
+
+    st.session_state['proceed2'] = st.button("Proceed with later steps")
+    if st.button("Reset",key="reset2"):
+        st.session_state['proceed1']=False
+        st.session_state['proceed2']=False
+
+    if st.session_state['proceed2']:
+        st.header("Removing Duplicates from Data Frames")
+        # Step 4: Remove Duplicates (now includes displaying duplicates and removed rows)
+        deduplicated_data_frames = duplicate_remover.remove_duplicates_from_data_frames(normalized_data_frames)
+
+        st.header("Matching Entities Across Data Frames and Assigning Unique Identifiers")
+        # Step 5: Construct Unique Parent List
+        unique_parents_df = entity_matcher.construct_unique_parent_list(deduplicated_data_frames)
+
+        # Step 6: Construct Unique Child List
+        unique_children_df = entity_matcher.construct_unique_child_list(deduplicated_data_frames)
+
+        # Step 7: Enrich DataFrames with Unique IDs
+        enriched_data_frames = entity_matcher.enrich_data_frames_with_unique_ids(
+            deduplicated_data_frames, unique_parents_df, unique_children_df
+        )
+
+        # Step 8: Display Enriched DataFrames
+        ui_helper.display_enriched_data(enriched_data_frames)
+
+        # Step 9: Download Enriched DataFrames
+        ui_helper.download_enriched_data(enriched_data_frames)
+    else:
+        st.info("Click 'Proceed' to continue.")
 else:
     st.warning("Please upload at least two files to proceed.")
\ No newline at end of file