Skip to content

Commit

Permalink
improved program flows - Entity Bridge
Browse files Browse the repository at this point in the history
  • Loading branch information
Cybonto committed Dec 4, 2024
1 parent 2395c8b commit a2e2c36
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 30 deletions.
19 changes: 13 additions & 6 deletions streamlit_app/app/entity_bridge/data_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def normalize_ids(df, selected_fields):
return df, selected_fields


def normalize_entity_names(df, selected_fields, custom_stopwords=None):
def normalize_entity_names(df, selected_fields, parent_custom_stopwords=None, child_custom_stopwords=None):
"""
Normalize entity names in the DataFrame by applying various text preprocessing steps.
Expand All @@ -171,7 +171,9 @@ def normalize_entity_names(df, selected_fields, custom_stopwords=None):
log_normalization_actions(actions_log, f"Copied '{parent_name_field}' to '{parent_name_field}_original'.")

# Normalize Parent Names
df[parent_name_field] = df[parent_name_field].apply(lambda x: normalize_text(x, custom_stopwords))
df[parent_name_field] = df[parent_name_field].apply(
lambda x: normalize_text(x, custom_stopwords=parent_custom_stopwords)
)
log_normalization_actions(actions_log, f"Normalized Parent Names in '{parent_name_field}'.")

# If Child Names are present, normalize them
Expand All @@ -180,7 +182,9 @@ def normalize_entity_names(df, selected_fields, custom_stopwords=None):
df[f'{child_name_field}_original'] = df[child_name_field]
log_normalization_actions(actions_log, f"Copied '{child_name_field}' to '{child_name_field}_original'.")

df[child_name_field] = df[child_name_field].apply(lambda x: normalize_text(x, custom_stopwords))
df[child_name_field] = df[child_name_field].apply(
lambda x: normalize_text(x, custom_stopwords=child_custom_stopwords)
)
log_normalization_actions(actions_log, f"Normalized Child Names in '{child_name_field}'.")

# Display the normalization actions log
Expand All @@ -191,8 +195,7 @@ def normalize_entity_names(df, selected_fields, custom_stopwords=None):

return df


def normalize_data_frames(data_frames, custom_stopwords=None):
def normalize_data_frames(data_frames, parent_custom_stopwords=None, child_custom_stopwords=None):
"""
Apply normalization to a list of DataFrames.
Expand All @@ -215,7 +218,11 @@ def normalize_data_frames(data_frames, custom_stopwords=None):
df, selected_fields = normalize_ids(df, selected_fields)

# Normalize Entity Names
df = normalize_entity_names(df, selected_fields, custom_stopwords)
df = normalize_entity_names(
df,
selected_fields,
parent_custom_stopwords=parent_custom_stopwords,
child_custom_stopwords=child_custom_stopwords)

# Check and merge similar parent names
df, parent_name_mapping = check_and_merge_similar_names(
Expand Down
31 changes: 31 additions & 0 deletions streamlit_app/app/entity_bridge/ui_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,37 @@ def display_file_upload():
st.info(f"{len(uploaded_files)} files were uploaded.")
return uploaded_files

def get_custom_stopwords():
    """
    Collect optional custom stop words for parent and child name normalization.

    Returns:
        tuple: Two lists of stop words — one for parent names, one for child names.

    Side Effects:
        Renders a subheader and two text inputs in the Streamlit UI.
    """
    st.subheader("Custom Stop Words for Name Normalization")

    raw_parent = st.text_input(
        "Enter stop words for Parent Names (comma-separated):",
        value='',
        key='parent_custom_stopwords'
    )

    raw_child = st.text_input(
        "Enter stop words for Child Names (comma-separated):",
        value='',
        key='child_custom_stopwords'
    )

    def _tokenize(raw):
        # Split on commas, trim surrounding whitespace, and drop empty tokens.
        tokens = []
        for piece in raw.split(','):
            piece = piece.strip()
            if piece:
                tokens.append(piece)
        return tokens

    return _tokenize(raw_parent), _tokenize(raw_child)


def display_missing_data_options(idx, file_name):
"""
Display options for handling missing data and return the user's choice.
Expand Down
72 changes: 48 additions & 24 deletions streamlit_app/app/pages/Entity_Bridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@
from entity_bridge import entity_matcher
from entity_bridge import ui_helper

# Seed the multi-step navigation flags once per session; Streamlit reruns the
# script top-to-bottom, so only initialize keys that are not already present.
for _flag in ('proceed1', 'proceed2'):
    if _flag not in st.session_state:
        st.session_state[_flag] = None

def process_file(file, idx):
"""
Process a single uploaded file, including loading, handling missing data,
Expand Down Expand Up @@ -86,34 +91,53 @@ def process_file(file, idx):
data_frames.append((df_selected, selected_fields))
else:
st.error(f"Failed to process file {file.name}.")
# Ask user to input custom stopwords (optional) for further processing
parent_custom_stopwords, child_custom_stopwords = ui_helper.get_custom_stopwords()
st.session_state['proceed1'] = st.button("Proceed with Normalizing Names")
else:
st.warning("Please upload at least two files to proceed.")

if data_frames:
if st.button("Reset",key="reset1"):
st.session_state['proceed1']=False
st.session_state['proceed2']=False

if data_frames or st.session_state['proceed1']:
st.header("Normalizing Data and Checking for Similar Names")
# Step 3: Normalize IDs and Names, check and merge similar names within data frames
normalized_data_frames = data_normalizer.normalize_data_frames(data_frames)

st.header("Removing Duplicates from Data Frames")
# Step 4: Remove Duplicates (now includes displaying duplicates and removed rows)
deduplicated_data_frames = duplicate_remover.remove_duplicates_from_data_frames(normalized_data_frames)

st.header("Matching Entities Across Data Frames and Assigning Unique Identifiers")
# Step 5: Construct Unique Parent List
unique_parents_df = entity_matcher.construct_unique_parent_list(deduplicated_data_frames)

# Step 6: Construct Unique Child List
unique_children_df = entity_matcher.construct_unique_child_list(deduplicated_data_frames)

# Step 7: Enrich DataFrames with Unique IDs
enriched_data_frames = entity_matcher.enrich_data_frames_with_unique_ids(
deduplicated_data_frames, unique_parents_df, unique_children_df
)

# Step 8: Display Enriched DataFrames
ui_helper.display_enriched_data(enriched_data_frames)

# Step 9: Download Enriched DataFrames
ui_helper.download_enriched_data(enriched_data_frames)
normalized_data_frames = data_normalizer.normalize_data_frames(
data_frames,
parent_custom_stopwords=parent_custom_stopwords,
child_custom_stopwords=child_custom_stopwords
)

st.session_state['proceed2'] = st.button("Proceed with later steps")
if st.button("Reset",key="reset2"):
st.session_state['proceed1']=False
st.session_state['proceed2']=False

if st.session_state['proceed2']:
st.header("Removing Duplicates from Data Frames")
# Step 4: Remove Duplicates (now includes displaying duplicates and removed rows)
deduplicated_data_frames = duplicate_remover.remove_duplicates_from_data_frames(normalized_data_frames)

st.header("Matching Entities Across Data Frames and Assigning Unique Identifiers")
# Step 5: Construct Unique Parent List
unique_parents_df = entity_matcher.construct_unique_parent_list(deduplicated_data_frames)

# Step 6: Construct Unique Child List
unique_children_df = entity_matcher.construct_unique_child_list(deduplicated_data_frames)

# Step 7: Enrich DataFrames with Unique IDs
enriched_data_frames = entity_matcher.enrich_data_frames_with_unique_ids(
deduplicated_data_frames, unique_parents_df, unique_children_df
)

# Step 8: Display Enriched DataFrames
ui_helper.display_enriched_data(enriched_data_frames)

# Step 9: Download Enriched DataFrames
ui_helper.download_enriched_data(enriched_data_frames)
else:
st.info("Click 'Proceed' to continue.")
else:
st.warning("Please upload at least two files to proceed.")

0 comments on commit a2e2c36

Please sign in to comment.