Skip to content

Commit

Permalink
Fixed minor bugs in Entity Bridge app
Browse files Browse the repository at this point in the history
  • Loading branch information
Cybonto committed Dec 12, 2024
1 parent da1eaed commit 2fe06c7
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 11 deletions.
14 changes: 10 additions & 4 deletions streamlit_app/app/entity_bridge/data_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,13 +116,17 @@ def normalize_ids(df, selected_fields):
else:
# Generate unique Parent IDs based on Parent Names
parent_name_field = selected_fields['parent_name']
if df[parent_name_field].isnull().any():
st.error(f"Missing values in '{parent_name_field}'. Cannot generate IDs for these entries.")
df['generated_parent_id'] = df[parent_name_field].apply(lambda x: generate_unique_identifier())
selected_fields['parent_id'] = 'generated_parent_id'
log_normalization_actions(actions_log, "Generated unique Parent IDs based on Parent Names.")

# Normalize Child IDs, if present
if selected_fields.get('child_name'):
child_name_field = selected_fields['child_name']
if df[child_name_field].isnull().any():
st.error(f"Missing values in '{child_name_field}'. Cannot generate IDs for these entries.")
if selected_fields.get('child_id'):
child_id_field = selected_fields['child_id']
if df[child_id_field].isnull().any():
Expand Down Expand Up @@ -155,7 +159,8 @@ def normalize_entity_names(df, selected_fields, parent_custom_stopwords=None, ch
selected_fields (dict): Dictionary containing field names:
- 'parent_name': Parent Name field name.
- 'child_name': Child Name field name (optional).
custom_stopwords (list, optional): List of custom stopwords to remove from names.
parent_custom_stopwords (list, optional): List of custom stopwords for parent names.
        child_custom_stopwords (list, optional): List of custom stopwords for child names.
Returns:
DataFrame: The DataFrame with normalized names.
Expand All @@ -172,7 +177,7 @@ def normalize_entity_names(df, selected_fields, parent_custom_stopwords=None, ch

# Normalize Parent Names
df[parent_name_field] = df[parent_name_field].apply(
lambda x: normalize_text(x, custom_stopwords=parent_custom_stopwords)
lambda x: normalize_text(x, custom_stopwords=parent_custom_stopwords) if pd.notnull(x) else x
)
log_normalization_actions(actions_log, f"Normalized Parent Names in '{parent_name_field}'.")

Expand All @@ -183,7 +188,7 @@ def normalize_entity_names(df, selected_fields, parent_custom_stopwords=None, ch
log_normalization_actions(actions_log, f"Copied '{child_name_field}' to '{child_name_field}_original'.")

df[child_name_field] = df[child_name_field].apply(
lambda x: normalize_text(x, custom_stopwords=child_custom_stopwords)
lambda x: normalize_text(x, custom_stopwords=child_custom_stopwords) if pd.notnull(x) else x
)
log_normalization_actions(actions_log, f"Normalized Child Names in '{child_name_field}'.")

Expand All @@ -201,7 +206,8 @@ def normalize_data_frames(data_frames, parent_custom_stopwords=None, child_custo
Args:
data_frames (list): List of tuples (DataFrame, selected_fields).
custom_stopwords (list, optional): List of custom stopwords to remove from names.
parent_custom_stopwords (list, optional): List of custom stopwords for parent names.
        child_custom_stopwords (list, optional): List of custom stopwords for child names.
Returns:
list: List of normalized DataFrames with updated selected_fields.
Expand Down
2 changes: 1 addition & 1 deletion streamlit_app/app/entity_bridge/duplicate_remover.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def remove_duplicates(df, selected_fields):
# Remove duplicates
df_no_duplicates = df.drop_duplicates(subset=columns_to_check, keep='first')

num_duplicates_removed = len(duplicates) // 2 # Since keep='first', half of the duplicates are removed
num_duplicates_removed = len(df) - len(df_no_duplicates)  # rows dropped by drop_duplicates(keep='first')
if num_duplicates_removed > 0:
log_normalization_actions(actions_log, f"Removed {num_duplicates_removed} duplicate rows based on fields {columns_to_check}")
st.write(f"Removed {num_duplicates_removed} duplicate rows.")
Expand Down
20 changes: 14 additions & 6 deletions streamlit_app/app/pages/Entity_Bridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@
from entity_bridge import ui_helper

if 'proceed1' not in st.session_state:
st.session_state['proceed1'] = None
st.session_state['proceed1'] = False
if 'proceed2' not in st.session_state:
st.session_state['proceed2'] = None
st.session_state['proceed2'] = False

def process_file(file, idx):
"""
Expand Down Expand Up @@ -92,20 +92,28 @@ def process_file(file, idx):
else:
st.error(f"Failed to process file {file.name}.")

if data_frames:
st.session_state['proceed1']=True

if st.button("Reset",key="reset1"):
st.session_state['proceed1']=False
st.session_state['proceed2']=False

if data_frames or st.session_state['proceed1']:
if st.session_state['proceed1']:
st.header("Normalizing Data and Checking for Similar Names")

# Get custom stopwords from the user
parent_custom_stopwords, child_custom_stopwords = ui_helper.get_custom_stopwords()

# Step 3: Normalize IDs and Names, check and merge similar names within data frames
normalized_data_frames = data_normalizer.normalize_data_frames(
data_frames,
parent_custom_stopwords=parent_custom_stopwords,
child_custom_stopwords=child_custom_stopwords
)

st.session_state['proceed2'] = st.button("Proceed with later steps")
if st.button("Proceed with later steps"):
st.session_state['proceed2'] = True
if st.button("Reset",key="reset2"):
st.session_state['proceed1']=False
st.session_state['proceed2']=False
Expand All @@ -130,5 +138,5 @@ def process_file(file, idx):
# Step 8: Display Enriched DataFrames
ui_helper.display_enriched_data(enriched_data_frames)

# Step 9: Download Enriched DataFrames
ui_helper.download_enriched_data(enriched_data_frames)
# Step 9: Download Enriched DataFrames
ui_helper.download_enriched_data(enriched_data_frames)

0 comments on commit 2fe06c7

Please sign in to comment.