Skip to content

Commit

Permalink
update requested changes
Browse files (browse the repository at this point in the history)
  • Loading branch information
mdsage1 committed Mar 22, 2024
1 parent 0b3de80 commit bc5dc82
Showing 1 changed file with 19 additions and 28 deletions.
47 changes: 19 additions & 28 deletions apps/openchallenges/edam-etl/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,43 +47,34 @@ def transform_to_dataframe(version: str) -> pd.DataFrame | None:
return None


def count_occurrences(identifier_pattern: str, df: pd.DataFrame) -> int:
    """Count the rows of ``df`` whose ``class_id`` matches a regex pattern.

    Args:
        identifier_pattern: Regular expression searched for within each
            ``class_id`` value; matching is case-insensitive.
        df: DataFrame with a ``class_id`` column accessed through the
            pandas ``.str`` string methods.

    Returns:
        The number of rows whose ``class_id`` contains a match. Missing
        values are treated as non-matching.
    """
    matches = df["class_id"].str.contains(
        identifier_pattern, case=False, na=False, regex=True
    )
    # ``Series.sum()`` yields a NumPy integer; convert it so the result
    # honours the annotated built-in ``int`` return type.
    return int(matches.sum())


def print_info_statistics(df: pd.DataFrame) -> None:
"""Gather data about the EDAM ontology"""
if df is not None:
print(f"Number of Concepts Transformed: {len(df)}")
print(f"Column names: {df.columns.tolist()}")

# Create regex patterns for each concept
data_pattern = r"/data_"
operation_pattern = r"/operation_"
format_pattern = r"/format_"
topic_pattern = r"/topic_"
identifier_pattern = r"/identifier_"
data_pattern = r"/data_\d+$"
operation_pattern = r"/operation_\d+$"
format_pattern = r"/format_\d+$"
topic_pattern = r"/topic_\d+$"
identifier_pattern = r"/identifier_\d+$"

# Use pandas' vectorized string operations to count occurrences
data_count = (
df["class_id"]
.str.contains(data_pattern, case=False, na=False, regex=True)
.sum()
)
operation_count = (
df["class_id"].str.contains(operation_pattern, case=False, na=False).sum()
)
format_count = (
df["class_id"]
.str.contains(format_pattern, case=False, na=False, regex=True)
.sum()
)
topic_count = (
df["class_id"]
.str.contains(topic_pattern, case=False, na=False, regex=True)
.sum()
)
identifier_count = (
df["class_id"]
.str.contains(identifier_pattern, case=False, na=False, regex=True)
.sum()
)
data_count = count_occurrences(data_pattern, df)
operation_count = count_occurrences(operation_pattern, df)
format_count = count_occurrences(format_pattern, df)
topic_count = count_occurrences(topic_pattern, df)
identifier_count = count_occurrences(identifier_pattern, df)

# Calculate 'other' count by subtracting the specific counts from the total
other_count = len(df) - (
Expand Down

0 comments on commit bc5dc82

Please sign in to comment.