Skip to content

Commit

Permalink
Merge pull request #65 from transferwise/llm_friendly_summary
Browse files Browse the repository at this point in the history
Generation of LLM-friendly summary of the analysis
  • Loading branch information
AlxdrPolyakov authored Jan 20, 2025
2 parents 60c2f15 + d556287 commit 22a3d2c
Show file tree
Hide file tree
Showing 7 changed files with 951 additions and 380 deletions.
163 changes: 6 additions & 157 deletions notebooks/Finding interesting segments in time series.ipynb

Large diffs are not rendered by default.

863 changes: 664 additions & 199 deletions notebooks/Finding interesting segments.ipynb

Large diffs are not rendered by default.

35 changes: 14 additions & 21 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
from setuptools import find_packages, setup

with open('README.md') as f:
with open("README.md") as f:
long_description = f.read()

setup(
name="wise-pizza",
version="0.2.3",
description="A library to find and visualise the most interesting slices in multidimensional data",
long_description=long_description,
long_description_content_type='text/markdown',
long_description_content_type="text/markdown",
author="Wise",
url='https://github.com/transferwise/wise-pizza',
url="https://github.com/transferwise/wise-pizza",
classifiers=[
'Programming Language :: Python :: 3 :: Only',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
],
install_requires=[
"ipython",
Expand All @@ -29,27 +29,20 @@
"tqdm",
"cloudpickle",
"pivottablejs",
"streamlit==1.28.0"
"streamlit>=1.28.0",
],
extras_require={
"test": [
"flake8",
"pytest",
"pytest-cov"
],
"test": ["flake8", "pytest", "pytest-cov"],
},
packages=find_packages(
include=[
'wise_pizza',
'wise_pizza.*'
],
exclude=['tests*'],
include=["wise_pizza", "wise_pizza.*"],
exclude=["tests*"],
),
entry_points={
'console_scripts': [
'run_wise_pizza_streamlit = wise_pizza.run_streamlit_app_entry_point:main',
"console_scripts": [
"run_wise_pizza_streamlit = wise_pizza.run_streamlit_app_entry_point:main",
],
},
include_package_data=True,
keywords='wise-pizza',
keywords="wise-pizza",
)
82 changes: 82 additions & 0 deletions wise_pizza/dataframe_with_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import logging

import pandas as pd

logger = logging.getLogger(__name__)


class DataFrameWithMetadata(pd.DataFrame):
    """A pandas DataFrame that carries a table name, a table description, and
    per-column descriptions in ``self.attrs``, and renders them as part of its
    ``to_markdown`` output (e.g. to produce LLM-friendly summaries).
    """

    def __init__(
        self,
        *args,
        name: str = None,
        description: str = None,
        column_descriptions=None,
        **kwargs,
    ):
        """
        @param name: Human-readable table name, stored in ``attrs["name"]``.
        @param description: Table-level description, stored in ``attrs["description"]``.
        @param column_descriptions: Mapping of column name -> description.
            Keys that do not match an actual column are dropped; if none match,
            a warning is logged and no descriptions are stored.
        """
        super().__init__(*args, **kwargs)

        self.attrs["name"] = name or ""  # Store DataFrame name
        self.attrs["description"] = description or ""  # Store DataFrame description
        self.attrs["column_descriptions"] = {}

        if column_descriptions:
            # Keep only descriptions for columns that actually exist.
            column_descriptions = {
                k: v for k, v in column_descriptions.items() if k in self.columns
            }
            if column_descriptions:
                self.attrs["column_descriptions"] = column_descriptions
            else:
                logger.warning(
                    "None of the column descriptions provided matched the DataFrame columns"
                )

    def to_markdown(self, index: bool = True, **kwargs):
        """Render the frame as markdown, prefixed with the table name and
        description, and with a row of column descriptions inserted right
        after the table's header line.

        NOTE: pandas' ``to_markdown`` needs the optional ``tabulate`` package.
        """
        output = []
        # Use .get(): frames produced by pandas operations (slicing, concat,
        # unpickling, ...) may not have gone through __init__, so attrs can
        # be missing these keys; direct indexing would raise KeyError.
        if self.attrs.get("name"):
            output.append(f"Table name: {self.attrs['name']}\n")

        if self.attrs.get("description"):
            output.append(f"Table description: {self.attrs['description']}\n")

        column_descriptions = self.attrs.get("column_descriptions")
        if not column_descriptions:
            output.append(super().to_markdown(index=index, **kwargs))
            return "\n".join(output)

        # Extra table row holding the column descriptions; when the index is
        # rendered it occupies the first column, so pad it with "---".
        desc_row = " | ".join(
            (["---"] if index else [])
            + [column_descriptions.get(col, "") for col in self.columns]
        )
        original_md = super().to_markdown(index=index, **kwargs)
        # The first "\n|" marks the start of the table's second line; splice
        # the description row in between the header and the separator line.
        header_end = original_md.index("\n|")
        output.append(
            original_md[:header_end] + "\n|" + desc_row + original_md[header_end:]
        )
        return "\n".join(output)

    def head(self, n: int = 5):
        """Like ``DataFrame.head``, but preserves the metadata in ``attrs``."""
        out = DataFrameWithMetadata(super().head(n))
        out.attrs = self.attrs
        return out


if __name__ == "__main__":
    # Quick demo of the metadata-aware markdown rendering.
    demo = DataFrameWithMetadata(
        {"a": [1, 2], "b": [3, 4]},
        description="Description for the DataFrame",
        name="DataFrame Name",
        column_descriptions={
            "a": "Description for column a",
            "b": "Description for column b",
        },
    )

    # Render once with the index column and once without it.
    for with_index in (True, False):
        print(demo.to_markdown(index=with_index))
    print("yay!")
    # This would raise an error:
    # df = DescribedDataFrame({'a': [1]}, descriptions={'nonexistent': 'Description'})
34 changes: 34 additions & 0 deletions wise_pizza/explain.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def explain_changes_in_average(
dims: List[str],
total_name: str,
size_name: str,
average_name: Optional[str] = None,
min_segments: Optional[int] = None,
max_segments: int = None,
min_depth: int = 1,
Expand Down Expand Up @@ -72,6 +73,7 @@ def explain_changes_in_average(
@param verbose: If set to a truish value, lots of debug info is printed to console
@return: A fitted object
"""

df1 = df1.copy()
df2 = df2.copy()

Expand Down Expand Up @@ -111,6 +113,9 @@ def explain_changes_in_average(
verbose=verbose,
)

if hasattr(df1, "attrs"):
sf.data_attrs = df1.attrs

if hasattr(sf, "pre_total"):
sf.pre_total = avg1
sf.post_total += avg1
Expand All @@ -124,6 +129,9 @@ def explain_changes_in_average(

# And might want to relabel some plots?
sf.task = "changes in average"
sf.size_name = size_name
sf.total_name = total_name
sf.average_name = average_name
return sf


Expand All @@ -133,6 +141,7 @@ def explain_changes_in_totals(
dims: List[str],
total_name: str,
size_name: str,
average_name: Optional[str] = None,
min_segments: Optional[int] = None,
max_segments: int = None,
min_depth: int = 1,
Expand Down Expand Up @@ -211,6 +220,8 @@ def explain_changes_in_totals(
cluster_values=cluster_values,
verbose=verbose,
)
if hasattr(df1, "attrs"):
sf_size.data_attrs = df1.attrs

sf_avg = explain_levels(
df=df_avg.data,
Expand All @@ -227,6 +238,9 @@ def explain_changes_in_totals(
verbose=verbose,
)

if hasattr(df1, "attrs"):
sf_avg.data_attrs = df1.attrs

sf_size.final_size = final_size
sf_avg.final_size = final_size
sp = SlicerPair(sf_size, sf_avg)
Expand Down Expand Up @@ -274,6 +288,11 @@ def explain_changes_in_totals(
return_fig=return_fig,
)
sf.task = "changes in totals"
sf.size_name = size_name
sf.total_name = total_name
sf.average_name = average_name
if hasattr(df1, "attrs"):
sf.data_attrs = df1.attrs
return sf


Expand All @@ -282,6 +301,7 @@ def explain_levels(
dims: List[str],
total_name: str,
size_name: Optional[str] = None,
average_name: Optional[str] = None,
min_segments: int = None,
max_segments: int = None,
min_depth: int = 1,
Expand Down Expand Up @@ -343,6 +363,9 @@ def explain_levels(
cluster_values=cluster_values,
)

if hasattr(df, "attrs"):
sf.data_attrs = df.attrs

for s in sf.segments:
s["naive_avg"] += average
s["total"] += average * s["seg_size"]
Expand All @@ -358,6 +381,9 @@ def explain_levels(
cluster_value_width=cluster_value_width,
)
sf.task = "levels"
sf.size_name = size_name
sf.total_name = total_name
sf.average_name = average_name
return sf


Expand All @@ -367,6 +393,7 @@ def explain_timeseries(
total_name: str,
time_name: str,
size_name: Optional[str] = None,
average_name: Optional[str] = None,
num_segments: int = None,
max_depth: int = 2,
solver: str = "tree",
Expand Down Expand Up @@ -401,6 +428,7 @@ def explain_timeseries(
assert (
solver == "tree"
), "Only the tree solver is supported for time series at the moment"
attrs = getattr(df, "attrs", None)
df = copy.copy(df)

# replace NaN values in numeric columns with zeros
Expand Down Expand Up @@ -518,6 +546,9 @@ def explain_timeseries(
n_jobs=n_jobs,
)

if hasattr(df, "attrs"):
sf.data_attrs = attrs

# TODO: insert back the normalized bits?
for s in sf.segments:
segment_def = s["segment"]
Expand Down Expand Up @@ -557,4 +588,7 @@ def explain_timeseries(
average_name=average_name,
)
sf.task = "time"
sf.size_name = size_name
sf.total_name = total_name
sf.average_name = average_name
return sf
Loading

0 comments on commit 22a3d2c

Please sign in to comment.