Skip to content

Commit

Permalink
Merge pull request #65 from transferwise/llm_friendly_summary
Browse files Browse the repository at this point in the history
Generation of LLM-friendly summary of the analysis
  • Loading branch information
AlxdrPolyakov authored Jan 20, 2025
2 parents 60c2f15 + d556287 commit 22a3d2c
Show file tree
Hide file tree
Showing 7 changed files with 951 additions and 380 deletions.
163 changes: 6 additions & 157 deletions notebooks/Finding interesting segments in time series.ipynb

Large diffs are not rendered by default.

863 changes: 664 additions & 199 deletions notebooks/Finding interesting segments.ipynb

Large diffs are not rendered by default.

35 changes: 14 additions & 21 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
from setuptools import find_packages, setup

with open('README.md') as f:
with open("README.md") as f:
long_description = f.read()

setup(
name="wise-pizza",
version="0.2.3",
description="A library to find and visualise the most interesting slices in multidimensional data",
long_description=long_description,
long_description_content_type='text/markdown',
long_description_content_type="text/markdown",
author="Wise",
url='https://github.com/transferwise/wise-pizza',
url="https://github.com/transferwise/wise-pizza",
classifiers=[
'Programming Language :: Python :: 3 :: Only',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
],
install_requires=[
"ipython",
Expand All @@ -29,27 +29,20 @@
"tqdm",
"cloudpickle",
"pivottablejs",
"streamlit==1.28.0"
"streamlit>=1.28.0",
],
extras_require={
"test": [
"flake8",
"pytest",
"pytest-cov"
],
"test": ["flake8", "pytest", "pytest-cov"],
},
packages=find_packages(
include=[
'wise_pizza',
'wise_pizza.*'
],
exclude=['tests*'],
include=["wise_pizza", "wise_pizza.*"],
exclude=["tests*"],
),
entry_points={
'console_scripts': [
'run_wise_pizza_streamlit = wise_pizza.run_streamlit_app_entry_point:main',
"console_scripts": [
"run_wise_pizza_streamlit = wise_pizza.run_streamlit_app_entry_point:main",
],
},
include_package_data=True,
keywords='wise-pizza',
keywords="wise-pizza",
)
82 changes: 82 additions & 0 deletions wise_pizza/dataframe_with_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import logging

import pandas as pd

logger = logging.getLogger(__name__)


class DataFrameWithMetadata(pd.DataFrame):
    """A pandas DataFrame that carries a table name, a table description, and
    per-column descriptions in ``self.attrs``, and renders them as part of its
    ``to_markdown`` output (e.g. to produce LLM-friendly summaries).
    """

    def __init__(
        self,
        *args,
        name: str = None,
        description: str = None,
        column_descriptions=None,
        **kwargs,
    ):
        """
        @param name: Human-readable table name, stored in ``attrs["name"]``.
        @param description: Table-level description, stored in ``attrs["description"]``.
        @param column_descriptions: Mapping of column name -> description.
            Keys that do not match an actual column are dropped; if none match,
            a warning is logged and no descriptions are stored.
        """
        super().__init__(*args, **kwargs)

        self.attrs["name"] = name or ""  # Store DataFrame name
        self.attrs["description"] = description or ""  # Store DataFrame description
        self.attrs["column_descriptions"] = {}

        if column_descriptions:
            # Keep only descriptions for columns that actually exist.
            column_descriptions = {
                k: v for k, v in column_descriptions.items() if k in self.columns
            }
            if column_descriptions:
                self.attrs["column_descriptions"] = column_descriptions
            else:
                logger.warning(
                    "None of the column descriptions provided matched the DataFrame columns"
                )

    def to_markdown(self, index: bool = True, **kwargs):
        """Render the frame as markdown, prefixed with the table name and
        description, and with a row of column descriptions inserted right
        after the table's header line.

        NOTE: pandas' ``to_markdown`` needs the optional ``tabulate`` package.
        """
        output = []
        # Use .get(): frames produced by pandas operations (slicing, concat,
        # unpickling, ...) may not have gone through __init__, so attrs can
        # be missing these keys; direct indexing would raise KeyError.
        if self.attrs.get("name"):
            output.append(f"Table name: {self.attrs['name']}\n")

        if self.attrs.get("description"):
            output.append(f"Table description: {self.attrs['description']}\n")

        column_descriptions = self.attrs.get("column_descriptions")
        if not column_descriptions:
            output.append(super().to_markdown(index=index, **kwargs))
            return "\n".join(output)

        # Extra table row holding the column descriptions; when the index is
        # rendered it occupies the first column, so pad it with "---".
        desc_row = " | ".join(
            (["---"] if index else [])
            + [column_descriptions.get(col, "") for col in self.columns]
        )
        original_md = super().to_markdown(index=index, **kwargs)
        # The first "\n|" marks the start of the table's second line; splice
        # the description row in between the header and the separator line.
        header_end = original_md.index("\n|")
        output.append(
            original_md[:header_end] + "\n|" + desc_row + original_md[header_end:]
        )
        return "\n".join(output)

    def head(self, n: int = 5):
        """Like ``DataFrame.head``, but preserves the metadata in ``attrs``."""
        out = DataFrameWithMetadata(super().head(n))
        out.attrs = self.attrs
        return out


if __name__ == "__main__":
    # Quick demo of the metadata-aware markdown rendering.
    demo = DataFrameWithMetadata(
        {"a": [1, 2], "b": [3, 4]},
        description="Description for the DataFrame",
        name="DataFrame Name",
        column_descriptions={
            "a": "Description for column a",
            "b": "Description for column b",
        },
    )

    # Render once with the index column and once without it.
    for with_index in (True, False):
        print(demo.to_markdown(index=with_index))
    print("yay!")
    # This would raise an error:
    # df = DescribedDataFrame({'a': [1]}, descriptions={'nonexistent': 'Description'})
34 changes: 34 additions & 0 deletions wise_pizza/explain.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def explain_changes_in_average(
dims: List[str],
total_name: str,
size_name: str,
average_name: Optional[str] = None,
min_segments: Optional[int] = None,
max_segments: int = None,
min_depth: int = 1,
Expand Down Expand Up @@ -72,6 +73,7 @@ def explain_changes_in_average(
@param verbose: If set to a truish value, lots of debug info is printed to console
@return: A fitted object
"""

df1 = df1.copy()
df2 = df2.copy()

Expand Down Expand Up @@ -111,6 +113,9 @@ def explain_changes_in_average(
verbose=verbose,
)

if hasattr(df1, "attrs"):
sf.data_attrs = df1.attrs

if hasattr(sf, "pre_total"):
sf.pre_total = avg1
sf.post_total += avg1
Expand All @@ -124,6 +129,9 @@ def explain_changes_in_average(

# And might want to relabel some plots?
sf.task = "changes in average"
sf.size_name = size_name
sf.total_name = total_name
sf.average_name = average_name
return sf


Expand All @@ -133,6 +141,7 @@ def explain_changes_in_totals(
dims: List[str],
total_name: str,
size_name: str,
average_name: Optional[str] = None,
min_segments: Optional[int] = None,
max_segments: int = None,
min_depth: int = 1,
Expand Down Expand Up @@ -211,6 +220,8 @@ def explain_changes_in_totals(
cluster_values=cluster_values,
verbose=verbose,
)
if hasattr(df1, "attrs"):
sf_size.data_attrs = df1.attrs

sf_avg = explain_levels(
df=df_avg.data,
Expand All @@ -227,6 +238,9 @@ def explain_changes_in_totals(
verbose=verbose,
)

if hasattr(df1, "attrs"):
sf_avg.data_attrs = df1.attrs

sf_size.final_size = final_size
sf_avg.final_size = final_size
sp = SlicerPair(sf_size, sf_avg)
Expand Down Expand Up @@ -274,6 +288,11 @@ def explain_changes_in_totals(
return_fig=return_fig,
)
sf.task = "changes in totals"
sf.size_name = size_name
sf.total_name = total_name
sf.average_name = average_name
if hasattr(df1, "attrs"):
sf.data_attrs = df1.attrs
return sf


Expand All @@ -282,6 +301,7 @@ def explain_levels(
dims: List[str],
total_name: str,
size_name: Optional[str] = None,
average_name: Optional[str] = None,
min_segments: int = None,
max_segments: int = None,
min_depth: int = 1,
Expand Down Expand Up @@ -343,6 +363,9 @@ def explain_levels(
cluster_values=cluster_values,
)

if hasattr(df, "attrs"):
sf.data_attrs = df.attrs

for s in sf.segments:
s["naive_avg"] += average
s["total"] += average * s["seg_size"]
Expand All @@ -358,6 +381,9 @@ def explain_levels(
cluster_value_width=cluster_value_width,
)
sf.task = "levels"
sf.size_name = size_name
sf.total_name = total_name
sf.average_name = average_name
return sf


Expand All @@ -367,6 +393,7 @@ def explain_timeseries(
total_name: str,
time_name: str,
size_name: Optional[str] = None,
average_name: Optional[str] = None,
num_segments: int = None,
max_depth: int = 2,
solver: str = "tree",
Expand Down Expand Up @@ -401,6 +428,7 @@ def explain_timeseries(
assert (
solver == "tree"
), "Only the tree solver is supported for time series at the moment"
attrs = getattr(df, "attrs", None)
df = copy.copy(df)

# replace NaN values in numeric columns with zeros
Expand Down Expand Up @@ -518,6 +546,9 @@ def explain_timeseries(
n_jobs=n_jobs,
)

if hasattr(df, "attrs"):
sf.data_attrs = attrs

# TODO: insert back the normalized bits?
for s in sf.segments:
segment_def = s["segment"]
Expand Down Expand Up @@ -557,4 +588,7 @@ def explain_timeseries(
average_name=average_name,
)
sf.task = "time"
sf.size_name = size_name
sf.total_name = total_name
sf.average_name = average_name
return sf
Loading

0 comments on commit 22a3d2c

Please sign in to comment.