Commit

logging.
cbbcbail committed Aug 6, 2024
1 parent 4a9b02e commit f54a0c5
Showing 13 changed files with 147 additions and 59 deletions.
Binary file modified data/Fig1-designProcess/distinctSubset.pickle
Binary file modified data/Fig1-designProcess/fullData.pickle
Binary file modified data/Fig1-designProcess/hullSubset.pickle
Binary file modified data/Fig1-designProcess/outliersSubset.pickle
16 changes: 16 additions & 0 deletions data/solverData.csv
@@ -0,0 +1,16 @@
Loss Function,Algorithm,Dataset Length,Dataset Width,Subset Length,Computation Time,Loss
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.07317258277907968,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.07850275002419949,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.08522158302366734,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.07728620758280158,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.07322066696360707,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.07752608275040984,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.07825316581875086,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.0774826668202877,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.08167708432301879,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.07435220899060369,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.07715724967420101,0.0
"Uni-criterion: sum, outlierness",greedySwap,200,2,40,0.028571959119290113,-83.00160578686646
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,8,0.09449754096567631,0.0
"Uni-criterion: sum, outlierness",greedySwap,200,2,40,0.03406029101461172,-82.66069589660857
"Uni-criterion: distinctness, distances",greedySwap,200,2,60,0.16028858395293355,-84.45664299596237
25 changes: 25 additions & 0 deletions data/solverLog.csv
@@ -207,3 +207,28 @@ Uni-criterion: clusterCenters,greedySwap,1000,2,10,0.13824820891022682,0.7651631
"Uni-criterion: sum, outlierness",greedySwap,200,2,40,0.02950233267620206,-83.70271953374831
"Uni-criterion: distinctness, distances",greedySwap,200,2,60,0.1577538326382637,-74.06808510204883
"Multi-criterion: 100*(earthMoversDistance) + 1*(distinctness, distances)",greedySwap,200,2,80,8.527639084029943,-38.28864852224051
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.0784420003183186,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.08677054103463888,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.08529883390292525,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.08417691616341472,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.09987304219976068,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.09347962541505694,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.08436895813792944,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.0909910830669105,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.08975366689264774,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.09015812491998076,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.08418579073622823,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.08488195901736617,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.08465166576206684,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,12,0.11604362493380904,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,8,0.07149016577750444,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,8,0.07371812500059605,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,8,0.07845375034958124,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,8,0.06613433314487338,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,11,0.09711675019934773,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.0770460837520659,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.0783906253054738,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.07647287519648671,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.07736429199576378,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.07778699975460768,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.07915870798751712,0.0
Binary file modified figures/Fig1-designProcess/express.pdf
14 changes: 14 additions & 0 deletions flexibleSubsetSelection/loss.py
@@ -2,6 +2,7 @@

# Standard library
from functools import partial
import logging
from typing import Any, Callable, Dict, List

# Third party
@@ -11,6 +12,9 @@
# Local files
from .sets import Dataset, Subset

# Setup logger
logger = logging.getLogger(__name__)


# --- Loss Function ------------------------------------------------------------

@@ -51,6 +55,10 @@ def __init__(self, objectives: List[Callable],
# Generate the combined objective function
self.calculate = partial(self._loss)

logger.debug("Initialized a multi-criterion loss function with "
"objectives: %s, parameters: %s, and weights: %s",
objectives, parameters, weights)

def _loss(self, dataset: Dataset, z: ArrayLike) -> float:
"""
Compute the overall loss function by evaluating each objective function
@@ -131,6 +139,12 @@ def __init__(self, objective: Callable, solveArray: str = "dataArray",
self.selectBy = selectBy
self.parameters = parameters

logger.info("Initialized a uni-criterion loss function with "
"objective: %s, solve array: %s, selection method: %s, "
"and parameters: %s",
objective.__name__, solveArray, selectBy, parameters)


def calculate(self, dataset: Dataset, z: ArrayLike) -> float:
"""
Compute the loss by evaluating the objective with its parameters on the
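Because loss.py now routes output through a module-level logger rather than print, these messages stay silent until the application configures logging. A minimal sketch of surfacing them:

import logging

# Surface the new log messages; DEBUG also shows the multi-criterion
# initialization message added to loss.py above.
logging.basicConfig(level=logging.DEBUG,
                    format="%(levelname)s %(name)s: %(message)s")

Per-module control is also possible, e.g. logging.getLogger("flexibleSubsetSelection.loss").setLevel(logging.INFO), since each module's logger is named by __name__.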
13 changes: 6 additions & 7 deletions flexibleSubsetSelection/objective.py
@@ -6,7 +6,7 @@

# Third party
import ot
from ott.geometry import costs, pointcloud
from ott.geometry import pointcloud

import numpy as np
from numpy.typing import ArrayLike
@@ -95,14 +95,13 @@ def sinkhornDistance(distances: np.ndarray, datasetLength, subsetLength,
Computes the Sinkhorn distance using the POT library.
Args:
distances (np.ndarray): distance matrix.
reg (float, optional): Regularization parameter. Defaults to 0.1.
verbose (bool, optional): If True, print progress messages. Defaults to False.
distances: distance matrix.
reg: Regularization parameter.
verbose: If True, print progress messages. Defaults to False.
Returns:
float: Sinkhorn distance.
"""
print(distances.shape)
return ot.sinkhorn2(np.ones(datasetLength) / datasetLength,
np.ones(subsetLength) / subsetLength,
distances,
@@ -130,8 +129,8 @@ def clusterCenters(array: np.ndarray, clusterCenters: np.ndarray) -> float:
each cluster center.
Args:
array (np.ndarray): Array of datapoints in the set.
clusterCenters (np.ndarray): Array of cluster centers.
array: Array of datapoints in the set.
clusterCenters: Array of cluster centers.
Returns: The sum of distances to the nearest point for each cluster center.
"""
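For reference, a usage sketch for sinkhornDistance as defined above: POT's ot.sinkhorn2 takes uniform marginal weights and a dense cost matrix, so distances should be the pairwise cost matrix between the full dataset and the subset. The cdist construction and the reg value (taken from the docstring's stated 0.1 default) are assumptions, not part of this commit:

import numpy as np
from scipy.spatial.distance import cdist

rng = np.random.default_rng(0)
dataset = rng.random((200, 2))  # full dataset
subset = dataset[:40]           # candidate subset

# Pairwise Euclidean costs between every dataset point and subset point.
distances = cdist(dataset, subset)

# Regularized optimal transport cost between the two empirical distributions.
cost = sinkhornDistance(distances, 200, 40, reg=0.1)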
80 changes: 48 additions & 32 deletions flexibleSubsetSelection/sets.py
@@ -1,7 +1,8 @@
# --- Imports ------------------------------------------------------------------

# Standard library
import os
import logging
from pathlib import Path
from typing import Literal

# Third party
@@ -10,12 +11,14 @@

import pandas as pd
import pickle

from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder

# Local files
from . import generate

# Setup logger
logger = logging.getLogger(__name__)


# --- Dataset and Subset Classes -----------------------------------------------

@@ -24,8 +27,8 @@ class Set:
Base class for Dataset and Subset providing shared save and load functions.
"""

def save(self, name: str, fileType: str = 'pickle',
directory: str = '../data', index: bool = False) -> None:
def save(self, name: str, fileType: str = "pickle",
directory: (str | Path) = "../data", index: bool = False) -> None:
"""
Saves self.data as a file.
@@ -38,58 +41,62 @@ def save(self, name: str, fileType: str = 'pickle',
Raises:
ValueError: If an unsupported file type is specified.
"""
if not os.path.exists(directory):
os.makedirs(directory)
path = Path(directory)
path.mkdir(parents=True, exist_ok=True)
filePath = path / f"{name}.{fileType}"

filePath = os.path.join(directory, f"{name}.{fileType}")

try:
if fileType == "pickle":
with open(filePath, 'wb') as f:
with open(filePath, "wb") as f:
pickle.dump(self.data, f)
logger.info(f"Data successfully saved at '%s'.", filePath)
elif fileType == "csv":
self.data.to_csv(filePath, index=index)
logger.info(f"Data successfully saved at '%s'.", filePath)
else:
raise ValueError(f"Unsupported file type: {fileType}.")
except Exception as e:
print(f"Error saving file: {e}")
logger.exception("Error saving file", e)

def load(self, name: str, fileType: str = 'pickle',
directory: str = '../data') -> None:
def load(self, name: str, fileType: str = "pickle",
directory: (str | Path) = "../data") -> None:
"""
Loads from a file into self.data.
Args:
name: The name of the file.
fileType: The type of file (pickle or csv).
directory: Directory to load the file from.
Raises:
ValueError: If an unsupported file type is specified.
"""
filePath = os.path.join(directory, f"{name}.{fileType}")

path = Path(directory)
filePath = path / f"{name}.{fileType}"

try:
if fileType == "pickle":
with open(filePath, 'rb') as f:
with open(filePath, "rb") as f:
self.data = pickle.load(f)
logger.info(f"Data successfully loaded from '%s'.", filePath)
elif fileType == "csv":
self.data = pd.read_csv(filePath)
logger.info(f"Data successfully loaded from '%s'.", filePath)
else:
raise ValueError(f"Unsupported file type: {fileType}.")
except Exception as e:
print(f"Error loading file: {e}")
logger.exception("Error loading file", e)


class Dataset(Set):
"""
A class for creating, storing, and processing of datasets for subsetting
A class for creating, storing, and processing of datasets for subsetting.
"""

def __init__(self, data: (pd.DataFrame | np.ndarray | None) = None,
randTypes: (str | list| None) = None,
randTypes: (str | list | None) = None,
size: (tuple | None) = None, interval: tuple = (1, 5),
features: (list| None) = None,
features: (list | None) = None,
seed: (int | np.random.Generator | None) = None) -> None:
"""
Initialize a dataset with data or by random data generation.
@@ -146,6 +153,7 @@ def __init__(self, data: (pd.DataFrame | np.ndarray | None) = None,
self.dataArray = self.data[self.features].to_numpy()
self.indices = {feature: i for i, feature in enumerate(self.features)}
self.interval = interval
logger.info("%s created.", self)

def preprocess(self, **parameters) -> None:
"""
@@ -160,11 +168,15 @@ def preprocess(self, **parameters) -> None:
dictionary of additional parameters.
"""
for name, preprocessor in parameters.items():
if isinstance(preprocessor, tuple): # with preprocessor parameters
preprocessor, parameters = preprocessor
setattr(self, name, preprocessor(self.dataArray, **parameters))
else:
setattr(self, name, preprocessor(self.dataArray))
try:
if isinstance(preprocessor, tuple): # with parameters
func, params = preprocessor
setattr(self, name, func(self.dataArray, **params))
else:
setattr(self, name, preprocessor(self.dataArray))
logger.info(f"Data preprocessed with function '%s'.", name)
except Exception as e:
logger.exception("Error applying function '%s'.", name)

def scale(self, interval: (tuple | None) = None) -> None:
"""
@@ -187,6 +199,7 @@ def scale(self, interval: (tuple | None) = None) -> None:
self.dataArray = (self.dataArray - minVals) / rangeVals
self.dataArray = self.dataArray * (interval[1] - interval[0])
self.dataArray += interval[0]
logger.info("Data scaled to %s.", interval)

def discretize(self, bins: (int | ArrayLike),
features: (list | None) = None,
@@ -210,14 +223,19 @@ def discretize(self, bins: (int | ArrayLike),
array = "dataArray"

# Gets specified features
indices = [self.indices[feature] for feature in features]
try:
indices = [self.indices[feature] for feature in features]
except KeyError:
logger.exception("Feature not found in indices.")
raise

selected = self.dataArray[:, indices]
discretizer = KBinsDiscretizer(n_bins = bins,
encode = 'ordinal',
encode = "ordinal",
strategy = strategy)

setattr(self, array, discretizer.fit_transform(selected))
self.bins = bins
logger.info("%s discretized by %s with %s bins.", array, strategy, bins)

def encode(self, features: (list | None) = None, dimensions: int = 1,
array: (str | None) = None) -> None:
@@ -252,6 +270,7 @@ def encode(self, features: (list | None) = None, dimensions: int = 1,
mask = np.ones(self.dataArray.shape[1], dtype=bool)
mask[indices] = False
setattr(self, array, np.hstack((self.dataArray[:, mask], encoded)))
logger.info("Data one-hot encoded in '%s'", array)

def __repr__(self) -> str:
"""
@@ -288,11 +307,7 @@ def __init__(self, dataset: Dataset, z: ArrayLike,
Raises:
ValueError: If length of z does not match the length of dataset.
TypeError: If dataset is not an instance of Dataset.
"""
if not isinstance(dataset, Dataset):
raise TypeError("Dataset must be an instance of Dataset class.")

if len(z) != dataset.size[0]:
raise ValueError("Length of z must match the length of dataset.")

@@ -305,6 +320,7 @@
self.data = dataset.data[z == 1].copy() # subset of the full data
self.solveTime = solveTime
self.loss = loss
logger.info("Created %s.", self)

def __repr__(self) -> str:
"""
Expand All @@ -316,7 +332,7 @@ def __repr__(self) -> str:
if self.loss is not None:
string = ", ".join(string, f"loss={round(self.loss, 4)})")
return string

def __str__(self) -> str:
"""
Return a user-friendly string representation of the Subset object.
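Taken together, the new log lines trace a typical preparation pipeline. A sketch exercising the methods above, continuing the same dataset; feature names, bin count, and the strategy default are assumptions:

dataset.scale(interval=(0, 1))                     # "Data scaled to (0, 1)."
dataset.discretize(bins=5, features=["feature1"])  # logs array, strategy, bins
dataset.encode(features=["feature1"])              # logs the one-hot target array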