Commit

logging.
cbbcbail committed Aug 6, 2024
1 parent 4a9b02e commit f54a0c5
Showing 13 changed files with 147 additions and 59 deletions.
Binary file modified data/Fig1-designProcess/distinctSubset.pickle
Binary file modified data/Fig1-designProcess/fullData.pickle
Binary file modified data/Fig1-designProcess/hullSubset.pickle
Binary file modified data/Fig1-designProcess/outliersSubset.pickle
16 changes: 16 additions & 0 deletions data/solverData.csv
@@ -0,0 +1,16 @@
Loss Function,Algorithm,Dataset Length,Dataset Width,Subset Length,Computation Time,Loss
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.07317258277907968,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.07850275002419949,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.08522158302366734,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.07728620758280158,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.07322066696360707,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.07752608275040984,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.07825316581875086,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.0774826668202877,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.08167708432301879,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.07435220899060369,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.07715724967420101,0.0
"Uni-criterion: sum, outlierness",greedySwap,200,2,40,0.028571959119290113,-83.00160578686646
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,8,0.09449754096567631,0.0
"Uni-criterion: sum, outlierness",greedySwap,200,2,40,0.03406029101461172,-82.66069589660857
"Uni-criterion: distinctness, distances",greedySwap,200,2,60,0.16028858395293355,-84.45664299596237
25 changes: 25 additions & 0 deletions data/solverLog.csv
@@ -207,3 +207,28 @@ Uni-criterion: clusterCenters,greedySwap,1000,2,10,0.13824820891022682,0.7651631
"Uni-criterion: sum, outlierness",greedySwap,200,2,40,0.02950233267620206,-83.70271953374831
"Uni-criterion: distinctness, distances",greedySwap,200,2,60,0.1577538326382637,-74.06808510204883
"Multi-criterion: 100*(earthMoversDistance) + 1*(distinctness, distances)",greedySwap,200,2,80,8.527639084029943,-38.28864852224051
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.0784420003183186,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.08677054103463888,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.08529883390292525,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.08417691616341472,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.09987304219976068,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.09347962541505694,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.08436895813792944,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.0909910830669105,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.08975366689264774,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.09015812491998076,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.08418579073622823,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.08488195901736617,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,10,0.08465166576206684,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,12,0.11604362493380904,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,8,0.07149016577750444,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,8,0.07371812500059605,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,8,0.07845375034958124,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,8,0.06613433314487338,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,11,0.09711675019934773,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.0770460837520659,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.0783906253054738,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.07647287519648671,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.07736429199576378,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.07778699975460768,0.0
"Uni-criterion: preserveMetric, hull",greedyMinSubset,200,2,9,0.07915870798751712,0.0
Binary file modified figures/Fig1-designProcess/express.pdf
14 changes: 14 additions & 0 deletions flexibleSubsetSelection/loss.py
@@ -2,6 +2,7 @@

# Standard library
from functools import partial
import logging
from typing import Any, Callable, Dict, List

# Third party
@@ -11,6 +12,9 @@
# Local files
from .sets import Dataset, Subset

# Setup logger
logger = logging.getLogger(__name__)


# --- Loss Function ------------------------------------------------------------

@@ -51,6 +55,10 @@ def __init__(self, objectives: List[Callable],
# Generate the combined objective function
self.calculate = partial(self._loss)

logger.debug("Initialized a multi-criterion loss function with "
"objectives: %s, parameters: %s, and weights: %s",
objectives, parameters, weights)

def _loss(self, dataset: Dataset, z: ArrayLike) -> float:
"""
Compute the overall loss function by evaluating each objective function
@@ -131,6 +139,12 @@ def __init__(self, objective: Callable, solveArray: str = "dataArray",
self.selectBy = selectBy
self.parameters = parameters

logger.info("Initialized a uni-criterion loss function with "
"objective: %s, solve array: %s, selection method: %s, "
"and parameters: %s",
objective.__name__, solveArray, selectBy, parameters)


def calculate(self, dataset: Dataset, z: ArrayLike) -> float:
"""
Compute the loss by evaluating the objective with its parameters on the
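Because loss.py now routes output through a module-level logger rather than print, these messages stay silent until the application configures logging. A minimal sketch of surfacing them:

import logging

# Surface the new log messages; DEBUG also shows the multi-criterion
# initialization message added to loss.py above.
logging.basicConfig(level=logging.DEBUG,
                    format="%(levelname)s %(name)s: %(message)s")

Per-module control is also possible, e.g. logging.getLogger("flexibleSubsetSelection.loss").setLevel(logging.INFO), since each module's logger is named by __name__.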
13 changes: 6 additions & 7 deletions flexibleSubsetSelection/objective.py
@@ -6,7 +6,7 @@

# Third party
import ot
from ott.geometry import costs, pointcloud
from ott.geometry import pointcloud

import numpy as np
from numpy.typing import ArrayLike
@@ -95,14 +95,13 @@ def sinkhornDistance(distances: np.ndarray, datasetLength, subsetLength,
Computes the Sinkhorn distance using the POT library.
Args:
distances (np.ndarray): distance matrix.
reg (float, optional): Regularization parameter. Defaults to 0.1.
verbose (bool, optional): If True, print progress messages. Defaults to False.
distances: distance matrix.
reg: Regularization parameter.
verbose: If True, print progress messages. Defaults to False.
Returns:
float: Sinkhorn distance.
"""
print(distances.shape)
return ot.sinkhorn2(np.ones(datasetLength) / datasetLength,
np.ones(subsetLength) / subsetLength,
distances,
@@ -130,8 +129,8 @@ def clusterCenters(array: np.ndarray, clusterCenters: np.ndarray) -> float:
each cluster center.
Args:
array (np.ndarray): Array of datapoints in the set.
clusterCenters (np.ndarray): Array of cluster centers.
array: Array of datapoints in the set.
clusterCenters: Array of cluster centers.
Returns: The sum of distances to the nearest point for each cluster center.
"""
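For reference, a usage sketch for sinkhornDistance as defined above: POT's ot.sinkhorn2 takes uniform marginal weights and a dense cost matrix, so distances should be the pairwise cost matrix between the full dataset and the subset. The cdist construction and the reg value (taken from the docstring's stated 0.1 default) are assumptions, not part of this commit:

import numpy as np
from scipy.spatial.distance import cdist

rng = np.random.default_rng(0)
dataset = rng.random((200, 2))  # full dataset
subset = dataset[:40]           # candidate subset

# Pairwise Euclidean costs between every dataset point and subset point.
distances = cdist(dataset, subset)

# Regularized optimal transport cost between the two empirical distributions.
cost = sinkhornDistance(distances, 200, 40, reg=0.1)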
80 changes: 48 additions & 32 deletions flexibleSubsetSelection/sets.py
@@ -1,7 +1,8 @@
# --- Imports ------------------------------------------------------------------

# Standard library
import os
import logging
from pathlib import Path
from typing import Literal

# Third party
@@ -10,12 +11,14 @@

import pandas as pd
import pickle

from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder

# Local files
from . import generate

# Setup logger
logger = logging.getLogger(__name__)


# --- Dataset and Subset Classes -----------------------------------------------

@@ -24,8 +27,8 @@ class Set:
Base class for Dataset and Subset providing shared save and load functions.
"""

def save(self, name: str, fileType: str = 'pickle',
directory: str = '../data', index: bool = False) -> None:
def save(self, name: str, fileType: str = "pickle",
directory: (str | Path) = "../data", index: bool = False) -> None:
"""
Saves self.data as a file.
@@ -38,58 +41,62 @@ def save(self, name: str, fileType: str = 'pickle',
Raises:
ValueError: If an unsupported file type is specified.
"""
if not os.path.exists(directory):
os.makedirs(directory)
path = Path(directory)
path.mkdir(parents=True, exist_ok=True)
filePath = path / f"{name}.{fileType}"

filePath = os.path.join(directory, f"{name}.{fileType}")

try:
if fileType == "pickle":
with open(filePath, 'wb') as f:
with open(filePath, "wb") as f:
pickle.dump(self.data, f)
logger.info(f"Data successfully saved at '%s'.", filePath)
elif fileType == "csv":
self.data.to_csv(filePath, index=index)
logger.info(f"Data successfully saved at '%s'.", filePath)
else:
raise ValueError(f"Unsupported file type: {fileType}.")
except Exception as e:
print(f"Error saving file: {e}")
logger.exception("Error saving file", e)

def load(self, name: str, fileType: str = 'pickle',
directory: str = '../data') -> None:
def load(self, name: str, fileType: str = "pickle",
directory: (str | Path) = "../data") -> None:
"""
Loads from a file into self.data.
Args:
name: The name of the file.
fileType: The type of file (pickle or csv).
directory: Directory to load the file from.
Raises:
ValueError: If an unsupported file type is specified.
"""
filePath = os.path.join(directory, f"{name}.{fileType}")

path = Path(directory)
filePath = path / f"{name}.{fileType}"

try:
if fileType == "pickle":
with open(filePath, 'rb') as f:
with open(filePath, "rb") as f:
self.data = pickle.load(f)
logger.info(f"Data successfully loaded from '%s'.", filePath)
elif fileType == "csv":
self.data = pd.read_csv(filePath)
logger.info(f"Data successfully loaded from '%s'.", filePath)
else:
raise ValueError(f"Unsupported file type: {fileType}.")
except Exception as e:
print(f"Error loading file: {e}")
logger.exception("Error loading file", e)


class Dataset(Set):
"""
A class for creating, storing, and processing of datasets for subsetting
A class for creating, storing, and processing of datasets for subsetting.
"""

def __init__(self, data: (pd.DataFrame | np.ndarray | None) = None,
randTypes: (str | list| None) = None,
randTypes: (str | list | None) = None,
size: (tuple | None) = None, interval: tuple = (1, 5),
features: (list| None) = None,
features: (list | None) = None,
seed: (int | np.random.Generator | None) = None) -> None:
"""
Initialize a dataset with data or by random data generation.
@@ -146,6 +153,7 @@ def __init__(self, data: (pd.DataFrame | np.ndarray | None) = None,
self.dataArray = self.data[self.features].to_numpy()
self.indices = {feature: i for i, feature in enumerate(self.features)}
self.interval = interval
logger.info("%s created.", self)

def preprocess(self, **parameters) -> None:
"""
@@ -160,11 +168,15 @@ def preprocess(self, **parameters) -> None:
dictionary of additional parameters.
"""
for name, preprocessor in parameters.items():
if isinstance(preprocessor, tuple): # with preprocessor parameters
preprocessor, parameters = preprocessor
setattr(self, name, preprocessor(self.dataArray, **parameters))
else:
setattr(self, name, preprocessor(self.dataArray))
try:
if isinstance(preprocessor, tuple): # with parameters
func, params = preprocessor
setattr(self, name, func(self.dataArray, **params))
else:
setattr(self, name, preprocessor(self.dataArray))
logger.info(f"Data preprocessed with function '%s'.", name)
except Exception as e:
logger.exception("Error applying function '%s'.", name)

def scale(self, interval: (tuple | None) = None) -> None:
"""
@@ -187,6 +199,7 @@ def scale(self, interval: (tuple | None) = None) -> None:
self.dataArray = (self.dataArray - minVals) / rangeVals
self.dataArray = self.dataArray * (interval[1] - interval[0])
self.dataArray += interval[0]
logger.info("Data scaled to %s.", interval)

def discretize(self, bins: (int | ArrayLike),
features: (list | None) = None,
@@ -210,14 +223,19 @@ def discretize(self, bins: (int | ArrayLike),
array = "dataArray"

# Gets specified features
indices = [self.indices[feature] for feature in features]
try:
indices = [self.indices[feature] for feature in features]
except KeyError:
logger.exception("Feature not found in indices.")
raise

selected = self.dataArray[:, indices]
discretizer = KBinsDiscretizer(n_bins = bins,
encode = 'ordinal',
encode = "ordinal",
strategy = strategy)

setattr(self, array, discretizer.fit_transform(selected))
self.bins = bins
logger.info("%s discretized by %s with %s bins.", array, strategy, bins)

def encode(self, features: (list | None) = None, dimensions: int = 1,
array: (str | None) = None) -> None:
@@ -252,6 +270,7 @@ def encode(self, features: (list | None) = None, dimensions: int = 1,
mask = np.ones(self.dataArray.shape[1], dtype=bool)
mask[indices] = False
setattr(self, array, np.hstack((self.dataArray[:, mask], encoded)))
logger.info("Data one-hot encoded in '%s'", array)

def __repr__(self) -> str:
"""
@@ -288,11 +307,7 @@ def __init__(self, dataset: Dataset, z: ArrayLike,
Raises:
ValueError: If length of z does not match the length of dataset.
TypeError: If dataset is not an instance of Dataset.
"""
if not isinstance(dataset, Dataset):
raise TypeError("Dataset must be an instance of Dataset class.")

if len(z) != dataset.size[0]:
raise ValueError("Length of z must match the length of dataset.")

@@ -305,6 +320,7 @@
self.data = dataset.data[z == 1].copy() # subset of the full data
self.solveTime = solveTime
self.loss = loss
logger.info("Created %s.", self)

def __repr__(self) -> str:
"""
Expand All @@ -316,7 +332,7 @@ def __repr__(self) -> str:
if self.loss is not None:
string = ", ".join(string, f"loss={round(self.loss, 4)})")
return string

def __str__(self) -> str:
"""
Return a user-friendly string representation of the Subset object.
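Taken together, the new log lines trace a typical preparation pipeline. A sketch exercising the methods above, continuing the same dataset; feature names, bin count, and the strategy default are assumptions:

dataset.scale(interval=(0, 1))                     # "Data scaled to (0, 1)."
dataset.discretize(bins=5, features=["feature1"])  # logs array, strategy, bins
dataset.encode(features=["feature1"])              # logs the one-hot target array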