diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..71de542 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,4 @@ +{ + "python.analysis.typeCheckingMode": "basic", + "python.analysis.autoImportCompletions": false +} \ No newline at end of file diff --git a/flexibleSubsetSelection/algorithm.py b/flexibleSubsetSelection/algorithm.py index d20ce53..a806cc7 100644 --- a/flexibleSubsetSelection/algorithm.py +++ b/flexibleSubsetSelection/algorithm.py @@ -13,7 +13,7 @@ # --- Utility ------------------------------------------------------------------ def randomSample(datasetSize: tuple, subsetSize: int, - seed: int | np.random.Generator = None): + seed: (int | np.random.Generator | None) = None): """ Randomly sample from dataset by generating random indices to create subset @@ -46,7 +46,8 @@ def createEnvironment(outputFlag: int = 0): return environment -def optimize(objective, constraints, environment, solver, log_file='gurobi_log.txt', verbose=False): +def optimize(objective, constraints, environment, solver, + log_file='gurobi_log.txt', verbose=False): """ Sets up a cvxpy problem with given objective and constraints and solves it using the specified solver. @@ -99,7 +100,8 @@ def bestOfRandom(dataset, lossFunction, subsetSize, minLoss=0, def averageOfRandom(dataset, lossFunction, subsetSize, minLoss=0, - maxIterations=None, seed=None, verbose=False, selectBy="row"): + maxIterations=None, seed=None, verbose=False, + selectBy="row"): if maxIterations is None: maxIterations = dataset.size[0] diff --git a/flexibleSubsetSelection/generate.py b/flexibleSubsetSelection/generate.py index 046d789..cb84369 100644 --- a/flexibleSubsetSelection/generate.py +++ b/flexibleSubsetSelection/generate.py @@ -9,8 +9,8 @@ # --- Random Dataset Generation ------------------------------------------------ -def randomData(randType: str | list, size: tuple, interval: tuple, - seed: int | np.random.Generator = None) -> pd.DataFrame: +def randomData(randType: str, size: tuple, interval: tuple, + seed: (int | np.random.Generator | None) = None) -> pd.DataFrame: """ Generate random data based on the specified random generation method. @@ -31,7 +31,7 @@ def randomData(randType: str | list, size: tuple, interval: tuple, generators = { "uniform": uniform, "binary": binary, - "categorical": categorical, + "categories": categories, "normal": normal, "multimodal": multimodal, "skew": skew, @@ -48,7 +48,7 @@ def randomData(randType: str | list, size: tuple, interval: tuple, raise ValueError(f"unknown random generation method: {randType}") def uniform(size: tuple, interval: tuple, - seed: int | np.random.Generator = None) -> pd.DataFrame: + seed: (int | np.random.Generator| None) = None) -> pd.DataFrame: """ Generate random data from a uniform distribution using numpy. @@ -63,7 +63,8 @@ def uniform(size: tuple, interval: tuple, data = rng.uniform(interval[0], interval[1], size=size) return pd.DataFrame(data) -def binary(size: tuple, seed: int | np.random.Generator = None) -> pd.DataFrame: +def binary(size: tuple, + seed: (int | np.random.Generator | None) = None) -> pd.DataFrame: """ Generate random binary data points of bernoulli trials using numpy where each feature has a random probability p. @@ -79,8 +80,8 @@ def binary(size: tuple, seed: int | np.random.Generator = None) -> pd.DataFrame: data = rng.binomial(1, probabilities, size=size) return pd.DataFrame(data) -def categorical(size: tuple, interval: tuple, - seed: int | np.random.Generator = None) -> pd.DataFrame: +def categories(size: tuple, interval: tuple, + seed: (int | np.random.Generator | None) = None) -> pd.DataFrame: """ Generate random categorical data points using numpy with a random number of categories and a random probability p in interval. @@ -101,7 +102,7 @@ def categorical(size: tuple, interval: tuple, return pd.DataFrame(data) def normal(size: tuple, interval: tuple, - seed: int | np.random.Generator = None) -> pd.DataFrame: + seed: (int | np.random.Generator | None) = None) -> pd.DataFrame: """ Generate random data from a normal distribution using numpy centered on random mean and with random standard deviation. @@ -121,7 +122,7 @@ def normal(size: tuple, interval: tuple, return pd.DataFrame(data) def multimodal(size: tuple, interval: tuple, sigmaInterval: tuple = (0.1, 3), - seed: int | np.random.Generator = None) -> pd.DataFrame: + seed: (int | np.random.Generator | None) = None) -> pd.DataFrame: """ Generate random data from multimodal distributions using numpy with a random number of normal distributions centered on random means and standard @@ -157,7 +158,7 @@ def multimodal(size: tuple, interval: tuple, sigmaInterval: tuple = (0.1, 3), return pd.DataFrame(data) def skew(size: tuple, interval: tuple = (-5, 5), - seed: int | np.random.Generator = None) -> pd.DataFrame: + seed: (int | np.random.Generator | None) = None) -> pd.DataFrame: """ Generate random data from skewed distributions using scipy with random skewness parameter. @@ -182,7 +183,7 @@ def skew(size: tuple, interval: tuple = (-5, 5), def blobs(size: tuple, interval: tuple, numClusters: int = 6, sigmaInterval: tuple = (0.1, 3), - seed: int | np.random.Generator = None) -> pd.DataFrame: + seed: (int | np.random.Generator | None) = None) -> pd.DataFrame: """ Generate random data points using sklearn with numClusters blobs and with random means and standard deviations. diff --git a/flexibleSubsetSelection/loss.py b/flexibleSubsetSelection/loss.py index 74eef09..2e7ff11 100644 --- a/flexibleSubsetSelection/loss.py +++ b/flexibleSubsetSelection/loss.py @@ -22,7 +22,7 @@ class MultiCriterion(): def __init__(self, objectives: List[Callable], parameters: List[Dict[str, Any]], - weights: ArrayLike = None) -> None: + weights: (np.ndarray | None) = None) -> None: """ Define a multi-criterion loss function with a set of objectives, weights, and parameters diff --git a/flexibleSubsetSelection/plot.py b/flexibleSubsetSelection/plot.py index 15f283f..91b063b 100644 --- a/flexibleSubsetSelection/plot.py +++ b/flexibleSubsetSelection/plot.py @@ -1,14 +1,21 @@ # --- Imports ------------------------------------------------------------------ +# Standard library +from typing import Callable + # Third party import matplotlib +from matplotlib.axes import Axes from matplotlib.colors import to_rgb, to_hex +from matplotlib.collections import LineCollection import matplotlib.pyplot as plt import numpy as np import pandas as pd import seaborn as sns +# Local files +from . import sets # --- Color -------------------------------------------------------------------- @@ -16,9 +23,12 @@ class Color: """ Create and store color palettes and color bars for use in visualizations """ - def __init__(self, palette: dict = None): + def __init__(self, palette: dict | None = None): """ Initialize the class with a custom or default palette + + Args: + palette: dictionary of color names and color values """ if palette is None: self.palette = { @@ -33,9 +43,7 @@ def __init__(self, palette: dict = None): self.palette = palette def __getitem__(self, color): - """ - Returns a color value from the palette directly. - """ + """Returns a color value from the palette directly.""" return self.palette[color] def getPalette(self, names: list, colors: list) -> dict: @@ -80,8 +88,8 @@ def getGradientPalette(self, color: str, number: int = 6, # --- Figures ------------------------------------------------------------------ -def moveFigure(fig, x, y): - """move figure's upper left corner to pixel (x, y)""" +def moveFigure(fig, x: int, y: int): + """Move figure's upper left corner to pixel (x, y).""" backend = matplotlib.get_backend() if backend == "TkAgg": fig.canvas.manager.window.wm_geometry("+%d+%d" % (x, y)) @@ -91,21 +99,21 @@ def moveFigure(fig, x, y): fig.canvas.manager.window.move(x, y) def clearAxes(fig): - """clear all axes in the figure""" + """Clear all axes in the figure.""" for ax in fig.get_axes(): ax.cla() def removeAxes(fig): - """remove all axes in the figure""" + """Remove all axes in the figure.""" for ax in fig.get_axes(): ax.remove() -def setPickEvent(fig, pickFunction): - """Set pickFunction as a pick event on the figure""" +def setPickEvent(fig, pickFunction: Callable): + """Set pickFunction as a pick event on the figure.""" fig.canvas.mpl_connect("pick_event", pickFunction) -def onPick(event, color): - """Toggle color of the selected item between green and yellow on event""" +def onPick(event, color: Color): + """Toggle color of the selected item between green and yellow on event.""" line = event.artist if line._color == color.palette["yellow"]: line._color = color.palette["green"] @@ -115,59 +123,114 @@ def onPick(event, color): line.zorder = 3 line._axes.figure.canvas.draw_idle() -def initializePane3D(ax, color): +def initializePane3D(ax: Axes, color: str): """Initialize the color of the background panes with hex color for 3D.""" rgb = to_rgb(color) ax.xaxis.set_pane_color(to_hex([min(1, c + (1 - c)*0.05) for c in rgb])) ax.yaxis.set_pane_color(to_hex([max(0, c*0.95) for c in rgb])) ax.zaxis.set_pane_color(rgb) -# --- Error Markers ------------------------------------------------------------ -def errorBar(ax, x, vals1, vals2, color): - """ - plot a series of error bars on ax along x between vals1 and vals2 of - given color +# --- Error Indicators --------------------------------------------------------- + +def errorBars(ax: Axes, x: float, vals1: float, vals2: float, color: str) -> None: """ - ax.errorbar(x=x, - y=(vals1 + vals2)/2, - yerr=abs(vals1 - vals2)/2, - ecolor=color, - ls="none", - elinewidth=3, - capsize=5, - capthick=1.5, - zorder=4) - -def errorMarkers(ax, x, vals1, color1, marker1, vals2=None, color2=None, - marker2=None): + Plot a series of error bars on ax along x between vals1 and vals2 of a + given color. + + Args: + ax: The axes on which to plot. + x: The x-coordinate for the error bars. + vals1: The first set of values. + vals2: The second set of values. + color: The color of the error bars. """ - plot a series of error markers on ax along x at vals1 and optionally vals2 - of given colors + ax.errorbar(x = x, + y = (vals1 + vals2) / 2, + yerr = abs(vals1 - vals2) / 2, + ecolor = color, + ls = "none", + elinewidth = 3, + capsize = 5, + capthick = 1.5, + zorder = 4) + +def errorMarkers(ax: Axes, x: list, vals1: list | None, color1: str | None, + marker1: str, vals2: list | None = None, + color2: str | None = None, marker2: str | None = None) -> None: """ + Plot a series of error markers on ax along x at vals1 and optionally vals2 + of given colors. - for i in x: - if vals1[i].size > 0: - for j in range(vals1[i].size): - ax.plot(x[i], vals1[i][j], + Args: + ax: The axes on which to plot. + x: The x-coordinates for the error markers. + vals1: The first set of values. + color1: The color for the first set of markers. + marker1: The marker style for the first set of markers. + vals2: The second set of values. + color2: The color for the second set of markers. + marker2: The marker style for the second set of markers. + """ + for i in range(len(x)): + if vals1 is not None and len(vals1[i]) > 0: + for val in vals1[i]: + ax.plot(x[i], + val, color = color1, markersize = 4, marker = marker1, zorder = 4) - if vals2[i].size > 0: - for j in range(vals2[i].size): - ax.plot(x[i], vals2[i][j], + + if vals2 is not None and len(vals2[i]) > 0: + for val in vals2[i]: + ax.plot(x[i], + val, color = color2, markersize = 3.5, markerfacecolor = None, marker = marker2, zorder = 4) +def errorLines(ax: Axes, vals1: np.ndarray, vals2: np.ndarray, color: str, + weights: (np.ndarray | None) = None) -> None: + """ + Plot a series of error lines on ax at vals1 and vals2 of given color. + + Args: + ax: The axes on which to plot. + vals1: The first set of values. + color: The color of the error lines. + vals2: The second set of values. + weights: The weights of the error lines. + """ + # Create grid of points + datasetX, subsetX = np.meshgrid(vals1[:, 0], vals2[:, 0], indexing='ij') + datasetY, subsetY = np.meshgrid(vals1[:, 1], vals2[:, 1], indexing='ij') + + # Create line segments + linesX = np.stack([datasetX, subsetX], axis=-1) + linesY = np.stack([datasetY, subsetY], axis=-1) + lines = np.stack([linesX, linesY], axis=-1).reshape(-1, 2, 2) + + # Assign weights if not provided + if weights is None: + weights = np.ones(len(lines)) + + # Create LineCollection + lines = LineCollection(lines, + colors=color, + linewidths=weights.flatten(), + alpha=0.2) + lines.set_capstyle("round") + ax.add_collection(lines) + # --- Plots -------------------------------------------------------------------- def initialize(color, font: str = "Times New Roman", size: int = 42, - faceColorAx = None, faceColorFig = None) -> None: + faceColorAx: (str | None) = None, + faceColorFig: (str | None) = None) -> None: """ Initialize matplotlib settings global parameters for text and background @@ -197,8 +260,9 @@ def initialize(color, font: str = "Times New Roman", size: int = 42, else: plt.rcParams["axes.facecolor"] = faceColorAx -def scatter(ax, color, dataset=None, subset=None, features=(0, 1), - **parameters): +def scatter(ax: Axes, color: Color, dataset: (sets.Dataset | None) = None, + subset: (sets.Subset | None) = None, features=(0, 1), + **parameters) -> None: """ Plot a scatterplot of data features on ax @@ -214,7 +278,7 @@ def scatter(ax, color, dataset=None, subset=None, features=(0, 1), if dataset is None and subset is None: raise ValueError("no dataset or subset specified") - + if len(features) == 3: if dataset is not None: ax.scatter(dataset.data[features[0]], @@ -250,18 +314,21 @@ def scatter(ax, color, dataset=None, subset=None, features=(0, 1), zorder=4, **parameters) -def parallelCoordinates(ax, color, dataset=None, subset=None, dataLinewidth=0.5, - subsetLinewidth=1.5, **parameters): +def parallelCoordinates(ax: Axes, color: Color, + dataset: (sets.Dataset | None) = None, + subset: (sets.Subset | None) = None, + dataLinewidth: float = 0.5, + subsetLinewidth: float = 1.5, **parameters) -> None: """ Plot a parallel coordinates chart of dataset on ax Args: - ax (matplotlib ax): The axis to plot the parallel coordinates on - dataset (pandas DataFrame): The dataset to plot - color (Color object): A color object with the color palette to use - subset (pandas DataFrame or None, optional): The subset to plot - dataLinewidth (float, optional): Linewidth for the main dataset - subsetLinewidth (float, optional): Linewidth for the subset + ax: The axis to plot the parallel coordinates on + dataset: The dataset to plot + color: A color object with the color palette to use + subset: The subset to plot + dataLinewidth: Linewidth for the main dataset + subsetLinewidth: Linewidth for the subset **parameters: Additional parameters to pass to pd.plotting.parallel_coordinates @@ -288,59 +355,57 @@ def parallelCoordinates(ax, color, dataset=None, subset=None, dataLinewidth=0.5, alpha=1, **parameters) - -def histogram(ax, color, dataset=None, subset=None, numBins=6, **parameters): +def histogram(ax: Axes, color: Color, dataset: (sets.Dataset | None) = None, + subset: (sets.Subset | None) = None, numBins: int = 6, + **parameters) -> None: """ Plot histograms of each feature side by side on ax with normalized subset and dataset overlapping on common bins Args: - ax (matplotlib ax): The axis to plot the histogram on - color (Color object): A color object with the color palette to use - dataset (sets.Dataset object, optional): The dataset to plot - subset (sets.Subset object, optional): The subset to plot - numBins (float): The number of bins to bin the dataset + ax: The axis to plot the histogram on + color: A color object with the color palette to use + dataset: The dataset to plot + subset: The subset to plot + numBins: The number of bins to bin the dataset Raises: ValueError: If neither a dataset or subset are provided """ if dataset is None and subset is None: raise ValueError("No dataset or subset specified") - - # Check if dataset is provided + if dataset is not None: features = dataset.data.columns - num_features = len(features) + numFeatures = len(features) # Get the positions of each bar group - bar_positions = np.arange(numBins * num_features, step=numBins) + barPositions = range(numBins * numFeatures, step=numBins) for i, feature in enumerate(features): # Plot the dataset histogram - dataset_hist = np.histogram(dataset.data[feature], bins=numBins) - dataset_heights = dataset_hist[0] + datasetHist = np.histogram(dataset.data[feature], bins=numBins) + datasetHeights = datasetHist[0] # Adjust bar positions - positions = bar_positions[i] + np.arange(numBins) + positions = barPositions[i] + np.arange(numBins) - ax.bar(positions, dataset_heights, width=1, + ax.bar(positions, datasetHeights, width=1, color=color.palette["green"], alpha=0.5) - - # Check if subset is provided if subset is not None: features = subset.data.columns - num_features = len(features) + numFeatures = len(features) # Get the positions of each bar group - bar_positions = np.arange(numBins * num_features, step=numBins) + barPositions = range(numBins * numFeatures, step=numBins) for i, feature in enumerate(features): # Calculate histogram of subset normalized by subset size - subset_hist = np.histogram(subset.data[feature], bins=numBins) - subset_heights = subset_hist[0] / len(subset.data) * len(dataset.data) + subsetHist = np.histogram(subset.data[feature], bins=numBins) + subsetHeights = subsetHist[0] / len(subset.data) * len(dataset.data) # Adjust bar positions - positions = bar_positions[i] + np.arange(numBins) + positions = barPositions[i] + np.arange(numBins) - ax.bar(positions, subset_heights, width=1, - color=color.palette["darkGreen"], alpha=0.5) # Increase alpha for better visibility \ No newline at end of file + ax.bar(positions, subsetHeights, width=1, + color=color.palette["darkGreen"], alpha=0.5) \ No newline at end of file diff --git a/flexibleSubsetSelection/sets.py b/flexibleSubsetSelection/sets.py index 45c4657..1bb00c6 100644 --- a/flexibleSubsetSelection/sets.py +++ b/flexibleSubsetSelection/sets.py @@ -2,6 +2,7 @@ # Standard library import os +from typing import Literal # Third party import numpy as np @@ -85,17 +86,18 @@ class Dataset(Base): A class for creating, storing, and processing of datasets for subsetting """ - def __init__(self, data: ArrayLike = None, randTypes: str | list = None, - size: tuple = None, interval: tuple = (1, 5), - features: list = None, - seed: int | np.random.Generator = None) -> None: + def __init__(self, data: (pd.DataFrame | np.ndarray | None) = None, + randTypes: (str | list| None) = None, + size: (tuple | None) = None, interval: tuple = (1, 5), + features: (list| None) = None, + seed: (int | np.random.Generator | None) = None) -> None: """ Initialize a dataset with data or by random data generation. Args: - data: The DataFrame or array of the data. + data: The data in pd.DataFrame or np.ndarray. randTypes: The method or methods for random data generation. - Supported methods: "uniform", "binary", "categorical", "normal", + Supported methods: "uniform", "binary", "categories", "normal", "multimodal", "skew", "blobs" size: The size of the dataset to create for random dataset generation or the size of the data (num rows, num columns). @@ -104,7 +106,8 @@ def __init__(self, data: ArrayLike = None, randTypes: str | list = None, seed: The random seed or generator for reproducibility. Raises: - ValueError: If no data or random generation method is specified. + ValueError: If no data or random generation method is specified or + if no size of data to generate is specified. """ # Initialize data if data is not None: # initialize with provided data @@ -114,16 +117,22 @@ def __init__(self, data: ArrayLike = None, randTypes: str | list = None, else: self.data = pd.DataFrame(data) elif randTypes is not None: # initialize with random data generation - if isinstance(randTypes, list): - self.data = pd.DataFrame({ - i: generate.randomData(randType, size, interval, seed) - for i, randType in enumerate(randTypes) - }) + if size is not None: + if isinstance(randTypes, list): + self.data = pd.DataFrame({ + i: generate.randomData(randType, size, interval, seed) + for i, randType in enumerate(randTypes) + }) + else: + self.data = generate.randomData(randTypes, + size, + interval, + seed) + if features is not None: + self.data.columns = features + self.size = size else: - self.data = generate.randomData(randTypes, size, interval, seed) - if features is not None: - self.data.columns = features - self.size = size + raise ValueError("No size of data to generate specified.") else: raise ValueError("No data or random generation method specified.") @@ -157,7 +166,7 @@ def preprocess(self, **parameters) -> None: else: setattr(self, name, preprocessor(self.dataArray)) - def scale(self, interval: tuple = None) -> None: + def scale(self, interval: (tuple | None) = None) -> None: """ Scales self.dataArray numpy array based on self.interval tuple @@ -179,16 +188,16 @@ def scale(self, interval: tuple = None) -> None: self.dataArray = self.dataArray * (interval[1] - interval[0]) self.dataArray += interval[0] - def discretize(self, bins: int | ArrayLike, features: list = None, - strategy: str = 'uniform', array: str = None) -> None: + def discretize(self, bins: int | ArrayLike, features: (list | None) = None, + strategy: Literal["uniform","quantile","kmeans"] = "uniform", + array: (str | None) = None) -> None: """ Discretize self.dataArray into bins. Arg: bins: Number of bins to use, bins in each feature, or bin edges. features: The features to use for the binning - strategy: sklearn KBinsDiscretizer strategy to use from 'uniform', - 'quantile', or 'kmeans'. + strategy: sklearn KBinsDiscretizer strategy to use. array: The array to assignt he result to. Raises: @@ -209,8 +218,8 @@ def discretize(self, bins: int | ArrayLike, features: list = None, setattr(self, array, discretizer.fit_transform(selected)) self.bins = bins - def encode(self, features: list = None, dimensions: int = 1, - array: str = None) -> None: + def encode(self, features: (list | None) = None, dimensions: int = 1, + array: (str | None) = None) -> None: """ One hot encodes self.dataArray with sklearn OneHotEncoder assuming data is discretized. @@ -263,8 +272,9 @@ class Subset(Base): A class for creating, storing, and handling subsets of datasets. """ - def __init__(self, dataset: Dataset, z: ArrayLike, solveTime: float = None, - loss: float = None) -> None: + def __init__(self, dataset: Dataset, z: ArrayLike, + solveTime: (float | None) = None, + loss: (float | None) = None) -> None: """ Initialize a subset with a Dataset object and the indicator vector z. diff --git a/flexibleSubsetSelection/solver.py b/flexibleSubsetSelection/solver.py index 5e54380..d76aaaa 100644 --- a/flexibleSubsetSelection/solver.py +++ b/flexibleSubsetSelection/solver.py @@ -21,7 +21,9 @@ class Solver(): solving algorithm and loss function, applied to calculate a subset. """ def __init__(self, algorithm: Callable, - lossFunction: loss.UniCriterion | loss.MultiCriterion = None, + lossFunction: (loss.UniCriterion | + loss.MultiCriterion | + None) = None, logPath: str = "../data/solverLog.csv") -> None: """ Initialize a subset selection solver with a solve algorithm and, diff --git a/flexibleSubsetSelection/timer.py b/flexibleSubsetSelection/timer.py index 081a934..feb097d 100644 --- a/flexibleSubsetSelection/timer.py +++ b/flexibleSubsetSelection/timer.py @@ -1,3 +1,4 @@ +# Based on https://realpython.com/python-timer/ # --- Imports ------------------------------------------------------------------ # Standard library @@ -18,7 +19,7 @@ def start(self): self._startTime = time.perf_counter() - def stop(self) -> float: + def stop(self) -> None: """Stop the timer, and return the elapsed time""" if self._startTime is None: raise TimerError(f"Timer is not running. Use .start() to start it")