diff --git a/.gitignore b/.gitignore
index be3e12e..c9972a6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -125,7 +125,9 @@ dmypy.json
 .pyre/
 
 # Ignore test_.py for testing
-/test_tools.py
+*test*.py
+*.sql
+*.html
 
 # Ignore .txt for testing
 /examples/*.txt
@@ -134,4 +136,9 @@ dmypy.json
 /examples/*.png
 
 # Ignore .pkl for testing
-/examples/*.pkl
\ No newline at end of file
+/examples/*.pkl
+
+*.csv
+*.xlsx
+
+labelencoder_dictionary.pkl
diff --git a/likelihood/VERSION b/likelihood/VERSION
index 7e099ec..5975b14 100644
--- a/likelihood/VERSION
+++ b/likelihood/VERSION
@@ -1 +1 @@
-1.2.6
\ No newline at end of file
+1.2.8
\ No newline at end of file
diff --git a/likelihood/graph/__init__.py b/likelihood/graph/__init__.py
new file mode 100644
index 0000000..84a54e6
--- /dev/null
+++ b/likelihood/graph/__init__.py
@@ -0,0 +1 @@
+from .graph import *
diff --git a/likelihood/graph/graph.py b/likelihood/graph/graph.py
new file mode 100644
index 0000000..7aa7b63
--- /dev/null
+++ b/likelihood/graph/graph.py
@@ -0,0 +1,50 @@
+from IPython.display import HTML, display
+from pandas.core.frame import DataFrame
+from pyvis.network import Network
+
+from likelihood.tools import FeatureSelection
+
+
+class DynamicGraph(FeatureSelection):
+    """A class to represent a dynamic feature-importance graph."""
+
+    def __init__(self, df: DataFrame, n_importances: int):
+        self.G = Network(
+            notebook=True, cdn_resources="remote", directed=True
+        )  # enables interactive visualization in Jupyter notebooks
+        self.df = df
+        self.n_importances = n_importances
+        super().__init__()
+
+    def fit(self, **kwargs) -> None:
+        """Fit the model according to the given data and parameters."""
+        self.get_digraph(self.df, self.n_importances)
+        # Map the dataframe's column names to integer node ids.
+        self.get_index = dict(zip(self.df.columns, range(len(self.df.columns))))
+        self._make_network()
+
+    def _make_network(self) -> None:
+        """Create the nodes and edges of the network from the feature-importance scores."""
+        self._add_nodes()
+        for i, (_, edges) in enumerate(self.all_features_imp_graph):
+            for label, weight in edges:
+                self.G.add_edge(i, self.get_index[label], weight=weight)
+
+    def _add_nodes(self) -> None:
+        for i, (node, _) in enumerate(self.all_features_imp_graph):
+            self.G.add_node(n_id=i, label=node)
+
+    def draw(self, name="graph.html", **kwargs) -> None:
+        """Render the network as HTML and display it."""
+        spring_length = kwargs.get("spring_length", 500)
+        node_distance = kwargs.get("node_distance", 100)
+        self.G.repulsion(node_distance=node_distance, spring_length=spring_length)
+        self.G.show_buttons(filter_=["physics"])
+        self.G.show(name)
+
+        with open(name, "r") as f:
+            display(HTML(f.read()))
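Note for reviewers: a minimal usage sketch of the new `DynamicGraph` API follows. It is illustrative and not part of the diff; the DataFrame contents are made up, and `draw` assumes a Jupyter notebook since it displays the generated HTML inline.

```python
# Hypothetical usage sketch for DynamicGraph (not part of the diff).
import pandas as pd

from likelihood.graph import DynamicGraph

# Illustrative data; any small numeric DataFrame should work.
df = pd.DataFrame(
    {
        "age": [23, 45, 31, 52, 40],
        "income": [30_000, 64_000, 48_000, 81_000, 59_000],
        "spend": [1_200, 2_500, 1_900, 3_100, 2_300],
    }
)

graph = DynamicGraph(df, n_importances=2)
graph.fit()                    # runs feature selection and builds the pyvis network
graph.draw(name="graph.html")  # writes the HTML file and renders it in the notebook
```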
diff --git a/likelihood/models/__init__.py b/likelihood/models/__init__.py
index 7a789b0..3ed6e3c 100644
--- a/likelihood/models/__init__.py
+++ b/likelihood/models/__init__.py
@@ -1 +1,2 @@
 from .regression import *
+from .simulation import *
diff --git a/likelihood/models/simulation.py b/likelihood/models/simulation.py
new file mode 100644
index 0000000..71b1409
--- /dev/null
+++ b/likelihood/models/simulation.py
@@ -0,0 +1,44 @@
+import matplotlib.pyplot as plt
+import numpy as np
+from numpy import ndarray
+from pandas.core.frame import DataFrame
+
+from likelihood.tools import FeatureSelection, OneHotEncoder
+
+# --------------------------------------------------------------------------------------------------------------------------------------
+
+
+class SimulationEngine(FeatureSelection):
+
+    def __init__(self, df: DataFrame, n_importances: int, **kwargs):
+        self.df = df
+        self.n_importances = n_importances
+        super().__init__(**kwargs)
+
+    def predict(self, column: str, n: int | None = None) -> ndarray | list:
+        # Look up the entries of the dictionary corresponding to the column.
+        w, quick_encoder, names_cols, dfe = self.w_dict[column]
+
+        df_aux = dfe._df
+
+        if n is not None:
+            df_aux = df_aux.iloc[:n, :]
+
+        y = df_aux.to_numpy() @ w
+
+        if quick_encoder is not None:
+            # Categorical column: decode the one-hot predictions back to labels.
+            one_hot = OneHotEncoder()
+            y = one_hot.decode(y)
+            encoding_dic = quick_encoder.decoding_list[0]
+            y = [encoding_dic[item] for item in y]
+
+        return y
+
+    def fit(self, **kwargs) -> None:
+        # Run the feature selection algorithm.
+        self.get_digraph(self.df, self.n_importances)
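The `SimulationEngine` workflow can be sketched the same way: `fit` runs the feature-selection pass that populates `w_dict`, after which `predict` regenerates a column from the stored weights. Again a hypothetical sketch, not part of the diff; the DataFrame, column name, and `n` are illustrative.

```python
# Hypothetical usage sketch for SimulationEngine (not part of the diff).
import pandas as pd

from likelihood.models import SimulationEngine

df = pd.DataFrame(
    {
        "height": [1.62, 1.75, 1.80, 1.68, 1.71],
        "weight": [58, 74, 82, 63, 70],
        "category": ["a", "b", "b", "a", "a"],
    }
)

engine = SimulationEngine(df, n_importances=2)
engine.fit()  # populates engine.w_dict via the feature-selection pass

# Reconstruct the first three entries of a column from the stored weights.
simulated = engine.predict(column="weight", n=3)
print(simulated)
```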
diff --git a/likelihood/tools/tools.py b/likelihood/tools/tools.py
index 677b922..a03f1ab 100644
--- a/likelihood/tools/tools.py
+++ b/likelihood/tools/tools.py
@@ -364,7 +364,7 @@ def sigmoide(x: float) -> float:
 class LogisticRegression:
     """class implementing multiple logistic regression"""
 
-    __slots__ = ["importance", "X", "y"]
+    __slots__ = ["importance", "X", "y", "w"]
 
     def __init__(self) -> None:
         """The class initializer"""
@@ -394,14 +394,14 @@ def fit(self, dataset: ndarray, values: ndarray) -> None:
         U, S, VT = np.linalg.svd(self.X, full_matrices=False)
         inverse_sig = np.vectorize(sigmoide_inv)
 
-        w = (VT.T @ np.linalg.inv(np.diag(S)) @ U.T).T @ inverse_sig(self.y)
+        self.w = (VT.T @ np.linalg.inv(np.diag(S)) @ U.T).T @ inverse_sig(self.y)
 
         if self.y.shape[1] > 1:
-            for row in w:
+            for row in self.w:
                 self.importance.append(np.around(np.max(row), decimals=8))
         else:
             for i in range(self.X.shape[0]):
-                a = np.around(w[i], decimals=8)
+                a = np.around(self.w[i], decimals=8)
                 self.importance.append(a)
 
     def predict(self, datapoints: ndarray) -> ndarray:
@@ -443,7 +443,7 @@ def get_importances(self, print_important_features: bool = False) -> ndarray:
 class LinearRegression:
     """class implementing multiple linear regression"""
 
-    __slots__ = ["importance", "X", "y"]
+    __slots__ = ["importance", "X", "y", "w"]
 
     def __init__(self) -> None:
         """The class initializer"""
@@ -471,10 +471,10 @@ def fit(self, dataset: ndarray, values: ndarray, verbose: bool = False) -> None:
         self.y = values
 
         U, S, VT = np.linalg.svd(self.X, full_matrices=False)
-        w = (VT.T @ np.linalg.inv(np.diag(S)) @ U.T).T @ self.y
+        self.w = (VT.T @ np.linalg.inv(np.diag(S)) @ U.T).T @ self.y
 
         for i in range(self.X.shape[0]):
-            a = np.around(w[i], decimals=8)
+            a = np.around(self.w[i], decimals=8)
             self.importance.append(a)
 
         if verbose:
@@ -517,7 +517,7 @@ def get_importances(self, print_important_features: bool = False) -> ndarray:
     return np.array(self.importance)
 
-def cal_average(y, alpha: float = 1):
+def cal_average(y: ndarray, alpha: float = 1):
     """Calculates the moving average of the data
 
     Parameters
@@ -621,7 +621,7 @@ def scale(self, dataset_: ndarray) -> ndarray:
         return dataset_
 
-def generate_series(n, n_steps: int, incline: bool = True):
+def generate_series(n: int, n_steps: int, incline: bool = True):
     """Function that generates `n` series of length `n_steps`"""
 
     freq1, freq2, offsets1, offsets2 = np.random.rand(4, n, 1)
@@ -882,94 +882,113 @@ def _confu_mat(self, y_true: ndarray, y_pred: ndarray, labels: list) -> ndarray:
 
         return count_mat
 
-def one_hot_encoding(x:ndarray | list) -> ndarray:
-    """
-    Calculates the one-hot encoding on a numpy array. Only accepts array of intergers as labels
-
-    Parameters
-    ----------
-    x : `np.array`
-        An array containing the data.
-
-    Returns
-    -------
-    y : `ndarray`
-        The one hot encodig matrix of x.
-    """
-    if not isinstance(x, ndarray):
-        x = np.array(x) #If not numpy array then convert it
-
-    y = np.zeros((x.size, x.max() + 1)) #Build matrix of (size num of entries) x (max value + 1)
-
-    y[np.arange(x.size), x] = 1 #Label with ones
-
-    return y
+class OneHotEncoder:
+
+    __slots__ = ["x"]
+
+    def __init__(self) -> None:
+        pass
+
+    def encode(self, x: ndarray | list) -> ndarray:
+        self.x = x
+
+        if not isinstance(self.x, ndarray):
+            self.x = np.array(self.x)  # If not a numpy array, convert it
+
+        # Build a matrix of shape (number of entries) x (max value + 1)
+        y = np.zeros((self.x.size, self.x.max() + 1))
+        y[np.arange(self.x.size), self.x] = 1  # Label with ones
+
+        return y
+
+    def decode(self, x: ndarray | list) -> ndarray:
+        if not isinstance(x, ndarray):
+            x = np.array(x)  # If not a numpy array, convert it
+
+        # Return the index of the maximum value in each row
+        y = np.argmax(x, axis=1)
+
+        return y
 
 class FeatureSelection:
     """
-    Class with method to obtain feature selection of a dataset. Returns string
+    Generates the data graph using a variation of the feature selection algorithm.
+
+    - The method `get_digraph` returns the network based on the feature selection method.
     """
 
-    __slots__ = ["not_features", "X"]
+    __slots__ = ["not_features", "X", "all_features_imp_graph", "w_dict", "scaler"]
 
-    def __init__(self, not_features: list = []) -> None:
-        """The class initializer. The initial parameter is a string with variables"""
-        self.not_features = not_features
+    def __init__(self, not_features: list[str] = []) -> None:
+        """The class initializer. The initial parameter is a list of names of variables to discard."""
+        self.not_features: list[str] = not_features
+        self.all_features_imp_graph: list[tuple] = []
+        self.w_dict = dict()
 
-    def feature_selection(self, dataset: DataFrame, n_importances: int) -> str:
+    def get_digraph(self, dataset: DataFrame, n_importances: int) -> str:
+        """
+        Get a directed graph showing the importance of the features.
 
-        # Asignar y limpiar dataset
-        self.load_data(dataset)
+        Args:
+            dataset (`DataFrame`): Dataset to be used for generating the graph.
+            n_importances (`int`): Number of top importances to show in the graph.
+
+        Returns:
+            A string representation of the directed graph.
+        """
+        # Assign and clean the dataset
+        self._load_data(dataset)
         curr_dataset = self.X
         columns = list(curr_dataset.columns)
 
-        # Construimos string de causal_graph
+        # Build the causal-graph string
         feature_string = " digraph { "
         for column in columns:
             feature_string += column + "; "
 
         numeric_df = curr_dataset.select_dtypes(include="number")
-        scaler = DataScaler(numeric_df.copy().to_numpy(), n=None)
-        numeric_scaled = scaler.rescale()
+        self.scaler = DataScaler(numeric_df.copy().to_numpy(), n=None)
+        numeric_scaled = self.scaler.rescale()
         numeric_df = pd.DataFrame(numeric_scaled, columns=numeric_df.columns)
         curr_dataset[numeric_df.columns] = numeric_df
-        # Iteramos sobre todas las columnas para obtener sus importances
-        for column in columns:
+        # Iterate over all the columns to obtain their importances
+        for index_column, column in enumerate(columns):
 
-            # Variable a predecir
+            # Variable to predict
             Y = curr_dataset[column]
 
-            # Verificamos si es numerica o es categorica
+            # Check whether the column is numerical or categorical
             column_type = Y.dtype
             if column_type != "object":
-                # Modelo de regresion lineal
+                # Linear regression model
                 Model = LinearRegression()
-                # Dataset auxiliar sin la columna en cuestion
+                # Auxiliary dataset without the column in question
                 X_aux = curr_dataset.drop([column], axis=1)
-                # Codificamos
+                # Encode
                 dfe = DataFrameEncoder(X_aux)
                 encoded_df = dfe.encode(save_mode=False)
-                # Entrenamos
+                # Train
                 Model.fit(encoded_df.to_numpy().T, Y.to_numpy().T)
-                # Obtenemos importance
+                # Obtain the importances
                 importance = Model.get_importances()
+                w = Model.w
             else:
                 Model = LogisticRegression()
                 num_unique_entries = curr_dataset[column].nunique()
                 quick_encoder = DataFrameEncoder(Y.to_frame())
                 encoded_Y = quick_encoder.encode(save_mode=False)
-                # Mapeamos a one-hot
-                train_y = one_hot_encoding(encoded_Y[column])
-                # PASAMOS 0 -> 0.5 y 1 -> 0.73105
+
+                # Map to one-hot
+                one_hot = OneHotEncoder()
+                train_y = one_hot.encode(encoded_Y[column])
+                # Map through the sigmoid: 0 -> 0.5 and 1 -> 0.73105
                 for i in range(len(train_y)):
                     for j in range(num_unique_entries):
                         if train_y[i][j] == 1.0:
@@ -977,28 +996,41 @@ def feature_selection(self, dataset: DataFrame, n_importances: int) -> str:
                         else:
                             train_y[i][j] = 0.5
 
-                # Eliminamos la columna en cuestión
+                # Drop the column in question
                 X_aux = curr_dataset.drop([column], axis=1)
-                # Codificamos
+                # Encode
                 dfe = DataFrameEncoder(X_aux)
                 encoded_df = dfe.encode(save_mode=False)
-                # Entrenamos
+                # Train
                 Model.fit(encoded_df.to_numpy().T, train_y)
-                # Obtenemos importancias
+                # Obtain the importances
                 importance = Model.get_importances()
+                w = Model.w
 
-            # Obtenemos las n más importantes
+            # Obtain the n most important features
             top_n_indexes = sorted(
                 range(len(importance)), key=lambda i: importance[i], reverse=True
            )[:n_importances]
 
-            # Construimos el string de la columna en cuestión
+            # Build the string for the column in question
             names_cols = list(X_aux.columns)
+            # Store the indices, values, and column names in a list of tuples
+            features_imp_node = [
+                (names_cols[top_n_indexes[i]], importance[top_n_indexes[i]])
+                for i in range(n_importances)
+            ]
 
-            # Lo formateamos
+            # Store the weights for later predictions
+            if column_type != "object":
+                self.w_dict[column] = (w, None, names_cols, dfe)
+            else:
+                self.w_dict[column] = (w, quick_encoder, names_cols, dfe)
+            # Add to the general list
+            self.all_features_imp_graph.append((column, features_imp_node))
+            # Format it
             for i in top_n_indexes:
                 feature_string += names_cols[i] + " -> "
@@ -1006,12 +1038,12 @@ def feature_selection(self, dataset: DataFrame, n_importances: int) -> str:
 
         return feature_string + "} "
 
-    def load_data(self, dataset: DataFrame):
-        # Asignamos datos y limpiamos dataset de columnas no requeridas
+    def _load_data(self, dataset: DataFrame):
+        # Assign the data and clean the dataset of unneeded columns
         if len(self.not_features) > 0:
-            # Quitamos columnas innecesarias
-            self.X = dataset.drop(self.not_features, axis=1)
+            # Remove unnecessary columns
+            self.X = dataset.drop(columns=self.not_features)
         else:
             self.X = dataset
diff --git a/requirements.txt b/requirements.txt
index 8187fec..21a2fbe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,5 @@ isort==5.12.0
 numpy<2.0.0
 matplotlib
-corner
\ No newline at end of file
+corner
+pyvis
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 580a9c3..38a75a8 100644
--- a/setup.py
+++ b/setup.py
@@ -27,6 +27,7 @@
         "numpy<2.0.0",
         "matplotlib",
         "corner",
+        "pyvis",
     ],
     classifiers=[
         "Programming Language :: Python :: 3",
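Since the free function `one_hot_encoding` is replaced by the `OneHotEncoder` class above, a quick encode/decode round trip illustrates the new API. This is a sketch, not part of the diff; the label array is made up.

```python
# Hypothetical round trip for the new OneHotEncoder (not part of the diff).
import numpy as np

from likelihood.tools import OneHotEncoder

labels = np.array([0, 2, 1, 2])  # integer labels, as encode expects

encoder = OneHotEncoder()
one_hot = encoder.encode(labels)   # shape (4, 3): one column per label value
decoded = encoder.decode(one_hot)  # argmax per row recovers the labels

assert (decoded == labels).all()
print(one_hot)
```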