Bug fixes and code clean-up (#46)
jzsmoreno authored Jan 19, 2025
1 parent 74b109a commit 23df6b1
Showing 8 changed files with 231 additions and 98 deletions.
106 changes: 55 additions & 51 deletions examples/Deep_Models.ipynb

Large diffs are not rendered by default.

168 changes: 146 additions & 22 deletions examples/Simulation_Models.ipynb
@@ -28,24 +28,104 @@
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Color Tamaño Género Edad Precio Cantidad\n",
"0 Rojo Pequeño Mujer 25 100.50 5\n",
"1 Azul Mediano Hombre 30 150.75 10\n",
"2 Rojo Grande Mujer 22 200.00 3\n",
"3 Verde Mediano Hombre 35 80.25 8\n",
"4 Rojo Pequeño Mujer 28 120.00 7\n"
]
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Color</th>\n",
" <th>Tamaño</th>\n",
" <th>Género</th>\n",
" <th>Edad</th>\n",
" <th>Precio</th>\n",
" <th>Cantidad</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Rojo</td>\n",
" <td>Pequeño</td>\n",
" <td>Mujer</td>\n",
" <td>25</td>\n",
" <td>100.50</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Azul</td>\n",
" <td>Mediano</td>\n",
" <td>Hombre</td>\n",
" <td>30</td>\n",
" <td>150.75</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Rojo</td>\n",
" <td>Grande</td>\n",
" <td>Mujer</td>\n",
" <td>22</td>\n",
" <td>200.00</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Verde</td>\n",
" <td>Mediano</td>\n",
" <td>Hombre</td>\n",
" <td>35</td>\n",
" <td>80.25</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Rojo</td>\n",
" <td>Pequeño</td>\n",
" <td>Mujer</td>\n",
" <td>28</td>\n",
" <td>120.00</td>\n",
" <td>7</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Color Tamaño Género Edad Precio Cantidad\n",
"0 Rojo Pequeño Mujer 25 100.50 5\n",
"1 Azul Mediano Hombre 30 150.75 10\n",
"2 Rojo Grande Mujer 22 200.00 3\n",
"3 Verde Mediano Hombre 35 80.25 8\n",
"4 Rojo Pequeño Mujer 28 120.00 7"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load data from a parquet file into a pandas DataFrame\n",
"data = pd.read_parquet(\"./sample_data.parquet\")\n",
"df = pd.DataFrame(data)\n",
"# Display the first few rows of the dataframe to verify it's loaded correctly\n",
"print(df.head())"
"df.head()"
]
},
{
@@ -54,19 +134,63 @@
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Color Tamaño Género Edad Precio Cantidad\n",
"5 Amarillo Grande Hombre 40 250.0 4\n"
]
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Color</th>\n",
" <th>Tamaño</th>\n",
" <th>Género</th>\n",
" <th>Edad</th>\n",
" <th>Precio</th>\n",
" <th>Cantidad</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Azul</td>\n",
" <td>Pequeño</td>\n",
" <td>Hombre</td>\n",
" <td>29</td>\n",
" <td>125.0</td>\n",
" <td>8</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Color Tamaño Género Edad Precio Cantidad\n",
"12 Azul Pequeño Hombre 29 125.0 8"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Sample one row from the dataframe for demonstration purposes\n",
"df_aux = df.sample(1)\n",
"# Display the sampled row\n",
"print(df_aux.head())"
"df_aux.head()"
]
},
{
@@ -129,8 +253,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"The predicted value is 225.39\n",
"The probability of this value is: 15.75%\n",
"The predicted value is 81.15\n",
"The probability of this value is: 23.84%\n",
"The variable is an inlier\n"
]
}
@@ -180,7 +304,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"The probability of this value is: 15.75%\n",
"The probability of this value is: 23.84%\n",
"The variable is an inlier\n"
]
}
@@ -222,7 +346,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "base",
"language": "python",
"name": "python3"
},
2 changes: 1 addition & 1 deletion likelihood/VERSION
@@ -1 +1 @@
1.2.25
1.3.0
6 changes: 3 additions & 3 deletions likelihood/graph/graph.py
@@ -45,8 +45,8 @@ def _add_nodes(self) -> None:

def draw(self, name="graph.html", **kwargs) -> None:
"""Display the network using HTML format"""
spring_length = kwargs["spring_length"] if "spring_length" in kwargs else 500
node_distance = kwargs["node_distance"] if "node_distance" in kwargs else 100
spring_length = kwargs.get("spring_length", 500)
node_distance = kwargs.get("node_distance", 100)
self.G.repulsion(node_distance=node_distance, spring_length=spring_length)
self.G.show_buttons(filter_=["physics"])
self.G.show(name)
@@ -89,5 +89,5 @@ def pyvis_to_networkx(self):
df["y"] = y
# Instantiate DynamicGraph
fs = DynamicGraph(df, n_importances=2)
print(fs.fit())
fs.fit()
fs.draw()
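
Note: `draw` now resolves its layout options with `kwargs.get`, so `spring_length` and `node_distance` fall back to 500 and 100 when omitted. A minimal usage sketch, assuming a `DynamicGraph` instance `fs` built as in the demo above (the output file name is illustrative):

```python
# Defaults apply when the keyword arguments are omitted:
fs.draw()  # spring_length=500, node_distance=100

# Or override them explicitly:
fs.draw("custom_graph.html", spring_length=300, node_distance=150)
```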
2 changes: 1 addition & 1 deletion likelihood/graph/nn.py
@@ -96,7 +96,7 @@ def cal_adjacency_matrix(

assert len(df_categorical) > 0

similarity = kwargs["similarity"] if "similarity" in kwargs else len(df_categorical.columns) - 1
similarity = kwargs.get("similarity", len(df_categorical.columns) - 1)
assert similarity <= df_categorical.shape[1]

adj_dict = {}
25 changes: 22 additions & 3 deletions likelihood/models/deep/autoencoders.py
@@ -79,6 +79,8 @@ def __init__(self, input_shape_parm, num_classes, units, activation, **kwargs):
The activation function to use for the classifier layer. Default is "softmax". If the activation function is not a classification function, the model can be used in regression problems.
num_layers : `int`
The number of hidden layers in the classifier. Default is 1.
dropout : `float`
The dropout rate to use in the classifier. Default is None.
"""
super(AutoClassifier, self).__init__()
self.input_shape_parm = input_shape_parm
@@ -91,6 +93,7 @@ def __init__(self, input_shape_parm, num_classes, units, activation, **kwargs):
self.classifier = None
self.classifier_activation = kwargs.get("classifier_activation", "softmax")
self.num_layers = kwargs.get("num_layers", 1)
self.dropout = kwargs.get("dropout", None)

def build(self, input_shape):
self.encoder = tf.keras.Sequential(
@@ -113,6 +116,8 @@ def build(self, input_shape):
self.classifier.add(
tf.keras.layers.Dense(units=self.units, activation=self.activation)
)
if self.dropout:
self.classifier.add(tf.keras.layers.Dropout(self.dropout))
self.classifier.add(
tf.keras.layers.Dense(units=self.num_classes, activation=self.classifier_activation)
)
@@ -132,6 +137,7 @@ def get_config(self):
"activation": self.activation,
"classifier_activation": self.classifier_activation,
"num_layers": self.num_layers,
"dropout": self.dropout,
}
base_config = super(AutoClassifier, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
@@ -145,6 +151,7 @@ def from_config(cls, config):
activation=config["activation"],
classifier_activation=config["classifier_activation"],
num_layers=config["num_layers"],
dropout=config["dropout"],
)


@@ -156,6 +163,7 @@ def call_existing_code(
input_shape_parm: None | int = None,
num_classes: None | int = None,
num_layers: int = 1,
**kwargs,
) -> AutoClassifier:
"""
Calls an existing AutoClassifier instance.
@@ -180,12 +188,14 @@
`AutoClassifier`
The AutoClassifier instance.
"""
dropout = kwargs.get("dropout", None)
model = AutoClassifier(
input_shape_parm=input_shape_parm,
num_classes=num_classes,
units=units,
activation=activation,
num_layers=num_layers,
dropout=dropout,
)
model.compile(
optimizer=optimizer,
@@ -254,6 +264,11 @@ def build_model(
if "num_layers" not in hyperparameters_keys
else hyperparameters["num_layers"]
)
dropout = (
hp.Float("dropout", min_value=0.1, max_value=0.9, sampling="log")
if "dropout" not in hyperparameters_keys
else hyperparameters["dropout"]
)

model = call_existing_code(
units=units,
Expand All @@ -263,6 +278,7 @@ def build_model(
input_shape_parm=input_shape_parm,
num_classes=num_classes,
num_layers=num_layers,
dropout=dropout,
)
return model
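
Note: `hp.Float(..., sampling="log")` samples a value on a log scale between the bounds, so the new `dropout` hyperparameter is only searched when no override is supplied in `hyperparameters`. An illustrative sketch of that default behaviour, assuming `keras_tuner` is the tuner library in use:

```python
import keras_tuner as kt

# A fresh HyperParameters object returns each entry's default value;
# for a Float with no explicit default, that is min_value.
hp = kt.HyperParameters()
rate = hp.Float("dropout", min_value=0.1, max_value=0.9, sampling="log")
print(rate)  # 0.1
```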

@@ -408,10 +424,8 @@ def __init__(self, model: AutoClassifier, inputs: np.ndarray) -> None:
self.model = model
self.encoder_layer = self.model.encoder.layers[0]
self.decoder_layer = self.model.decoder.layers[0]
self.classifier_layer = self.model.classifier.layers[-2]
self.encoder_weights = self.encoder_layer.get_weights()[0]
self.decoder_weights = self.decoder_layer.get_weights()[0]
self.classifier_weights = self.classifier_layer.get_weights()[0]
colors = dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS)

by_hsv = sorted(
@@ -580,7 +594,12 @@ def _viz_tsne_repr(self, **kwargs) -> None:
y = np.asarray(y).astype(np.float32)

model = AutoClassifier(
input_shape_parm=X.shape[1], num_classes=3, units=27, activation="selu", num_layers=2
input_shape_parm=X.shape[1],
num_classes=3,
units=27,
activation="tanh",
num_layers=2,
dropout=0.2,
)
model.compile(
optimizer="adam",
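
Note: because `get_config`/`from_config` now round-trip the `dropout` field, a saved `AutoClassifier` restores it on reload. A minimal save/load sketch, for illustration only: the shapes, file name, and the forward pass used to build the weights are assumptions, and it presumes a Keras version that supports the native `.keras` format for subclassed models.

```python
import numpy as np
import tensorflow as tf

from likelihood.models.deep.autoencoders import AutoClassifier

model = AutoClassifier(
    input_shape_parm=6, num_classes=3, units=8, activation="tanh", num_layers=2, dropout=0.2
)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model(np.zeros((1, 6), dtype=np.float32))  # one forward pass so the layers are built

model.save("auto_classifier.keras")
restored = tf.keras.models.load_model(
    "auto_classifier.keras", custom_objects={"AutoClassifier": AutoClassifier}
)
print(restored.dropout)  # 0.2, restored via from_config
```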
6 changes: 0 additions & 6 deletions likelihood/models/simulation.py
@@ -2,31 +2,25 @@
import warnings
from typing import List, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.core.frame import DataFrame

from likelihood.tools import DataScaler, FeatureSelection, OneHotEncoder, cdf, check_nan_inf

# Suppress RankWarning
warnings.simplefilter("ignore", np.RankWarning)


# --------------------------------------------------------------------------------------------------------------------------------------
def categories_by_quartile(df: DataFrame, column: str) -> Tuple[str, str]:
# Count the frequency of each category in the column
freq = df[column].value_counts()

# Calculate the 25th percentile (Q1) and 75th percentile (Q3)
q1 = freq.quantile(0.25)
q3 = freq.quantile(0.75)

# Filter categories that are below the 25th percentile and above the 75th percentile
least_frequent = freq[freq <= q1]
most_frequent = freq[freq >= q3]

# Get the least frequent category (25th percentile) and the most frequent category (75th percentile)
least_frequent_category = least_frequent.idxmin() if not least_frequent.empty else None
most_frequent_category = most_frequent.idxmax() if not most_frequent.empty else None

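
Note: `categories_by_quartile` ranks a column's categories by frequency and returns the ones in the lower and upper quartile tails of those counts. A small usage sketch with illustrative data; it assumes the function is importable from `likelihood.models.simulation` and that the return statement (truncated in this diff) yields `(least_frequent_category, most_frequent_category)`.

```python
import pandas as pd

from likelihood.models.simulation import categories_by_quartile

# Frequencies: Rojo=3, Verde=2, Azul=1
df = pd.DataFrame({"Color": ["Rojo", "Rojo", "Rojo", "Verde", "Verde", "Azul"]})

least, most = categories_by_quartile(df, "Color")
print(least, most)  # Azul Rojo -- Azul sits at/below Q1 of the counts, Rojo at/above Q3
```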