avoid overfitting

khnam-ng · Jan 13, 2025 · dbc0f79 · dbc0f79
1 parent 9ea5146
commit dbc0f79
Show file tree

Hide file tree

Showing 3 changed files with 50 additions and 22 deletions.
diff --git a/.gitignore b/.gitignore
@@ -4,4 +4,5 @@ all_recommended_organisms.pkl
 df_organisms_selection.pkl
 models
 test_genes
-grid_search_results.csv
+grid_search_results.csv
+outputs
diff --git a/main.py b/main.py
@@ -4,22 +4,32 @@
 from src.utils import load_dataframe
 
 print("loading data...")
-# load data
+# load the data
 df = load_dataframe("df_organisms_selection.pkl")
 
 chunk_sizes = [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000]
 ks = [2, 3, 4, 5, 6, 7, 8]
-rf_params = {"n_estimators": 100, "max_depth": None, "random_state": 42}
+# chunk_sizes = [500, 1000, 1500]
+# ks = [2, 3, 4]
 
-# create directories for saving outputs
+rf_params = {
+    "n_estimators": 100,       # number of trees in the forest
+    "max_depth": 15,           # cap tree depth to limit overfitting
+    "max_features": 'sqrt',    # consider sqrt(n_features) features at each split
+    "min_samples_split": 10,   # minimum number of samples required to split a node
+    "min_samples_leaf": 5,     # minimum number of samples required at a leaf
+    "random_state": 42
+}
+
+# create directories to save the results
 os.makedirs("models", exist_ok=True)
 os.makedirs("test_genes", exist_ok=True)
 
-# train models for all parameter combinations
+# train the models for all parameter combinations
 results = []
 for chunk_size in chunk_sizes:
     for k in ks:
-        accuracy = evaluate_and_save_model(
+        metrics = evaluate_and_save_model(
             df,
             chunk_size,
             k,
@@ -28,8 +38,9 @@
             model_dir="models",
             test_gene_dir="test_genes"
         )
-        results.append({"chunk_size": chunk_size, "k": k, "accuracy": accuracy})
+        results.append(metrics)
 
-# save grid search results
+# save the results to a CSV file
 results_df = pd.DataFrame(results)
 results_df.to_csv("grid_search_results.csv", index=False)
+print("results saved")
diff --git a/src/model_training.py b/src/model_training.py
@@ -3,45 +3,54 @@
 import pandas as pd
 import numpy as np
 from sklearn.ensemble import RandomForestClassifier
-from sklearn.model_selection import train_test_split
+from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
 from sklearn.metrics import accuracy_score
 
 def evaluate_and_save_model(df, chunk_size, k, rf_params, overlap, model_dir, test_gene_dir):
     """
-    train and save a random forest model for a specific parameter combination
-    also saves test genes in their raw form (before signatures)
+    train, cross-validate and save a RandomForest model for one (chunk_size, k) combination,
+    save a sample of the test genes in their raw form, and return a dict of test and cross-validation metrics.
     """
     from .sequence_operations import split_gens_over_all_genomes
     from .kmer_signatures import signature_for_all_genes
 
-    print(f"Training model for chunk_size={chunk_size}, k={k}...")
+    print(f"training model for chunk_size={chunk_size}, k={k}...")
 
     # extract genes from genomes
     X_raw, y = split_gens_over_all_genomes(df, chunk_size=chunk_size, overlap=overlap)
-    print(f"Generated genes: {len(X_raw)}")
+    print(f"generated genes: {len(X_raw)}")
 
-    # sign the genes with k-mers
+    # sign genes with k-mers
     X_signature = signature_for_all_genes(X_raw, k)
-    print("Generated signatures.")
+    print("generated signatures.")
 
     # split into training and testing sets
     X_train_sig, X_test_sig, y_train, y_test, X_train_raw, X_test_raw = train_test_split(
         X_signature, y, X_raw, test_size=0.2, random_state=42
     )
 
-    # train random forest model
+    # train the RandomForest model
     rf_model = RandomForestClassifier(**rf_params)
     rf_model.fit(X_train_sig, y_train)
 
-    # evaluate the model
+    # evaluation with 5-fold cross-validation
+    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # 5-fold cross-validation
+    cv_scores = cross_val_score(rf_model, X_train_sig, y_train, cv=cv, scoring='accuracy')
+
+    # mean and standard deviation of the per-fold accuracies
+    cv_mean_accuracy = np.mean(cv_scores)
+    cv_std_accuracy = np.std(cv_scores)
+
+    # evaluate the model on the test set
     y_pred = rf_model.predict(X_test_sig)
-    accuracy = accuracy_score(y_test, y_pred)
-    print(f"Accuracy for chunk_size={chunk_size}, k={k}: {accuracy}")
+    test_accuracy = accuracy_score(y_test, y_pred)
+    print(f"test set accuracy: {test_accuracy}")
+    print(f"cross-validation mean accuracy: {cv_mean_accuracy} ± {cv_std_accuracy}")
 
     # save the model
     model_path = os.path.join(model_dir, f"model_chunk{chunk_size}_k{k}.joblib")
     joblib.dump(rf_model, model_path)
-    print(f"Model saved to {model_path}")
+    print(f"model saved to {model_path}")
 
     # select random test genes and save them
     selected_indices = np.random.choice(len(X_test_raw), size=10, replace=False)
@@ -54,6 +63,13 @@ def evaluate_and_save_model(df, chunk_size, k, rf_params, overlap, model_dir, te
     })
     test_gene_path = os.path.join(test_gene_dir, f"test_genes_chunk{chunk_size}_k{k}.csv")
     test_genes_df.to_csv(test_gene_path, index=False)
-    print(f"Test genes saved to {test_gene_path}")
+    print(f"test genes saved to {test_gene_path}")
 
-    return accuracy
+    # return the metrics to save in the report
+    return {
+        "chunk_size": chunk_size,
+        "k": k,
+        "test_accuracy": test_accuracy,
+        "cv_mean_accuracy": cv_mean_accuracy,
+        "cv_std_accuracy": cv_std_accuracy
+    }