Skip to content

Commit

Permalink
avoid overfitting
Browse files: browse the repository at this point in the history
  • Loading branch information
parakawa committed Jan 13, 2025
1 parent 9ea5146 commit dbc0f79
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 22 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ all_recommended_organisms.pkl
df_organisms_selection.pkl
models
test_genes
grid_search_results.csv
grid_search_results.csv
outputs
25 changes: 18 additions & 7 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,32 @@
from src.utils import load_dataframe

print("loading data...")
# load data
# load the data
df = load_dataframe("df_organisms_selection.pkl")

chunk_sizes = [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000]
ks = [2, 3, 4, 5, 6, 7, 8]
rf_params = {"n_estimators": 100, "max_depth": None, "random_state": 42}
# chunk_sizes = [500, 1000, 1500]
# ks = [2, 3, 4]

# create directories for saving outputs
rf_params = {
"n_estimators": 100, # 100 estimators
"max_depth": 15, # limit the depth of trees
"max_features": 'sqrt', # use the square root of the number of features
"min_samples_split": 10, # minimum number to split a node
"min_samples_leaf": 5, # minimum number of samples per leaf
"random_state": 42
}

# create directories to save the results
os.makedirs("models", exist_ok=True)
os.makedirs("test_genes", exist_ok=True)

# train models for all parameter combinations
# train the models for all parameter combinations
results = []
for chunk_size in chunk_sizes:
for k in ks:
accuracy = evaluate_and_save_model(
metrics = evaluate_and_save_model(
df,
chunk_size,
k,
Expand All @@ -28,8 +38,9 @@
model_dir="models",
test_gene_dir="test_genes"
)
results.append({"chunk_size": chunk_size, "k": k, "accuracy": accuracy})
results.append(metrics)

# save grid search results
# save the results to a CSV file
results_df = pd.DataFrame(results)
results_df.to_csv("grid_search_results.csv", index=False)
print("results saved")
44 changes: 30 additions & 14 deletions src/model_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,45 +3,54 @@
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score

def evaluate_and_save_model(df, chunk_size, k, rf_params, overlap, model_dir, test_gene_dir):
"""
train and save a random forest model for a specific parameter combination
also saves test genes in their raw form (before signatures)
train and save a RandomForest model for a specific parameter combination,
and also save the test genes in their raw form.
"""
from .sequence_operations import split_gens_over_all_genomes
from .kmer_signatures import signature_for_all_genes

print(f"Training model for chunk_size={chunk_size}, k={k}...")
print(f"training model for chunk_size={chunk_size}, k={k}...")

# extract genes from genomes
X_raw, y = split_gens_over_all_genomes(df, chunk_size=chunk_size, overlap=overlap)
print(f"Generated genes: {len(X_raw)}")
print(f"generated genes: {len(X_raw)}")

# sign the genes with k-mers
# sign genes with k-mers
X_signature = signature_for_all_genes(X_raw, k)
print("Generated signatures.")
print("generated signatures.")

# split into training and testing sets
X_train_sig, X_test_sig, y_train, y_test, X_train_raw, X_test_raw = train_test_split(
X_signature, y, X_raw, test_size=0.2, random_state=42
)

# train random forest model
# train the RandomForest model
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train_sig, y_train)

# evaluate the model
# evaluation with 5-fold cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) # 5-fold cross-validation
cv_scores = cross_val_score(rf_model, X_train_sig, y_train, cv=cv, scoring='accuracy')

# mean and standard deviation of the cross-validation accuracy
cv_mean_accuracy = np.mean(cv_scores)
cv_std_accuracy = np.std(cv_scores)

# evaluate the model on the test set
y_pred = rf_model.predict(X_test_sig)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy for chunk_size={chunk_size}, k={k}: {accuracy}")
test_accuracy = accuracy_score(y_test, y_pred)
print(f"test set accuracy: {test_accuracy}")
print(f"cross-validation mean accuracy: {cv_mean_accuracy} ± {cv_std_accuracy}")

# save the model
model_path = os.path.join(model_dir, f"model_chunk{chunk_size}_k{k}.joblib")
joblib.dump(rf_model, model_path)
print(f"Model saved to {model_path}")
print(f"model saved to {model_path}")

# select random test genes and save them
selected_indices = np.random.choice(len(X_test_raw), size=10, replace=False)
Expand All @@ -54,6 +63,13 @@ def evaluate_and_save_model(df, chunk_size, k, rf_params, overlap, model_dir, te
})
test_gene_path = os.path.join(test_gene_dir, f"test_genes_chunk{chunk_size}_k{k}.csv")
test_genes_df.to_csv(test_gene_path, index=False)
print(f"Test genes saved to {test_gene_path}")
print(f"test genes saved to {test_gene_path}")

return accuracy
# return the metrics to save in the report
return {
"chunk_size": chunk_size,
"k": k,
"test_accuracy": test_accuracy,
"cv_mean_accuracy": cv_mean_accuracy,
"cv_std_accuracy": cv_std_accuracy
}

0 comments on commit dbc0f79

Please sign in to comment.