diff --git a/.gitignore b/.gitignore
index 6fa2352..43c4a88 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,5 @@ all_recommended_organisms.pkl
 df_organisms_selection.pkl
 models
 test_genes
-grid_search_results.csv
\ No newline at end of file
+grid_search_results.csv
+outputs
\ No newline at end of file
diff --git a/main.py b/main.py
index 9c8632b..92dd1ec 100644
--- a/main.py
+++ b/main.py
@@ -4,22 +4,32 @@ from src.utils import load_dataframe
 
 print("loading data...")
 
-# load data
+# load the data
 df = load_dataframe("df_organisms_selection.pkl")
 
 chunk_sizes = [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000]
 ks = [2, 3, 4, 5, 6, 7, 8]
-rf_params = {"n_estimators": 100, "max_depth": None, "random_state": 42}
+# chunk_sizes = [500, 1000, 1500]
+# ks = [2, 3, 4]
 
-# create directories for saving outputs
+rf_params = {
+    "n_estimators": 100,      # 100 estimators
+    "max_depth": 15,          # limit the depth of trees
+    "max_features": 'sqrt',   # use the square root of the number of features
+    "min_samples_split": 10,  # minimum number to split a node
+    "min_samples_leaf": 5,    # minimum number of samples per leaf
+    "random_state": 42
+}
+
+# create directories to save the results
 os.makedirs("models", exist_ok=True)
 os.makedirs("test_genes", exist_ok=True)
 
-# train models for all parameter combinations
+# train the models for all parameter combinations
 results = []
 for chunk_size in chunk_sizes:
     for k in ks:
-        accuracy = evaluate_and_save_model(
+        metrics = evaluate_and_save_model(
             df,
             chunk_size,
             k,
@@ -28,8 +38,9 @@
             model_dir="models",
             test_gene_dir="test_genes"
         )
-        results.append({"chunk_size": chunk_size, "k": k, "accuracy": accuracy})
+        results.append(metrics)
 
-# save grid search results
+# save the results to a CSV file
 results_df = pd.DataFrame(results)
 results_df.to_csv("grid_search_results.csv", index=False)
+print("results saved")
diff --git a/src/model_training.py b/src/model_training.py
index 4d7ab8d..7a0cac3 100644
--- a/src/model_training.py
+++ b/src/model_training.py
@@ -3,45 +3,54 @@ import pandas as pd
 import numpy as np
 from sklearn.ensemble import RandomForestClassifier
-from sklearn.model_selection import train_test_split
+from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
 from sklearn.metrics import accuracy_score
 
 
 def evaluate_and_save_model(df, chunk_size, k, rf_params, overlap, model_dir, test_gene_dir):
     """
-    train and save a random forest model for a specific parameter combination
-    also saves test genes in their raw form (before signatures)
+    train and save a RandomForest model for a specific parameter combination,
+    and also save the test genes in their raw form.
     """
     from .sequence_operations import split_gens_over_all_genomes
     from .kmer_signatures import signature_for_all_genes
 
-    print(f"Training model for chunk_size={chunk_size}, k={k}...")
+    print(f"training model for chunk_size={chunk_size}, k={k}...")
 
     # extract genes from genomes
     X_raw, y = split_gens_over_all_genomes(df, chunk_size=chunk_size, overlap=overlap)
-    print(f"Generated genes: {len(X_raw)}")
+    print(f"generated genes: {len(X_raw)}")
 
-    # sign the genes with k-mers
+    # sign genes with k-mers
     X_signature = signature_for_all_genes(X_raw, k)
-    print("Generated signatures.")
+    print("generated signatures.")
 
     # split into training and testing sets
     X_train_sig, X_test_sig, y_train, y_test, X_train_raw, X_test_raw = train_test_split(
         X_signature, y, X_raw, test_size=0.2, random_state=42
     )
 
-    # train random forest model
+    # train the RandomForest model
     rf_model = RandomForestClassifier(**rf_params)
     rf_model.fit(X_train_sig, y_train)
 
-    # evaluate the model
+    # evaluation with 5-fold cross-validation
+    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # 5-fold cross-validation
+    cv_scores = cross_val_score(rf_model, X_train_sig, y_train, cv=cv, scoring='accuracy')
+
+    # mean accuracy from cross-validation
+    cv_mean_accuracy = np.mean(cv_scores)
+    cv_std_accuracy = np.std(cv_scores)
+
+    # evaluate the model on the test set
     y_pred = rf_model.predict(X_test_sig)
-    accuracy = accuracy_score(y_test, y_pred)
-    print(f"Accuracy for chunk_size={chunk_size}, k={k}: {accuracy}")
+    test_accuracy = accuracy_score(y_test, y_pred)
+    print(f"test set accuracy: {test_accuracy}")
+    print(f"cross-validation mean accuracy: {cv_mean_accuracy} ± {cv_std_accuracy}")
 
     # save the model
     model_path = os.path.join(model_dir, f"model_chunk{chunk_size}_k{k}.joblib")
     joblib.dump(rf_model, model_path)
-    print(f"Model saved to {model_path}")
+    print(f"model saved to {model_path}")
 
     # select random test genes and save them
     selected_indices = np.random.choice(len(X_test_raw), size=10, replace=False)
@@ -54,6 +63,13 @@ def evaluate_and_save_model(df, chunk_size, k, rf_params, overlap, model_dir, te
     })
     test_gene_path = os.path.join(test_gene_dir, f"test_genes_chunk{chunk_size}_k{k}.csv")
     test_genes_df.to_csv(test_gene_path, index=False)
-    print(f"Test genes saved to {test_gene_path}")
+    print(f"test genes saved to {test_gene_path}")
 
-    return accuracy
+    # return the metrics to save in the report
+    return {
+        "chunk_size": chunk_size,
+        "k": k,
+        "test_accuracy": test_accuracy,
+        "cv_mean_accuracy": cv_mean_accuracy,
+        "cv_std_accuracy": cv_std_accuracy
+    }