From a7aca3cb0f684fcd12748ab33117809cb07f40d6 Mon Sep 17 00:00:00 2001 From: tomseimandi Date: Fri, 22 Mar 2024 10:53:02 +0000 Subject: [PATCH] Improve error analysis script --- notebooks/error_analysis.py | 155 ++++++++++++++++++++++++++++-------- 1 file changed, 123 insertions(+), 32 deletions(-) diff --git a/notebooks/error_analysis.py b/notebooks/error_analysis.py index fdc1676..51397e5 100644 --- a/notebooks/error_analysis.py +++ b/notebooks/error_analysis.py @@ -1,6 +1,7 @@ """ Script to analyse Camembert errors. """ +from typing import List, Tuple import sys sys.path.append("src/") @@ -8,8 +9,10 @@ import hvac import os import mlflow -from utils.data import get_test_data +from utils.data import get_test_data, get_file_system from sklearn.metrics import accuracy_score +import pandas as pd +import argparse def set_aws_credentials(): @@ -32,61 +35,149 @@ def set_aws_credentials(): return -def main(): +def set_mlflow_env(): """ - Conduct error analysis. + Set MLflow environment variables. """ - set_aws_credentials() - os.environ["MLFLOW_TRACKING_URI"] = "https://projet-ape-mlflow.user.lab.sspcloud.fr/" os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://minio.lab.sspcloud.fr" os.environ["MLFLOW_EXPERIMENT_NAME"] = "Test" + return - # Get fasttext model - fasttext_run_id = "e4a53fec83534420bec1cb747ef700a9" - fasttext_model = mlflow.pyfunc.load_model( - f"runs:/{fasttext_run_id}/default", - ) - # Get feature names - Y = mlflow.get_run(fasttext_run_id).data.params["Y"] - text_feature = mlflow.get_run(fasttext_run_id).data.params["text_feature"] +def get_run_params(run_id: str) -> Tuple: + """ + Get run parameters of a logged model. + + Args: + run_id (str): MLflow run ID. + + Returns: + Tuple: Run parameters. 
+ """ + text_feature = mlflow.get_run(run_id).data.params["text_feature"] categorical_features = [] for idx in range(1, 6): categorical_features.append( - mlflow.get_run(fasttext_run_id).data.params.get(f"categorical_features_{idx}") + mlflow.get_run(run_id).data.params.get(f"categorical_features_{idx}") ) - df_test = get_test_data() - query = df_test.head[[text_feature] + categorical_features].to_dict("list") + return text_feature, categorical_features + + +def get_fasttext_preds(df_test: pd.DataFrame, run_id: str) -> List[str]: + """ + Get predictions of a logged FastText model on a test + dataframe. + + Args: + df_test (pd.DataFrame): Test dataframe. + run_id (str): MLflow run ID of the FastText model. + + Returns: + List[str]: Predictions. + """ + # Get fasttext model + fasttext_model = mlflow.pyfunc.load_model( + f"runs:/{run_id}/default", + ) + + # Get feature names + text_feature, categorical_features = get_run_params(run_id) + + # Build query + query = df_test[[text_feature] + categorical_features].to_dict("list") + # Inference fasttext_output = fasttext_model.predict(query, {"k": 1}) + + # Post-process predictions fasttext_predictions = [ (x[0].replace("__label__", ""), y[0]) for x, y in zip(fasttext_output[0], fasttext_output[1]) ] fasttext_predictions = [x[0] for x in fasttext_predictions] - labels = df_test[Y].tolist() - fasttext_accuracy = accuracy_score(labels, fasttext_predictions) - print(fasttext_accuracy) + return fasttext_predictions - # Camembert - run_id = "b4b1f22889844ff085cba81a3ae0b4ec" + +def get_camembert_preds(df_test: pd.DataFrame, run_id: str) -> List[str]: + """ + Get predictions of a logged Camembert model on a test + dataframe. + + Args: + df_test (pd.DataFrame): Test dataframe. + run_id (str): MLflow run ID of the Camembert model. + + Returns: + List[str]: Predictions. 
+ """ + # Fetch model model = mlflow.pyfunc.load_model( f"runs:/{run_id}/default", ) # Get feature names - Y = mlflow.get_run(run_id).data.params["Y"] - text_feature = mlflow.get_run(run_id).data.params["text_feature"] - categorical_features = [] - for idx in range(1, 6): - categorical_features.append( - mlflow.get_run(run_id).data.params.get(f"categorical_features_{idx}") - ) - df_test = get_test_data() - query = df_test.head[[text_feature] + categorical_features].to_dict("list") + text_feature, categorical_features = get_run_params(run_id) + + # Build query + query = df_test[[text_feature] + categorical_features].to_dict("list") camembert_output = model.predict(query, {"k": 1}) - print(camembert_output) + camembert_predictions = [x[0] for x in camembert_output[0]] + return camembert_predictions + + +def main(fasttext_run_id: str, camembert_run_id: str): + """ + Conduct error analysis. + + Args: + fasttext_run_id (str): MLflow run ID of the FastText model. + camembert_run_id (str): MLflow run ID of the Camembert model.
+ """ + set_aws_credentials() + set_mlflow_env() + + fs = get_file_system() + s3_path = f"projet-ape/estoril/predictions_{fasttext_run_id}_{camembert_run_id}.csv" + if not fs.exists(s3_path): + # Get raw test data + df_test = get_test_data() + + # Compute predictions + df_test["y_pred_fasttext"] = get_fasttext_preds(df_test, fasttext_run_id) + df_test["y_pred_camembert"] = get_camembert_preds(df_test, camembert_run_id) + + # Save predictions + df_test = df_test.rename(columns={"apet_finale": "y_true"}) + with fs.open(s3_path, "w") as f: + df_test.to_csv(f, index=False) + else: + # Load test data with predictions + with fs.open(s3_path) as f: + df_test = pd.read_csv(f) + + # Compute accuracy + fasttext_accuracy = accuracy_score(df_test["y_true"], df_test["y_pred_fasttext"]) + camembert_accuracy = accuracy_score(df_test["y_true"], df_test["y_pred_camembert"]) + print(fasttext_accuracy) + print(camembert_accuracy) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-f", + "--fasttext-run-id", + type=str, + default="3927ef0bee3b4be79117cb73f02ca376", + ) + parser.add_argument( + "-c", + "--camembert-run-id", + type=str, + default="46998e8160ad4b33b494f31f339fc1ae", + ) + args = parser.parse_args() - # Implement comparison of fasttext and camembert + main(fasttext_run_id=args.fasttext_run_id, camembert_run_id=args.camembert_run_id)