Pass models through tarball. Fix skip issues
aristizabal95 committed Dec 18, 2023
1 parent 5f53fd9 commit c97597d
Showing 6 changed files with 28 additions and 33 deletions.
1 change: 1 addition & 0 deletions mlcubes/data_preparation/mlcube/mlcube.yaml
@@ -21,6 +21,7 @@ tasks:
         data_path: input_data,
         labels_path: input_labels,
         parameters_file: parameters.yaml,
+        models: additional_files/models,
       }
       outputs: {
         output_path: data/,
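Note: the prepare task gains a fourth input, models, fed from the tarball's additional_files/models directory. MLCube-style runners mount each declared parameter into the container at order-dependent generic paths, so inserting a new input shifts the mount index of everything declared after it; that appears to be why the hardcoded "mlcube_io3" references in nifti_transform.py and pipeline.py are generalized later in this commit. A toy illustration of the index shift (the /mlcube_io<N> naming is inferred from the strings this commit removes, not from MLCube documentation):

# Toy illustration only: mounts are assigned in declaration order.
inputs = ["data_path", "labels_path", "parameters_file", "models"]
mounts = {name: f"/mlcube_io{i}" for i, name in enumerate(inputs)}
print(mounts)  # adding "models" shifts the indices of any mounts declared after it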
3 changes: 2 additions & 1 deletion mlcubes/data_preparation/project/mlcube.py
@@ -23,12 +23,13 @@ def prepare(
     data_path: str = typer.Option(..., "--data_path"),
     labels_path: str = typer.Option(..., "--labels_path"),
     parameters_file: str = typer.Option(..., "--parameters_file"),
+    models_path: str = typer.Option(..., "--models"),
     output_path: str = typer.Option(..., "--output_path"),
     output_labels_path: str = typer.Option(..., "--output_labels_path"),
     report_file: str = typer.Option(..., "--report_file"),
     metadata_path: str = typer.Option(..., "--metadata_path"),
 ):
-    cmd = f"python3 project/prepare.py --data_path={data_path} --labels_path={labels_path} --data_out={output_path} --labels_out={output_labels_path} --report={report_file} --parameters={parameters_file} --metadata_path={metadata_path}"
+    cmd = f"python3 project/prepare.py --data_path={data_path} --labels_path={labels_path} --models_path={models_path} --data_out={output_path} --labels_out={output_labels_path} --report={report_file} --parameters={parameters_file} --metadata_path={metadata_path}"
     exec_python(cmd)
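The wrapper mirrors the YAML change: a required --models flag whose value is forwarded to prepare.py as --models_path. A minimal standalone sketch of the typer pattern used here (script name and echoed message are illustrative):

import typer

app = typer.Typer()

@app.command()
def prepare(models_path: str = typer.Option(..., "--models")):
    # typer.Option(..., "--models") declares a required option whose CLI flag
    # differs from the Python parameter name it binds to.
    typer.echo(f"models mounted at {models_path}")

if __name__ == "__main__":
    app()  # e.g. python demo.py --models /mlcube_io3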
29 changes: 10 additions & 19 deletions mlcubes/data_preparation/project/prepare.py
@@ -2,6 +2,7 @@
 import argparse
 import pandas as pd
 import yaml
+import shutil
 from stages.generate_report import GenerateReport
 from stages.get_csv import AddToCSV
 from stages.nifti_transform import NIfTITransform
@@ -14,6 +15,8 @@
 from stages.pipeline import Pipeline
 from stages.constants import INTERIM_FOLDER, FINAL_FOLDER, TUMOR_MASK_FOLDER

+MODELS_PATH = "/project/models"
+

 def find_csv_filenames(path_to_dir, suffix=".csv"):
     filenames = os.listdir(path_to_dir)
@@ -28,6 +31,9 @@ def setup_argparser():
     parser.add_argument(
         "--labels_path", dest="labels", type=str, help="path containing labels"
     )
+    parser.add_argument(
+        "--models_path", dest="models", type=str, help="path to the nnunet models"
+    )
     parser.add_argument(
         "--data_out", dest="data_out", type=str, help="path to store prepared data"
     )
@@ -79,7 +85,7 @@ def init_pipeline(args):
     loop = None
     report_gen = GenerateReport(out_data_csv, args.data, out_raw, args.labels, args.labels_out, args.data_out, 8, brain_data_out, 3, tumor_data_out, 5)
     csv_proc = AddToCSV(out_raw, out_data_csv, valid_data_out, out_raw)
-    nifti_proc = NIfTITransform(out_data_csv, nifti_data_out, valid_data_out, args.metadata_path)
+    nifti_proc = NIfTITransform(out_data_csv, nifti_data_out, valid_data_out, args.metadata_path, args.data_out)
     brain_extract_proc = Extract(
         out_data_csv,
         brain_data_out,
@@ -141,24 +147,9 @@ def init_report(args) -> pd.DataFrame:
 def main():
     args = setup_argparser()

-    # Check if the input data is already prepared
-    # If so, just copy the contents and skip all processing
-    # TODO: this means we won't have a report. What would be the best way
-    # to handle this?
-    # TODO: Re-enable this when it is implemented correctly and we see the need for it
-    # # 1. If there is a csv file in the input folder
-    # # always reuse it for the prepared dataset
-    # csvs = find_csv_filenames(args.data_out)
-    # if len(csvs) == 1:
-    #     # One csv was found. Assume this is the desired csv
-    #     # move it to the expected location
-    #     # TODO: How to deal with inconsistent paths because of MLCube functionality?
-    #     csv_path = os.path.join(args.data_out, csvs[0])
-    #     os.rename(csv_path, out_data_csv)
-    #     # can we assume the paths inside data.csv to be relative to the csv?
-    #     # TODO: Create some logic to turn the csv paths into the expected paths for the MLCube
-    #     # update_csv_paths(out_data_csv)
-
+    # Move models to the expected location
+    if not os.path.exists(MODELS_PATH):
+        shutil.copytree(args.models, MODELS_PATH)

     report = init_report(args)
     pipeline = init_pipeline(args)
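The existence guard makes model staging idempotent: shutil.copytree raises FileExistsError when the destination already exists, so re-running an already-staged container becomes a no-op instead of a crash. A self-contained sketch of the same pattern (the stage_models wrapper is hypothetical):

import os
import shutil

MODELS_PATH = "/project/models"

def stage_models(models_src: str) -> None:
    # copytree fails if the target exists, so check first; repeated runs skip the copy.
    if not os.path.exists(MODELS_PATH):
        shutil.copytree(models_src, MODELS_PATH)
    # On Python 3.8+, shutil.copytree(models_src, MODELS_PATH, dirs_exist_ok=True)
    # would merge into an existing target instead of skipping it.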
18 changes: 9 additions & 9 deletions mlcubes/data_preparation/project/stages/generate_report.py
@@ -345,6 +345,15 @@ def execute(self, report: pd.DataFrame) -> Tuple[pd.DataFrame, bool]:
             # Keep track of the cases that were found on the input folder
             observed_cases.add(index)

+            has_semiprepared, in_tp_path = has_semiprepared_folder_structure(in_tp_path, recursive=True)
+            if has_semiprepared:
+                tumor_seg = get_tumor_segmentation(subject, timepoint, in_tp_path)
+                if tumor_seg is not None:
+                    report = self._proceed_to_comparison(subject, timepoint, in_tp_path, report)
+                else:
+                    report = self._proceed_to_tumor_extraction(subject, timepoint, in_tp_path, report)
+                continue
+
             if index in report.index:
                 # Case has already been identified, see if input hash is different
                 # if so, override the contents and restart the state for that case
@@ -373,15 +382,6 @@ def execute(self, report: pd.DataFrame) -> Tuple[pd.DataFrame, bool]:
             # Move files around so it has the expected structure
             to_expected_folder_structure(out_tp_path, contents_path)

-            has_semiprepared, in_tp_path = has_semiprepared_folder_structure(in_tp_path, recursive=True)
-            if has_semiprepared:
-                tumor_seg = get_tumor_segmentation(subject, timepoint, in_tp_path)
-                if tumor_seg is not None:
-                    report = self._proceed_to_comparison(subject, timepoint, in_tp_path, report)
-                else:
-                    report = self._proceed_to_tumor_extraction(subject, timepoint, in_tp_path, report)
-                continue
-
             if input_is_prepared:
                 data["status_name"] = "DONE"
                 data["status_code"] = self.done_status_code
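This appears to be the "skip issues" fix from the commit title: the semi-prepared check is hoisted above the "index in report.index" branch, so a semi-prepared case is routed to comparison or tumor extraction (and continues to the next case) before the existing-case hash logic can claim it. A toy reconstruction of the reordered control flow, with all names as hypothetical stand-ins for the stage's real helpers:

def route_case(index, is_semiprepared, known_indices):
    # New order: semi-prepared routing runs first, so it can no longer be
    # shadowed by the generic handling of cases already present in the report.
    if is_semiprepared:
        return "comparison_or_tumor_extraction"
    if index in known_indices:
        return "hash_check_existing_case"
    return "fresh_case_processing"

print(route_case("AAAC_0|2008.03.30", True, {"AAAC_0|2008.03.30"}))  # routed, not skipped
print(route_case("AAAC_1|2008.12.17", False, set()))                 # -> fresh_case_processing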
5 changes: 3 additions & 2 deletions mlcubes/data_preparation/project/stages/nifti_transform.py
@@ -11,10 +11,11 @@

 class NIfTITransform(RowStage):
     def __init__(
-        self, data_csv: str, out_path: str, prev_stage_path: str, metadata_path: str
+        self, data_csv: str, out_path: str, prev_stage_path: str, metadata_path: str, data_out: str,
     ):
         self.data_csv = data_csv
         self.out_path = out_path
+        self.data_out = data_out
         self.prev_stage_path = prev_stage_path
         self.metadata_path = metadata_path
         os.makedirs(self.out_path, exist_ok=True)
@@ -85,7 +86,7 @@ def __process_case(self, index: Union[str, int]):

     def __update_prev_stage_state(self, index: Union[str, int], report: pd.DataFrame):
         prev_data_path = report.loc[index]["data_path"]
-        prev_data_path = unnormalize_path(prev_data_path, "mlcube_io3")
+        prev_data_path = unnormalize_path(prev_data_path, self.data_out)
         shutil.rmtree(prev_data_path)

     def __undo_current_stage_changes(self, index: Union[str, int]):
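With models added as a new input, the output mount is no longer guaranteed to sit at mlcube_io3, so the stage now receives the actual data_out path instead of a hardcoded prefix. A hedged sketch of what an unnormalize_path helper of this shape plausibly does; the real implementation lives elsewhere in this repository and may differ, and the example paths are invented:

import os

def unnormalize_path(path: str, prefix: str) -> str:
    # Re-attach the current run's mount prefix to a report path that
    # normalize_report_paths (pipeline.py, below) previously stripped.
    return os.path.join(prefix, path.lstrip(os.sep))

print(unnormalize_path("/AAAC_0/2008.03.30", "/mlcube_io5/data"))
# -> /mlcube_io5/data/AAAC_0/2008.03.30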
5 changes: 3 additions & 2 deletions mlcubes/data_preparation/project/stages/pipeline.py
@@ -19,8 +19,9 @@ def normalize_report_paths(report: DataFrame) -> DataFrame:
     Returns:
         DataFrame: report with transformed paths
     """
-    report["data_path"] = report["data_path"].str.split("mlcube_io3").str[-1]
-    report["labels_path"] = report["labels_path"].str.split("mlcube_io3").str[-1]
+    pattern = r"mlcube_io\d+"
+    report["data_path"] = report["data_path"].str.split(pattern).str[-1]
+    report["labels_path"] = report["labels_path"].str.split(pattern).str[-1]
     return report
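A quick check of the generalized normalization: any mount index is stripped now, not just mlcube_io3. pandas treats a multi-character pat as a regular expression by default in Series.str.split, so no regex=True is needed; the example paths are invented:

import pandas as pd

report = pd.DataFrame({
    "data_path": ["/mlcube_io5/AAAC_0/2008.03.30"],
    "labels_path": ["/mlcube_io6/AAAC_0/2008.03.30"],
})
pattern = r"mlcube_io\d+"
report["data_path"] = report["data_path"].str.split(pattern).str[-1]
report["labels_path"] = report["labels_path"].str.split(pattern).str[-1]
print(report["data_path"][0])  # -> /AAAC_0/2008.03.30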
