diff --git a/Dockerfile b/Dockerfile
index 9440bca8..cf96c523 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -85,6 +85,16 @@ RUN cp -R /Front-End/bin/install/appdir/usr/bin/data_prep_models /project/stages
 # Hotfix: install more recent version of GaNDLF for metrics generation
 RUN pip install git+https://github.com/mlcommons/GaNDLF@616b37bafad8f89d5c816a88f44fa30470601311
 
+RUN pip install torch torchvision
+
+RUN pip install git+https://github.com/MIC-DKFZ/nnUNet.git@nnunetv1
+
+RUN mkdir /nnUNet_raw_data_base && mkdir /nnUNet_preprocessed
+
+ENV nnUNet_raw_data_base="/nnUNet_raw_data_base"
+ENV nnUNet_preprocessed="/nnUNet_preprocessed"
+ENV RESULTS_FOLDER="/project/models/nnUNet_trained_models"
+
 COPY ./mlcubes/data_preparation/project /project
 
 ENTRYPOINT ["python", "/project/mlcube.py"]
diff --git a/mlcubes/.gitignore b/mlcubes/.gitignore
index ac044f44..be8d1082 100644
--- a/mlcubes/.gitignore
+++ b/mlcubes/.gitignore
@@ -6,4 +6,5 @@
 *.png
 */mlcube/workspace/*
 !requirements.txt
-!*/mlcube/workspace/parameters.yaml
\ No newline at end of file
+!*/mlcube/workspace/parameters.yaml
+models
\ No newline at end of file
diff --git a/mlcubes/data_preparation/mlcube/mlcube.yaml b/mlcubes/data_preparation/mlcube/mlcube.yaml
index 467cae74..4a2ec422 100644
--- a/mlcubes/data_preparation/mlcube/mlcube.yaml
+++ b/mlcubes/data_preparation/mlcube/mlcube.yaml
@@ -8,7 +8,7 @@ platform:
 
 docker:
   # Image name
-  image: mlcommons/rano-data-prep:latest
+  image: mlcommons/rano-data-prep:nnunet
   # Docker build context relative to $MLCUBE_ROOT. Default is `build`.
   build_context: "../project"
   # Docker file name within docker build context, default is `Dockerfile`.
@@ -21,6 +21,7 @@ tasks:
       data_path: input_data,
       labels_path: input_labels,
       parameters_file: parameters.yaml,
+      models: additional_files/models,
     }
     outputs: {
       output_path: data/,
diff --git a/mlcubes/data_preparation/project/mlcube.py b/mlcubes/data_preparation/project/mlcube.py
index be71c415..749b7f72 100644
--- a/mlcubes/data_preparation/project/mlcube.py
+++ b/mlcubes/data_preparation/project/mlcube.py
@@ -23,12 +23,13 @@ def prepare(
     data_path: str = typer.Option(..., "--data_path"),
     labels_path: str = typer.Option(..., "--labels_path"),
     parameters_file: str = typer.Option(..., "--parameters_file"),
+    models_path: str = typer.Option(..., "--models"),
     output_path: str = typer.Option(..., "--output_path"),
     output_labels_path: str = typer.Option(..., "--output_labels_path"),
     report_file: str = typer.Option(..., "--report_file"),
     metadata_path: str = typer.Option(..., "--metadata_path"),
 ):
-    cmd = f"python3 project/prepare.py --data_path={data_path} --labels_path={labels_path} --data_out={output_path} --labels_out={output_labels_path} --report={report_file} --parameters={parameters_file} --metadata_path={metadata_path}"
+    cmd = f"python3 project/prepare.py --data_path={data_path} --labels_path={labels_path} --models_path={models_path} --data_out={output_path} --labels_out={output_labels_path} --report={report_file} --parameters={parameters_file} --metadata_path={metadata_path}"
     exec_python(cmd)
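Note on the Dockerfile hunk above: nnUNet v1 resolves its working directories from environment variables at import time, so the image must define `nnUNet_raw_data_base`, `nnUNet_preprocessed`, and `RESULTS_FOLDER` before `nnUNet_predict` ever runs. A minimal sanity-check sketch, not part of the patch (the helper name is made up):

    import os

    REQUIRED_VARS = ["nnUNet_raw_data_base", "nnUNet_preprocessed", "RESULTS_FOLDER"]

    def check_nnunet_env() -> None:
        # Fail fast if the container was built without the ENV lines above
        for var in REQUIRED_VARS:
            path = os.environ.get(var)
            if path is None:
                raise RuntimeError(f"{var} is not set; nnUNet v1 requires it")
            os.makedirs(path, exist_ok=True)  # mirrors the RUN mkdir lines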
diff --git a/mlcubes/data_preparation/project/prepare.py b/mlcubes/data_preparation/project/prepare.py
index 85ffeffe..03427d87 100644
--- a/mlcubes/data_preparation/project/prepare.py
+++ b/mlcubes/data_preparation/project/prepare.py
@@ -2,10 +2,12 @@
 import argparse
 import pandas as pd
 import yaml
+import shutil
 from stages.generate_report import GenerateReport
 from stages.get_csv import AddToCSV
 from stages.nifti_transform import NIfTITransform
 from stages.extract import Extract
+from stages.extract_nnunet import ExtractNnUNet
 from stages.manual import ManualStage
 from stages.comparison import SegmentationComparisonStage
 from stages.confirm import ConfirmStage
@@ -13,6 +15,8 @@
 from stages.pipeline import Pipeline
 from stages.constants import INTERIM_FOLDER, FINAL_FOLDER, TUMOR_MASK_FOLDER
 
+MODELS_PATH = "/project/models"
+
 
 def find_csv_filenames(path_to_dir, suffix=".csv"):
     filenames = os.listdir(path_to_dir)
@@ -27,6 +31,9 @@ def setup_argparser():
     parser.add_argument(
         "--labels_path", dest="labels", type=str, help="path containing labels"
     )
+    parser.add_argument(
+        "--models_path", dest="models", type=str, help="path to the nnunet models"
+    )
     parser.add_argument(
         "--data_out", dest="data_out", type=str, help="path to store prepared data"
     )
@@ -78,7 +85,7 @@ def init_pipeline(args):
     loop = None
     report_gen = GenerateReport(out_data_csv, args.data, out_raw, args.labels, args.labels_out, args.data_out, 8, brain_data_out, 3, tumor_data_out, 5)
     csv_proc = AddToCSV(out_raw, out_data_csv, valid_data_out, out_raw)
-    nifti_proc = NIfTITransform(out_data_csv, nifti_data_out, valid_data_out, args.metadata_path)
+    nifti_proc = NIfTITransform(out_data_csv, nifti_data_out, valid_data_out, args.metadata_path, args.data_out)
     brain_extract_proc = Extract(
         out_data_csv,
         brain_data_out,
@@ -89,14 +96,12 @@ def init_pipeline(args):
         "extract_brain",
         3,
     )
-    tumor_extract_proc = Extract(
+    tumor_extract_proc = ExtractNnUNet(
         out_data_csv,
         tumor_data_out,
         INTERIM_FOLDER,
         brain_data_out,
         INTERIM_FOLDER,
-        # loop,
-        "extract_tumor",
         4,
     )
     manual_proc = ManualStage(out_data_csv, tumor_data_out, tumor_data_out, backup_out)
@@ -142,24 +147,9 @@ def init_report(args) -> pd.DataFrame:
 
 def main():
     args = setup_argparser()
-    # Check if the input data is already prepared
-    # If so, just copy the contents and skip all processing
-    # TODO: this means we won't have a report. What would be the best way
-    # to handle this?
-    # TODO: Re-enable this when it is implemented correctly and we see the need for it
-    # # 1. If there is a csv file in the input folder
-    # # always reuse it for the prepared dataset
-    # csvs = find_csv_filenames(args.data_out)
-    # if len(csvs) == 1:
-    #     # One csv was found. Assume this is the desired csv
-    #     # move it to the expected location
-    #     # TODO: How to deal with inconsistent paths because of MLCube functionality?
-    #     csv_path = os.path.join(args.data_out, csvs[0])
-    #     os.rename(csv_path, out_data_csv)
-    #     # can we assume the paths inside data.csv to be relative to the csv?
-    #     # TODO: Create some logic to turn the csv paths into the expected paths for the MLCube
-    #     # update_csv_paths(out_data_csv)
-
+    # Copy models to the expected location
+    if not os.path.exists(MODELS_PATH):
+        shutil.copytree(args.models, MODELS_PATH)
 
     report = init_report(args)
     pipeline = init_pipeline(args)
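A caveat on the `main()` change above: `shutil.copytree` is skipped whenever `/project/models` already exists, so models from a previous run (or baked into the image) are never refreshed. If refresh-on-rerun were desired, a sketch like the following would work on Python 3.8+ (an alternative, not what the patch does):

    import shutil

    def sync_models(src: str, dst: str = "/project/models") -> None:
        # dirs_exist_ok=True creates dst if missing and overwrites files that
        # already exist there, so a newly mounted models folder always wins.
        shutil.copytree(src, dst, dirs_exist_ok=True)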
diff --git a/mlcubes/data_preparation/project/stages/extract.py b/mlcubes/data_preparation/project/stages/extract.py
index c8390918..81643776 100644
--- a/mlcubes/data_preparation/project/stages/extract.py
+++ b/mlcubes/data_preparation/project/stages/extract.py
@@ -74,7 +74,7 @@ def execute(
         """
         self.__prepare_exec()
         self.__copy_case(index)
-        self.__process_case(index)
+        self._process_case(index)
         report, success = self.__update_state(index, report)
         self.prep.write()
 
@@ -99,7 +99,7 @@ def __copy_case(self, index: Union[str, int]):
         for prev, copy in zip(prev_paths, copy_paths):
             shutil.copytree(prev, copy, dirs_exist_ok=True)
 
-    def __process_case(self, index: Union[str, int]):
+    def _process_case(self, index: Union[str, int]):
         id, tp = get_id_tp(index)
         df = self.prep.subjects_df
         row_search = df[(df["SubjectID"] == id) & (df["Timepoint"] == tp)]
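The `__process_case` -> `_process_case` rename above is what makes the subclass override below possible: double-underscore attributes are name-mangled per class, so a `__process_case` defined on a subclass would never be called by the parent's `execute`. A minimal illustration:

    class Base:
        def run(self):
            self.__step()  # compiled as self._Base__step

        def __step(self):
            print("Base step")

    class Child(Base):
        def __step(self):  # mangled to _Child__step; Base.run never sees it
            print("Child step")

    Child().run()  # prints "Base step"

With a single leading underscore, ordinary attribute lookup applies, which is exactly what `ExtractNnUNet._process_case` relies on.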
diff --git a/mlcubes/data_preparation/project/stages/extract_nnunet.py b/mlcubes/data_preparation/project/stages/extract_nnunet.py
new file mode 100644
index 00000000..12361b61
--- /dev/null
+++ b/mlcubes/data_preparation/project/stages/extract_nnunet.py
@@ -0,0 +1,193 @@
+from typing import Union, List, Tuple
+from tqdm import tqdm
+import pandas as pd
+import os
+from os.path import realpath, dirname, join
+import shutil
+import time
+import SimpleITK as sitk
+import subprocess
+import traceback
+from LabelFusion.wrapper import fuse_images
+
+from .extract import Extract
+from .PrepareDataset import (
+    Preparator,
+    FINAL_FOLDER,
+    generate_tumor_segmentation_fused_images,
+    save_screenshot,
+)
+from .utils import update_row_with_dict, get_id_tp, MockTqdm
+
+MODALITY_MAPPING = {
+    "t1c": "t1c",
+    "t1ce": "t1c",
+    "t1": "t1n",
+    "t1n": "t1n",
+    "t2": "t2w",
+    "t2w": "t2w",
+    "t2f": "t2f",
+    "flair": "t2f",
+}
+
+MODALITY_VARIANTS = {
+    "t1c": "T1GD",
+    "t1ce": "T1GD",
+    "t1": "T1",
+    "t1n": "T1",
+    "t2": "T2",
+    "t2w": "T2",
+    "t2f": "FLAIR",
+    "flair": "FLAIR",
+}
+
+
+class ExtractNnUNet(Extract):
+    def __init__(
+        self,
+        data_csv: str,
+        out_path: str,
+        subpath: str,
+        prev_stage_path: str,
+        prev_subpath: str,
+        status_code: int,
+        extra_labels_path=[],
+    ):
+        self.data_csv = data_csv
+        self.out_path = out_path
+        self.subpath = subpath
+        self.data_subpath = FINAL_FOLDER
+        self.prev_path = prev_stage_path
+        self.prev_subpath = prev_subpath
+        os.makedirs(self.out_path, exist_ok=True)
+        self.prep = Preparator(data_csv, out_path, "BraTSPipeline")
+        self.pbar = tqdm()
+        self.failed = False
+        self.exception = None
+        self.__status_code = status_code
+        self.extra_labels_path = extra_labels_path
+
+    @property
+    def name(self) -> str:
+        return "nnUNet Tumor Extraction"
+
+    @property
+    def status_code(self) -> int:
+        return self.__status_code
+
+    def __get_models(self):
+        rel_models_path = "../models/nnUNet_trained_models/nnUNet/3d_fullres"
+        models_path = realpath(join(dirname(__file__), rel_models_path))
+        return os.listdir(models_path)
+
+    def __get_mod_order(self, model):
+        rel_orders_path = "../models/nnUNet_modality_order"
+        order_path = realpath(join(dirname(__file__), rel_orders_path, model, "order"))
+        with open(order_path, "r") as f:
+            order_str = f.readline()
+        # remove 'order = ' from the split list
+        modalities = order_str.split()[2:]
+        modalities = [MODALITY_MAPPING[mod] for mod in modalities]
+        return modalities
+
+    def __prepare_case(self, path, id, tp, order):
+        tmp_subject = f"{id}-{tp}"
+        tmp_path = os.path.join(path, "tmp-data")
+        tmp_subject_path = os.path.join(tmp_path, tmp_subject)
+        tmp_out_path = os.path.join(path, "tmp-out")
+        shutil.rmtree(tmp_path, ignore_errors=True)
+        shutil.rmtree(tmp_out_path, ignore_errors=True)
+        os.makedirs(tmp_subject_path)
+        os.makedirs(tmp_out_path)
+        in_modalities_path = os.path.join(path, "DataForFeTS", id, tp)
+        input_modalities = {}
+        for modality_file in os.listdir(in_modalities_path):
+            if not modality_file.endswith(".nii.gz"):
+                continue
+            modality = modality_file[:-7].split("_")[-1]
+            norm_mod = MODALITY_MAPPING[modality]
+            mod_idx = order.index(norm_mod)
+            mod_idx = str(mod_idx).zfill(4)
+
+            out_modality_file = f"{tmp_subject}_{mod_idx}.nii.gz"
+            in_file = os.path.join(in_modalities_path, modality_file)
+            out_file = os.path.join(tmp_subject_path, out_modality_file)
+            input_modalities[MODALITY_VARIANTS[modality]] = in_file
+            shutil.copyfile(in_file, out_file)
+
+        return tmp_subject_path, tmp_out_path, input_modalities
+
+    def __run_model(self, model, data_path, out_path):
+        # models are named Task<XXX>_..., where <XXX> is always 3 numbers
+        task_id = model[4:7]
+        cmd = f"nnUNet_predict -i {data_path} -o {out_path} -t {task_id} -f all"
+        print(cmd)
+        print(os.listdir(data_path))
+        start = time.time()
+        subprocess.call(cmd, shell=True)
+        end = time.time()
+        total_time = end - start
+        print(f"Total time elapsed is {total_time} seconds")
+
+    def __finalize_pred(self, tmp_out_path, out_pred_filepath):
+        # We assume there's only one file in out_path
+        pred = None
+        for file in os.listdir(tmp_out_path):
+            if file.endswith(".nii.gz"):
+                pred = file
+
+        if pred is None:
+            raise RuntimeError("No tumor segmentation was found")
+
+        pred_filepath = os.path.join(tmp_out_path, pred)
+        shutil.move(pred_filepath, out_pred_filepath)
+        return out_pred_filepath
+
+    def _process_case(self, index: Union[str, int]):
+        id, tp = get_id_tp(index)
+        subject_id = f"{id}_{tp}"
+        models = self.__get_models()
+        outputs = []
+        images_for_fusion = []
+        out_path = os.path.join(self.out_path, "DataForQC", id, tp)
+        out_pred_path = os.path.join(out_path, "TumorMasksForQC")
+        os.makedirs(out_pred_path, exist_ok=True)
+        for i, model in enumerate(models):
+            order = self.__get_mod_order(model)
+            tmp_data_path, tmp_out_path, input_modalities = self.__prepare_case(
+                self.out_path, id, tp, order
+            )
+            out_pred_filepath = os.path.join(
+                out_pred_path, f"{id}_{tp}_tumorMask_model_{i}.nii.gz"
+            )
+            try:
+                self.__run_model(model, tmp_data_path, tmp_out_path)
+                output = self.__finalize_pred(tmp_out_path, out_pred_filepath)
+                outputs.append(output)
+                images_for_fusion.append(sitk.ReadImage(output, sitk.sitkUInt8))
+            except Exception as e:
+                self.exception = e
+                self.failed = True
+                self.traceback = traceback.format_exc()
+                return
+
+            # cleanup
+            shutil.rmtree(tmp_data_path, ignore_errors=True)
+            shutil.rmtree(tmp_out_path, ignore_errors=True)
+
+        fused_outputs = generate_tumor_segmentation_fused_images(
+            images_for_fusion, out_pred_path, subject_id
+        )
+        outputs += fused_outputs
+
+        for output in outputs:
+            # save the screenshot
+            tumor_mask_id = os.path.basename(output).replace(".nii.gz", "")
+            save_screenshot(
+                input_modalities,
+                os.path.join(
+                    out_path,
+                    f"{tumor_mask_id}_summary.png",
+                ),
+                output,
+            )
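To make the new stage's input wiring concrete, here is a worked example of `__get_mod_order` plus the file naming in `__prepare_case`, assuming an order file whose single line reads `order = t1 t2 t1ce flair` (the exact file contents are an assumption):

    order_str = "order = t1 t2 t1ce flair"  # hypothetical order file line
    modalities = order_str.split()[2:]      # drop the "order" and "=" tokens
    # -> ["t1", "t2", "t1ce", "flair"]
    order = [MODALITY_MAPPING[mod] for mod in modalities]
    # -> ["t1n", "t2w", "t1c", "t2f"]

    # __prepare_case then maps each scan to nnUNet's modality-index convention:
    # a t1ce file gets order.index("t1c") == 2, so it is copied to
    # "<id>-<tp>_0002.nii.gz" inside the temporary subject folder.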
diff --git a/mlcubes/data_preparation/project/stages/generate_report.py b/mlcubes/data_preparation/project/stages/generate_report.py
index e5e253ab..57c8882f 100644
--- a/mlcubes/data_preparation/project/stages/generate_report.py
+++ b/mlcubes/data_preparation/project/stages/generate_report.py
@@ -345,6 +345,15 @@ def execute(self, report: pd.DataFrame) -> Tuple[pd.DataFrame, bool]:
             # Keep track of the cases that were found on the input folder
             observed_cases.add(index)
 
+            has_semiprepared, in_tp_path = has_semiprepared_folder_structure(in_tp_path, recursive=True)
+            if has_semiprepared:
+                tumor_seg = get_tumor_segmentation(subject, timepoint, in_tp_path)
+                if tumor_seg is not None:
+                    report = self._proceed_to_comparison(subject, timepoint, in_tp_path, report)
+                else:
+                    report = self._proceed_to_tumor_extraction(subject, timepoint, in_tp_path, report)
+                continue
+
             if index in report.index:
                 # Case has already been identified, see if input hash is different
                 # if so, override the contents and restart the state for that case
@@ -373,15 +382,6 @@
             # Move files around so it has the expected structure
             to_expected_folder_structure(out_tp_path, contents_path)
 
-            has_semiprepared, in_tp_path = has_semiprepared_folder_structure(in_tp_path, recursive=True)
-            if has_semiprepared:
-                tumor_seg = get_tumor_segmentation(subject, timepoint, in_tp_path)
-                if tumor_seg is not None:
-                    report = self._proceed_to_comparison(subject, timepoint, in_tp_path, report)
-                else:
-                    report = self._proceed_to_tumor_extraction(subject, timepoint, in_tp_path, report)
-                continue
-
             if input_is_prepared:
                 data["status_name"] = "DONE"
                 data["status_code"] = self.done_status_code
diff --git a/mlcubes/data_preparation/project/stages/nifti_transform.py b/mlcubes/data_preparation/project/stages/nifti_transform.py
index f59048f6..07ecc712 100644
--- a/mlcubes/data_preparation/project/stages/nifti_transform.py
+++ b/mlcubes/data_preparation/project/stages/nifti_transform.py
@@ -11,10 +11,11 @@
 
 class NIfTITransform(RowStage):
     def __init__(
-        self, data_csv: str, out_path: str, prev_stage_path: str, metadata_path: str
+        self, data_csv: str, out_path: str, prev_stage_path: str, metadata_path: str, data_out: str,
     ):
         self.data_csv = data_csv
         self.out_path = out_path
+        self.data_out = data_out
         self.prev_stage_path = prev_stage_path
         self.metadata_path = metadata_path
         os.makedirs(self.out_path, exist_ok=True)
@@ -85,7 +86,7 @@ def __process_case(self, index: Union[str, int]):
 
     def __update_prev_stage_state(self, index: Union[str, int], report: pd.DataFrame):
         prev_data_path = report.loc[index]["data_path"]
-        prev_data_path = unnormalize_path(prev_data_path, "mlcube_io3")
+        prev_data_path = unnormalize_path(prev_data_path, self.data_out)
         shutil.rmtree(prev_data_path)
 
     def __undo_current_stage_changes(self, index: Union[str, int]):
diff --git a/mlcubes/data_preparation/project/stages/pipeline.py b/mlcubes/data_preparation/project/stages/pipeline.py
index 1ca4560e..fb8f517b 100644
--- a/mlcubes/data_preparation/project/stages/pipeline.py
+++ b/mlcubes/data_preparation/project/stages/pipeline.py
@@ -19,8 +19,9 @@ def normalize_report_paths(report: DataFrame) -> DataFrame:
 
     Returns:
         DataFrame: report with transformed paths
     """
-    report["data_path"] = report["data_path"].str.split("mlcube_io3").str[-1]
-    report["labels_path"] = report["labels_path"].str.split("mlcube_io3").str[-1]
+    pattern = r"mlcube_io\d+"
+    report["data_path"] = report["data_path"].str.split(pattern).str[-1]
+    report["labels_path"] = report["labels_path"].str.split(pattern).str[-1]
     return report
 
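The `normalize_report_paths` change generalizes the old hard-coded `mlcube_io3` split: MLCube mounts each task input/output at `/mlcube_ioN`, where `N` depends on the task's argument order, so stored report paths must be stripped of whichever index happens to appear. A worked example with made-up paths (`regex=True` makes the intent explicit and needs pandas >= 1.4):

    import pandas as pd

    report = pd.DataFrame({"data_path": ["/mlcube_io5/AAAC_0/2008.03.30"]})
    pattern = r"mlcube_io\d+"
    report["data_path"] = report["data_path"].str.split(pattern, regex=True).str[-1]
    # -> "/AAAC_0/2008.03.30", regardless of the io index

The stripped paths are then re-anchored at runtime, e.g. by `unnormalize_path(prev_data_path, self.data_out)` in `nifti_transform.py` above.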
diff --git a/src/applications/PrepareDataset.py b/src/applications/PrepareDataset.py
index 401a5b36..b4ba8197 100644
--- a/src/applications/PrepareDataset.py
+++ b/src/applications/PrepareDataset.py
@@ -120,7 +120,7 @@ def _get_relevant_dicom_tags(filename: str) -> dict:
     return output_dict
 
 
-def _save_screenshot(
+def save_screenshot(
     input_images: dict, output_filename: str = None, input_mask: str = None
 ) -> None:
     """
@@ -451,7 +451,13 @@ def _run_tumor_segmentation_using_gandlf(
             tumor_masks_to_return.append(renamed_path)
             images_for_fusion.append(sitk.ReadImage(file_path, sitk.sitkUInt8))
 
+    fused_masks_to_return = generate_tumor_segmentation_fused_images(images_for_fusion, mask_output_dir, subject_id)
+    return tumor_masks_to_return + fused_masks_to_return
+
+
+def generate_tumor_segmentation_fused_images(images_for_fusion, mask_output_dir, subject_id):
     tumor_class_list = [0, 1, 2, 3, 4]
+    fused_masks_to_return = []
 
     if len(images_for_fusion) > 1:
         for fusion_type in ["staple", "simple", "voting"]:
@@ -461,9 +467,9 @@ def _run_tumor_segmentation_using_gandlf(
                 f"{subject_id}_tumorMask_fused-{fusion_type}.nii.gz",
             )
             sitk.WriteImage(fused_mask, fused_mask_file)
-            tumor_masks_to_return.append(fused_mask_file)
+            fused_masks_to_return.append(fused_mask_file)
 
-    return tumor_masks_to_return
+    return fused_masks_to_return
 
 
 class Preparator:
@@ -689,7 +695,7 @@ def convert_to_dicom(self, idx: int, row: pd.Series, pbar: tqdm):
             f"{subject_id_timepoint}_summary_coregistration.png",
         )
         # save the screenshot
-        _save_screenshot(outputs_reoriented, screenshot_path)
+        save_screenshot(outputs_reoriented, screenshot_path)
 
         if os.path.exists(screenshot_path):
             shutil.copyfile(
@@ -753,7 +759,7 @@ def extract_brain(self, row: pd.Series, pbar: tqdm):
             sitk.WriteImage(masked_image, file_to_save)
 
         # save the screenshot
-        _save_screenshot(
+        save_screenshot(
             input_for_tumor_models,
             posixpath.join(
                 interimOutputDir_actual,
@@ -793,7 +799,7 @@ def extract_tumor(self, row: pd.Series, pbar: tqdm):
         for tumor_mask in tumor_masks_for_qc:
             tumor_mask_id = os.path.basename(tumor_mask).replace(".nii.gz", "")
             # save the screenshot
-            _save_screenshot(
+            save_screenshot(
                 input_for_tumor_models,
                 posixpath.join(interimOutputDir_actual, f"{tumor_mask_id}_summary.png"),
                 tumor_mask,
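For context on the refactor above: `generate_tumor_segmentation_fused_images` now isolates the label-fusion step so that both the GaNDLF path and the new `ExtractNnUNet` stage can share it. A hedged usage sketch (file names and paths are illustrative):

    import SimpleITK as sitk
    from PrepareDataset import generate_tumor_segmentation_fused_images

    # Per-model tumor masks produced earlier in the pipeline (made-up paths)
    mask_files = ["mask_model_0.nii.gz", "mask_model_1.nii.gz", "mask_model_2.nii.gz"]
    images_for_fusion = [sitk.ReadImage(f, sitk.sitkUInt8) for f in mask_files]

    fused = generate_tumor_segmentation_fused_images(
        images_for_fusion, "/tmp/TumorMasksForQC", "AAAC_0_2008.03.30"
    )
    # With more than one input mask this writes
    # <subject>_tumorMask_fused-{staple,simple,voting}.nii.gz and returns their
    # paths; with a single mask it returns an empty list.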