Pass models through tarball. Fix skip issues
aristizabal95 committed Dec 18, 2023
1 parent 5f53fd9 commit c97597d
Showing 6 changed files with 28 additions and 33 deletions.
1 change: 1 addition & 0 deletions mlcubes/data_preparation/mlcube/mlcube.yaml
@@ -21,6 +21,7 @@ tasks:
         data_path: input_data,
         labels_path: input_labels,
         parameters_file: parameters.yaml,
+        models: additional_files/models,
       }
       outputs: {
         output_path: data/,
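Note: the prepare task gains a fourth input, models, fed from the tarball's additional_files/models directory. MLCube-style runners mount each declared parameter into the container at order-dependent generic paths, so inserting a new input shifts the mount index of everything declared after it; that appears to be why the hardcoded "mlcube_io3" references in nifti_transform.py and pipeline.py are generalized later in this commit. A toy illustration of the index shift (the /mlcube_io<N> naming is inferred from the strings this commit removes, not from MLCube documentation):

# Toy illustration only: mounts are assigned in declaration order.
inputs = ["data_path", "labels_path", "parameters_file", "models"]
mounts = {name: f"/mlcube_io{i}" for i, name in enumerate(inputs)}
print(mounts)  # adding "models" shifts the indices of any mounts declared after it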
3 changes: 2 additions & 1 deletion mlcubes/data_preparation/project/mlcube.py
@@ -23,12 +23,13 @@ def prepare(
     data_path: str = typer.Option(..., "--data_path"),
     labels_path: str = typer.Option(..., "--labels_path"),
     parameters_file: str = typer.Option(..., "--parameters_file"),
+    models_path: str = typer.Option(..., "--models"),
     output_path: str = typer.Option(..., "--output_path"),
     output_labels_path: str = typer.Option(..., "--output_labels_path"),
     report_file: str = typer.Option(..., "--report_file"),
     metadata_path: str = typer.Option(..., "--metadata_path"),
 ):
-    cmd = f"python3 project/prepare.py --data_path={data_path} --labels_path={labels_path} --data_out={output_path} --labels_out={output_labels_path} --report={report_file} --parameters={parameters_file} --metadata_path={metadata_path}"
+    cmd = f"python3 project/prepare.py --data_path={data_path} --labels_path={labels_path} --models_path={models_path} --data_out={output_path} --labels_out={output_labels_path} --report={report_file} --parameters={parameters_file} --metadata_path={metadata_path}"
     exec_python(cmd)
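The wrapper mirrors the YAML change: a required --models flag whose value is forwarded to prepare.py as --models_path. A minimal standalone sketch of the typer pattern used here (script name and echoed message are illustrative):

import typer

app = typer.Typer()

@app.command()
def prepare(models_path: str = typer.Option(..., "--models")):
    # typer.Option(..., "--models") declares a required option whose CLI flag
    # differs from the Python parameter name it binds to.
    typer.echo(f"models mounted at {models_path}")

if __name__ == "__main__":
    app()  # e.g. python demo.py --models /mlcube_io3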
29 changes: 10 additions & 19 deletions mlcubes/data_preparation/project/prepare.py
@@ -2,6 +2,7 @@
 import argparse
 import pandas as pd
 import yaml
+import shutil
 from stages.generate_report import GenerateReport
 from stages.get_csv import AddToCSV
 from stages.nifti_transform import NIfTITransform
@@ -14,6 +15,8 @@
 from stages.pipeline import Pipeline
 from stages.constants import INTERIM_FOLDER, FINAL_FOLDER, TUMOR_MASK_FOLDER

+MODELS_PATH = "/project/models"
+

 def find_csv_filenames(path_to_dir, suffix=".csv"):
     filenames = os.listdir(path_to_dir)
@@ -28,6 +31,9 @@ def setup_argparser():
     parser.add_argument(
         "--labels_path", dest="labels", type=str, help="path containing labels"
     )
+    parser.add_argument(
+        "--models_path", dest="models", type=str, help="path to the nnunet models"
+    )
     parser.add_argument(
         "--data_out", dest="data_out", type=str, help="path to store prepared data"
     )
@@ -79,7 +85,7 @@ def init_pipeline(args):
     loop = None
     report_gen = GenerateReport(out_data_csv, args.data, out_raw, args.labels, args.labels_out, args.data_out, 8, brain_data_out, 3, tumor_data_out, 5)
     csv_proc = AddToCSV(out_raw, out_data_csv, valid_data_out, out_raw)
-    nifti_proc = NIfTITransform(out_data_csv, nifti_data_out, valid_data_out, args.metadata_path)
+    nifti_proc = NIfTITransform(out_data_csv, nifti_data_out, valid_data_out, args.metadata_path, args.data_out)
     brain_extract_proc = Extract(
         out_data_csv,
         brain_data_out,
@@ -141,24 +147,9 @@ def init_report(args) -> pd.DataFrame:
 def main():
     args = setup_argparser()

-    # Check if the input data is already prepared
-    # If so, just copy the contents and skip all processing
-    # TODO: this means we won't have a report. What would be the best way
-    # to handle this?
-    # TODO: Re-enable this when it is implemented correctly and we see the need for it
-    # # 1. If there is a csv file in the input folder
-    # # always reuse it for the prepared dataset
-    # csvs = find_csv_filenames(args.data_out)
-    # if len(csvs) == 1:
-    #     # One csv was found. Assume this is the desired csv
-    #     # move it to the expected location
-    #     # TODO: How to deal with inconsistent paths because of MLCube functionality?
-    #     csv_path = os.path.join(args.data_out, csvs[0])
-    #     os.rename(csv_path, out_data_csv)
-    #     # can we assume the paths inside data.csv to be relative to the csv?
-    #     # TODO: Create some logic to turn the csv paths into the expected paths for the MLCube
-    #     # update_csv_paths(out_data_csv)
-
+    # Move models to the expected location
+    if not os.path.exists(MODELS_PATH):
+        shutil.copytree(args.models, MODELS_PATH)

     report = init_report(args)
     pipeline = init_pipeline(args)
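The existence guard makes model staging idempotent: shutil.copytree raises FileExistsError when the destination already exists, so re-running an already-staged container becomes a no-op instead of a crash. A self-contained sketch of the same pattern (the stage_models wrapper is hypothetical):

import os
import shutil

MODELS_PATH = "/project/models"

def stage_models(models_src: str) -> None:
    # copytree fails if the target exists, so check first; repeated runs skip the copy.
    if not os.path.exists(MODELS_PATH):
        shutil.copytree(models_src, MODELS_PATH)
    # On Python 3.8+, shutil.copytree(models_src, MODELS_PATH, dirs_exist_ok=True)
    # would merge into an existing target instead of skipping it.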
18 changes: 9 additions & 9 deletions mlcubes/data_preparation/project/stages/generate_report.py
@@ -345,6 +345,15 @@ def execute(self, report: pd.DataFrame) -> Tuple[pd.DataFrame, bool]:
             # Keep track of the cases that were found on the input folder
             observed_cases.add(index)

+            has_semiprepared, in_tp_path = has_semiprepared_folder_structure(in_tp_path, recursive=True)
+            if has_semiprepared:
+                tumor_seg = get_tumor_segmentation(subject, timepoint, in_tp_path)
+                if tumor_seg is not None:
+                    report = self._proceed_to_comparison(subject, timepoint, in_tp_path, report)
+                else:
+                    report = self._proceed_to_tumor_extraction(subject, timepoint, in_tp_path, report)
+                continue
+
             if index in report.index:
                 # Case has already been identified, see if input hash is different
                 # if so, override the contents and restart the state for that case
@@ -373,15 +382,6 @@ def execute(self, report: pd.DataFrame) -> Tuple[pd.DataFrame, bool]:
             # Move files around so it has the expected structure
             to_expected_folder_structure(out_tp_path, contents_path)

-            has_semiprepared, in_tp_path = has_semiprepared_folder_structure(in_tp_path, recursive=True)
-            if has_semiprepared:
-                tumor_seg = get_tumor_segmentation(subject, timepoint, in_tp_path)
-                if tumor_seg is not None:
-                    report = self._proceed_to_comparison(subject, timepoint, in_tp_path, report)
-                else:
-                    report = self._proceed_to_tumor_extraction(subject, timepoint, in_tp_path, report)
-                continue
-
             if input_is_prepared:
                 data["status_name"] = "DONE"
                 data["status_code"] = self.done_status_code
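This appears to be the "skip issues" fix from the commit title: the semi-prepared check is hoisted above the "index in report.index" branch, so a semi-prepared case is routed to comparison or tumor extraction (and continues to the next case) before the existing-case hash logic can claim it. A toy reconstruction of the reordered control flow, with all names as hypothetical stand-ins for the stage's real helpers:

def route_case(index, is_semiprepared, known_indices):
    # New order: semi-prepared routing runs first, so it can no longer be
    # shadowed by the generic handling of cases already present in the report.
    if is_semiprepared:
        return "comparison_or_tumor_extraction"
    if index in known_indices:
        return "hash_check_existing_case"
    return "fresh_case_processing"

print(route_case("AAAC_0|2008.03.30", True, {"AAAC_0|2008.03.30"}))  # routed, not skipped
print(route_case("AAAC_1|2008.12.17", False, set()))                 # -> fresh_case_processing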
5 changes: 3 additions & 2 deletions mlcubes/data_preparation/project/stages/nifti_transform.py
@@ -11,10 +11,11 @@

 class NIfTITransform(RowStage):
     def __init__(
-        self, data_csv: str, out_path: str, prev_stage_path: str, metadata_path: str
+        self, data_csv: str, out_path: str, prev_stage_path: str, metadata_path: str, data_out: str,
     ):
         self.data_csv = data_csv
         self.out_path = out_path
+        self.data_out = data_out
         self.prev_stage_path = prev_stage_path
         self.metadata_path = metadata_path
         os.makedirs(self.out_path, exist_ok=True)
@@ -85,7 +86,7 @@ def __process_case(self, index: Union[str, int]):

     def __update_prev_stage_state(self, index: Union[str, int], report: pd.DataFrame):
         prev_data_path = report.loc[index]["data_path"]
-        prev_data_path = unnormalize_path(prev_data_path, "mlcube_io3")
+        prev_data_path = unnormalize_path(prev_data_path, self.data_out)
         shutil.rmtree(prev_data_path)

     def __undo_current_stage_changes(self, index: Union[str, int]):
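With models added as a new input, the output mount is no longer guaranteed to sit at mlcube_io3, so the stage now receives the actual data_out path instead of a hardcoded prefix. A hedged sketch of what an unnormalize_path helper of this shape plausibly does; the real implementation lives elsewhere in this repository and may differ, and the example paths are invented:

import os

def unnormalize_path(path: str, prefix: str) -> str:
    # Re-attach the current run's mount prefix to a report path that
    # normalize_report_paths (pipeline.py, below) previously stripped.
    return os.path.join(prefix, path.lstrip(os.sep))

print(unnormalize_path("/AAAC_0/2008.03.30", "/mlcube_io5/data"))
# -> /mlcube_io5/data/AAAC_0/2008.03.30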
5 changes: 3 additions & 2 deletions mlcubes/data_preparation/project/stages/pipeline.py
@@ -19,8 +19,9 @@ def normalize_report_paths(report: DataFrame) -> DataFrame:
     Returns:
         DataFrame: report with transformed paths
     """
-    report["data_path"] = report["data_path"].str.split("mlcube_io3").str[-1]
-    report["labels_path"] = report["labels_path"].str.split("mlcube_io3").str[-1]
+    pattern = r"mlcube_io\d+"
+    report["data_path"] = report["data_path"].str.split(pattern).str[-1]
+    report["labels_path"] = report["labels_path"].str.split(pattern).str[-1]
     return report
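A quick check of the generalized normalization: any mount index is stripped now, not just mlcube_io3. pandas treats a multi-character pat as a regular expression by default in Series.str.split, so no regex=True is needed; the example paths are invented:

import pandas as pd

report = pd.DataFrame({
    "data_path": ["/mlcube_io5/AAAC_0/2008.03.30"],
    "labels_path": ["/mlcube_io6/AAAC_0/2008.03.30"],
})
pattern = r"mlcube_io\d+"
report["data_path"] = report["data_path"].str.split(pattern).str[-1]
report["labels_path"] = report["labels_path"].str.split(pattern).str[-1]
print(report["data_path"][0])  # -> /AAAC_0/2008.03.30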
