From 30fe073f20609c24b31873dbab7cd8b1aa5926f7 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Mon, 12 Jun 2023 10:54:27 +0100 Subject: [PATCH] Cleanup of prepare-training-data-bert --- cm-mlops/script/download-file/customize.py | 2 +- .../prepare-training-data-bert/_cm.json | 24 ++++++++++++------- .../script/prepare-training-data-bert/run.sh | 1 - 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/cm-mlops/script/download-file/customize.py b/cm-mlops/script/download-file/customize.py index caa38ddffa..6e9bd7e048 100644 --- a/cm-mlops/script/download-file/customize.py +++ b/cm-mlops/script/download-file/customize.py @@ -49,7 +49,7 @@ def preprocess(i): elif env['CM_DOWNLOAD_TOOL'] == "curl": env['CM_DOWNLOAD_CMD'] = f"curl {extra_download_options} {url}" elif env['CM_DOWNLOAD_TOOL'] == "gdown": - env['CM_DOWNLOAD_CMD'] = f"gdown -c {extra_download_options} {url}" + env['CM_DOWNLOAD_CMD'] = f"gdown {extra_download_options} {url}" filename = env['CM_DOWNLOAD_FILENAME'] env['CM_DOWNLOAD_DOWNLOADED_FILENAME'] = filename diff --git a/cm-mlops/script/prepare-training-data-bert/_cm.json b/cm-mlops/script/prepare-training-data-bert/_cm.json index fd1d1ddd35..99bdda94a8 100644 --- a/cm-mlops/script/prepare-training-data-bert/_cm.json +++ b/cm-mlops/script/prepare-training-data-bert/_cm.json @@ -27,7 +27,8 @@ "default": true, "deps": [ { - "tags": "get,git,repo,_repo.https://github.com/mlcommons/training_results_v2.1" + "tags": "get,git,repo,_repo.https://github.com/mlcommons/training_results_v2.1", + "extra_cache_tags": "mlperf,training,results" } ], "prehook_deps": [ @@ -36,14 +37,16 @@ "env": { "CM_DOWNLOAD_FILENAME": "bert_config.json", "CM_DOWNLOAD_PATH": "<<>>" - } + }, + "extra_cache_tags": "mlperf,training,bert,config" }, { "tags": "download,file,_gdown,_url.https://drive.google.com/uc?id=1USK108J6hMM_d27xCHi738qBL8_BT1u1", "env": { "CM_DOWNLOAD_FILENAME": "vocab.txt", "CM_DOWNLOAD_PATH": "<<>>" - } + }, + "extra_cache_tags": "bert,vocab" }, { "tags": "download,file,_gdown,_url.https://drive.google.com/uc?id=1tmMgLwoBvbEJEHXh77sqrXYw5RpqT8R_", @@ -51,7 +54,8 @@ "CM_DOWNLOAD_FILENAME": "bert_reference_results_text_md5.txt", "CM_DOWNLOAD_PATH": "<<>>", "CM_DOWNLOAD_FINAL_ENV_NAME": "CM_BERT_REFERENCE_RESULTS_TEXT_MD5_FILE_PATH" - } + }, + "extra_cache_tags": "bert,data,results,md5" }, { "tags": "download-and-extract,file,_gdown,_extract,_url.https://drive.google.com/uc?id=14xV2OUGSQDG_yDBrmbSdcDC-QGeqpfs_", @@ -61,28 +65,32 @@ "CM_DOWNLOAD_PATH": "<<>>", "CM_EXTRACT_PATH": "<<>>", "CM_EXTRACT_EXTRACTED_CHECKSUM_FILE": "<<>>" - } + }, + "extra_cache_tags": "bert,data,results" }, { "tags": "download,file,_gdown,_url.https://drive.google.com/uc?id=1chiTBljF0Eh1U5pKs6ureVHgSbtU8OG_", "env": { "CM_DOWNLOAD_FILENAME": "model.ckpt-28252.data-00000-of-00001", "CM_DOWNLOAD_PATH": "<<>>" - } + }, + "extra_cache_tags": "bert,checkpoint,data" }, { "tags": "download,file,_gdown,_url.https://drive.google.com/uc?id=1Q47V3K3jFRkbJ2zGCrKkKk-n0fvMZsa0", "env": { "CM_DOWNLOAD_FILENAME": "model.ckpt-28252.index", "CM_DOWNLOAD_PATH": "<<>>" - } + }, + "extra_cache_tags": "bert,checkpoint,index" }, { "tags": "download,file,_gdown,_url.https://drive.google.com/uc?id=1vAcVmXSLsLeQ1q7gvHnQUSth5W_f_pwv", "env": { "CM_DOWNLOAD_FILENAME": "model.ckpt-28252.meta", "CM_DOWNLOAD_PATH": "<<>>" - } + }, + "extra_cache_tags": "bert,checkpoint,meta" } ], "env": { diff --git a/cm-mlops/script/prepare-training-data-bert/run.sh b/cm-mlops/script/prepare-training-data-bert/run.sh index 789edbfab8..23cd41289b 100644 --- a/cm-mlops/script/prepare-training-data-bert/run.sh +++ b/cm-mlops/script/prepare-training-data-bert/run.sh @@ -30,5 +30,4 @@ CUR=${CM_DATA_DIR:-"$PWD/data"} run "cd \"${CM_RUN_DIR}\"" run "docker build --pull -t mlperf-nvidia:language_model ." run "ID=`docker run -dt --runtime=nvidia --ipc=host -v $CUR:/workspace/bert_data mlperf-nvidia:language_model bash`" -#run "docker exec $ID bash -c 'python3 -m pip install --upgrade gdown && cd /workspace/bert && ./input_preprocessing/prepare_data.sh -s --outputdir /workspace/bert_data'" run "docker exec $ID bash -c 'cd /workspace/bert && ./input_preprocessing/prepare_data.sh -s --outputdir /workspace/bert_data'"