Skip to content

Commit

Permalink
Cleanup of prepare-training-data-bert
Browse files (browse the repository at this point in the history)
  • Loading branch information
arjunsuresh committed Jun 12, 2023
1 parent 3f9a9bc commit 30fe073
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 10 deletions.
2 changes: 1 addition & 1 deletion cm-mlops/script/download-file/customize.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def preprocess(i):
elif env['CM_DOWNLOAD_TOOL'] == "curl":
env['CM_DOWNLOAD_CMD'] = f"curl {extra_download_options} {url}"
elif env['CM_DOWNLOAD_TOOL'] == "gdown":
- env['CM_DOWNLOAD_CMD'] = f"gdown -c {extra_download_options} {url}"
+ env['CM_DOWNLOAD_CMD'] = f"gdown {extra_download_options} {url}"

filename = env['CM_DOWNLOAD_FILENAME']
env['CM_DOWNLOAD_DOWNLOADED_FILENAME'] = filename
Expand Down
24 changes: 16 additions & 8 deletions cm-mlops/script/prepare-training-data-bert/_cm.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
"default": true,
"deps": [
{
- "tags": "get,git,repo,_repo.https://github.com/mlcommons/training_results_v2.1"
+ "tags": "get,git,repo,_repo.https://github.com/mlcommons/training_results_v2.1",
+ "extra_cache_tags": "mlperf,training,results"
}
],
"prehook_deps": [
Expand All @@ -36,22 +37,25 @@
"env": {
"CM_DOWNLOAD_FILENAME": "bert_config.json",
"CM_DOWNLOAD_PATH": "<<<CM_BERT_CONFIG_DOWNLOAD_DIR>>>"
- }
+ },
+ "extra_cache_tags": "mlperf,training,bert,config"
},
{
"tags": "download,file,_gdown,_url.https://drive.google.com/uc?id=1USK108J6hMM_d27xCHi738qBL8_BT1u1",
"env": {
"CM_DOWNLOAD_FILENAME": "vocab.txt",
"CM_DOWNLOAD_PATH": "<<<CM_BERT_VOCAB_DOWNLOAD_DIR>>>"
- }
+ },
+ "extra_cache_tags": "bert,vocab"
},
{
"tags": "download,file,_gdown,_url.https://drive.google.com/uc?id=1tmMgLwoBvbEJEHXh77sqrXYw5RpqT8R_",
"env": {
"CM_DOWNLOAD_FILENAME": "bert_reference_results_text_md5.txt",
"CM_DOWNLOAD_PATH": "<<<CM_BERT_DATA_DOWNLOAD_DIR>>>",
"CM_DOWNLOAD_FINAL_ENV_NAME": "CM_BERT_REFERENCE_RESULTS_TEXT_MD5_FILE_PATH"
- }
+ },
+ "extra_cache_tags": "bert,data,results,md5"
},
{
"tags": "download-and-extract,file,_gdown,_extract,_url.https://drive.google.com/uc?id=14xV2OUGSQDG_yDBrmbSdcDC-QGeqpfs_",
Expand All @@ -61,28 +65,32 @@
"CM_DOWNLOAD_PATH": "<<<CM_BERT_DATA_DOWNLOAD_DIR>>>",
"CM_EXTRACT_PATH": "<<<CM_BERT_DATA_DOWNLOAD_DIR>>>",
"CM_EXTRACT_EXTRACTED_CHECKSUM_FILE": "<<<CM_BERT_REFERENCE_RESULTS_TEXT_MD5_FILE_PATH>>>"
- }
+ },
+ "extra_cache_tags": "bert,data,results"
},
{
"tags": "download,file,_gdown,_url.https://drive.google.com/uc?id=1chiTBljF0Eh1U5pKs6ureVHgSbtU8OG_",
"env": {
"CM_DOWNLOAD_FILENAME": "model.ckpt-28252.data-00000-of-00001",
"CM_DOWNLOAD_PATH": "<<<CM_BERT_CHECKPOINT_DOWNLOAD_DIR>>>"
- }
+ },
+ "extra_cache_tags": "bert,checkpoint,data"
},
{
"tags": "download,file,_gdown,_url.https://drive.google.com/uc?id=1Q47V3K3jFRkbJ2zGCrKkKk-n0fvMZsa0",
"env": {
"CM_DOWNLOAD_FILENAME": "model.ckpt-28252.index",
"CM_DOWNLOAD_PATH": "<<<CM_BERT_CHECKPOINT_DOWNLOAD_DIR>>>"
- }
+ },
+ "extra_cache_tags": "bert,checkpoint,index"
},
{
"tags": "download,file,_gdown,_url.https://drive.google.com/uc?id=1vAcVmXSLsLeQ1q7gvHnQUSth5W_f_pwv",
"env": {
"CM_DOWNLOAD_FILENAME": "model.ckpt-28252.meta",
"CM_DOWNLOAD_PATH": "<<<CM_BERT_CHECKPOINT_DOWNLOAD_DIR>>>"
- }
+ },
+ "extra_cache_tags": "bert,checkpoint,meta"
}
],
"env": {
Expand Down
1 change: 0 additions & 1 deletion cm-mlops/script/prepare-training-data-bert/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,4 @@ CUR=${CM_DATA_DIR:-"$PWD/data"}
run "cd \"${CM_RUN_DIR}\""
run "docker build --pull -t mlperf-nvidia:language_model ."
run "ID=`docker run -dt --runtime=nvidia --ipc=host -v $CUR:/workspace/bert_data mlperf-nvidia:language_model bash`"
- #run "docker exec $ID bash -c 'python3 -m pip install --upgrade gdown && cd /workspace/bert && ./input_preprocessing/prepare_data.sh -s --outputdir /workspace/bert_data'"
run "docker exec $ID bash -c 'cd /workspace/bert && ./input_preprocessing/prepare_data.sh -s --outputdir /workspace/bert_data'"

0 comments on commit 30fe073

Please sign in to comment.