diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 23b8119..454b8bc 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -18,7 +18,9 @@ }, "extensions": [ "ms-python.python", - "ms-toolsai.jupyter" + "ms-toolsai.jupyter", + "ms-vsliveshare.vsliveshare", // Live Share extension + "github.copilot" // GitHub Copilot extension ], "remoteUser": "vscode" } diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000..528f30c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000..95f830f --- /dev/null +++ b/.dvc/config @@ -0,0 +1,6 @@ +[core] + remote = biobricks.ai +['remote "biobricks.ai"'] + url = https://ins-dvc.s3.amazonaws.com/insdvc +['remote "s3.biobricks.ai"'] + url = s3://ins-dvc/insdvc \ No newline at end of file diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 0000000..5197305 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3f72fce --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/download +/brick +status.txt \ No newline at end of file diff --git a/README b/README new file mode 100644 index 0000000..dde937d --- /dev/null +++ b/README @@ -0,0 +1,6 @@ +# SMRT Small Molecule Retention Time + + +This dataset is available on figshare at + +https://figshare.com/articles/dataset/The_METLIN_small_molecule_dataset_for_machine_learning-based_retention_time_prediction/8038913 \ No newline at end of file diff --git a/code/00_status.py b/code/00_status.py new file mode 100644 index 0000000..3c27637 --- /dev/null +++ b/code/00_status.py @@ -0,0 +1 @@ +# PURPOSE: CHECK IF THE SOURCE HAS CHANGED diff --git a/code/01_download.py b/code/01_download.py new file mode 100644 index 0000000..6f05903 --- /dev/null +++ b/code/01_download.py @@ -0,0 +1,7 @@ +# PURPOSE: DOWNLOAD THE SMRT DATA TO THE ./download DIRECTORY +import os + +# downloads to the ./download directory +os.makedirs('download', exist_ok=True) + +# read the data from the ./download directory diff --git a/code/02_process.py b/code/02_process.py new file mode 100644 index 0000000..cbf4857 --- /dev/null +++ b/code/02_process.py @@ -0,0 +1,7 @@ +# PURPOSE: CHANGE THE DOWNLOADED DATA TO ONE OR MORE PARQUET FILES +import os + +# exports to the ./brick directory +os.makedirs('brick', exist_ok=True) + +# read the data from the ./download directory diff --git a/dvc.lock b/dvc.lock new file mode 100644 index 0000000..f494059 --- /dev/null +++ b/dvc.lock @@ -0,0 +1,40 @@ +schema: '2.0' +stages: + status: + cmd: python code/00_status.py + deps: + - path: code/00_status.py + hash: md5 + md5: 95a09d63c054eb185a1408771f4ee8a3 + size: 43 + download: + cmd: python code/01_download.py + deps: + - path: code/01_download.py + hash: md5 + md5: f82fd5fc2597b90ed411991180e4ac30 + size: 195 + outs: + - path: download/ + hash: md5 + md5: d751713988987e9331980363e24189ce.dir + size: 0 + nfiles: 0 + process: + cmd: python code/02_process.py + deps: + - path: code/02_process.py + hash: md5 + md5: c520d6a17cb1fb7e47155d606ce80701 + size: 197 + - path: download/ + hash: md5 + md5: d751713988987e9331980363e24189ce.dir + size: 0 + nfiles: 0 + outs: + - path: brick/ + hash: md5 + md5: d751713988987e9331980363e24189ce.dir + size: 0 + nfiles: 0 diff --git a/dvc.yaml b/dvc.yaml new file mode 100644 index 0000000..e6dd000 --- /dev/null +++ b/dvc.yaml @@ -0,0 +1,17 @@ +stages: + status: + cmd: python code/00_status.py + download: + cmd: python code/01_download.py + deps: + - status.txt + - code/01_download.py + outs: + - download/ + process: + cmd: python code/02_process.py + deps: + - download/ + - code/02_process.py + outs: + - brick/ \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 45684e1..f3a121b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,7 @@ -python-dotenv -pandas -biobricks -fastparquet -pyarrow +python-dotenv==1.0.1 +pandas==2.2.2 +biobricks==0.3.7 +fastparquet==2024.5.0 +pyarrow==16.1.0 +dvc==3.51.1 +dvc-s3==3.2.0 \ No newline at end of file