diff --git a/.dvc/.gitignore b/.dvc/.gitignore
new file mode 100644
index 000000000..528f30c71
--- /dev/null
+++ b/.dvc/.gitignore
@@ -0,0 +1,3 @@
+/config.local
+/tmp
+/cache
diff --git a/.dvc/config b/.dvc/config
new file mode 100644
index 000000000..aefccf2ac
--- /dev/null
+++ b/.dvc/config
@@ -0,0 +1,4 @@
+[core]
+    remote = storage
+['remote "storage"']
+    url = gdrive://1bozOw-FD0JkthpUQnkrJqxYtU2Uwj6E8
diff --git a/Data/Soil/.gitignore b/Data/Soil/.gitignore
new file mode 100644
index 000000000..b5c873d4f
--- /dev/null
+++ b/Data/Soil/.gitignore
@@ -0,0 +1,3 @@
+/processed
+/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif
+/soilcarbon.ovr
diff --git a/Data/Soil/soilcarbon.ovr.dvc b/Data/Soil/soilcarbon.ovr.dvc
new file mode 100644
index 000000000..0ce8f3eec
--- /dev/null
+++ b/Data/Soil/soilcarbon.ovr.dvc
@@ -0,0 +1,4 @@
+outs:
+- md5: 33956e4dd24c1caa1dcea956e85e1f5f
+  size: 131473
+  path: soilcarbon.ovr
diff --git a/dvc.lock b/dvc.lock
new file mode 100644
index 000000000..bc154f352
--- /dev/null
+++ b/dvc.lock
@@ -0,0 +1,40 @@
+schema: '2.0'
+stages:
+  prepare:
+    cmd:
+    - "gdalwarp -s_srs EPSG:4326 -t_srs EPSG:4326 \\\n -to SRC_METHOD=NO_GEOTRANSFORM\
+      \ -tr 0.5 0.5 \\\n -r near -te -180.0 -90.0 180.0 90.0 -te_srs EPSG:4326\
+      \ \\\n -of GTiff Data/Soil/soilcarbon.ovr \\\n Data/Soil/prepared/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif"
+    deps:
+    - path: Data/Soil/soilcarbon.ovr
+      md5: 33956e4dd24c1caa1dcea956e85e1f5f
+      size: 131473
+    outs:
+    - path: Data/Soil/prepared/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif
+      md5: 96f78155b79a835f56d019586d4c1f14
+      size: 1038282
+  transform:
+    cmd:
+    - "python scripts/transform.py --input tmp_unzip_path/data/commonData_Data0_soilcarbon.ovr\
+      \ \\\n --output Data/Soil/processed/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif"
+    deps:
+    - path: tmp_unzip_path/data/commonData_Data0_soilcarbon.ovr
+      md5: 33956e4dd24c1caa1dcea956e85e1f5f
+      size: 131473
+    outs:
+    - path: Data/Soil/processed/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif
+      md5: 96f78155b79a835f56d019586d4c1f14
+      size: 1038282
+  Load:
+    cmd:
+    - cp Data/Soil/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif Data/Soil/processed/
+    - rm -f Data/Soil/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif
+  extract:
+    cmd:
+    - python scripts/extract.py
+    - python scripts/rename_files.py
+  load:
+    cmd:
+    - dvc push
+  clean:
+    cmd: rm -rf tmp_unzip_path
diff --git a/dvc.yaml b/dvc.yaml
new file mode 100644
index 000000000..4c6ad9bcb
--- /dev/null
+++ b/dvc.yaml
@@ -0,0 +1,19 @@
+stages:
+  extract:
+    cmd:
+      - python scripts/extract.py
+      - python scripts/rename_files.py
+  transform:
+    cmd:
+      - >-
+        python scripts/transform.py --input tmp_unzip_path/data/commonData_Data0_soilcarbon.ovr \
+          --output Data/Soil/processed/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif
+    deps:
+      - tmp_unzip_path/data/commonData_Data0_soilcarbon.ovr
+    outs:
+      - Data/Soil/processed/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif
+  load:
+    cmd:
+      - dvc push
+  clean:
+    cmd: rm -rf tmp_unzip_path
\ No newline at end of file
diff --git a/scripts/extract.py b/scripts/extract.py
new file mode 100644
--- /dev/null
+++ b/scripts/extract.py
@@ -0,0 +1,58 @@
+"""Download the zipped soil-carbon dataset and unpack it into tmp_unzip_path."""
+import argparse
+import requests
+import zipfile
+import io
+import os
+import sys
+
+# Pre-signed S3 link: it embeds an expiry time, so it stops working once that
+# passes. Pass --url with a fresh link instead of editing this file.
+URL = "https://databasin2-filestore.s3.amazonaws.com:443/a4cb6d367eae4e52a08902874f8bfedf/download/a4cb6d367eae4e52a08902874f8bfedf_1_zip_en.zip?Signature=O6QSoOR%2BisIRVE2mpxzkphTkhmw%3D&Expires=1680100356&AWSAccessKeyId=AKIAI4RK5BEPK3FCQPUQ"
+
+# Seconds to wait on the server before giving up instead of hanging forever.
+REQUEST_TIMEOUT = 60
+
+
+def ensure_url_is_accessible(URL):
+    """Download URL and unpack it; return True on success, False otherwise."""
+    try:
+        r = requests.get(URL, timeout=REQUEST_TIMEOUT)
+    except requests.RequestException as e:
+        print(e)
+        return False
+    if not r.ok:
+        print("Download link expired. Please update download link")
+        return False
+    return download_and_unzip_files(r.content)
+
+
+def download_and_unzip_files(content):
+    """Extract zip bytes into ./tmp_unzip_path; return True on success."""
+    target_parent_dir = os.path.join(os.getcwd(), 'tmp_unzip_path')
+    # makedirs(exist_ok=True) avoids the check-then-create race of exists()+mkdir().
+    os.makedirs(target_parent_dir, exist_ok=True)
+    try:
+        # The context manager closes the archive even if extraction fails.
+        with zipfile.ZipFile(io.BytesIO(content)) as z:
+            z.extractall(target_parent_dir)
+    except (zipfile.BadZipFile, OSError) as e:
+        print(e)
+        return False
+    print("unzipped successfully")
+    return True
+
+
+def main():
+    """Parse the command line, run the download, and exit non-zero on failure."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument('--url', default=URL, help="Archive URL to download")
+    args = parser.parse_args()
+    # A non-zero exit status lets `dvc repro` stop instead of running later stages
+    # against a missing tmp_unzip_path.
+    if not ensure_url_is_accessible(args.url):
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/rename_files.py b/scripts/rename_files.py
new file mode 100644
index 000000000..a95438ba1
--- /dev/null
+++ b/scripts/rename_files.py
@@ -0,0 +1,12 @@
+import os
+
+current_directory = os.getcwd()
+target_parent_dir = os.path.join(current_directory, r'tmp_unzip_path/data')
+if os.path.exists(target_parent_dir):
+    for file_name in os.listdir(target_parent_dir):
+        if '\\' in file_name:
+            old_file_name = os.path.join(target_parent_dir, file_name)
+            filename = os.fsdecode(file_name)
+            changed_name = filename.replace("\\", "_")
+            new_file_name = os.path.join(target_parent_dir, changed_name)
+            os.rename(old_file_name,new_file_name)
\ No newline at end of file
diff --git a/scripts/transform.py b/scripts/transform.py
new file mode 100644
--- /dev/null
+++ b/scripts/transform.py
@@ -0,0 +1,59 @@
+"""Resample a raster onto a global 0.5-degree EPSG:4326 grid with gdalwarp."""
+import subprocess
+import argparse
+import os
+import shlex
+import sys
+
+# Fixed gdalwarp options; the input and output paths are appended per run.
+GDALWARP_ARGS = [
+    "gdalwarp", "-s_srs", "EPSG:4326", "-t_srs", "EPSG:4326",
+    "-to", "SRC_METHOD=NO_GEOTRANSFORM", "-tr", "0.5", "0.5",
+    "-r", "near", "-te", "-180.0", "-90.0", "180.0", "90.0",
+    "-te_srs", "EPSG:4326", "-of", "GTiff",
+]
+
+
+def run_shell_cmd(cmd):
+    """Run a command and return its decoded output.
+
+    cmd is either a command string, split with shell-like quoting rules, or an
+    already-split argument list. Returns stdout when the command wrote any,
+    otherwise stderr. Raises subprocess.CalledProcessError on a non-zero exit
+    status and OSError when the executable cannot be started.
+    """
+    argv = shlex.split(cmd) if isinstance(cmd, str) else list(cmd)
+    # stderr must be piped as well; without it there is nothing to report.
+    p = subprocess.run(argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                       check=True)
+    return (p.stdout or p.stderr).decode('utf-8', 'replace')
+
+
+def main():
+    """Parse --input/--output and warp the input raster to the output path."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input', required=True, help="Directory of file to transform")
+    parser.add_argument('--output', required=True, help="Directory for transformed files")
+    args = parser.parse_args()
+
+    # Make sure the output's parent directory exists before gdalwarp writes to it.
+    out_dir = os.path.dirname(args.output)
+    if out_dir:
+        os.makedirs(out_dir, exist_ok=True)
+
+    try:
+        # Paths go in as separate argv items so spaces in them survive intact.
+        output = run_shell_cmd(GDALWARP_ARGS + [args.input, args.output])
+    except (OSError, subprocess.CalledProcessError) as e:
+        print(e)
+        stderr = getattr(e, 'stderr', None)
+        if stderr:
+            print(stderr.decode('utf-8', 'replace'))
+        # Exit non-zero so the calling pipeline sees the failure.
+        sys.exit(1)
+    if output:
+        print(output)
+
+
+if __name__ == "__main__":
+    main()