Skip to content

Commit

Permalink
Create check_for_datafiles.py
Browse files Browse the repository at this point in the history
  • Loading branch information
stichbury committed Feb 4, 2025
1 parent d239ff5 commit b4c26d6
Showing 1 changed file with 48 additions and 0 deletions.
48 changes: 48 additions & 0 deletions tools/check_for_datafiles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""FUNCTION TO CHECK FOR DATA FILES IN NON-WHITELISTED FOLDERS."""

import glob
from pathlib import Path

# Extensions (without the leading dot) that the check treats as data files.
file_extensions = [
    "csv", "xlsx", "xls", "parquet", "tsv",
    "hdf5", "h5",
    "pickle", "pkl",
    "db", "sqlite", "sqlite3",
    "orc",
]
# Folders in which data files are tolerated; each entry is written
# starting from the project root dir.
whitelist_folders = ["/venv", "/vizro-core/docs", "/vizro-ai/tests/integration/reports"]


def check_for_data_files(project_dir=None, extensions=None, whitelist=None):
    """Recursively find data files that live outside the whitelisted folders.

    Args:
        project_dir: Root directory to scan. Defaults to the parent of the
            directory that contains this file.
        extensions: Iterable of file extensions (without the leading dot) that
            count as data files. Defaults to the module-level ``file_extensions``.
        whitelist: Iterable of folder paths, written relative to the project
            root (e.g. ``"/venv"``), in which data files are allowed. Defaults
            to the module-level ``whitelist_folders``.

    Raises:
        AssertionError: If data files are present in non-whitelisted folders.
    """
    # Resolve so that a relative ``__file__`` cannot collapse ``.parent.parent`` to ".".
    root = Path(__file__ if project_dir is None else Path(project_dir) / "_" / "_").resolve().parent.parent
    if extensions is None:
        extensions = file_extensions
    if whitelist is None:
        whitelist = whitelist_folders

    allowed_dirs = [root / folder.strip("/") for folder in whitelist]

    # Escape the root so glob metacharacters in the checkout path ("[", "*", "?")
    # are taken literally. glob.glob is kept (rather than Path.rglob) so hidden
    # files and directories are treated exactly as before.
    pattern_root = glob.escape(str(root))
    found_files = {
        Path(match)
        for ext in extensions
        for match in glob.glob(f"{pattern_root}/**/*.{ext}", recursive=True)
    }

    # Compare whole path components instead of string prefixes: "/venv" must not
    # whitelist a sibling such as "/venv-data", and separators must not matter.
    to_be_removed_files = sorted(
        str(path)
        for path in found_files
        if not any(allowed in path.parents for allowed in allowed_dirs)
    )

    # Raise explicitly: a bare ``assert`` statement is stripped under ``python -O``,
    # which would silently disable the check.
    if to_be_removed_files:
        raise AssertionError(f"Caution! Please remove your data files {to_be_removed_files} before merging!")


# Run the check only when executed as a script, not when imported.
if __name__ == "__main__":
    check_for_data_files()

0 comments on commit b4c26d6

Please sign in to comment.