Add a script for downloading the embeddings cache easily

Demokratis-ch · Dec 2, 2024 · 3127e5a · 3127e5a
1 parent 2aa651d
commit 3127e5a
Show file tree

Hide file tree

Showing 4 changed files with 45 additions and 10 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,6 @@
 data/consultation-documents/*
-data/dataframes/*
-data/embeddings-cache/*
+data/dataframes/*.parquet
+data/embeddings-cache/*.parquet
 
 research/document_types/mlruns.db
 research/document_types/mlruns/*

diff --git a/data/embeddings-cache/download.py b/data/embeddings-cache/download.py
@@ -0,0 +1,28 @@
+"""Run this script with `uv run data/embeddings-cache/download.py` to download embeddings caches from Exoscale.
+
+This overwrites your local cache!
+"""
+
+import logging
+import pathlib
+import sys
+
+import dotenv
+
+CACHE_FILES = ("openai--text-embedding-3-large.parquet",)  # file names are the same locally and in the bucket
+
+CACHE_DIRECTORY = pathlib.Path(__file__).parent
+REPOSITORY_ROOT = CACHE_DIRECTORY.joinpath("..", "..").resolve()
+
+# This file is run as a standalone script, so the repository root must be on `sys.path` to import `research`.
+sys.path.append(str(REPOSITORY_ROOT))
+
+import research.lib.data_access  # noqa: E402
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    dotenv.load_dotenv()  # load settings from a .env file into the environment before downloading
+    remote_directory = pathlib.Path("tmp") / "embeddings-cache"
+    download = research.lib.data_access.download_file_from_exoscale
+    for file_name in CACHE_FILES:
+        download(remote_path=remote_directory / file_name, local_path=CACHE_DIRECTORY / file_name)
diff --git a/pyproject.toml b/pyproject.toml
@@ -56,4 +56,5 @@ ignore = [
     "D212",  # `multi-line-summary-first-line`; do not enforce this little detail
     "D213",  # `multi-line-summary-second-line`; do not enforce this little detail
     "ISC001",  # `single-line-implicit-string-concatenation`; conflicts with the formatter
+    "INP001",  # `implicit-namespace-package`; nonsense for some directories
 ]
diff --git a/research/lib/data_access.py b/research/lib/data_access.py
@@ -10,25 +10,31 @@
 from demokratis_ml.data import schemata
 
 
-def ensure_dataframe_is_available(local_path: pathlib.Path) -> None:
-    """Download a dataframe from an Exoscale Simple Object Storage bucket if it is not already available locally."""
-    logger = logging.getLogger("ensure_dataframe_is_available")
-    if local_path.exists():
-        logger.info("File %s already exists locally.", local_path)
-        return
+def download_file_from_exoscale(remote_path: pathlib.Path, local_path: pathlib.Path) -> None:
+    """Download an arbitrary file from our Exoscale Simple Object Storage bucket.
+
+    `remote_path` is the object's key inside the bucket and `local_path` is where the file is written; missing
+    parent directories of `local_path` are created. The connection settings are read from the `EXOSCALE_SOS_*`
+    environment variables, so a `KeyError` is raised when any of them is unset. No check is made for an
+    existing local file.
+    """
+    logger = logging.getLogger("download_file_from_exoscale")
     s3 = boto3.client(
         "s3",
         aws_access_key_id=os.environ["EXOSCALE_SOS_ACCESS_KEY"],
         aws_secret_access_key=os.environ["EXOSCALE_SOS_SECRET_KEY"],
         endpoint_url=os.environ["EXOSCALE_SOS_ENDPOINT"],
     )
     bucket_name = os.environ["EXOSCALE_SOS_BUCKET"]
-    remote_path = pathlib.Path("dataframes") / local_path.name
+    # Object keys always use "/" as the separator, whereas `str()` of a path yields backslashes on Windows.
+    remote_key = remote_path.as_posix()
     local_path.parent.mkdir(parents=True, exist_ok=True)
-    logger.warning("Downloading %s from bucket %s to %s", remote_path, bucket_name, local_path)
-    s3.download_file(bucket_name, str(remote_path), local_path)
+    logger.info("Downloading %s from bucket %s to %s", remote_key, bucket_name, local_path)
+    s3.download_file(bucket_name, remote_key, str(local_path))
 
 
+def ensure_dataframe_is_available(local_path: pathlib.Path) -> None:
+    """Make sure the dataframe file at `local_path` exists, fetching it from Exoscale when it is missing.
+
+    The remote object is looked up under `dataframes/` using the file name of `local_path`. When the local
+    file already exists, nothing is downloaded.
+    """
+    log = logging.getLogger("ensure_dataframe_is_available")
+    if local_path.exists():
+        log.info("File %s already exists locally.", local_path)
+    else:
+        download_file_from_exoscale(pathlib.Path("dataframes") / local_path.name, local_path)
+
+
 @pa.check_output(schemata.FullConsultationDocumentSchemaV1.to_schema())
 def load_consultation_documents(
     input_file: pathlib.Path,