Skip to content

Commit

Permalink
Add a script for downloading the embeddings cache easily
Browse files Browse the repository at this point in the history
  • Loading branch information
Vita Midori committed Dec 2, 2024
1 parent 2aa651d commit 3127e5a
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 10 deletions.
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
data/consultation-documents/*
data/dataframes/*
data/embeddings-cache/*
data/dataframes/*.parquet
data/embeddings-cache/*.parquet

research/document_types/mlruns.db
research/document_types/mlruns/*
Expand Down
28 changes: 28 additions & 0 deletions data/embeddings-cache/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""Run this script with `uv run data/embeddings-cache/download.py` to download embeddings caches from Exoscale.
This overwrites your local cache!
"""

import logging
import pathlib
import sys

import dotenv

CACHE_FILES = ("openai--text-embedding-3-large.parquet",)

# This script lives inside the cache directory, so downloads land right next to it.
CACHE_DIRECTORY = pathlib.Path(__file__).parent
# The repository root sits two levels above the cache directory.
REPOSITORY_ROOT = CACHE_DIRECTORY.resolve().parent.parent

# The script is run directly (not as part of a package), so the repository root
# has to be put on the import path before `research` can be imported.
sys.path.append(str(REPOSITORY_ROOT))

import research.lib.data_access  # noqa: E402


def main() -> None:
    """Download every file listed in ``CACHE_FILES`` from Exoscale into ``CACHE_DIRECTORY``."""
    logging.basicConfig(level=logging.INFO)
    # Load environment variables from a .env file before the download helper runs.
    dotenv.load_dotenv()
    remote_directory = pathlib.Path("tmp") / "embeddings-cache"
    for file_name in CACHE_FILES:
        research.lib.data_access.download_file_from_exoscale(
            remote_path=remote_directory / file_name,
            local_path=CACHE_DIRECTORY / file_name,
        )


if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,4 +56,5 @@ ignore = [
"D212", # `multi-line-summary-first-line`; do not enforce this little detail
"D213", # `multi-line-summary-second-line`; do not enforce this little detail
"ISC001", # `single-line-implicit-string-concatenation`; conflicts with the formatter
"INP001", # `implicit-namespace-package`; nonsense for some directories
]
22 changes: 14 additions & 8 deletions research/lib/data_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,25 +10,31 @@
from demokratis_ml.data import schemata


def download_file_from_exoscale(remote_path: pathlib.Path, local_path: pathlib.Path) -> None:
    """Download an arbitrary file from our Exoscale Simple Object Storage bucket.

    The connection is configured from the environment variables
    ``EXOSCALE_SOS_ACCESS_KEY``, ``EXOSCALE_SOS_SECRET_KEY``, ``EXOSCALE_SOS_ENDPOINT``
    and ``EXOSCALE_SOS_BUCKET``.

    Args:
        remote_path: Location of the object inside the bucket, given as a relative path.
        local_path: Destination file. Missing parent directories are created.

    Raises:
        KeyError: If any of the required environment variables is not set.
    """
    logger = logging.getLogger("download_file_from_exoscale")
    s3 = boto3.client(
        "s3",
        aws_access_key_id=os.environ["EXOSCALE_SOS_ACCESS_KEY"],
        aws_secret_access_key=os.environ["EXOSCALE_SOS_SECRET_KEY"],
        endpoint_url=os.environ["EXOSCALE_SOS_ENDPOINT"],
    )
    bucket_name = os.environ["EXOSCALE_SOS_BUCKET"]
    # Object keys use forward slashes regardless of the local OS path separator,
    # so do not rely on str(path), which yields backslashes on Windows.
    object_key = remote_path.as_posix()
    local_path.parent.mkdir(parents=True, exist_ok=True)
    logger.info("Downloading %s from bucket %s to %s", object_key, bucket_name, local_path)
    # boto3 documents the destination filename as a str, so convert explicitly.
    s3.download_file(bucket_name, object_key, str(local_path))


def ensure_dataframe_is_available(local_path: pathlib.Path) -> None:
    """Fetch a dataframe from our Exoscale Simple Object Storage bucket unless it is already on disk.

    The remote object is looked up under ``dataframes/`` using the file name of ``local_path``.
    If ``local_path`` already exists, nothing is downloaded.
    """
    log = logging.getLogger("ensure_dataframe_is_available")
    if local_path.exists():
        log.info("File %s already exists locally.", local_path)
    else:
        remote_location = pathlib.Path("dataframes") / local_path.name
        download_file_from_exoscale(remote_location, local_path)


@pa.check_output(schemata.FullConsultationDocumentSchemaV1.to_schema())
def load_consultation_documents(
input_file: pathlib.Path,
Expand Down

0 comments on commit 3127e5a

Please sign in to comment.