Simplify cache by dropping two collections (#202)
* docs: ✏️ add backup/restore to migration instructions

* feat: 🎸 pass the max number of rows to the worker

* feat: 🎸 delete the 'rows' and 'columns' collections

Instead of keeping large collections of rows and columns and computing the
response on every endpoint call (possibly truncating it each time), we now
pre-compute the response and store it in the cache. We lose the ability to
retrieve the original data, but we don't need it. This fixes #197; see
#197 (comment). A short illustrative sketch is included further down, before the diffs.

BREAKING CHANGE: 🧨 the cache database structure has been modified. Run
20220408_cache_remove_dbrow_dbcolumn.py to migrate the database.

* style: 💄 fix types and style

* docs: ✏️ add parameter to avoid error in mongodump

* docs: ✏️ mark ROWS_MAX_BYTES and ROWS_MIN_NUMBER as worker vars
severo authored Apr 12, 2022
1 parent a3431ef commit 623606d
Showing 20 changed files with 562 additions and 361 deletions.
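To make the change described in the commit message concrete, here is a minimal sketch of the pre-computed-response approach. It assumes a pymongo client and a hypothetical split_responses collection; the actual models, collection names, and helper functions in the repository differ.

# Sketch only (not the repository's code): the worker pre-computes the /rows
# response for a split and stores it as a single cache document, instead of
# storing raw rows/columns collections and rebuilding (and truncating) the
# response on every endpoint call.
import json
from typing import Any, Dict, List

from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27018")
cache = client["datasets_preview_cache"]
split_responses = cache["split_responses"]  # hypothetical collection name


def upsert_rows_response(
    dataset: str, config: str, split: str, columns: List[Dict[str, Any]], rows: List[Dict[str, Any]]
) -> None:
    """Store the ready-to-serve /rows payload; the endpoint just reads it back."""
    response = {"columns": columns, "rows": rows}
    split_responses.replace_one(
        {"dataset_name": dataset, "config_name": config, "split_name": split},
        {
            "dataset_name": dataset,
            "config_name": config,
            "split_name": split,
            "response": response,
            "size_in_bytes": len(json.dumps(response).encode("utf-8")),
        },
        upsert=True,
    )

With this shape, the /rows endpoint can return the stored response field directly instead of reassembling it from rows and columns on every call.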
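The migration script named in the BREAKING CHANGE note (20220408_cache_remove_dbrow_dbcolumn.py) is not among the files shown here. Based only on the commit message, its core job is to drop the now-unused collections; a rough, hypothetical sketch, assuming the collections are literally named "rows" and "columns":

# Rough sketch only; use the actual 20220408_cache_remove_dbrow_dbcolumn.py
# script to migrate a real deployment (and back up the database first, as the
# updated migration instructions advise).
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27018")
cache = client["datasets_preview_cache"]

# Drop the obsolete collections: the pre-computed responses make them redundant.
cache.drop_collection("rows")
cache.drop_collection("columns")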
18 changes: 9 additions & 9 deletions .env.example
@@ -35,15 +35,6 @@
 # URL to connect to mongo db
 # MONGO_URL="mongodb://localhost:27018"
 
-# Max size of the /rows endpoint response in bytes
-# ROWS_MAX_BYTES=1_000_000
-
-# Max number of rows in the /rows endpoint response
-# ROWS_MAX_NUMBER=100
-
-# Min number of rows in the /rows endpoint response
-# ROWS_MIN_NUMBER=10
-
 # Number of uvicorn workers
 # WEB_CONCURRENCY = 2
 
@@ -66,6 +57,15 @@
 # Max size (in bytes) of the dataset to fallback in normal mode if streaming fails
 # MAX_SIZE_FALLBACK = 100_000_000
 
+# Max size of the /rows endpoint response in bytes
+# ROWS_MAX_BYTES=1_000_000
+
+# Max number of rows in the /rows endpoint response
+# ROWS_MAX_NUMBER=100
+
+# Min number of rows in the /rows endpoint response
+# ROWS_MIN_NUMBER=10
+
 # Number of seconds a worker will sleep before trying to process a new job
 # WORKER_SLEEP_SECONDS = 5
5 changes: 2 additions & 3 deletions README.md
@@ -38,9 +38,6 @@ Set environment variables to configure the following aspects:
 - `MONGO_CACHE_DATABASE`: the name of the database used for storing the cache. Defaults to `"datasets_preview_cache"`.
 - `MONGO_QUEUE_DATABASE`: the name of the database used for storing the queue. Defaults to `"datasets_preview_queue"`.
 - `MONGO_URL`: the URL used to connect to the mongo db server. Defaults to `"mongodb://localhost:27018"`.
-- `ROWS_MAX_BYTES`: max size of the /rows endpoint response in bytes. Defaults to `1_000_000` (1 MB).
-- `ROWS_MAX_NUMBER`: max number of rows in the /rows endpoint response. Defaults to `100`.
-- `ROWS_MIN_NUMBER`: min number of rows in the /rows endpoint response. Defaults to `10`.
 - `WEB_CONCURRENCY`: the number of workers. For now, it's ignored and hardcoded to 1 because the cache is not shared yet. Defaults to `2`.
 
 For example:
@@ -71,6 +68,8 @@ Also specify `HF_TOKEN` with an App Access Token (ask moonlanding administrators
 
 Also specify `MAX_SIZE_FALLBACK` with the maximum size in bytes of the dataset to fallback in normal mode if streaming fails. Note that it requires to have the size in the info metadata. Set to `0` to disable the fallback. Defaults to `100_000_000`.
 
+`ROWS_MIN_NUMBER` is the min number (defaults to `10`) and `ROWS_MAX_NUMBER` the max number (defaults to `100`) of rows fetched by the worker for the split, and provided in the /rows endpoint response. `ROWS_MAX_BYTES` is the max size of the /rows endpoint response in bytes. Defaults to `1_000_000` (1 MB).
+
 The `WORKER_QUEUE` variable specifies which jobs queue the worker will pull jobs from. It can be equal to `datasets` (default) or `splits`. The `datasets` jobs should be a lot faster than the `splits` ones, so that we should need a lot more workers for `splits` than for `datasets`.
 
 To warm the cache, ie. add all the missing Hugging Face datasets to the queue:
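The README paragraph added above describes the three worker variables that bound the pre-computed /rows response. As an illustration of how they are meant to interact (this is not the repository's actual truncation code), a worker could cap the row count at ROWS_MAX_NUMBER, then drop rows until the serialized payload fits ROWS_MAX_BYTES, while keeping at least ROWS_MIN_NUMBER rows:

# Illustration only: how ROWS_MIN_NUMBER / ROWS_MAX_NUMBER / ROWS_MAX_BYTES
# could interact when building the /rows response (not the actual implementation).
import json
from typing import Any, Dict, List

ROWS_MIN_NUMBER = 10
ROWS_MAX_NUMBER = 100
ROWS_MAX_BYTES = 1_000_000


def truncate_rows(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    rows = rows[:ROWS_MAX_NUMBER]  # never return more than the max number of rows
    # Drop rows from the end until the serialized payload fits the byte budget,
    # but keep at least ROWS_MIN_NUMBER rows even if the budget is exceeded.
    while len(rows) > ROWS_MIN_NUMBER and len(json.dumps(rows).encode("utf-8")) > ROWS_MAX_BYTES:
        rows = rows[:-1]
    return rows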
8 changes: 5 additions & 3 deletions src/datasets_preview_backend/config.py
@@ -44,10 +44,12 @@
 MONGO_CACHE_DATABASE = get_str_value(d=os.environ, key="MONGO_CACHE_DATABASE", default=DEFAULT_MONGO_CACHE_DATABASE)
 MONGO_QUEUE_DATABASE = get_str_value(d=os.environ, key="MONGO_QUEUE_DATABASE", default=DEFAULT_MONGO_QUEUE_DATABASE)
 MONGO_URL = get_str_value(d=os.environ, key="MONGO_URL", default=DEFAULT_MONGO_URL)
-ROWS_MAX_BYTES = get_int_value(d=os.environ, key="ROWS_MAX_BYTES", default=DEFAULT_ROWS_MAX_BYTES)
-ROWS_MAX_NUMBER = get_int_value(d=os.environ, key="ROWS_MAX_NUMBER", default=DEFAULT_ROWS_MAX_NUMBER)
-ROWS_MIN_NUMBER = get_int_value(d=os.environ, key="ROWS_MIN_NUMBER", default=DEFAULT_ROWS_MIN_NUMBER)
 WEB_CONCURRENCY = get_int_value(d=os.environ, key="WEB_CONCURRENCY", default=DEFAULT_WEB_CONCURRENCY)
 
 # Ensure datasets library uses the expected revision for canonical datasets
 os.environ["HF_SCRIPTS_VERSION"] = DATASETS_REVISION
+
+# for tests - to be removed
+ROWS_MAX_BYTES = get_int_value(d=os.environ, key="ROWS_MAX_BYTES", default=DEFAULT_ROWS_MAX_BYTES)
+ROWS_MAX_NUMBER = get_int_value(d=os.environ, key="ROWS_MAX_NUMBER", default=DEFAULT_ROWS_MAX_NUMBER)
+ROWS_MIN_NUMBER = get_int_value(d=os.environ, key="ROWS_MIN_NUMBER", default=DEFAULT_ROWS_MIN_NUMBER)
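get_int_value (and get_str_value) are the repository's own config helpers; their implementation is not part of this diff. From the call sites above, a plausible behavior, stated here only as an assumption, is:

# Assumed behavior, inferred from the call sites above; the real helper in the
# repository's utils module may differ (e.g., in how it handles invalid values).
from typing import Mapping


def get_int_value(d: Mapping[str, str], key: str, default: int) -> int:
    try:
        return int(d[key])
    except (KeyError, ValueError):
        return default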
6 changes: 3 additions & 3 deletions src/datasets_preview_backend/constants.py
@@ -11,16 +11,16 @@
 DEFAULT_MONGO_CACHE_DATABASE: str = "datasets_preview_cache"
 DEFAULT_MONGO_QUEUE_DATABASE: str = "datasets_preview_queue"
 DEFAULT_MONGO_URL: str = "mongodb://localhost:27018"
-DEFAULT_ROWS_MAX_BYTES: int = 1_000_000
-DEFAULT_ROWS_MAX_NUMBER: int = 100
-DEFAULT_ROWS_MIN_NUMBER: int = 10
 DEFAULT_WEB_CONCURRENCY: int = 2
 
 DEFAULT_HF_TOKEN: Optional[str] = None
 DEFAULT_MAX_JOBS_PER_DATASET: int = 2
 DEFAULT_MAX_LOAD_PCT: int = 50
 DEFAULT_MAX_MEMORY_PCT: int = 60
 DEFAULT_MAX_SIZE_FALLBACK: int = 100_000_000
+DEFAULT_ROWS_MAX_BYTES: int = 1_000_000
+DEFAULT_ROWS_MAX_NUMBER: int = 100
+DEFAULT_ROWS_MIN_NUMBER: int = 10
 DEFAULT_WORKER_SLEEP_SECONDS: int = 5
 DEFAULT_WORKER_QUEUE: str = "datasets"