ihmeuw · zmbc · Apr 25, 2024 · Apr 22, 2024 · Apr 24, 2024 · Apr 25, 2024
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -199,6 +199,7 @@
 intersphinx_mapping = {
     "python": ("https://docs.python.org/3.8", None),
     "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),
+    "dask": ("https://docs.dask.org/en/stable/", None),
     "tables": ("https://www.pytables.org/", None),
     "numpy": ("https://numpy.org/doc/stable/", None),
     "networkx": ("https://networkx.org/documentation/stable/", None),

diff --git a/docs/source/simulated_populations/index.rst b/docs/source/simulated_populations/index.rst
@@ -137,3 +137,11 @@ or United States), unzip the contents to the desired location on your computer.
 Once you've unzipped the simulated population data, you can pass the directory
 path to the :code:`source` parameter of the :ref:`dataset generation functions
 <dataset_generation_functions>` to generate large-scale datasets!
+
+If you're using one of the larger populations, you'll also want to take a look at the
+:code:`engine` parameter.
+By default, pseudopeople generates datasets using pandas, which does not fully parallelize
+across cores and requires the entire dataset to fit into RAM.
+However, by passing :code:`"dask"` to the :code:`engine` parameter, you can run the dataset
+generation on a Dask cluster, which can spill data to disk and even distribute
+the computation across multiple computers!
diff --git a/setup.py b/setup.py
@@ -51,10 +51,12 @@
         "jupyter",
     ]
 
+    dask_requirements = ["dask"]
+
     test_requirements = [
         "pytest",
         "pytest-mock",
-    ]
+    ] + dask_requirements
 
     lint_requirements = [
         "black==22.3.0",
@@ -109,6 +111,7 @@
             + test_requirements
             + interactive_requirements
             + lint_requirements,
+            "dask": dask_requirements,
         },
         # entry_points="""
         #         [console_scripts]