ucbepic · shreyashankar · Oct 12, 2024 · Oct 10, 2024 · Oct 11, 2024
diff --git a/docetl/operations/sample.py b/docetl/operations/sample.py
@@ -0,0 +1,53 @@
+import sklearn.model_selection
+from typing import Any, Dict, List, Optional, Tuple
+from .base import BaseOperation
+
+
+class SampleOperation(BaseOperation):
+    def __init__(
+        self,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+
+    def syntax_check(self) -> None:
+        """
+        Checks the configuration of the SampleOperation for required keys and valid structure.
+
+        Raises:
+            ValueError: If required keys are missing or invalid in the configuration.
+            TypeError: If configuration values have incorrect types.
+        """
+        pass
+
+    def execute(
+        self, input_data: List[Dict], is_build: bool = False
+    ) -> Tuple[List[Dict], float]:
+        """
+        Executes the sample operation on the input data.
+
+        Args:
+            input_data (List[Dict]): A list of dictionaries to process.
+            is_build (bool): Whether the operation is being executed
+              in the build phase. Defaults to False.
+
+        Returns:
+            Tuple[List[Dict], float]: A tuple containing the filtered
+              list of dictionaries and the total cost of the operation.
+        """
+
+        samples = self.config["samples"]
+        if isinstance(samples, list):
+            output_data = [input_data[sample]
+                           for sample in samples]
+        else:
+            stratify=None
+            if "stratify" in self.config:
+                stratify = [data[self.config["stratify"]] for data in input_data]
+            output_data, dummy = sklearn.model_selection.train_test_split(
+                input_data,
+                train_size = samples,
+                random_state = self.config.get("random_state", None),
+                stratify = stratify)
+        return output_data, 0
diff --git a/docs/operators/sample.md b/docs/operators/sample.md
@@ -0,0 +1,43 @@
+# Sample operation
+
+The Sample operation in DocETL samples items from the input. It is
+meant mostly as a debugging tool:
+
+Insert it before the last operation, the one you're currently trying
+to tack on to the end of a working pipeline, to limit the amount of
+data it will be fed, so that the run time is small enough to
+comfortably debug its prompt. Once it seems to be working, you can
+remove the sample operation. You can then repeat this for each
+operation you add while developing your pipeline!
+
+
+
+## 🚀 Example
+
+```yaml
+- name: cluster_concepts
+  type: sample
+  samples: 0.1
+  random_state: 42
+  stratify: category
+```
+
+This sample operation will return a pseudo-randomly selected 10% of
+the input items (`samples: 0.1`). The random selection will be seeded
+with a constant (42), meaning the same selection will be returned if
+you rerun the pipeline. (If no random state is given, a different
+sample will be returned every time.) Additionally, the sampling is
+stratified by the `category` key, so each `category` value appears in
+the sample in roughly the same proportion as in the input.
+
+## Required Parameters
+
+- `name`: A unique name for the operation.
+- `type`: Must be set to "sample".
+- `samples`: Either a list of sample indices to just return those samples, an integer count of samples, or a float fraction of samples.
+
+## Optional Parameters
+
+| Parameter      | Description                                  | Default                             |
+| -------------- | -------------------------------------------- | ----------------------------------- |
+| `random_state` | An integer to seed the random generator with | Use the (numpy) global random state |
+| `stratify`     | The key to stratify by                       | None (no stratification)            |
diff --git a/pyproject.toml b/pyproject.toml
@@ -88,6 +88,7 @@ reduce = "docetl.operations.reduce:ReduceOperation"
 resolve = "docetl.operations.resolve:ResolveOperation"
 gather = "docetl.operations.gather:GatherOperation"
 cluster = "docetl.operations.cluster:ClusterOperation"
+sample = "docetl.operations.sample:SampleOperation"
 
 [tool.poetry.plugins."docetl.parser"]
 llama_index_simple_directory_reader = "docetl.parsing_tools:llama_index_simple_directory_reader"