Adds strict filter mechanism to Mixer (#231)
* WIP: adds unshuffled/unmerged mixing operations

* Example config with shuffle false

* Adds comment for sanity

* Try new gh upload-artifact with overwrite

* More ci fixes

* Should panic if no suffix
undfined authored Feb 13, 2025
1 parent d42fa1d commit 625ac1c
Showing 7 changed files with 113 additions and 20 deletions.
4 changes: 2 additions & 2 deletions Cargo.lock

Generated lockfile; diff not rendered.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "dolma"
-version = "1.1.0"
+version = "1.1.1"
 edition = "2021"
 license = "Apache-2.0"

24 changes: 24 additions & 0 deletions configs/test/test_filtered_mixer.yaml
@@ -0,0 +1,24 @@
+streams:
+  - name: filtered_object_test
+    documents:
+      - s3://ai2-oe-data/pretraining-data/sources/refine/v0/documents/0001/0000_dclm_shard_00000*.jsonl.zstd
+    attributes:
+      - random_number_v1
+      - fineweb-edu-classifier
+      - fineweb-edu-classifier-original
+    compression:
+      input: zst
+      output: zst
+    output:
+      path: s3://ai2-oe-data/tylerm/test/mixer/filtered
+    filter:
+      include:
+        - '((.attributes."HuggingFaceFW_fineweb-edu-classifier_score" != null and .attributes."HuggingFaceFW_fineweb-edu-classifier_score_original" != null) and ((${oc.env:ALPHA} * (.attributes.random_number_v1__random_number_v1__random[0][-1] * 2 - 1)) + (.attributes."HuggingFaceFW_fineweb-edu-classifier_score"[0][-1]) - .attributes."HuggingFaceFW_fineweb-edu-classifier_score_original"[0][-1]) > 0.30)'
+      syntax: jq
+
+work_dir:
+  input: "/tmp/dolma/input"
+  output: "/tmp/dolma/output"
+
+shuffle: false
+processes: 10
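The include rule keeps a document only when both fineweb-edu classifier scores are present and a randomly perturbed score delta clears a 0.30 threshold; ALPHA is injected from the environment via `${oc.env:ALPHA}`, and the `[0][-1]` indexing selects the score value from each attribute span. A minimal Rust sketch of the arithmetic the jq expression performs, with hypothetical values (not part of the dolma codebase):

```rust
// Sketch of the jq include rule: keep a document when
//     alpha * (2r - 1) + score - score_original > 0.30
// where r in [0, 1) comes from the random_number_v1 attribute.
fn keep_document(alpha: f64, r: f64, score: f64, score_original: f64) -> bool {
    (alpha * (r * 2.0 - 1.0)) + score - score_original > 0.30
}

fn main() {
    // With ALPHA = 0.1 the random term shifts the delta by at most ±0.1, so a
    // score improvement of 0.5 clears the threshold even in the worst case...
    assert!(keep_document(0.1, 0.0, 3.2, 2.7));
    // ...while a document whose score did not improve is always dropped.
    assert!(!keep_document(0.1, 1.0, 2.0, 2.0));
}
```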
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dolma"
-version = "1.1.0"
+version = "1.1.1"
 description = "Data filters"
 license = { text = "Apache-2.0" }
 readme = "README.md"
2 changes: 2 additions & 0 deletions python/dolma/cli/mixer.py
@@ -73,6 +73,7 @@ class MixerConfig:
     streams: List[StreamConfig] = field(default=[], help="List configurations of streams to be mixed")
     work_dir: WorkDirConfig = field(default=WorkDirConfig(), help="Configuration for temporary work directories.")
     processes: int = field(default=1, help="Number of processes to use for mixing. By default 1 process is used.")
+    shuffle: bool = field(default=True, help="Whether to shard and shuffle the documents during mixing.")
     dryrun: bool = field(
         default=False,
         help="If true, only print the configuration and exit without running the mixer.",
@@ -92,6 +93,7 @@ def run(cls, parsed_config: MixerConfig):
"work_dir": {"input": str(work_dirs.input), "output": str(work_dirs.output)},
"processes": int(parsed_config.processes),
"streams": [],
"shuffle": bool(parsed_config.shuffle),
}

for stream_config in parsed_config.streams:
15 changes: 13 additions & 2 deletions src/mixer.rs
@@ -9,10 +9,14 @@ use crate::shard::Shard;
 use mixer_config::*;

 pub fn run(config: MixerConfig) -> Result<u32, u32> {
-    let shards = Shard::split_streams(&config.streams).unwrap();
-
+    let shards = if config.shuffle {
+        Shard::split_streams(&config.streams).unwrap()
+    } else {
+        Shard::split_streams_unshuffled(&config.streams).unwrap()
+    };
     let threadpool = ThreadPool::new(config.processes);
     let failed_shard_count_ref = Arc::new(AtomicU32::new(0));

     for shard in shards {
         let output_path = Path::new(&config.work_dir.output.clone()).join(&shard.output);
         if output_path.exists() {
@@ -50,11 +54,18 @@ pub mod mixer_config {

     use crate::shard::shard_config::{StreamConfig, WorkDirConfig};

+    fn shuffle_default() -> bool {
+        true
+    }
+
     #[derive(Serialize, Deserialize, Clone)]
     pub struct MixerConfig {
         pub streams: Vec<StreamConfig>,
         pub processes: usize,
         pub work_dir: WorkDirConfig,
+        // Includes default for backwards compatibility
+        #[serde(default = "shuffle_default")]
+        pub shuffle: bool,
     }

     impl MixerConfig {
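Because `shuffle` is a new field, the `#[serde(default = "shuffle_default")]` attribute above keeps pre-1.1.1 configs parsing unchanged. A self-contained sketch of that behavior using a simplified stand-in struct (not the real MixerConfig, which also carries streams and work_dir):

```rust
use serde::Deserialize;

fn shuffle_default() -> bool {
    true
}

// Simplified stand-in for MixerConfig, just enough to show the default.
#[derive(Deserialize, Debug)]
struct Sketch {
    processes: usize,
    #[serde(default = "shuffle_default")]
    shuffle: bool,
}

fn main() {
    // An older config with no `shuffle` key still parses; the field defaults to true.
    let old: Sketch = serde_yaml::from_str("processes: 10").unwrap();
    assert!(old.shuffle);

    // Newer configs can opt out of sharding/shuffling explicitly.
    let new: Sketch = serde_yaml::from_str("processes: 10\nshuffle: false").unwrap();
    assert!(!new.shuffle);
}
```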
84 changes: 70 additions & 14 deletions src/shard.rs
@@ -63,21 +63,11 @@ impl Shard {
             })
             .collect::<Vec<(DocumentPaths, usize)>>();
         let mut shard_size = inputs_with_sizes[0].1;
+        // Start with the first input and add it to the vector
         let mut shard_inputs: Vec<DocumentPaths> = vec![inputs_with_sizes[0].0.clone()];
-        let output_ext = match stream_config
-            .compression
-            .clone()
-            .unwrap_or(CompressionConfig::infer())
-            .output
-        {
-            // empty string means no compression
-            Some(ext) if ext.is_empty() => "".to_string(),
-            // if there is an extension, add a dot
-            Some(ext) => format!(".{}", ext),
-            // default to .gz
-            None => ".gz".to_string(),
-        };
+        let output_ext = Shard::get_output_extension(stream_config.clone());

+        // We slice from the second position since we already added the first input above
         for (input, size) in inputs_with_sizes[1..].iter() {
             if *size == 0 {
                 log::warn!(
@@ -89,7 +79,7 @@
             shard_size += size;
             if shard_size > stream_config.output.max_size_in_bytes {
                 let output = format!(
-                    "{}/{}-{:04}.json{}",
+                    "{}/{}-{:04}.jsonl{}",
                     stream_config.output.path,
                     stream_config.name,
                     stream_shard_count,
@@ -139,6 +129,56 @@
         Ok(shards)
     }

+    pub fn split_streams_unshuffled(streams: &Vec<StreamConfig>) -> Result<Vec<Shard>, IoError> {
+        // Partitions the input files of a stream into a vector of shards each consisting of a single object
+        // and maintaining the original file structure and naming below */documents/. Useful for "filter"
+        // only operations where the resulting dataset is a strict subset of the original and is intended
+        // to be unshuffled and unsharded.
+        let mut shards: Vec<Shard> = Vec::new();
+        for stream_config in streams {
+            let stream_inputs = find_objects_matching_patterns(&stream_config.documents)?;
+            let input_count = stream_inputs.len();
+            let inputs = stream_inputs.into_iter().map(|input| {
+                let mut attr_paths = Vec::new();
+                for prefix in stream_config.attributes.iter() {
+                    let attr_prefix = format!("/attributes/{}/", prefix);
+                    let attr_path = input.replace("/documents/", &attr_prefix);
+                    attr_paths.push(attr_path);
+                }
+                DocumentPaths {
+                    doc_path: input,
+                    attribute_paths: attr_paths,
+                }
+            });
+
+            for input in inputs {
+                let doc_path_clone = input.doc_path.clone();
+                let output_suffix = doc_path_clone.split("/documents/").last().unwrap();
+                let output = format!(
+                    "{}/documents/{}",
+                    stream_config.output.path.clone(),
+                    output_suffix
+                );
+                log::info!("Creating shard for {}", output);
+                let shard: Shard = Shard {
+                    inputs: vec![input.clone()],
+                    output,
+                    filter: stream_config.filter.clone(),
+                    span_replacements: stream_config.span_replacement.clone(),
+                    discard_fields: stream_config.output.discard_fields.clone(),
+                    min_text_length: stream_config.output.min_text_length.clone(),
+                    compression: stream_config.compression.clone(),
+                };
+                shards.push(shard);
+            }
+            log::info!(
+                "Created {} shards of file count 1 for {}",
+                input_count,
+                stream_config.name,
+            );
+        }
+        Ok(shards)
+    }
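The attribute lookup in `split_streams_unshuffled` relies on dolma's path convention: each attribute set mirrors the document tree, with `/documents/` swapped for `/attributes/<name>/`. A standalone sketch of that mapping, with a hypothetical bucket and file name:

```rust
// Sketch of the path convention used above: attribute files sit at the same
// relative path as their documents, under /attributes/<name>/ instead.
fn attribute_path(doc_path: &str, attr_name: &str) -> String {
    doc_path.replace("/documents/", &format!("/attributes/{}/", attr_name))
}

fn main() {
    let doc = "s3://example-bucket/corpus/v0/documents/0001/shard_00000.jsonl.zst";
    assert_eq!(
        attribute_path(doc, "random_number_v1"),
        "s3://example-bucket/corpus/v0/attributes/random_number_v1/0001/shard_00000.jsonl.zst"
    );
}
```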

     // Process a shard:
     // Read all input files sequentially,
     // Merge attributes
@@ -465,6 +505,22 @@ impl Shard {
         cache.finalize_output(&self.output)?;
         Ok(())
     }
+
+    fn get_output_extension(stream_config: StreamConfig) -> String {
+        match stream_config
+            .compression
+            .clone()
+            .unwrap_or(CompressionConfig::infer())
+            .output
+        {
+            // empty string means no compression
+            Some(ext) if ext.is_empty() => "".to_string(),
+            // if there is an extension, add a dot
+            Some(ext) => format!(".{}", ext),
+            // default to .gz
+            None => ".gz".to_string(),
+        }
+    }
 }
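Taken together with the `.json` → `.jsonl` fix earlier in this file, this helper determines the full shard name produced by `split_streams` in shuffled mode: `"{path}/{name}-{:04}.jsonl{ext}"`, where the extension is `.gz` when compression is unset, empty when the configured extension is the empty string, and `.{ext}` otherwise. A quick sketch with hypothetical values:

```rust
// Sketch (not dolma code): how the output extension combines with the shard
// naming pattern used by split_streams.
fn shard_name(path: &str, name: &str, n: usize, ext: &str) -> String {
    format!("{}/{}-{:04}.jsonl{}", path, name, n, ext)
}

fn main() {
    // compression.output = "zst" => ".zst"; "" => no suffix; unset => ".gz".
    assert_eq!(
        shard_name("s3://example-bucket/out", "mystream", 7, ".zst"),
        "s3://example-bucket/out/mystream-0007.jsonl.zst"
    );
    assert_eq!(
        shard_name("s3://example-bucket/out", "mystream", 7, ".gz"),
        "s3://example-bucket/out/mystream-0007.jsonl.gz"
    );
}
```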

pub mod shard_config {
