From 97afe6fc30e12fee3a8e33a104299209495292c8 Mon Sep 17 00:00:00 2001
From: Sebastian Walter <sebastian.walter98@gmail.com>
Date: Thu, 11 Apr 2024 12:39:15 +0200
Subject: [PATCH] various updates throughout the package

---
 python/text_utils/api/cli.py                  |  72 +-
 python/text_utils/api/processor.py            |  21 +-
 .../cli/create_continuation_index.py          |   4 +-
 python/text_utils/constraints.py              |  85 ++
 python/text_utils/inference/utils.py          |  68 ++
 src/continuations.rs                          |  31 +-
 src/data/loading.rs                           |  77 +-
 src/data/mod.rs                               |  23 +-
 src/data/preprocessing.rs                     |  71 +-
 src/grammar.rs                                |  12 -
 src/tokenization.rs                           |  66 +-
 text-utils-grammar/src/lr1.rs                 |   2 +-
 text-utils-grammar/src/utils.rs               |   8 +-
 text-utils-prefix/Cargo.lock                  | 979 ++++++++++++++++++
 text-utils-prefix/src/art.rs                  |  76 +-
 text-utils-prefix/src/lib.rs                  |  19 +-
 text-utils-prefix/src/patricia.rs             |   2 +-
 text-utils-prefix/src/vec.rs                  |  12 +-
 18 files changed, 1301 insertions(+), 327 deletions(-)
 create mode 100644 python/text_utils/constraints.py
 create mode 100644 text-utils-prefix/Cargo.lock

diff --git a/python/text_utils/api/cli.py b/python/text_utils/api/cli.py
index 4ab5817..72fed63 100644
--- a/python/text_utils/api/cli.py
+++ b/python/text_utils/api/cli.py
@@ -4,7 +4,7 @@
 import time
 import logging
 import warnings
-from typing import Iterator, Iterable, Union, Optional, Type
+from typing import Iterator, Iterable, Union, Type
 try:
     import readline  # noqa
 except ImportError:
@@ -33,6 +33,7 @@ def parser(
     ) -> argparse.ArgumentParser:
         parser = argparse.ArgumentParser(name, description)
         model_group = parser.add_mutually_exclusive_group()
+        default_model = cls.text_processor_cls.default_model()
         model_group.add_argument(
             "-m",
             "--model",
@@ -40,7 +41,7 @@ def parser(
                 model.name for model in
                 cls.text_processor_cls.available_models()
             ],
-            default=cls.text_processor_cls.default_model().name,
+            default=None if default_model is None else default_model.name,
             help=f"Name of the model to use for {cls.text_processor_cls.task}"
         )
         model_group.add_argument(
@@ -185,8 +186,8 @@ def __init__(self, args: argparse.Namespace):
     def version(self) -> str:
         raise NotImplementedError
 
-    def format_output(self, item: data.InferenceData) -> Iterable[str]:
-        return [item.text]
+    def format_output(self, output: str) -> Iterable[str]:
+        return [output]
 
     def _run_with_profiling(self, file: str) -> None:
         import cProfile
@@ -194,17 +195,16 @@ def _run_with_profiling(self, file: str) -> None:
 
     def process_iter(
         self,
-        text_processor: TextProcessor,
-        iter: Iterator[data.InferenceData]
-    ) -> Iterator[data.InferenceData]:
+        processor: TextProcessor,
+        iter: Iterator[str]
+    ) -> Iterator[str]:
         raise NotImplementedError
 
     def process_file(
         self,
-        text_processor: TextProcessor,
-        path: str,
-        lang: Optional[str],
-        out_file: Union[str, TextIOWrapper]
+        processor: TextProcessor,
+        input_file: str,
+        output_file: str | TextIOWrapper
     ) -> None:
         raise NotImplementedError
 
@@ -277,8 +277,7 @@ def run(self) -> None:
         start = time.perf_counter()
         if self.args.process is not None:
             self.args.progress = False
-            ipt = data.InferenceData(self.args.process)
-            opt = next(self.process_iter(self.cor, iter([ipt])))
+            opt = next(self.process_iter(self.cor, iter([self.args.process])))
             for line in self.format_output(opt):
                 print(line)
 
@@ -290,7 +289,7 @@ def run(self) -> None:
                 assert isinstance(self.args.out_path, str)
                 out = self.args.out_path
 
-            self.process_file(self.cor, self.args.file, self.args.lang, out)
+            self.process_file(self.cor, self.args.file, out)
 
             if self.args.report:
                 for d in self.cor.devices:
@@ -311,7 +310,7 @@ def run(self) -> None:
                     not self.args.unsorted,
                     self.cor.devices,
                     next(self.cor.model.parameters()).dtype,
-                    batch_max_tokens=self.args.batch_max_tokens,
+                    self.args.batch_max_tokens,
                 )
                 print(report)
 
@@ -328,34 +327,19 @@ def run(self) -> None:
                 return
 
             try:
-                if self.args.unsorted:
-                    # correct lines from stdin as they come
-                    input_it = (
-                        data.InferenceData(line.rstrip("\r\n"))
-                        for line in sys.stdin
-                    )
-                    sized_it = ProgressIterator(
-                        input_it,
-                        self.inference_data_size
-                    )
-                    outputs = self.process_iter(self.cor, sized_it)
-                    for opt in outputs:
-                        for line in self.format_output(opt):
-                            print(line)
-                else:
-                    # read stdin completely, then potentially sort and correct
-                    inputs = [
-                        data.InferenceData(line.rstrip("\r\n"))
-                        for line in sys.stdin
-                    ]
-                    sized_it = ProgressIterator(
-                        iter(inputs),
-                        self.inference_data_size
-                    )
-                    outputs = self.process_iter(self.cor, sized_it)
-                    for opt in outputs:
-                        for line in self.format_output(opt):
-                            print(line)
+                # correct lines from stdin as they come
+                input_it = (
+                    line.rstrip("\r\n")
+                    for line in sys.stdin
+                )
+                sized_it = ProgressIterator(
+                    input_it,
+                    self.inference_data_size
+                )
+                outputs = self.process_iter(self.cor, sized_it)
+                for opt in outputs:
+                    for line in self.format_output(opt):
+                        print(line)
 
                 if self.args.report:
                     for d in self.cor.devices:
@@ -373,7 +357,7 @@ def run(self) -> None:
                         not self.args.unsorted,
                         self.cor.devices,
                         next(self.cor.model.parameters()).dtype,
-                        batch_max_tokens=self.args.batch_max_tokens,
+                        self.args.batch_max_tokens,
                     )
                     print(report)
 
diff --git a/python/text_utils/api/processor.py b/python/text_utils/api/processor.py
index 78af78a..8657db6 100644
--- a/python/text_utils/api/processor.py
+++ b/python/text_utils/api/processor.py
@@ -7,7 +7,7 @@
 
 from tqdm import tqdm
 import torch
-from torch import autocast, nn
+from torch import nn
 from torch.backends import cudnn, cuda
 
 from text_utils import (
@@ -40,8 +40,10 @@ def available_models(cls) -> List[ModelInfo]:
         raise NotImplementedError
 
     @classmethod
-    def default_model(cls) -> ModelInfo:
+    def default_model(cls) -> ModelInfo | None:
         available_models = cls.available_models()
+        if len(available_models) == 0:
+            return None
         for info in available_models:
             if "default" in info.tags:
                 return info
@@ -85,7 +87,10 @@ def from_pretrained(
         force_download: bool = False
     ):
         if model is None:
-            model = cls.default_model().name
+            default = cls.default_model()
+            assert default is not None, "no default model available"
+            model = default.name
+
         assert model is not None
         assert any(model == m.name for m in cls.available_models()), \
             f"model {model} does not match any of the available models:\n" \
@@ -195,7 +200,7 @@ def _process_results(
 
     def _get_loader(
         self,
-        inputs: Union[Tuple[List[str], Optional[List[str]]], Iterator[data.InferenceData]],
+        inputs: list[str] | Iterator[data.InferenceData],
         batch_size: int = 16,
         batch_max_tokens: Optional[int] = None,
         sort: bool = True,
@@ -229,7 +234,7 @@ def _get_loader(
             "sort": sort
         })
         self._inference_loader_cfg.update(kwargs)
-        if isinstance(inputs, tuple):
+        if isinstance(inputs, list):
             files, languages = inputs
             loader = data.InferenceLoader.from_files(
                 files=files,
@@ -245,8 +250,8 @@ def _get_loader(
             )
         else:
             raise ValueError(
-                f"unknown input type {type(inputs)}, must either be a tuple of "
-                f"files and languages or an iterator over sequence language pairs"
+                f"unknown input type {type(inputs)}, must either be a list of "
+                f"files and an iterator over strings"
             )
 
         return loader
@@ -274,7 +279,7 @@ def _process_sorted(
         progress_total: int,
         progress_unit: str = "seq",
         show_progress: bool = False,
-    ) -> List[data.InferenceData]:
+    ) -> list[data.InferenceData]:
         results = {}
         pbar = self._pbar(
             progress_desc,
diff --git a/python/text_utils/cli/create_continuation_index.py b/python/text_utils/cli/create_continuation_index.py
index 4efd0dc..ce29e49 100644
--- a/python/text_utils/cli/create_continuation_index.py
+++ b/python/text_utils/cli/create_continuation_index.py
@@ -19,7 +19,7 @@ def create(args: argparse.Namespace):
         os.makedirs(dir, exist_ok=True)
 
     start = time.perf_counter()
-    continuations.Continuations.build_from_file(
+    continuations.ContinuationIndex.build_from_file(
         args.input_file,
         args.output_file
     )
@@ -29,7 +29,7 @@ def create(args: argparse.Namespace):
     start = time.perf_counter()
     # empty continuations for testing
     conts = []
-    continuations.Continuations.load_with_continuations(
+    continuations.ContinuationIndex.load_with_continuations(
         args.output_file,
         conts
     )
diff --git a/python/text_utils/constraints.py b/python/text_utils/constraints.py
new file mode 100644
index 0000000..68d4870
--- /dev/null
+++ b/python/text_utils/constraints.py
@@ -0,0 +1,85 @@
+import copy
+
+from text_utils._internal import grammar
+from text_utils._internal import continuations
+
+
+# re-export grammar constraints
+RegexConstraint = grammar.RegexConstraint
+LR1Constraint = grammar.LR1Constraint
+
+
+class Constraint:
+    """
+    Base class for constraints.
+    """
+
+    def get(self) -> tuple[list[int], bool]:
+        """
+        Returns the current constraint indices and whether we
+        are in a state that matches the constraint.
+        """
+        raise NotImplementedError
+
+    def reset(self, input: bytes | None = None) -> None:
+        """
+        Resets the constraint, optionally starting from the given input.
+        """
+        raise NotImplementedError
+
+    def next(self, index: int) -> None:
+        """
+        Updates the constraint based on the chosen index / token id.
+        """
+        raise NotImplementedError
+
+    def is_match(self) -> bool:
+        """
+        Returns whether the current state matches the constraint.
+        """
+        raise NotImplementedError
+
+    def clone(self) -> 'Constraint':
+        """
+        Returns a copy of the constraint.
+        """
+        raise NotImplementedError
+
+
+class ContinuationConstraint(Constraint):
+    """
+    Constraint for only allowing certain continuations for
+    a given prefix.
+    """
+
+    def __init__(
+        self,
+        cont_index: continuations.ContinuationIndex,
+        prefix: bytes | None = None
+    ):
+        self.prefix = prefix or bytes()
+        self.value = cont_index.get_value(self.prefix)
+        self.cont_index = cont_index
+
+    def get(self) -> tuple[list[int], bool]:
+        indices, self.value = self.cont_index.get(self.prefix)
+        return indices, self.is_match()
+
+    def reset(self, input: bytes | None = None) -> None:
+        self.prefix = input or bytes()
+        # keep the cached value in sync with the new prefix
+        self.value = self.cont_index.get_value(self.prefix)
+
+    def next(self, index: int) -> None:
+        self.prefix += self.cont_index.get_continuation(index)
+        # keep the cached value in sync with the extended prefix
+        self.value = self.cont_index.get_value(self.prefix)
+
+    def is_match(self) -> bool:
+        return self.value is not None
+
+    def clone(self) -> 'ContinuationConstraint':
+        return ContinuationConstraint(self.cont_index, self.prefix)
+
+    def get_value(self) -> str | None:
+        return self.value
diff --git a/python/text_utils/inference/utils.py b/python/text_utils/inference/utils.py
index 162cf1f..2da4e7a 100644
--- a/python/text_utils/inference/utils.py
+++ b/python/text_utils/inference/utils.py
@@ -1,6 +1,7 @@
 from typing import Callable, Any
 import torch
 
+from text_utils.constraints import Constraint
 
 # maps from token ids, length, and other kwargs to distribution over next token id and other info
 DecodeFn = Callable[..., tuple[torch.Tensor, dict[str, Any]]]
@@ -136,6 +137,73 @@ def __repr__(self) -> str:
 ]
 
 
+def constraint_logit_fn(
+    retrieve_constraint_fn: Callable[[int | Beam], Constraint | None],
+    eos_token_id: int
+) -> LogitFn:
+    """
+    Returns a logit fn that sets all logits not allowed by the constraint
+    of the respective beam or index to -inf. Logits of beams or indices
+    without a constraint are kept unchanged.
+    """
+
+    def _constrain_logits(
+        logits: torch.Tensor,
+        beams_or_indices: list[int] | list[Beam]
+    ) -> torch.Tensor:
+        constrained = torch.full_like(logits, float("-inf"))
+
+        batch_indices = []
+        constrain_indices = []
+        for i, beam_or_idx in enumerate(beams_or_indices):
+            constraint = retrieve_constraint_fn(beam_or_idx)
+            if constraint is None:
+                constrained[i] = logits[i]
+                continue
+
+            constrain_to, is_match = constraint.get()
+            batch_indices.extend([i] * len(constrain_to))
+            constrain_indices.extend(constrain_to)
+
+            # allow eos if the constraint is matched or has no continuations
+            if len(constrain_to) == 0 or is_match:
+                batch_indices.append(i)
+                constrain_indices.append(eos_token_id)
+
+        # explicit dtype, as empty lists would yield float tensors
+        kwargs = {"dtype": torch.long, "device": logits.device}
+        rows = torch.tensor(batch_indices, **kwargs)
+        cols = torch.tensor(constrain_indices, **kwargs)
+        constrained[rows, cols] = logits[rows, cols]
+
+        return constrained
+
+    return _constrain_logits
+
+
+def constraint_sample_fn(
+    retrieve_constraint_fn: Callable[[int], Constraint | None],
+    sample_fn: SampleFn,
+    eos_token_id: int
+) -> SampleFn:
+    """Wraps sample_fn and advances constraints by sampled non-eos tokens."""
+
+    def _constrain_sample(
+        logits: torch.Tensor,
+        indices: list[int]
+    ) -> torch.Tensor:
+        sampled = sample_fn(logits, indices)
+        for item_idx, sampled_id in zip(indices, sampled.tolist()):
+            if sampled_id != eos_token_id:
+                constraint = retrieve_constraint_fn(item_idx)
+                if constraint is not None:
+                    constraint.next(sampled_id)
+
+        return sampled
+
+    return _constrain_sample
+
+
 def default_beam_candidate_fn() -> BeamCandidateFn:
     def _default_beam_candidate_fn(
         beam: Beam,
diff --git a/src/continuations.rs b/src/continuations.rs
index b57df90..e7c6ac8 100644
--- a/src/continuations.rs
+++ b/src/continuations.rs
@@ -9,18 +9,18 @@ use pyo3::prelude::*;
 use text_utils_prefix::{AdaptiveRadixTrie, ContinuationSearch, ContinuationTrie, PrefixSearch};
 
 #[pyclass]
-pub struct Continuations {
-    continuations: ContinuationTrie<AdaptiveRadixTrie<String>>,
+pub struct ContinuationIndex {
+    cont_trie: ContinuationTrie<AdaptiveRadixTrie<String>>,
 }
 
 pub type ContinuationIndices = (Vec<usize>, Vec<usize>);
 #[pymethods]
-impl Continuations {
+impl ContinuationIndex {
     #[staticmethod]
     fn load_with_continuations(file: &str, continuations: Vec<Vec<u8>>) -> anyhow::Result<Self> {
         let trie = AdaptiveRadixTrie::load(file)?;
         Ok(Self {
-            continuations: ContinuationTrie::new(trie, continuations),
+            cont_trie: ContinuationTrie::new(trie, continuations),
         })
     }
 
@@ -46,14 +46,21 @@ impl Continuations {
         Ok(())
     }
 
-    fn get(&self, key: &[u8]) -> Option<String> {
-        self.continuations.get(key).cloned()
+    fn get_value(&self, key: &[u8]) -> Option<String> {
+        self.cont_trie.get(key).cloned()
     }
 
-    fn continuation_indices(&self, prefix: &[u8]) -> (Vec<usize>, Option<String>) {
+    fn get_continuation(&self, index: usize) -> Option<&[u8]> {
+        self.cont_trie
+            .continuations
+            .get(index)
+            .map(|c| c.as_slice())
+    }
+
+    fn get(&self, prefix: &[u8]) -> (Vec<usize>, Option<String>) {
         (
-            self.continuations.contains_continuations(prefix),
-            self.continuations.get(prefix).cloned(),
+            self.cont_trie.contains_continuations(prefix),
+            self.cont_trie.get(prefix).cloned(),
         )
     }
 
@@ -62,7 +69,7 @@ impl Continuations {
         prefixes: Vec<Vec<u8>>,
     ) -> (ContinuationIndices, Vec<Option<String>>) {
         (
-            self.continuations
+            self.cont_trie
                 .batch_contains_continuations(&prefixes)
                 .into_iter()
                 .enumerate()
@@ -78,7 +85,7 @@ impl Continuations {
                 ),
             prefixes
                 .iter()
-                .map(|prefix| self.continuations.get(prefix).cloned())
+                .map(|prefix| self.cont_trie.get(prefix).cloned())
                 .collect(),
         )
     }
@@ -87,7 +94,7 @@ impl Continuations {
 /// A submodule containing python implementations of a continuation trie
 pub(super) fn add_submodule(py: Python, parent_module: &PyModule) -> PyResult<()> {
     let m = PyModule::new(py, "continuations")?;
-    m.add_class::<Continuations>()?;
+    m.add_class::<ContinuationIndex>()?;
     parent_module.add_submodule(m)?;
 
     Ok(())
diff --git a/src/data/loading.rs b/src/data/loading.rs
index f9278bd..64abded 100644
--- a/src/data/loading.rs
+++ b/src/data/loading.rs
@@ -122,7 +122,6 @@ where
 pub fn text_data_generator_from_files<P: AsRef<Path>>(
     input: P,
     target: Option<P>,
-    lang: Option<String>,
 ) -> anyhow::Result<Box<dyn DataGen<Item = anyhow::Result<TextData>>>> {
     let input_len = count_lines(input.as_ref())?;
     let input_iter = LossyUtf8Reader::new(BufReader::new(open(input.as_ref())?)).lines();
@@ -148,7 +147,7 @@ pub fn text_data_generator_from_files<P: AsRef<Path>>(
         } else {
             None
         };
-        Ok(TextData::new(input_s?, target_s, lang.clone()))
+        Ok(TextData::new(input_s?, target_s))
     });
     Ok(Box::new(DataGenerator {
         min_len: input_len,
@@ -168,7 +167,6 @@ pub fn inference_data_generator_from_file(
 pub fn text_data_generator_from_sequences(
     input: Vec<String>,
     target: Option<Vec<String>>,
-    language: Option<Vec<String>>,
 ) -> anyhow::Result<Box<dyn DataGen<Item = anyhow::Result<TextData>>>> {
     let len = input.len();
     let input_iter = input.into_iter();
@@ -182,26 +180,13 @@ pub fn text_data_generator_from_sequences(
     } else {
         None
     };
-    let mut lang_iter = if let Some(language) = language {
-        if language.len() != len {
-            return Err(anyhow!("expect a language for every sequence"));
-        }
-        Some(language.into_iter())
-    } else {
-        None
-    };
     let iter = input_iter.map(move |input_s| {
         let target_s = if let Some(target_iter_mut) = target_iter.as_mut() {
             target_iter_mut.next()
         } else {
             None
         };
-        let lang_s = if let Some(lang_iter_mut) = lang_iter.as_mut() {
-            lang_iter_mut.next()
-        } else {
-            None
-        };
-        Ok(TextData::new(input_s, target_s, lang_s))
+        Ok(TextData::new(input_s, target_s))
     });
     Ok(Box::new(DataGenerator { iter, min_len: len }))
 }
@@ -915,7 +900,7 @@ mod tests {
         let base = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
         let d = base.clone().join("resources/test/multi30k.txt");
         let d2 = base.clone().join("resources/test/multi30k_rev.txt");
-        let multi30k = text_data_generator_from_files(&d, None, Some("1".to_string()))?;
+        let multi30k = text_data_generator_from_files(&d, None)?;
         let mut it = TextIterator::new(
             vec![multi30k],
             super::TextIterationStrategy::Sequential,
@@ -927,17 +912,15 @@ mod tests {
         let _data = TextData {
             target: MULTI30K_FIRST.to_string(),
             input: MULTI30K_FIRST.to_string(),
-            language: Some("1".to_string()),
         };
         assert!(matches!(it.next().unwrap(), (Ok(_data), 0)));
         let _data = TextData {
             target: MULTI30K_SECOND.to_string(),
             input: MULTI30K_SECOND.to_string(),
-            language: Some("1".to_string()),
         };
         assert!(matches!(it.next().unwrap(), (Ok(_data), 0)));
         // check sequential lines with input and target
-        let multi30k = text_data_generator_from_files(&d, Some(&d2), Some("1".to_string()))?;
+        let multi30k = text_data_generator_from_files(&d, Some(&d2))?;
         let mut it = TextIterator::new(
             vec![multi30k],
             super::TextIterationStrategy::Sequential,
@@ -948,18 +931,16 @@ mod tests {
         let _data = TextData {
             target: MULTI30K_FIRST.to_string(),
             input: MULTI30K_REV_FIRST.to_string(),
-            language: Some("1".to_string()),
         };
         assert!(matches!(it.next().unwrap(), (Ok(_data), 0)));
         let _data = TextData {
             target: MULTI30K_SECOND.to_string(),
             input: MULTI30K_REV_SECOND.to_string(),
-            language: Some("1".to_string()),
         };
         assert!(matches!(it.next().unwrap(), (Ok(_data), 0)));
         // check interleaved lines with two files
-        let multi30k = text_data_generator_from_files(&d, None, Some("1".to_string()))?;
-        let multi30k_rev = text_data_generator_from_files(&d2, None, Some("2".to_string()))?;
+        let multi30k = text_data_generator_from_files(&d, None)?;
+        let multi30k_rev = text_data_generator_from_files(&d2, None)?;
         let mut it = TextIterator::new(
             vec![multi30k, multi30k_rev],
             super::TextIterationStrategy::Interleaved,
@@ -970,57 +951,32 @@ mod tests {
         let _data = TextData {
             target: MULTI30K_FIRST.to_string(),
             input: MULTI30K_FIRST.to_string(),
-            language: Some("1".to_string()),
         };
         assert!(matches!(it.next().unwrap(), (Ok(_data), 0)));
         let _data = TextData {
             target: MULTI30K_REV_FIRST.to_string(),
             input: MULTI30K_REV_FIRST.to_string(),
-            language: Some("2".to_string()),
         };
         assert!(matches!(it.next().unwrap(), (Ok(_data), 1)));
         let _data = TextData {
             target: MULTI30K_SECOND.to_string(),
             input: MULTI30K_SECOND.to_string(),
-            language: Some("1".to_string()),
         };
         assert!(matches!(it.next().unwrap(), (Ok(_data), 0)));
         let _data = TextData {
             target: MULTI30K_REV_SECOND.to_string(),
             input: MULTI30K_REV_SECOND.to_string(),
-            language: Some("2".to_string()),
         };
         assert!(matches!(it.next().unwrap(), (Ok(_data), 1)));
-        // check that they are indeed interleaved
-        let mut idx: usize = 4;
-        while let Some((data, _)) = it.next() {
-            assert_eq!(
-                &data.unwrap().language.unwrap(),
-                if idx % 2 == 0 { "1" } else { "2" }
-            );
-            idx += 1;
-        }
         // check weighted lines with two files
-        let multi30k = text_data_generator_from_files(&d, None, Some("1".to_string()))?;
-        let multi30k_rev = text_data_generator_from_files(&d2, None, Some("2".to_string()))?;
-        let mut it = TextIterator::new(
+        let multi30k = text_data_generator_from_files(&d, None)?;
+        let multi30k_rev = text_data_generator_from_files(&d2, None)?;
+        let it = TextIterator::new(
             vec![multi30k, multi30k_rev],
             super::TextIterationStrategy::Weighted,
             None,
         )?;
-
         assert_eq!(it.min_len(), 2 * 29000);
-        let mut first_count = 0;
-        let mut second_count = 0;
-        while let Some((data, _)) = it.next() {
-            if data.unwrap().language.unwrap().as_str() == "1" {
-                first_count += 1;
-            } else {
-                second_count += 1;
-            }
-        }
-        assert_eq!(first_count, 29000);
-        assert_eq!(first_count, second_count);
         Ok(())
     }
 
@@ -1036,7 +992,6 @@ mod tests {
         let tokenizer_cfg = TokenizerConfig {
             tokenize: TokenizeConfig::Dummy(Duration::from_millis(200)),
             special: SpecialConfig::default(),
-            language: None,
         };
         let (pipeline, _) = text_data_pipeline_with_tokenizer(
             TextDataPipelineConfig {
@@ -1048,7 +1003,7 @@ mod tests {
             512,
         )?;
         // test if it works with one worker and record the time it took
-        let multi30k = text_data_generator_from_files(&d, None, Some("1".to_string()))?;
+        let multi30k = text_data_generator_from_files(&d, None)?;
         let text_iter = TextIterator::new(
             vec![multi30k],
             super::TextIterationStrategy::Sequential,
@@ -1074,7 +1029,7 @@ mod tests {
 
         // if more cpus are available, test with more workers, check that its faster
         if n_cpus >= 2 {
-            let multi30k = text_data_generator_from_files(&d, None, Some("1".to_string()))?;
+            let multi30k = text_data_generator_from_files(&d, None)?;
             let text_iter = TextIterator::new(
                 vec![multi30k],
                 super::TextIterationStrategy::Sequential,
@@ -1099,7 +1054,7 @@ mod tests {
 
         // test with even more workers, if available
         if n_cpus >= 4 {
-            let multi30k = text_data_generator_from_files(&d, None, Some("1".to_string()))?;
+            let multi30k = text_data_generator_from_files(&d, None)?;
             let text_iter = TextIterator::new(
                 vec![multi30k],
                 super::TextIterationStrategy::Sequential,
@@ -1126,7 +1081,6 @@ mod tests {
         let tokenizer_cfg = TokenizerConfig {
             tokenize: TokenizeConfig::Dummy(Duration::from_millis(0)),
             special: SpecialConfig::default(),
-            language: None,
         };
         let (pipeline, _) = text_data_pipeline_with_tokenizer(
             TextDataPipelineConfig {
@@ -1137,7 +1091,7 @@ mod tests {
             tokenizer_cfg,
             512,
         )?;
-        let multi30k = text_data_generator_from_files(&d, None, Some("1".to_string()))?;
+        let multi30k = text_data_generator_from_files(&d, None)?;
         let text_iter = TextIterator::new(
             vec![multi30k],
             super::TextIterationStrategy::Sequential,
@@ -1166,7 +1120,7 @@ mod tests {
 
         let base = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
         let d = base.clone().join("resources/test/multi30k.txt");
-        let multi30k = text_data_generator_from_files(&d, None, Some("1".to_string()))?;
+        let multi30k = text_data_generator_from_files(&d, None)?;
         let text_iter = TextIterator::new(
             vec![multi30k],
             super::TextIterationStrategy::Sequential,
@@ -1179,7 +1133,6 @@ mod tests {
         let tokenizer_cfg = TokenizerConfig {
             tokenize: TokenizeConfig::Dummy(Duration::from_millis(0)),
             special: SpecialConfig::default(),
-            language: None,
         };
         let (pipeline, _) = text_data_pipeline_with_tokenizer(
             TextDataPipelineConfig {
@@ -1214,7 +1167,7 @@ mod tests {
         // (because some descriptions in multi30k appear twice)
         for shuffle in [true, false] {
             for sort in [true, false] {
-                let multi30k = text_data_generator_from_files(&d, None, Some("1".to_string()))?;
+                let multi30k = text_data_generator_from_files(&d, None)?;
                 let text_iter = TextIterator::new(
                     vec![multi30k],
                     super::TextIterationStrategy::Weighted,
diff --git a/src/data/mod.rs b/src/data/mod.rs
index 08d15bb..4f26b3b 100644
--- a/src/data/mod.rs
+++ b/src/data/mod.rs
@@ -47,18 +47,12 @@ pub struct TextData {
     input: String,
     #[pyo3(get)]
     target: String,
-    #[pyo3(get)]
-    language: Option<String>,
 }
 
 impl TextData {
-    pub fn new(input: String, target: Option<String>, language: Option<String>) -> Self {
+    pub fn new(input: String, target: Option<String>) -> Self {
         let target = target.unwrap_or_else(|| input.clone());
-        TextData {
-            input,
-            target,
-            language,
-        }
+        TextData { input, target }
     }
 }
 
@@ -963,7 +957,6 @@ type DataIter = dyn Iterator<Item = (Batch<Item>, <Batch<Item> as Tensorize>::Ou
 struct DataLoader {
     pipeline: TextDataPipeline,
     files: Vec<(String, Option<String>)>,
-    languages: Option<Vec<String>>,
     strategy: TextIterationStrategy,
     tokenizer_config: TokenizerConfig,
     num_threads: u8,
@@ -991,7 +984,6 @@ impl DataLoader {
     #[allow(clippy::too_many_arguments)]
     fn new(
         files: Vec<(String, Option<String>)>,
-        languages: Option<Vec<String>>,
         pipeline_config: TextDataPipelineConfig,
         tokenizer_config: TokenizerConfig,
         strategy: TextIterationStrategy,
@@ -1027,7 +1019,6 @@ impl DataLoader {
         Ok(DataLoader {
             pipeline,
             files,
-            languages,
             strategy,
             tokenizer_config,
             num_threads,
@@ -1053,13 +1044,8 @@ impl DataLoader {
     fn init_iter(&mut self) -> anyhow::Result<()> {
         let seed = self.seed.unwrap_or(0) + self.epoch as u64;
         let mut generators = vec![];
-        for (idx, (input_file, target_file)) in self.files.iter().enumerate() {
-            let lang = if self.languages.is_some() {
-                Some(self.languages.as_ref().unwrap()[idx].clone())
-            } else {
-                None
-            };
-            let generator = text_data_generator_from_files(input_file, target_file.as_ref(), lang)?;
+        for (input_file, target_file) in self.files.iter() {
+            let generator = text_data_generator_from_files(input_file, target_file.as_ref())?;
             generators.push(generator);
         }
 
@@ -1161,7 +1147,6 @@ impl DataLoader {
         }
         Self::new(
             files,
-            languages,
             pipeline_config,
             tokenizer_config,
             strategy,
diff --git a/src/data/preprocessing.rs b/src/data/preprocessing.rs
index 084f77a..f9b367d 100644
--- a/src/data/preprocessing.rs
+++ b/src/data/preprocessing.rs
@@ -54,12 +54,12 @@ pub enum PreprocessingFnConfig {
     ByteSubstring(usize, bool),
     // randomly edit and replace words in text
     SpellingCorruption(f64, bool, SpellingCorruptionMode),
-    // randomly replace the language token with the given default
-    LanguageDropout(f64),
     // mark inputs with additional info
     Mark(String, String),
     // add prefix to input sequence
     Prefix(String),
+    // decode input and/or target text from json (flags: decode input, decode target)
+    JsonDecode(bool, bool),
     // concatenate input and target sequences with a separator
     Concatenate(String),
 }
@@ -168,12 +168,6 @@ impl<'a> FromPyObject<'a> for PreprocessingFnConfig {
                 };
                 PreprocessingFnConfig::ByteSubstring(max_bytes.extract()?, use_graphemes)
             }
-            "language_dropout" => {
-                let Some(p) = d.get_item("prob")? else {
-                    return Err(py_required_key_error("prob", "language dropout config"));
-                };
-                PreprocessingFnConfig::LanguageDropout(p.extract()?)
-            }
             "spelling_corruption" => {
                 let Some(p) = d.get_item("prob")? else {
                     return Err(py_required_key_error("prob", "spelling corruption config"));
@@ -207,6 +201,19 @@ impl<'a> FromPyObject<'a> for PreprocessingFnConfig {
                 };
                 PreprocessingFnConfig::Prefix(prefix.extract()?)
             }
+            "json_decode" => {
+                let decode_input = d
+                    .get_item("input")?
+                    .map(|value| value.extract())
+                    .transpose()?
+                    .unwrap_or(false);
+                let decode_target = d
+                    .get_item("target")?
+                    .map(|value| value.extract())
+                    .transpose()?
+                    .unwrap_or(false);
+                PreprocessingFnConfig::JsonDecode(decode_input, decode_target)
+            }
             "concatenate" => {
                 let separator = if let Some(sep) = d.get_item("separator")? {
                     sep.extract()?
@@ -302,14 +309,7 @@ fn substring<F: Fn(&str) -> anyhow::Result<Vec<(usize, usize, usize)>> + Send +
                 ))
             }
         };
-        Ok((
-            TextData {
-                target,
-                input,
-                ..item
-            },
-            info,
-        ))
+        Ok((TextData { target, input }, info))
     })
 }
 
@@ -329,25 +329,6 @@ fn byte_substring(max_length: usize, use_graphemes: bool) -> Box<PreprocessingFn
     )
 }
 
-fn language_dropout(prob: f64) -> Box<PreprocessingFn> {
-    let prob = prob.clamp(0.0, 1.0);
-    Box::new(move |item, info| {
-        let mut rng = ChaCha8Rng::seed_from_u64(info.seed);
-        let r: f64 = rng.gen();
-        if r < prob {
-            Ok((
-                TextData {
-                    language: None,
-                    ..item
-                },
-                info,
-            ))
-        } else {
-            Ok((item, info))
-        }
-    })
-}
-
 #[derive(PartialEq, Debug, Clone)]
 pub enum SpellingCorruptionMode {
     Artificial(f64, f64, Option<PathBuf>),
@@ -678,12 +659,24 @@ pub fn preprocessing(cfg: PreprocessingFnConfig) -> Box<PreprocessingFn> {
         PreprocessingFnConfig::Normalize(scheme, use_graphemes) => {
             apply_to_text(move |s| Ok(normalize(s, scheme, use_graphemes)))
         }
-        PreprocessingFnConfig::LanguageDropout(p) => language_dropout(p),
         PreprocessingFnConfig::SpellingCorruption(p, full_del, mode) => {
             corrupt_spelling(p, full_del, mode)
         }
         PreprocessingFnConfig::Mark(key, value) => mark(key, value),
         PreprocessingFnConfig::Prefix(prefix) => apply_to_text(move |s| Ok(prefix.clone() + s)),
+        PreprocessingFnConfig::JsonDecode(decode_input, decode_target) => {
+            Box::new(move |mut item, info| {
+                if decode_input {
+                    item.input = serde_json::from_str(&item.input)
+                        .map_err(|e| anyhow!("failed to decode input text from json: {}", e))?;
+                }
+                if decode_target {
+                    item.target = serde_json::from_str(&item.target)
+                        .map_err(|e| anyhow!("failed to decode target text from json: {}", e))?;
+                }
+                Ok((item, info))
+            })
+        }
         PreprocessingFnConfig::Concatenate(separator) => concatenate(separator),
     }
 }
@@ -697,15 +690,15 @@ mod tests {
     #[test]
     fn test_corrupt_whitespace() -> anyhow::Result<()> {
         let noise_fn = corrupt_whitespace(0.0, 1.0, true);
-        let data = TextData::new("a test".to_string(), None, None);
+        let data = TextData::new("a test".to_string(), None);
         let info = TextDataInfo::default();
         let (noised, _) = noise_fn(data.clone(), info.clone())?;
         assert_eq!(&noised.input, "atest");
         let noise_fn = corrupt_whitespace(1.0, 0.0, true);
-        let data = TextData::new("a test".to_string(), None, None);
+        let data = TextData::new("a test".to_string(), None);
         let (noised, _) = noise_fn(data.clone(), info.clone())?;
         assert_eq!(&noised.input, "a t e s t");
-        let data = TextData::new("Ginsberǵs".to_string(), None, None);
+        let data = TextData::new("Ginsberǵs".to_string(), None);
         let (noised, _) = noise_fn(data.clone(), info.clone())?;
         assert_eq!(&noised.input, "G i n s b e r ǵ s");
         Ok(())
diff --git a/src/grammar.rs b/src/grammar.rs
index 2a79532..ac43fbd 100644
--- a/src/grammar.rs
+++ b/src/grammar.rs
@@ -116,11 +116,6 @@ impl RegexConstraint {
             .map_err(|_| anyhow!("error locking inner state"))
     }
 
-    fn should_stop(&self) -> bool {
-        // always false for regex
-        false
-    }
-
     fn next(&self, index: usize) -> anyhow::Result<()> {
         let inner = self.inner.clone();
         let constraint = self.constraint.clone();
@@ -332,13 +327,6 @@ impl LR1Constraint {
             .map_err(|_| anyhow!("error locking inner state"))
     }
 
-    fn should_stop(&self) -> anyhow::Result<bool> {
-        self.inner
-            .lock()
-            .map(|inner| inner.is_match && self.constraint.only_skippable_matching(&inner.state))
-            .map_err(|_| anyhow!("error locking inner state"))
-    }
-
     fn next(&self, index: usize) -> anyhow::Result<()> {
         let inner = self.inner.clone();
         let constraint = self.constraint.clone();
diff --git a/src/tokenization.rs b/src/tokenization.rs
index 643e065..bac0fa6 100644
--- a/src/tokenization.rs
+++ b/src/tokenization.rs
@@ -713,7 +713,7 @@ pub trait BaseTokenize: Send + Sync + 'static {
 pub trait Tokenize: BaseTokenize {
     fn vocab_size(&self) -> usize;
 
-    fn get_vocab(&self) -> Vec<Vec<u8>>;
+    fn get_vocab(&self) -> anyhow::Result<Vec<Vec<u8>>>;
 
     fn tokenize(
         &self,
@@ -1157,7 +1157,7 @@ where
         self.join_parts(&parts)
     }
 
-    fn get_vocab(&self) -> Vec<Vec<u8>> {
+    fn get_vocab(&self) -> anyhow::Result<Vec<Vec<u8>>> {
         let mut vocab = BTreeMap::new();
         for (token, &id) in &self.state.1.vocab {
             assert!(vocab.insert(id, token.to_bytes()).is_none());
@@ -1165,7 +1165,7 @@ where
         for (special, &id) in &self.special_vocab.vocab {
             assert!(vocab.insert(id, special.to_bytes()).is_none());
         }
-        vocab.into_values().collect()
+        Ok(vocab.into_values().collect())
     }
 }
 
@@ -1199,8 +1199,8 @@ impl Tokenize for DummyTokenizer {
         "".to_string()
     }
 
-    fn get_vocab(&self) -> Vec<Vec<u8>> {
-        vec![]
+    fn get_vocab(&self) -> anyhow::Result<Vec<Vec<u8>>> {
+        Ok(vec![])
     }
 }
 
@@ -1371,7 +1371,7 @@ impl Tokenize for HuggingfaceTokenizer {
             })
     }
 
-    fn get_vocab(&self) -> Vec<Vec<u8>> {
+    fn get_vocab(&self) -> anyhow::Result<Vec<Vec<u8>>> {
         let decode_fn = match self.inner.get_model() {
             hft::ModelWrapper::BPE(bpe) => {
                 let special = bpe
@@ -1398,14 +1398,19 @@ impl Tokenize for HuggingfaceTokenizer {
                         .to_vec()
                 }
             }
-            _ => unimplemented!("get vocab for models other than BPE not implemented"),
+            _ => {
+                return Err(anyhow!(
+                    "get vocab for models other than BPE not implemented"
+                ))
+            }
         };
-        self.inner
+        Ok(self
+            .inner
             .get_vocab(true)
             .into_iter()
             .sorted_by_key(|(_, id)| *id)
             .map(|(k, v)| decode_fn(&k, &v))
-            .collect()
+            .collect())
     }
 }
 
@@ -1530,9 +1535,7 @@ impl BPETokenizer {
                 .zip(bytes.iter().enumerate().skip(1))
                 .filter_map(|((first_idx, first), (second_idx, second))| {
                     let merged = [first.as_slice(), second.as_slice()].concat();
-                    let Some(merge_id) = self.state.0.get(&merged).copied() else {
-                        return None;
-                    };
+                    let merge_id = self.state.0.get(&merged).copied()?;
                     // heap in rust is a max heap by default
                     // so we reverse to get the earliest merges at earlier positions in
                     // the sequence first
@@ -1670,18 +1673,15 @@ impl Tokenize for BPETokenizer {
         String::from_utf8_lossy(&bytes).to_string()
     }
 
-    fn get_vocab(&self) -> Vec<Vec<u8>> {
+    fn get_vocab(&self) -> anyhow::Result<Vec<Vec<u8>>> {
         let mut vocab: BTreeMap<_, _> = (0..256).map(|b| (b as u32, vec![b as u8])).collect();
-
         for (merge, id) in &self.state.0 {
             assert!(vocab.insert(256 + id, merge.clone()).is_none());
         }
-
         for (special, &id) in &self.special_vocab.vocab {
             assert!(vocab.insert(id, special.to_bytes()).is_none());
         }
-
-        vocab.into_values().collect()
+        Ok(vocab.into_values().collect())
     }
 }
 
@@ -2179,12 +2179,12 @@ impl Tokenize for ByteTokenizer {
         String::from_utf8_lossy(&bytes).to_string()
     }
 
-    fn get_vocab(&self) -> Vec<Vec<u8>> {
+    fn get_vocab(&self) -> anyhow::Result<Vec<Vec<u8>>> {
         let mut vocab: BTreeMap<_, _> = (0..256).map(|b| (b as u32, vec![b as u8])).collect();
         for (special, &id) in &self.special_vocab.vocab {
             vocab.insert(id, special.to_bytes());
         }
-        vocab.into_values().collect()
+        Ok(vocab.into_values().collect())
     }
 }
 
@@ -2264,7 +2264,7 @@ impl Tokenize for ByT5Tokenizer {
         self.inner.de_tokenize(token_ids, ignore_special_tokens)
     }
 
-    fn get_vocab(&self) -> Vec<Vec<u8>> {
+    fn get_vocab(&self) -> anyhow::Result<Vec<Vec<u8>>> {
         self.inner.get_vocab()
     }
 }
@@ -2361,7 +2361,7 @@ impl PyTokenizer {
         self.tokenizer.pad_token_id()
     }
 
-    fn get_vocab(&self) -> Vec<Vec<u8>> {
+    fn get_vocab(&self) -> anyhow::Result<Vec<Vec<u8>>> {
         self.tokenizer.get_vocab()
     }
 }
@@ -2410,16 +2410,15 @@ mod tests {
                 use_graphemes: true,
             },
             SpecialConfig::default(),
-            None,
         );
         let text = "a täst";
-        let Tokenization { token_ids, .. } = tok.tokenize(text, None, None, None, true).unwrap();
+        let Tokenization { token_ids, .. } = tok.tokenize(text, None, None, true).unwrap();
         assert_eq!(token_ids.len(), 6);
         assert_eq!(token_ids[3], tok.unk_token_id());
         assert_eq!(tok.de_tokenize(&token_ids, true), "a tst".to_string());
         assert_eq!(tok.de_tokenize(&token_ids, false), "a t<unk>st".to_string());
         let text = "a <pad>täst";
-        let Tokenization { token_ids, .. } = tok.tokenize(text, None, None, None, false).unwrap();
+        let Tokenization { token_ids, .. } = tok.tokenize(text, None, None, false).unwrap();
         assert_eq!(token_ids.len(), 7);
         assert_eq!(token_ids[2], tok.pad_token_id);
         assert_eq!(token_ids[4], tok.unk_token_id());
@@ -2429,7 +2428,7 @@ mod tests {
             "a <pad>t<unk>st".to_string()
         );
         let text = "a <pad>täst";
-        let Tokenization { token_ids, .. } = tok.tokenize(text, None, None, None, true).unwrap();
+        let Tokenization { token_ids, .. } = tok.tokenize(text, None, None, true).unwrap();
         assert_eq!(token_ids.len(), 11);
         assert_eq!(tok.de_tokenize(&token_ids, true), "a <pad>tst".to_string());
         assert_eq!(
@@ -2446,10 +2445,10 @@ mod tests {
             groups: ByteGroups::Bytes,
             aggregation: GroupAggregation::Mean,
         };
-        let tok = ByteTokenizer::new(tokenize_cfg.clone(), SpecialConfig::default(), None);
+        let tok = ByteTokenizer::new(tokenize_cfg.clone(), SpecialConfig::default());
         assert_eq!(tok.vocab_size(), 384);
         let text = "a täst";
-        let Tokenization { token_ids, info } = tok.tokenize(text, None, None, None, true).unwrap();
+        let Tokenization { token_ids, info } = tok.tokenize(text, None, None, true).unwrap();
         assert_eq!(
             token_ids.iter().map(|tok| *tok as u8).collect::<Vec<u8>>(),
             text.as_bytes()
@@ -2478,9 +2477,9 @@ mod tests {
             groups: ByteGroups::CodePoints,
             ..tokenize_cfg
         };
-        let tok = ByteTokenizer::new(tokenize_cfg, SpecialConfig::default(), None);
+        let tok = ByteTokenizer::new(tokenize_cfg, SpecialConfig::default());
         let text = "a täst";
-        let Tokenization { token_ids, info } = tok.tokenize(text, None, None, None, true).unwrap();
+        let Tokenization { token_ids, info } = tok.tokenize(text, None, None, true).unwrap();
         assert_eq!(
             token_ids.iter().map(|tok| *tok as u8).collect::<Vec<u8>>(),
             text.as_bytes()
@@ -2534,10 +2533,10 @@ mod tests {
             max_vocab_size: None,
             use_graphemes: true,
         };
-        let bpe = BPETokenizer::new(bpe_config, special_config, None).unwrap();
+        let bpe = BPETokenizer::new(bpe_config, special_config).unwrap();
         info!("loaded bpe tokenizer");
         let s = "this is a long reading couple restaurant";
-        let token_ids = bpe.tokenize(s, None, None, None, true).unwrap().token_ids;
+        let token_ids = bpe.tokenize(s, None, None, true).unwrap().token_ids;
         info!("token ids: {token_ids:?}");
         let tokens: Vec<_> = token_ids
             .iter()
@@ -2556,7 +2555,7 @@ mod tests {
                 .take(256)
                 .collect();
 
-            let token_ids = bpe.tokenize(&s, None, None, None, true).unwrap().token_ids;
+            let token_ids = bpe.tokenize(&s, None, None, true).unwrap().token_ids;
             let ds = bpe.de_tokenize(&token_ids, true);
             assert_eq!(s, ds);
         }
@@ -2572,8 +2571,7 @@ mod tests {
         };
         let tok = ByT5Tokenizer::new(tokenize_cfg);
         assert_eq!(tok.vocab_size(), 259);
-        let Tokenization { token_ids, info: _ } =
-            tok.tokenize("a täst", None, None, None, true).unwrap();
+        let Tokenization { token_ids, info: _ } = tok.tokenize("a täst", None, None, true).unwrap();
         assert_eq!(token_ids, vec![100, 35, 119, 198, 167, 118, 119, 1]);
     }
 
diff --git a/text-utils-grammar/src/lr1.rs b/text-utils-grammar/src/lr1.rs
index f9b5c3e..14c840c 100644
--- a/text-utils-grammar/src/lr1.rs
+++ b/text-utils-grammar/src/lr1.rs
@@ -181,7 +181,7 @@ fn find_token_or_matching(
         match pdfa.find_prefix_match(state, prefix) {
             PrefixMatch::None => continue,
             PrefixMatch::Maybe(state) => prefix_matches.push((pidx, state)),
-            PrefixMatch::UpTo(end, _) => {
+            PrefixMatch::UpTo(end) => {
                 if !found_token || end > len {
                     len = end;
                     token = tidx.as_ref().copied();
diff --git a/text-utils-grammar/src/utils.rs b/text-utils-grammar/src/utils.rs
index c137e53..d3cb596 100644
--- a/text-utils-grammar/src/utils.rs
+++ b/text-utils-grammar/src/utils.rs
@@ -102,7 +102,7 @@ impl Debug for PrefixDFA {
 pub(crate) enum PrefixMatch {
     None,
     Maybe(StateID),
-    UpTo(usize, StateID),
+    UpTo(usize),
 }
 
 impl PrefixDFA {
@@ -153,16 +153,16 @@ impl PrefixDFA {
     #[inline]
     pub(crate) fn find_prefix_match(&self, mut state: StateID, prefix: &[u8]) -> PrefixMatch {
         let mut last_match = if self.is_match_state(state) {
-            Some((0, state))
+            Some(0)
         } else {
             None
         };
         for (i, &b) in prefix.iter().enumerate() {
             state = self.dfa.next_state(state, b);
             if self.is_match_state(state) {
-                last_match = Some((i + 1, state));
+                last_match = Some(i + 1);
             } else if !self.is_maybe_match(state) {
-                return last_match.map_or(PrefixMatch::None, |(i, s)| PrefixMatch::UpTo(i, s));
+                return last_match.map_or(PrefixMatch::None, |i| PrefixMatch::UpTo(i));
             }
         }
         PrefixMatch::Maybe(state)
diff --git a/text-utils-prefix/Cargo.lock b/text-utils-prefix/Cargo.lock
new file mode 100644
index 0000000..5b9df20
--- /dev/null
+++ b/text-utils-prefix/Cargo.lock
@@ -0,0 +1,979 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "aho-corasick"
+version = "1.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "android-tzdata"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
+
+[[package]]
+name = "android_system_properties"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "anes"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
+
+[[package]]
+name = "anstyle"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc"
+
+[[package]]
+name = "art-tree"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba8426b68532975e76fe7ddd5f8cdd981d750c87b4a24687b5acc1f26ae3f1f2"
+
+[[package]]
+name = "autocfg"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80"
+
+[[package]]
+name = "base64"
+version = "0.21.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567"
+
+[[package]]
+name = "bitflags"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1"
+
+[[package]]
+name = "bumpalo"
+version = "3.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
+
+[[package]]
+name = "cast"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
+
+[[package]]
+name = "cc"
+version = "1.0.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2678b2e3449475e95b0aa6f9b506a28e61b3dc8996592b983695e8ebb58a8b41"
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "chrono"
+version = "0.4.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a0d04d43504c61aa6c7531f1871dd0d418d91130162063b789da00fd7057a5e"
+dependencies = [
+ "android-tzdata",
+ "iana-time-zone",
+ "num-traits",
+ "serde",
+ "windows-targets",
+]
+
+[[package]]
+name = "ciborium"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
+dependencies = [
+ "ciborium-io",
+ "ciborium-ll",
+ "serde",
+]
+
+[[package]]
+name = "ciborium-io"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
+
+[[package]]
+name = "ciborium-ll"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
+dependencies = [
+ "ciborium-io",
+ "half",
+]
+
+[[package]]
+name = "clap"
+version = "4.5.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "90bc066a67923782aa8515dbaea16946c5bcc5addbd668bb80af688e53e548a0"
+dependencies = [
+ "clap_builder",
+]
+
+[[package]]
+name = "clap_builder"
+version = "4.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4"
+dependencies = [
+ "anstyle",
+ "clap_lex",
+]
+
+[[package]]
+name = "clap_lex"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce"
+
+[[package]]
+name = "core-foundation-sys"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f"
+
+[[package]]
+name = "criterion"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
+dependencies = [
+ "anes",
+ "cast",
+ "ciborium",
+ "clap",
+ "criterion-plot",
+ "is-terminal",
+ "itertools 0.10.5",
+ "num-traits",
+ "once_cell",
+ "oorandom",
+ "plotters",
+ "rayon",
+ "regex",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "tinytemplate",
+ "walkdir",
+]
+
+[[package]]
+name = "criterion-plot"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
+dependencies = [
+ "cast",
+ "itertools 0.10.5",
+]
+
+[[package]]
+name = "crossbeam-deque"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
+dependencies = [
+ "crossbeam-epoch",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-epoch"
+version = "0.9.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+dependencies = [
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-utils"
+version = "0.8.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
+
+[[package]]
+name = "crunchy"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
+
+[[package]]
+name = "darling"
+version = "0.20.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "54e36fcd13ed84ffdfda6f5be89b31287cbb80c439841fe69e04841435464391"
+dependencies = [
+ "darling_core",
+ "darling_macro",
+]
+
+[[package]]
+name = "darling_core"
+version = "0.20.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9c2cf1c23a687a1feeb728783b993c4e1ad83d99f351801977dd809b48d0a70f"
+dependencies = [
+ "fnv",
+ "ident_case",
+ "proc-macro2",
+ "quote",
+ "strsim",
+ "syn",
+]
+
+[[package]]
+name = "darling_macro"
+version = "0.20.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a668eda54683121533a393014d8692171709ff57a7d61f187b6e782719f8933f"
+dependencies = [
+ "darling_core",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "deranged"
+version = "0.3.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
+dependencies = [
+ "powerfmt",
+ "serde",
+]
+
+[[package]]
+name = "either"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a"
+
+[[package]]
+name = "equivalent"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
+
+[[package]]
+name = "fnv"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
+
+[[package]]
+name = "getrandom"
+version = "0.2.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94b22e06ecb0110981051723910cbf0b5f5e09a2062dd7663334ee79a9d1286c"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "wasi",
+]
+
+[[package]]
+name = "half"
+version = "2.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
+dependencies = [
+ "cfg-if",
+ "crunchy",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
+
+[[package]]
+name = "hashbrown"
+version = "0.14.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604"
+
+[[package]]
+name = "hermit-abi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024"
+
+[[package]]
+name = "hex"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
+
+[[package]]
+name = "iana-time-zone"
+version = "0.1.60"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141"
+dependencies = [
+ "android_system_properties",
+ "core-foundation-sys",
+ "iana-time-zone-haiku",
+ "js-sys",
+ "wasm-bindgen",
+ "windows-core",
+]
+
+[[package]]
+name = "iana-time-zone-haiku"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
+dependencies = [
+ "cc",
+]
+
+[[package]]
+name = "ident_case"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
+
+[[package]]
+name = "indexmap"
+version = "1.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
+dependencies = [
+ "autocfg",
+ "hashbrown 0.12.3",
+ "serde",
+]
+
+[[package]]
+name = "indexmap"
+version = "2.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26"
+dependencies = [
+ "equivalent",
+ "hashbrown 0.14.3",
+ "serde",
+]
+
+[[package]]
+name = "is-terminal"
+version = "0.4.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "windows-sys",
+]
+
+[[package]]
+name = "itertools"
+version = "0.10.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "itertools"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "itoa"
+version = "1.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
+
+[[package]]
+name = "js-sys"
+version = "0.3.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d"
+dependencies = [
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "libc"
+version = "0.2.153"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd"
+
+[[package]]
+name = "libm"
+version = "0.2.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
+
+[[package]]
+name = "log"
+version = "0.4.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c"
+
+[[package]]
+name = "memchr"
+version = "2.7.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d"
+
+[[package]]
+name = "num-conv"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
+
+[[package]]
+name = "num-traits"
+version = "0.2.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a"
+dependencies = [
+ "autocfg",
+ "libm",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.19.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
+
+[[package]]
+name = "oorandom"
+version = "11.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
+
+[[package]]
+name = "patricia_tree"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "31f2f4539bffe53fc4b4da301df49d114b845b077bd5727b7fe2bd9d8df2ae68"
+dependencies = [
+ "bitflags",
+]
+
+[[package]]
+name = "plotters"
+version = "0.3.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2c224ba00d7cadd4d5c660deaf2098e5e80e07846537c51f9cfa4be50c1fd45"
+dependencies = [
+ "num-traits",
+ "plotters-backend",
+ "plotters-svg",
+ "wasm-bindgen",
+ "web-sys",
+]
+
+[[package]]
+name = "plotters-backend"
+version = "0.3.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9e76628b4d3a7581389a35d5b6e2139607ad7c75b17aed325f210aa91f4a9609"
+
+[[package]]
+name = "plotters-svg"
+version = "0.3.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "38f6d39893cca0701371e3c27294f09797214b86f1fb951b89ade8ec04e2abab"
+dependencies = [
+ "plotters-backend",
+]
+
+[[package]]
+name = "powerfmt"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
+
+[[package]]
+name = "ppv-lite86"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.79"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.35"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "rand"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
+dependencies = [
+ "libc",
+ "rand_chacha",
+ "rand_core",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
+dependencies = [
+ "ppv-lite86",
+ "rand_core",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
+dependencies = [
+ "getrandom",
+]
+
+[[package]]
+name = "rand_distr"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31"
+dependencies = [
+ "num-traits",
+ "rand",
+]
+
+[[package]]
+name = "rayon"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
+dependencies = [
+ "either",
+ "rayon-core",
+]
+
+[[package]]
+name = "rayon-core"
+version = "1.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
+dependencies = [
+ "crossbeam-deque",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "regex"
+version = "1.10.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56"
+
+[[package]]
+name = "ryu"
+version = "1.0.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1"
+
+[[package]]
+name = "same-file"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
+dependencies = [
+ "winapi-util",
+]
+
+[[package]]
+name = "serde"
+version = "1.0.197"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.197"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "serde_json"
+version = "1.0.115"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd"
+dependencies = [
+ "itoa",
+ "ryu",
+ "serde",
+]
+
+[[package]]
+name = "serde_with"
+version = "3.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee80b0e361bbf88fd2f6e242ccd19cfda072cb0faa6ae694ecee08199938569a"
+dependencies = [
+ "base64",
+ "chrono",
+ "hex",
+ "indexmap 1.9.3",
+ "indexmap 2.2.6",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "serde_with_macros",
+ "time",
+]
+
+[[package]]
+name = "serde_with_macros"
+version = "3.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6561dc161a9224638a31d876ccdfefbc1df91d3f3a8342eddb35f055d48c7655"
+dependencies = [
+ "darling",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "strsim"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
+
+[[package]]
+name = "syn"
+version = "2.0.58"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "44cfb93f38070beee36b3fef7d4f5a16f27751d94b187b666a5cc5e9b0d30687"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "text-utils-prefix"
+version = "0.1.0"
+dependencies = [
+ "art-tree",
+ "criterion",
+ "itertools 0.12.1",
+ "patricia_tree",
+ "rand",
+ "rand_chacha",
+ "rand_distr",
+ "rayon",
+ "serde",
+ "serde_json",
+ "serde_with",
+]
+
+[[package]]
+name = "time"
+version = "0.3.34"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8248b6521bb14bc45b4067159b9b6ad792e2d6d754d6c41fb50e29fefe38749"
+dependencies = [
+ "deranged",
+ "itoa",
+ "num-conv",
+ "powerfmt",
+ "serde",
+ "time-core",
+ "time-macros",
+]
+
+[[package]]
+name = "time-core"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
+
+[[package]]
+name = "time-macros"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7ba3a3ef41e6672a2f0f001392bb5dcd3ff0a9992d618ca761a11c3121547774"
+dependencies = [
+ "num-conv",
+ "time-core",
+]
+
+[[package]]
+name = "tinytemplate"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
+dependencies = [
+ "serde",
+ "serde_json",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
+
+[[package]]
+name = "walkdir"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
+dependencies = [
+ "same-file",
+ "winapi-util",
+]
+
+[[package]]
+name = "wasi"
+version = "0.11.0+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
+
+[[package]]
+name = "wasm-bindgen"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8"
+dependencies = [
+ "cfg-if",
+ "wasm-bindgen-macro",
+]
+
+[[package]]
+name = "wasm-bindgen-backend"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da"
+dependencies = [
+ "bumpalo",
+ "log",
+ "once_cell",
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-macro"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726"
+dependencies = [
+ "quote",
+ "wasm-bindgen-macro-support",
+]
+
+[[package]]
+name = "wasm-bindgen-macro-support"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wasm-bindgen-backend",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-shared"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96"
+
+[[package]]
+name = "web-sys"
+version = "0.3.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "winapi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
+dependencies = [
+ "winapi-i686-pc-windows-gnu",
+ "winapi-x86_64-pc-windows-gnu",
+]
+
+[[package]]
+name = "winapi-i686-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+
+[[package]]
+name = "winapi-util"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596"
+dependencies = [
+ "winapi",
+]
+
+[[package]]
+name = "winapi-x86_64-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
+[[package]]
+name = "windows-core"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b"
+dependencies = [
+ "windows_aarch64_gnullvm",
+ "windows_aarch64_msvc",
+ "windows_i686_gnu",
+ "windows_i686_msvc",
+ "windows_x86_64_gnu",
+ "windows_x86_64_gnullvm",
+ "windows_x86_64_msvc",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
diff --git a/text-utils-prefix/src/art.rs b/text-utils-prefix/src/art.rs
index 7b74bc3..8e4e5e4 100644
--- a/text-utils-prefix/src/art.rs
+++ b/text-utils-prefix/src/art.rs
@@ -32,80 +32,6 @@ enum NodeType<V> {
     }, // N256(Children<V, 256>, u16),
 }
 
-// impl<V: Serialize> Serialize for NodeType<V> {
-//     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-//     where
-//         S: serde::Serializer,
-//     {
-//         match self {
-//             NodeType::Leaf(v) => {
-//                 let mut s = serializer.serialize_tuple_variant("N", 0, "L", 1)?;
-//                 s.serialize_field(v)?;
-//                 s.end()
-//             }
-//             NodeType::N4(index, children, n) => {
-//                 let mut s = serializer.serialize_tuple_variant("N", 1, "4", 3)?;
-//                 s.serialize_field(index)?;
-//                 s.serialize_field(children)?;
-//                 s.serialize_field(n)?;
-//                 s.end()
-//             }
-//             NodeType::N16(keys, children, n) => {
-//                 let mut s = serializer.serialize_tuple_variant("N", 2, "16", 3)?;
-//                 s.serialize_field(keys)?;
-//                 s.serialize_field(children)?;
-//                 s.serialize_field(n)?;
-//                 s.end()
-//             }
-//             NodeType::N48(index, children, n) => {
-//                 let mut s = serializer.serialize_tuple_variant("N", 3, "48", 3)?;
-//                 let vec: Vec<_> = index.iter().collect();
-//                 s.serialize_field(&vec)?;
-//                 let vec: Vec<_> = children.iter().collect();
-//                 s.serialize_field(&vec)?;
-//                 s.serialize_field(n)?;
-//                 s.end()
-//             }
-//             NodeType::N256(children, n) => {
-//                 let mut s = serializer.serialize_tuple_variant("N", 4, "256", 2)?;
-//                 let vec: Vec<_> = children.iter().collect();
-//                 s.serialize_field(&vec)?;
-//                 s.serialize_field(n)?;
-//                 s.end()
-//             }
-//         }
-//     }
-// }
-
-// impl<'de, V: Deserialize<'de>> Deserialize<'de> for NodeType<V> {
-//     fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-//     where
-//         D: serde::Deserializer<'de>,
-//     {
-//         struct NodeTypeVisitor<V> {
-//             marker: PhantomData<V>,
-//         }
-//
-//         impl<'de, V> Visitor<'de> for NodeTypeVisitor<V> {
-//             type Value = V;
-//
-//             fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
-//                 formatter.write_str("an adaptive radix trie node type")
-//             }
-//
-//             fn visit_
-//         }
-//
-//         deserializer.deserialize_enum(
-//             "N",
-//             &["L", "4", "16", "48", "256"],
-//             NodeTypeVisitor {
-//                 marker: PhantomData,
-//             },
-//         )
-//     }
-// }
-
 #[derive(Debug, Serialize, Deserialize)]
 struct Node<V> {
     prefix: Box<[u8]>,
@@ -805,7 +731,7 @@ impl<V> PrefixSearch for AdaptiveRadixTrie<V> {
         path
     }
 
-    fn continuations(&self, prefix: &[u8]) -> Box<dyn Iterator<Item = (Vec<u8>, &V)> + '_> {
+    fn iter_continuations(&self, prefix: &[u8]) -> Box<dyn Iterator<Item = (Vec<u8>, &V)> + '_> {
         let Some(root) = &self.root else {
             return Box::new(empty());
         };
diff --git a/text-utils-prefix/src/lib.rs b/text-utils-prefix/src/lib.rs
index fb9e332..d02d07a 100644
--- a/text-utils-prefix/src/lib.rs
+++ b/text-utils-prefix/src/lib.rs
@@ -22,7 +22,7 @@ pub trait PrefixSearch {
 
     fn path(&self, prefix: &[u8]) -> Vec<(usize, &Self::Value)>;
 
-    fn continuations(
+    fn iter_continuations(
         &self,
         prefix: &[u8],
     ) -> Box<dyn Iterator<Item = (Vec<u8>, &Self::Value)> + '_>;
@@ -63,7 +63,7 @@ pub trait ContinuationsTrie {
 
 pub struct ContinuationTrie<T> {
     pub trie: T,
-    continuations: Vec<Vec<u8>>,
+    pub continuations: Vec<Vec<u8>>,
     optimized: (Vec<usize>, Vec<usize>),
 }
 
@@ -115,11 +115,11 @@ where
         self.trie.path(prefix)
     }
 
-    fn continuations(
+    fn iter_continuations(
         &self,
         prefix: &[u8],
     ) -> Box<dyn Iterator<Item = (Vec<u8>, &Self::Value)> + '_> {
-        self.trie.continuations(prefix)
+        self.trie.iter_continuations(prefix)
     }
 }
 
@@ -276,7 +276,7 @@ mod test {
         for prefix in prefixes {
             let conts: Vec<_> = vec
                 .vec
-                .continuations(prefix)
+                .iter_continuations(prefix)
                 .map(|(w, v)| (w, *v))
                 .collect();
             // check that no other words than the given conts start with the prefix
@@ -339,11 +339,11 @@ mod test {
             self.as_ref().path(prefix)
         }
 
-        fn continuations(
+        fn iter_continuations(
             &self,
             prefix: &[u8],
         ) -> Box<dyn Iterator<Item = (Vec<u8>, &Self::Value)> + '_> {
-            self.as_ref().continuations(prefix)
+            self.as_ref().iter_continuations(prefix)
         }
     }
     impl<T: ContinuationsTrie + ?Sized> ContinuationsTrie for Box<T> {
@@ -394,7 +394,10 @@ mod test {
 
         for (_, trie) in tries {
             for prefix in &prefixes {
-                let conts: Vec<_> = trie.continuations(prefix).map(|(w, v)| (w, *v)).collect();
+                let conts: Vec<_> = trie
+                    .iter_continuations(prefix)
+                    .map(|(w, v)| (w, *v))
+                    .collect();
                 // check that no other words than the given conts start with the prefix
                 assert!(words.iter().all(|w| {
                     let w = w.as_bytes();
diff --git a/text-utils-prefix/src/patricia.rs b/text-utils-prefix/src/patricia.rs
index dbde705..1a5ea9b 100644
--- a/text-utils-prefix/src/patricia.rs
+++ b/text-utils-prefix/src/patricia.rs
@@ -441,7 +441,7 @@ impl<V> PrefixSearch for PatriciaTrie<V> {
         path
     }
 
-    fn continuations(&self, prefix: &[u8]) -> Box<dyn Iterator<Item = (Vec<u8>, &V)> + '_> {
+    fn iter_continuations(&self, prefix: &[u8]) -> Box<dyn Iterator<Item = (Vec<u8>, &V)> + '_> {
         let Some(root) = &self.root else {
             return Box::new(empty());
         };
diff --git a/text-utils-prefix/src/vec.rs b/text-utils-prefix/src/vec.rs
index 2004e2c..5216bea 100644
--- a/text-utils-prefix/src/vec.rs
+++ b/text-utils-prefix/src/vec.rs
@@ -23,7 +23,7 @@ impl<V> Default for PrefixVec<V> {
 
 enum FindResult {
     Found(usize, usize),
-    NotFound(usize),
+    NotFound,
 }
 
 impl<V> PrefixVec<V> {
@@ -135,7 +135,7 @@ impl<V> PrefixVec<V> {
             let Some((new_left, new_right)) =
                 self.range_search(k, start_depth + depth, left, right)
             else {
-                return FindResult::NotFound(depth);
+                return FindResult::NotFound;
             };
             left = new_left;
             right = new_right;
@@ -220,14 +220,14 @@ impl<V> PrefixSearch for PrefixVec<V> {
         path
     }
 
-    fn continuations(&self, prefix: &[u8]) -> Box<dyn Iterator<Item = (Vec<u8>, &V)> + '_> {
+    fn iter_continuations(&self, prefix: &[u8]) -> Box<dyn Iterator<Item = (Vec<u8>, &V)> + '_> {
         match self.find_range(prefix, 0, self.data.len(), 0) {
             FindResult::Found(left, right) => Box::new(
                 self.data[left..right]
                     .iter()
                     .map(|(key, value)| (key.to_vec(), value)),
             ),
-            FindResult::NotFound(_) => Box::new(empty()),
+            FindResult::NotFound => Box::new(empty()),
         }
     }
 }
@@ -320,8 +320,8 @@ impl<V> PrefixSearch for ContinuationsVec<V> {
         self.vec.path(prefix)
     }
 
-    fn continuations(&self, prefix: &[u8]) -> Box<dyn Iterator<Item = (Vec<u8>, &V)> + '_> {
-        self.vec.continuations(prefix)
+    fn iter_continuations(&self, prefix: &[u8]) -> Box<dyn Iterator<Item = (Vec<u8>, &V)> + '_> {
+        self.vec.iter_continuations(prefix)
     }
 }