From 97afe6fc30e12fee3a8e33a104299209495292c8 Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Thu, 11 Apr 2024 12:39:15 +0200 Subject: [PATCH] various updates throughout the package --- python/text_utils/api/cli.py | 72 +- python/text_utils/api/processor.py | 21 +- .../cli/create_continuation_index.py | 4 +- python/text_utils/constraints.py | 85 ++ python/text_utils/inference/utils.py | 68 ++ src/continuations.rs | 31 +- src/data/loading.rs | 77 +- src/data/mod.rs | 23 +- src/data/preprocessing.rs | 71 +- src/grammar.rs | 12 - src/tokenization.rs | 66 +- text-utils-grammar/src/lr1.rs | 2 +- text-utils-grammar/src/utils.rs | 8 +- text-utils-prefix/Cargo.lock | 979 ++++++++++++++++++ text-utils-prefix/src/art.rs | 76 +- text-utils-prefix/src/lib.rs | 19 +- text-utils-prefix/src/patricia.rs | 2 +- text-utils-prefix/src/vec.rs | 12 +- 18 files changed, 1301 insertions(+), 327 deletions(-) create mode 100644 python/text_utils/constraints.py create mode 100644 text-utils-prefix/Cargo.lock diff --git a/python/text_utils/api/cli.py b/python/text_utils/api/cli.py index 4ab5817..72fed63 100644 --- a/python/text_utils/api/cli.py +++ b/python/text_utils/api/cli.py @@ -4,7 +4,7 @@ import time import logging import warnings -from typing import Iterator, Iterable, Union, Optional, Type +from typing import Iterator, Iterable, Union, Type try: import readline # noqa except ImportError: @@ -33,6 +33,7 @@ def parser( ) -> argparse.ArgumentParser: parser = argparse.ArgumentParser(name, description) model_group = parser.add_mutually_exclusive_group() + default_model = cls.text_processor_cls.default_model() model_group.add_argument( "-m", "--model", @@ -40,7 +41,7 @@ def parser( model.name for model in cls.text_processor_cls.available_models() ], - default=cls.text_processor_cls.default_model().name, + default=None if default_model is None else default_model.name, help=f"Name of the model to use for {cls.text_processor_cls.task}" ) model_group.add_argument( @@ -185,8 +186,8 @@ def __init__(self, args: argparse.Namespace): def version(self) -> str: raise NotImplementedError - def format_output(self, item: data.InferenceData) -> Iterable[str]: - return [item.text] + def format_output(self, output: str) -> Iterable[str]: + return [output] def _run_with_profiling(self, file: str) -> None: import cProfile @@ -194,17 +195,16 @@ def _run_with_profiling(self, file: str) -> None: def process_iter( self, - text_processor: TextProcessor, - iter: Iterator[data.InferenceData] - ) -> Iterator[data.InferenceData]: + processor: TextProcessor, + iter: Iterator[str] + ) -> Iterator[str]: raise NotImplementedError def process_file( self, - text_processor: TextProcessor, - path: str, - lang: Optional[str], - out_file: Union[str, TextIOWrapper] + processor: TextProcessor, + input_file: str, + output_file: str | TextIOWrapper ) -> None: raise NotImplementedError @@ -277,8 +277,7 @@ def run(self) -> None: start = time.perf_counter() if self.args.process is not None: self.args.progress = False - ipt = data.InferenceData(self.args.process) - opt = next(self.process_iter(self.cor, iter([ipt]))) + opt = next(self.process_iter(self.cor, iter([self.args.process]))) for line in self.format_output(opt): print(line) @@ -290,7 +289,7 @@ def run(self) -> None: assert isinstance(self.args.out_path, str) out = self.args.out_path - self.process_file(self.cor, self.args.file, self.args.lang, out) + self.process_file(self.cor, self.args.file, out) if self.args.report: for d in self.cor.devices: @@ -311,7 +310,7 @@ def run(self) -> None: not self.args.unsorted, self.cor.devices, next(self.cor.model.parameters()).dtype, - batch_max_tokens=self.args.batch_max_tokens, + self.args.batch_max_tokens, ) print(report) @@ -328,34 +327,19 @@ def run(self) -> None: return try: - if self.args.unsorted: - # correct lines from stdin as they come - input_it = ( - data.InferenceData(line.rstrip("\r\n")) - for line in sys.stdin - ) - sized_it = ProgressIterator( - input_it, - self.inference_data_size - ) - outputs = self.process_iter(self.cor, sized_it) - for opt in outputs: - for line in self.format_output(opt): - print(line) - else: - # read stdin completely, then potentially sort and correct - inputs = [ - data.InferenceData(line.rstrip("\r\n")) - for line in sys.stdin - ] - sized_it = ProgressIterator( - iter(inputs), - self.inference_data_size - ) - outputs = self.process_iter(self.cor, sized_it) - for opt in outputs: - for line in self.format_output(opt): - print(line) + # correct lines from stdin as they come + input_it = ( + line.rstrip("\r\n") + for line in sys.stdin + ) + sized_it = ProgressIterator( + input_it, + self.inference_data_size + ) + outputs = self.process_iter(self.cor, sized_it) + for opt in outputs: + for line in self.format_output(opt): + print(line) if self.args.report: for d in self.cor.devices: @@ -373,7 +357,7 @@ def run(self) -> None: not self.args.unsorted, self.cor.devices, next(self.cor.model.parameters()).dtype, - batch_max_tokens=self.args.batch_max_tokens, + self.args.batch_max_tokens, ) print(report) diff --git a/python/text_utils/api/processor.py b/python/text_utils/api/processor.py index 78af78a..8657db6 100644 --- a/python/text_utils/api/processor.py +++ b/python/text_utils/api/processor.py @@ -7,7 +7,7 @@ from tqdm import tqdm import torch -from torch import autocast, nn +from torch import nn from torch.backends import cudnn, cuda from text_utils import ( @@ -40,8 +40,10 @@ def available_models(cls) -> List[ModelInfo]: raise NotImplementedError @classmethod - def default_model(cls) -> ModelInfo: + def default_model(cls) -> ModelInfo | None: available_models = cls.available_models() + if len(available_models) == 0: + return None for info in available_models: if "default" in info.tags: return info @@ -85,7 +87,10 @@ def from_pretrained( force_download: bool = False ): if model is None: - model = cls.default_model().name + default = cls.default_model() + assert default is not None, "no default model available" + model = default.name + assert model is not None assert any(model == m.name for m in cls.available_models()), \ f"model {model} does not match any of the available models:\n" \ @@ -195,7 +200,7 @@ def _process_results( def _get_loader( self, - inputs: Union[Tuple[List[str], Optional[List[str]]], Iterator[data.InferenceData]], + inputs: list[str] | Iterator[data.InferenceData], batch_size: int = 16, batch_max_tokens: Optional[int] = None, sort: bool = True, @@ -229,7 +234,7 @@ def _get_loader( "sort": sort }) self._inference_loader_cfg.update(kwargs) - if isinstance(inputs, tuple): + if isinstance(inputs, list): files, languages = inputs loader = data.InferenceLoader.from_files( files=files, @@ -245,8 +250,8 @@ def _get_loader( ) else: raise ValueError( - f"unknown input type {type(inputs)}, must either be a tuple of " - f"files and languages or an iterator over sequence language pairs" + f"unknown input type {type(inputs)}, must either be a list of " + f"files and an iterator over strings" ) return loader @@ -274,7 +279,7 @@ def _process_sorted( progress_total: int, progress_unit: str = "seq", show_progress: bool = False, - ) -> List[data.InferenceData]: + ) -> list[data.InferenceData]: results = {} pbar = self._pbar( progress_desc, diff --git a/python/text_utils/cli/create_continuation_index.py b/python/text_utils/cli/create_continuation_index.py index 4efd0dc..ce29e49 100644 --- a/python/text_utils/cli/create_continuation_index.py +++ b/python/text_utils/cli/create_continuation_index.py @@ -19,7 +19,7 @@ def create(args: argparse.Namespace): os.makedirs(dir, exist_ok=True) start = time.perf_counter() - continuations.Continuations.build_from_file( + continuations.ContinuationIndex.build_from_file( args.input_file, args.output_file ) @@ -29,7 +29,7 @@ def create(args: argparse.Namespace): start = time.perf_counter() # empty continuations for testing conts = [] - continuations.Continuations.load_with_continuations( + continuations.ContinuationIndex.load_with_continuations( args.output_file, conts ) diff --git a/python/text_utils/constraints.py b/python/text_utils/constraints.py new file mode 100644 index 0000000..68d4870 --- /dev/null +++ b/python/text_utils/constraints.py @@ -0,0 +1,85 @@ +import copy + +from text_utils._internal import grammar +from text_utils._internal import continuations + + +# re-export grammar constraints +RegexConstraint = grammar.RegexConstraint +LR1Constraint = grammar.LR1Constraint + + +class Constraint: + """ + Base class for constraints. + """ + + def get(self) -> tuple[list[int], bool]: + """ + Returns the current constraint indices and whether we + are in a state that matches the constraint. + """ + raise NotImplementedError + + def reset(self, input: bytes | None = None) -> None: + """ + Resets the constraint to the initial state. + """ + raise NotImplementedError + + def next(self, index: int) -> None: + """ + Updates the constraint based on the chosen index / token id. + """ + raise NotImplementedError + + def is_match(self) -> bool: + """ + Returns whether the current state matches the constraint. + """ + raise NotImplementedError + + def clone(self) -> 'Constraint': + """ + Returns a copy of the constraint. + """ + raise NotImplementedError + + +class ContinuationConstraint(Constraint): + """ + Constraint for only allowing certain continuations for + a given prefix. + """ + + def __init__( + self, + cont_index: continuations.ContinuationIndex, + prefix: bytes | None = None + ): + self.prefix = prefix or bytes() + self.value = cont_index.get_value(self.prefix) + self.cont_index = cont_index + + def get(self) -> tuple[list[int], bool]: + indices, value = self.cont_index.get(self.prefix) + self.value = value + return indices, self.is_match() + + def reset(self, input: bytes | None = None) -> None: + self.prefix = input or bytes() + + def next(self, index: int) -> None: + self.prefix += self.cont_index.get_continuation(index) + + def is_match(self) -> bool: + return self.value is not None + + def clone(self) -> 'ContinuationConstraint': + return ContinuationConstraint( + self.cont_index, + self.prefix + ) + + def get_value(self) -> str | None: + return self.value diff --git a/python/text_utils/inference/utils.py b/python/text_utils/inference/utils.py index 162cf1f..2da4e7a 100644 --- a/python/text_utils/inference/utils.py +++ b/python/text_utils/inference/utils.py @@ -1,6 +1,7 @@ from typing import Callable, Any import torch +from text_utils.constraints import Constraint # maps from token ids, length, and other kwargs to distribution over next token id and other info DecodeFn = Callable[..., tuple[torch.Tensor, dict[str, Any]]] @@ -136,6 +137,73 @@ def __repr__(self) -> str: ] +def constraint_logit_fn( + retrieve_constraint_fn: Callable[[int | Beam], Constraint | None], + eos_token_id: int +) -> LogitFn: + def _constrain_logits( + logits: torch.Tensor, + beams_or_indices: list[int] | list[Beam] + ) -> torch.Tensor: + zeros = torch.full_like(logits, float("-inf")) + + batch_indices = [] + constrain_indices = [] + for i, beam_or_idx in enumerate(beams_or_indices): + constraint = retrieve_constraint_fn(beam_or_idx) + + if constraint is None: + zeros[i] = logits[i] + continue + + constrain_to, is_match = constraint.get() + + batch_indices.extend([i] * len(constrain_to)) + constrain_indices.extend(constrain_to) + + if len(constrain_to) == 0 or is_match: + batch_indices.append(i) + constrain_indices.append(eos_token_id) + + batch_indices = torch.tensor(batch_indices, device=logits.device) + constrain_indices = torch.tensor( + constrain_indices, + device=logits.device + ) + + zeros[batch_indices, constrain_indices] = logits[ + batch_indices, + constrain_indices + ] + + return zeros + + return _constrain_logits + + +def constraint_sample_fn( + retrieve_constraint_fn: Callable[[int], Constraint | None], + sample_fn: SampleFn, + eos_token_id: int +) -> SampleFn: + def _constrain_sample( + logits: torch.Tensor, + indices: list[int] + ) -> torch.Tensor: + token_ids = sample_fn(logits, indices) + for idx, token_id in zip(indices, token_ids.tolist()): + if token_id == eos_token_id: + continue + + constraint = retrieve_constraint_fn(idx) + if constraint is not None: + constraint.next(token_id) + + return token_ids + + return _constrain_sample + + def default_beam_candidate_fn() -> BeamCandidateFn: def _default_beam_candidate_fn( beam: Beam, diff --git a/src/continuations.rs b/src/continuations.rs index b57df90..e7c6ac8 100644 --- a/src/continuations.rs +++ b/src/continuations.rs @@ -9,18 +9,18 @@ use pyo3::prelude::*; use text_utils_prefix::{AdaptiveRadixTrie, ContinuationSearch, ContinuationTrie, PrefixSearch}; #[pyclass] -pub struct Continuations { - continuations: ContinuationTrie>, +pub struct ContinuationIndex { + cont_trie: ContinuationTrie>, } pub type ContinuationIndices = (Vec, Vec); #[pymethods] -impl Continuations { +impl ContinuationIndex { #[staticmethod] fn load_with_continuations(file: &str, continuations: Vec>) -> anyhow::Result { let trie = AdaptiveRadixTrie::load(file)?; Ok(Self { - continuations: ContinuationTrie::new(trie, continuations), + cont_trie: ContinuationTrie::new(trie, continuations), }) } @@ -46,14 +46,21 @@ impl Continuations { Ok(()) } - fn get(&self, key: &[u8]) -> Option { - self.continuations.get(key).cloned() + fn get_value(&self, key: &[u8]) -> Option { + self.cont_trie.get(key).cloned() } - fn continuation_indices(&self, prefix: &[u8]) -> (Vec, Option) { + fn get_continuation(&self, index: usize) -> Option<&[u8]> { + self.cont_trie + .continuations + .get(index) + .map(|c| c.as_slice()) + } + + fn get(&self, prefix: &[u8]) -> (Vec, Option) { ( - self.continuations.contains_continuations(prefix), - self.continuations.get(prefix).cloned(), + self.cont_trie.contains_continuations(prefix), + self.cont_trie.get(prefix).cloned(), ) } @@ -62,7 +69,7 @@ impl Continuations { prefixes: Vec>, ) -> (ContinuationIndices, Vec>) { ( - self.continuations + self.cont_trie .batch_contains_continuations(&prefixes) .into_iter() .enumerate() @@ -78,7 +85,7 @@ impl Continuations { ), prefixes .iter() - .map(|prefix| self.continuations.get(prefix).cloned()) + .map(|prefix| self.cont_trie.get(prefix).cloned()) .collect(), ) } @@ -87,7 +94,7 @@ impl Continuations { /// A submodule containing python implementations of a continuation trie pub(super) fn add_submodule(py: Python, parent_module: &PyModule) -> PyResult<()> { let m = PyModule::new(py, "continuations")?; - m.add_class::()?; + m.add_class::()?; parent_module.add_submodule(m)?; Ok(()) diff --git a/src/data/loading.rs b/src/data/loading.rs index f9278bd..64abded 100644 --- a/src/data/loading.rs +++ b/src/data/loading.rs @@ -122,7 +122,6 @@ where pub fn text_data_generator_from_files>( input: P, target: Option

, - lang: Option, ) -> anyhow::Result>>> { let input_len = count_lines(input.as_ref())?; let input_iter = LossyUtf8Reader::new(BufReader::new(open(input.as_ref())?)).lines(); @@ -148,7 +147,7 @@ pub fn text_data_generator_from_files>( } else { None }; - Ok(TextData::new(input_s?, target_s, lang.clone())) + Ok(TextData::new(input_s?, target_s)) }); Ok(Box::new(DataGenerator { min_len: input_len, @@ -168,7 +167,6 @@ pub fn inference_data_generator_from_file( pub fn text_data_generator_from_sequences( input: Vec, target: Option>, - language: Option>, ) -> anyhow::Result>>> { let len = input.len(); let input_iter = input.into_iter(); @@ -182,26 +180,13 @@ pub fn text_data_generator_from_sequences( } else { None }; - let mut lang_iter = if let Some(language) = language { - if language.len() != len { - return Err(anyhow!("expect a language for every sequence")); - } - Some(language.into_iter()) - } else { - None - }; let iter = input_iter.map(move |input_s| { let target_s = if let Some(target_iter_mut) = target_iter.as_mut() { target_iter_mut.next() } else { None }; - let lang_s = if let Some(lang_iter_mut) = lang_iter.as_mut() { - lang_iter_mut.next() - } else { - None - }; - Ok(TextData::new(input_s, target_s, lang_s)) + Ok(TextData::new(input_s, target_s)) }); Ok(Box::new(DataGenerator { iter, min_len: len })) } @@ -915,7 +900,7 @@ mod tests { let base = PathBuf::from(env!("CARGO_MANIFEST_DIR")); let d = base.clone().join("resources/test/multi30k.txt"); let d2 = base.clone().join("resources/test/multi30k_rev.txt"); - let multi30k = text_data_generator_from_files(&d, None, Some("1".to_string()))?; + let multi30k = text_data_generator_from_files(&d, None)?; let mut it = TextIterator::new( vec![multi30k], super::TextIterationStrategy::Sequential, @@ -927,17 +912,15 @@ mod tests { let _data = TextData { target: MULTI30K_FIRST.to_string(), input: MULTI30K_FIRST.to_string(), - language: Some("1".to_string()), }; assert!(matches!(it.next().unwrap(), (Ok(_data), 0))); let _data = TextData { target: MULTI30K_SECOND.to_string(), input: MULTI30K_SECOND.to_string(), - language: Some("1".to_string()), }; assert!(matches!(it.next().unwrap(), (Ok(_data), 0))); // check sequential lines with input and target - let multi30k = text_data_generator_from_files(&d, Some(&d2), Some("1".to_string()))?; + let multi30k = text_data_generator_from_files(&d, Some(&d2))?; let mut it = TextIterator::new( vec![multi30k], super::TextIterationStrategy::Sequential, @@ -948,18 +931,16 @@ mod tests { let _data = TextData { target: MULTI30K_FIRST.to_string(), input: MULTI30K_REV_FIRST.to_string(), - language: Some("1".to_string()), }; assert!(matches!(it.next().unwrap(), (Ok(_data), 0))); let _data = TextData { target: MULTI30K_SECOND.to_string(), input: MULTI30K_REV_SECOND.to_string(), - language: Some("1".to_string()), }; assert!(matches!(it.next().unwrap(), (Ok(_data), 0))); // check interleaved lines with two files - let multi30k = text_data_generator_from_files(&d, None, Some("1".to_string()))?; - let multi30k_rev = text_data_generator_from_files(&d2, None, Some("2".to_string()))?; + let multi30k = text_data_generator_from_files(&d, None)?; + let multi30k_rev = text_data_generator_from_files(&d2, None)?; let mut it = TextIterator::new( vec![multi30k, multi30k_rev], super::TextIterationStrategy::Interleaved, @@ -970,57 +951,32 @@ mod tests { let _data = TextData { target: MULTI30K_FIRST.to_string(), input: MULTI30K_FIRST.to_string(), - language: Some("1".to_string()), }; assert!(matches!(it.next().unwrap(), (Ok(_data), 0))); let _data = TextData { target: MULTI30K_REV_FIRST.to_string(), input: MULTI30K_REV_FIRST.to_string(), - language: Some("2".to_string()), }; assert!(matches!(it.next().unwrap(), (Ok(_data), 1))); let _data = TextData { target: MULTI30K_SECOND.to_string(), input: MULTI30K_SECOND.to_string(), - language: Some("1".to_string()), }; assert!(matches!(it.next().unwrap(), (Ok(_data), 0))); let _data = TextData { target: MULTI30K_REV_SECOND.to_string(), input: MULTI30K_REV_SECOND.to_string(), - language: Some("2".to_string()), }; assert!(matches!(it.next().unwrap(), (Ok(_data), 1))); - // check that they are indeed interleaved - let mut idx: usize = 4; - while let Some((data, _)) = it.next() { - assert_eq!( - &data.unwrap().language.unwrap(), - if idx % 2 == 0 { "1" } else { "2" } - ); - idx += 1; - } // check weighted lines with two files - let multi30k = text_data_generator_from_files(&d, None, Some("1".to_string()))?; - let multi30k_rev = text_data_generator_from_files(&d2, None, Some("2".to_string()))?; - let mut it = TextIterator::new( + let multi30k = text_data_generator_from_files(&d, None)?; + let multi30k_rev = text_data_generator_from_files(&d2, None)?; + let it = TextIterator::new( vec![multi30k, multi30k_rev], super::TextIterationStrategy::Weighted, None, )?; - assert_eq!(it.min_len(), 2 * 29000); - let mut first_count = 0; - let mut second_count = 0; - while let Some((data, _)) = it.next() { - if data.unwrap().language.unwrap().as_str() == "1" { - first_count += 1; - } else { - second_count += 1; - } - } - assert_eq!(first_count, 29000); - assert_eq!(first_count, second_count); Ok(()) } @@ -1036,7 +992,6 @@ mod tests { let tokenizer_cfg = TokenizerConfig { tokenize: TokenizeConfig::Dummy(Duration::from_millis(200)), special: SpecialConfig::default(), - language: None, }; let (pipeline, _) = text_data_pipeline_with_tokenizer( TextDataPipelineConfig { @@ -1048,7 +1003,7 @@ mod tests { 512, )?; // test if it works with one worker and record the time it took - let multi30k = text_data_generator_from_files(&d, None, Some("1".to_string()))?; + let multi30k = text_data_generator_from_files(&d, None)?; let text_iter = TextIterator::new( vec![multi30k], super::TextIterationStrategy::Sequential, @@ -1074,7 +1029,7 @@ mod tests { // if more cpus are available, test with more workers, check that its faster if n_cpus >= 2 { - let multi30k = text_data_generator_from_files(&d, None, Some("1".to_string()))?; + let multi30k = text_data_generator_from_files(&d, None)?; let text_iter = TextIterator::new( vec![multi30k], super::TextIterationStrategy::Sequential, @@ -1099,7 +1054,7 @@ mod tests { // test with even more workers, if available if n_cpus >= 4 { - let multi30k = text_data_generator_from_files(&d, None, Some("1".to_string()))?; + let multi30k = text_data_generator_from_files(&d, None)?; let text_iter = TextIterator::new( vec![multi30k], super::TextIterationStrategy::Sequential, @@ -1126,7 +1081,6 @@ mod tests { let tokenizer_cfg = TokenizerConfig { tokenize: TokenizeConfig::Dummy(Duration::from_millis(0)), special: SpecialConfig::default(), - language: None, }; let (pipeline, _) = text_data_pipeline_with_tokenizer( TextDataPipelineConfig { @@ -1137,7 +1091,7 @@ mod tests { tokenizer_cfg, 512, )?; - let multi30k = text_data_generator_from_files(&d, None, Some("1".to_string()))?; + let multi30k = text_data_generator_from_files(&d, None)?; let text_iter = TextIterator::new( vec![multi30k], super::TextIterationStrategy::Sequential, @@ -1166,7 +1120,7 @@ mod tests { let base = PathBuf::from(env!("CARGO_MANIFEST_DIR")); let d = base.clone().join("resources/test/multi30k.txt"); - let multi30k = text_data_generator_from_files(&d, None, Some("1".to_string()))?; + let multi30k = text_data_generator_from_files(&d, None)?; let text_iter = TextIterator::new( vec![multi30k], super::TextIterationStrategy::Sequential, @@ -1179,7 +1133,6 @@ mod tests { let tokenizer_cfg = TokenizerConfig { tokenize: TokenizeConfig::Dummy(Duration::from_millis(0)), special: SpecialConfig::default(), - language: None, }; let (pipeline, _) = text_data_pipeline_with_tokenizer( TextDataPipelineConfig { @@ -1214,7 +1167,7 @@ mod tests { // (because some descriptions in multi30k appear twice) for shuffle in [true, false] { for sort in [true, false] { - let multi30k = text_data_generator_from_files(&d, None, Some("1".to_string()))?; + let multi30k = text_data_generator_from_files(&d, None)?; let text_iter = TextIterator::new( vec![multi30k], super::TextIterationStrategy::Weighted, diff --git a/src/data/mod.rs b/src/data/mod.rs index 08d15bb..4f26b3b 100644 --- a/src/data/mod.rs +++ b/src/data/mod.rs @@ -47,18 +47,12 @@ pub struct TextData { input: String, #[pyo3(get)] target: String, - #[pyo3(get)] - language: Option, } impl TextData { - pub fn new(input: String, target: Option, language: Option) -> Self { + pub fn new(input: String, target: Option) -> Self { let target = target.unwrap_or_else(|| input.clone()); - TextData { - input, - target, - language, - } + TextData { input, target } } } @@ -963,7 +957,6 @@ type DataIter = dyn Iterator, as Tensorize>::Ou struct DataLoader { pipeline: TextDataPipeline, files: Vec<(String, Option)>, - languages: Option>, strategy: TextIterationStrategy, tokenizer_config: TokenizerConfig, num_threads: u8, @@ -991,7 +984,6 @@ impl DataLoader { #[allow(clippy::too_many_arguments)] fn new( files: Vec<(String, Option)>, - languages: Option>, pipeline_config: TextDataPipelineConfig, tokenizer_config: TokenizerConfig, strategy: TextIterationStrategy, @@ -1027,7 +1019,6 @@ impl DataLoader { Ok(DataLoader { pipeline, files, - languages, strategy, tokenizer_config, num_threads, @@ -1053,13 +1044,8 @@ impl DataLoader { fn init_iter(&mut self) -> anyhow::Result<()> { let seed = self.seed.unwrap_or(0) + self.epoch as u64; let mut generators = vec![]; - for (idx, (input_file, target_file)) in self.files.iter().enumerate() { - let lang = if self.languages.is_some() { - Some(self.languages.as_ref().unwrap()[idx].clone()) - } else { - None - }; - let generator = text_data_generator_from_files(input_file, target_file.as_ref(), lang)?; + for (input_file, target_file) in self.files.iter() { + let generator = text_data_generator_from_files(input_file, target_file.as_ref())?; generators.push(generator); } @@ -1161,7 +1147,6 @@ impl DataLoader { } Self::new( files, - languages, pipeline_config, tokenizer_config, strategy, diff --git a/src/data/preprocessing.rs b/src/data/preprocessing.rs index 084f77a..f9b367d 100644 --- a/src/data/preprocessing.rs +++ b/src/data/preprocessing.rs @@ -54,12 +54,12 @@ pub enum PreprocessingFnConfig { ByteSubstring(usize, bool), // randomly edit and replace words in text SpellingCorruption(f64, bool, SpellingCorruptionMode), - // randomly replace the language token with the given default - LanguageDropout(f64), // mark inputs with additional info Mark(String, String), // add prefix to input sequence Prefix(String), + // decode from json + JsonDecode(bool, bool), // concatenate input and target sequences with a separator Concatenate(String), } @@ -168,12 +168,6 @@ impl<'a> FromPyObject<'a> for PreprocessingFnConfig { }; PreprocessingFnConfig::ByteSubstring(max_bytes.extract()?, use_graphemes) } - "language_dropout" => { - let Some(p) = d.get_item("prob")? else { - return Err(py_required_key_error("prob", "language dropout config")); - }; - PreprocessingFnConfig::LanguageDropout(p.extract()?) - } "spelling_corruption" => { let Some(p) = d.get_item("prob")? else { return Err(py_required_key_error("prob", "spelling corruption config")); @@ -207,6 +201,19 @@ impl<'a> FromPyObject<'a> for PreprocessingFnConfig { }; PreprocessingFnConfig::Prefix(prefix.extract()?) } + "json_decode" => { + let decode_input = d + .get_item("input")? + .map(|value| value.extract()) + .transpose()? + .unwrap_or(false); + let decode_target = d + .get_item("target")? + .map(|value| value.extract()) + .transpose()? + .unwrap_or(false); + PreprocessingFnConfig::JsonDecode(decode_input, decode_target) + } "concatenate" => { let separator = if let Some(sep) = d.get_item("separator")? { sep.extract()? @@ -302,14 +309,7 @@ fn substring anyhow::Result> + Send + )) } }; - Ok(( - TextData { - target, - input, - ..item - }, - info, - )) + Ok((TextData { target, input }, info)) }) } @@ -329,25 +329,6 @@ fn byte_substring(max_length: usize, use_graphemes: bool) -> Box Box { - let prob = prob.clamp(0.0, 1.0); - Box::new(move |item, info| { - let mut rng = ChaCha8Rng::seed_from_u64(info.seed); - let r: f64 = rng.gen(); - if r < prob { - Ok(( - TextData { - language: None, - ..item - }, - info, - )) - } else { - Ok((item, info)) - } - }) -} - #[derive(PartialEq, Debug, Clone)] pub enum SpellingCorruptionMode { Artificial(f64, f64, Option), @@ -678,12 +659,24 @@ pub fn preprocessing(cfg: PreprocessingFnConfig) -> Box { PreprocessingFnConfig::Normalize(scheme, use_graphemes) => { apply_to_text(move |s| Ok(normalize(s, scheme, use_graphemes))) } - PreprocessingFnConfig::LanguageDropout(p) => language_dropout(p), PreprocessingFnConfig::SpellingCorruption(p, full_del, mode) => { corrupt_spelling(p, full_del, mode) } PreprocessingFnConfig::Mark(key, value) => mark(key, value), PreprocessingFnConfig::Prefix(prefix) => apply_to_text(move |s| Ok(prefix.clone() + s)), + PreprocessingFnConfig::JsonDecode(decode_input, decode_target) => { + Box::new(move |mut item, info| { + if decode_input { + item.input = serde_json::from_str(&item.input) + .map_err(|e| anyhow!("failed to decode input text from json: {}", e))?; + } + if decode_target { + item.target = serde_json::from_str(&item.target) + .map_err(|e| anyhow!("failed to decode target text from json: {}", e))?; + } + Ok((item, info)) + }) + } PreprocessingFnConfig::Concatenate(separator) => concatenate(separator), } } @@ -697,15 +690,15 @@ mod tests { #[test] fn test_corrupt_whitespace() -> anyhow::Result<()> { let noise_fn = corrupt_whitespace(0.0, 1.0, true); - let data = TextData::new("a test".to_string(), None, None); + let data = TextData::new("a test".to_string(), None); let info = TextDataInfo::default(); let (noised, _) = noise_fn(data.clone(), info.clone())?; assert_eq!(&noised.input, "atest"); let noise_fn = corrupt_whitespace(1.0, 0.0, true); - let data = TextData::new("a test".to_string(), None, None); + let data = TextData::new("a test".to_string(), None); let (noised, _) = noise_fn(data.clone(), info.clone())?; assert_eq!(&noised.input, "a t e s t"); - let data = TextData::new("Ginsberǵs".to_string(), None, None); + let data = TextData::new("Ginsberǵs".to_string(), None); let (noised, _) = noise_fn(data.clone(), info.clone())?; assert_eq!(&noised.input, "G i n s b e r ǵ s"); Ok(()) diff --git a/src/grammar.rs b/src/grammar.rs index 2a79532..ac43fbd 100644 --- a/src/grammar.rs +++ b/src/grammar.rs @@ -116,11 +116,6 @@ impl RegexConstraint { .map_err(|_| anyhow!("error locking inner state")) } - fn should_stop(&self) -> bool { - // always false for regex - false - } - fn next(&self, index: usize) -> anyhow::Result<()> { let inner = self.inner.clone(); let constraint = self.constraint.clone(); @@ -332,13 +327,6 @@ impl LR1Constraint { .map_err(|_| anyhow!("error locking inner state")) } - fn should_stop(&self) -> anyhow::Result { - self.inner - .lock() - .map(|inner| inner.is_match && self.constraint.only_skippable_matching(&inner.state)) - .map_err(|_| anyhow!("error locking inner state")) - } - fn next(&self, index: usize) -> anyhow::Result<()> { let inner = self.inner.clone(); let constraint = self.constraint.clone(); diff --git a/src/tokenization.rs b/src/tokenization.rs index 643e065..bac0fa6 100644 --- a/src/tokenization.rs +++ b/src/tokenization.rs @@ -713,7 +713,7 @@ pub trait BaseTokenize: Send + Sync + 'static { pub trait Tokenize: BaseTokenize { fn vocab_size(&self) -> usize; - fn get_vocab(&self) -> Vec>; + fn get_vocab(&self) -> anyhow::Result>>; fn tokenize( &self, @@ -1157,7 +1157,7 @@ where self.join_parts(&parts) } - fn get_vocab(&self) -> Vec> { + fn get_vocab(&self) -> anyhow::Result>> { let mut vocab = BTreeMap::new(); for (token, &id) in &self.state.1.vocab { assert!(vocab.insert(id, token.to_bytes()).is_none()); @@ -1165,7 +1165,7 @@ where for (special, &id) in &self.special_vocab.vocab { assert!(vocab.insert(id, special.to_bytes()).is_none()); } - vocab.into_values().collect() + Ok(vocab.into_values().collect()) } } @@ -1199,8 +1199,8 @@ impl Tokenize for DummyTokenizer { "".to_string() } - fn get_vocab(&self) -> Vec> { - vec![] + fn get_vocab(&self) -> anyhow::Result>> { + Ok(vec![]) } } @@ -1371,7 +1371,7 @@ impl Tokenize for HuggingfaceTokenizer { }) } - fn get_vocab(&self) -> Vec> { + fn get_vocab(&self) -> anyhow::Result>> { let decode_fn = match self.inner.get_model() { hft::ModelWrapper::BPE(bpe) => { let special = bpe @@ -1398,14 +1398,19 @@ impl Tokenize for HuggingfaceTokenizer { .to_vec() } } - _ => unimplemented!("get vocab for models other than BPE not implemented"), + _ => { + return Err(anyhow!( + "get vocab for models other than BPE not implemented" + )) + } }; - self.inner + Ok(self + .inner .get_vocab(true) .into_iter() .sorted_by_key(|(_, id)| *id) .map(|(k, v)| decode_fn(&k, &v)) - .collect() + .collect()) } } @@ -1530,9 +1535,7 @@ impl BPETokenizer { .zip(bytes.iter().enumerate().skip(1)) .filter_map(|((first_idx, first), (second_idx, second))| { let merged = [first.as_slice(), second.as_slice()].concat(); - let Some(merge_id) = self.state.0.get(&merged).copied() else { - return None; - }; + let merge_id = self.state.0.get(&merged).copied()?; // heap in rust is a max heap by default // so we reverse to get the earliest merges at earlier positions in // the sequence first @@ -1670,18 +1673,15 @@ impl Tokenize for BPETokenizer { String::from_utf8_lossy(&bytes).to_string() } - fn get_vocab(&self) -> Vec> { + fn get_vocab(&self) -> anyhow::Result>> { let mut vocab: BTreeMap<_, _> = (0..256).map(|b| (b as u32, vec![b as u8])).collect(); - for (merge, id) in &self.state.0 { assert!(vocab.insert(256 + id, merge.clone()).is_none()); } - for (special, &id) in &self.special_vocab.vocab { assert!(vocab.insert(id, special.to_bytes()).is_none()); } - - vocab.into_values().collect() + Ok(vocab.into_values().collect()) } } @@ -2179,12 +2179,12 @@ impl Tokenize for ByteTokenizer { String::from_utf8_lossy(&bytes).to_string() } - fn get_vocab(&self) -> Vec> { + fn get_vocab(&self) -> anyhow::Result>> { let mut vocab: BTreeMap<_, _> = (0..256).map(|b| (b as u32, vec![b as u8])).collect(); for (special, &id) in &self.special_vocab.vocab { vocab.insert(id, special.to_bytes()); } - vocab.into_values().collect() + Ok(vocab.into_values().collect()) } } @@ -2264,7 +2264,7 @@ impl Tokenize for ByT5Tokenizer { self.inner.de_tokenize(token_ids, ignore_special_tokens) } - fn get_vocab(&self) -> Vec> { + fn get_vocab(&self) -> anyhow::Result>> { self.inner.get_vocab() } } @@ -2361,7 +2361,7 @@ impl PyTokenizer { self.tokenizer.pad_token_id() } - fn get_vocab(&self) -> Vec> { + fn get_vocab(&self) -> anyhow::Result>> { self.tokenizer.get_vocab() } } @@ -2410,16 +2410,15 @@ mod tests { use_graphemes: true, }, SpecialConfig::default(), - None, ); let text = "a täst"; - let Tokenization { token_ids, .. } = tok.tokenize(text, None, None, None, true).unwrap(); + let Tokenization { token_ids, .. } = tok.tokenize(text, None, None, true).unwrap(); assert_eq!(token_ids.len(), 6); assert_eq!(token_ids[3], tok.unk_token_id()); assert_eq!(tok.de_tokenize(&token_ids, true), "a tst".to_string()); assert_eq!(tok.de_tokenize(&token_ids, false), "a tst".to_string()); let text = "a täst"; - let Tokenization { token_ids, .. } = tok.tokenize(text, None, None, None, false).unwrap(); + let Tokenization { token_ids, .. } = tok.tokenize(text, None, None, false).unwrap(); assert_eq!(token_ids.len(), 7); assert_eq!(token_ids[2], tok.pad_token_id); assert_eq!(token_ids[4], tok.unk_token_id()); @@ -2429,7 +2428,7 @@ mod tests { "a tst".to_string() ); let text = "a täst"; - let Tokenization { token_ids, .. } = tok.tokenize(text, None, None, None, true).unwrap(); + let Tokenization { token_ids, .. } = tok.tokenize(text, None, None, true).unwrap(); assert_eq!(token_ids.len(), 11); assert_eq!(tok.de_tokenize(&token_ids, true), "a tst".to_string()); assert_eq!( @@ -2446,10 +2445,10 @@ mod tests { groups: ByteGroups::Bytes, aggregation: GroupAggregation::Mean, }; - let tok = ByteTokenizer::new(tokenize_cfg.clone(), SpecialConfig::default(), None); + let tok = ByteTokenizer::new(tokenize_cfg.clone(), SpecialConfig::default()); assert_eq!(tok.vocab_size(), 384); let text = "a täst"; - let Tokenization { token_ids, info } = tok.tokenize(text, None, None, None, true).unwrap(); + let Tokenization { token_ids, info } = tok.tokenize(text, None, None, true).unwrap(); assert_eq!( token_ids.iter().map(|tok| *tok as u8).collect::>(), text.as_bytes() @@ -2478,9 +2477,9 @@ mod tests { groups: ByteGroups::CodePoints, ..tokenize_cfg }; - let tok = ByteTokenizer::new(tokenize_cfg, SpecialConfig::default(), None); + let tok = ByteTokenizer::new(tokenize_cfg, SpecialConfig::default()); let text = "a täst"; - let Tokenization { token_ids, info } = tok.tokenize(text, None, None, None, true).unwrap(); + let Tokenization { token_ids, info } = tok.tokenize(text, None, None, true).unwrap(); assert_eq!( token_ids.iter().map(|tok| *tok as u8).collect::>(), text.as_bytes() @@ -2534,10 +2533,10 @@ mod tests { max_vocab_size: None, use_graphemes: true, }; - let bpe = BPETokenizer::new(bpe_config, special_config, None).unwrap(); + let bpe = BPETokenizer::new(bpe_config, special_config).unwrap(); info!("loaded bpe tokenizer"); let s = "this is a long reading couple restaurant"; - let token_ids = bpe.tokenize(s, None, None, None, true).unwrap().token_ids; + let token_ids = bpe.tokenize(s, None, None, true).unwrap().token_ids; info!("token ids: {token_ids:?}"); let tokens: Vec<_> = token_ids .iter() @@ -2556,7 +2555,7 @@ mod tests { .take(256) .collect(); - let token_ids = bpe.tokenize(&s, None, None, None, true).unwrap().token_ids; + let token_ids = bpe.tokenize(&s, None, None, true).unwrap().token_ids; let ds = bpe.de_tokenize(&token_ids, true); assert_eq!(s, ds); } @@ -2572,8 +2571,7 @@ mod tests { }; let tok = ByT5Tokenizer::new(tokenize_cfg); assert_eq!(tok.vocab_size(), 259); - let Tokenization { token_ids, info: _ } = - tok.tokenize("a täst", None, None, None, true).unwrap(); + let Tokenization { token_ids, info: _ } = tok.tokenize("a täst", None, None, true).unwrap(); assert_eq!(token_ids, vec![100, 35, 119, 198, 167, 118, 119, 1]); } diff --git a/text-utils-grammar/src/lr1.rs b/text-utils-grammar/src/lr1.rs index f9b5c3e..14c840c 100644 --- a/text-utils-grammar/src/lr1.rs +++ b/text-utils-grammar/src/lr1.rs @@ -181,7 +181,7 @@ fn find_token_or_matching( match pdfa.find_prefix_match(state, prefix) { PrefixMatch::None => continue, PrefixMatch::Maybe(state) => prefix_matches.push((pidx, state)), - PrefixMatch::UpTo(end, _) => { + PrefixMatch::UpTo(end) => { if !found_token || end > len { len = end; token = tidx.as_ref().copied(); diff --git a/text-utils-grammar/src/utils.rs b/text-utils-grammar/src/utils.rs index c137e53..d3cb596 100644 --- a/text-utils-grammar/src/utils.rs +++ b/text-utils-grammar/src/utils.rs @@ -102,7 +102,7 @@ impl Debug for PrefixDFA { pub(crate) enum PrefixMatch { None, Maybe(StateID), - UpTo(usize, StateID), + UpTo(usize), } impl PrefixDFA { @@ -153,16 +153,16 @@ impl PrefixDFA { #[inline] pub(crate) fn find_prefix_match(&self, mut state: StateID, prefix: &[u8]) -> PrefixMatch { let mut last_match = if self.is_match_state(state) { - Some((0, state)) + Some(0) } else { None }; for (i, &b) in prefix.iter().enumerate() { state = self.dfa.next_state(state, b); if self.is_match_state(state) { - last_match = Some((i + 1, state)); + last_match = Some(i + 1); } else if !self.is_maybe_match(state) { - return last_match.map_or(PrefixMatch::None, |(i, s)| PrefixMatch::UpTo(i, s)); + return last_match.map_or(PrefixMatch::None, |i| PrefixMatch::UpTo(i)); } } PrefixMatch::Maybe(state) diff --git a/text-utils-prefix/Cargo.lock b/text-utils-prefix/Cargo.lock new file mode 100644 index 0000000..5b9df20 --- /dev/null +++ b/text-utils-prefix/Cargo.lock @@ -0,0 +1,979 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "anstyle" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" + +[[package]] +name = "art-tree" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba8426b68532975e76fe7ddd5f8cdd981d750c87b4a24687b5acc1f26ae3f1f2" + +[[package]] +name = "autocfg" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80" + +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + +[[package]] +name = "bitflags" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" + +[[package]] +name = "bumpalo" +version = "3.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cc" +version = "1.0.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2678b2e3449475e95b0aa6f9b506a28e61b3dc8996592b983695e8ebb58a8b41" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chrono" +version = "0.4.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a0d04d43504c61aa6c7531f1871dd0d418d91130162063b789da00fd7057a5e" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "num-traits", + "serde", + "windows-targets", +] + +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clap" +version = "4.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bc066a67923782aa8515dbaea16946c5bcc5addbd668bb80af688e53e548a0" +dependencies = [ + "clap_builder", +] + +[[package]] +name = "clap_builder" +version = "4.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4" +dependencies = [ + "anstyle", + "clap_lex", +] + +[[package]] +name = "clap_lex" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" + +[[package]] +name = "core-foundation-sys" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" + +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools 0.10.5", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools 0.10.5", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" + +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "darling" +version = "0.20.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54e36fcd13ed84ffdfda6f5be89b31287cbb80c439841fe69e04841435464391" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.20.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c2cf1c23a687a1feeb728783b993c4e1ad83d99f351801977dd809b48d0a70f" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.20.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a668eda54683121533a393014d8692171709ff57a7d61f187b6e782719f8933f" +dependencies = [ + "darling_core", + "quote", + "syn", +] + +[[package]] +name = "deranged" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" +dependencies = [ + "powerfmt", + "serde", +] + +[[package]] +name = "either" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "getrandom" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94b22e06ecb0110981051723910cbf0b5f5e09a2062dd7663334ee79a9d1286c" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "half" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +dependencies = [ + "cfg-if", + "crunchy", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name = "hashbrown" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" + +[[package]] +name = "hermit-abi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "iana-time-zone" +version = "0.1.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", + "serde", +] + +[[package]] +name = "indexmap" +version = "2.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +dependencies = [ + "equivalent", + "hashbrown 0.14.3", + "serde", +] + +[[package]] +name = "is-terminal" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" + +[[package]] +name = "js-sys" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.153" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" + +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" + +[[package]] +name = "log" +version = "0.4.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" + +[[package]] +name = "memchr" +version = "2.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" + +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + +[[package]] +name = "num-traits" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "oorandom" +version = "11.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" + +[[package]] +name = "patricia_tree" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31f2f4539bffe53fc4b4da301df49d114b845b077bd5727b7fe2bd9d8df2ae68" +dependencies = [ + "bitflags", +] + +[[package]] +name = "plotters" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2c224ba00d7cadd4d5c660deaf2098e5e80e07846537c51f9cfa4be50c1fd45" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e76628b4d3a7581389a35d5b6e2139607ad7c75b17aed325f210aa91f4a9609" + +[[package]] +name = "plotters-svg" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38f6d39893cca0701371e3c27294f09797214b86f1fb951b89ade8ec04e2abab" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + +[[package]] +name = "proc-macro2" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rand_distr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +dependencies = [ + "num-traits", + "rand", +] + +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "regex" +version = "1.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56" + +[[package]] +name = "ryu" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "serde" +version = "1.0.197" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.197" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.115" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serde_with" +version = "3.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee80b0e361bbf88fd2f6e242ccd19cfda072cb0faa6ae694ecee08199938569a" +dependencies = [ + "base64", + "chrono", + "hex", + "indexmap 1.9.3", + "indexmap 2.2.6", + "serde", + "serde_derive", + "serde_json", + "serde_with_macros", + "time", +] + +[[package]] +name = "serde_with_macros" +version = "3.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6561dc161a9224638a31d876ccdfefbc1df91d3f3a8342eddb35f055d48c7655" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "syn" +version = "2.0.58" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44cfb93f38070beee36b3fef7d4f5a16f27751d94b187b666a5cc5e9b0d30687" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "text-utils-prefix" +version = "0.1.0" +dependencies = [ + "art-tree", + "criterion", + "itertools 0.12.1", + "patricia_tree", + "rand", + "rand_chacha", + "rand_distr", + "rayon", + "serde", + "serde_json", + "serde_with", +] + +[[package]] +name = "time" +version = "0.3.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8248b6521bb14bc45b4067159b9b6ad792e2d6d754d6c41fb50e29fefe38749" +dependencies = [ + "deranged", + "itoa", + "num-conv", + "powerfmt", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" + +[[package]] +name = "time-macros" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ba3a3ef41e6672a2f0f001392bb5dcd3ff0a9992d618ca761a11c3121547774" +dependencies = [ + "num-conv", + "time-core", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" + +[[package]] +name = "web-sys" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" diff --git a/text-utils-prefix/src/art.rs b/text-utils-prefix/src/art.rs index 7b74bc3..8e4e5e4 100644 --- a/text-utils-prefix/src/art.rs +++ b/text-utils-prefix/src/art.rs @@ -32,80 +32,6 @@ enum NodeType { }, // N256(Children, u16), } -// impl Serialize for NodeType { -// fn serialize(&self, serializer: S) -> Result -// where -// S: serde::Serializer, -// { -// match self { -// NodeType::Leaf(v) => { -// let mut s = serializer.serialize_tuple_variant("N", 0, "L", 1)?; -// s.serialize_field(v)?; -// s.end() -// } -// NodeType::N4(index, children, n) => { -// let mut s = serializer.serialize_tuple_variant("N", 1, "4", 3)?; -// s.serialize_field(index)?; -// s.serialize_field(children)?; -// s.serialize_field(n)?; -// s.end() -// } -// NodeType::N16(keys, children, n) => { -// let mut s = serializer.serialize_tuple_variant("N", 2, "16", 3)?; -// s.serialize_field(keys)?; -// s.serialize_field(children)?; -// s.serialize_field(n)?; -// s.end() -// } -// NodeType::N48(index, children, n) => { -// let mut s = serializer.serialize_tuple_variant("N", 3, "48", 3)?; -// let vec: Vec<_> = index.iter().collect(); -// s.serialize_field(&vec)?; -// let vec: Vec<_> = children.iter().collect(); -// s.serialize_field(&vec)?; -// s.serialize_field(n)?; -// s.end() -// } -// NodeType::N256(children, n) => { -// let mut s = serializer.serialize_tuple_variant("N", 4, "256", 2)?; -// let vec: Vec<_> = children.iter().collect(); -// s.serialize_field(&vec)?; -// s.serialize_field(n)?; -// s.end() -// } -// } -// } -// } - -// impl<'de, V: Deserialize<'de>> Deserialize<'de> for NodeType { -// fn deserialize(deserializer: D) -> Result -// where -// D: serde::Deserializer<'de>, -// { -// struct NodeTypeVisitor { -// marker: PhantomData, -// } -// -// impl<'de, V> Visitor<'de> for NodeTypeVisitor { -// type Value = V; -// -// fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { -// formatter.write_str("an adaptive radix trie node type") -// } -// -// fn visit_ -// } -// -// deserializer.deserialize_enum( -// "N", -// &["L", "4", "16", "48", "256"], -// NodeTypeVisitor { -// marker: PhantomData, -// }, -// ) -// } -// } - #[derive(Debug, Serialize, Deserialize)] struct Node { prefix: Box<[u8]>, @@ -805,7 +731,7 @@ impl PrefixSearch for AdaptiveRadixTrie { path } - fn continuations(&self, prefix: &[u8]) -> Box, &V)> + '_> { + fn iter_continuations(&self, prefix: &[u8]) -> Box, &V)> + '_> { let Some(root) = &self.root else { return Box::new(empty()); }; diff --git a/text-utils-prefix/src/lib.rs b/text-utils-prefix/src/lib.rs index fb9e332..d02d07a 100644 --- a/text-utils-prefix/src/lib.rs +++ b/text-utils-prefix/src/lib.rs @@ -22,7 +22,7 @@ pub trait PrefixSearch { fn path(&self, prefix: &[u8]) -> Vec<(usize, &Self::Value)>; - fn continuations( + fn iter_continuations( &self, prefix: &[u8], ) -> Box, &Self::Value)> + '_>; @@ -63,7 +63,7 @@ pub trait ContinuationsTrie { pub struct ContinuationTrie { pub trie: T, - continuations: Vec>, + pub continuations: Vec>, optimized: (Vec, Vec), } @@ -115,11 +115,11 @@ where self.trie.path(prefix) } - fn continuations( + fn iter_continuations( &self, prefix: &[u8], ) -> Box, &Self::Value)> + '_> { - self.trie.continuations(prefix) + self.trie.iter_continuations(prefix) } } @@ -276,7 +276,7 @@ mod test { for prefix in prefixes { let conts: Vec<_> = vec .vec - .continuations(prefix) + .iter_continuations(prefix) .map(|(w, v)| (w, *v)) .collect(); // check that no other words than the given conts start with the prefix @@ -339,11 +339,11 @@ mod test { self.as_ref().path(prefix) } - fn continuations( + fn iter_continuations( &self, prefix: &[u8], ) -> Box, &Self::Value)> + '_> { - self.as_ref().continuations(prefix) + self.as_ref().iter_continuations(prefix) } } impl ContinuationsTrie for Box { @@ -394,7 +394,10 @@ mod test { for (_, trie) in tries { for prefix in &prefixes { - let conts: Vec<_> = trie.continuations(prefix).map(|(w, v)| (w, *v)).collect(); + let conts: Vec<_> = trie + .iter_continuations(prefix) + .map(|(w, v)| (w, *v)) + .collect(); // check that no other words than the given conts start with the prefix assert!(words.iter().all(|w| { let w = w.as_bytes(); diff --git a/text-utils-prefix/src/patricia.rs b/text-utils-prefix/src/patricia.rs index dbde705..1a5ea9b 100644 --- a/text-utils-prefix/src/patricia.rs +++ b/text-utils-prefix/src/patricia.rs @@ -441,7 +441,7 @@ impl PrefixSearch for PatriciaTrie { path } - fn continuations(&self, prefix: &[u8]) -> Box, &V)> + '_> { + fn iter_continuations(&self, prefix: &[u8]) -> Box, &V)> + '_> { let Some(root) = &self.root else { return Box::new(empty()); }; diff --git a/text-utils-prefix/src/vec.rs b/text-utils-prefix/src/vec.rs index 2004e2c..5216bea 100644 --- a/text-utils-prefix/src/vec.rs +++ b/text-utils-prefix/src/vec.rs @@ -23,7 +23,7 @@ impl Default for PrefixVec { enum FindResult { Found(usize, usize), - NotFound(usize), + NotFound, } impl PrefixVec { @@ -135,7 +135,7 @@ impl PrefixVec { let Some((new_left, new_right)) = self.range_search(k, start_depth + depth, left, right) else { - return FindResult::NotFound(depth); + return FindResult::NotFound; }; left = new_left; right = new_right; @@ -220,14 +220,14 @@ impl PrefixSearch for PrefixVec { path } - fn continuations(&self, prefix: &[u8]) -> Box, &V)> + '_> { + fn iter_continuations(&self, prefix: &[u8]) -> Box, &V)> + '_> { match self.find_range(prefix, 0, self.data.len(), 0) { FindResult::Found(left, right) => Box::new( self.data[left..right] .iter() .map(|(key, value)| (key.to_vec(), value)), ), - FindResult::NotFound(_) => Box::new(empty()), + FindResult::NotFound => Box::new(empty()), } } } @@ -320,8 +320,8 @@ impl PrefixSearch for ContinuationsVec { self.vec.path(prefix) } - fn continuations(&self, prefix: &[u8]) -> Box, &V)> + '_> { - self.vec.continuations(prefix) + fn iter_continuations(&self, prefix: &[u8]) -> Box, &V)> + '_> { + self.vec.iter_continuations(prefix) } }