From 51840006249119a9a707aee1e6a4a1fb62f207ae Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Wed, 31 Jan 2024 18:35:42 +0100 Subject: [PATCH] update adaptive prefix trie --- text-utils-prefix/Cargo.toml | 3 +- text-utils-prefix/src/adaptive_radix_trie.rs | 66 ++++++++++++-------- text-utils-prefix/src/bin/test_art.rs | 15 +++++ text-utils-prefix/src/bin/test_patricia.rs | 15 +++++ text-utils-prefix/src/bin/test_trie.rs | 15 +++++ text-utils-prefix/src/patricia_trie.rs | 12 +++- text-utils-prefix/src/trie.rs | 8 +-- 7 files changed, 100 insertions(+), 34 deletions(-) create mode 100644 text-utils-prefix/src/bin/test_art.rs create mode 100644 text-utils-prefix/src/bin/test_patricia.rs create mode 100644 text-utils-prefix/src/bin/test_trie.rs diff --git a/text-utils-prefix/Cargo.toml b/text-utils-prefix/Cargo.toml index 5c1f51f..0d5b3ca 100644 --- a/text-utils-prefix/Cargo.toml +++ b/text-utils-prefix/Cargo.toml @@ -6,6 +6,8 @@ edition = "2021" [dependencies] rayon = "1.8" itertools = "0.12" +serde = { version = "1.0", features = ["derive"] } +serde-big-array = "0.5" [dev-dependencies] criterion = "0.5" @@ -14,7 +16,6 @@ patricia_tree = "0.8.0" rand = "0.8" rand_distr = "0.4" rand_chacha = "0.3" -serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" [profile.release] diff --git a/text-utils-prefix/src/adaptive_radix_trie.rs b/text-utils-prefix/src/adaptive_radix_trie.rs index d88a16a..cf41f5d 100644 --- a/text-utils-prefix/src/adaptive_radix_trie.rs +++ b/text-utils-prefix/src/adaptive_radix_trie.rs @@ -5,18 +5,18 @@ use std::{ use crate::{ContinuationSearch, PrefixSearch}; -type Index = [u8; N]; -type Children = [Option>>; N]; +type Index = Box<[u8; N]>; +type Children = Box<[Option>>; N]>; #[derive(Default, Debug)] enum NodeType { #[default] Empty, Leaf(V), - N4(Index<4>, Children, usize), - N16(Index<16>, Children, usize), - N48(Box>, Children, usize), - N256(Children, usize), + N4(Index<4>, Children, u8), + N16(Index<16>, Children, u8), + N48(Index<256>, Children, u8), + N256(Children, u16), } #[derive(Debug)] @@ -117,7 +117,11 @@ impl Node { fn new_inner(prefix: Vec) -> Self { Self { prefix: prefix.into_boxed_slice(), - inner: NodeType::N4(std::array::from_fn(|_| 0), std::array::from_fn(|_| None), 0), + inner: NodeType::N4( + Box::new(std::array::from_fn(|_| 0)), + Box::new(std::array::from_fn(|_| None)), + 0, + ), } } @@ -184,17 +188,17 @@ impl Node { match &self.inner { NodeType::Empty | NodeType::Leaf(_) => Box::new(empty()), NodeType::N4(_, children, num_children) => Box::new( - children[..*num_children] + children[..*num_children as usize] .iter() .filter_map(|child| child.as_deref()), ), NodeType::N16(_, children, num_children) => Box::new( - children[..*num_children] + children[..*num_children as usize] .iter() .filter_map(|child| child.as_deref()), ), NodeType::N48(_, children, num_children) => Box::new( - children[..*num_children] + children[..*num_children as usize] .iter() .filter_map(|child| child.as_deref()), ), @@ -214,8 +218,9 @@ impl Node { NodeType::Empty | NodeType::Leaf(_) => unreachable!("should not happen"), NodeType::N4(keys, children, num_children) => { // also keep sorted order for n4 for easier upgrade - let idx = keys[..*num_children].binary_search(&key).unwrap_err(); - if idx < *num_children { + let n = *num_children as usize; + let idx = keys[..n].binary_search(&key).unwrap_err(); + if idx < n { keys[idx..].rotate_right(1); children[idx..].rotate_right(1); } @@ -224,8 +229,9 @@ impl Node { *num_children += 1; } NodeType::N16(keys, children, num_children) => { - let idx = keys[..*num_children].binary_search(&key).unwrap_err(); - if idx < *num_children { + let n = *num_children as usize; + let idx = keys[..n].binary_search(&key).unwrap_err(); + if idx < n { keys[idx..].rotate_right(1); children[idx..].rotate_right(1); } @@ -234,8 +240,8 @@ impl Node { *num_children += 1; } NodeType::N48(index, children, num_children) => { - index[key as usize] = *num_children as u8; - children[*num_children] = Some(Box::new(child)); + index[key as usize] = *num_children; + children[*num_children as usize] = Some(Box::new(child)); *num_children += 1; } NodeType::N256(children, num_children) => { @@ -291,6 +297,7 @@ impl Node { NodeType::Empty | NodeType::Leaf(_) => None, NodeType::N4(keys, children, num_children) => { for i in 0..*num_children { + let i = i as usize; if keys[i] == key { return children[i].as_deref(); } @@ -298,7 +305,7 @@ impl Node { None } NodeType::N16(keys, children, num_children) => { - let idx = keys[..*num_children].binary_search(&key).ok()?; + let idx = keys[..*num_children as usize].binary_search(&key).ok()?; children[idx].as_deref() } NodeType::N48(keys, children, _) => { @@ -314,6 +321,7 @@ impl Node { NodeType::Empty | NodeType::Leaf(_) => None, NodeType::N4(keys, children, num_children) => { for i in 0..*num_children { + let i = i as usize; if keys[i] == key { return children[i].as_deref_mut(); } @@ -321,7 +329,7 @@ impl Node { None } NodeType::N16(keys, children, num_children) => { - let idx = keys[..*num_children].binary_search(&key).ok()?; + let idx = keys[..*num_children as usize].binary_search(&key).ok()?; children[idx].as_deref_mut() } NodeType::N48(keys, children, _) => children @@ -336,9 +344,8 @@ impl Node { NodeType::Empty | NodeType::Leaf(_) => { unreachable!("should not happen") } - NodeType::N256(_, num_children) => { + NodeType::N256(..) => { // upgrade should only be called on non empty n256 nodes - assert!(*num_children < 256); return; } NodeType::N4(keys, children, num_children) => { @@ -349,15 +356,15 @@ impl Node { assert_eq!(*num_children, 4); // just move over because n4 is also sorted NodeType::N16( - std::array::from_fn(|i| if i < 4 { keys[i] } else { 0 }), - std::array::from_fn(|i| { + Box::new(std::array::from_fn(|i| if i < 4 { keys[i] } else { 0 })), + Box::new(std::array::from_fn(|i| { if i < 4 { assert!(children[i].is_some()); std::mem::take(&mut children[i]) } else { None } - }), + })), 4, ) } @@ -373,14 +380,14 @@ impl Node { } NodeType::N48( Box::new(index), - std::array::from_fn(|i| { + Box::new(std::array::from_fn(|i| { if i < 16 { assert!(children[i].is_some()); std::mem::take(&mut children[i]) } else { None } - }), + })), 16, ) } @@ -391,7 +398,7 @@ impl Node { } assert_eq!(*num_children, 48); NodeType::N256( - std::array::from_fn(|i| { + Box::new(std::array::from_fn(|i| { let idx = index[i]; if idx < 48 { assert!(children[idx as usize].is_some()); @@ -399,7 +406,7 @@ impl Node { } else { None } - }), + })), 48, ) } @@ -673,12 +680,17 @@ impl ContinuationSearch for AdaptiveRadixTrie { #[cfg(test)] mod test { + use crate::adaptive_radix_trie::Node; use crate::{adaptive_radix_trie::AdaptiveRadixTrie, PrefixSearch}; use std::fs; use std::path::PathBuf; #[test] fn test_trie() { + println!( + "size of adaptive radix trie node: {}", + std::mem::size_of::>() + ); let mut trie = AdaptiveRadixTrie::default(); assert_eq!(trie.get(b"hello"), None); assert_eq!(trie.get(b""), None); diff --git a/text-utils-prefix/src/bin/test_art.rs b/text-utils-prefix/src/bin/test_art.rs new file mode 100644 index 0000000..8f702c0 --- /dev/null +++ b/text-utils-prefix/src/bin/test_art.rs @@ -0,0 +1,15 @@ +use std::{fs, path::PathBuf}; + +use text_utils_prefix::adaptive_radix_trie::AdaptiveRadixTrie; + +fn main() { + let dir = env!("CARGO_MANIFEST_DIR"); + let index = fs::read_to_string(PathBuf::from(dir).join("resources/test/index.txt")) + .expect("failed to read file"); + let n = 10_000_000; + let words: Vec<_> = index.lines().map(|s| s.as_bytes()).take(n).collect(); + + let trie: AdaptiveRadixTrie<_> = words.iter().enumerate().map(|(i, w)| (w, i)).collect(); + let stats = trie.stats(); + println!("{stats:#?}"); +} diff --git a/text-utils-prefix/src/bin/test_patricia.rs b/text-utils-prefix/src/bin/test_patricia.rs new file mode 100644 index 0000000..062b2c0 --- /dev/null +++ b/text-utils-prefix/src/bin/test_patricia.rs @@ -0,0 +1,15 @@ +use std::{fs, path::PathBuf}; + +use text_utils_prefix::patricia_trie::PatriciaTrie; + +fn main() { + let dir = env!("CARGO_MANIFEST_DIR"); + let index = fs::read_to_string(PathBuf::from(dir).join("resources/test/index.txt")) + .expect("failed to read file"); + let n = 1_000_000; + let words: Vec<_> = index.lines().map(|s| s.as_bytes()).take(n).collect(); + + let trie: PatriciaTrie<_> = words.iter().enumerate().map(|(i, w)| (w, i)).collect(); + let stats = trie.stats(); + println!("{stats:#?}"); +} diff --git a/text-utils-prefix/src/bin/test_trie.rs b/text-utils-prefix/src/bin/test_trie.rs new file mode 100644 index 0000000..246b58f --- /dev/null +++ b/text-utils-prefix/src/bin/test_trie.rs @@ -0,0 +1,15 @@ +use std::{fs, path::PathBuf}; + +use text_utils_prefix::trie::Trie; + +fn main() { + let dir = env!("CARGO_MANIFEST_DIR"); + let index = fs::read_to_string(PathBuf::from(dir).join("resources/test/index.txt")) + .expect("failed to read file"); + let n = 100_000; + let words: Vec<_> = index.lines().map(|s| s.as_bytes()).take(n).collect(); + + let trie: Trie<_> = words.iter().enumerate().map(|(i, w)| (w, i)).collect(); + let stats = trie.stats(); + println!("{stats:#?}"); +} diff --git a/text-utils-prefix/src/patricia_trie.rs b/text-utils-prefix/src/patricia_trie.rs index c84c294..bc959e8 100644 --- a/text-utils-prefix/src/patricia_trie.rs +++ b/text-utils-prefix/src/patricia_trie.rs @@ -10,7 +10,7 @@ enum NodeType { #[default] Empty, Leaf(V), - Inner([Option>>; 256]), + Inner(Box<[Option>>; 256]>), } #[derive(Debug)] @@ -105,7 +105,7 @@ impl Node { fn new_inner(prefix: Vec) -> Self { Self { prefix: prefix.into_boxed_slice(), - inner: NodeType::Inner(std::array::from_fn(|_| None)), + inner: NodeType::Inner(Box::new(std::array::from_fn(|_| None))), } } @@ -494,12 +494,20 @@ impl ContinuationSearch for PatriciaTrie { #[cfg(test)] mod test { + use crate::patricia_trie::Node; use crate::{patricia_trie::PatriciaTrie, PrefixSearch}; use std::fs; use std::path::PathBuf; #[test] fn test_trie() { + println!( + "size of patricia trie node: {}, box array: {}, box slice: {}, vec: {}", + std::mem::size_of::>(), + std::mem::size_of::>(), + std::mem::size_of::>(), + std::mem::size_of::>() + ); let mut trie = PatriciaTrie::default(); assert_eq!(trie.get(b"hello"), None); assert_eq!(trie.get(b""), None); diff --git a/text-utils-prefix/src/trie.rs b/text-utils-prefix/src/trie.rs index 8047711..bc84118 100644 --- a/text-utils-prefix/src/trie.rs +++ b/text-utils-prefix/src/trie.rs @@ -1,18 +1,16 @@ -use std::collections::HashMap; - use crate::PrefixSearch; #[derive(Debug)] struct Node { value: Option, - children: [Option>>; 256], + children: Box<[Option>>; 256]>, } impl Default for Node { fn default() -> Self { Self { value: None, - children: std::array::from_fn(|_| None), + children: Box::new(std::array::from_fn(|_| None)), } } } @@ -177,12 +175,14 @@ impl PrefixSearch for Trie { #[cfg(test)] mod test { + use crate::trie::Node; use crate::{trie::Trie, PrefixSearch}; use std::fs; use std::path::PathBuf; #[test] fn test_trie() { + println!("size of trie node: {}", std::mem::size_of::>()); let mut trie = Trie::default(); assert_eq!(trie.get(b"hello"), None); assert_eq!(trie.get(b""), None);