Skip to content

Commit

Permalink
Cache miniblock metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
westonpace committed Jan 2, 2025
1 parent c0c1b53 commit 4597956
Show file tree
Hide file tree
Showing 6 changed files with 176 additions and 34 deletions.
19 changes: 18 additions & 1 deletion rust/lance-core/src/cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,15 @@
use std::any::{Any, TypeId};
use std::sync::Arc;

use deepsize::{Context, DeepSizeOf};
use futures::Future;
use moka::sync::Cache;
use object_store::path::Path;

use crate::utils::path::LancePathExt;
use crate::Result;

pub use deepsize::{Context, DeepSizeOf};

type ArcAny = Arc<dyn Any + Send + Sync>;

#[derive(Clone)]
Expand Down Expand Up @@ -121,6 +122,12 @@ impl FileMetadataCache {
}
}

/// Fetch an item from the cache, using a str as the key
pub fn get_by_str<T: Send + Sync + 'static>(&self, path: &str) -> Option<Arc<T>> {
self.get(&Path::parse(path).unwrap())
}

/// Fetch an item from the cache
pub fn get<T: Send + Sync + 'static>(&self, path: &Path) -> Option<Arc<T>> {
let cache = self.cache.as_ref()?;
let temp: Path;
Expand All @@ -135,6 +142,7 @@ impl FileMetadataCache {
.map(|metadata| metadata.record.clone().downcast::<T>().unwrap())
}

/// Insert an item into the cache
pub fn insert<T: DeepSizeOf + Send + Sync + 'static>(&self, path: Path, metadata: Arc<T>) {
let Some(cache) = self.cache.as_ref() else {
return;
Expand All @@ -147,6 +155,15 @@ impl FileMetadataCache {
cache.insert((path, TypeId::of::<T>()), SizedRecord::new(metadata));
}

/// Insert an item into the cache, using a str as the key
pub fn insert_by_str<T: DeepSizeOf + Send + Sync + 'static>(
&self,
key: &str,
metadata: Arc<T>,
) {
self.insert(Path::parse(key).unwrap(), metadata);
}

/// Get an item
///
/// If it exists in the cache return that
Expand Down
2 changes: 2 additions & 0 deletions rust/lance-core/src/datatypes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ pub const COMPRESSION_LEVEL_META_KEY: &str = "lance-encoding:compression-level";
pub const BLOB_META_KEY: &str = "lance-encoding:blob";
pub const PACKED_STRUCT_LEGACY_META_KEY: &str = "packed";
pub const PACKED_STRUCT_META_KEY: &str = "lance-encoding:packed";
pub const STRUCTURAL_ENCODING_META_KEY: &str = "lance-encoding:structural-encoding";
pub const STRUCTURAL_ENCODING_MINIBLOCK: &str = "miniblock";

lazy_static::lazy_static! {
pub static ref BLOB_DESC_FIELDS: Fields =
Expand Down
2 changes: 1 addition & 1 deletion rust/lance-core/src/utils/path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ impl LancePathExt for Path {
fn child_path(&self, path: &Path) -> Path {
let mut new_path = self.clone();
for part in path.parts() {
new_path = path.child(part);
new_path = new_path.child(part);
}
new_path
}
Expand Down
39 changes: 38 additions & 1 deletion rust/lance-datagen/src/generator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1307,6 +1307,38 @@ impl BatchGeneratorBuilder {
}
}

/// Factory for creating a single random array
pub struct ArrayGeneratorBuilder {
generator: Box<dyn ArrayGenerator>,
seed: Option<Seed>,
}

impl ArrayGeneratorBuilder {
fn new(generator: Box<dyn ArrayGenerator>) -> Self {
Self {
generator,
seed: None,
}
}

/// Use the given seed for the generator
pub fn with_seed(mut self, seed: Seed) -> Self {
self.seed = Some(seed);
self
}

/// Generate a single array with the given length
pub fn into_array_rows(
mut self,
length: RowCount,
) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(
self.seed.map(|s| s.0).unwrap_or(DEFAULT_SEED.0),
);
self.generator.generate(length, &mut rng)
}
}

const MS_PER_DAY: i64 = 86400000;

pub mod array {
Expand Down Expand Up @@ -1858,11 +1890,16 @@ pub mod array {
}
}

/// Create a BatchGeneratorBuilder to start generating data
/// Create a BatchGeneratorBuilder to start generating batch data
pub fn gen() -> BatchGeneratorBuilder {
BatchGeneratorBuilder::default()
}

/// Create an ArrayGeneratorBuilder to start generating array data
pub fn gen_array(gen: Box<dyn ArrayGenerator>) -> ArrayGeneratorBuilder {
ArrayGeneratorBuilder::new(gen)
}

/// Create a BatchGeneratorBuilder with the given schema
///
/// You can add more columns or convert this into a reader immediately
Expand Down
2 changes: 1 addition & 1 deletion rust/lance-encoding/src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -928,7 +928,7 @@ impl DataBlock {
Self::Empty() => Self::Empty(),
Self::Constant(inner) => Self::Constant(inner),
Self::AllNull(_) => panic!("Cannot remove validity on all-null data"),
Self::Nullable(inner) => *inner.data,
Self::Nullable(inner) => inner.data.remove_validity(),
Self::FixedWidth(inner) => Self::FixedWidth(inner),
Self::FixedSizeList(inner) => Self::FixedSizeList(inner.remove_validity()),
Self::VariableWidth(inner) => Self::VariableWidth(inner),
Expand Down
Loading

0 comments on commit 4597956

Please sign in to comment.