diff --git a/Cargo.lock b/Cargo.lock index 80967e9d..4a0a86ba 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + [[package]] name = "ahash" version = "0.8.11" @@ -78,7 +84,7 @@ dependencies = [ "cc", "cfg-if", "libc", - "miniz_oxide", + "miniz_oxide 0.7.3", "object", "rustc-demangle", ] @@ -216,6 +222,40 @@ dependencies = [ "libc", ] +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" + [[package]] name = "crypto-common" version = "0.1.6" @@ -317,6 +357,12 @@ version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0d6ef0072f8a535281e4876be788938b528e9a1d43900b82c2569af7da799125" +[[package]] +name = "either" +version = "1.13.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" + [[package]] name = "encoding_rs" version = "0.8.34" @@ -366,6 +412,16 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6" +[[package]] +name = "flate2" +version = "1.0.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0" +dependencies = [ + "crc32fast", + "miniz_oxide 0.8.0", +] + [[package]] name = "fnv" version = "1.0.7" @@ -518,6 +574,12 @@ version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + [[package]] name = "h2" version = "0.3.26" @@ -785,6 +847,26 @@ version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +[[package]] +name = "lopdf" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5c8ecfc6c72051981c0459f75ccc585e7ff67c70829560cda8e647882a9abff" +dependencies = [ + "chrono", + "encoding_rs", + "flate2", + "indexmap", + "itoa", + "log", + "md-5", + "nom", + "rangemap", + "rayon", + "time", + "weezl", +] + [[package]] name = "lru-cache" version = "0.1.2" @@ -828,6 +910,12 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "miniz_oxide" version = "0.7.3" @@ -837,6 +925,15 @@ dependencies = [ "adler", ] +[[package]] +name = "miniz_oxide" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +dependencies = [ + "adler2", +] + [[package]] name = "mio" version = "0.8.11" @@ -913,6 +1010,16 @@ dependencies = [ "tempfile", ] +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "nu-ansi-term" version = "0.46.0" @@ -1150,6 +1257,32 @@ dependencies = [ "getrandom", ] +[[package]] +name = "rangemap" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60fcc7d6849342eff22c4350c8b9a989ee8ceabc4b481253e8946b9fe83d684" + +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "redox_syscall" version = "0.5.1" @@ -1215,6 +1348,8 @@ version = "0.2.1" dependencies = [ "anyhow", "futures", + "glob", + "lopdf", "ordered-float", "reqwest", "schemars", @@ -2129,6 +2264,12 @@ version = "0.25.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f20c57d8d7db6d3b86154206ae5d8fba62dd39573114de97c2cb0578251f8e1" +[[package]] +name = "weezl" +version = "0.1.8" +source = 
"registry+https://github.com/rust-lang/crates.io-index"
+checksum = "53a85b86a771b1c87058196170769dd264f66c0782acf1ae6cc51bfd64b39082"
+
 [[package]]
 name = "widestring"
 version = "1.1.0"
diff --git a/rig-core/Cargo.toml b/rig-core/Cargo.toml
index 28561465..c109f09a 100644
--- a/rig-core/Cargo.toml
+++ b/rig-core/Cargo.toml
@@ -8,8 +8,8 @@ description = "An opinionated library for building LLM powered applications."
 repository = "https://github.com/0xPlaygrounds/rig"
 
 [lib]
-name="rig"
-path="src/lib.rs"
+name = "rig"
+path = "src/lib.rs"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
@@ -22,8 +22,13 @@ futures = "0.3.29"
 ordered-float = "4.2.0"
 schemars = "0.8.16"
 thiserror = "1.0.61"
+glob = "0.3.1"
+lopdf = { version = "0.34.0" }
 
 [dev-dependencies]
 anyhow = "1.0.75"
 tokio = { version = "1.34.0", features = ["full"] }
 tracing-subscriber = "0.3.18"
+
+# [features]
+# lopdf = ["dep:lopdf"]
diff --git a/rig-core/src/lib.rs b/rig-core/src/lib.rs
index 86c25209..ca693569 100644
--- a/rig-core/src/lib.rs
+++ b/rig-core/src/lib.rs
@@ -72,6 +72,7 @@ pub mod completion;
 pub mod embeddings;
 pub mod extractor;
 pub mod json_utils;
+pub mod loaders;
 pub mod providers;
 pub mod tool;
 pub mod vector_store;
diff --git a/rig-core/src/loaders.rs b/rig-core/src/loaders.rs
new file mode 100644
index 00000000..e30ee1b1
--- /dev/null
+++ b/rig-core/src/loaders.rs
@@ -0,0 +1,300 @@
+//! Loaders that expand a glob pattern into an iterator over file contents.
+//!
+//! [`FileLoader`] reads each matching file as UTF-8 text and [`PdfLoader`] extracts the text of
+//! each matching PDF with `lopdf`. Both are driven through [`Loader::glob`].
+
+use std::fs;
+use std::path::Path;
+
+use glob::{glob, GlobError};
+use lopdf::{Document, Error as LopdfError};
+use thiserror::Error;
+
+/// Result of expanding a glob pattern into a lazily evaluated sequence of `T`.
+type LoadResult<T> = Result<Box<dyn Iterator<Item = T>>, LoaderError>;
+
+/// Deferred glob expansion held by a loader until [`Loader::glob`] runs it.
+type IterGenerator<T> = Box<dyn FnOnce(String) -> LoadResult<T>>;
+
+/// Errors raised while expanding a glob pattern or reading a matched file.
+#[derive(Error, Debug)]
+pub enum LoaderError {
+    #[error("Invalid glob pattern: {0}")]
+    InvalidGlobPattern(String),
+
+    #[error("IO error: {0}")]
+    IoError(#[from] std::io::Error),
+
+    #[error("Pattern error: {0}")]
+    PatternError(#[from] glob::PatternError),
+
+    #[error("Glob error: {0}")]
+    GlobError(#[from] GlobError),
+}
+
+/// A source of `T` items selected by a glob pattern.
+pub trait Loader<T> {
+    /// Consumes the loader and lazily yields the items found under the paths matching `glob`.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`LoaderError::PatternError`] when `glob` is not a valid pattern.
+    fn glob(self, glob: &str) -> Result<Box<dyn Iterator<Item = T>>, LoaderError>;
+}
+
+/// Loads the files matched by a glob pattern as UTF-8 text.
+///
+/// `State` is the item type produced by [`Loader::glob`]: `String` for bare contents, or
+/// `(path, contents)` after [`FileLoader::with_path`].
+pub struct FileLoader<State> {
+    iter_generator: IterGenerator<State>,
+}
+
+/// A [`FileLoader`] that yields `(path, contents)` pairs.
+pub type FileLoaderWithPath = FileLoader<(String, String)>;
+
+impl FileLoader<String> {
+    /// Creates a loader yielding the contents of each matched file.
+    ///
+    /// Paths that cannot be listed are skipped, and a file that cannot be read as UTF-8 yields
+    /// an empty string.
+    pub fn new() -> Self {
+        Self {
+            iter_generator: Box::new(read_files),
+        }
+    }
+
+    /// Switches the loader to yielding `(path, contents)` pairs.
+    pub fn with_path(self) -> FileLoaderWithPath {
+        FileLoader {
+            iter_generator: Box::new(read_files_with_path),
+        }
+    }
+}
+
+impl Default for FileLoader<String> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl<T> Loader<T> for FileLoader<T> {
+    fn glob(self, pattern: &str) -> Result<Box<dyn Iterator<Item = T>>, LoaderError> {
+        (self.iter_generator)(pattern.to_string())
+    }
+}
+
+/// Yields the contents of every file matching `pattern`.
+fn read_files(pattern: String) -> LoadResult<String> {
+    let contents = glob(&pattern)?
+        .filter_map(Result::ok)
+        // Best effort: an unreadable file contributes an empty string instead of aborting.
+        .map(|path| fs::read_to_string(path).unwrap_or_default());
+    Ok(Box::new(contents))
+}
+
+/// Yields `(path, contents)` for every file matching `pattern`.
+fn read_files_with_path(pattern: String) -> LoadResult<(String, String)> {
+    let files = glob(&pattern)?.filter_map(Result::ok).map(|path| {
+        let content = fs::read_to_string(&path).unwrap_or_default();
+        (path.to_string_lossy().into_owned(), content)
+    });
+    Ok(Box::new(files))
+}
+
+/// Errors specific to PDF loading.
+///
+/// The loaders in this module do not return these variants yet: an unreadable PDF degrades to
+/// empty output instead.
+#[derive(Error, Debug)]
+pub enum PdfLoaderError {
+    #[error("Loader error: {0}")]
+    LoaderError(#[from] LoaderError),
+
+    #[error("PDF error: {0}")]
+    PdfError(#[from] LopdfError),
+}
+
+/// Loads the PDFs matched by a glob pattern as extracted text.
+///
+/// `State` is the item type produced by [`Loader::glob`]; it is selected with
+/// [`PdfLoader::with_path`] and [`PdfLoader::by_page`].
+pub struct PdfLoader<State> {
+    iter_generator: IterGenerator<State>,
+}
+
+/// `(path, text)` of a whole document.
+pub type StateWithPath = (String, String);
+/// `(page number, text)` of a single page, numbered as in `lopdf`'s `Document::get_pages`.
+pub type StateByPage = (usize, String);
+/// `(path, pages)` of a whole document, with one `(page number, text)` entry per page.
+pub type StateWithPathByPage = (String, Vec<(usize, String)>);
+
+impl PdfLoader<String> {
+    /// Creates a loader yielding the concatenated page text of each matched PDF.
+    ///
+    /// Paths that cannot be listed are skipped, and a PDF that cannot be parsed yields an empty
+    /// string.
+    pub fn new() -> Self {
+        Self {
+            iter_generator: Box::new(pdf_texts),
+        }
+    }
+
+    /// Switches the loader to yielding `(path, text)` pairs.
+    pub fn with_path(self) -> PdfLoader<StateWithPath> {
+        PdfLoader {
+            iter_generator: Box::new(pdf_texts_with_path),
+        }
+    }
+
+    /// Switches the loader to yielding one `(page number, text)` item per page.
+    pub fn by_page(self) -> PdfLoader<StateByPage> {
+        PdfLoader {
+            iter_generator: Box::new(pdf_pages),
+        }
+    }
+}
+
+impl Default for PdfLoader<String> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl PdfLoader<StateByPage> {
+    /// Switches the loader to yielding `(path, pages)` with the pages grouped per document.
+    pub fn with_path(self) -> PdfLoader<StateWithPathByPage> {
+        PdfLoader {
+            iter_generator: Box::new(pdf_pages_with_path),
+        }
+    }
+}
+
+impl PdfLoader<StateWithPath> {
+    /// Switches the loader to yielding `(path, pages)` with the pages grouped per document.
+    pub fn by_page(self) -> PdfLoader<StateWithPathByPage> {
+        PdfLoader {
+            iter_generator: Box::new(pdf_pages_with_path),
+        }
+    }
+}
+
+impl<T> Loader<T> for PdfLoader<T> {
+    fn glob(self, pattern: &str) -> Result<Box<dyn Iterator<Item = T>>, LoaderError> {
+        (self.iter_generator)(pattern.to_string())
+    }
+}
+
+/// Parses the PDF at `path`, falling back to an empty document when it cannot be loaded.
+fn load_document(path: &Path) -> Document {
+    // Best effort, like `FileLoader`: one corrupt PDF must not panic the whole iteration.
+    Document::load(path).unwrap_or_default()
+}
+
+/// Returns `(page number, text)` for every page of `doc`, in page order.
+fn extract_pages(doc: &Document) -> Vec<(usize, String)> {
+    doc.get_pages()
+        .into_keys()
+        .map(|number| {
+            // `extract_text` takes page numbers, not the object ids yielded by `page_iter`.
+            let text = doc.extract_text(&[number]).unwrap_or_default();
+            // Page numbers are `u32`, so this cast only widens on supported targets.
+            (number as usize, text)
+        })
+        .collect()
+}
+
+/// Returns the text of every page of `doc`, concatenated in page order.
+fn document_text(doc: &Document) -> String {
+    extract_pages(doc).into_iter().map(|(_, text)| text).collect()
+}
+
+/// Yields the text of every PDF matching `pattern`.
+fn pdf_texts(pattern: String) -> LoadResult<String> {
+    let texts = glob(&pattern)?
+        .filter_map(Result::ok)
+        .map(|path| document_text(&load_document(&path)));
+    Ok(Box::new(texts))
+}
+
+/// Yields `(path, text)` for every PDF matching `pattern`.
+fn pdf_texts_with_path(pattern: String) -> LoadResult<StateWithPath> {
+    let documents = glob(&pattern)?.filter_map(Result::ok).map(|path| {
+        let text = document_text(&load_document(&path));
+        (path.to_string_lossy().into_owned(), text)
+    });
+    Ok(Box::new(documents))
+}
+
+/// Yields `(page number, text)` for every page of every PDF matching `pattern`.
+fn pdf_pages(pattern: String) -> LoadResult<StateByPage> {
+    let pages = glob(&pattern)?
+        .filter_map(Result::ok)
+        .flat_map(|path| extract_pages(&load_document(&path)));
+    Ok(Box::new(pages))
+}
+
+/// Yields `(path, pages)` for every PDF matching `pattern`.
+fn pdf_pages_with_path(pattern: String) -> LoadResult<StateWithPathByPage> {
+    let documents = glob(&pattern)?.filter_map(Result::ok).map(|path| {
+        let pages = extract_pages(&load_document(&path));
+        (path.to_string_lossy().into_owned(), pages)
+    });
+    Ok(Box::new(documents))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{FileLoader, Loader, LoaderError, PdfLoader};
+
+    #[test]
+    fn test_file_loader() {
+        let contents: Vec<String> = FileLoader::new().glob("src/*.rs").unwrap().collect();
+        assert!(contents.iter().any(|content| !content.is_empty()));
+    }
+
+    #[test]
+    fn test_file_loader_with_path() {
+        let loader = FileLoader::new().with_path();
+        let files: Vec<(String, String)> = loader.glob("src/*.rs").unwrap().collect();
+        assert!(!files.is_empty());
+        assert!(files.iter().all(|(path, _)| path.ends_with(".rs")));
+    }
+
+    #[test]
+    fn test_invalid_pattern_is_reported() {
+        let result = FileLoader::new().glob("[");
+        assert!(matches!(result, Err(LoaderError::PatternError(_))));
+    }
+
+    // The PDF tests assume no fixtures: they pass with zero matches and otherwise check that
+    // loading arbitrary PDFs never panics.
+
+    #[test]
+    fn test_pdf_loader() {
+        for content in PdfLoader::new().glob("docs/*.pdf").unwrap() {
+            println!("{}", content);
+        }
+    }
+
+    #[test]
+    fn test_pdf_loader_with_path() {
+        let loader = PdfLoader::new().with_path();
+        for (path, content) in loader.glob("*.pdf").unwrap() {
+            assert!(path.ends_with(".pdf"));
+            println!("{}: {}", path, content);
+        }
+    }
+
+    #[test]
+    fn test_pdf_loader_by_page() {
+        let loader = PdfLoader::new().with_path().by_page();
+        for (path, pages) in loader.glob("*.pdf").expect("no pdfs") {
+            println!("{}:", path);
+            for (number, text) in pages {
+                println!("{}: {}", number, text);
+            }
+        }
+    }
+}