diff --git a/embedder.go b/embedder.go
index 05bde0e..7454487 100644
--- a/embedder.go
+++ b/embedder.go
@@ -39,13 +39,13 @@ type EmbeddedChunk = rag.EmbeddedChunk
 // flexible configuration API.
 //
 // Common options include:
-// - WithEmbedderProvider: Choose the embedding service provider
-// - WithEmbedderModel: Select the specific embedding model
-// - WithEmbedderAPIKey: Configure authentication
+// - SetEmbedderProvider: Choose the embedding service provider
+// - SetEmbedderModel: Select the specific embedding model
+// - SetEmbedderAPIKey: Configure authentication
 // - SetOption: Set custom provider-specific options
 type EmbedderOption = rag.EmbedderOption
 
-// WithEmbedderProvider sets the provider for the Embedder.
+// SetEmbedderProvider sets the provider for the Embedder.
 // Supported providers include:
 // - "openai": OpenAI's text-embedding-ada-002 and other models
 // - "cohere": Cohere's embedding models
@@ -54,14 +54,14 @@ type EmbedderOption = rag.EmbedderOption
 // Example:
 //
 //	embedder, err := NewEmbedder(
-//		WithEmbedderProvider("openai"),
-//		WithEmbedderModel("text-embedding-ada-002"),
+//		SetEmbedderProvider("openai"),
+//		SetEmbedderModel("text-embedding-ada-002"),
 //	)
-func WithEmbedderProvider(provider string) EmbedderOption {
+func SetEmbedderProvider(provider string) EmbedderOption {
 	return rag.SetProvider(provider)
 }
 
-// WithEmbedderModel sets the specific model to use for embedding.
+// SetEmbedderModel sets the specific model to use for embedding.
 // Available models depend on the chosen provider:
 // - OpenAI: "text-embedding-ada-002" (recommended)
 // - Cohere: "embed-multilingual-v2.0"
@@ -70,14 +70,14 @@ func WithEmbedderProvider(provider string) EmbedderOption {
 // Example:
 //
 //	embedder, err := NewEmbedder(
-//		WithEmbedderProvider("openai"),
-//		WithEmbedderModel("text-embedding-ada-002"),
+//		SetEmbedderProvider("openai"),
+//		SetEmbedderModel("text-embedding-ada-002"),
 //	)
-func WithEmbedderModel(model string) EmbedderOption {
+func SetEmbedderModel(model string) EmbedderOption {
 	return rag.SetModel(model)
 }
 
-// WithEmbedderAPIKey sets the authentication key for the embedding service.
+// SetEmbedderAPIKey sets the authentication key for the embedding service.
 // This is required for most cloud-based embedding providers.
 //
 // Security Note: Store API keys securely and never commit them to version control.
@@ -86,10 +86,10 @@ func WithEmbedderModel(model string) EmbedderOption {
 // Example:
 //
 //	embedder, err := NewEmbedder(
-//		WithEmbedderProvider("openai"),
-//		WithEmbedderAPIKey(os.Getenv("OPENAI_API_KEY")),
+//		SetEmbedderProvider("openai"),
+//		SetEmbedderAPIKey(os.Getenv("OPENAI_API_KEY")),
 //	)
-func WithEmbedderAPIKey(apiKey string) EmbedderOption {
+func SetEmbedderAPIKey(apiKey string) EmbedderOption {
 	return rag.SetAPIKey(apiKey)
 }
 
@@ -100,7 +100,7 @@ func WithEmbedderAPIKey(apiKey string) EmbedderOption {
 // Example:
 //
 //	embedder, err := NewEmbedder(
-//		WithEmbedderProvider("openai"),
+//		SetEmbedderProvider("openai"),
 //		SetOption("timeout", 30*time.Second),
 //		SetOption("max_retries", 3),
 //	)
@@ -125,9 +125,9 @@ type Embedder = providers.Embedder
 // Example:
 //
 //	embedder, err := NewEmbedder(
-//		WithEmbedderProvider("openai"),
-//		WithEmbedderModel("text-embedding-ada-002"),
-//		WithEmbedderAPIKey(os.Getenv("OPENAI_API_KEY")),
+//		SetEmbedderProvider("openai"),
+//		SetEmbedderModel("text-embedding-ada-002"),
+//		SetEmbedderAPIKey(os.Getenv("OPENAI_API_KEY")),
 //	)
 //	if err != nil {
 //		log.Fatal(err)
@@ -148,7 +148,7 @@ type EmbeddingService struct {
 //
 // Example:
 //
-//	embedder, _ := NewEmbedder(WithEmbedderProvider("openai"))
+//	embedder, _ := NewEmbedder(SetEmbedderProvider("openai"))
 //	service := NewEmbeddingService(embedder)
 func NewEmbeddingService(embedder Embedder) *EmbeddingService {
 	return &EmbeddingService{
diff --git a/examples/concurrent_loader_example.go b/examples/concurrent_loader_example.go
index 7699808..917512c 100644
--- a/examples/concurrent_loader_example.go
+++ b/examples/concurrent_loader_example.go
@@ -17,7 +17,7 @@ func main() {
 
 	// Create a new ConcurrentPDFLoader with custom options
 	loader := raggo.NewConcurrentPDFLoader(
-		raggo.SetTimeout(1*time.Minute),
+		raggo.SetLoaderTimeout(1*time.Minute),
 		raggo.SetTempDir(os.TempDir()),
 	)
 
@@ -51,4 +51,3 @@ func main() {
 		fmt.Printf("%d. %s\n", i+1, file)
 	}
 }
-
diff --git a/examples/embedding_example.go b/examples/embedding_example.go
index 8d6b740..416d35a 100644
--- a/examples/embedding_example.go
+++ b/examples/embedding_example.go
@@ -30,9 +30,9 @@ func main() {
 
 	// Create a new Embedder
 	embedder, err := raggo.NewEmbedder(
-		raggo.SetProvider("openai"),
-		raggo.SetAPIKey(os.Getenv("OPENAI_API_KEY")), // Make sure to set this environment variable
-		raggo.SetModel("text-embedding-3-small"),
+		raggo.SetEmbedderProvider("openai"),
+		raggo.SetEmbedderAPIKey(os.Getenv("OPENAI_API_KEY")), // Make sure to set this environment variable
+		raggo.SetEmbedderModel("text-embedding-3-small"),
 	)
 	if err != nil {
 		log.Fatalf("Failed to create embedder: %v", err)
@@ -76,7 +76,7 @@ func main() {
 		}
 		fmt.Printf("Embedded Chunk %d:\n", i+1)
 		fmt.Printf(" Text: %s\n", truncateString(chunk.Text, 50))
-		fmt.Printf(" Embedding Vector Length: %d\n", len(chunk.Embedding))
+		fmt.Printf(" Embedding Vector Length: %d\n", len(chunk.Embeddings["default"]))
 		fmt.Printf(" Metadata: %v\n", chunk.Metadata)
 		fmt.Println()
 	}
diff --git a/examples/full_process.go b/examples/full_process.go
index 3882649..eea40fe 100644
--- a/examples/full_process.go
+++ b/examples/full_process.go
@@ -87,9 +87,9 @@ func main() {
 	}
 
 	embedder, err := raggo.NewEmbedder(
-		raggo.WithEmbedderProvider("openai"),
-		raggo.WithEmbedderAPIKey(os.Getenv("OPENAI_API_KEY")),
-		raggo.WithEmbedderModel("text-embedding-3-small"),
+		raggo.SetEmbedderProvider("openai"),
+		raggo.SetEmbedderAPIKey(os.Getenv("OPENAI_API_KEY")),
+		raggo.SetEmbedderModel("text-embedding-3-small"),
 	)
 	if err != nil {
 		log.Fatalf("Failed to create embedder: %v", err)
diff --git a/examples/loader_example.go b/examples/loader_example.go
index 1c96469..761960c 100644
--- a/examples/loader_example.go
+++ b/examples/loader_example.go
@@ -17,7 +17,7 @@ func main() {
 
 	// Create a new Loader with custom options
 	loader := raggo.NewLoader(
-		raggo.SetTimeout(1*time.Minute),
+		raggo.SetLoaderTimeout(1*time.Minute),
 		raggo.SetTempDir(os.TempDir()),
 	)
 
@@ -99,4 +99,3 @@ func dirExample(loader raggo.Loader) {
 		}
 	}
 }
-
diff --git a/examples/parser_example.go b/examples/parser_example.go
index 08b93a5..0768393 100644
--- a/examples/parser_example.go
+++ b/examples/parser_example.go
@@ -1,6 +1,7 @@
 package main
 
 import (
+	"context"
 	"fmt"
 	"log"
 	"os"
@@ -14,13 +15,14 @@ func main() {
 	raggo.SetLogLevel(raggo.LogLevelInfo)
 
 	parser := raggo.NewParser()
+	loader := raggo.NewLoader()
 
 	fmt.Println("Running examples with INFO level logging:")
 
-	runExamples(parser)
+	runExamples(parser, loader)
 }
 
-func runExamples(parser raggo.Parser) {
+func runExamples(parser raggo.Parser, loader raggo.Loader) {
 	// Example 1: Parse PDF file
 	pdfExample(parser)
 
@@ -28,7 +30,7 @@ func runExamples(parser raggo.Parser) {
 	textExample(parser)
 
 	// Example 3: Parse directory
-	dirExample(parser)
+	dirExample(loader)
 }
 
 func pdfExample(parser raggo.Parser) {
@@ -71,34 +73,21 @@ func textExample(parser raggo.Parser) {
 	fmt.Printf("Text file parsed. Content length: %d\n", len(doc.Content))
 }
 
-func dirExample(parser raggo.Parser) {
-	fmt.Println("Example 3: Parsing directory")
+func dirExample(loader raggo.Loader) {
+	fmt.Println("Example 3: Loading directory")
 	wd, err := os.Getwd()
 	if err != nil {
 		log.Fatalf("Failed to get working directory: %v", err)
 	}
 	testDataDir := filepath.Join(wd, "testdata")
-	fileCount := 0
-	err = filepath.Walk(testDataDir, func(path string, info os.FileInfo, err error) error {
-		if err != nil {
-			return err
-		}
-		if !info.IsDir() {
-			_, err := parser.Parse(path)
-			if err != nil {
-				fmt.Printf("Error parsing %s: %v\n", path, err)
-			} else {
-				fileCount++
-			}
-		}
-		return nil
-	})
-
+	paths, err := loader.LoadDir(context.Background(), testDataDir)
 	if err != nil {
-		log.Printf("Error walking directory: %v\n", err)
+		log.Printf("Error loading directory: %v\n", err)
 	} else {
-		fmt.Printf("Parsed %d files in directory\n", fileCount)
+		fmt.Printf("Loaded %d files from directory\n", len(paths))
+		for i, path := range paths {
+			fmt.Printf("%d: %s\n", i+1, path)
+		}
 	}
 }
-
diff --git a/examples/process_embedding_benchmark.go b/examples/process_embedding_benchmark.go
index 37173b8..dc3a197 100644
--- a/examples/process_embedding_benchmark.go
+++ b/examples/process_embedding_benchmark.go
@@ -35,9 +35,9 @@ func main() {
 	}
 
 	embedder, err := raggo.NewEmbedder(
-		raggo.WithEmbedderProvider("openai"),
-		raggo.WithEmbedderAPIKey(os.Getenv("OPENAI_API_KEY")),
-		raggo.WithEmbedderModel("text-embedding-3-small"),
+		raggo.SetEmbedderProvider("openai"),
+		raggo.SetEmbedderAPIKey(os.Getenv("OPENAI_API_KEY")),
+		raggo.SetEmbedderModel("text-embedding-3-small"),
 	)
 	if err != nil {
 		log.Fatalf("Failed to create embedder: %v", err)
@@ -53,10 +53,78 @@ func main() {
 		log.Fatalf("Failed to create LLM: %v", err)
 	}
 
-	benchmarkPDFProcessing(parser, chunker, embedder, llm, targetDir)
+	// Create VectorDB instance
+	vectorDB, err := raggo.NewVectorDB(
+		raggo.WithType("milvus"),
+		raggo.WithAddress("localhost:19530"),
+		raggo.WithTimeout(30*time.Second),
+	)
+	if err != nil {
+		log.Fatalf("Failed to create vector database: %v", err)
+	}
+	defer vectorDB.Close()
+
+	// Connect to the database
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	if err := vectorDB.Connect(ctx); err != nil {
+		log.Fatalf("Failed to connect to vector database: %v", err)
+	}
+
+	collectionName := "benchmark_docs"
+
+	// Create collection with schema if it doesn't exist
+	exists, err := vectorDB.HasCollection(ctx, collectionName)
+	if err != nil {
+		log.Fatalf("Failed to check collection existence: %v", err)
+	}
+
+	if exists {
+		err = vectorDB.DropCollection(ctx, collectionName)
+		if err != nil {
+			log.Fatalf("Failed to drop existing collection: %v", err)
+		}
+	}
+
+	schema := raggo.Schema{
+		Name: collectionName,
+		Fields: []raggo.Field{
+			{Name: "ID", DataType: "int64", PrimaryKey: true, AutoID: false},
+			{Name: "Embedding", DataType: "float_vector", Dimension: 1536},
+			{Name: "Text", DataType: "varchar", MaxLength: 65535},
+			{Name: "Metadata", DataType: "json", MaxLength: 65535},
+		},
+	}
+
+	err = vectorDB.CreateCollection(ctx, collectionName, schema)
+	if err != nil {
+		log.Fatalf("Failed to create collection: %v", err)
+	}
+
+	// Create index for vector search
+	err = vectorDB.CreateIndex(ctx, collectionName, "Embedding", raggo.Index{
+		Type:   "HNSW",
+		Metric: "L2",
+		Parameters: map[string]interface{}{
+			"M":              16,
+			"efConstruction": 256,
+		},
+	})
+	if err != nil {
+		log.Fatalf("Failed to create index: %v", err)
+	}
+
+	// Load the collection
+	err = vectorDB.LoadCollection(ctx, collectionName)
+	if err != nil {
+		log.Fatalf("Failed to load collection: %v", err)
+	}
+
+	benchmarkPDFProcessing(parser, chunker, embedder, llm, vectorDB, targetDir, collectionName)
 }
 
-func benchmarkPDFProcessing(parser raggo.Parser, chunker raggo.Chunker, embedder raggo.Embedder, llm gollm.LLM, targetDir string) {
+func benchmarkPDFProcessing(parser raggo.Parser, chunker raggo.Chunker, embedder raggo.Embedder, llm gollm.LLM, vectorDB *raggo.VectorDB, targetDir, collectionName string) {
 	files, err := filepath.Glob(filepath.Join(targetDir, "*.pdf"))
 	if err != nil {
 		log.Fatalf("Failed to list PDF files: %v", err)
@@ -82,7 +150,7 @@ func benchmarkPDFProcessing(parser raggo.Parser, chunker raggo.Chunker, embedder
 		wg.Add(1)
 		go func(filePath string) {
 			defer wg.Done()
-			tokens, embeds, summaries, err := processAndEmbedPDF(parser, chunker, embedder, llm, filePath)
+			tokens, embeds, summaries, err := processAndEmbedPDF(parser, chunker, embedder, llm, vectorDB, filePath, collectionName)
 			mu.Lock()
 			defer mu.Unlock()
 			if err != nil {
@@ -116,7 +184,7 @@ func benchmarkPDFProcessing(parser raggo.Parser, chunker raggo.Chunker, embedder
 	fmt.Printf("Average summaries per second: %.2f\n", float64(summaryCount)/duration.Seconds())
 }
 
-func processAndEmbedPDF(parser raggo.Parser, chunker raggo.Chunker, embedder raggo.Embedder, llm gollm.LLM, filePath string) (int, int, int, error) {
+func processAndEmbedPDF(parser raggo.Parser, chunker raggo.Chunker, embedder raggo.Embedder, llm gollm.LLM, vectorDB *raggo.VectorDB, filePath, collectionName string) (int, int, int, error) {
 	log.Printf("Processing file: %s", filePath)
 
 	doc, err := parser.Parse(filePath)
@@ -138,19 +206,41 @@ func processAndEmbedPDF(parser raggo.Parser, chunker raggo.Chunker, embedder rag
 	log.Printf("Generated summary for %s, Summary length: %d", filePath, len(summary))
 
-	chunks := chunker.Chunk(summary)
-
+	chunks := chunker.Chunk(doc.Content)
 	totalTokens := 0
 	embedCount := 0
 
-	for _, chunk := range chunks {
+	// Create records for batch insertion
+	var records []raggo.Record
+	for i, chunk := range chunks {
 		totalTokens += len(chunk.Text) // Simple approximation
-		_, err := embedder.Embed(context.Context(context.Background()), chunk.Text)
+		embedding, err := embedder.Embed(context.Background(), chunk.Text)
 		if err != nil {
 			return totalTokens, embedCount, 1, fmt.Errorf("error embedding chunk: %w", err)
 		}
 		embedCount++
+
+		// Create record with metadata
+		records = append(records, raggo.Record{
+			Fields: map[string]interface{}{
+				"ID":        int64(i),
+				"Embedding": embedding,
+				"Text":      chunk.Text,
+				"Metadata": map[string]interface{}{
+					"source":    filePath,
+					"chunk":     i,
+					"summary":   summary,
+					"timestamp": time.Now().Unix(),
+				},
+			},
+		})
+	}
+
+	// Batch insert records
+	err = vectorDB.Insert(context.Background(), collectionName, records)
+	if err != nil {
+		return totalTokens, embedCount, 1, fmt.Errorf("error inserting into vector database: %w", err)
 	}
 
 	log.Printf("Successfully processed %s: %d tokens, %d embeddings", filePath, totalTokens, embedCount)
diff --git a/examples/processing_benchmark.go b/examples/processing_benchmark.go
index f8e4567..53f2559 100644
--- a/examples/processing_benchmark.go
+++ b/examples/processing_benchmark.go
@@ -7,9 +7,11 @@ import (
 	"os"
 	"path/filepath"
 	"sync"
+	"sync/atomic"
 	"time"
 
 	"github.com/pkoukk/tiktoken-go"
+	"github.com/teilomillet/gollm"
 	"github.com/teilomillet/raggo"
 	"golang.org/x/time/rate"
 )
@@ -17,6 +19,9 @@
 const (
 	embeddingTPM = 5_000_000 // Tokens per minute for text-embedding-3-small
 	embeddingRPM = 5_000 // Requests per minute for text-embedding-3-small
+	defaultTimeout = 5 * time.Minute
+	defaultBatchSize = 10
+	defaultCollectionName = "pdf_embeddings" // Add default collection name
 )
 
 type RateLimitedEmbedder struct {
@@ -90,7 +95,7 @@ func main() {
 	}
 
 	// Initialize components
-	parser := raggo.NewParser()
+	parser := raggo.PDFParser() // Initialize the PDF parser
 	chunker, err := raggo.NewChunker(
 		raggo.ChunkSize(512),
 		raggo.ChunkOverlap(64),
@@ -100,9 +105,9 @@ func main() {
 	}
 
 	embedder, err := raggo.NewEmbedder(
-		raggo.SetProvider("openai"),
-		raggo.SetAPIKey(os.Getenv("OPENAI_API_KEY")),
-		raggo.SetModel("text-embedding-3-small"),
+		raggo.SetEmbedderProvider("openai"),
+		raggo.SetEmbedderAPIKey(os.Getenv("OPENAI_API_KEY")),
+		raggo.SetEmbedderModel("text-embedding-3-small"),
 	)
 	if err != nil {
 		log.Fatalf("Failed to create embedder: %v", err)
@@ -113,32 +118,56 @@ func main() {
 		log.Fatalf("Failed to create rate-limited embedder: %v", err)
 	}
 
-	// Create a new ConcurrentPDFLoader
-	loader := raggo.NewConcurrentPDFLoader(
-		raggo.SetTimeout(5*time.Minute), // Increased timeout
-		raggo.SetTempDir(targetDir),
-	)
+	// Create LLM and VectorDB
+	llm, err := gollm.NewLLM( /* LLM configuration */ )
+	if err != nil {
+		log.Fatalf("Failed to create LLM: %v", err)
+	}
+
+	vectorDB, err := raggo.NewVectorDB( /* VectorDB configuration */ )
+	if err != nil {
+		log.Fatalf("Failed to create VectorDB: %v", err)
+	}
 
 	// Run the benchmark
-	desiredCount := 100 // Adjust this number as needed
-	benchmarkPDFProcessing(loader, parser, chunker, rateLimitedEmbedder, sourceDir, targetDir, desiredCount)
+	benchmarkPDFProcessing(
+		parser,                       // parser
+		chunker,                      // chunker
+		rateLimitedEmbedder.embedder, // embedder
+		llm,                          // llm
+		vectorDB,                     // vectorDB
+		sourceDir,                    // sourceDir
+		targetDir,                    // targetDir
+	)
 }
 
-func benchmarkPDFProcessing(loader raggo.ConcurrentPDFLoader, parser raggo.Parser, chunker raggo.Chunker, embedder *RateLimitedEmbedder, sourceDir, targetDir string, desiredCount int) {
-	fmt.Printf("Starting PDF processing benchmark with %d files...\n", desiredCount)
+func benchmarkPDFProcessing(
+	parser raggo.Parser,
+	chunker raggo.Chunker,
+	embedder raggo.Embedder,
+	llm gollm.LLM,
+	vectorDB *raggo.VectorDB,
+	sourceDir string,
+	targetDir string,
+) {
+	fmt.Printf("Starting PDF processing benchmark...\n")
 	start := time.Now()
 
 	// Load PDFs
 	loadStart := time.Now()
-	loadedFiles, err := loader.LoadPDFsConcurrent(context.Background(), sourceDir, targetDir, desiredCount)
+	loader := raggo.NewConcurrentPDFLoader(
+		raggo.SetLoaderTimeout(5*time.Minute), // Increased timeout
+		raggo.SetTempDir(targetDir),
+	)
+	loadedFiles, err := loader.LoadPDFsConcurrent(context.Background(), sourceDir, targetDir, 100)
 	loadDuration := time.Since(loadStart)
 	if err != nil {
 		log.Printf("Warning: Encountered errors while loading PDF files: %v", err)
 	}
-	fmt.Printf("Attempted to load %d PDF files, successfully loaded %d in %v\n", desiredCount, len(loadedFiles), loadDuration)
+	fmt.Printf("Attempted to load PDF files, successfully loaded %d in %v\n", len(loadedFiles), loadDuration)
 
 	if len(loadedFiles) == 0 {
 		log.Fatalf("No files were successfully loaded. Cannot continue benchmark.")
@@ -149,21 +178,31 @@ func benchmarkPDFProcessing(loader raggo.ConcurrentPDFLoade
 	processStart := time.Now()
 	errorCount := 0
 	successCount := 0
-	totalTokens := 0
-	embedCount := 0
+	var totalTokens int64
+	var embedCount int64
+	var totalChunks int64
 
 	for _, file := range loadedFiles {
 		wg.Add(1)
 		go func(filePath string) {
 			defer wg.Done()
-			tokens, embeds, err := processAndEmbedPDF(parser, chunker, embedder, filePath)
+			tokens, embeds, chunks, err := processAndEmbedPDF(
+				parser,
+				chunker,
+				embedder,
+				llm,
+				vectorDB,
+				defaultCollectionName, // Add collection name
+				filePath,
+			)
 			if err != nil {
 				log.Printf("Error processing %s: %v", filePath, err)
 				errorCount++
 			} else {
 				successCount++
-				totalTokens += tokens
-				embedCount += embeds
+				atomic.AddInt64(&totalTokens, int64(tokens))
+				atomic.AddInt64(&embedCount, int64(embeds))
+				atomic.AddInt64(&totalChunks, int64(chunks))
 			}
 		}(file)
 	}
@@ -175,7 +214,7 @@ func benchmarkPDFProcessing(loader raggo.ConcurrentPDFLoade
 
 	// Print benchmark results
 	fmt.Printf("\nBenchmark Results:\n")
-	fmt.Printf("Total files attempted: %d\n", desiredCount)
+	fmt.Printf("Total files attempted: %d\n", len(loadedFiles))
 	fmt.Printf("Files successfully loaded: %d\n", len(loadedFiles))
 	fmt.Printf("Files successfully processed: %d\n", successCount)
 	fmt.Printf("Files with processing errors: %d\n", errorCount)
@@ -184,6 +223,7 @@ func benchmarkPDFProcessing(loader raggo.ConcurrentPDFLoade
 	fmt.Printf("Total time: %v\n", totalDuration)
 	fmt.Printf("Total tokens processed: %d\n", totalTokens)
 	fmt.Printf("Total embeddings created: %d\n", embedCount)
+	fmt.Printf("Total chunks processed: %d\n", totalChunks)
 	if successCount > 0 {
 		fmt.Printf("Average time per successfully processed file: %v\n", totalDuration/time.Duration(successCount))
 	}
@@ -191,31 +231,39 @@ func benchmarkPDFProcessing(loader raggo.ConcurrentPDFLoade
 	fmt.Printf("Average embeddings per second: %.2f\n", float64(embedCount)/processDuration.Seconds())
 }
 
-func processAndEmbedPDF(parser raggo.Parser, chunker raggo.Chunker, embedder *RateLimitedEmbedder, filePath string) (int, int, error) {
+func processAndEmbedPDF(
+	parser raggo.Parser,
+	chunker raggo.Chunker,
+	embedder raggo.Embedder,
+	llm gollm.LLM,
+	vectorDB *raggo.VectorDB,
+	collectionName string,
+	filePath string,
+) (int, int, int, error) {
 	// Parse the PDF
 	doc, err := parser.Parse(filePath)
 	if err != nil {
-		return 0, 0, fmt.Errorf("error parsing PDF: %w", err)
+		return 0, 0, 0, fmt.Errorf("error parsing PDF: %w", err)
 	}
 
 	// Chunk the content
 	chunks := chunker.Chunk(doc.Content)
-
 	totalTokens := 0
 	embedCount := 0
 
 	// Embed each chunk
 	for _, chunk := range chunks {
-		tokens := embedder.tokenCounter.Count(chunk.Text)
-		totalTokens += tokens
+		tokens := len(chunk.Text)
 		_, err := embedder.Embed(context.Background(), chunk.Text)
 		if err != nil {
-			return totalTokens, embedCount, fmt.Errorf("error embedding chunk: %w", err)
+			return totalTokens, embedCount, len(chunks), fmt.Errorf("error embedding chunk: %w", err)
 		}
+		embedCount++
+		totalTokens += tokens
 	}
 
 	log.Printf("Processed and embedded %s: %d characters, %d chunks, %d tokens", filePath, len(doc.Content), len(chunks), totalTokens)
-	return totalTokens, embedCount, nil
+	return totalTokens, embedCount, len(chunks), nil
 }
diff --git a/examples/recruit_example.go b/examples/recruit_example.go
index 2b01a4f..c1d9f86 100644
--- a/examples/recruit_example.go
+++ b/examples/recruit_example.go
@@ -77,9 +77,9 @@ func main() {
 
 	debug("Initializing embedder...")
 	embedder, err := raggo.NewEmbedder(
-		raggo.WithEmbedderProvider("openai"),
-		raggo.WithEmbedderAPIKey(os.Getenv("OPENAI_API_KEY")),
-		raggo.WithEmbedderModel("text-embedding-3-small"),
+		raggo.SetEmbedderProvider("openai"),
+		raggo.SetEmbedderAPIKey(os.Getenv("OPENAI_API_KEY")),
+		raggo.SetEmbedderModel("text-embedding-3-small"),
 	)
 	if err != nil {
 		log.Fatalf("Failed to create embedder: %v", err)
diff --git a/examples/tsne_example.go b/examples/tsne_example.go
index f615c1d..f06c94c 100644
--- a/examples/tsne_example.go
+++ b/examples/tsne_example.go
@@ -25,9 +25,9 @@ func main() {
 	// Initialize components
 	parser := raggo.NewParser()
 	embedder, err := raggo.NewEmbedder(
-		raggo.SetProvider("openai"),
-		raggo.SetAPIKey(os.Getenv("OPENAI_API_KEY")),
-		raggo.SetModel("text-embedding-3-small"),
+		raggo.SetEmbedderProvider("openai"),
+		raggo.SetEmbedderAPIKey(os.Getenv("OPENAI_API_KEY")),
+		raggo.SetEmbedderModel("text-embedding-3-small"),
 	)
 	if err != nil {
 		log.Fatalf("Failed to create embedder: %v", err)
diff --git a/examples/vectordb_example.go b/examples/vectordb_example.go
index bbc3bbf..7fb0ac8 100644
--- a/examples/vectordb_example.go
+++ b/examples/vectordb_example.go
@@ -33,9 +33,9 @@ func main() {
 
 	// Create a new Embedder
 	embedder, err := raggo.NewEmbedder(
-		raggo.SetProvider("openai"),
-		raggo.SetAPIKey(os.Getenv("OPENAI_API_KEY")),
-		raggo.SetModel("text-embedding-3-small"),
+		raggo.SetEmbedderProvider("openai"),
+		raggo.SetEmbedderAPIKey(os.Getenv("OPENAI_API_KEY")),
+		raggo.SetEmbedderModel("text-embedding-3-small"),
 	)
 	if err != nil {
 		log.Fatalf("Failed to create embedder: %v", err)
@@ -277,4 +277,3 @@ func truncateString(s string, length int) string {
 	}
 	return s[:length-3] + "..."
 }
-
diff --git a/loader.go b/loader.go
index 9f54b04..916a94b 100644
--- a/loader.go
+++ b/loader.go
@@ -86,7 +86,7 @@ func WithHTTPClient(client *http.Client) LoaderOption {
 	return rag.WithHTTPClient(client)
 }
 
-// WithLoaderTimeout sets a custom timeout for all loader operations.
+// SetLoaderTimeout sets a custom timeout for all loader operations.
 // The timeout applies to:
 // - URL downloads
 // - File operations
@@ -95,8 +95,8 @@ func WithHTTPClient(client *http.Client) LoaderOption {
 // Example:
 //
 //	// Set a 2-minute timeout for all operations
-//	loader := NewLoader(WithLoaderTimeout(2 * time.Minute))
-func WithLoaderTimeout(timeout time.Duration) LoaderOption {
+//	loader := NewLoader(SetLoaderTimeout(2 * time.Minute))
+func SetLoaderTimeout(timeout time.Duration) LoaderOption {
 	return rag.WithTimeout(timeout)
 }
 
@@ -127,7 +127,7 @@ func SetTempDir(dir string) LoaderOption {
 //
 //	loader := NewLoader(
 //		WithHTTPClient(customClient),
-//		WithLoaderTimeout(time.Minute),
+//		SetLoaderTimeout(time.Minute),
 //		SetTempDir("/custom/temp"),
 //	)
 func NewLoader(opts ...LoaderOption) Loader {
diff --git a/parser.go b/parser.go
index ed76b4f..739c8bb 100644
--- a/parser.go
+++ b/parser.go
@@ -4,6 +4,8 @@
 package raggo
 
 import (
+	"context"
+
 	"github.com/teilomillet/raggo/rag"
 )
 
@@ -33,24 +35,211 @@ type Document = rag.Document
 // - Content extraction
 // - Metadata collection
 // - Error handling
+//
+// Example:
+//
+//	// Create a new parser with default settings
+//	parser := NewParser()
+//
+//	// Parse a document
+//	doc, err := parser.Parse("document.pdf")
+//	if err != nil {
+//		log.Fatal(err)
+//	}
+//	fmt.Printf("Content: %s\n", doc.Content)
 type Parser interface {
 	// Parse processes a file and returns its content and metadata.
 	// Returns an error if the parsing operation fails.
 	Parse(filePath string) (Document, error)
 }
 
-// NewParser creates a new Parser with default settings and handlers.
-// The default configuration includes:
-// - PDF document support
-// - Plain text file support
-// - Extension-based file type detection
+// parserWrapper combines Parser and Loader capabilities into a single type.
+// It implements both the Parser interface for document parsing and provides
+// loading functionality through an embedded rag.Loader.
+//
+// The wrapper uses dependency injection to allow customization of both
+// the parser and loader components, making it highly configurable and
+// testable.
+//
+// Example usage with custom loader:
+//
+//	customLoader := rag.NewLoader(
+//		rag.WithTimeout(time.Minute),
+//		rag.WithTempDir("/custom/temp"),
+//	)
+//
+//	parser := NewParser(
+//		WithLoader(customLoader),
+//	)
type parserWrapper struct {
+	parser Parser      // Core parsing implementation
+	loader *rag.Loader // Document loading capabilities
+}
+
+// ParserOption defines functional options for configuring a Parser.
+// This follows the functional options pattern, allowing flexible and
+// extensible configuration of the parser without breaking existing code.
+//
+// Custom options can be created by implementing this function type
+// and modifying the parserWrapper fields as needed.
+//
+// Example creating a custom option:
+//
+//	func WithCustomTimeout(timeout time.Duration) ParserOption {
+//		return func(pw *parserWrapper) {
+//			pw.loader = rag.NewLoader(rag.WithTimeout(timeout))
+//		}
+//	}
+type ParserOption func(*parserWrapper)
+
+// WithLoader injects a custom loader into the parser.
+// This option allows you to provide a pre-configured rag.Loader
+// instance with custom settings for timeout, temporary directory,
+// HTTP client, or other loader-specific configurations.
+//
+// Example:
+//
+//	customLoader := rag.NewLoader(
+//		rag.WithTimeout(time.Minute),
+//		rag.WithTempDir("/custom/temp"),
+//	)
+//
+//	parser := NewParser(
+//		WithLoader(customLoader),
+//	)
+func WithLoader(loader *rag.Loader) ParserOption {
+	return func(pw *parserWrapper) {
+		pw.loader = loader
+	}
+}
+
+// NewParser creates a new Parser with the given options.
+// It initializes a parserWrapper with default settings and applies
+// any provided configuration options. The resulting parser implements
+// both document parsing and loading capabilities.
+//
+// Default configuration includes:
+// - Standard rag.Loader with default timeout and temp directory
+// - Default parser manager for handling various file types
+// - Built-in support for common file formats
 //
 // Example:
 //
+//	// Create parser with default settings
 //	parser := NewParser()
+//
+//	// Create parser with custom loader
+//	parser := NewParser(
+//		WithLoader(customLoader),
+//	)
+//
+//	// Create parser with multiple options
+//	parser := NewParser(
+//		WithLoader(customLoader),
+//		WithCustomOption(...),
+//	)
+func NewParser(opts ...ParserOption) Parser {
+	pw := &parserWrapper{
+		parser: rag.NewParserManager(),
+		loader: rag.NewLoader(),
+	}
+
+	for _, opt := range opts {
+		opt(pw)
+	}
+
+	return pw
+}
+
+// Parse implements the Parser interface by processing a file and extracting
+// its content and metadata. It delegates the actual parsing to the underlying
+// parser implementation while maintaining the option to preprocess files
+// using the loader capabilities.
+//
+// Returns:
+// - Document: Contains the extracted content and metadata
+// - error: Any error encountered during parsing
+//
+// Example:
+//
 //	doc, err := parser.Parse("document.pdf")
-func NewParser() Parser {
-	return rag.NewParserManager()
+//	if err != nil {
+//		log.Fatal(err)
+//	}
+//	fmt.Printf("Content: %s\n", doc.Content)
+func (pw *parserWrapper) Parse(filePath string) (Document, error) {
+	return pw.parser.Parse(filePath)
+}
+
+// LoadDir implements the Loader interface by recursively processing
+// all files in a directory. It delegates to the embedded loader's
+// implementation while maintaining the parser's context.
+//
+// Parameters:
+// - ctx: Context for cancellation and timeout
+// - dir: Directory path to process
+//
+// Returns:
+// - []string: Paths to all processed files
+// - error: Any error encountered during directory processing
+//
+// Example:
+//
+//	paths, err := parser.LoadDir(ctx, "/path/to/docs")
+//	if err != nil {
+//		log.Fatal(err)
+//	}
+//	for _, path := range paths {
+//		fmt.Printf("Processed: %s\n", path)
+//	}
+func (pw *parserWrapper) LoadDir(ctx context.Context, dir string) ([]string, error) {
+	return pw.loader.LoadDir(ctx, dir)
+}
+
+// LoadFile implements the Loader interface by processing a single file.
+// It uses the embedded loader's implementation while maintaining the
+// parser's context.
+//
+// Parameters:
+// - ctx: Context for cancellation and timeout
+// - path: Path to the file to process
+//
+// Returns:
+// - string: Path to the processed file
+// - error: Any error encountered during file processing
+//
+// Example:
+//
+//	processedPath, err := parser.LoadFile(ctx, "document.pdf")
+//	if err != nil {
+//		log.Fatal(err)
+//	}
+//	fmt.Printf("Processed file at: %s\n", processedPath)
+func (pw *parserWrapper) LoadFile(ctx context.Context, path string) (string, error) {
+	return pw.loader.LoadFile(ctx, path)
+}
+
+// LoadURL implements the Loader interface by downloading and processing
+// a file from a URL. It uses the embedded loader's implementation while
+// maintaining the parser's context.
+//
+// Parameters:
+// - ctx: Context for cancellation and timeout
+// - url: URL of the file to download and process
+//
+// Returns:
+// - string: Path to the downloaded and processed file
+// - error: Any error encountered during download or processing
+//
+// Example:
+//
+//	processedPath, err := parser.LoadURL(ctx, "https://example.com/doc.pdf")
+//	if err != nil {
+//		log.Fatal(err)
+//	}
+//	fmt.Printf("Downloaded and processed file at: %s\n", processedPath)
+func (pw *parserWrapper) LoadURL(ctx context.Context, url string) (string, error) {
+	return pw.loader.LoadURL(ctx, url)
 }
 
 // SetFileTypeDetector customizes how file types are detected.
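Editor's note: a minimal usage sketch of the reworked parser API above. This is an illustration, not part of the patch; it uses only identifiers introduced or referenced in this diff (NewParser/WithLoader from parser.go, rag.NewLoader options quoted in its doc comments, and the loader-based directory walk from examples/parser_example.go). File and directory paths are placeholders.

	package main

	import (
		"context"
		"fmt"
		"log"
		"time"

		"github.com/teilomillet/raggo"
		"github.com/teilomillet/raggo/rag"
	)

	func main() {
		// Inject a pre-configured rag.Loader via the new WithLoader option.
		customLoader := rag.NewLoader(
			rag.WithTimeout(time.Minute),
			rag.WithTempDir("/custom/temp"), // placeholder path
		)
		parser := raggo.NewParser(raggo.WithLoader(customLoader))

		// Parse a single document (placeholder filename).
		doc, err := parser.Parse("document.pdf")
		if err != nil {
			log.Fatal(err)
		}
		fmt.Printf("Content length: %d\n", len(doc.Content))

		// Directory loading goes through a raggo.Loader, as in examples/parser_example.go.
		loader := raggo.NewLoader(raggo.SetLoaderTimeout(time.Minute))
		paths, err := loader.LoadDir(context.Background(), "testdata") // placeholder dir
		if err != nil {
			log.Fatal(err)
		}
		fmt.Printf("Loaded %d files\n", len(paths))
	}
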
diff --git a/rag.go b/rag.go
index e31c02b..d88ec8a 100644
--- a/rag.go
+++ b/rag.go
@@ -375,9 +375,9 @@ func (r *RAG) initialize() error {
 
 	// Initialize embedder
 	embedder, err := NewEmbedder(
-		WithEmbedderProvider(r.config.Provider),
-		WithEmbedderModel(r.config.Model),
-		WithEmbedderAPIKey(r.config.APIKey),
+		SetEmbedderProvider(r.config.Provider),
+		SetEmbedderModel(r.config.Model),
+		SetEmbedderAPIKey(r.config.APIKey),
 	)
 	if err != nil {
 		return fmt.Errorf("failed to create embedder: %w", err)
diff --git a/register.go b/register.go
index b0b24e3..b1922dc 100644
--- a/register.go
+++ b/register.go
@@ -114,7 +114,7 @@ func Register(ctx context.Context, source string, opts ...RegisterOption) error
 	Debug("Creating loader")
 	loader := NewLoader(
 		SetTempDir(cfg.TempDir),
-		WithLoaderTimeout(cfg.Timeout),
+		SetLoaderTimeout(cfg.Timeout),
 	)
 
 	// Create chunker
@@ -130,9 +130,9 @@ func Register(ctx context.Context, source string, opts ...RegisterOption) error
 	// Create embedder
 	Debug("Creating embedder")
 	embedder, err := NewEmbedder(
-		WithEmbedderProvider(cfg.EmbeddingProvider),
-		WithEmbedderModel(cfg.EmbeddingModel),
-		WithEmbedderAPIKey(cfg.EmbeddingKey),
+		SetEmbedderProvider(cfg.EmbeddingProvider),
+		SetEmbedderModel(cfg.EmbeddingModel),
+		SetEmbedderAPIKey(cfg.EmbeddingKey),
 	)
 	if err != nil {
 		return fmt.Errorf("failed to create embedder: %w", err)
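Editor's note: taken together, the renames in this patch change call sites as in the following sketch. It is a hedged illustration, not part of the patch, and uses only option names and calls that appear in the diff (SetLoaderTimeout/SetTempDir for loaders, SetEmbedderProvider/SetEmbedderModel/SetEmbedderAPIKey for embedders, embedder.Embed).

	package main

	import (
		"context"
		"fmt"
		"log"
		"os"
		"time"

		"github.com/teilomillet/raggo"
	)

	func main() {
		// Loader options: SetTimeout is replaced by SetLoaderTimeout.
		loader := raggo.NewLoader(
			raggo.SetLoaderTimeout(1*time.Minute),
			raggo.SetTempDir(os.TempDir()),
		)
		_ = loader

		// Embedder options: the With* constructors are now Set*.
		embedder, err := raggo.NewEmbedder(
			raggo.SetEmbedderProvider("openai"),
			raggo.SetEmbedderModel("text-embedding-3-small"),
			raggo.SetEmbedderAPIKey(os.Getenv("OPENAI_API_KEY")),
		)
		if err != nil {
			log.Fatalf("Failed to create embedder: %v", err)
		}

		if _, err := embedder.Embed(context.Background(), "hello raggo"); err != nil {
			log.Fatalf("Embed failed: %v", err)
		}
		fmt.Println("embedder and loader configured with the renamed options")
	}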