Skip to content

Commit

Permalink
feat: knowledge - re-use embeddings on document layer (copy embedding…
Browse files Browse the repository at this point in the history
…s from docs with same content) (#444)
  • Loading branch information
iwilltry42 authored Feb 21, 2025
1 parent 2a5e7ee commit 73689ed
Show file tree
Hide file tree
Showing 19 changed files with 315 additions and 49 deletions.
2 changes: 1 addition & 1 deletion knowledge/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ toolchain go1.23.2
replace (
github.com/hupe1980/golc => github.com/iwilltry42/golc v0.0.113-0.20240802113826-d065a3c5b0c7 // nbformat extension
github.com/ledongthuc/pdf => github.com/iwilltry42/pdf v0.0.0-20240517145113-99fbaebc5dd3 // fix for reading some PDFs: https://github.com/ledongthuc/pdf/pull/36 + https://github.com/iwilltry42/pdf/pull/2
github.com/philippgille/chromem-go => github.com/iwilltry42/chromem-go v0.0.0-20241108083648-75828c6a4726
github.com/philippgille/chromem-go => github.com/iwilltry42/chromem-go v0.0.0-20250218054308-81ac4c30d459
github.com/tmc/langchaingo => github.com/StrongMonkey/langchaingo v0.0.0-20240617180437-9af4bee04c8b // Context-Aware Markdown Splitting
)

Expand Down
4 changes: 2 additions & 2 deletions knowledge/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -218,8 +218,8 @@ github.com/invopop/yaml v0.3.1 h1:f0+ZpmhfBSS4MhG+4HYseMdJhoeeopbSKbq5Rpeelso=
github.com/invopop/yaml v0.3.1/go.mod h1:PMOp3nn4/12yEZUFfmOuNHJsZToEEOwoWsT+D81KkeA=
github.com/iwilltry42/bm25-go v0.0.0-20240909111832-a928590cc9da h1:aqahDXw6bOtbupzBGZpnV57M3JShpN0jrtd6cyclH8I=
github.com/iwilltry42/bm25-go v0.0.0-20240909111832-a928590cc9da/go.mod h1:AmA5WoRtPljzo03tlOVOoLKPh9doUNUQrtrHaq5VkUg=
github.com/iwilltry42/chromem-go v0.0.0-20241108083648-75828c6a4726 h1:/Td69qGNjjiyDT4tBUdVi+6hW0HEBFExZwcCYXpzxC0=
github.com/iwilltry42/chromem-go v0.0.0-20241108083648-75828c6a4726/go.mod h1:hTd+wGEm/fFPQl7ilfCwQXkgEUxceYh86iIdoKMolPo=
github.com/iwilltry42/chromem-go v0.0.0-20250218054308-81ac4c30d459 h1:QLcDKRY6+D/u55btOQdU9gzA+h4g6JTPCevUG1zmW6s=
github.com/iwilltry42/chromem-go v0.0.0-20250218054308-81ac4c30d459/go.mod h1:hTd+wGEm/fFPQl7ilfCwQXkgEUxceYh86iIdoKMolPo=
github.com/iwilltry42/golc v0.0.113-0.20240802113826-d065a3c5b0c7 h1:2AzzbKVW1iP2F+ovqJKq801l6tgxYPt9m2zFKbs+i/Y=
github.com/iwilltry42/golc v0.0.113-0.20240802113826-d065a3c5b0c7/go.mod h1:w692KzkSTSvXROfyu+jYauNXB4YaL1s8zHPDMnNW88o=
github.com/iwilltry42/pdf v0.0.0-20240517145113-99fbaebc5dd3 h1:rCVwFT7Q+HxpijWfSzKTYX4pCDMS7oy/I/WzU30VXyI=
Expand Down
1 change: 1 addition & 0 deletions knowledge/pkg/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ type SharedIngestionOpts struct {
IngestionFlows []flows.IngestionFlow
IsDuplicateFuncName string
Metadata map[string]string
ReuseEmbeddings bool
}

type IngestPathsOpts struct {
Expand Down
2 changes: 2 additions & 0 deletions knowledge/pkg/client/standalone.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ func (c *StandaloneClient) IngestFromWorkspace(ctx context.Context, datasetID st
IsDuplicateFuncName: opts.IsDuplicateFuncName,
ExtraMetadata: meta,
IngestionFlows: opts.IngestionFlows,
ReuseEmbeddings: opts.ReuseEmbeddings,
}

_, err = c.Ingest(log.ToCtx(ctx, log.FromCtx(ctx).With("filepath", file).With("absolute_path", iopts.FileMetadata.AbsolutePath)), datasetID, finfo.Name, fileContent, iopts)
Expand Down Expand Up @@ -163,6 +164,7 @@ func (c *StandaloneClient) IngestPaths(ctx context.Context, datasetID string, op
},
IsDuplicateFuncName: opts.IsDuplicateFuncName,
ExtraMetadata: extraMetadata,
ReuseEmbeddings: opts.ReuseEmbeddings,
}

if opts != nil {
Expand Down
1 change: 1 addition & 0 deletions knowledge/pkg/cmd/askdir.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ func (s *ClientAskDir) Run(cmd *cobra.Command, args []string) error {
ingestOpts := &client.IngestPathsOpts{
SharedIngestionOpts: client.SharedIngestionOpts{
IsDuplicateFuncName: s.DeduplicationFuncName,
ReuseEmbeddings: true,
},
IgnoreExtensions: strings.Split(s.IgnoreExtensions, ","),
Concurrency: s.Concurrency,
Expand Down
1 change: 1 addition & 0 deletions knowledge/pkg/cmd/ingest.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ func (s *ClientIngest) run(ctx context.Context, filePath string) error {
SharedIngestionOpts: client.SharedIngestionOpts{
IsDuplicateFuncName: s.DeduplicationFuncName,
Metadata: metadata,
ReuseEmbeddings: true,
},
IgnoreExtensions: strings.Split(s.IgnoreExtensions, ","),
Concurrency: s.Concurrency,
Expand Down
15 changes: 15 additions & 0 deletions knowledge/pkg/datastore/datastore.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (

"github.com/gptscript-ai/knowledge/pkg/config"
etypes "github.com/gptscript-ai/knowledge/pkg/datastore/embeddings/types"
"github.com/gptscript-ai/knowledge/pkg/index/types"
"github.com/gptscript-ai/knowledge/pkg/log"
"github.com/gptscript-ai/knowledge/pkg/output"

Expand Down Expand Up @@ -108,6 +109,20 @@ func NewDatastore(ctx context.Context, indexDSN string, automigrate bool, vector
return ds, nil
}

func (s *Datastore) GetDatasetForDocument(ctx context.Context, documentID string) (*types.Dataset, error) {
docIdx, err := s.Index.GetDocumentByID(ctx, documentID)
if err != nil {
return nil, err
}

dataset, err := s.GetDataset(ctx, docIdx.Dataset, nil)
if err != nil {
return nil, err
}

return dataset, nil
}

func (s *Datastore) Close() error {
var errmsgs []string
if err := s.Index.Close(); err != nil {
Expand Down
5 changes: 3 additions & 2 deletions knowledge/pkg/datastore/embeddings/embeddings.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ package embeddings
import (
"errors"
"fmt"
"reflect"
"strings"

"github.com/gptscript-ai/knowledge/pkg/config"
"github.com/gptscript-ai/knowledge/pkg/datastore/embeddings/cohere"
"github.com/gptscript-ai/knowledge/pkg/datastore/embeddings/jina"
Expand All @@ -14,8 +17,6 @@ import (
"github.com/gptscript-ai/knowledge/pkg/datastore/embeddings/types"
"github.com/gptscript-ai/knowledge/pkg/datastore/embeddings/vertex"
"github.com/mitchellh/mapstructure"
"reflect"
"strings"
)

func GetSelectedEmbeddingsModelProvider(selected string, embeddingsConfig config.EmbeddingsConfig) (types.EmbeddingModelProvider, error) {
Expand Down
58 changes: 57 additions & 1 deletion knowledge/pkg/datastore/ingest.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"github.com/gptscript-ai/knowledge/pkg/log"
"github.com/gptscript-ai/knowledge/pkg/output"
vs "github.com/gptscript-ai/knowledge/pkg/vectorstore/types"
cg "github.com/philippgille/chromem-go"

"github.com/google/uuid"
"github.com/gptscript-ai/knowledge/pkg/datastore/filetypes"
Expand All @@ -27,6 +28,7 @@ type IngestOpts struct {
IsDuplicateFunc IsDuplicateFunc
IngestionFlows []flows.IngestionFlow
ExtraMetadata map[string]any
ReuseEmbeddings bool
}

// Ingest loads a document from a reader and adds it to the dataset.
Expand Down Expand Up @@ -159,7 +161,7 @@ func (s *Datastore) Ingest(ctx context.Context, datasetID string, filename strin
}

// Mandatory Transformation: Add filename to metadata -> append extraMetadata, but do not override filename or absPath
metadata := map[string]any{"filename": filename, "absPath": opts.FileMetadata.AbsolutePath, "fileSize": opts.FileMetadata.Size}
metadata := map[string]any{"filename": filename, "absPath": opts.FileMetadata.AbsolutePath, "fileSize": opts.FileMetadata.Size, "embeddingModel": s.EmbeddingModelProvider.EmbeddingModelName()}
for k, v := range opts.ExtraMetadata {
if _, ok := metadata[k]; !ok {
metadata[k] = v
Expand Down Expand Up @@ -198,6 +200,60 @@ func (s *Datastore) Ingest(ctx context.Context, datasetID string, filename strin
statusLog = statusLog.With("num_documents", len(docs))
ctx = log.ToCtx(ctx, statusLog)

if opts.ReuseEmbeddings {
slog.Debug("Checking if existing embeddings can be reused", "count", len(docs))
for i, doc := range docs {
existingDocs, err := s.Vectorstore.GetDocuments(ctx, "", nil, []cg.WhereDocument{
{
Operator: cg.WhereDocumentOperatorEquals,
Value: doc.Content,
},
})
if err != nil {
slog.Debug("failed to get documents for reuse", "error", err)
continue
}
if len(existingDocs) == 0 {
slog.Debug("no existing documents found for reuse")
continue
}
for _, existingDoc := range existingDocs {
if emb, ok := existingDoc.Metadata["embeddingModel"]; ok {
if emb == s.EmbeddingModelProvider.EmbeddingModelName() {
slog.Info("Reusing existing embedding", "docID", existingDoc.ID, "embeddingModelMeta", emb, "configuredModel", s.EmbeddingModelProvider.EmbeddingModelName())
docs[i].Embedding = existingDoc.Embedding
break
} else {
slog.Debug("not using existing embedding", "docID", existingDoc.ID, "embeddingModel", emb, "configuredModel", s.EmbeddingModelProvider.EmbeddingModelName())
continue
}
}

existingDocumentDataset, err := s.GetDatasetForDocument(ctx, existingDoc.ID)
if err != nil {
slog.Debug("failed to get document dataset", "error", err)
continue
}

if existingDocumentDataset.EmbeddingsProviderConfig != nil {
existingEmbeddingProvider, err := embeddings.ProviderFromConfig(*existingDocumentDataset.EmbeddingsProviderConfig)
if err != nil {
slog.Debug("failed to get embeddings model provider", "error", err)
continue
}
if existingEmbeddingProvider.EmbeddingModelName() == s.EmbeddingModelProvider.EmbeddingModelName() {
slog.Info("Reusing existing embedding", "docID", existingDoc.ID, "embeddingModel", existingEmbeddingProvider.EmbeddingModelName(), "configuredModel", s.EmbeddingModelProvider.EmbeddingModelName())
docs[i].Embedding = existingDoc.Embedding
break
} else {
slog.Debug("not using existing embedding", "docID", existingDoc.ID, "embeddingModel", existingEmbeddingProvider.EmbeddingModelName(), "configuredModel", s.EmbeddingModelProvider.EmbeddingModelName())
continue
}
}
}
}
}

statusLog.Debug("Adding documents to vectorstore")
startTime := time.Now()
docIDs, err := s.Vectorstore.AddDocuments(ctx, docs, datasetID)
Expand Down
1 change: 1 addition & 0 deletions knowledge/pkg/index/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ type Index interface {
PruneFiles(ctx context.Context, datasetID string, pathPrefix string, keep []string) ([]types.File, error)

// Fundamental Document Operations
GetDocumentByID(ctx context.Context, documentID string) (*types.Document, error)
DeleteDocument(ctx context.Context, documentID, datasetID string) error

Close() error
Expand Down
4 changes: 4 additions & 0 deletions knowledge/pkg/index/postgres/postgres.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,10 @@ func (i *Index) FindFileByMetadata(ctx context.Context, dataset string, metadata
return i.DB.FindFileByMetadata(ctx, dataset, metadata, includeDocuments)
}

func (i *Index) GetDocumentByID(ctx context.Context, documentID string) (*types.Document, error) {
return i.DB.GetDocument(ctx, documentID)
}

func (i *Index) DeleteDocument(ctx context.Context, documentID, datasetID string) error {
return i.DB.DeleteDocument(ctx, documentID, datasetID)
}
4 changes: 4 additions & 0 deletions knowledge/pkg/index/sqlite/sqlite.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,10 @@ func (i *Index) FindFileByMetadata(ctx context.Context, dataset string, metadata
return i.DB.FindFileByMetadata(ctx, dataset, metadata, includeDocuments)
}

func (i *Index) GetDocumentByID(ctx context.Context, documentID string) (*types.Document, error) {
return i.DB.GetDocument(ctx, documentID)
}

func (i *Index) DeleteDocument(ctx context.Context, documentID, datasetID string) error {
return i.DB.DeleteDocument(ctx, documentID, datasetID)
}
10 changes: 10 additions & 0 deletions knowledge/pkg/index/types/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,16 @@ func (db *DB) FindFileByMetadata(ctx context.Context, dataset string, metadata F
return &file, nil
}

func (db *DB) GetDocument(ctx context.Context, documentID string) (*Document, error) {
var document Document
tx := db.WithContext(ctx).First(&document, "id = ?", documentID)
if tx.Error != nil {
return nil, ErrDBDocumentNotFound
}

return &document, nil
}

func (db *DB) DeleteDocument(ctx context.Context, documentID, datasetID string) error {
// Find in Database
var document Document
Expand Down
42 changes: 42 additions & 0 deletions knowledge/pkg/vectorstore/helper/sql.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package helper

import (
"fmt"
"strings"

cg "github.com/philippgille/chromem-go"
)

func BuildWhereDocumentClause(whereDocs []cg.WhereDocument, joinOperator string) (string, error) {
if len(whereDocs) == 0 {
return "TRUE", nil
}
if joinOperator == "" {
joinOperator = "AND"
}
joinOperator = fmt.Sprintf(" %s ", strings.TrimSpace(joinOperator)) // ensure space around operator
var whereClauses []string
for _, wd := range whereDocs {
switch wd.Operator {
case cg.WhereDocumentOperatorAnd:
wc, err := BuildWhereDocumentClause(wd.WhereDocuments, "AND")
if err != nil {
return "", err
}
whereClauses = append(whereClauses, fmt.Sprintf("(%s)", wc))
case cg.WhereDocumentOperatorOr:
wc, err := BuildWhereDocumentClause(wd.WhereDocuments, "OR")
if err != nil {
return "", err
}
whereClauses = append(whereClauses, fmt.Sprintf("(%s)", wc))
case cg.WhereDocumentOperatorEquals:
whereClauses = append(whereClauses, fmt.Sprintf("document = '%s'", wd.Value))
case cg.WhereDocumentOperatorContains:
whereClauses = append(whereClauses, fmt.Sprintf("document LIKE '%%%s%%'", wd.Value))
case cg.WhereDocumentOperatorNotContains:
whereClauses = append(whereClauses, fmt.Sprintf("document NOT LIKE '%%%s%%'", wd.Value))
}
}
return strings.Join(whereClauses, joinOperator), nil
}
107 changes: 107 additions & 0 deletions knowledge/pkg/vectorstore/helper/sql_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
package helper

import (
"testing"

cg "github.com/philippgille/chromem-go"
"github.com/stretchr/testify/assert"
)

func TestBuildWhereDocumentClause_EmptyInput_TRUEClause(t *testing.T) {
var whereDocs []cg.WhereDocument
whereClause, err := BuildWhereDocumentClause(whereDocs, "AND")
assert.NoError(t, err)
assert.Equal(t, "TRUE", whereClause)
}

func TestBuildWhereDocumentClause_SingleEqualsCondition_ReturnsCorrectClause(t *testing.T) {
whereDocs := []cg.WhereDocument{
{Operator: cg.WhereDocumentOperatorEquals, Value: "test"},
}
whereClause, err := BuildWhereDocumentClause(whereDocs, "AND")
assert.NoError(t, err)
assert.Equal(t, "document = 'test'", whereClause)
}

func TestBuildWhereDocumentClause_SingleContainsCondition_ReturnsCorrectClause(t *testing.T) {
whereDocs := []cg.WhereDocument{
{Operator: cg.WhereDocumentOperatorContains, Value: "test"},
}
whereClause, err := BuildWhereDocumentClause(whereDocs, "AND")
assert.NoError(t, err)
assert.Equal(t, "document LIKE '%test%'", whereClause)
}

func TestBuildWhereDocumentClause_SingleNotContainsCondition_ReturnsCorrectClause(t *testing.T) {
whereDocs := []cg.WhereDocument{
{Operator: cg.WhereDocumentOperatorNotContains, Value: "test"},
}
whereClause, err := BuildWhereDocumentClause(whereDocs, "AND")
assert.NoError(t, err)
assert.Equal(t, "document NOT LIKE '%test%'", whereClause)
}

func TestBuildWhereDocumentClause_AndCondition_ReturnsCorrectClauses(t *testing.T) {
whereDocs := []cg.WhereDocument{
{
Operator: cg.WhereDocumentOperatorAnd,
WhereDocuments: []cg.WhereDocument{
{Operator: cg.WhereDocumentOperatorEquals, Value: "test1"},
{Operator: cg.WhereDocumentOperatorEquals, Value: "test2"},
},
},
}
whereClause, err := BuildWhereDocumentClause(whereDocs, "AND")
assert.NoError(t, err)
assert.Equal(t, "(document = 'test1' AND document = 'test2')", whereClause)
}

func TestBuildWhereDocumentClause_OrCondition_ReturnsCorrectClauses(t *testing.T) {
whereDocs := []cg.WhereDocument{
{
Operator: cg.WhereDocumentOperatorOr,
WhereDocuments: []cg.WhereDocument{
{Operator: cg.WhereDocumentOperatorEquals, Value: "test1"},
{Operator: cg.WhereDocumentOperatorEquals, Value: "test2"},
},
},
}
whereClause, err := BuildWhereDocumentClause(whereDocs, "OR")
assert.NoError(t, err)
assert.Equal(t, "(document = 'test1' OR document = 'test2')", whereClause)
}

func TestBuildWhereDocumentClause_Nested_ReturnsCorrectClauses(t *testing.T) {
whereDocs := []cg.WhereDocument{
{
Operator: cg.WhereDocumentOperatorOr,
WhereDocuments: []cg.WhereDocument{
{Operator: cg.WhereDocumentOperatorEquals, Value: "test1"},
{Operator: cg.WhereDocumentOperatorEquals, Value: "test2"},
},
},
{
Operator: cg.WhereDocumentOperatorAnd,
WhereDocuments: []cg.WhereDocument{
{Operator: cg.WhereDocumentOperatorEquals, Value: "test3"},
{Operator: cg.WhereDocumentOperatorEquals, Value: "test4"},
},
},
{
Operator: cg.WhereDocumentOperatorAnd,
WhereDocuments: []cg.WhereDocument{
{
Operator: cg.WhereDocumentOperatorAnd,
WhereDocuments: []cg.WhereDocument{
{Operator: cg.WhereDocumentOperatorEquals, Value: "test5"},
{Operator: cg.WhereDocumentOperatorEquals, Value: "test6"},
},
},
{Operator: cg.WhereDocumentOperatorEquals, Value: "test7"},
},
},
}
whereClause, err := BuildWhereDocumentClause(whereDocs, "AND")
assert.NoError(t, err)
assert.Equal(t, "(document = 'test1' OR document = 'test2') AND (document = 'test3' AND document = 'test4') AND ((document = 'test5' AND document = 'test6') AND document = 'test7')", whereClause)
}
Loading

0 comments on commit 73689ed

Please sign in to comment.