Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: knowledge - re-use embeddings on document layer (copy embeddings from docs with same content) #444

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion knowledge/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ toolchain go1.23.2
replace (
github.com/hupe1980/golc => github.com/iwilltry42/golc v0.0.113-0.20240802113826-d065a3c5b0c7 // nbformat extension
github.com/ledongthuc/pdf => github.com/iwilltry42/pdf v0.0.0-20240517145113-99fbaebc5dd3 // fix for reading some PDFs: https://github.com/ledongthuc/pdf/pull/36 + https://github.com/iwilltry42/pdf/pull/2
github.com/philippgille/chromem-go => github.com/iwilltry42/chromem-go v0.0.0-20241108083648-75828c6a4726
github.com/philippgille/chromem-go => github.com/iwilltry42/chromem-go v0.0.0-20250218054308-81ac4c30d459
github.com/tmc/langchaingo => github.com/StrongMonkey/langchaingo v0.0.0-20240617180437-9af4bee04c8b // Context-Aware Markdown Splitting
)

Expand Down
4 changes: 2 additions & 2 deletions knowledge/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -218,8 +218,8 @@ github.com/invopop/yaml v0.3.1 h1:f0+ZpmhfBSS4MhG+4HYseMdJhoeeopbSKbq5Rpeelso=
github.com/invopop/yaml v0.3.1/go.mod h1:PMOp3nn4/12yEZUFfmOuNHJsZToEEOwoWsT+D81KkeA=
github.com/iwilltry42/bm25-go v0.0.0-20240909111832-a928590cc9da h1:aqahDXw6bOtbupzBGZpnV57M3JShpN0jrtd6cyclH8I=
github.com/iwilltry42/bm25-go v0.0.0-20240909111832-a928590cc9da/go.mod h1:AmA5WoRtPljzo03tlOVOoLKPh9doUNUQrtrHaq5VkUg=
github.com/iwilltry42/chromem-go v0.0.0-20241108083648-75828c6a4726 h1:/Td69qGNjjiyDT4tBUdVi+6hW0HEBFExZwcCYXpzxC0=
github.com/iwilltry42/chromem-go v0.0.0-20241108083648-75828c6a4726/go.mod h1:hTd+wGEm/fFPQl7ilfCwQXkgEUxceYh86iIdoKMolPo=
github.com/iwilltry42/chromem-go v0.0.0-20250218054308-81ac4c30d459 h1:QLcDKRY6+D/u55btOQdU9gzA+h4g6JTPCevUG1zmW6s=
github.com/iwilltry42/chromem-go v0.0.0-20250218054308-81ac4c30d459/go.mod h1:hTd+wGEm/fFPQl7ilfCwQXkgEUxceYh86iIdoKMolPo=
github.com/iwilltry42/golc v0.0.113-0.20240802113826-d065a3c5b0c7 h1:2AzzbKVW1iP2F+ovqJKq801l6tgxYPt9m2zFKbs+i/Y=
github.com/iwilltry42/golc v0.0.113-0.20240802113826-d065a3c5b0c7/go.mod h1:w692KzkSTSvXROfyu+jYauNXB4YaL1s8zHPDMnNW88o=
github.com/iwilltry42/pdf v0.0.0-20240517145113-99fbaebc5dd3 h1:rCVwFT7Q+HxpijWfSzKTYX4pCDMS7oy/I/WzU30VXyI=
Expand Down
1 change: 1 addition & 0 deletions knowledge/pkg/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ type SharedIngestionOpts struct {
IngestionFlows []flows.IngestionFlow
IsDuplicateFuncName string
Metadata map[string]string
ReuseEmbeddings bool
}

type IngestPathsOpts struct {
Expand Down
2 changes: 2 additions & 0 deletions knowledge/pkg/client/standalone.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ func (c *StandaloneClient) IngestFromWorkspace(ctx context.Context, datasetID st
IsDuplicateFuncName: opts.IsDuplicateFuncName,
ExtraMetadata: meta,
IngestionFlows: opts.IngestionFlows,
ReuseEmbeddings: opts.ReuseEmbeddings,
}

_, err = c.Ingest(log.ToCtx(ctx, log.FromCtx(ctx).With("filepath", file).With("absolute_path", iopts.FileMetadata.AbsolutePath)), datasetID, finfo.Name, fileContent, iopts)
Expand Down Expand Up @@ -163,6 +164,7 @@ func (c *StandaloneClient) IngestPaths(ctx context.Context, datasetID string, op
},
IsDuplicateFuncName: opts.IsDuplicateFuncName,
ExtraMetadata: extraMetadata,
ReuseEmbeddings: opts.ReuseEmbeddings,
}

if opts != nil {
Expand Down
1 change: 1 addition & 0 deletions knowledge/pkg/cmd/askdir.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ func (s *ClientAskDir) Run(cmd *cobra.Command, args []string) error {
ingestOpts := &client.IngestPathsOpts{
SharedIngestionOpts: client.SharedIngestionOpts{
IsDuplicateFuncName: s.DeduplicationFuncName,
ReuseEmbeddings: true,
},
IgnoreExtensions: strings.Split(s.IgnoreExtensions, ","),
Concurrency: s.Concurrency,
Expand Down
1 change: 1 addition & 0 deletions knowledge/pkg/cmd/ingest.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ func (s *ClientIngest) run(ctx context.Context, filePath string) error {
SharedIngestionOpts: client.SharedIngestionOpts{
IsDuplicateFuncName: s.DeduplicationFuncName,
Metadata: metadata,
ReuseEmbeddings: true,
},
IgnoreExtensions: strings.Split(s.IgnoreExtensions, ","),
Concurrency: s.Concurrency,
Expand Down
15 changes: 15 additions & 0 deletions knowledge/pkg/datastore/datastore.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (

"github.com/gptscript-ai/knowledge/pkg/config"
etypes "github.com/gptscript-ai/knowledge/pkg/datastore/embeddings/types"
"github.com/gptscript-ai/knowledge/pkg/index/types"
"github.com/gptscript-ai/knowledge/pkg/log"
"github.com/gptscript-ai/knowledge/pkg/output"

Expand Down Expand Up @@ -108,6 +109,20 @@ func NewDatastore(ctx context.Context, indexDSN string, automigrate bool, vector
return ds, nil
}

func (s *Datastore) GetDatasetForDocument(ctx context.Context, documentID string) (*types.Dataset, error) {
docIdx, err := s.Index.GetDocumentByID(ctx, documentID)
if err != nil {
return nil, err
}

dataset, err := s.GetDataset(ctx, docIdx.Dataset, nil)
if err != nil {
return nil, err
}

return dataset, nil
}

func (s *Datastore) Close() error {
var errmsgs []string
if err := s.Index.Close(); err != nil {
Expand Down
5 changes: 3 additions & 2 deletions knowledge/pkg/datastore/embeddings/embeddings.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ package embeddings
import (
"errors"
"fmt"
"reflect"
"strings"

"github.com/gptscript-ai/knowledge/pkg/config"
"github.com/gptscript-ai/knowledge/pkg/datastore/embeddings/cohere"
"github.com/gptscript-ai/knowledge/pkg/datastore/embeddings/jina"
Expand All @@ -14,8 +17,6 @@ import (
"github.com/gptscript-ai/knowledge/pkg/datastore/embeddings/types"
"github.com/gptscript-ai/knowledge/pkg/datastore/embeddings/vertex"
"github.com/mitchellh/mapstructure"
"reflect"
"strings"
)

func GetSelectedEmbeddingsModelProvider(selected string, embeddingsConfig config.EmbeddingsConfig) (types.EmbeddingModelProvider, error) {
Expand Down
58 changes: 57 additions & 1 deletion knowledge/pkg/datastore/ingest.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"github.com/gptscript-ai/knowledge/pkg/log"
"github.com/gptscript-ai/knowledge/pkg/output"
vs "github.com/gptscript-ai/knowledge/pkg/vectorstore/types"
cg "github.com/philippgille/chromem-go"

"github.com/google/uuid"
"github.com/gptscript-ai/knowledge/pkg/datastore/filetypes"
Expand All @@ -27,6 +28,7 @@ type IngestOpts struct {
IsDuplicateFunc IsDuplicateFunc
IngestionFlows []flows.IngestionFlow
ExtraMetadata map[string]any
ReuseEmbeddings bool
}

// Ingest loads a document from a reader and adds it to the dataset.
Expand Down Expand Up @@ -159,7 +161,7 @@ func (s *Datastore) Ingest(ctx context.Context, datasetID string, filename strin
}

// Mandatory Transformation: Add filename to metadata -> append extraMetadata, but do not override filename or absPath
metadata := map[string]any{"filename": filename, "absPath": opts.FileMetadata.AbsolutePath, "fileSize": opts.FileMetadata.Size}
metadata := map[string]any{"filename": filename, "absPath": opts.FileMetadata.AbsolutePath, "fileSize": opts.FileMetadata.Size, "embeddingModel": s.EmbeddingModelProvider.EmbeddingModelName()}
for k, v := range opts.ExtraMetadata {
if _, ok := metadata[k]; !ok {
metadata[k] = v
Expand Down Expand Up @@ -198,6 +200,60 @@ func (s *Datastore) Ingest(ctx context.Context, datasetID string, filename strin
statusLog = statusLog.With("num_documents", len(docs))
ctx = log.ToCtx(ctx, statusLog)

if opts.ReuseEmbeddings {
slog.Debug("Checking if existing embeddings can be reused", "count", len(docs))
for i, doc := range docs {
existingDocs, err := s.Vectorstore.GetDocuments(ctx, "", nil, []cg.WhereDocument{
{
Operator: cg.WhereDocumentOperatorEquals,
Value: doc.Content,
},
})
if err != nil {
slog.Debug("failed to get documents for reuse", "error", err)
continue
}
if len(existingDocs) == 0 {
slog.Debug("no existing documents found for reuse")
continue
}
for _, existingDoc := range existingDocs {
if emb, ok := existingDoc.Metadata["embeddingModel"]; ok {
if emb == s.EmbeddingModelProvider.EmbeddingModelName() {
slog.Info("Reusing existing embedding", "docID", existingDoc.ID, "embeddingModelMeta", emb, "configuredModel", s.EmbeddingModelProvider.EmbeddingModelName())
docs[i].Embedding = existingDoc.Embedding
break
} else {
slog.Debug("not using existing embedding", "docID", existingDoc.ID, "embeddingModel", emb, "configuredModel", s.EmbeddingModelProvider.EmbeddingModelName())
continue
}
}

existingDocumentDataset, err := s.GetDatasetForDocument(ctx, existingDoc.ID)
if err != nil {
slog.Debug("failed to get document dataset", "error", err)
continue
}

if existingDocumentDataset.EmbeddingsProviderConfig != nil {
existingEmbeddingProvider, err := embeddings.ProviderFromConfig(*existingDocumentDataset.EmbeddingsProviderConfig)
if err != nil {
slog.Debug("failed to get embeddings model provider", "error", err)
continue
}
if existingEmbeddingProvider.EmbeddingModelName() == s.EmbeddingModelProvider.EmbeddingModelName() {
slog.Info("Reusing existing embedding", "docID", existingDoc.ID, "embeddingModel", existingEmbeddingProvider.EmbeddingModelName(), "configuredModel", s.EmbeddingModelProvider.EmbeddingModelName())
docs[i].Embedding = existingDoc.Embedding
break
} else {
slog.Debug("not using existing embedding", "docID", existingDoc.ID, "embeddingModel", existingEmbeddingProvider.EmbeddingModelName(), "configuredModel", s.EmbeddingModelProvider.EmbeddingModelName())
continue
}
}
}
}
}

statusLog.Debug("Adding documents to vectorstore")
startTime := time.Now()
docIDs, err := s.Vectorstore.AddDocuments(ctx, docs, datasetID)
Expand Down
1 change: 1 addition & 0 deletions knowledge/pkg/index/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ type Index interface {
PruneFiles(ctx context.Context, datasetID string, pathPrefix string, keep []string) ([]types.File, error)

// Fundamental Document Operations
GetDocumentByID(ctx context.Context, documentID string) (*types.Document, error)
DeleteDocument(ctx context.Context, documentID, datasetID string) error

Close() error
Expand Down
4 changes: 4 additions & 0 deletions knowledge/pkg/index/postgres/postgres.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,10 @@ func (i *Index) FindFileByMetadata(ctx context.Context, dataset string, metadata
return i.DB.FindFileByMetadata(ctx, dataset, metadata, includeDocuments)
}

func (i *Index) GetDocumentByID(ctx context.Context, documentID string) (*types.Document, error) {
return i.DB.GetDocument(ctx, documentID)
}

func (i *Index) DeleteDocument(ctx context.Context, documentID, datasetID string) error {
return i.DB.DeleteDocument(ctx, documentID, datasetID)
}
4 changes: 4 additions & 0 deletions knowledge/pkg/index/sqlite/sqlite.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,10 @@ func (i *Index) FindFileByMetadata(ctx context.Context, dataset string, metadata
return i.DB.FindFileByMetadata(ctx, dataset, metadata, includeDocuments)
}

func (i *Index) GetDocumentByID(ctx context.Context, documentID string) (*types.Document, error) {
return i.DB.GetDocument(ctx, documentID)
}

func (i *Index) DeleteDocument(ctx context.Context, documentID, datasetID string) error {
return i.DB.DeleteDocument(ctx, documentID, datasetID)
}
10 changes: 10 additions & 0 deletions knowledge/pkg/index/types/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,16 @@ func (db *DB) FindFileByMetadata(ctx context.Context, dataset string, metadata F
return &file, nil
}

func (db *DB) GetDocument(ctx context.Context, documentID string) (*Document, error) {
var document Document
tx := db.WithContext(ctx).First(&document, "id = ?", documentID)
if tx.Error != nil {
return nil, ErrDBDocumentNotFound
}

return &document, nil
}

func (db *DB) DeleteDocument(ctx context.Context, documentID, datasetID string) error {
// Find in Database
var document Document
Expand Down
42 changes: 42 additions & 0 deletions knowledge/pkg/vectorstore/helper/sql.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package helper

import (
"fmt"
"strings"

cg "github.com/philippgille/chromem-go"
)

func BuildWhereDocumentClause(whereDocs []cg.WhereDocument, joinOperator string) (string, error) {
if len(whereDocs) == 0 {
return "TRUE", nil
}
if joinOperator == "" {
joinOperator = "AND"
}
joinOperator = fmt.Sprintf(" %s ", strings.TrimSpace(joinOperator)) // ensure space around operator
var whereClauses []string
for _, wd := range whereDocs {
switch wd.Operator {
case cg.WhereDocumentOperatorAnd:
wc, err := BuildWhereDocumentClause(wd.WhereDocuments, "AND")
if err != nil {
return "", err
}
whereClauses = append(whereClauses, fmt.Sprintf("(%s)", wc))
case cg.WhereDocumentOperatorOr:
wc, err := BuildWhereDocumentClause(wd.WhereDocuments, "OR")
if err != nil {
return "", err
}
whereClauses = append(whereClauses, fmt.Sprintf("(%s)", wc))
case cg.WhereDocumentOperatorEquals:
whereClauses = append(whereClauses, fmt.Sprintf("document = '%s'", wd.Value))
case cg.WhereDocumentOperatorContains:
whereClauses = append(whereClauses, fmt.Sprintf("document LIKE '%%%s%%'", wd.Value))
case cg.WhereDocumentOperatorNotContains:
whereClauses = append(whereClauses, fmt.Sprintf("document NOT LIKE '%%%s%%'", wd.Value))
}
}
return strings.Join(whereClauses, joinOperator), nil
}
107 changes: 107 additions & 0 deletions knowledge/pkg/vectorstore/helper/sql_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
package helper

import (
"testing"

cg "github.com/philippgille/chromem-go"
"github.com/stretchr/testify/assert"
)

func TestBuildWhereDocumentClause_EmptyInput_TRUEClause(t *testing.T) {
var whereDocs []cg.WhereDocument
whereClause, err := BuildWhereDocumentClause(whereDocs, "AND")
assert.NoError(t, err)
assert.Equal(t, "TRUE", whereClause)
}

func TestBuildWhereDocumentClause_SingleEqualsCondition_ReturnsCorrectClause(t *testing.T) {
whereDocs := []cg.WhereDocument{
{Operator: cg.WhereDocumentOperatorEquals, Value: "test"},
}
whereClause, err := BuildWhereDocumentClause(whereDocs, "AND")
assert.NoError(t, err)
assert.Equal(t, "document = 'test'", whereClause)
}

func TestBuildWhereDocumentClause_SingleContainsCondition_ReturnsCorrectClause(t *testing.T) {
whereDocs := []cg.WhereDocument{
{Operator: cg.WhereDocumentOperatorContains, Value: "test"},
}
whereClause, err := BuildWhereDocumentClause(whereDocs, "AND")
assert.NoError(t, err)
assert.Equal(t, "document LIKE '%test%'", whereClause)
}

func TestBuildWhereDocumentClause_SingleNotContainsCondition_ReturnsCorrectClause(t *testing.T) {
whereDocs := []cg.WhereDocument{
{Operator: cg.WhereDocumentOperatorNotContains, Value: "test"},
}
whereClause, err := BuildWhereDocumentClause(whereDocs, "AND")
assert.NoError(t, err)
assert.Equal(t, "document NOT LIKE '%test%'", whereClause)
}

func TestBuildWhereDocumentClause_AndCondition_ReturnsCorrectClauses(t *testing.T) {
whereDocs := []cg.WhereDocument{
{
Operator: cg.WhereDocumentOperatorAnd,
WhereDocuments: []cg.WhereDocument{
{Operator: cg.WhereDocumentOperatorEquals, Value: "test1"},
{Operator: cg.WhereDocumentOperatorEquals, Value: "test2"},
},
},
}
whereClause, err := BuildWhereDocumentClause(whereDocs, "AND")
assert.NoError(t, err)
assert.Equal(t, "(document = 'test1' AND document = 'test2')", whereClause)
}

func TestBuildWhereDocumentClause_OrCondition_ReturnsCorrectClauses(t *testing.T) {
whereDocs := []cg.WhereDocument{
{
Operator: cg.WhereDocumentOperatorOr,
WhereDocuments: []cg.WhereDocument{
{Operator: cg.WhereDocumentOperatorEquals, Value: "test1"},
{Operator: cg.WhereDocumentOperatorEquals, Value: "test2"},
},
},
}
whereClause, err := BuildWhereDocumentClause(whereDocs, "OR")
assert.NoError(t, err)
assert.Equal(t, "(document = 'test1' OR document = 'test2')", whereClause)
}

func TestBuildWhereDocumentClause_Nested_ReturnsCorrectClauses(t *testing.T) {
whereDocs := []cg.WhereDocument{
{
Operator: cg.WhereDocumentOperatorOr,
WhereDocuments: []cg.WhereDocument{
{Operator: cg.WhereDocumentOperatorEquals, Value: "test1"},
{Operator: cg.WhereDocumentOperatorEquals, Value: "test2"},
},
},
{
Operator: cg.WhereDocumentOperatorAnd,
WhereDocuments: []cg.WhereDocument{
{Operator: cg.WhereDocumentOperatorEquals, Value: "test3"},
{Operator: cg.WhereDocumentOperatorEquals, Value: "test4"},
},
},
{
Operator: cg.WhereDocumentOperatorAnd,
WhereDocuments: []cg.WhereDocument{
{
Operator: cg.WhereDocumentOperatorAnd,
WhereDocuments: []cg.WhereDocument{
{Operator: cg.WhereDocumentOperatorEquals, Value: "test5"},
{Operator: cg.WhereDocumentOperatorEquals, Value: "test6"},
},
},
{Operator: cg.WhereDocumentOperatorEquals, Value: "test7"},
},
},
}
whereClause, err := BuildWhereDocumentClause(whereDocs, "AND")
assert.NoError(t, err)
assert.Equal(t, "(document = 'test1' OR document = 'test2') AND (document = 'test3' AND document = 'test4') AND ((document = 'test5' AND document = 'test6') AND document = 'test7')", whereClause)
}
Loading
Loading