From db356ed9e0dca529ad16014d9cd41730340740b3 Mon Sep 17 00:00:00 2001 From: Thorsten Klein Date: Mon, 10 Feb 2025 13:18:29 +0100 Subject: [PATCH] enhance: add file source and location metadata to knowledge results --- knowledge/pkg/datastore/retrieve.go | 10 +++- result-formatter/main.go | 89 +++++++++++++++++++++-------- 2 files changed, 73 insertions(+), 26 deletions(-) diff --git a/knowledge/pkg/datastore/retrieve.go b/knowledge/pkg/datastore/retrieve.go index 8f77f1ee..532de7fc 100644 --- a/knowledge/pkg/datastore/retrieve.go +++ b/knowledge/pkg/datastore/retrieve.go @@ -100,5 +100,13 @@ func (s *Datastore) SimilaritySearch(ctx context.Context, query string, numDocum } } } - return s.Vectorstore.SimilaritySearch(ctx, query, numDocuments, datasetID, where, whereDocument, ef) + docs, err := s.Vectorstore.SimilaritySearch(ctx, query, numDocuments, datasetID, where, whereDocument, ef) + if err != nil { + return nil, err + } + for i, doc := range docs { + doc.Metadata["datasetID"] = datasetID + docs[i] = doc + } + return docs, nil } diff --git a/result-formatter/main.go b/result-formatter/main.go index 0a745244..871f6abb 100644 --- a/result-formatter/main.go +++ b/result-formatter/main.go @@ -4,8 +4,9 @@ import ( "context" "encoding/json" "fmt" + "log/slog" + neturl "net/url" "os" - "strconv" "strings" "sync" @@ -22,14 +23,27 @@ type subqueryResults struct { } type document struct { - ID string `json:"id"` - Content string `json:"content,omitempty"` - Metadata map[string]any `json:"metadata,omitempty"` + ID string `json:"id"` + Content string `json:"content,omitempty"` + Metadata metadata `json:"metadata,omitempty"` +} + +type metadata struct { + Source string `json:"source,omitempty"` + WorkspaceID string `json:"workspaceID,omitempty"` + URL string `json:"url,omitempty"` + Pages string `json:"pages,omitempty"` + Page int `json:"page,omitempty"` + TotalPages int `json:"totalPages,omitempty"` + FileSize int `json:"fileSize,omitempty"` + WorkspaceFileName string 
`json:"workspaceFileName,omitempty"` // workspaceFileName is the location of the converted file, not the original file - e.g. /foo.pdf.json + DatasetID string `json:"datasetID,omitempty"` } type hit struct { - URL string `json:"url,omitempty"` - Content string `json:"content,omitempty"` + URL string `json:"url,omitempty"` // URL should be the original source of the document (Web URL, OneDrive Link, etc.) + Location string `json:"location,omitempty"` // Location should be the location of the result in the original source (page numbers, etc.) + Content string `json:"content,omitempty"` // Content should be the text content of the document } type inputContent struct { @@ -44,14 +58,14 @@ func main() { ctx = context.Background() ) - // This is ugly code, I know. Beauty comes later. + // This is ugly code, I know. Beauty comes later. Cleaned up a little. Still room for improvement. if clientErr != nil { - _, _ = fmt.Fprintf(os.Stderr, "failed to create gptscript client: %v\n", clientErr) + slog.Error("failed to create gptscript client", "error", clientErr) } if err := json.Unmarshal([]byte(out), &output); err != nil { - _, _ = fmt.Fprintf(os.Stderr, "failed to unmarshal output: %v\n", err) + slog.Debug("failed to unmarshal output", "err", err) fmt.Print(out) return } @@ -59,7 +73,7 @@ func main() { var ( outDocs []hit wg sync.WaitGroup - fullyFetched = map[string]struct{}{} + fullyFetched = map[string]int{} // fullyFetched is a map of files that have been fully fetched from the workspace - the value is the index in outDocs budget = 120_000 ) @@ -68,27 +82,51 @@ func main() { break } for _, doc := range result.ResultDocuments { - filename, _ := doc.Metadata["workspaceFileName"].(string) - if _, ok := fullyFetched[filename]; ok { + filename := doc.Metadata.WorkspaceFileName + + // We parse the location regardless of the file potentially being fully fetched already to preserve the + // source reference metadata (i.e. where in the document the information was found). 
+ // This is a UX thing to help users with manual proofreading of answers. + var location string + if doc.Metadata.Pages != "" { + location = "Pages " + doc.Metadata.Pages + } else if doc.Metadata.Page > 0 { + location = fmt.Sprintf("Page %d", doc.Metadata.Page) + } + if location != "" && doc.Metadata.TotalPages > 0 { + location = fmt.Sprintf("%s of %d", location, doc.Metadata.TotalPages) + slog.Debug("result doc in file", "filename", filename, "location", location) + } + + if ffi, ok := fullyFetched[filename]; ok { + if location != "" { + outDocs[ffi].Location += " and " + location + } continue } - url, _ := doc.Metadata["url"].(string) + var url string + if doc.Metadata.URL != "" { + url = doc.Metadata.URL + } else if doc.Metadata.Source != "" { + url = "knowledge://" + neturl.PathEscape(doc.Metadata.DatasetID+"::"+strings.TrimPrefix(doc.Metadata.Source, "ws://")) // knowledge://<datasetID>::<filepath>, where datasetID is <namespace>/<dataset-name> + } + outDocs = append(outDocs, hit{ - URL: url, - Content: doc.Content, + URL: url, + Content: doc.Content, + Location: location, }) index := len(outDocs) - 1 if index < 3 && clientErr == nil { - fileSize, _ := doc.Metadata["fileSize"].(string) - size, _ := strconv.Atoi(fileSize) - workspaceID, _ := doc.Metadata["workspaceID"].(string) - if size > 5_000 && size < budget && workspaceID != "" { - _, _ = fmt.Fprintf(os.Stderr, "reading file in workspace: %s\n", filename) - fullyFetched[filename] = struct{}{} - budget -= size + fileSize := doc.Metadata.FileSize + workspaceID := doc.Metadata.WorkspaceID + if fileSize > 5_000 && fileSize < budget && workspaceID != "" { + slog.Debug("fetching full file from workspace", "file", filename, "sizeInBytes", fileSize) + fullyFetched[filename] = index + budget -= fileSize wg.Add(1) go func() { @@ -98,13 +136,13 @@ func main() { WorkspaceID: workspaceID, }) if err != nil { - _, _ = fmt.Fprintf(os.Stderr, "failed to read file in workspace: %v\n", err) + slog.Error("failed to read file in workspace", "error", err) return } var sourceContent 
inputContent if err := json.Unmarshal(content, &sourceContent); err != nil { - _, _ = fmt.Fprintf(os.Stderr, "failed to unmarshal content: %v\n", err) + slog.Error("failed to unmarshal content", "error", err) return } @@ -115,10 +153,11 @@ func main() { if buffer.Len() > 0 { outDocs[index].Content = buffer.String() + outDocs[index].Location = "Full Document. Specifically " + outDocs[index].Location } }() } else { - _, _ = fmt.Fprintf(os.Stderr, "file size is not within the range: %s %s %d %d\n", workspaceID, filename, size, budget) + slog.Debug("filesize is not within range", "filename", fmt.Sprintf("%s/%s", workspaceID, filename), "filesize", fileSize, "budget", budget) } } }