wip

sourcegraph · Jan 6, 2024 · 575a8c4
1 parent fb236c0
commit 575a8c4
Show file tree

Hide file tree

Showing 6 changed files with 23 additions and 35 deletions.
diff --git a/provider/docs/src/corpus/index.ts b/provider/docs/src/corpus/index.ts
@@ -28,10 +28,8 @@ export interface CorpusIndex {
  * An indexed document.
  */
 export interface IndexedDoc {
-    docID: Doc['id']
-
+    doc: Pick<Doc, 'id' | 'url'>
     content: Pick<Content, 'title' | 'textContent'> | null
-    url: Doc['url']
 
     /** The SHA-256 hash of the indexed content (including chunks). */
     contentID: string
@@ -80,20 +78,16 @@ export async function indexCorpus(
     const cache = cacheStore ? createCache(cacheStore) : noopCache
 
     // TODO(sqs): index takes ~235ms
-    console.time('index-docs')
     const indexedDocs = await cachedIndexCorpusDocs(corpus, { contentExtractor }, cache)
-    console.timeEnd('index-docs')
 
-    console.time('index-tfidf')
     const tfidf = await cachedCreateTFIDFIndex(indexedDocs, cache)
-    console.timeEnd('index-tfidf')
 
     const index: CorpusIndex = {
         data: corpus,
         docs: indexedDocs,
         tfidf,
         doc(id) {
-            const doc = indexedDocs.find(d => d.docID === id)
+            const doc = indexedDocs.find(d => d.doc.id === id)
             if (!doc) {
                 throw new Error(`no document with id ${id} in corpus`)
             }
@@ -115,8 +109,7 @@ async function indexCorpusDocs(
             const content = contentExtractor ? await contentExtractor.extractContent(doc) : null
             const chunks = chunk(content?.content ?? doc.text, { isMarkdown: doc.text.includes('##') })
             return {
-                docID: doc.id,
-                url: doc.url,
+                doc: { id: doc.id, url: doc.url },
                 content:
                     content?.title && content?.textContent
                         ? { title: content.title, textContent: content.textContent }

diff --git a/provider/docs/src/corpus/search/embeddings.ts b/provider/docs/src/corpus/search/embeddings.ts
@@ -4,6 +4,7 @@ import { type CorpusIndex, type CorpusSearchResult, type Query } from '..'
 import { isWebWindowRuntime, useWebWorker } from '../../env'
 import { type Logger } from '../../logger'
 import { embedTextOnWorker } from '../../mlWorker/webWorkerClient'
+import { terms } from './terms'
 
 // TODO(sqs): think we can remove this entirely...
 //
@@ -31,14 +32,14 @@ export async function embeddingsSearch(index: CorpusIndex, query: Query): Promis
     const textToEmbed = [query.meta?.activeFilename && `// ${query.meta?.activeFilename}`, query.text]
         .filter((s): s is string => Boolean(s))
         .join('\n')
-    const queryVec = await embedText(textToEmbed)
+    const queryVec = await embedText(terms(textToEmbed).join(' '))
     const cosSim = cosSimWith(queryVec)
 
-    const MIN_SCORE = 0.1
+    const MIN_SCORE = 0.25
 
     // Compute embeddings in parallel.
     const results: CorpusSearchResult[] = index.docs
-        .flatMap(({ docID, chunks }) =>
+        .flatMap(({ doc: { id: docID }, chunks }) =>
             chunks.map((chunk, i) => {
                 const score = cosSim(chunk.embeddings)
                 return score >= MIN_SCORE
@@ -50,7 +51,7 @@ export async function embeddingsSearch(index: CorpusIndex, query: Query): Promis
 
     results.sort((a, b) => b.score - a.score)
 
-    return results.slice(0, 1)
+    return results
 }
 
 const pipe = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2', {})

diff --git a/provider/docs/src/corpus/search/keyword.ts b/provider/docs/src/corpus/search/keyword.ts
@@ -3,19 +3,19 @@ import { terms } from './terms'
 import { computeTFIDF } from './tfidf'
 
 export function keywordSearch(index: CorpusIndex, query: Query): CorpusSearchResult[] {
-    console.time('kw' + query.text.length)
-
     const queryTerms = terms(query.text).filter(term => term.length >= 3)
 
     const results: CorpusSearchResult[] = []
-    for (const { docID, chunks } of index.docs) {
+    for (const {
+        doc: { id: docID },
+        chunks,
+    } of index.docs) {
         for (const [i, chunk] of chunks.entries()) {
             const score = queryTerms.reduce((score, term) => score + computeTFIDF(term, docID, i, index.tfidf), 0)
             if (score > 0) {
                 results.push({ doc: docID, chunk: i, score, excerpt: chunk.text })
             }
         }
     }
-    console.timeEnd('kw' + query.text.length)
     return results
 }
diff --git a/provider/docs/src/corpus/search/multi.ts b/provider/docs/src/corpus/search/multi.ts
@@ -48,12 +48,9 @@ export async function multiSearch(
         docResults.set(result.chunk, { ...chunkResult, score: chunkResult.score + result.score })
     }
 
-    const MIN_SCORE = 0.5
-
-    const results = Array.from(combinedResults.values())
-        .flatMap(docResults => Array.from(docResults.values()))
-        .filter(s => s.score >= MIN_SCORE)
-    return results.toSorted((a, b) => b.score - a.score)
+    const results = Array.from(combinedResults.values()).flatMap(docResults => Array.from(docResults.values()))
+    const MIN_SCORE = 0.3
+    return results.filter(s => s.score >= MIN_SCORE).toSorted((a, b) => b.score - a.score)
 }
 
 const SEARCH_METHODS: Record<

diff --git a/provider/docs/src/corpus/search/tfidf.ts b/provider/docs/src/corpus/search/tfidf.ts
@@ -35,7 +35,10 @@ export function createTFIDFIndex(docs: IndexedDoc[]): TFIDFIndex {
 
     let totalChunks = 0
 
-    for (const { docID, chunks } of docs) {
+    for (const {
+        doc: { id: docID },
+        chunks,
+    } of docs) {
         const docTermFrequency: Map<Term, number>[] = new Array<Map<Term, number>>(chunks.length)
         termFrequency.set(docID, docTermFrequency)
 
@@ -88,13 +91,7 @@ export interface TFIDFIndex {
  * Compute the TF-IDF for a term in a document chunk using an index created by
  * {@link createTFIDFIndex}.
  */
-export function computeTFIDF(termRaw: string, doc: DocID, chunk: ChunkIndex, index: TFIDFIndex): number {
-    const processedTerms = terms(termRaw)
-    if (processedTerms.length !== 1) {
-        throw new Error(`term ${JSON.stringify(termRaw)} is not a single term`)
-    }
-    const term = processedTerms[0]
-
+export function computeTFIDF(term: Term, doc: DocID, chunk: ChunkIndex, index: TFIDFIndex): number {
     const docTermLength = index.termLength.get(doc)
     if (!docTermLength) {
         throw new Error(`doc ${doc} not found in termLength`)

diff --git a/provider/docs/src/provider/provider.ts b/provider/docs/src/provider/provider.ts
@@ -69,15 +69,15 @@ export default multiplex<Settings>(async settings => {
                         meta: { activeFilename: params.file },
                     })
                     for (const [i, sr] of searchResults.entries()) {
-                        const MAX_RESULTS = 4
+                        const MAX_RESULTS = 5
                         if (i >= MAX_RESULTS) {
                             break
                         }
 
                         const doc = index.doc(sr.doc)
                         result.push({
-                            title: doc.content?.title || doc.url || 'Untitled',
-                            url: doc.url,
+                            title: doc.content?.title || doc.doc?.url || 'Untitled',
+                            url: doc.doc?.url,
                             ui: {
                                 detail: truncate(doc.content?.textContent || sr.excerpt, 200),
                                 format: 'plaintext',