Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
sqs committed Jan 6, 2024
1 parent fb236c0 commit 575a8c4
Show file tree
Hide file tree
Showing 6 changed files with 23 additions and 35 deletions.
13 changes: 3 additions & 10 deletions provider/docs/src/corpus/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,8 @@ export interface CorpusIndex {
* An indexed document.
*/
export interface IndexedDoc {
docID: Doc['id']

doc: Pick<Doc, 'id' | 'url'>
content: Pick<Content, 'title' | 'textContent'> | null
url: Doc['url']

/** The SHA-256 hash of the indexed content (including chunks). */
contentID: string
Expand Down Expand Up @@ -80,20 +78,16 @@ export async function indexCorpus(
const cache = cacheStore ? createCache(cacheStore) : noopCache

// TODO(sqs): index takes ~235ms
console.time('index-docs')
const indexedDocs = await cachedIndexCorpusDocs(corpus, { contentExtractor }, cache)
console.timeEnd('index-docs')

console.time('index-tfidf')
const tfidf = await cachedCreateTFIDFIndex(indexedDocs, cache)
console.timeEnd('index-tfidf')

const index: CorpusIndex = {
data: corpus,
docs: indexedDocs,
tfidf,
doc(id) {
const doc = indexedDocs.find(d => d.docID === id)
const doc = indexedDocs.find(d => d.doc.id === id)
if (!doc) {
throw new Error(`no document with id ${id} in corpus`)
}
Expand All @@ -115,8 +109,7 @@ async function indexCorpusDocs(
const content = contentExtractor ? await contentExtractor.extractContent(doc) : null
const chunks = chunk(content?.content ?? doc.text, { isMarkdown: doc.text.includes('##') })
return {
docID: doc.id,
url: doc.url,
doc: { id: doc.id, url: doc.url },
content:
content?.title && content?.textContent
? { title: content.title, textContent: content.textContent }
Expand Down
9 changes: 5 additions & 4 deletions provider/docs/src/corpus/search/embeddings.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { type CorpusIndex, type CorpusSearchResult, type Query } from '..'
import { isWebWindowRuntime, useWebWorker } from '../../env'
import { type Logger } from '../../logger'
import { embedTextOnWorker } from '../../mlWorker/webWorkerClient'
import { terms } from './terms'

// TODO(sqs): think we can remove this entirely...
//
Expand Down Expand Up @@ -31,14 +32,14 @@ export async function embeddingsSearch(index: CorpusIndex, query: Query): Promis
const textToEmbed = [query.meta?.activeFilename && `// ${query.meta?.activeFilename}`, query.text]
.filter((s): s is string => Boolean(s))
.join('\n')
const queryVec = await embedText(textToEmbed)
const queryVec = await embedText(terms(textToEmbed).join(' '))
const cosSim = cosSimWith(queryVec)

const MIN_SCORE = 0.1
const MIN_SCORE = 0.25

// Score each chunk's existing embedding against the query vector (cosine similarity).
const results: CorpusSearchResult[] = index.docs
.flatMap(({ docID, chunks }) =>
.flatMap(({ doc: { id: docID }, chunks }) =>
chunks.map((chunk, i) => {
const score = cosSim(chunk.embeddings)
return score >= MIN_SCORE
Expand All @@ -50,7 +51,7 @@ export async function embeddingsSearch(index: CorpusIndex, query: Query): Promis

results.sort((a, b) => b.score - a.score)

return results.slice(0, 1)
return results
}

const pipe = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2', {})
Expand Down
8 changes: 4 additions & 4 deletions provider/docs/src/corpus/search/keyword.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,19 @@ import { terms } from './terms'
import { computeTFIDF } from './tfidf'

/**
 * Search the corpus index for chunks that match the query's keywords.
 *
 * The query text is split with {@link terms}; terms shorter than 3 characters
 * are ignored. Each chunk's score is the sum of {@link computeTFIDF} over the
 * remaining terms, and only chunks with a strictly positive score are returned.
 *
 * @param index - The corpus index whose `docs` and `tfidf` data are searched.
 * @param query - The query; only `query.text` is used here.
 * @returns One result per matching chunk, in document/chunk iteration order
 * (not sorted by score).
 */
export function keywordSearch(index: CorpusIndex, query: Query): CorpusSearchResult[] {
    const queryTerms = terms(query.text).filter(term => term.length >= 3)

    const results: CorpusSearchResult[] = []
    // Indexed docs nest their identity under `doc`, so pull the ID out of `doc.id`.
    for (const {
        doc: { id: docID },
        chunks,
    } of index.docs) {
        for (const [i, chunk] of chunks.entries()) {
            const score = queryTerms.reduce((sum, term) => sum + computeTFIDF(term, docID, i, index.tfidf), 0)
            // Chunks that match none of the query terms are left out entirely.
            if (score > 0) {
                results.push({ doc: docID, chunk: i, score, excerpt: chunk.text })
            }
        }
    }
    return results
}
9 changes: 3 additions & 6 deletions provider/docs/src/corpus/search/multi.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,9 @@ export async function multiSearch(
docResults.set(result.chunk, { ...chunkResult, score: chunkResult.score + result.score })
}

const MIN_SCORE = 0.5

const results = Array.from(combinedResults.values())
.flatMap(docResults => Array.from(docResults.values()))
.filter(s => s.score >= MIN_SCORE)
return results.toSorted((a, b) => b.score - a.score)
const results = Array.from(combinedResults.values()).flatMap(docResults => Array.from(docResults.values()))
const MIN_SCORE = 0.3
return results.filter(s => s.score >= MIN_SCORE).toSorted((a, b) => b.score - a.score)
}

const SEARCH_METHODS: Record<
Expand Down
13 changes: 5 additions & 8 deletions provider/docs/src/corpus/search/tfidf.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,10 @@ export function createTFIDFIndex(docs: IndexedDoc[]): TFIDFIndex {

let totalChunks = 0

for (const { docID, chunks } of docs) {
for (const {
doc: { id: docID },
chunks,
} of docs) {
const docTermFrequency: Map<Term, number>[] = new Array<Map<Term, number>>(chunks.length)
termFrequency.set(docID, docTermFrequency)

Expand Down Expand Up @@ -88,13 +91,7 @@ export interface TFIDFIndex {
* Compute the TF-IDF for a term in a document chunk using an index created by
* {@link createTFIDFIndex}.
*/
export function computeTFIDF(termRaw: string, doc: DocID, chunk: ChunkIndex, index: TFIDFIndex): number {
const processedTerms = terms(termRaw)
if (processedTerms.length !== 1) {
throw new Error(`term ${JSON.stringify(termRaw)} is not a single term`)
}
const term = processedTerms[0]

export function computeTFIDF(term: Term, doc: DocID, chunk: ChunkIndex, index: TFIDFIndex): number {
const docTermLength = index.termLength.get(doc)
if (!docTermLength) {
throw new Error(`doc ${doc} not found in termLength`)
Expand Down
6 changes: 3 additions & 3 deletions provider/docs/src/provider/provider.ts
Original file line number Diff line number Diff line change
Expand Up @@ -69,15 +69,15 @@ export default multiplex<Settings>(async settings => {
meta: { activeFilename: params.file },
})
for (const [i, sr] of searchResults.entries()) {
const MAX_RESULTS = 4
const MAX_RESULTS = 5
if (i >= MAX_RESULTS) {
break
}

const doc = index.doc(sr.doc)
result.push({
title: doc.content?.title || doc.url || 'Untitled',
url: doc.url,
title: doc.content?.title || doc.doc?.url || 'Untitled',
url: doc.doc?.url,
ui: {
detail: truncate(doc.content?.textContent || sr.excerpt, 200),
format: 'plaintext',
Expand Down

0 comments on commit 575a8c4

Please sign in to comment.