From e2781feea711c2426cb4214eef6990cc77130b92 Mon Sep 17 00:00:00 2001 From: Quinn Slack Date: Sat, 6 Jan 2024 22:35:14 -0800 Subject: [PATCH] wip --- provider/docs/bin/create-archive.ts | 9 ++-- provider/docs/bin/create-index.ts | 43 ++++++++++++++++++ provider/docs/bin/docs-query.ts | 4 +- .../docs/src/corpus/index/corpusIndex.test.ts | 4 +- provider/docs/src/corpus/index/corpusIndex.ts | 45 +++---------------- provider/docs/src/e2e.test.ts | 4 +- provider/docs/src/provider/provider.ts | 4 +- provider/docs/src/search/embeddings.test.ts | 11 +++-- provider/docs/src/search/keyword.test.ts | 6 ++- provider/docs/src/search/tfidf.test.ts | 4 +- 10 files changed, 77 insertions(+), 57 deletions(-) create mode 100644 provider/docs/bin/create-index.ts diff --git a/provider/docs/bin/create-archive.ts b/provider/docs/bin/create-archive.ts index 92542129..0c5d123a 100644 --- a/provider/docs/bin/create-archive.ts +++ b/provider/docs/bin/create-archive.ts @@ -72,6 +72,9 @@ const options = archiveHandler.toOptions ? archiveHandler.toOptions(optionsRaw) const archive = await archiveHandler.createFn(options) const data = JSON.stringify(archive, null, 2) -console.error(`# ${archive.docs.length} docs, ${(data.length / 1024 / 1024).toFixed(1)} MB`) -console.error(`# Content ID: ${archive.contentID}`) -console.log(data) +console.error( + `# Archive complete: ${archive.docs.length} docs (${(data.length / 1024 / 1024).toFixed(1)} MB), content ID: ${ + archive.contentID + }` +) +process.stdout.write(data) diff --git a/provider/docs/bin/create-index.ts b/provider/docs/bin/create-index.ts new file mode 100644 index 00000000..46bc9aa5 --- /dev/null +++ b/provider/docs/bin/create-index.ts @@ -0,0 +1,43 @@ +import path from 'path' +import { CorpusArchive } from '../src/corpus/archive/corpusArchive' +import { extractContentUsingMozillaReadability } from '../src/corpus/doc/contentExtractor' +import { createCorpusIndex } from '../src/corpus/index/corpusIndex' + +function usage(): void { + console.error() + console.error(`Usage: ${path.basename(process.argv[1])} < /path/to/archive.json`) + console.error() + console.error('Note: Use the `create-archive` script to create the archive.json file.') + process.exit(1) +} + +const args = process.argv.slice(2) +if (args.length !== 0) { + console.error('Error: invalid arguments') + usage() +} + +const archive: CorpusArchive = await readJSONFromStdin() +console.error(`# Indexing archive: ${archive.docs.length} docs, content ID ${archive.contentID}`) + +const index = await createCorpusIndex(archive, { contentExtractor: extractContentUsingMozillaReadability }) + +function readJSONFromStdin(): Promise { + return new Promise((resolve, reject) => { + const data: string[] = [] + process.stdin.on('data', chunk => { + data.push(chunk.toString('utf8')) + }) + process.stdin.once('end', () => { + try { + const json = JSON.parse(data.join('')) + resolve(json) + } catch (error) { + reject(error) + } + }) + process.stdin.once('error', error => { + reject(error) + }) + }) +} diff --git a/provider/docs/bin/docs-query.ts b/provider/docs/bin/docs-query.ts index f0a199a3..8451437c 100644 --- a/provider/docs/bin/docs-query.ts +++ b/provider/docs/bin/docs-query.ts @@ -4,7 +4,7 @@ import envPaths from 'env-paths' import { type CorpusArchive } from '../src/corpus/archive/corpusArchive' import { createFileSystemCacheStore } from '../src/corpus/cache/store/fs' import { extractContentUsingMozillaReadability } from '../src/corpus/doc/contentExtractor' -import { indexCorpus } from '../src/corpus/index/corpusIndex' +import { createCorpusIndex } from '../src/corpus/index/corpusIndex' const args = process.argv.slice(2) @@ -33,7 +33,7 @@ const corpusData = JSON.parse(await readFile(corpusDataFile, 'utf8')) as CorpusA const cacheDir = envPaths('opencodegraph-provider-docs').cache const fsCache = createFileSystemCacheStore(cacheDir) -const corpus = await indexCorpus(corpusData, { +const corpus = await createCorpusIndex(corpusData, { cacheStore: fsCache, contentExtractor: extractContentUsingMozillaReadability, }) diff --git a/provider/docs/src/corpus/index/corpusIndex.test.ts b/provider/docs/src/corpus/index/corpusIndex.test.ts index b5248737..809052f2 100644 --- a/provider/docs/src/corpus/index/corpusIndex.test.ts +++ b/provider/docs/src/corpus/index/corpusIndex.test.ts @@ -1,7 +1,7 @@ import { describe, expect, test } from 'vitest' import { createCorpusArchive } from '../archive/corpusArchive' import { type Doc, type DocID } from '../doc/doc' -import { indexCorpus } from './corpusIndex' +import { createCorpusIndex } from './corpusIndex' export function doc(id: DocID, text: string): Doc { return { id, text } @@ -9,6 +9,6 @@ export function doc(id: DocID, text: string): Doc { describe('indexCorpus', () => { test('#docs', async () => { - expect((await indexCorpus(await createCorpusArchive([doc(1, 'a'), doc(2, 'b')]))).docs.length).toBe(2) + expect((await createCorpusIndex(await createCorpusArchive([doc(1, 'a'), doc(2, 'b')]))).docs.length).toBe(2) }) }) diff --git a/provider/docs/src/corpus/index/corpusIndex.ts b/provider/docs/src/corpus/index/corpusIndex.ts index 41382c29..3f5849fe 100644 --- a/provider/docs/src/corpus/index/corpusIndex.ts +++ b/provider/docs/src/corpus/index/corpusIndex.ts @@ -1,9 +1,7 @@ import { embedText } from '../../search/embeddings' import { createTFIDFIndex, type TFIDFIndex } from '../../search/tfidf' import { type CorpusArchive } from '../archive/corpusArchive' -import { createCache, noopCache, type Cache, type CacheStore } from '../cache/cache' import { contentID } from '../cache/contentID' -import { memo } from '../cache/memo' import { chunk, type Chunk } from '../doc/chunks' import { type Content, type ContentExtractor } from '../doc/contentExtractor' import { type Doc } from '../doc/doc' @@ -30,53 +28,24 @@ export interface IndexedDoc { chunks: (Chunk & { embeddings: Float32Array })[] } -/** - * Options for indexing a corpus. - */ -export interface IndexOptions { - cacheStore?: CacheStore - contentExtractor?: ContentExtractor -} - /** * Index a corpus. */ -export async function indexCorpus( - corpus: CorpusArchive, - { cacheStore, contentExtractor }: IndexOptions = {} +export async function createCorpusIndex( + archive: CorpusArchive, + { contentExtractor }: { contentExtractor?: ContentExtractor } = {} ): Promise { - // TODO(sqs): remove cache, not needed, since this entire result is stored - - const cache = cacheStore ? createCache(cacheStore) : noopCache - - const indexedDocs = await cachedIndexCorpusDocs(corpus, { contentExtractor }, cache) - - const tfidf = await cachedCreateTFIDFIndex(indexedDocs, cache) - + const docs = await indexCorpusDocs(archive, { contentExtractor }) + const tfidf = createTFIDFIndex(docs) return { - docs: indexedDocs, + docs, tfidf, } } -async function cachedIndexCorpusDocs( - corpus: CorpusArchive, - options: Pick, - cache: Cache -): Promise { - const key = `indexCorpusDocs:${corpus.contentID}:${options.contentExtractor?.id ?? 'noContentExtractor'}` - return memo(cache, key, () => indexCorpusDocs(corpus, options)) -} - -async function cachedCreateTFIDFIndex(docs: IndexedDoc[], cache: Cache): Promise { - return memo(cache, `tfidfIndex:${await contentID(docs.map(doc => doc.contentID).join('\0'))}`, () => - Promise.resolve(createTFIDFIndex(docs)) - ) -} - async function indexCorpusDocs( corpus: CorpusArchive, - { contentExtractor }: Pick + { contentExtractor }: { contentExtractor?: ContentExtractor } ): Promise { return Promise.all( corpus.docs.map(async doc => { diff --git a/provider/docs/src/e2e.test.ts b/provider/docs/src/e2e.test.ts index fac464c3..53901194 100644 --- a/provider/docs/src/e2e.test.ts +++ b/provider/docs/src/e2e.test.ts @@ -3,7 +3,7 @@ import path from 'node:path' import { describe, expect, test } from 'vitest' import { createClient } from './client/client' import { createCorpusArchive } from './corpus/archive/corpusArchive' -import { indexCorpus } from './corpus/index/corpusIndex' +import { createCorpusIndex } from './corpus/index/corpusIndex' import { type SearchResult } from './search/types' describe('e2e', () => { @@ -11,7 +11,7 @@ describe('e2e', () => { const docFile = await fs.readFile(path.join(__dirname, 'testdata/corpus/urlParsing.md'), 'utf8') const codeFile = await fs.readFile(path.join(__dirname, 'testdata/code/urlParsing.ts'), 'utf8') - const index = await indexCorpus(await createCorpusArchive([{ id: 1, text: docFile }])) + const index = await createCorpusIndex(await createCorpusArchive([{ id: 1, text: docFile }])) const client = createClient(index) const results = await client.search({ text: codeFile }) roundScores(results) diff --git a/provider/docs/src/provider/provider.ts b/provider/docs/src/provider/provider.ts index 97f30ea8..bf76aa39 100644 --- a/provider/docs/src/provider/provider.ts +++ b/provider/docs/src/provider/provider.ts @@ -11,7 +11,7 @@ import { createIndexedDBCacheStore } from '../corpus/cache/store/indexedDB' import { createWebStorageCacheStore } from '../corpus/cache/store/localStorage' import { chunk } from '../corpus/doc/chunks' import { extractContentUsingMozillaReadability } from '../corpus/doc/contentExtractor' -import { indexCorpus } from '../corpus/index/corpusIndex' +import { createCorpusIndex } from '../corpus/index/corpusIndex' import { multiplex } from './multiplex' /** Settings for the docs OpenCodeGraph provider. */ @@ -46,7 +46,7 @@ export default multiplex(async settings => { ignore: settings.corpus.ignore, logger: message => console.log(message), }) - const index = await indexCorpus(await createCorpusArchive(await source.docs()), { + const index = await createCorpusIndex(await createCorpusArchive(await source.docs()), { cacheStore: CORPUS_CACHE, contentExtractor: extractContentUsingMozillaReadability, logger: console.debug, diff --git a/provider/docs/src/search/embeddings.test.ts b/provider/docs/src/search/embeddings.test.ts index a7985b37..f885e9fc 100644 --- a/provider/docs/src/search/embeddings.test.ts +++ b/provider/docs/src/search/embeddings.test.ts @@ -1,6 +1,6 @@ import { describe, expect, test } from 'vitest' import { createCorpusArchive } from '../corpus/archive/corpusArchive' -import { indexCorpus } from '../corpus/index/corpusIndex' +import { createCorpusIndex } from '../corpus/index/corpusIndex' import { doc } from '../corpus/index/corpusIndex.test' import { embeddingsSearch, embedTextInThisScope, similarity } from './embeddings' import { type SearchResult } from './types' @@ -8,9 +8,12 @@ import { type SearchResult } from './types' describe('embeddingsSearch', () => { test('finds matches', async () => { expect( - await embeddingsSearch(await indexCorpus(await createCorpusArchive([doc(1, 'xxxxxx'), doc(2, 'b')])), { - text: 'b', - }) + await embeddingsSearch( + await createCorpusIndex(await createCorpusArchive([doc(1, 'xxxxxx'), doc(2, 'b')])), + { + text: 'b', + } + ) ).toEqual([{ doc: 2, chunk: 0, score: 1, excerpt: 'b' }]) }) }) diff --git a/provider/docs/src/search/keyword.test.ts b/provider/docs/src/search/keyword.test.ts index 8b464ae0..618dd63c 100644 --- a/provider/docs/src/search/keyword.test.ts +++ b/provider/docs/src/search/keyword.test.ts @@ -1,6 +1,6 @@ import { describe, expect, test } from 'vitest' import { createCorpusArchive } from '../corpus/archive/corpusArchive' -import { indexCorpus } from '../corpus/index/corpusIndex' +import { createCorpusIndex } from '../corpus/index/corpusIndex' import { doc } from '../corpus/index/corpusIndex.test' import { keywordSearch } from './keyword' import { calculateTFIDF } from './tfidf' @@ -9,7 +9,9 @@ import { type SearchResult } from './types' describe('keywordSearch', () => { test('finds matches', async () => { expect( - keywordSearch(await indexCorpus(await createCorpusArchive([doc(1, 'aaa'), doc(2, 'bbb')])), { text: 'bbb' }) + keywordSearch(await createCorpusIndex(await createCorpusArchive([doc(1, 'aaa'), doc(2, 'bbb')])), { + text: 'bbb', + }) ).toEqual([ { doc: 2, diff --git a/provider/docs/src/search/tfidf.test.ts b/provider/docs/src/search/tfidf.test.ts index 32ae76f8..880047b0 100644 --- a/provider/docs/src/search/tfidf.test.ts +++ b/provider/docs/src/search/tfidf.test.ts @@ -1,6 +1,6 @@ import { describe, expect, test } from 'vitest' import { createCorpusArchive } from '../corpus/archive/corpusArchive' -import { indexCorpus } from '../corpus/index/corpusIndex' +import { createCorpusIndex } from '../corpus/index/corpusIndex' import { calculateTFIDF, computeTFIDF, createTFIDFIndex } from './tfidf' describe('createTFIDFIndex', async () => { @@ -10,7 +10,7 @@ describe('createTFIDFIndex', async () => { { id: 3, text: 'c d e' }, ]) const docIDs = data.docs.map(({ id }) => id) - const index = await indexCorpus(data) + const index = await createCorpusIndex(data) const tfidfIndex = createTFIDFIndex(index.docs) test('term in 1 doc', () => {