Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
sqs committed Jan 7, 2024
1 parent 4097244 commit e2781fe
Show file tree
Hide file tree
Showing 10 changed files with 77 additions and 57 deletions.
9 changes: 6 additions & 3 deletions provider/docs/bin/create-archive.ts
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ const options = archiveHandler.toOptions ? archiveHandler.toOptions(optionsRaw)
const archive = await archiveHandler.createFn(options)

const data = JSON.stringify(archive, null, 2)
console.error(`# ${archive.docs.length} docs, ${(data.length / 1024 / 1024).toFixed(1)} MB`)
console.error(`# Content ID: ${archive.contentID}`)
console.log(data)
console.error(
`# Archive complete: ${archive.docs.length} docs (${(data.length / 1024 / 1024).toFixed(1)} MB), content ID: ${
archive.contentID
}`
)
process.stdout.write(data)
43 changes: 43 additions & 0 deletions provider/docs/bin/create-index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import path from 'path'
import { CorpusArchive } from '../src/corpus/archive/corpusArchive'
import { extractContentUsingMozillaReadability } from '../src/corpus/doc/contentExtractor'
import { createCorpusIndex } from '../src/corpus/index/corpusIndex'

/**
 * Print the command-line usage message to stderr and terminate the process with exit code 1.
 *
 * The script name shown in the message is the base name of the running script
 * (`process.argv[1]`).
 */
function usage(): void {
    const scriptName = path.basename(process.argv[1])

    console.error()
    console.error(`Usage: ${scriptName} < /path/to/archive.json`)
    console.error()
    console.error('Note: Use the `create-archive` script to create the archive.json file.')

    // Exit non-zero so that callers (shells, CI) see the invocation as failed.
    process.exit(1)
}

// This script takes no positional arguments; the archive is supplied on stdin.
const extraArgs = process.argv.slice(2)
if (extraArgs.length > 0) {
    console.error('Error: invalid arguments')
    usage()
}

// Read the archive JSON from stdin. The parsed value is not validated against CorpusArchive.
const archive: CorpusArchive = await readJSONFromStdin()

// Progress output goes to stderr.
console.error(`# Indexing archive: ${archive.docs.length} docs, content ID ${archive.contentID}`)

// Build the index from the archive, extracting content with the Mozilla Readability extractor.
// NOTE(review): nothing visible here writes `index` anywhere yet; confirm whether output is still to be added.
const index = await createCorpusIndex(archive, {
    contentExtractor: extractContentUsingMozillaReadability,
})

/**
 * Read all of standard input and parse it as a single JSON document.
 *
 * Chunks are accumulated as raw bytes and decoded as UTF-8 only once, after the stream has ended.
 * Decoding each chunk separately would corrupt any multi-byte character that happens to straddle
 * a chunk boundary (each half would decode to U+FFFD).
 *
 * @returns A promise that resolves to the parsed JSON value. The value is not validated; callers
 * are responsible for checking that it has the shape they expect. The promise rejects if stdin
 * emits an error or if the input is not valid JSON (including empty input).
 */
function readJSONFromStdin(): Promise<any> {
    return new Promise((resolve, reject) => {
        const chunks: Buffer[] = []
        process.stdin.on('data', (chunk: Buffer | string) => {
            // stdin yields strings instead of Buffers if an encoding was set on it elsewhere.
            chunks.push(typeof chunk === 'string' ? Buffer.from(chunk, 'utf8') : chunk)
        })
        process.stdin.once('end', () => {
            try {
                resolve(JSON.parse(Buffer.concat(chunks).toString('utf8')))
            } catch (error) {
                reject(error)
            }
        })
        process.stdin.once('error', error => {
            reject(error)
        })
    })
}
4 changes: 2 additions & 2 deletions provider/docs/bin/docs-query.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import envPaths from 'env-paths'
import { type CorpusArchive } from '../src/corpus/archive/corpusArchive'
import { createFileSystemCacheStore } from '../src/corpus/cache/store/fs'
import { extractContentUsingMozillaReadability } from '../src/corpus/doc/contentExtractor'
import { indexCorpus } from '../src/corpus/index/corpusIndex'
import { createCorpusIndex } from '../src/corpus/index/corpusIndex'

const args = process.argv.slice(2)

Expand Down Expand Up @@ -33,7 +33,7 @@ const corpusData = JSON.parse(await readFile(corpusDataFile, 'utf8')) as CorpusA
const cacheDir = envPaths('opencodegraph-provider-docs').cache
const fsCache = createFileSystemCacheStore(cacheDir)

const corpus = await indexCorpus(corpusData, {
const corpus = await createCorpusIndex(corpusData, {
cacheStore: fsCache,
contentExtractor: extractContentUsingMozillaReadability,
})
Expand Down
4 changes: 2 additions & 2 deletions provider/docs/src/corpus/index/corpusIndex.test.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import { describe, expect, test } from 'vitest'
import { createCorpusArchive } from '../archive/corpusArchive'
import { type Doc, type DocID } from '../doc/doc'
import { indexCorpus } from './corpusIndex'
import { createCorpusIndex } from './corpusIndex'

/**
 * Test helper: build a {@link Doc} from the given ID and text.
 */
export function doc(id: DocID, text: string): Doc {
    const result: Doc = { id, text }
    return result
}

describe('indexCorpus', () => {
test('#docs', async () => {
expect((await indexCorpus(await createCorpusArchive([doc(1, 'a'), doc(2, 'b')]))).docs.length).toBe(2)
expect((await createCorpusIndex(await createCorpusArchive([doc(1, 'a'), doc(2, 'b')]))).docs.length).toBe(2)
})
})
45 changes: 7 additions & 38 deletions provider/docs/src/corpus/index/corpusIndex.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
import { embedText } from '../../search/embeddings'
import { createTFIDFIndex, type TFIDFIndex } from '../../search/tfidf'
import { type CorpusArchive } from '../archive/corpusArchive'
import { createCache, noopCache, type Cache, type CacheStore } from '../cache/cache'
import { contentID } from '../cache/contentID'
import { memo } from '../cache/memo'
import { chunk, type Chunk } from '../doc/chunks'
import { type Content, type ContentExtractor } from '../doc/contentExtractor'
import { type Doc } from '../doc/doc'
Expand All @@ -30,53 +28,24 @@ export interface IndexedDoc {
chunks: (Chunk & { embeddings: Float32Array })[]
}

/**
* Options for indexing a corpus.
*/
export interface IndexOptions {
cacheStore?: CacheStore
contentExtractor?: ContentExtractor
}

/**
* Index a corpus.
*/
export async function indexCorpus(
corpus: CorpusArchive,
{ cacheStore, contentExtractor }: IndexOptions = {}
export async function createCorpusIndex(
archive: CorpusArchive,
{ contentExtractor }: { contentExtractor?: ContentExtractor } = {}
): Promise<CorpusIndex> {
// TODO(sqs): remove cache, not needed, since this entire result is stored

const cache = cacheStore ? createCache(cacheStore) : noopCache

const indexedDocs = await cachedIndexCorpusDocs(corpus, { contentExtractor }, cache)

const tfidf = await cachedCreateTFIDFIndex(indexedDocs, cache)

const docs = await indexCorpusDocs(archive, { contentExtractor })
const tfidf = createTFIDFIndex(docs)
return {
docs: indexedDocs,
docs,
tfidf,
}
}

async function cachedIndexCorpusDocs(
corpus: CorpusArchive,
options: Pick<IndexOptions, 'contentExtractor'>,
cache: Cache
): Promise<IndexedDoc[]> {
const key = `indexCorpusDocs:${corpus.contentID}:${options.contentExtractor?.id ?? 'noContentExtractor'}`
return memo(cache, key, () => indexCorpusDocs(corpus, options))
}

async function cachedCreateTFIDFIndex(docs: IndexedDoc[], cache: Cache): Promise<TFIDFIndex> {
return memo(cache, `tfidfIndex:${await contentID(docs.map(doc => doc.contentID).join('\0'))}`, () =>
Promise.resolve(createTFIDFIndex(docs))
)
}

async function indexCorpusDocs(
corpus: CorpusArchive,
{ contentExtractor }: Pick<IndexOptions, 'contentExtractor'>
{ contentExtractor }: { contentExtractor?: ContentExtractor }
): Promise<IndexedDoc[]> {
return Promise.all(
corpus.docs.map(async doc => {
Expand Down
4 changes: 2 additions & 2 deletions provider/docs/src/e2e.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@ import path from 'node:path'
import { describe, expect, test } from 'vitest'
import { createClient } from './client/client'
import { createCorpusArchive } from './corpus/archive/corpusArchive'
import { indexCorpus } from './corpus/index/corpusIndex'
import { createCorpusIndex } from './corpus/index/corpusIndex'
import { type SearchResult } from './search/types'

describe('e2e', () => {
test('urlParsing', async () => {
const docFile = await fs.readFile(path.join(__dirname, 'testdata/corpus/urlParsing.md'), 'utf8')
const codeFile = await fs.readFile(path.join(__dirname, 'testdata/code/urlParsing.ts'), 'utf8')

const index = await indexCorpus(await createCorpusArchive([{ id: 1, text: docFile }]))
const index = await createCorpusIndex(await createCorpusArchive([{ id: 1, text: docFile }]))
const client = createClient(index)
const results = await client.search({ text: codeFile })
roundScores(results)
Expand Down
4 changes: 2 additions & 2 deletions provider/docs/src/provider/provider.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import { createIndexedDBCacheStore } from '../corpus/cache/store/indexedDB'
import { createWebStorageCacheStore } from '../corpus/cache/store/localStorage'
import { chunk } from '../corpus/doc/chunks'
import { extractContentUsingMozillaReadability } from '../corpus/doc/contentExtractor'
import { indexCorpus } from '../corpus/index/corpusIndex'
import { createCorpusIndex } from '../corpus/index/corpusIndex'
import { multiplex } from './multiplex'

/** Settings for the docs OpenCodeGraph provider. */
Expand Down Expand Up @@ -46,7 +46,7 @@ export default multiplex<Settings>(async settings => {
ignore: settings.corpus.ignore,
logger: message => console.log(message),
})
const index = await indexCorpus(await createCorpusArchive(await source.docs()), {
const index = await createCorpusIndex(await createCorpusArchive(await source.docs()), {
cacheStore: CORPUS_CACHE,
contentExtractor: extractContentUsingMozillaReadability,
logger: console.debug,
Expand Down
11 changes: 7 additions & 4 deletions provider/docs/src/search/embeddings.test.ts
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
import { describe, expect, test } from 'vitest'
import { createCorpusArchive } from '../corpus/archive/corpusArchive'
import { indexCorpus } from '../corpus/index/corpusIndex'
import { createCorpusIndex } from '../corpus/index/corpusIndex'
import { doc } from '../corpus/index/corpusIndex.test'
import { embeddingsSearch, embedTextInThisScope, similarity } from './embeddings'
import { type SearchResult } from './types'

describe('embeddingsSearch', () => {
test('finds matches', async () => {
expect(
await embeddingsSearch(await indexCorpus(await createCorpusArchive([doc(1, 'xxxxxx'), doc(2, 'b')])), {
text: 'b',
})
await embeddingsSearch(
await createCorpusIndex(await createCorpusArchive([doc(1, 'xxxxxx'), doc(2, 'b')])),
{
text: 'b',
}
)
).toEqual<SearchResult[]>([{ doc: 2, chunk: 0, score: 1, excerpt: 'b' }])
})
})
Expand Down
6 changes: 4 additions & 2 deletions provider/docs/src/search/keyword.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { describe, expect, test } from 'vitest'
import { createCorpusArchive } from '../corpus/archive/corpusArchive'
import { indexCorpus } from '../corpus/index/corpusIndex'
import { createCorpusIndex } from '../corpus/index/corpusIndex'
import { doc } from '../corpus/index/corpusIndex.test'
import { keywordSearch } from './keyword'
import { calculateTFIDF } from './tfidf'
Expand All @@ -9,7 +9,9 @@ import { type SearchResult } from './types'
describe('keywordSearch', () => {
test('finds matches', async () => {
expect(
keywordSearch(await indexCorpus(await createCorpusArchive([doc(1, 'aaa'), doc(2, 'bbb')])), { text: 'bbb' })
keywordSearch(await createCorpusIndex(await createCorpusArchive([doc(1, 'aaa'), doc(2, 'bbb')])), {
text: 'bbb',
})
).toEqual<SearchResult[]>([
{
doc: 2,
Expand Down
4 changes: 2 additions & 2 deletions provider/docs/src/search/tfidf.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { describe, expect, test } from 'vitest'
import { createCorpusArchive } from '../corpus/archive/corpusArchive'
import { indexCorpus } from '../corpus/index/corpusIndex'
import { createCorpusIndex } from '../corpus/index/corpusIndex'
import { calculateTFIDF, computeTFIDF, createTFIDFIndex } from './tfidf'

describe('createTFIDFIndex', async () => {
Expand All @@ -10,7 +10,7 @@ describe('createTFIDFIndex', async () => {
{ id: 3, text: 'c d e' },
])
const docIDs = data.docs.map(({ id }) => id)
const index = await indexCorpus(data)
const index = await createCorpusIndex(data)
const tfidfIndex = createTFIDFIndex(index.docs)

test('term in 1 doc', () => {
Expand Down

0 comments on commit e2781fe

Please sign in to comment.