From ad54681d1e920042895a13b63239de538432f0b8 Mon Sep 17 00:00:00 2001 From: Le Roux Bodenstein Date: Wed, 25 Oct 2023 16:09:52 +0100 Subject: [PATCH] analyze a whole database and find its relationships --- package-lock.json | 31 ++++++++++++ package.json | 1 + scripts/analyze-database.ts | 78 ++++++++++++++++++++++++++++++ src/database-analyzer.ts | 95 +++++++++++++++++++++++++++++++++++++ src/index.ts | 12 ++++- 5 files changed, 215 insertions(+), 2 deletions(-) create mode 100755 scripts/analyze-database.ts create mode 100644 src/database-analyzer.ts diff --git a/package-lock.json b/package-lock.json index 848d5e1..9971ab4 100644 --- a/package-lock.json +++ b/package-lock.json @@ -18,6 +18,7 @@ "@types/mocha": "^10.0.1", "@types/node": "^18.11.18", "@types/reservoir": "^0.1.0", + "@types/yargs": "^17.0.29", "@typescript-eslint/eslint-plugin": "^5.47.1", "@typescript-eslint/parser": "^5.47.1", "bson": "^5.0.1", @@ -2338,6 +2339,21 @@ "@types/webidl-conversions": "*" } }, + "node_modules/@types/yargs": { + "version": "17.0.29", + "resolved": "https://registry.npmjs.org/@types/yargs/-/yargs-17.0.29.tgz", + "integrity": "sha512-nacjqA3ee9zRF/++a3FUY1suHTFKZeHba2n8WeDw9cCVdmzmHpIxyzOJBcpHvvEmS8E9KqWlSnWHUkOrkhWcvA==", + "dev": true, + "dependencies": { + "@types/yargs-parser": "*" + } + }, + "node_modules/@types/yargs-parser": { + "version": "21.0.2", + "resolved": "https://registry.npmjs.org/@types/yargs-parser/-/yargs-parser-21.0.2.tgz", + "integrity": "sha512-5qcvofLPbfjmBfKaLfj/+f+Sbd6pN4zl7w7VSVI5uz7m9QZTuB2aZAa2uo1wHFBNN2x6g/SoTkXmd8mQnQF2Cw==", + "dev": true + }, "node_modules/@typescript-eslint/eslint-plugin": { "version": "5.47.1", "resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-5.47.1.tgz", @@ -9857,6 +9873,21 @@ "@types/webidl-conversions": "*" } }, + "@types/yargs": { + "version": "17.0.29", + "resolved": "https://registry.npmjs.org/@types/yargs/-/yargs-17.0.29.tgz", + "integrity": "sha512-nacjqA3ee9zRF/++a3FUY1suHTFKZeHba2n8WeDw9cCVdmzmHpIxyzOJBcpHvvEmS8E9KqWlSnWHUkOrkhWcvA==", + "dev": true, + "requires": { + "@types/yargs-parser": "*" + } + }, + "@types/yargs-parser": { + "version": "21.0.2", + "resolved": "https://registry.npmjs.org/@types/yargs-parser/-/yargs-parser-21.0.2.tgz", + "integrity": "sha512-5qcvofLPbfjmBfKaLfj/+f+Sbd6pN4zl7w7VSVI5uz7m9QZTuB2aZAa2uo1wHFBNN2x6g/SoTkXmd8mQnQF2Cw==", + "dev": true + }, "@typescript-eslint/eslint-plugin": { "version": "5.47.1", "resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-5.47.1.tgz", diff --git a/package.json b/package.json index fd1f128..dfd7045 100644 --- a/package.json +++ b/package.json @@ -56,6 +56,7 @@ "@types/mocha": "^10.0.1", "@types/node": "^18.11.18", "@types/reservoir": "^0.1.0", + "@types/yargs": "^17.0.29", "@typescript-eslint/eslint-plugin": "^5.47.1", "@typescript-eslint/parser": "^5.47.1", "bson": "^5.0.1", diff --git a/scripts/analyze-database.ts b/scripts/analyze-database.ts new file mode 100755 index 0000000..221918e --- /dev/null +++ b/scripts/analyze-database.ts @@ -0,0 +1,78 @@ +#!/usr/bin/env npx ts-node + +import { MongoClient, Document } from 'mongodb'; +import yargs from 'yargs'; +import { hideBin } from 'yargs/helpers'; + +import type { Schema, Relationship } from '../src'; +import { SchemaAnalyzer, findRelationshipsForSchema } from '../src'; + +async function analyzeCollection(documents: AsyncIterable) { + const analyzer = new SchemaAnalyzer({ + storeValues: true + }); + for await (const doc of documents) { + analyzer.analyzeDoc(doc); + } + return analyzer; +} + +let client: MongoClient; +async function run() { + const argv = await yargs(hideBin(process.argv)) + .option('sampleSize', { type: 'number', default: 1000 }) + .argv; + + const [uri, databaseName] = argv._ as [string, string]; + if (!(uri && databaseName)) { + throw new Error('USAGE: analyze-database.ts connectionURI databaseName'); + } + + client = new MongoClient(uri); + await client.connect(); + + const db = client.db(databaseName); + + const collectionInfos = await db.listCollections().toArray(); + console.dir(collectionInfos); + + const collections: Record = {}; + + const relationships: Relationship[] = []; + + const collectionNames = collectionInfos.map((c) => c.name); + + for (const coll of collectionInfos) { + console.log(coll.name); + const collection = db.collection(coll.name); + const cursor = collection.aggregate([{ + $sample: { + size: argv.sampleSize + } + }], { + allowDiskUse: true + }); + + const analyzer = await analyzeCollection(cursor); + + const schema = analyzer.getResult(); + collections[coll.name] = schema; + + relationships.push(...await findRelationshipsForSchema(db, coll.name, collectionNames, schema)); + + console.log(); // newline + } + + console.dir(relationships, { depth: null }); +} + +if (require.main === module) { + run() + .finally(() => { + client?.close(); + }) + .catch((err) => { + console.error(err.stack); + process.exit(1); + }); +} diff --git a/src/database-analyzer.ts b/src/database-analyzer.ts new file mode 100644 index 0000000..0336f50 --- /dev/null +++ b/src/database-analyzer.ts @@ -0,0 +1,95 @@ + +import type { Schema } from './schema-analyzer'; +import type { Db } from 'mongodb'; + +type CollectionFieldReference = { + collection: string; + fieldPath: string[]; +} + +type FieldReferenceWithValues = CollectionFieldReference & { + values: any[] +} + +export type Relationship = { + from: CollectionFieldReference; + to: CollectionFieldReference; +} + +function shuffleArray(array: any[]) { + for (let i = array.length - 1; i > 0; i--) { + const j = Math.floor(Math.random() * (i + 1)); + [array[i], array[j]] = [array[j], array[i]]; + } +} + +function findCandidateReferencesForSchema(collectionName: string, schema: Schema) { + const candidatePaths: FieldReferenceWithValues[] = []; + + for (const field of schema.fields) { + if (field.name === '_id') { + continue; + } + + // TODO: also consider anything matching a known naming convention like /_id$/ + // TODO: we might also want to consider any large integers if there are lots of different values? + + const values: any[] = []; + for (const typeInfo of field.types) { + if (['ObjectId', 'UUID'].includes(typeInfo.bsonType)) { + values.push(...(typeInfo as { values: any[]}).values ?? []); + } + } + if (values.length) { + // in case the sample came from limit()* and wasn't already sorted randomly + shuffleArray(values); + + candidatePaths.push({ + collection: collectionName, + fieldPath: field.path, + values + }); + console.log(field.path); + } + } + + return candidatePaths; +} + +async function findRelationshipsCandidate(db: Db, collectionNames: string[], candidatePaths: FieldReferenceWithValues[]) { + const relationships: Relationship[] = []; + + // not the most efficient.. + for (const { collection, fieldPath, values } of candidatePaths) { + for (const target of collectionNames) { + const ids = values.slice(0, 10); + const result = (await db.collection(target).aggregate([ + { $match: { _id: { $in: ids } } }, + { $count: 'matches' } + ]).toArray()); + + if (result.length) { + console.log(collection, fieldPath, result); + relationships.push({ + from: { + collection, + fieldPath + }, + to: { + collection: target, + fieldPath: ['_id'] + } + }); + // no point checking the collections - we assume this is a many to one + break; + } + } + } + + return relationships; +} + +export async function findRelationshipsForSchema(db: Db, collectionName: string, collectionNames: string[], schema: Schema) { + const candidatePaths = findCandidateReferencesForSchema(collectionName, schema); + return await findRelationshipsCandidate(db, collectionNames, candidatePaths); +} diff --git a/src/index.ts b/src/index.ts index dd2b196..a792626 100644 --- a/src/index.ts +++ b/src/index.ts @@ -23,6 +23,12 @@ import type { SimplifiedSchema } from './schema-analyzer'; import * as schemaStats from './stats'; +import type { + Relationship +} from './database-analyzer'; +import { + findRelationshipsForSchema +} from './database-analyzer'; type MongoDBCursor = AggregationCursor | FindCursor; @@ -109,7 +115,8 @@ export type { SimplifiedSchemaDocumentType, SimplifiedSchemaType, SimplifiedSchemaField, - SimplifiedSchema + SimplifiedSchema, + Relationship }; export { @@ -119,5 +126,6 @@ export { getSchemaPaths, getSimplifiedSchema, SchemaAnalyzer, - schemaStats + schemaStats, + findRelationshipsForSchema };