Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: analyze a whole database and find its relationships [WIP] [Skunkworks] #207

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
"@types/mocha": "^10.0.1",
"@types/node": "^18.11.18",
"@types/reservoir": "^0.1.0",
"@types/yargs": "^17.0.29",
"@typescript-eslint/eslint-plugin": "^5.47.1",
"@typescript-eslint/parser": "^5.47.1",
"bson": "^5.0.1",
Expand Down
78 changes: 78 additions & 0 deletions scripts/analyze-database.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#!/usr/bin/env npx ts-node

import { MongoClient, Document } from 'mongodb';
import yargs from 'yargs';
import { hideBin } from 'yargs/helpers';

import type { Schema, Relationship } from '../src';
import { SchemaAnalyzer, findRelationshipsForSchema } from '../src';

/**
 * Feeds every document from `documents` into a fresh SchemaAnalyzer and
 * returns the analyzer once the iterable is exhausted.
 *
 * The analyzer is created with `storeValues: true`.
 */
async function analyzeCollection(documents: AsyncIterable<Document>) {
  const schemaAnalyzer = new SchemaAnalyzer({ storeValues: true });

  for await (const sampledDoc of documents) {
    schemaAnalyzer.analyzeDoc(sampledDoc);
  }

  return schemaAnalyzer;
}

// Module-level so that code outside run() can close the connection after run() settles.
let client: MongoClient;

/**
 * Script body: connects to the database named on the command line, samples
 * each collection, analyzes the sampled documents, and prints the
 * relationships found between collections.
 *
 * Positional arguments: connection URI, database name.
 * Option: --sampleSize (number, default 1000), passed to $sample as `size`.
 *
 * Throws an Error carrying the usage string when either positional argument
 * is missing.
 */
async function run() {
  const argv = await yargs(hideBin(process.argv))
    .option('sampleSize', { type: 'number', default: 1000 })
    .argv;

  const [uri, databaseName] = argv._ as [string, string];
  if (!uri || !databaseName) {
    throw new Error('USAGE: analyze-database.ts connectionURI databaseName');
  }

  client = new MongoClient(uri);
  await client.connect();

  const db = client.db(databaseName);

  const collectionInfos = await db.listCollections().toArray();
  console.dir(collectionInfos);

  const collections: Record<string, Schema> = {};
  const relationships: Relationship[] = [];
  const collectionNames = collectionInfos.map((info) => info.name);

  for (const { name } of collectionInfos) {
    console.log(name);

    // Take a random sample of the collection.
    const samplePipeline = [{ $sample: { size: argv.sampleSize } }];
    const cursor = db.collection(name).aggregate(samplePipeline, { allowDiskUse: true });

    const analyzer = await analyzeCollection(cursor);
    const schema = analyzer.getResult();
    collections[name] = schema;

    const found = await findRelationshipsForSchema(db, name, collectionNames, schema);
    relationships.push(...found);

    console.log(); // newline
  }

  console.dir(relationships, { depth: null });
}

// Only execute when this file is the entry point, not when it is imported.
if (require.main === module) {
  run()
    // Return the close() promise so the chain waits for the connection to shut
    // down, and so a failed close reaches the catch handler below instead of
    // becoming an unhandled rejection. `client` is still undefined when run()
    // fails before connecting, hence the optional call.
    .finally(() => client?.close())
    .catch((err) => {
      // Fall back to the raw value when the rejection is not an Error.
      console.error(err?.stack ?? err);
      process.exit(1);
    });
}
95 changes: 95 additions & 0 deletions src/database-analyzer.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@

import type { Schema } from './schema-analyzer';
import type { Db } from 'mongodb';

/**
 * Identifies a field in a collection: the collection's name plus the field's
 * path expressed as an array of strings.
 */
type CollectionFieldReference = {
collection: string;
fieldPath: string[];
}

/**
 * A field reference together with values collected for that field.
 */
type FieldReferenceWithValues = CollectionFieldReference & {
values: any[]
}

/**
 * A directed link between two fields: `from` is the referencing field and
 * `to` is the field it refers to.
 */
export type Relationship = {
from: CollectionFieldReference;
to: CollectionFieldReference;
}

/**
 * Shuffles `array` in place (Fisher-Yates), walking from the last index down
 * to index 1 and swapping each slot with a randomly chosen slot at or below it.
 * Returns nothing; arrays with fewer than two elements are left untouched.
 */
function shuffleArray(array: any[]) {
  let remaining = array.length;
  while (remaining > 1) {
    remaining -= 1;
    const pick = Math.floor(Math.random() * (remaining + 1));
    const held = array[remaining];
    array[remaining] = array[pick];
    array[pick] = held;
  }
}

/**
 * Collects the fields of `schema` that might reference documents elsewhere.
 *
 * A field qualifies when it is not named `_id` and at least one of its types
 * has bsonType `ObjectId` or `UUID` with stored values. Only the entries of
 * `schema.fields` are inspected.
 *
 * @param collectionName name recorded as the `collection` of each candidate
 * @param schema analyzed schema whose fields are inspected
 * @returns one entry per qualifying field, carrying its shuffled values
 */
function findCandidateReferencesForSchema(collectionName: string, schema: Schema) {
  const referenceBsonTypes = ['ObjectId', 'UUID'];
  const candidatePaths: FieldReferenceWithValues[] = [];

  const nonIdFields = schema.fields.filter((field) => field.name !== '_id');

  for (const { path, types } of nonIdFields) {
    // TODO: also consider anything matching a known naming convention like /_id$/
    // TODO: we might also want to consider any large integers if there are lots of different values?

    const values: any[] = [];
    for (const typeInfo of types) {
      if (!referenceBsonTypes.includes(typeInfo.bsonType)) {
        continue;
      }
      const stored = (typeInfo as { values: any[] }).values;
      values.push(...(stored ?? []));
    }

    if (values.length === 0) {
      continue;
    }

    // The sample may have come from limit() rather than a random sort, so
    // randomize the order before anyone takes a prefix of it.
    shuffleArray(values);

    candidatePaths.push({ collection: collectionName, fieldPath: path, values });
    console.log(path);
  }

  return candidatePaths;
}

/**
 * Tests each candidate field against every collection and records a
 * relationship for the first collection whose `_id` matches any of the
 * candidate's probe values.
 *
 * Up to the first 10 values of each candidate are used as probe ids. The
 * collections are queried one at a time, in the order given by
 * `collectionNames`, and the search for a candidate stops at the first hit.
 *
 * @param db database handle used to query each target collection
 * @param collectionNames names of the collections to probe
 * @param candidatePaths candidate fields with the values to look up
 * @returns at most one relationship per candidate, always pointing at `_id`
 */
async function findRelationshipsCandidate(db: Db, collectionNames: string[], candidatePaths: FieldReferenceWithValues[]): Promise<Relationship[]> {
  const relationships: Relationship[] = [];

  // not the most efficient: one aggregation per (candidate, collection) pair, run sequentially
  for (const { collection, fieldPath, values } of candidatePaths) {
    // The probe ids do not depend on the target, so take them once per candidate.
    const ids = values.slice(0, 10);

    for (const target of collectionNames) {
      const result = await db.collection(target).aggregate([
        { $match: { _id: { $in: ids } } },
        { $count: 'matches' }
      ]).toArray();

      // An empty result means none of the probe ids exist in this target.
      if (result.length) {
        console.log(collection, fieldPath, result);
        relationships.push({
          from: { collection, fieldPath },
          to: { collection: target, fieldPath: ['_id'] }
        });
        // no point checking the remaining collections - we assume this is a many to one
        break;
      }
    }
  }

  return relationships;
}

/**
 * Finds relationships for one collection: gathers the candidate reference
 * fields from its schema, then checks them against the given collections.
 *
 * @param db database handle used for the lookups
 * @param collectionName collection the schema was derived from
 * @param collectionNames collections to probe as relationship targets
 * @param schema analyzed schema of `collectionName`
 * @returns the relationships found
 */
export async function findRelationshipsForSchema(db: Db, collectionName: string, collectionNames: string[], schema: Schema) {
  const relationships = await findRelationshipsCandidate(
    db,
    collectionNames,
    findCandidateReferencesForSchema(collectionName, schema)
  );
  return relationships;
}
12 changes: 10 additions & 2 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@ import type {
SimplifiedSchema
} from './schema-analyzer';
import * as schemaStats from './stats';
import type {
Relationship
} from './database-analyzer';
import {
findRelationshipsForSchema
} from './database-analyzer';

type MongoDBCursor = AggregationCursor | FindCursor;

Expand Down Expand Up @@ -109,7 +115,8 @@ export type {
SimplifiedSchemaDocumentType,
SimplifiedSchemaType,
SimplifiedSchemaField,
SimplifiedSchema
SimplifiedSchema,
Relationship
};

export {
Expand All @@ -119,5 +126,6 @@ export {
getSchemaPaths,
getSimplifiedSchema,
SchemaAnalyzer,
schemaStats
schemaStats,
findRelationshipsForSchema
};