From 79d756a753bc87ec4597fe06d6a9333dc7b44ff0 Mon Sep 17 00:00:00 2001 From: Jacob Cable <32874567+cabljac@users.noreply.github.com> Date: Fri, 31 Jan 2025 19:20:53 +0000 Subject: [PATCH] feat(fs-bq-import-collection): add transformFunction option (#2251) --- .../guides/IMPORT_EXISTING_DOCUMENTS.md | 32 +++++++++++++++++++ .../scripts/import/package-lock.json | 12 +++---- .../scripts/import/src/config.ts | 24 ++++++++++++++ .../scripts/import/src/index.ts | 1 + .../scripts/import/src/program.ts | 4 +++ .../scripts/import/src/types.ts | 1 + .../scripts/import/src/worker.ts | 1 + 7 files changed, 67 insertions(+), 8 deletions(-) diff --git a/firestore-bigquery-export/guides/IMPORT_EXISTING_DOCUMENTS.md b/firestore-bigquery-export/guides/IMPORT_EXISTING_DOCUMENTS.md index 179a9271a..a88c43d99 100644 --- a/firestore-bigquery-export/guides/IMPORT_EXISTING_DOCUMENTS.md +++ b/firestore-bigquery-export/guides/IMPORT_EXISTING_DOCUMENTS.md @@ -139,3 +139,35 @@ This helps you quickly identify problematic documents and take action accordingl To retry the failed imports, you can use the output file to manually inspect or reprocess the documents. For example, you could create a script that reads the failed paths and reattempts the import. > **Note:** If the specified file already exists, it will be **cleared** before writing new failed batch paths. + +### Using a Transform Function + +You can optionally provide a transform function URL (`--transform-function-url` or `-f`) that will transform document data before it's written to BigQuery. The transform function should receive document data and return transformed data. The payload will contain the following: + +``` +{ + data: [{ + insertId: int; + json: { + timestamp: int; + event_id: int; + document_name: string; + document_id: int; + operation: ChangeType; + data: string; + }, + }] +} +``` + +The response should be identical in structure. 
+ +Example usage of the script with transform function option: + +```shell +npx @firebaseextensions/fs-bq-import-collection --non-interactive \ + -P \ + -s \ + -d \ + -f https://us-west1-my-project.cloudfunctions.net/transformFunction +``` diff --git a/firestore-bigquery-export/scripts/import/package-lock.json b/firestore-bigquery-export/scripts/import/package-lock.json index d5ce4db63..e5d2edc63 100644 --- a/firestore-bigquery-export/scripts/import/package-lock.json +++ b/firestore-bigquery-export/scripts/import/package-lock.json @@ -9095,9 +9095,8 @@ }, "node_modules/cross-spawn": { "version": "7.0.6", - "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz", - "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==", "dev": true, + "license": "MIT", "dependencies": { "path-key": "^3.1.0", "shebang-command": "^2.0.0", @@ -9459,8 +9458,7 @@ }, "node_modules/express": { "version": "4.21.2", - "resolved": "https://registry.npmjs.org/express/-/express-4.21.2.tgz", - "integrity": "sha512-28HqgMZAmih1Czt9ny7qr6ek2qddF4FclbMzwhCREB6OFfH+rXAnuNCwo1/wFvrtbgsQDb4kSbX9de9lFbrXnA==", + "license": "MIT", "dependencies": { "accepts": "~1.3.8", "array-flatten": "1.1.1", @@ -11953,8 +11951,6 @@ }, "node_modules/nanoid": { "version": "5.0.9", - "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-5.0.9.tgz", - "integrity": "sha512-Aooyr6MXU6HpvvWXKoVoXwKMs/KyVakWwg7xQfv5/S/RIgJMy0Ifa45H9qqYy7pTCszrHzP21Uk4PZq2HpEM8Q==", "dev": true, "funding": [ { @@ -11962,6 +11958,7 @@ "url": "https://github.com/sponsors/ai" } ], + "license": "MIT", "bin": { "nanoid": "bin/nanoid.js" }, @@ -12257,8 +12254,7 @@ }, "node_modules/path-to-regexp": { "version": "0.1.12", - "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.12.tgz", - "integrity": "sha512-RA1GjUVMnvYFxuqovrEqZoxxW5NUZqbwKtYz/Tt7nXerk0LbLblQmrsgdeOxV5SFHf0UDggjS/bSeOZwt1pmEQ==" + "license": "MIT" }, "node_modules/pathval": { 
"version": "1.1.1", diff --git a/firestore-bigquery-export/scripts/import/src/config.ts b/firestore-bigquery-export/scripts/import/src/config.ts index c3ee6921b..b9f839027 100644 --- a/firestore-bigquery-export/scripts/import/src/config.ts +++ b/firestore-bigquery-export/scripts/import/src/config.ts @@ -170,6 +170,21 @@ const questions = [ type: "confirm", default: false, }, + { + message: "What's the URL of your transform function? (Optional)", + name: "transformFunctionUrl", + type: "input", + default: "", + validate: (value) => { + if (!value) return true; + try { + new URL(value); + return true; + } catch { + return "Please enter a valid URL or leave empty"; + } + }, + }, { message: "Would you like to use a local firestore emulator?", name: "useEmulator", @@ -213,6 +228,15 @@ export async function parseConfig(): Promise { if (program.datasetLocation === undefined) { errors.push("DatasetLocation is not specified."); } + + if (program.transformFunctionUrl) { + try { + new URL(program.transformFunctionUrl); + } catch { + errors.push("Transform function URL is invalid"); + } + } + if (!validateBatchSize(program.batchSize)) { errors.push("Invalid batch size."); } diff --git a/firestore-bigquery-export/scripts/import/src/index.ts b/firestore-bigquery-export/scripts/import/src/index.ts index 2a975ce40..c7da5dd7e 100644 --- a/firestore-bigquery-export/scripts/import/src/index.ts +++ b/firestore-bigquery-export/scripts/import/src/index.ts @@ -81,6 +81,7 @@ const run = async (): Promise => { wildcardIds: queryCollectionGroup, useNewSnapshotQuerySyntax, bqProjectId: bigQueryProjectId, + transformFunction: config.transformFunctionUrl, }); await initializeDataSink(dataSink, config); diff --git a/firestore-bigquery-export/scripts/import/src/program.ts b/firestore-bigquery-export/scripts/import/src/program.ts index 52ff4e440..87982aa16 100644 --- a/firestore-bigquery-export/scripts/import/src/program.ts +++ b/firestore-bigquery-export/scripts/import/src/program.ts @@ -54,6 
+54,10 @@ export const getCLIOptions = () => { "-u, --use-new-snapshot-query-syntax [true|false]", "Whether to use updated latest snapshot query" ) + .option( + "-f, --transform-function-url ", + "URL of function to transform data before export (e.g., https://us-west1-project.cloudfunctions.net/transform)" + ) .option( "-e, --use-emulator [true|false]", "Whether to use the firestore emulator" diff --git a/firestore-bigquery-export/scripts/import/src/types.ts b/firestore-bigquery-export/scripts/import/src/types.ts index 28798f3b0..e2dfdcd18 100644 --- a/firestore-bigquery-export/scripts/import/src/types.ts +++ b/firestore-bigquery-export/scripts/import/src/types.ts @@ -16,6 +16,7 @@ export interface CliConfig { rawChangeLogName: string; cursorPositionFile: string; failedBatchOutput?: string; + transformFunctionUrl?: string; } export interface CliConfigError { diff --git a/firestore-bigquery-export/scripts/import/src/worker.ts b/firestore-bigquery-export/scripts/import/src/worker.ts index c24ca9661..23e9a9065 100644 --- a/firestore-bigquery-export/scripts/import/src/worker.ts +++ b/firestore-bigquery-export/scripts/import/src/worker.ts @@ -73,6 +73,7 @@ async function processDocuments( wildcardIds: true, skipInit: true, useNewSnapshotQuerySyntax: config.useNewSnapshotQuerySyntax, + transformFunction: config.transformFunctionUrl, }); // Process documents in batches until we've covered the entire partition