Merge pull request #99 from sillsdev/ImageFileName

feat: Modify image file names
sillsdev · Apr 9, 2024 · 9d11a98 · 9d11a98
2 parents 5d10789 + 36da64a
commit 9d11a98
Show file tree

Hide file tree

Showing 13 changed files with 279 additions and 114 deletions.
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -11,11 +11,17 @@
     "Greenshot",
     "imgur",
     "kanban",
+    "sillsdev",
     "unlocalized"
   ],
   "workbench.colorCustomizations": {
     "statusBar.background": "#d649ca",
     "statusBar.noFolderBackground": "#d649ca",
     "statusBar.prominentBackground": "#d649ca"
+  },
+  "markdownlint.config": {
+    "MD025":false,
+    "MD033":false,
+    "MD040":false
   }
 }
diff --git a/README.md b/README.md
@@ -6,7 +6,7 @@ Example Site: https://sillsdev.github.io/docu-notion-sample-site/
 
 # Instructions
 
-## 1. Set up your documentation site.
+## 1. Set up your documentation site
 
 First, prepare your markdown-based static file system like [Docusaurus](https://docusaurus.io/). For a shortcut with github actions, search, and deployment to github pages, you can just copy [this template](https://github.com/sillsdev/docu-notion-sample-site).
 
@@ -27,15 +27,15 @@ Go to the page that will be the root of your site. This page should have, as dir
 
 <img width="318" alt="image" src="https://github.com/sillsdev/docu-notion/assets/8448/810c6dca-f9ab-4370-93b4-dc1479332af7">
 
-## 5. Add your pages under your Outline page.
+## 5. Add your pages under your Outline page
 
 Currently, docu-notion expects that each page has only one of the following: sub-pages, links to other pages, or normal content. Do not mix them. You can add content pages directly here, but then you won't be able to make use of the workflow features. If those matter to you, instead make new pages under the "Database" and then link to them in your outline pages.
 
 ## 6. Pull your pages
 
-First, determine the id of your root page by clicking "Share" and looking at the url it gives you. E.g.
-https://www.notion.so/hattonjohn/My-Docs-0456aa5842946bdbea3a4f37c97a0e5
-means that the id is "0456aa5842946PRETEND4f37c97a0e5".
+First, determine the ID of your root page by clicking "Share" and looking at the url it gives you. E.g.
+`https://www.notion.so/hattonjohn/My-Docs-0456aa5842946PRETEND4f37c97a0e5`
+means that the ID is `0456aa5842946PRETEND4f37c97a0e5`.
 
 Try it out:
 
@@ -114,26 +114,27 @@ NOTE: if you just localize an image, it will not get picked up. You also must lo
 
 # Automated builds with Github Actions
 
-Here is a working Github Action script to copy and customize: https://github.com/BloomBooks/bloom-docs/blob/master/.github/workflows/release.yml
+Here is a [working Github Action script to copy and customize](https://github.com/BloomBooks/bloom-docs/blob/master/.github/workflows/release.yml).
 
 # Command line
 
-Usage: docu-notion -n <token> -r <root> [options]
+Usage: `docu-notion -n <token> -r <root> [options]`
 
 Options:
 
 | flag                                  | required? | description                                                                                                                                                                                                        |
 | ------------------------------------- | --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| -n, --notion-token <string>           | required  | notion api token, which looks like `secret_3bc1b50XFYb15123RHF243x43450XFY33250XFYa343`                                                                                                                            |
-| -r, --root-page <string>              | required  | The 31 character ID of the page which is the root of your docs page in notion. The code will look like `9120ec9960244ead80fa2ef4bc1bba25`. This page must have a child page named 'Outline'                        |
-| -m, --markdown-output-path <string>   |           | Root of the hierarchy for md files. WARNING: node-pull-mdx will delete files from this directory. Note also that if it finds localized images, it will create an i18n/ directory as a sibling. (default: "./docs") |
-| -t, --status-tag <string>             |           | Database pages without a Notion page property 'status' matching this will be ignored. Use '\*' to ignore status altogether. (default: `Publish`)                                                                   |
-| --locales <codes>                     |           | Comma-separated list of iso 639-2 codes, the same list as in docusaurus.config.js, minus the primary (i.e. 'en'). This is needed for image localization. (default: [])                                             |
-| -l, --log-level <level>               |           | Log level (choices: `info`, `verbose`, `debug`)                                                                                                                                                                    |
-| -i, --img-output-path <string>        |           | Path to directory where images will be stored. If this is not included, images will be placed in the same directory as the document that uses them, which then allows for localization of screenshots.             |
-| -p, --img-prefix-in-markdown <string> |           | When referencing an image from markdown, prefix with this path instead of the full img-output-path. Should be used only in conjunction with --img-output-path.                                                     |
-| --require-slugs                       |           | If set, docu-notion will fail if any pages it would otherwise publish are missing a slug in Notion. |
-| -h, --help                            |           | display help for command                              |
+| `-n, --notion-token <string>`           | required  | notion api token, which looks like `secret_3bc1b50XFYb15123RHF243x43450XFY33250XFYa343`                                                                                                                            |
+| `-r, --root-page <string>`              | required  | The 32-character ID of the page which is the root of your docs page in notion. The code will look like `9120ec9960244ead80fa2ef4bc1bba25`. This page must have a child page named 'Outline'                        |
+| `-m, --markdown-output-path <string>`   |           | Root of the hierarchy for md files. WARNING: node-pull-mdx will delete files from this directory. Note also that if it finds localized images, it will create an i18n/ directory as a sibling. (default: `./docs`) |
+| `-t, --status-tag <string>`             |           | Database pages without a Notion page property 'status' matching this will be ignored. Use '\*' to ignore status altogether. (default: `Publish`)                                                                   |
+| `--locales <codes>`                     |           | Comma-separated list of ISO 639-1 codes, the same list as in docusaurus.config.js, minus the primary (i.e. 'en'). This is needed for image localization. (default: `[]`)                                             |
+| `-l, --log-level <level>`               |           | Log level (choices: `info`, `verbose`, `debug`)                                                                                                                                                                    |
+| `-i, --img-output-path <string>`        |           | Path to directory where images will be stored. If this is not included, images will be placed in the same directory as the document that uses them, which then allows for localization of screenshots.             |
+| `-p, --img-prefix-in-markdown <string>` |           | When referencing an image from markdown, prefix with this path instead of the full img-output-path. Should be used only in conjunction with --img-output-path.                                                     |
+| `--require-slugs`                       |           | If set, docu-notion will fail if any pages it would otherwise publish are missing a slug in Notion. |
+| `--image-file-name-format <format>`     |           | choices:<ul><li>`default`: {page slug (if any)}.{image block ID}</li><li>`content-hash`: Use a hash of the image content.</li><li>`legacy`: Use the legacy (before v0.16) method of determining file names. Set this to maintain backward compatibility.</li></ul>All formats will use the original file extension. |
+| `-h, --help`                            |           | display help for command                              |
 
 # Plugins
 
@@ -155,8 +156,10 @@ The default admonition type, if no matching icon is found, is "note".
 # Known Workarounds
 
 ### Start a numbered list at a number other than 1
+
 In Notion, make sure the block is "Text," not "Numbered List".
+
 - But make sure the number does NOT have a space in front of it. This can/will cause issues with sub-list items.
 - One way to get Notion to let you do this:
-    - Create a numbered list item where the text duplicates the number you want. Convert that numbered list item to "Text."
-    - i.e. Type "1. 1. Item one." Notion makes the first "1." into a number in a list. When you convert back to "Text," you're left with plain text "1. Item one."
+  - Create a numbered list item where the text duplicates the number you want. Convert that numbered list item to "Text."
+  - i.e. Type "1. 1. Item one." Notion makes the first "1." into a number in a list. When you convert back to "Text," you're left with plain text "1. Item one."
diff --git a/package.json b/package.json
@@ -17,7 +17,8 @@
     "pull-sample-site": "npm run ts -- -n $DOCU_NOTION_INTEGRATION_TOKEN -r $DOCU_NOTION_SAMPLE_ROOT_PAGE  --log-level debug",
     "// test with a semi-stable/public site:": "",
     "pull-sample": "npm run ts -- -n $DOCU_NOTION_INTEGRATION_TOKEN -r $DOCU_NOTION_SAMPLE_ROOT_PAGE -m ./sample --locales en,es,fr,de --log-level verbose",
-    "pull-sample-with-paths": "npm run ts -- -n $DOCU_NOTION_INTEGRATION_TOKEN -r $DOCU_NOTION_SAMPLE_ROOT_PAGE -m ./sample --img-output-path ./sample_img"
+    "pull-sample-with-paths": "npm run ts -- -n $DOCU_NOTION_INTEGRATION_TOKEN -r $DOCU_NOTION_SAMPLE_ROOT_PAGE -m ./sample --img-output-path ./sample_img",
+    "lint": "eslint . --ext .ts"
   },
   "//file-type": "have to use this version before they switched to ESM, which gives a compile error related to require()",
   "//chalk@4": "also ESM related problem",

diff --git a/src/MakeImagePersistencePlan.ts b/src/MakeImagePersistencePlan.ts
@@ -2,9 +2,13 @@ import { ImageSet } from "./images";
 import * as Path from "path";
 import { error } from "./log";
 import { exit } from "process";
+import crypto from "crypto";
+import { DocuNotionOptions } from "./pull";
 
 export function makeImagePersistencePlan(
+  options: DocuNotionOptions,
   imageSet: ImageSet,
+  imageBlockId: string,
   imageOutputRootPath: string,
   imagePrefix: string
 ): void {
@@ -23,23 +27,55 @@ export function makeImagePersistencePlan(
     }
   }
 
-  // Since most images come from pasting screenshots, there isn't normally a filename. That's fine, we just make a hash of the url
-  // Images that are stored by notion come to us with a complex url that changes over time, so we pick out the UUID that doesn't change. Example:
-  //    https://s3.us-west-2.amazonaws.com/secure.notion-static.com/d1058f46-4d2f-4292-8388-4ad393383439/Untitled.png?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIAT73L2G45EIPT3X45%2F20220516%2Fus-west-2%2Fs3%2Faws4_request&X-Amz-Date=20220516T233630Z&X-Amz-Expires=3600&X-Amz-Signature=f215704094fcc884d37073b0b108cf6d1c9da9b7d57a898da38bc30c30b4c4b5&X-Amz-SignedHeaders=host&x-id=GetObject
-  // But around Sept 2023, they changed the url to be something like:
-  //    https://prod-files-secure.s3.us-west-2.amazonaws.com/d9a2b712-cf69-4bd6-9d65-87a4ceeacca2/d1bcdc8c-b065-4e40-9a11-392aabeb220e/Untitled.png?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIAT73L2G45EIPT3X45%2F20230915%2Fus-west-2%2Fs3%2Faws4_request&X-Amz-Date=20230915T161258Z&X-Amz-Expires=3600&X-Amz-Signature=28fca48e65fba86d539c3c4b7676fce1fa0857aa194f7b33dd4a468ecca6ab24&X-Amz-SignedHeaders=host&x-id=GetObject
-  // The thing we want is the last UUID before the ?
+  if (options.imageFileNameFormat === "legacy") {
+    // Original behavior and comment:
+    //   Since most images come from pasting screenshots, there isn't normally a filename. That's fine, we just make a hash of the url
+    //   Images that are stored by notion come to us with a complex url that changes over time, so we pick out the UUID that doesn't change. Example:
+    //      https://s3.us-west-2.amazonaws.com/secure.notion-static.com/d1058f46-4d2f-4292-8388-4ad393383439/Untitled.png?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIAT73L2G45EIPT3X45%2F20220516%2Fus-west-2%2Fs3%2Faws4_request&X-Amz-Date=20220516T233630Z&X-Amz-Expires=3600&X-Amz-Signature=f215704094fcc884d37073b0b108cf6d1c9da9b7d57a898da38bc30c30b4c4b5&X-Amz-SignedHeaders=host&x-id=GetObject
+    //   But around Sept 2023, they changed the url to be something like:
+    //      https://prod-files-secure.s3.us-west-2.amazonaws.com/d9a2b712-cf69-4bd6-9d65-87a4ceeacca2/d1bcdc8c-b065-4e40-9a11-392aabeb220e/Untitled.png?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIAT73L2G45EIPT3X45%2F20230915%2Fus-west-2%2Fs3%2Faws4_request&X-Amz-Date=20230915T161258Z&X-Amz-Expires=3600&X-Amz-Signature=28fca48e65fba86d539c3c4b7676fce1fa0857aa194f7b33dd4a468ecca6ab24&X-Amz-SignedHeaders=host&x-id=GetObject
+    //   The thing we want is the last UUID before the ?
+    const thingToHash = findLastUuid(urlBeforeQuery) ?? urlBeforeQuery;
 
-  const thingToHash = findLastUuid(urlBeforeQuery) ?? urlBeforeQuery;
+    const hash = hashOfString(thingToHash);
+    imageSet.outputFileName = `${hash}.${imageFileExtension}`;
+  } else if (options.imageFileNameFormat === "content-hash") {
+    // This was requested by a user: https://github.com/sillsdev/docu-notion/issues/76.
+    // We chose not to include it in the default file name because we want to maintain
+    // as much stability in the file name as feasible for an image localization workflow.
+    // However, particularly in a workflow which is not concerned with localization,
+    // this could be a good option. One benefit is that the image only needs to exist once
+    // in the file system regardless of how many times it is used in the site.
+    const imageHash = hashOfBufferContent(imageSet.primaryBuffer!);
+    imageSet.outputFileName = `${imageHash}.${imageFileExtension}`;
+  } else {
+    // We decided not to do this for the default format because it means
+    // instability for the file name in Crowdin, which causes loss of localizations.
+    // If we decide to include it in the future, we should add a unit test.
+    // const imageFileName = Path.basename(urlBeforeQuery);
+    // const imageFileNameWithoutExtension = Path.parse(imageFileName).name;
+    // const originalFileNamePart = ["untitled", "unnamed"].includes(
+    //   imageFileNameWithoutExtension.toLocaleLowerCase()
+    // )
+    //   ? ""
+    //   : `${imageFileNameWithoutExtension.substring(0, 50)}.`;
 
-  const hash = hashOfString(thingToHash);
-  imageSet.outputFileName = `${hash}.${imageFileExtension}`;
+    // Format is page slug (if there is one) followed by the image block ID from Notion.
+    // The image block ID will remain stable as long as any changes to the image are done
+    // using the Replace feature. Also, image blocks can be moved using the Move To feature.
+    // We decided to include the page slug for easier workflow during localization, particularly in Crowdin.
+    // The block ID is a unique GUID and thus provides a unique file name.
+    const pageSlugPart = imageSet.pageInfo?.slug
+      ? `${imageSet.pageInfo.slug.replace(/^\//, "")}.`
+      : "";
+    imageSet.outputFileName = `${pageSlugPart}${imageBlockId}.${imageFileExtension}`;
+  }
 
   imageSet.primaryFileOutputPath = Path.posix.join(
     imageOutputRootPath?.length > 0
       ? imageOutputRootPath
-      : imageSet.pathToParentDocument!,
-    imageSet.outputFileName
+      : imageSet.pageInfo!.directoryContainingMarkdown,
+    decodeURI(imageSet.outputFileName)
   );
 
   if (imageOutputRootPath && imageSet.localizedUrls.length) {
@@ -73,3 +109,8 @@ export function hashOfString(s: string): number {
 
   return Math.abs(hash);
 }
+
+function hashOfBufferContent(buffer: Buffer): string { // Returns a short hex identifier computed from the bytes of `buffer`.
+  const hash = crypto.createHash("sha256").update(buffer).digest("hex"); // SHA-256 digest of the buffer, rendered as a hex string
+  return hash.slice(0, 20); // keep only the first 20 hex characters (80 bits) of the digest
+}