From 96ff67d75604c0792f7bc54286a6d662b35cbd17 Mon Sep 17 00:00:00 2001
From: Grace Park
Date: Fri, 1 Oct 2021 10:23:58 -0700
Subject: [PATCH] Workflow to test docs.github.com links on github/github (#21601)

* run prettier

* removing files from test/integration

* update cron job to not start on the hour

* add github/github comment

* update comment

* updating to search by indices in the content rather than by line

* small updates and updating character max

* update name of script run

* updating to use api search code and get contents

* using api search code and get contents

* adding token check and .js

* remove blank line for title

* update issue body content

* update comment

* add support for GitHub.developer_help_url links
---
 .../check-broken-links-github-github.yml      |  75 +++++++++
 script/check-github-github-links.js           | 153 ++++++++++++++++++
 script/helpers/git-utils.js                   |  52 ++++++
 3 files changed, 280 insertions(+)
 create mode 100644 .github/workflows/check-broken-links-github-github.yml
 create mode 100755 script/check-github-github-links.js

diff --git a/.github/workflows/check-broken-links-github-github.yml b/.github/workflows/check-broken-links-github-github.yml
new file mode 100644
index 000000000000..f89ef6b9b6b4
--- /dev/null
+++ b/.github/workflows/check-broken-links-github-github.yml
@@ -0,0 +1,75 @@
+name: Check Broken Docs Links in github/github
+
+# **What it does**: This checks for any broken docs.github.com links in github/github
+# **Why we have it**: Make sure all docs in github/github are up to date
+# **Who does it impact**: Docs engineering, people on GitHub
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '20 13 * * 1' # run every Monday at 1:20PM UTC
+
+# **IMPORTANT:** Do not change the FREEZE environment variable set here!
+# This workflow runs on a recurring basis. To temporarily disable it (e.g.,
+# during a docs deployment freeze), add an Actions Secret to the repo settings
+# called `FREEZE` with a value of `true`. To re-enable workflow, simply
+# delete that Secret from the repo settings. The environment variable here
+# will duplicate that Secret's value for later evaluation.
+env:
+  FREEZE: ${{ secrets.FREEZE }}
+
+jobs:
+  check_github_github_links:
+    if: github.repository == 'github/docs-internal'
+    runs-on: ubuntu-latest
+    env:
+      # need to use a token from a user with access to github/github for this step
+      GITHUB_TOKEN: ${{ secrets.DOCS_BOT_FR }}
+      FIRST_RESPONDER_PROJECT: Docs content first responder
+      REPORT_AUTHOR: docubot
+      REPORT_LABEL: github github broken link report
+      REPORT_REPOSITORY: github/docs-content
+    steps:
+      - if: ${{ env.FREEZE == 'true' }}
+        run: |
+          echo 'The repo is currently frozen! Exiting this workflow.'
+          exit 1 # prevents further steps from running
+
+      - name: Checkout
+        uses: actions/checkout@5a4ac9002d0be2fb38bd78e4b4dbde5606d7042f
+
+      - name: Setup Node
+        uses: actions/setup-node@38d90ce44d5275ad62cc48384b3d8a58c500bb5f
+        with:
+          node-version: 16.8.x
+          cache: npm
+
+      - name: Install Node.js dependencies
+        run: npm ci
+
+      - name: Run broken github/github link check
+        run: |
+          script/check-github-github-links.js > broken_github_github_links.md
+
+      # check-github-github-links.js returns 0 if no links are broken, and 1 if any links
+      # are broken. When an Actions step's exit code is 1, the action run's job status
+      # is failure and the run ends. The following steps create an issue for the
+      # broken link report only if any links are broken, so `if: ${{ failure() }}`
+      # ensures the steps run despite the previous step's failure of the job.
+      #
+      # https://docs.github.com/actions/reference/context-and-expression-syntax-for-github-actions#job-status-check-functions
+
+      - if: ${{ failure() }}
+        name: Get title for issue
+        id: check
+        run: echo "::set-output name=title::$(head -1 broken_github_github_links.md)"
+      - if: ${{ failure() }}
+        name: Create issue from file
+        id: github-github-broken-link-report
+        uses: peter-evans/create-issue-from-file@b4f9ee0a9d4abbfc6986601d9b1a4f8f8e74c77e
+        with:
+          token: ${{ env.GITHUB_TOKEN }}
+          title: ${{ steps.check.outputs.title }}
+          content-filepath: ./broken_github_github_links.md
+          repository: ${{ env.REPORT_REPOSITORY }}
+          labels: ${{ env.REPORT_LABEL }}
diff --git a/script/check-github-github-links.js b/script/check-github-github-links.js
new file mode 100755
index 000000000000..311f30481cc8
--- /dev/null
+++ b/script/check-github-github-links.js
@@ -0,0 +1,153 @@
+#!/usr/bin/env node
+
+// [start-readme]
+//
+// Run this script to get all broken docs.github.com links in github/github
+//
+// [end-readme]
+
+import { getContents, getPathsWithMatchingStrings } from './helpers/git-utils.js'
+import got from 'got'
+
+if (!process.env.GITHUB_TOKEN) {
+  console.error('Error! You must have a GITHUB_TOKEN set in an .env file to run this script.')
+  process.exit(1)
+}
+
+// Matches a full URL; used to pull the complete docs.github.com link out of file contents
+const urlRegEx =
+  /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&//=]*)/g
+
+// Matches the first character or token that ends the path following GitHub.help_url
+// or GitHub.developer_help_url
+const helpLinkEndRegEx =
+  /\n|"\)|{@email_tracking_params}|\^http|Ahttps|example|This|TODO"|[{}|"%><.,')* ]/
+
+main().catch((err) => {
+  // Report unexpected failures (e.g. API errors) on stderr so they never end up
+  // in the report written to stdout, and exit unsuccessfully.
+  console.error(err)
+  process.exit(1)
+})
+
+// Find every docs.github.com link in github/github, request each one, and print a
+// report of the links that fail. Exits 0 if all links are good, 1 otherwise.
+async function main() {
+  const searchStrings = ['https://docs.github.com', 'GitHub help_url', 'GitHub developer_help_url']
+  const foundFiles = await getPathsWithMatchingStrings(searchStrings, 'github', 'github')
+  const searchFiles = [...foundFiles]
+    .filter((file) => endsWithAny(['.rb', '.yml', '.yaml', '.txt', '.pdf', '.erb', '.js'], file))
+    .filter(
+      (file) =>
+        !file.includes('test/') &&
+        !file.includes('app/views/') &&
+        !file.includes('config.') &&
+        !file.includes('app/api/description/')
+    )
+
+  // Pairs of [link, file the link was found in]
+  const docsLinksFiles = []
+
+  for (const file of searchFiles) {
+    const contents = await getContents('github', 'github', 'master', file)
+
+    for (const numIndex of getIndicesOf('https://docs.github.com', contents)) {
+      // Assuming we don't have links close to 500 characters long
+      const docsLink = contents.substring(numIndex, numIndex + 500).match(urlRegEx)
+      docsLinksFiles.push([docsLink[0].toString().replace(/[^a-zA-Z0-9]*$|\\n$/g, ''), file])
+    }
+
+    const helpIndices = [
+      ...getIndicesOf('GitHub.help_url', contents),
+      ...getIndicesOf('GitHub.developer_help_url', contents),
+    ]
+    for (const numIndex of helpIndices) {
+      const helpLink = getHelpLink(contents, numIndex)
+      if (helpLink !== null) docsLinksFiles.push([helpLink, file])
+    }
+  }
+
+  const brokenLinks = []
+  await Promise.all(
+    docsLinksFiles.map(async (file) => {
+      try {
+        await got(file[0])
+      } catch {
+        brokenLinks.push(file)
+      }
+    })
+  )
+  if (!brokenLinks.length) {
+    console.log('All links are good!')
+    process.exit(0)
+  }
+
+  console.log(`Found ${brokenLinks.length} total broken links in github/github`)
+  console.log('```')
+
+  console.log(`${JSON.stringify([...brokenLinks], null, 2)}`)
+
+  console.log('```')
+  // Exit unsuccessfully if broken links are found.
+  process.exit(1)
+}
+
+// Build the docs.github.com link for the path that follows the GitHub.help_url or
+// GitHub.developer_help_url occurrence starting at `numIndex` in `contents`.
+// Returns null when the occurrence should be skipped.
+function getHelpLink(contents, numIndex) {
+  // There are certain links like #{GitHub.help_url}#{learn_more_path} and
+  // #{GitHub.developer_help_url}#{learn_more_path} that we should skip
+  const isHelpUrl =
+    contents.startsWith('GitHub.help', numIndex) && contents.charAt(numIndex + 16) !== '#'
+  const isDeveloperHelpUrl =
+    contents.startsWith('GitHub.developer', numIndex) && contents.charAt(numIndex + 26) !== '#'
+  if (!isHelpUrl && !isDeveloperHelpUrl) return null
+
+  // Looking for the closest '/' after GitHub.developer_help_url or GitHub.help_url
+  // There are certain links that don't start with `/` so we want to skip those.
+  // If there's no `/` at all (indexOf returns -1) or none within 30 characters of
+  // GitHub.help_url/GitHub.developer_help_url, skip
+  const startSearchIndex = contents.indexOf('/', numIndex)
+  if (startSearchIndex === -1 || startSearchIndex - numIndex >= 30) return null
+
+  const endSearchIndex = regexIndexOf(contents, helpLinkEndRegEx, startSearchIndex + 1)
+  // No terminator found means the path runs to the end of the file. Passing -1 to
+  // substring() would instead return everything *before* the path.
+  const path = contents.substring(
+    startSearchIndex,
+    endSearchIndex === -1 ? contents.length : endSearchIndex
+  )
+
+  return `https://docs.github.com${path.trim()}`
+}
+
+// Return true if `string` ends with any of the given suffixes
+function endsWithAny(suffixes, string) {
+  return suffixes.some((suffix) => string.endsWith(suffix))
+}
+
+// Return the start index of every non-overlapping occurrence of `searchString` in `string`
+function getIndicesOf(searchString, string) {
+  const searchStrLen = searchString.length
+  if (searchStrLen === 0) return []
+
+  let startIndex = 0
+  let index
+  const indices = []
+
+  while ((index = string.indexOf(searchString, startIndex)) > -1) {
+    indices.push(index)
+    startIndex = index + searchStrLen
+  }
+
+  return indices
+}
+
+// Like String.prototype.search, but starts looking at `startPos` and returns an index
+// relative to the whole string (or -1 if there is no match)
+function regexIndexOf(string, regex, startPos = 0) {
+  const indexOf = string.substring(startPos).search(regex)
+
+  return indexOf >= 0 ? indexOf + startPos : indexOf
+}
diff --git a/script/helpers/git-utils.js b/script/helpers/git-utils.js
index 45bb66a043e8..f6baf7ff1f7f 100644
--- a/script/helpers/git-utils.js
+++ b/script/helpers/git-utils.js
@@ -126,3 +126,55 @@ export async function createIssueComment(owner, repo, pullNumber, body) {
     throw err
   }
 }
+
+// Search the code of `org/repo` for each string in `strArr` and return a Set of the
+// paths of the files that contain at least one of them
+export async function getPathsWithMatchingStrings(strArr, org, repo) {
+  const perPage = 100
+  const paths = new Set()
+
+  for (const str of strArr) {
+    try {
+      // NOTE(review): this whole string is sent as the value of the `q` parameter, so the
+      // leading `q=` and the `+` separators look like part of the search text - confirm
+      const q = `q=${str}+in:file+repo:${org}/${repo}`
+      let currentPage = 1
+      let totalCount = 0
+      let currentCount = 0
+
+      do {
+        const data = await searchCode(q, perPage, currentPage)
+        // An empty page means there is nothing left to fetch even if total_count says
+        // otherwise; stop here rather than requesting pages forever
+        if (data.items.length === 0) break
+
+        for (const el of data.items) paths.add(el.path)
+        totalCount = data.total_count
+        currentCount += data.items.length
+        currentPage++
+      } while (currentCount < totalCount)
+    } catch (err) {
+      // Log to stderr: callers may redirect stdout into a report file
+      console.error(`error searching for ${str} in ${org}/${repo}`)
+      throw err
+    }
+  }
+
+  return paths
+}
+
+// Fetch one page of code search results for the query `q`
+async function searchCode(q, perPage, currentPage) {
+  try {
+    const { data } = await github.rest.search.code({
+      q,
+      per_page: perPage,
+      page: currentPage,
+    })
+
+    return data
+  } catch (err) {
+    console.error(`error searching for ${q} in code`)
+    throw err
+  }
+}