From a63fb35d4a50c2566993be3bf6b882d286f9f303 Mon Sep 17 00:00:00 2001 From: Kara Date: Mon, 9 Sep 2024 12:31:27 -0700 Subject: [PATCH] Properly credit contributors who were truncated by GitHub API (#31739) * Properly credit contributors who were truncated by GitHub's 500 email limit * update list oops forgot my origin was so out of date * another --- .github/workflows/update-credits.yml | 2 + Tools/contribs_shared.ps1 | 4 ++ Tools/dump_github_contributors.ps1 | 90 +++++++++++++++++++++++++++- 3 files changed, 94 insertions(+), 2 deletions(-) diff --git a/.github/workflows/update-credits.yml b/.github/workflows/update-credits.yml index fb3508385e5..849cd6c924c 100644 --- a/.github/workflows/update-credits.yml +++ b/.github/workflows/update-credits.yml @@ -19,6 +19,8 @@ jobs: - name: Get this week's Contributors shell: pwsh + env: + GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} run: Tools/dump_github_contributors.ps1 > Resources/Credits/GitHub.txt # TODO diff --git a/Tools/contribs_shared.ps1 b/Tools/contribs_shared.ps1 index ba97c50a9a3..12340cda703 100644 --- a/Tools/contribs_shared.ps1 +++ b/Tools/contribs_shared.ps1 @@ -9,6 +9,10 @@ $ignore = @{ "PJBot" = $true + "github-actions[bot]" = $true "ZDDM" = $true "TYoung86" = $true + "paul" = $true # erroneously included -- presumably from PaulRitter, somehow, who is already credited + "08a" = $true # erroneously included -- valid github account, but not an actual contributor, probably an alias of a contributor who does not own this github account and is already credited somewhere. + "UristMcContributor" = $true # this was an account used to demonstrate how to create a valid PR, and is in actuality Willhelm53, who is already credited. 
} diff --git a/Tools/dump_github_contributors.ps1 b/Tools/dump_github_contributors.ps1 index 68b1d7f426d..288e003337e 100755 --- a/Tools/dump_github_contributors.ps1 +++ b/Tools/dump_github_contributors.ps1 @@ -3,10 +3,22 @@ $scriptDir = Split-Path -Path $MyInvocation.MyCommand.Definition -Parent . $(join-path $scriptDir contribs_shared.ps1) +if ($null -eq $env:GITHUB_TOKEN) +{ + throw "A GitHub API token is required to run this script properly without being rate limited. If you're a user, generate a personal access token and use that. If you're running this in a GitHub action, make sure you expose the GITHUB_TOKEN secret as an environment variable." +} + function load_contribs([string] $repo) { + # https://developer.github.com/enterprise/2.8/v3/repos/#list-contributors + # We use the ?anon=1 query param for reasons explained later. $qParams = @{ "per_page" = 100 + "anon" = 1 + } + + $headers = @{ + Authorization="Bearer $env:GITHUB_TOKEN" } $url = "https://api.github.com/repos/{0}/contributors" -f $repo @@ -15,7 +27,7 @@ function load_contribs([string] $repo) while ($null -ne $url) { - $resp = Invoke-WebRequest $url -Body $qParams + $resp = Invoke-WebRequest $url -Body $qParams -Headers $headers $url = $resp.RelationLink.next @@ -23,6 +35,80 @@ function load_contribs([string] $repo) $r += $j } + # After collecting all the paginated data, we still aren't done. + # GitHub's API, for some reason, has a hard cap of 500 email addresses per repo which it will collate. + # SS14 has gone past this limit for quite some time, so GitHub will stop including accounts, starting + # with those that have lower contributions, as valid distinct users with a `login` field. + # + # This is obviously a problem. + # To remedy, we first use the ?anon=1 parameter to force GitHub to include all committers' emails, even + # those that it has, in its great and infinite wisdom, chosen to not properly attach to a GitHub account. 
+ # + # Of course, this is normally an issue -- we use this API specifically because we want to only get + # committers with valid GitHub accounts, otherwise we pollute the contributor log with random aliases + # and names that people don't use, things like that. + # + # So, okay, solution: + # 1) Go over our list, and check for ones which only have a `name` and `email` field ('anonymous' contributors) + # and which don't already appear. + # 2) Check to see if the email ends with `@users.noreply.github.com`. + # - To my knowledge, GitHub includes an email in the form of `(numbers)+(username)@users.noreply.github.com` + # - when commits are made using someone's GitHub account, and they aren't attaching another email to their account + # 3) If an email of this form was found, we can assume this is one of the 'missing' contribs and extract their GitHub username. + # 4) If an email of this form -wasn't- found, but they're still anonymous, we -unfortunately- still have to check if they're a valid GitHub user + # because GitHub might have just force-anonymized them anyway! + # + # It's possible their `name` is a valid GitHub user, but that this is a coincidence and they aren't actually a contributor. + # There is not really anything we can do about that! It's not that common though and it's probably more likely to attribute + # correctly than not. + # 5) Then, we just add a `login` field to our object with their true username and let the rest of the code do its job. + + foreach ($contributor in $r) + { + if ($null -ne $contributor.name ` + -And $null -ne $contributor.email ` + -And $contributor.email -match '\d+\+(.*)@users\.noreply\.github\.com$') + { + $username = $Matches.1 + # Use their `name` if it's equivalent to the extracted username, + # since that one will have proper casing. 
Otherwise just let them be a lowercasecel + if ($contributor.name.ToLower() -eq $username) + { + $username = $contributor.name + } + + if (($r).login -contains $username) + { + continue + } + + $contributor | Add-Member -MemberType NoteProperty -Name "login" -Value $username + } + elseif ($null -eq $contributor.login ` + -And $null -ne $contributor.name ` + -And !$contributor.name.Contains(" ")) + { + $username = $contributor.name + # They're an anonymous user, without a GH email, and their name doesn't contain a space + # (since a valid GH username can't have a space) + # Might still be a valid contrib??? + if (($r).login -contains $username) + { + continue + } + + $userUrl = "https://api.github.com/users/{0}" -f $username + + try + { + $userResp = Invoke-WebRequest $userUrl -Headers $headers + $userJ = ConvertFrom-Json $userResp.Content + $contributor | Add-Member -MemberType NoteProperty -Name "login" -Value $userJ.login + } + catch {} # if it 404s do nothing. powershell doesn't seem to really have a simpler way to do this. + } + } + return $r } @@ -34,4 +120,4 @@ $contentJson = load_contribs("DeltaV-Station/Delta-v-rebase") | Where-Object { -not $ignore[$_] }` | ForEach-Object { if($replacements[$_] -eq $null){ $_ } else { $replacements[$_] }} ` | Sort-object ` - | Join-String -Separator ", " + | Join-String -Separator ", " \ No newline at end of file