Skip to content

Commit

Permalink
Follow git logic when parsing patch identities (#44)
Browse files Browse the repository at this point in the history
When GitHub creates patches for Dependabot PRs, it generates a "From:"
line that is not valid according to RFC 5322: the address spec contains
unquoted special characters (the "[bot]" in "dependabot[bot]"). While
the 'net/mail' parser makes some exceptions to the spec, this is not one
of them, so parsing these patch headers fails.

Git's 'mailinfo' command avoids this by only implementing the unquoting
part of RFC 5322 and then applying a heuristic to separate the string in
to name and email values that seem reasonable.

This commit does two things:

1. Reimplements ParsePatchIdentity to follow Git's logic, so that it can
   accept a wider range of inputs, including quoted strings.  Strings
   accepted by the previous implementation parse in the same way with
   one exception: inputs that contain whitespace inside the angle
   brackets for an email address now use the email address as the name
   and drop any separate name component.

2. When parsing mail-formatted patches, use ParsePatchIdentity to parse
   the "From:" line instead of the 'net/mail' function.
  • Loading branch information
bluekeyes authored May 6, 2024
1 parent 3f2ea5c commit a00d2cc
Show file tree
Hide file tree
Showing 4 changed files with 321 additions and 142 deletions.
71 changes: 6 additions & 65 deletions gitdiff/patch_header.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,62 +68,6 @@ func (h *PatchHeader) Message() string {
return msg.String()
}

// PatchIdentity identifies a person who authored or committed a patch.
type PatchIdentity struct {
Name string
Email string
}

func (i PatchIdentity) String() string {
name := i.Name
if name == "" {
name = `""`
}
return fmt.Sprintf("%s <%s>", name, i.Email)
}

// ParsePatchIdentity parses a patch identity string. A valid string contains
// an optional name followed by an email address in angle brackets. The angle
// brackets must always exist, but may enclose an empty address. At least one
// of the name or the email address must be non-empty. If the string only
// contains an email address, that value is also used as the name.
//
// The name must not contain a left angle bracket, '<', and the email address
// must not contain a right angle bracket, '>'. Otherwise, there are no
// restrictions on the format of either field.
func ParsePatchIdentity(s string) (PatchIdentity, error) {
var emailStart, emailEnd int
for i, c := range s {
if c == '<' && emailStart == 0 {
emailStart = i + 1
}
if c == '>' && emailStart > 0 {
emailEnd = i
break
}
}
if emailStart > 0 && emailEnd == 0 {
return PatchIdentity{}, fmt.Errorf("invalid identity string: unclosed email section: %s", s)
}

var name, email string
if emailStart > 0 {
name = strings.TrimSpace(s[:emailStart-1])
}
if emailStart > 0 && emailEnd > 0 {
email = strings.TrimSpace(s[emailStart:emailEnd])
}
if name == "" && email != "" {
name = email
}

if name == "" && email == "" {
return PatchIdentity{}, fmt.Errorf("invalid identity string: %s", s)
}

return PatchIdentity{Name: name, Email: email}, nil
}

// ParsePatchDate parses a patch date string. It returns the parsed time or an
// error if s has an unknown format. ParsePatchDate supports the iso, rfc,
// short, raw, unix, and default formats (with local variants) used by the
Expand Down Expand Up @@ -425,16 +369,13 @@ func parseHeaderMail(mailLine string, r io.Reader, opts patchHeaderOptions) (*Pa
}
}

addrs, err := msg.Header.AddressList("From")
if err != nil && !errors.Is(err, mail.ErrHeaderNotPresent) {
return nil, err
}
if len(addrs) > 0 {
addr := addrs[0]
if addr.Name == "" {
addr.Name = addr.Address
from := msg.Header.Get("From")
if from != "" {
u, err := ParsePatchIdentity(from)
if err != nil {
return nil, err
}
h.Author = &PatchIdentity{Name: addr.Name, Email: addr.Address}
h.Author = &u
}

date := msg.Header.Get("Date")
Expand Down
99 changes: 22 additions & 77 deletions gitdiff/patch_header_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,83 +5,6 @@ import (
"time"
)

func TestParsePatchIdentity(t *testing.T) {
tests := map[string]struct {
Input string
Output PatchIdentity
Err interface{}
}{
"simple": {
Input: "Morton Haypenny <[email protected]>",
Output: PatchIdentity{
Name: "Morton Haypenny",
Email: "[email protected]",
},
},
"extraWhitespace": {
Input: " Morton Haypenny <[email protected] > ",
Output: PatchIdentity{
Name: "Morton Haypenny",
Email: "[email protected]",
},
},
"trailingCharacters": {
Input: "Morton Haypenny <[email protected]> unrelated garbage",
Output: PatchIdentity{
Name: "Morton Haypenny",
Email: "[email protected]",
},
},
"onlyEmail": {
Input: "<[email protected]>",
Output: PatchIdentity{
Name: "[email protected]",
Email: "[email protected]",
},
},
"emptyEmail": {
Input: "Morton Haypenny <>",
Output: PatchIdentity{
Name: "Morton Haypenny",
Email: "",
},
},
"missingEmail": {
Input: "Morton Haypenny",
Err: "invalid identity",
},
"missingNameAndEmptyEmail": {
Input: "<>",
Err: "invalid identity",
},
"empty": {
Input: "",
Err: "invalid identity",
},
"unclosedEmail": {
Input: "Morton Haypenny <[email protected]",
Err: "unclosed email",
},
}

for name, test := range tests {
t.Run(name, func(t *testing.T) {
id, err := ParsePatchIdentity(test.Input)
if test.Err != nil {
assertError(t, test.Err, err, "parsing identity")
return
}
if err != nil {
t.Fatalf("unexpected error parsing identity: %v", err)
}

if test.Output != id {
t.Errorf("incorrect identity: expected %#v, actual %#v", test.Output, id)
}
})
}
}

func TestParsePatchDate(t *testing.T) {
expected := time.Date(2020, 4, 9, 8, 7, 6, 0, time.UTC)

Expand Down Expand Up @@ -349,6 +272,28 @@ Another body line.
Body: expectedBody,
},
},
"mailboxRFC5322SpecialCharacters": {
Input: `From 61f5cd90bed4d204ee3feb3aa41ee91d4734855b Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <12345+dependabot[bot]@users.noreply.github.com>
Date: Sat, 11 Apr 2020 15:21:23 -0700
Subject: [PATCH] A sample commit to test header parsing
The medium format shows the body, which
may wrap on to multiple lines.
Another body line.
`,
Header: PatchHeader{
SHA: expectedSHA,
Author: &PatchIdentity{
Name: "dependabot[bot]",
Email: "12345+dependabot[bot]@users.noreply.github.com",
},
AuthorDate: expectedDate,
Title: expectedTitle,
Body: expectedBody,
},
},
"mailboxAppendix": {
Input: `From 61f5cd90bed4d204ee3feb3aa41ee91d4734855b Mon Sep 17 00:00:00 2001
From: Morton Haypenny <[email protected]>
Expand Down
166 changes: 166 additions & 0 deletions gitdiff/patch_identity.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
package gitdiff

import (
"fmt"
"strings"
)

// PatchIdentity identifies a person who authored or committed a patch.
type PatchIdentity struct {
Name string
Email string
}

func (i PatchIdentity) String() string {
name := i.Name
if name == "" {
name = `""`
}
return fmt.Sprintf("%s <%s>", name, i.Email)
}

// ParsePatchIdentity parses a patch identity string. A patch identity contains
// an email address and an optional name in [RFC 5322] format. This is either a
// plain email adddress or a name followed by an address in angle brackets:
//
// [email protected]
// Author Name <[email protected]>
//
// If the input is not one of these formats, ParsePatchIdentity applies a
// heuristic to separate the name and email portions. If both the name and
// email are missing or empty, ParsePatchIdentity returns an error. It
// otherwise does not validate the result.
//
// [RFC 5322]: https://datatracker.ietf.org/doc/html/rfc5322
func ParsePatchIdentity(s string) (PatchIdentity, error) {
s = normalizeSpace(s)
s = unquotePairs(s)

var name, email string
if at := strings.IndexByte(s, '@'); at >= 0 {
start, end := at, at
for start >= 0 && !isRFC5332Space(s[start]) && s[start] != '<' {
start--
}
for end < len(s) && !isRFC5332Space(s[end]) && s[end] != '>' {
end++
}
email = s[start+1 : end]

// Adjust the boundaries so that we drop angle brackets, but keep
// spaces when removing the email to form the name.
if start < 0 || s[start] != '<' {
start++
}
if end >= len(s) || s[end] != '>' {
end--
}
name = s[:start] + s[end+1:]
} else {
start, end := 0, 0
for i := 0; i < len(s); i++ {
if s[i] == '<' && start == 0 {
start = i + 1
}
if s[i] == '>' && start > 0 {
end = i
break
}
}
if start > 0 && end >= start {
email = strings.TrimSpace(s[start:end])
name = s[:start-1]
}
}

// After extracting the email, the name might contain extra whitespace
// again and may be surrounded by comment characters. The git source gives
// these examples of when this can happen:
//
// "Name <email@domain>"
// "email@domain (Name)"
// "Name <email@domain> (Comment)"
//
name = normalizeSpace(name)
if strings.HasPrefix(name, "(") && strings.HasSuffix(name, ")") {
name = name[1 : len(name)-1]
}
name = strings.TrimSpace(name)

// If the name is empty or contains email-like characters, use the email
// instead (assuming one exists)
if name == "" || strings.ContainsAny(name, "@<>") {
name = email
}

if name == "" && email == "" {
return PatchIdentity{}, fmt.Errorf("invalid identity string %q", s)
}
return PatchIdentity{Name: name, Email: email}, nil
}

// unquotePairs process the RFC5322 tokens "quoted-string" and "comment" to
// remove any "quoted-pairs" (backslash-espaced characters). It also removes
// the quotes from any quoted strings, but leaves the comment delimiters.
func unquotePairs(s string) string {
quote := false
comments := 0
escaped := false

var out strings.Builder
for i := 0; i < len(s); i++ {
if escaped {
escaped = false
} else {
switch s[i] {
case '\\':
// quoted-pair is only allowed in quoted-string/comment
if quote || comments > 0 {
escaped = true
continue // drop '\' character
}

case '"':
if comments == 0 {
quote = !quote
continue // drop '"' character
}

case '(':
if !quote {
comments++
}
case ')':
if comments > 0 {
comments--
}
}
}
out.WriteByte(s[i])
}
return out.String()
}

// normalizeSpace trims leading and trailing whitespace from s and converts
// inner sequences of one or more whitespace characters to single spaces.
func normalizeSpace(s string) string {
var sb strings.Builder
for i := 0; i < len(s); i++ {
c := s[i]
if !isRFC5332Space(c) {
if sb.Len() > 0 && isRFC5332Space(s[i-1]) {
sb.WriteByte(' ')
}
sb.WriteByte(c)
}
}
return sb.String()
}

func isRFC5332Space(c byte) bool {
switch c {
case '\t', '\n', '\r', ' ':
return true
}
return false
}
Loading

0 comments on commit a00d2cc

Please sign in to comment.