Follow git logic when parsing patch identities (#44)

When GitHub creates patches for Dependabot PRs, it generates a "From:" line that is not valid according to RFC 5322: the address spec contains unquoted special characters (the "[bot]" in "dependabot[bot]"). While the 'net/mail' parser makes some exceptions to the spec, this is not one of them, so parsing these patch headers fails. Git's 'mailinfo' command avoids this by only implementing the unquoting part of RFC 5322 and then applying a heuristic to separate the string in to name and email values that seem reasonable. This commit does two things: 1. Reimplements ParsePatchIdentity to follow Git's logic, so that it can accept a wider range of inputs, including quoted strings. Strings accepted by the previous implementation parse in the same way with one exception: inputs that contain whitespace inside the angle brackets for an email address now use the email address as the name and drop any separate name component. 2. When parsing mail-formatted patches, use ParsePatchIdentity to parse the "From:" line instead of the 'net/mail' function.
bluekeyes · May 6, 2024 · a00d2cc · a00d2cc
1 parent 3f2ea5c
commit a00d2cc
Show file tree

Hide file tree

Showing 4 changed files with 321 additions and 142 deletions.
diff --git a/gitdiff/patch_header.go b/gitdiff/patch_header.go
@@ -68,62 +68,6 @@ func (h *PatchHeader) Message() string {
 	return msg.String()
 }
 
-// PatchIdentity identifies a person who authored or committed a patch.
-type PatchIdentity struct {
-	Name  string
-	Email string
-}
-
-func (i PatchIdentity) String() string {
-	name := i.Name
-	if name == "" {
-		name = `""`
-	}
-	return fmt.Sprintf("%s <%s>", name, i.Email)
-}
-
-// ParsePatchIdentity parses a patch identity string. A valid string contains
-// an optional name followed by an email address in angle brackets. The angle
-// brackets must always exist, but may enclose an empty address. At least one
-// of the name or the email address must be non-empty. If the string only
-// contains an email address, that value is also used as the name.
-//
-// The name must not contain a left angle bracket, '<', and the email address
-// must not contain a right angle bracket, '>'. Otherwise, there are no
-// restrictions on the format of either field.
-func ParsePatchIdentity(s string) (PatchIdentity, error) {
-	var emailStart, emailEnd int
-	for i, c := range s {
-		if c == '<' && emailStart == 0 {
-			emailStart = i + 1
-		}
-		if c == '>' && emailStart > 0 {
-			emailEnd = i
-			break
-		}
-	}
-	if emailStart > 0 && emailEnd == 0 {
-		return PatchIdentity{}, fmt.Errorf("invalid identity string: unclosed email section: %s", s)
-	}
-
-	var name, email string
-	if emailStart > 0 {
-		name = strings.TrimSpace(s[:emailStart-1])
-	}
-	if emailStart > 0 && emailEnd > 0 {
-		email = strings.TrimSpace(s[emailStart:emailEnd])
-	}
-	if name == "" && email != "" {
-		name = email
-	}
-
-	if name == "" && email == "" {
-		return PatchIdentity{}, fmt.Errorf("invalid identity string: %s", s)
-	}
-
-	return PatchIdentity{Name: name, Email: email}, nil
-}
-
 // ParsePatchDate parses a patch date string. It returns the parsed time or an
 // error if s has an unknown format. ParsePatchDate supports the iso, rfc,
 // short, raw, unix, and default formats (with local variants) used by the
@@ -425,16 +369,13 @@ func parseHeaderMail(mailLine string, r io.Reader, opts patchHeaderOptions) (*Pa
 		}
 	}
 
-	addrs, err := msg.Header.AddressList("From")
-	if err != nil && !errors.Is(err, mail.ErrHeaderNotPresent) {
-		return nil, err
-	}
-	if len(addrs) > 0 {
-		addr := addrs[0]
-		if addr.Name == "" {
-			addr.Name = addr.Address
+	from := msg.Header.Get("From")
+	if from != "" {
+		u, err := ParsePatchIdentity(from)
+		if err != nil {
+			return nil, err
 		}
-		h.Author = &PatchIdentity{Name: addr.Name, Email: addr.Address}
+		h.Author = &u
 	}
 
 	date := msg.Header.Get("Date")

diff --git a/gitdiff/patch_header_test.go b/gitdiff/patch_header_test.go
@@ -5,83 +5,6 @@ import (
 	"time"
 )
 
-func TestParsePatchIdentity(t *testing.T) {
-	tests := map[string]struct {
-		Input  string
-		Output PatchIdentity
-		Err    interface{}
-	}{
-		"simple": {
-			Input: "Morton Haypenny <[email protected]>",
-			Output: PatchIdentity{
-				Name:  "Morton Haypenny",
-				Email: "[email protected]",
-			},
-		},
-		"extraWhitespace": {
-			Input: "   Morton Haypenny  <[email protected]  >  ",
-			Output: PatchIdentity{
-				Name:  "Morton Haypenny",
-				Email: "[email protected]",
-			},
-		},
-		"trailingCharacters": {
-			Input: "Morton Haypenny <[email protected]> unrelated garbage",
-			Output: PatchIdentity{
-				Name:  "Morton Haypenny",
-				Email: "[email protected]",
-			},
-		},
-		"onlyEmail": {
-			Input: "<[email protected]>",
-			Output: PatchIdentity{
-				Name:  "[email protected]",
-				Email: "[email protected]",
-			},
-		},
-		"emptyEmail": {
-			Input: "Morton Haypenny <>",
-			Output: PatchIdentity{
-				Name:  "Morton Haypenny",
-				Email: "",
-			},
-		},
-		"missingEmail": {
-			Input: "Morton Haypenny",
-			Err:   "invalid identity",
-		},
-		"missingNameAndEmptyEmail": {
-			Input: "<>",
-			Err:   "invalid identity",
-		},
-		"empty": {
-			Input: "",
-			Err:   "invalid identity",
-		},
-		"unclosedEmail": {
-			Input: "Morton Haypenny <[email protected]",
-			Err:   "unclosed email",
-		},
-	}
-
-	for name, test := range tests {
-		t.Run(name, func(t *testing.T) {
-			id, err := ParsePatchIdentity(test.Input)
-			if test.Err != nil {
-				assertError(t, test.Err, err, "parsing identity")
-				return
-			}
-			if err != nil {
-				t.Fatalf("unexpected error parsing identity: %v", err)
-			}
-
-			if test.Output != id {
-				t.Errorf("incorrect identity: expected %#v, actual %#v", test.Output, id)
-			}
-		})
-	}
-}
-
 func TestParsePatchDate(t *testing.T) {
 	expected := time.Date(2020, 4, 9, 8, 7, 6, 0, time.UTC)
 
@@ -349,6 +272,28 @@ Another body line.
 				Body:       expectedBody,
 			},
 		},
+		"mailboxRFC5322SpecialCharacters": {
+			Input: `From 61f5cd90bed4d204ee3feb3aa41ee91d4734855b Mon Sep 17 00:00:00 2001
+From: "dependabot[bot]" <12345+dependabot[bot]@users.noreply.github.com>
+Date: Sat, 11 Apr 2020 15:21:23 -0700
+Subject: [PATCH] A sample commit to test header parsing
+
+The medium format shows the body, which
+may wrap on to multiple lines.
+
+Another body line.
+`,
+			Header: PatchHeader{
+				SHA: expectedSHA,
+				Author: &PatchIdentity{
+					Name:  "dependabot[bot]",
+					Email: "12345+dependabot[bot]@users.noreply.github.com",
+				},
+				AuthorDate: expectedDate,
+				Title:      expectedTitle,
+				Body:       expectedBody,
+			},
+		},
 		"mailboxAppendix": {
 			Input: `From 61f5cd90bed4d204ee3feb3aa41ee91d4734855b Mon Sep 17 00:00:00 2001
 From: Morton Haypenny <[email protected]>

diff --git a/gitdiff/patch_identity.go b/gitdiff/patch_identity.go
@@ -0,0 +1,166 @@
+package gitdiff
+
+import (
+	"fmt"
+	"strings"
+)
+
+// PatchIdentity identifies a person who authored or committed a patch.
+type PatchIdentity struct {
+	Name  string
+	Email string
+}
+
+func (i PatchIdentity) String() string {
+	name := i.Name
+	if name == "" {
+		name = `""`
+	}
+	return fmt.Sprintf("%s <%s>", name, i.Email)
+}
+
+// ParsePatchIdentity parses a patch identity string. A patch identity contains
+// an email address and an optional name in [RFC 5322] format. This is either a
+// plain email adddress or a name followed by an address in angle brackets:
+//
+//	[email protected]
+//	Author Name <[email protected]>
+//
+// If the input is not one of these formats, ParsePatchIdentity applies a
+// heuristic to separate the name and email portions. If both the name and
+// email are missing or empty, ParsePatchIdentity returns an error. It
+// otherwise does not validate the result.
+//
+// [RFC 5322]: https://datatracker.ietf.org/doc/html/rfc5322
+func ParsePatchIdentity(s string) (PatchIdentity, error) {
+	s = normalizeSpace(s)
+	s = unquotePairs(s)
+
+	var name, email string
+	if at := strings.IndexByte(s, '@'); at >= 0 {
+		start, end := at, at
+		for start >= 0 && !isRFC5332Space(s[start]) && s[start] != '<' {
+			start--
+		}
+		for end < len(s) && !isRFC5332Space(s[end]) && s[end] != '>' {
+			end++
+		}
+		email = s[start+1 : end]
+
+		// Adjust the boundaries so that we drop angle brackets, but keep
+		// spaces when removing the email to form the name.
+		if start < 0 || s[start] != '<' {
+			start++
+		}
+		if end >= len(s) || s[end] != '>' {
+			end--
+		}
+		name = s[:start] + s[end+1:]
+	} else {
+		start, end := 0, 0
+		for i := 0; i < len(s); i++ {
+			if s[i] == '<' && start == 0 {
+				start = i + 1
+			}
+			if s[i] == '>' && start > 0 {
+				end = i
+				break
+			}
+		}
+		if start > 0 && end >= start {
+			email = strings.TrimSpace(s[start:end])
+			name = s[:start-1]
+		}
+	}
+
+	// After extracting the email, the name might contain extra whitespace
+	// again and may be surrounded by comment characters. The git source gives
+	// these examples of when this can happen:
+	//
+	//   "Name <email@domain>"
+	//   "email@domain (Name)"
+	//   "Name <email@domain> (Comment)"
+	//
+	name = normalizeSpace(name)
+	if strings.HasPrefix(name, "(") && strings.HasSuffix(name, ")") {
+		name = name[1 : len(name)-1]
+	}
+	name = strings.TrimSpace(name)
+
+	// If the name is empty or contains email-like characters, use the email
+	// instead (assuming one exists)
+	if name == "" || strings.ContainsAny(name, "@<>") {
+		name = email
+	}
+
+	if name == "" && email == "" {
+		return PatchIdentity{}, fmt.Errorf("invalid identity string %q", s)
+	}
+	return PatchIdentity{Name: name, Email: email}, nil
+}
+
+// unquotePairs process the RFC5322 tokens "quoted-string" and "comment" to
+// remove any "quoted-pairs" (backslash-espaced characters). It also removes
+// the quotes from any quoted strings, but leaves the comment delimiters.
+func unquotePairs(s string) string {
+	quote := false
+	comments := 0
+	escaped := false
+
+	var out strings.Builder
+	for i := 0; i < len(s); i++ {
+		if escaped {
+			escaped = false
+		} else {
+			switch s[i] {
+			case '\\':
+				// quoted-pair is only allowed in quoted-string/comment
+				if quote || comments > 0 {
+					escaped = true
+					continue // drop '\' character
+				}
+
+			case '"':
+				if comments == 0 {
+					quote = !quote
+					continue // drop '"' character
+				}
+
+			case '(':
+				if !quote {
+					comments++
+				}
+			case ')':
+				if comments > 0 {
+					comments--
+				}
+			}
+		}
+		out.WriteByte(s[i])
+	}
+	return out.String()
+}
+
+// normalizeSpace trims leading and trailing whitespace from s and converts
+// inner sequences of one or more whitespace characters to single spaces.
+func normalizeSpace(s string) string {
+	var sb strings.Builder
+	for i := 0; i < len(s); i++ {
+		c := s[i]
+		if !isRFC5332Space(c) {
+			if sb.Len() > 0 && isRFC5332Space(s[i-1]) {
+				sb.WriteByte(' ')
+			}
+			sb.WriteByte(c)
+		}
+	}
+	return sb.String()
+}
+
+func isRFC5332Space(c byte) bool {
+	switch c {
+	case '\t', '\n', '\r', ' ':
+		return true
+	}
+	return false
+}