Merge pull request #2 from clear-street/skurella/trim-codepoint-indices-when-trimming-line-data

Trim codepoint indices when trimming line data
clear-street · Feb 8, 2024 · 26c0724 · 26c0724
2 parents ee607f5 + 4fb0249
commit 26c0724
Show file tree

Hide file tree

Showing 2 changed files with 99 additions and 16 deletions.
diff --git a/decode.go b/decode.go
@@ -197,20 +197,26 @@ func (d *Decoder) readLine(v reflect.Value) (err error, ok bool) {
 }
 
 func rawValueFromLine(value rawValue, startPos, endPos int, format format) rawValue {
-	var trimFunc func(string) string
+	var trimFunc func(in string) (out string, leftRemoved int, rightRemoved int)
 
 	switch format.alignment {
 	case left:
-		trimFunc = func(s string) string {
-			return strings.TrimRight(s, string(format.padChar))
+		trimFunc = func(s string) (out string, leftRemoved int, rightRemoved int) {
+			out = strings.TrimRight(s, string(format.padChar))
+			return out, 0, len(s) - len(out)
 		}
 	case right:
-		trimFunc = func(s string) string {
-			return strings.TrimLeft(s, string(format.padChar))
+		trimFunc = func(s string) (out string, leftRemoved int, rightRemoved int) {
+			out = strings.TrimLeft(s, string(format.padChar))
+			return out, len(s) - len(out), 0
 		}
 	default:
-		trimFunc = func(s string) string {
-			return strings.Trim(s, string(format.padChar))
+		trimFunc = func(s string) (out string, leftRemoved int, rightRemoved int) {
+			leftTrimmed := strings.TrimLeft(s, string(format.padChar))
+			leftRemoved = len(s) - len(leftTrimmed)
+			rightTrimmed := strings.TrimRight(leftTrimmed, string(format.padChar))
+			rightRemoved = len(leftTrimmed) - len(rightTrimmed)
+			return rightTrimmed, leftRemoved, rightRemoved
 		}
 	}
 
@@ -228,17 +234,34 @@ func rawValueFromLine(value rawValue, startPos, endPos int, format format) rawVa
 			lineData = value.data[relevantIndices[0]:value.codepointIndices[endPos]]
 		}
 
-		// We trimmed data from the front of the string.
-		// We need to adjust the codepoint indices to reflect this, as they have shifted.
-		removedFromFront := relevantIndices[0]
-		newIndices := make([]int, 0, len(relevantIndices))
-		for _, idx := range relevantIndices {
-			newIndices = append(newIndices, idx-removedFromFront)
+		newIndices := relevantIndices
+		if relevantIndices[0] > 0 {
+			// We trimmed data from the front of the string.
+			// We need to adjust the codepoint indices to reflect this, as they have shifted.
+			removedFromFront := relevantIndices[0]
+			newIndices = make([]int, 0, len(relevantIndices))
+			for _, idx := range relevantIndices {
+				newIndices = append(newIndices, idx-removedFromFront)
+			}
+		}
+
+		// Trim the new line data.
+		newLineData, leftRemovedBytes, rightRemovedBytes := trimFunc(lineData)
+		trimmedIndices := newIndices
+		if leftRemovedBytes > 0 || rightRemovedBytes > 0 {
+			// We must trim our codepoint indices list in order to match
+			// the newly trimmed line data string.
+			trimmedIndices = []int{}
+			for _, idx := range newIndices {
+				if idx >= leftRemovedBytes && idx < len(lineData)-rightRemovedBytes {
+					trimmedIndices = append(trimmedIndices, idx-leftRemovedBytes)
+				}
+			}
 		}
 
 		return rawValue{
-			data:             trimFunc(lineData),
-			codepointIndices: newIndices,
+			data:             newLineData,
+			codepointIndices: trimmedIndices,
 		}
 	} else {
 		if len(value.data) == 0 || startPos > len(value.data) {
@@ -247,8 +270,9 @@ func rawValueFromLine(value rawValue, startPos, endPos int, format format) rawVa
 		if endPos > len(value.data) {
 			endPos = len(value.data)
 		}
+		newLineData, _, _ := trimFunc(value.data[startPos-1 : endPos])
 		return rawValue{
-			data: trimFunc(value.data[startPos-1 : endPos]),
+			data: newLineData,
 		}
 	}
 }

diff --git a/decode_test.go b/decode_test.go
@@ -443,6 +443,65 @@ func TestDecodeSetUseCodepointIndices_Nested(t *testing.T) {
 	}
 }
 
+func TestDecodeSetUseCodepointIndices_PaddingTrimmed(t *testing.T) {
+	type Nested struct {
+		First  int64  `fixed:"1,2,right,0"`
+		Second string `fixed:"3,4"`
+		Third  string `fixed:"5,6"`
+		Fourth string `fixed:"7,8"`
+	}
+	type Test struct {
+		First  Nested `fixed:"1,8"`
+		Second string `fixed:"9,10"`
+	}
+
+	for _, tt := range []struct {
+		name     string
+		raw      []byte
+		expected Test
+	}{
+		{
+			name: "All ASCII characters",
+			raw:  []byte("00      11"),
+			expected: Test{
+				First: Nested{
+					First:  0,
+					Second: "",
+					Third:  "",
+					Fourth: "",
+				},
+				Second: "11",
+			},
+		},
+		{
+			name: "Multi-byte characters",
+			raw:  []byte("00      ☃☃"),
+			expected: Test{
+				First: Nested{
+					First:  0,
+					Second: "",
+					Third:  "",
+					Fourth: "",
+				},
+				Second: "☃☃",
+			},
+		},
+	} {
+		t.Run(tt.name, func(t *testing.T) {
+			d := NewDecoder(bytes.NewReader(tt.raw))
+			d.SetUseCodepointIndices(true)
+			var s Test
+			err := d.Decode(&s)
+			if err != nil {
+				t.Errorf("Unexpected err: %v", err)
+			}
+			if !reflect.DeepEqual(tt.expected, s) {
+				t.Errorf("Decode(%v) want %v, have %v", tt.raw, tt.expected, s)
+			}
+		})
+	}
+}
+
 // Verify the behavior of Decoder.Decode at the end of a file. See
 // https://github.com/ianlopshire/go-fixedwidth/issues/6 for more details.
 func TestDecode_EOF(t *testing.T) {