Java: Stop breaking surrogate pairs in toDelta()

Resolves google#69 for Java. Sometimes we can find a common prefix that runs into the middle of a surrogate pair and we split that pair when building our diff groups. This is fine as long as we are operating on UTF-16 code units. It becomes problematic when we start trying to treat those substrings as valid Unicode (or UTF-8) sequences. When we pass these split groups into `toDelta()` we do just that and the library crashes. In this patch we're post-processing the diff groups before encoding them to make sure that we un-split the surrogate pairs. The post-processed diffs should produce the same output when applying the diffs. The diff string itself will be different but should not change that much - only by a single character at surrogate boundaries.
dmsnell · Jan 30, 2024 · 13309a1 · 13309a1
1 parent 143d61d
commit 13309a1
Show file tree

Hide file tree

Showing 2 changed files with 177 additions and 8 deletions.
diff --git a/java/src/name/fraser/neil/plaintext/diff_match_patch.java b/java/src/name/fraser/neil/plaintext/diff_match_patch.java
@@ -19,6 +19,7 @@
 package name.fraser.neil.plaintext;
 
 import java.io.UnsupportedEncodingException;
+import java.lang.Character;
 import java.net.URLDecoder;
 import java.net.URLEncoder;
 import java.util.*;
@@ -1293,6 +1294,46 @@ public void diff_cleanupMerge(LinkedList<Diff> diffs) {
     }
   }
 
+  /**
+   * Rearrange diff boudnaries that split Unicode surrogate pairs.
+   * @param diffs Linked list of diff objects
+   */
+  public void diff_cleanupSplitSurrogates(List<Diff> diffs) {
+    char lastEnd = 0;
+    boolean isFirst = true;
+    HashSet<Diff> toRemove = new HashSet<Diff>();
+
+    for (Diff aDiff : diffs) {
+      if (aDiff.text.isEmpty()) {
+        toRemove.add(aDiff);
+        continue;
+      }
+
+      char thisTop = aDiff.text.charAt(0);
+      char thisEnd = aDiff.text.charAt(aDiff.text.length() - 1);
+
+      if (Character.isHighSurrogate(thisEnd)) {
+        lastEnd = thisEnd;
+        aDiff.text = aDiff.text.substring(0, aDiff.text.length() - 1);
+      }
+
+      if (!isFirst && Character.isHighSurrogate(lastEnd) && Character.isLowSurrogate(thisTop)) {
+        aDiff.text = lastEnd + aDiff.text;
+      }
+
+      isFirst = false;
+
+      if ( aDiff.text.isEmpty() ) {
+        toRemove.add(aDiff);
+        continue;
+      }
+    }
+
+    for (Diff aDiff : toRemove) {
+      diffs.remove(aDiff);
+    }
+  }
+
   /**
    * loc is a location in text1, compute and return the equivalent location in
    * text2.
@@ -1429,6 +1470,7 @@ public int diff_levenshtein(List<Diff> diffs) {
    */
   public String diff_toDelta(List<Diff> diffs) {
     StringBuilder text = new StringBuilder();
+    this.diff_cleanupSplitSurrogates(diffs);
     for (Diff aDiff : diffs) {
       switch (aDiff.operation) {
       case INSERT:
@@ -1457,6 +1499,103 @@ public String diff_toDelta(List<Diff> diffs) {
     return delta;
   }
 
+  /**
+   * Convert one ASCII hexadecimal digit to its numeric value.
+   *
+   * @param b Character to convert; '0'-'9', 'A'-'F' and 'a'-'f' are accepted.
+   * @return Value of the digit, from 0 to 15.
+   * @throws IllegalArgumentException If b is not one of the accepted characters.
+   */
+  private int digit16(char b) throws IllegalArgumentException {
+    if (b >= '0' && b <= '9') {
+      return b - '0';
+    }
+    if (b >= 'A' && b <= 'F') {
+      return b - 'A' + 10;
+    }
+    if (b >= 'a' && b <= 'f') {
+      return b - 'a' + 10;
+    }
+    throw new IllegalArgumentException();
+  }
+
+  /**
+   * Decode a string in which '%XX' escapes spell out UTF-8 byte sequences.
+   *
+   * Characters other than '%' are copied through unchanged. Three-byte
+   * sequences that decode to a lone surrogate are accepted, and four-byte
+   * sequences are emitted as a UTF-16 surrogate pair.
+   *
+   * @param text Percent-encoded text.
+   * @return The decoded text.
+   * @throws IllegalArgumentException If an escape is truncated, contains a
+   *     non-hexadecimal digit, or does not form a UTF-8 sequence this decoder
+   *     accepts.
+   */
+  private String decodeURI(String text) throws IllegalArgumentException {
+    int length = text.length();
+    StringBuilder decoded = new StringBuilder(length);
+    int i = 0;
+
+    while (i < length) {
+      char literal = text.charAt(i);
+      if (literal != '%') {
+        decoded.append(literal);
+        i++;
+        continue;
+      }
+
+      // Start of a percent-sequence: the lead byte decides its length.
+      int byte1 = decodeURIByte(text, i);
+      if ((byte1 & 0x80) == 0) {
+        // 0xxxxxxx: single byte.
+        decoded.append((char) byte1);
+        i += 3;
+        continue;
+      }
+
+      int byte2 = decodeURIContinuation(text, i + 3);
+      if ((byte1 & 0xE0) == 0xC0) {
+        // 110xxxxx 10xxxxxx
+        decoded.append((char) (((byte1 & 0x1F) << 6) | byte2));
+        i += 6;
+        continue;
+      }
+
+      int byte3 = decodeURIContinuation(text, i + 6);
+      if ((byte1 & 0xF0) == 0xE0) {
+        // 1110xxxx 10xxxxxx 10xxxxxx; unpaired surrogates are fine here.
+        decoded.append((char) (((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3));
+        i += 9;
+        continue;
+      }
+
+      int byte4 = decodeURIContinuation(text, i + 9);
+      if ((byte1 & 0xF8) == 0xF0) {
+        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        int codePoint = ((byte1 & 0x07) << 18) | (byte2 << 12) | (byte3 << 6) | byte4;
+        if (Character.isSupplementaryCodePoint(codePoint)) {
+          // appendCodePoint subtracts 0x10000 before splitting into
+          // surrogates, which is required for code points >= U+20000.
+          decoded.appendCodePoint(codePoint);
+          i += 12;
+          continue;
+        }
+      }
+
+      throw new IllegalArgumentException("Invalid UTF-8 percent-sequence at index " + i);
+    }
+
+    return decoded.toString();
+  }
+
+  /**
+   * Read the '%XX' escape starting at index at and return its byte value.
+   * A missing '%' or an escape running past the end of the text is reported
+   * as IllegalArgumentException rather than an index error.
+   */
+  private int decodeURIByte(String text, int at) throws IllegalArgumentException {
+    if (at + 2 >= text.length() || text.charAt(at) != '%') {
+      throw new IllegalArgumentException("Truncated or missing percent-escape at index " + at);
+    }
+    return (digit16(text.charAt(at + 1)) << 4) + digit16(text.charAt(at + 2));
+  }
+
+  /**
+   * Read the '%XX' escape at index at as a UTF-8 continuation byte
+   * (10xxxxxx) and return its six payload bits.
+   */
+  private int decodeURIContinuation(String text, int at) throws IllegalArgumentException {
+    int value = decodeURIByte(text, at);
+    if ((value & 0xC0) != 0x80) {
+      throw new IllegalArgumentException("Invalid UTF-8 continuation byte at index " + at);
+    }
+    return value & 0x3F;
+  }
+
   /**
    * Given the original text1, and an encoded string which describes the
    * operations required to transform text1 into text2, compute the full diff.
@@ -1483,10 +1622,7 @@ public LinkedList<Diff> diff_fromDelta(String text1, String delta)
         // decode would change all "+" to " "
         param = param.replace("+", "%2B");
         try {
-          param = URLDecoder.decode(param, "UTF-8");
-        } catch (UnsupportedEncodingException e) {
-          // Not likely on modern system.
-          throw new Error("This system does not support UTF-8.", e);
+          param = this.decodeURI(param);
         } catch (IllegalArgumentException e) {
           // Malformed URI sequence.
           throw new IllegalArgumentException(
@@ -2269,10 +2405,7 @@ public List<Patch> patch_fromText(String textline)
         line = text.getFirst().substring(1);
         line = line.replace("+", "%2B");  // decode would change all "+" to " "
         try {
-          line = URLDecoder.decode(line, "UTF-8");
-        } catch (UnsupportedEncodingException e) {
-          // Not likely on modern system.
-          throw new Error("This system does not support UTF-8.", e);
+          line = this.decodeURI(line);
         } catch (IllegalArgumentException e) {
           // Malformed URI sequence.
           throw new IllegalArgumentException(

diff --git a/java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java b/java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java
@@ -424,6 +424,42 @@ public static void testDiffDelta() {
 
     assertEquals("diff_fromDelta: Unicode.", diffs, dmp.diff_fromDelta(text1, delta));
 
+    diffs = diffList(new Diff(EQUAL, "\ud83d\ude4b\ud83d"), new Diff(INSERT, "\ude4c\ud83d"), new Diff(EQUAL, "\ude4b"));
+    delta = dmp.diff_toDelta(diffs);
+    assertEquals("diff_toDelta: Surrogate Pairs.", "=2\t+%F0%9F%99%8C\t=2", delta);
+
+    assertEquals(
+      "diff_toDelta: insert surrogate pair between similar high surrogates",
+      dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c\udd70"), new Diff(INSERT, "\ud83c\udd70"), new Diff(EQUAL, "\ud83c\udd71"))),
+      dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c\udd70\ud83c"), new Diff(INSERT, "\udd70\ud83c"), new Diff(EQUAL, "\udd71")))
+    );
+
+    assertEquals(
+      "diff_toDelta: swap surrogate pairs delete/insert",
+      dmp.diff_toDelta(diffList(new Diff(DELETE, "\ud83c\udd70"), new Diff(INSERT, "\ud83c\udd71"))),
+      dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c"), new Diff(DELETE, "\udd70"), new Diff(INSERT, "\udd71")))
+    );
+
+    assertEquals(
+      "diff_toDelta: swap surrogate pairs insert/delete",
+      dmp.diff_toDelta(diffList(new Diff(INSERT, "\ud83c\udd70"), new Diff(DELETE, "\ud83c\udd71"))),
+      dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c"), new Diff(INSERT, "\udd70"), new Diff(DELETE, "\udd71")))
+    );
+
+    assertEquals(
+      "diff_toDelta: empty diff groups",
+      dmp.diff_toDelta(diffList(new Diff(EQUAL, "abcdef"), new Diff(DELETE, ""), new Diff(INSERT, "ghijk"))),
+      dmp.diff_toDelta(diffList(new Diff(EQUAL, "abcdef"), new Diff(INSERT, "ghijk")))
+    );
+
+    // Different versions of the library may have created deltas with
+    // half of a surrogate pair encoded as if it were valid UTF-8
+    assertEquals(
+      "diff_toDelta: surrogate half encoded as UTF8",
+      dmp.diff_toDelta(dmp.diff_fromDelta("\ud83c\udd70", "-2\t+%F0%9F%85%B1")),
+      dmp.diff_toDelta(dmp.diff_fromDelta("\ud83c\udd70", "=1\t-1\t+%ED%B5%B1"))
+    );
+
     // Verify pool of unchanged characters.
     diffs = diffList(new Diff(INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # "));
     String text2 = dmp.diff_text2(diffs);