Encode characters in Latin-1 to avoid (de)serialization failure (#37)

AFLplusplus · Apr 3, 2022 · ff4e5a2 · ff4e5a2 · ilyaa-c2a · Feb 27, 2023
1 parent c34493d
commit ff4e5a2
Show file tree

Hide file tree

Showing 3 changed files with 10 additions and 3 deletions.
diff --git a/grammars/README.md b/grammars/README.md
@@ -27,4 +27,6 @@ An example, `test_hex.json`, is included in this directory:
 }
 ```
 
-Note that, this workaround only works for ASCII characters (i.e., `\u0000` \~ `\u007f`). Otherwise, the special characters will be converted into more than one byte in the UTF-8 encoding. A wrong grammar file, `wrong_hex.json`, is included in this directory as well. Please refer to [post1](https://www.utf8-chartable.de/) and [post2](https://stackoverflow.com/a/59624562) for more details.
+Note that this workaround only works for Latin-1 (ISO-8859-1) characters (i.e., `\u0000` \~ `\u00ff`). Characters outside this range cannot be encoded as a single byte, so the grammar file cannot be processed.
+
+References: [post1](https://www.utf8-chartable.de/), [post2](https://stackoverflow.com/a/59624562), [post3](https://stackoverflow.com/a/66601996), and [post4](https://stackoverflow.com/questions/66601743/python3-str-to-bytes-convertation-problem).
diff --git a/grammars/f1_c_gen.py b/grammars/f1_c_gen.py
@@ -82,9 +82,14 @@ def to_bytes(self):
         val_len = len(self.val)
         ret += val_len.to_bytes(4, byteorder='little', signed=False)
         # val
-        val_bytes = bytes(self.val, 'utf-8')
+        # Latin-1 is an 8-bit character set. The first 128 characters of its
+        # set are identical to the US ASCII standard. By encoding the string as
+        # Latin-1, we can handle all hex characters from \u0000 to \u00ff
+        # Refs:
+        # - https://stackoverflow.com/questions/66601743/python3-str-to-bytes-convertation-problem
+        # - https://kb.iu.edu/d/aepu
+        val_bytes = bytes(self.val, 'latin-1')
         if val_len != len(val_bytes):
-            # NOTE: we only support ASCII characters (i.e., single-byte characters)
             print(f'The length of `val` should be {val_len}, but found {len(val_bytes)}.')
             print(f'`val` bytes in UTF-8 encoding: {val_bytes}')
             print('Please check your grammar file!')

diff --git a/grammars/wrong_hex.json → grammars/wrong_hex_for_utf8.json b/grammars/wrong_hex.json → grammars/wrong_hex_for_utf8.json