py/objstr: Add check for valid UTF-8 when making a str from bytes.

This patch adds a function utf8_check() to check for a valid UTF-8 encoded string, and calls it when constructing a str from raw bytes. The feature is selectable at compile time via MICROPY_PY_BUILTINS_STR_UNICODE_CHECK and is enabled if unicode is enabled. It costs about 110 bytes on Thumb-2, 150 bytes on Xtensa and 170 bytes on x86-64.
picoscratch · Sep 6, 2017 · 68c2817 · 68c2817
1 parent 069fc48
commit 68c2817
Show file tree

Hide file tree

Showing 5 changed files with 58 additions and 0 deletions.
diff --git a/py/mpconfig.h b/py/mpconfig.h
@@ -691,6 +691,11 @@ typedef double mp_float_t;
 #define MICROPY_PY_BUILTINS_STR_UNICODE (0)
 #endif
 
+// Whether to check for valid UTF-8 when converting bytes to str; unless overridden, defaults to the value of MICROPY_PY_BUILTINS_STR_UNICODE
+#ifndef MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
+#define MICROPY_PY_BUILTINS_STR_UNICODE_CHECK (MICROPY_PY_BUILTINS_STR_UNICODE)
+#endif
+
 // Whether str.center() method provided
 #ifndef MICROPY_PY_BUILTINS_STR_CENTER
 #define MICROPY_PY_BUILTINS_STR_CENTER (0)

diff --git a/py/objstr.c b/py/objstr.c
@@ -161,13 +161,23 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_
                 if (str_hash == 0) {
                     str_hash = qstr_compute_hash(str_data, str_len);
                 }
+                #if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
+                if (!utf8_check(str_data, str_len)) {
+                    mp_raise_msg(&mp_type_UnicodeError, NULL);
+                }
+                #endif
                 mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_of_type(type, NULL, str_len));
                 o->data = str_data;
                 o->hash = str_hash;
                 return MP_OBJ_FROM_PTR(o);
             } else {
                 mp_buffer_info_t bufinfo;
                 mp_get_buffer_raise(args[0], &bufinfo, MP_BUFFER_READ);
+                #if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
+                if (!utf8_check(bufinfo.buf, bufinfo.len)) {
+                    mp_raise_msg(&mp_type_UnicodeError, NULL);
+                }
+                #endif
                 return mp_obj_new_str(bufinfo.buf, bufinfo.len, false);
             }
     }

diff --git a/py/unicode.c b/py/unicode.c
@@ -182,3 +182,31 @@ mp_uint_t unichar_xdigit_value(unichar c) {
     }
     return n;
 }
+
+// Check that the len bytes starting at p are structurally valid UTF-8.
+// Every lead byte (0xc0-0xf7) must be followed by exactly the number of
+// continuation bytes (0x80-0xbf) that it announces, a continuation byte may
+// not appear on its own, bytes 0xf8-0xff are rejected, and the data may not
+// end part-way through a sequence.  Overlong forms, surrogates and code
+// points above U+10FFFF are not detected by this check.
+// Returns true if the data passes the check, false otherwise.
+bool utf8_check(const byte *p, size_t len) {
+    uint8_t need = 0; // continuation bytes still expected for the current sequence
+    const byte *end = p + len;
+    for (; p < end; p++) {
+        byte c = *p;
+        if (need) {
+            // Only 0b10xxxxxx is a continuation byte; a plain ">= 0x80" test
+            // would also accept lead bytes and 0xf8-0xff in this position.
+            if ((c & 0xc0) == 0x80) {
+                need--;
+            } else {
+                // mismatch: sequence cut short by a non-continuation byte
+                return false;
+            }
+        } else {
+            if (c >= 0xc0) {
+                if (c >= 0xf8) {
+                    // mismatch: not a valid lead byte
+                    return false;
+                }
+                // 0xe5 packs the continuation counts 1, 1, 2, 3 as 2-bit
+                // fields; bits 4-5 of the lead byte select the field.
+                need = (0xe5 >> ((c >> 3) & 0x6)) & 3;
+            } else if (c >= 0x80) {
+                // mismatch: continuation byte without a lead byte
+                return false;
+            }
+        }
+    }
+    return need == 0; // no pending fragments allowed
+}
diff --git a/py/unicode.h b/py/unicode.h
@@ -30,5 +30,6 @@
 #include "py/misc.h"
 
 mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr);
+bool utf8_check(const byte *p, size_t len);
 
 #endif // MICROPY_INCLUDED_PY_UNICODE_H
diff --git a/tests/unicode/unicode.py b/tests/unicode/unicode.py
@@ -33,3 +33,17 @@
     int('\u0200')
 except ValueError:
     print('ValueError')
+
+# test that invalid UTF-8 is rejected when constructing a str from bytes/bytearray
+try:
+    str(b'ab\xa1', 'utf8')  # continuation byte with no lead byte
+except UnicodeError:
+    print('UnicodeError')
+try:
+    str(b'ab\xf8', 'utf8')  # 0xf8 is not a valid lead byte
+except UnicodeError:
+    print('UnicodeError')
+try:
+    str(bytearray(b'ab\xc0a'), 'utf8')  # lead byte followed by a non-continuation byte
+except UnicodeError:
+    print('UnicodeError')