Skip to content

Commit

Permalink
py/objstr: Add check for valid UTF-8 when making a str from bytes.
Browse files Browse the repository at this point in the history
This patch adds a function utf8_check() to check for a valid UTF-8 encoded
string, and calls it when constructing a str from raw bytes.  The feature
is selectable at compile time via MICROPY_PY_BUILTINS_STR_UNICODE_CHECK and
is enabled if unicode is enabled.  It costs about 110 bytes on Thumb-2, 150
bytes on Xtensa and 170 bytes on x86-64.
  • Loading branch information
TonyLianLong authored and dpgeorge committed Sep 6, 2017
1 parent 069fc48 commit 68c2817
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 0 deletions.
5 changes: 5 additions & 0 deletions py/mpconfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -691,6 +691,11 @@ typedef double mp_float_t;
#define MICROPY_PY_BUILTINS_STR_UNICODE (0)
#endif

// Whether to check for valid UTF-8 when converting bytes to str.
// If not already defined, this takes the value of
// MICROPY_PY_BUILTINS_STR_UNICODE, so the check follows the unicode setting.
#ifndef MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
#define MICROPY_PY_BUILTINS_STR_UNICODE_CHECK (MICROPY_PY_BUILTINS_STR_UNICODE)
#endif

// Whether str.center() method provided
#ifndef MICROPY_PY_BUILTINS_STR_CENTER
#define MICROPY_PY_BUILTINS_STR_CENTER (0)
Expand Down
10 changes: 10 additions & 0 deletions py/objstr.c
Original file line number Diff line number Diff line change
Expand Up @@ -161,13 +161,23 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_
if (str_hash == 0) {
str_hash = qstr_compute_hash(str_data, str_len);
}
#if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
if (!utf8_check(str_data, str_len)) {
mp_raise_msg(&mp_type_UnicodeError, NULL);
}
#endif
mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_of_type(type, NULL, str_len));
o->data = str_data;
o->hash = str_hash;
return MP_OBJ_FROM_PTR(o);
} else {
mp_buffer_info_t bufinfo;
mp_get_buffer_raise(args[0], &bufinfo, MP_BUFFER_READ);
#if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
if (!utf8_check(bufinfo.buf, bufinfo.len)) {
mp_raise_msg(&mp_type_UnicodeError, NULL);
}
#endif
return mp_obj_new_str(bufinfo.buf, bufinfo.len, false);
}
}
Expand Down
28 changes: 28 additions & 0 deletions py/unicode.c
Original file line number Diff line number Diff line change
Expand Up @@ -182,3 +182,31 @@ mp_uint_t unichar_xdigit_value(unichar c) {
}
return n;
}

// Check whether the len bytes starting at p are structurally valid UTF-8.
//
// Every character must be either a single byte below 0x80, or a lead byte in
// the range 0xc0-0xf7 followed by exactly the number of continuation bytes
// (bit pattern 10xxxxxx) that the lead byte announces.  The data must not end
// in the middle of a multi-byte sequence.  Empty input is valid.
//
// Only the byte structure is checked: overlong encodings, surrogates and code
// points above U+10FFFF are not rejected here.
//
// Returns true if the data passes the check, false otherwise.
bool utf8_check(const byte *p, size_t len) {
    uint8_t need = 0; // continuation bytes still expected for the current character
    const byte *end = p + len;
    for (; p < end; p++) {
        byte c = *p;
        if (need) {
            // A continuation byte must match 10xxxxxx exactly.  Testing only
            // c >= 0x80 would also let lead bytes and 0xf8-0xff through.
            if ((c & 0xc0) != 0x80) {
                // mismatch
                return false;
            }
            need--;
        } else if (c >= 0xf8) {
            // mismatch: 0xf8-0xff can never start a character
            return false;
        } else if (c >= 0xc0) {
            // Lead byte: 110xxxxx needs 1, 1110xxxx needs 2, 11110xxx needs 3.
            // 0xe5 is a packed table of 2-bit counts (3, 2, 1, 1 from the top
            // bits down) indexed by bits 4-5 of the lead byte.
            need = (0xe5 >> ((c >> 3) & 0x6)) & 3;
        } else if (c >= 0x80) {
            // mismatch: continuation byte without a preceding lead byte
            return false;
        }
    }
    return need == 0; // no pending fragments allowed
}
1 change: 1 addition & 0 deletions py/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,6 @@
#include "py/misc.h"

mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr);
// Returns true if the len bytes starting at p pass the UTF-8 validity check,
// false otherwise.
bool utf8_check(const byte *p, size_t len);

#endif // MICROPY_INCLUDED_PY_UNICODE_H
14 changes: 14 additions & 0 deletions tests/unicode/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,17 @@
int('\u0200')
except ValueError:
print('ValueError')

# test invalid UTF-8 string
# Each str() call below is expected to raise UnicodeError.

# 0xa1 is a continuation byte with no lead byte before it
try:
str(b'ab\xa1', 'utf8')
except UnicodeError:
print('UnicodeError')
# 0xf8 is not a valid lead byte
try:
str(b'ab\xf8', 'utf8')
except UnicodeError:
print('UnicodeError')
# 0xc0 announces a continuation byte but is followed by ASCII 'a';
# the input is a bytearray rather than a bytes literal
try:
str(bytearray(b'ab\xc0a'), 'utf8')
except UnicodeError:
print('UnicodeError')

0 comments on commit 68c2817

Please sign in to comment.