diff --git a/src/prepro/data_builder.py b/src/prepro/data_builder.py
index 31b80af7..7edc33ca 100644
--- a/src/prepro/data_builder.py
+++ b/src/prepro/data_builder.py
@@ -22,7 +22,10 @@ def load_json(p, lower):
     source = []
     tgt = []
     flag = False
-    for sent in json.load(open(p))['sentences']:
+    # 'utf-8-sig' skips a leading BOM when present; BOM-less UTF-8 decodes the same.
+    with open(p, encoding='utf-8-sig') as f:
+        sentences = json.load(f)['sentences']
+    for sent in sentences:
         tokens = [t['word'] for t in sent['tokens']]
         if (lower):
             tokens = [t.lower() for t in tokens]