Merge pull request pelias#1373 from pelias/unicode

improved unicode support
michaelkirk-pelias · Oct 16, 2019 · 8b881da · 8b881da
2 parents d0cf44f + c250026
commit 8b881da
Show file tree

Hide file tree

Showing 6 changed files with 157 additions and 2 deletions.
diff --git a/helper/unicode.js b/helper/unicode.js
@@ -0,0 +1,94 @@
+const _ = require('lodash');
+const regenerate = require('regenerate');
+
+// pattern matching every non-printable control character
+// ref: https://en.wikipedia.org/wiki/List_of_Unicode_characters
+const CONTROL_CODES = regenerate()
+  .addRange(0x0000, 0x001F) // C0 controls (0000-001F)
+  .addRange(0x007F, 0x009F) // Delete (007F) directly followed by the C1 controls (0080-009F)
+  .toRegExp('g');
+
+// pattern matching space characters other than the standard space
+// ref: http://jkorpela.fi/chars/spaces.html
+const ALTERNATE_SPACES = regenerate()
+  .addRange(0x2000, 0x200B) // EN QUAD - ZERO WIDTH SPACE
+  .add(
+    0x00A0, // NO-BREAK SPACE
+    0x1680, // OGHAM SPACE MARK
+    0x180E, // MONGOLIAN VOWEL SEPARATOR
+    0x202F, // NARROW NO-BREAK SPACE
+    0x205F, // MEDIUM MATHEMATICAL SPACE
+    0x3000, // IDEOGRAPHIC SPACE
+    0xFEFF  // ZERO WIDTH NO-BREAK SPACE
+  )
+  .toRegExp('g');
+
+// pattern to match consecutive spaces
+// const CONSECUTIVE_SPACES = /\s{2,}/g;
+
+// pattern matching unicode combining marks, listed in code point order
+// see: https://github.com/pelias/pelias/issues/829#issuecomment-542614645
+// ref: https://en.wikipedia.org/wiki/Combining_character
+const COMBINING_MARKS = regenerate()
+  .addRange(0x0300, 0x036F) // Combining Diacritical Marks (0300–036F)
+  .addRange(0x1AB0, 0x1AFF) // Combining Diacritical Marks Extended (1AB0–1AFF)
+  .addRange(0x1DC0, 0x1DFF) // Combining Diacritical Marks Supplement (1DC0–1DFF)
+  .addRange(0x20D0, 0x20FF) // Combining Diacritical Marks for Symbols (20D0–20FF)
+  .addRange(0x3099, 0x309A) // combining dakuten (U+3099) and combining handakuten (U+309A)
+  .addRange(0xFE20, 0xFE2F) // Combining Half Marks (FE20–FE2F)
+  .toRegExp('g');
+
+// miscellaneous symbols with no relevance to geocoding, listed in code point order
+const MISC_UNSUPPORTED_SYMBOLS = regenerate()
+  // symbols
+  .addRange(0x02B0, 0x02FF) // Spacing Modifier Letters (02B0-02FF)
+  // NOTE(review): the block below is a writing system rather than a symbol set;
+  // confirm that stripping it from queries is intended.
+  .addRange(0x1400, 0x167F) // Unified Canadian Aboriginal Syllabics (1400-167F)
+
+  // a single range covering these consecutive blocks:
+  //   Superscripts and Subscripts (2070-209F)
+  //   Currency Symbols (20A0-20CF)
+  //   Letterlike Symbols (2100-214F)
+  //   Number Forms (2150-218F)
+  //   Arrows (2190-21FF)
+  //   Mathematical Operators (2200-22FF)
+  //   Miscellaneous Technical (2300-23FF)
+  //   Control Pictures (2400-243F)
+  //   Optical Character Recognition (2440-245F)
+  //   Enclosed Alphanumerics (2460-24FF)
+  //   Box Drawing (2500-257F)
+  //   Block Elements (2580-259F)
+  //   Geometric Shapes (25A0-25FF)
+  //   Miscellaneous Symbols (2600-26FF)
+  //   Dingbats (2700-27BF)
+  //   Miscellaneous Mathematical Symbols-A (27C0-27EF)
+  //   Supplemental Arrows-A (27F0-27FF)
+  //   Braille Patterns (2800-28FF)
+  //   Supplemental Arrows-B (2900-297F)
+  //   Miscellaneous Mathematical Symbols-B (2980-29FF)
+  //   Supplemental Mathematical Operators (2A00-2AFF)
+  //   Miscellaneous Symbols and Arrows (2B00-2BFF)
+  .addRange(0x2070, 0x2BFF)
+
+  // more symbols, outside the basic multilingual plane
+  .addRange(0x1D100, 0x1D1FF) // Musical Symbols (1D100-1D1FF)
+  .addRange(0x1D400, 0x1D7FF) // Mathematical Alphanumeric Symbols (1D400-1D7FF)
+
+  // emojis
+  // Emoji Modifier Fitzpatrick (skin tones) (1F3FB–1F3FF) needs no entry of
+  // its own: it lies inside the first range below.
+  .addRange(0x1F300, 0x1F5FF) // Miscellaneous Symbols and Pictographs (1F300-1F5FF)
+  .addRange(0x1F600, 0x1F64F) // Emoticons (1F600–1F64F)
+  .addRange(0x1F680, 0x1F6FF) // Transport and Map Symbols (1F680-1F6FF)
+  .addRange(0x1F900, 0x1F9FF) // Supplemental Symbols and Pictographs (1F900-1F9FF)
+  .toRegExp('g');
+
+/**
+ * Normalize the unicode content of a string.
+ *
+ * Non-string input is returned unchanged. Strings are composed to NFC, then
+ * control codes are deleted, alternate spaces are replaced with a plain
+ * space, and unsupported symbols and combining marks are deleted.
+ *
+ * @param {*} str - the value to normalize
+ * @returns {*} the normalized string, or `str` itself when it is not a string
+ */
+function normalize(str) {
+  if (!_.isString(str)) {
+    return str;
+  }
+
+  const composed = str.normalize('NFC');
+  const printable = composed.replace(CONTROL_CODES, '');
+  const spaced = printable.replace(ALTERNATE_SPACES, ' ');
+  const supported = spaced.replace(MISC_UNSUPPORTED_SYMBOLS, '');
+
+  // combining marks still present after NFC composition are dropped last
+  return supported.replace(COMBINING_MARKS, '');
+}
+
+module.exports.normalize = normalize;
diff --git a/package.json b/package.json
@@ -59,6 +59,7 @@
     "pelias-query": "^9.14.0",
     "pelias-sorting": "^1.2.0",
     "predicates": "^2.0.0",
+    "regenerate": "^1.4.0",
     "retry": "^0.12.0",
     "stable": "^0.1.8",
     "stats-lite": "^2.0.4",

diff --git a/sanitizer/_text.js b/sanitizer/_text.js
@@ -1,4 +1,5 @@
 const _ = require('lodash');
+const unicode = require('../helper/unicode');
 const MAX_TEXT_LENGTH = 140;
 
 // ref: https://en.wikipedia.org/wiki/Quotation_mark
@@ -10,8 +11,11 @@ function _sanitize( raw, clean ){
   // error & warning messages
   const messages = { errors: [], warnings: [] };
 
+  // normalize unicode marks
+  let text = unicode.normalize(raw.text);
+
   // remove superfluous whitespace and quotes
-  let text =  _.trim( _.trim( raw.text ), QUOTES );
+  // (trim the normalized value; trimming raw.text here would discard the normalization above)
+  text = _.trim(_.trim(text), QUOTES);
 
   // validate input 'text'
   if( !_.isString(text) || _.isEmpty(text) ){

diff --git a/sanitizer/_text_pelias_parser.js b/sanitizer/_text_pelias_parser.js
@@ -1,4 +1,5 @@
 const logger = require('pelias-logger').get('api');
+const unicode = require('../helper/unicode');
 const Tokenizer = require('pelias-parser/tokenization/Tokenizer');
 const Solution = require('pelias-parser/solver/Solution');
 const AddressParser = require('pelias-parser/parser/AddressParser');
@@ -22,8 +23,11 @@ function _sanitize (raw, clean) {
   // error & warning messages
   var messages = { errors: [], warnings: [] };
 
+  // normalize unicode marks
+  let text = unicode.normalize(raw.text);
+
   // remove superfluous whitespace
-  let text = _.trim(raw.text);
+  text = _.trim(text);
 
   // validate input 'text'
   if( !_.isString(text) || _.isEmpty(text) ){

diff --git a/test/unit/helper/unicode.js b/test/unit/helper/unicode.js
@@ -0,0 +1,51 @@
+const unicode = require('../../../helper/unicode');
+
+module.exports.tests = {};
+
+// unit tests covering each transformation applied by unicode.normalize
+module.exports.tests.normalize = function (test) {
+  const norm = unicode.normalize;
+  test('normalize: NFC', function (t) {
+    // 'i' (105) followed by a combining diaeresis (776) composes to a single code unit (239)
+    let decomposed = String.fromCharCode(105) + String.fromCharCode(776);
+    let composed = String.fromCharCode(239);
+    t.equal(norm(decomposed), composed);
+    t.equal(norm(composed), composed);
+    t.end();
+  });
+  test('normalize: remove control codes', function (t) {
+    t.equal(norm('a\u0000b\u001Fc'), 'abc');
+    t.equal(norm('a\u007Fb\u007Fc'), 'abc');
+    t.equal(norm('a\u0080b\u009Fc'), 'abc');
+    t.end();
+  });
+  test('normalize: convert alt spaces', function (t) {
+    t.equal(norm('a b\u00A0c\u00A0d'), 'a b c d');
+    t.equal(norm('a b\u180Ec\u2000d'), 'a b c d');
+    t.equal(norm('a b\u205Fc\uFEFFd'), 'a b c d');
+    t.end();
+  });
+  test('normalize: strip extra combining marks', function (t) {
+    // a space (32) followed by a combining diaeresis (776): the mark is expected to be removed
+    let decomposed = String.fromCharCode(32) + String.fromCharCode(776);
+    let composed = String.fromCharCode(32);
+    t.equal(norm(decomposed), composed);
+    t.equal(norm(composed), composed);
+    t.end();
+  });
+  test('normalize: strip unsupported symbols', function (t) {
+    // each assertion carries a distinct label so a failure identifies its case
+    t.equal(norm('↸a⇨b'), 'ab', 'arrows');
+    t.equal(norm('╦a╳b'), 'ab', 'box drawing');
+    t.equal(norm('𝄞a𝇎b'), 'ab', 'musical symbols');
+    t.equal(norm('💩a😎b'), 'ab', 'emoji');
+    t.equal(norm('🙌🏿a🙌🏻b'), 'ab', 'emoji with skin tone modifiers');
+    t.end();
+  });
+};
+
+// run every suite registered on module.exports.tests, prefixing each test name with 'unicode: '
+module.exports.all = function (tape, common) {
+  const test = (name, testFunction) => tape('unicode: ' + name, testFunction);
+
+  Object.keys(module.exports.tests).forEach(function (testCase) {
+    module.exports.tests[testCase](test, common);
+  });
+};
diff --git a/test/unit/run.js b/test/unit/run.js
@@ -43,6 +43,7 @@ var tests = [
   require('./helper/TypeMapping'),
   require('./helper/type_mapping'),
   require('./helper/stackTraceLine'),
+  require('./helper/unicode'),
   require('./middleware/access_log'),
   require('./middleware/accuracy'),
   require('./middleware/assignLabels'),