Skip to content

Commit

Permalink
Merge pull request pelias#1373 from pelias/unicode
Browse files Browse the repository at this point in the history
improved unicode support
  • Loading branch information
missinglink authored Oct 16, 2019
2 parents d0cf44f + c250026 commit 8b881da
Show file tree
Hide file tree
Showing 6 changed files with 157 additions and 2 deletions.
94 changes: 94 additions & 0 deletions helper/unicode.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
const _ = require('lodash');
const regenerate = require('regenerate');

// Global pattern matching non-printable control characters:
// the C0 block, DELETE and the C1 block.
// ref: https://en.wikipedia.org/wiki/List_of_Unicode_characters
const CONTROL_CODES = [
  [0x0000, 0x001F], // C0 (0000-001F)
  [0x007F, 0x007F], // Delete
  [0x0080, 0x009F] // C1 (0080-009F)
].reduce((set, [first, last]) => set.addRange(first, last), regenerate())
  .toRegExp('g');

// Global pattern matching space-like characters other than the plain ASCII space.
// ref: http://jkorpela.fi/chars/spaces.html
const ALTERNATE_SPACES = regenerate()
  .add(
    0x00A0, // NO-BREAK SPACE
    0x1680, // OGHAM SPACE MARK
    0x180E, // MONGOLIAN VOWEL SEPARATOR
    0x202F, // NARROW NO-BREAK SPACE
    0x205F, // MEDIUM MATHEMATICAL SPACE
    0x3000, // IDEOGRAPHIC SPACE
    0xFEFF // ZERO WIDTH NO-BREAK SPACE
  )
  .addRange(0x2000, 0x200B) // EN QUAD - ZERO WIDTH SPACE
  .toRegExp('g');

// pattern to match consecutive spaces
// const CONSECUTIVE_SPACES = /\s{2,}/g;

// Global pattern matching unicode combining marks.
// see: https://github.com/pelias/pelias/issues/829#issuecomment-542614645
// ref: https://en.wikipedia.org/wiki/Combining_character
const COMBINING_MARKS = [
  [0x0300, 0x036F], // Combining Diacritical Marks (0300–036F)
  [0x1AB0, 0x1AFF], // Combining Diacritical Marks Extended (1AB0–1AFF)
  [0x1DC0, 0x1DFF], // Combining Diacritical Marks Supplement (1DC0–1DFF)
  [0x20D0, 0x20FF], // Combining Diacritical Marks for Symbols (20D0–20FF)
  [0x3099, 0x309A], // combining dakuten (U+3099) and combining handakuten (U+309A)
  [0xFE20, 0xFE2F] // Combining Half Marks (FE20–FE2F)
].reduce((set, [first, last]) => set.addRange(first, last), regenerate())
  .toRegExp('g');

// Global pattern matching miscellaneous symbols with no relevance to geocoding.
// normalize() removes every match from the input string.
const MISC_UNSUPPORTED_SYMBOLS = regenerate()
  // A single range covering the consecutive blocks listed here:
  //   Superscripts and Subscripts (2070-209F)
  //   Currency Symbols (20A0-20CF)
  //   Letterlike Symbols (2100-214F)
  //   Number Forms (2150-218F)
  //   Arrows (2190-21FF)
  //   Mathematical Operators (2200-22FF)
  //   Miscellaneous Technical (2300-23FF)
  //   Control Pictures (2400-243F)
  //   Optical Character Recognition (2440-245F)
  //   Enclosed Alphanumerics (2460-24FF)
  //   Box Drawing (2500-257F)
  //   Block Elements (2580-259F)
  //   Geometric Shapes (25A0-25FF)
  //   Miscellaneous Symbols (2600-26FF)
  //   Dingbats (2700-27BF)
  //   Miscellaneous Mathematical Symbols-A (27C0-27EF)
  //   Supplemental Arrows-A (27F0-27FF)
  //   Braille Patterns (2800-28FF)
  //   Supplemental Arrows-B (2900-297F)
  //   Miscellaneous Mathematical Symbols-B (2980-29FF)
  //   Supplemental Mathematical Operators (2A00-2AFF)
  //   Miscellaneous Symbols and Arrows (2B00-2BFF)
  .addRange(0x2070, 0x2BFF)

  // symbols
  .addRange(0x02B0, 0x02FF) // Spacing Modifier Letters (02B0-02FF)
  // NOTE(review): Unified Canadian Aboriginal Syllabics is a writing system
  // rather than a symbol set; confirm that stripping it is intended.
  .addRange(0x1400, 0x167F) // Unified Canadian Aboriginal Syllabics (1400-167F)
  .addRange(0x1D100, 0x1D1FF) // Musical Symbols (1D100-1D1FF)
  .addRange(0x1D400, 0x1D7FF) // Mathematical Alphanumeric Symbols (1D400-1D7FF)

  // emojis
  // Emoji Modifier Fitzpatrick (skin tones) (1F3FB–1F3FF) needs no entry of
  // its own: it lies inside the 1F300-1F5FF range added on the next line.
  .addRange(0x1F300, 0x1F5FF) // Miscellaneous Symbols and Pictographs (1F300-1F5FF)
  .addRange(0x1F600, 0x1F64F) // Emoticons (1F600–1F64F)
  .addRange(0x1F680, 0x1F6FF) // Transport and Map Symbols (1F680-1F6FF)
  .addRange(0x1F900, 0x1F9FF) // Supplemental Symbols and Pictographs (1F900-1F9FF)
  .toRegExp('g');

/**
 * Normalize the unicode content of a string.
 *
 * The steps are applied in this order:
 *  1. compose the string to unicode normalization form NFC
 *  2. remove control codes
 *  3. replace each alternate space character with a plain space
 *  4. remove unsupported symbols
 *  5. remove any combining marks still present
 *
 * @param {*} str - value to normalize
 * @returns {*} the normalized string; non-string input is returned unchanged
 */
function normalize(str) {
  // sanity checking: only strings are processed
  if (_.isString(str) === false) {
    return str;
  }

  const composed = str.normalize('NFC');
  const withoutControlCodes = composed.replace(CONTROL_CODES, '');
  const withPlainSpaces = withoutControlCodes.replace(ALTERNATE_SPACES, ' ');
  const withoutSymbols = withPlainSpaces.replace(MISC_UNSUPPORTED_SYMBOLS, '');

  return withoutSymbols.replace(COMBINING_MARKS, '');
}

module.exports.normalize = normalize;
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
"pelias-query": "^9.14.0",
"pelias-sorting": "^1.2.0",
"predicates": "^2.0.0",
"regenerate": "^1.4.0",
"retry": "^0.12.0",
"stable": "^0.1.8",
"stats-lite": "^2.0.4",
Expand Down
6 changes: 5 additions & 1 deletion sanitizer/_text.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
const _ = require('lodash');
const unicode = require('../helper/unicode');
const MAX_TEXT_LENGTH = 140;

// ref: https://en.wikipedia.org/wiki/Quotation_mark
Expand All @@ -10,8 +11,11 @@ function _sanitize( raw, clean ){
// error & warning messages
const messages = { errors: [], warnings: [] };

// normalize unicode marks
let text = unicode.normalize(raw.text);

// remove superfluous whitespace and quotes
let text = _.trim( _.trim( raw.text ), QUOTES );
// trim the unicode-normalized value rather than raw.text, otherwise the
// result of unicode.normalize() above is overwritten and never used
text = _.trim(_.trim(text), QUOTES);

// validate input 'text'
if( !_.isString(text) || _.isEmpty(text) ){
Expand Down
6 changes: 5 additions & 1 deletion sanitizer/_text_pelias_parser.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
const logger = require('pelias-logger').get('api');
const unicode = require('../helper/unicode');
const Tokenizer = require('pelias-parser/tokenization/Tokenizer');
const Solution = require('pelias-parser/solver/Solution');
const AddressParser = require('pelias-parser/parser/AddressParser');
Expand All @@ -22,8 +23,11 @@ function _sanitize (raw, clean) {
// error & warning messages
var messages = { errors: [], warnings: [] };

// normalize unicode marks
let text = unicode.normalize(raw.text);

// remove superfluous whitespace
let text = _.trim(raw.text);
text = _.trim(text);

// validate input 'text'
if( !_.isString(text) || _.isEmpty(text) ){
Expand Down
51 changes: 51 additions & 0 deletions test/unit/helper/unicode.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
const unicode = require('../../../helper/unicode');

module.exports.tests = {};

// Test cases for unicode.normalize().
// `test` is called with a test name and a callback; each callback receives
// an assertion object `t` and must call t.end() when finished.
module.exports.tests.normalize = function (test) {
  const norm = unicode.normalize;

  test('normalize: NFC', function (t) {
    const decomposed = 'i\u0308'; // 'i' followed by COMBINING DIAERESIS
    const composed = '\u00EF'; // precomposed 'ï'
    t.equal(norm(decomposed), composed);
    t.equal(norm(composed), composed);
    t.end();
  });

  test('normalize: remove control codes', function (t) {
    t.equal(norm('a\u0000b\u001Fc'), 'abc');
    t.equal(norm('a\u007Fb\u007Fc'), 'abc');
    t.equal(norm('a\u0080b\u009Fc'), 'abc');
    t.end();
  });

  test('normalize: convert alt spaces', function (t) {
    t.equal(norm('a b\u00A0c\u00A0d'), 'a b c d');
    t.equal(norm('a b\u180Ec\u2000d'), 'a b c d');
    t.equal(norm('a b\u205Fc\uFEFFd'), 'a b c d');
    t.end();
  });

  test('normalize: strip extra combining marks', function (t) {
    // a space has no precomposed form with a diaeresis, so the mark is left
    // over after composition and is expected to be stripped
    const decomposed = ' \u0308';
    const composed = ' ';
    t.equal(norm(decomposed), composed);
    t.equal(norm(composed), composed);
    t.end();
  });

  test('normalize: strip unsupported symbols', function (t) {
    t.equal(norm('↸a⇨b'), 'ab', 'arrows');
    t.equal(norm('╦a╳b'), 'ab', 'box drawing');
    t.equal(norm('𝄞a𝇎b'), 'ab', 'musical symbols');
    t.equal(norm('💩a😎b'), 'ab', 'emoji');
    t.equal(norm('🙌🏿a🙌🏻b'), 'ab', 'emoji with skin tone modifiers');
    t.end();
  });

  test('normalize: non-string input is returned unchanged', function (t) {
    const obj = { text: 'a' };
    t.equal(norm(undefined), undefined, 'undefined');
    t.equal(norm(null), null, 'null');
    t.equal(norm(1), 1, 'number');
    t.equal(norm(obj), obj, 'object');
    t.end();
  });
};

// Register every test group in module.exports.tests with the given tape
// instance, prefixing each test name with 'unicode: '.
module.exports.all = function (tape, common) {
  const test = (name, testFunction) => tape('unicode: ' + name, testFunction);

  const suites = module.exports.tests;
  for (const suiteName in suites) {
    suites[suiteName](test, common);
  }
};
1 change: 1 addition & 0 deletions test/unit/run.js
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ var tests = [
require('./helper/TypeMapping'),
require('./helper/type_mapping'),
require('./helper/stackTraceLine'),
require('./helper/unicode'),
require('./middleware/access_log'),
require('./middleware/accuracy'),
require('./middleware/assignLabels'),
Expand Down

0 comments on commit 8b881da

Please sign in to comment.