From 5ef452b1a6be44914764a7a70d114ba0f06801db Mon Sep 17 00:00:00 2001 From: Julian Simioni Date: Wed, 3 Jul 2019 12:42:59 -0400 Subject: [PATCH] feat(parser): Filter out URLs before sending to pelias/model We have had numerous reports from Pelias users about concerning error messages during builds regarding the URL regex filter from https://github.com/pelias/model/pull/115. While this filter is good, the resulting error message is alarming. Looking today at the output of a planet build, it appears that many of these errors come from the polylines file created by Valhalla out of the OSM street network. Looking at the contents of the polyline file and corresponding record on OSM, it seems that Valhalla puts the contents of the `ref` tag in the polyline file as an alternate name. The [ref tag](https://wiki.openstreetmap.org/wiki/Key:ref?uselang=en-US) will often contain a URL. This means that not only will the error happen frequently, but many records that are actually valid will be filtered out. An example of this is the [Iowa Women of Achievement bridge](https://www.openstreetmap.org/way/65066830) which is completely valid in terms of name, geometry, and tagging but contains a URL in the `ref` field. The polylines importer currently selects a single name value from the list of names in the polylines file by choosing the longest. This PR adds an additional filter that first removes any URL-like values from consideration, and should completely eliminate any of the otherwise concerning errors while ensuring all valid records make it into Elasticsearch. 
Fixes https://github.com/pelias/whosonfirst/issues/456 Fixes https://github.com/pelias/polylines/issues/216 Fixes https://github.com/pelias/docker/issues/89 Connects https://github.com/pelias/model/issues/116 --- stream/parser.js | 18 ++++++++++++++---- test/stream/parser.js | 38 +++++++++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 5 deletions(-) diff --git a/stream/parser.js b/stream/parser.js index 39b482c..68aa43c 100644 --- a/stream/parser.js +++ b/stream/parser.js @@ -26,8 +26,15 @@ function parser( precision ){ // decode polyline var geojson = polyline.toGeoJSON(cols[0], precision); + const name = selectName(cols.slice(1)); + + // skip record if there is no valid name + if (!name) { + return next(); + } + // select name - geojson.properties = { name: selectName(cols.slice(1)) }; + geojson.properties = { name: name }; // compute bbox geojson = extent.bboxify( geojson ); @@ -47,11 +54,14 @@ function parser( precision ){ // each connected road can have one or more names // we select one name to be the default. function selectName( names ){ - // return the longest name + // filter out URLs + // then return the longest name // @todo: can we improve this logic? - return names.reduce( function( a, b ){ + return names.filter( function ( name) { + return !name.match(/^http(s)?:\/\//); + }).reduce( function( a, b ){ return a.length > b.length ? 
a : b; - }); + }, ''); } module.exports = parser; diff --git a/test/stream/parser.js b/test/stream/parser.js index 0e5c462..4a18bcd 100644 --- a/test/stream/parser.js +++ b/test/stream/parser.js @@ -53,7 +53,7 @@ module.exports.tests.parse_simple = function(test, common) { }); }; -// row does not contain a valid name +// row does not follow polyline schema module.exports.tests.parse_invalid = function(test, common) { test('parse: invalid row', function(t) { @@ -94,6 +94,42 @@ module.exports.tests.select_name = function(test, common) { }); }; +// URLs are filtered out +module.exports.tests.filter_url = function(test, common) { + test('parse: filter URL', function(t) { + + var stream = parser(6); + var row = ['a','foo','foooooooo','https://this-website-should-not-be-the-name.com'].join('\0'); + var expected = 'foooooooo'; + + function assert( actual, enc, next ){ + t.deepEqual( actual.properties.name, expected, 'longest non-URL name selected' ); + next(); + } + + stream.pipe( through.obj( assert, function(){ t.end(); } )); + stream.write(row); + stream.end(); + }); +}; + +module.exports.tests.filter_only_url = function(test, common) { + test('parse: document with only URL name is skipped', function(t) { + + var stream = parser(6); + var row = ['https://this-website-should-not-be-the-name.com'].join('\0'); + + function assert( actual, enc, next ){ + t.fail('no valid rows'); // should not execute + next(); + } + + stream.pipe( through.obj( assert, function(){ t.end(); } )); + stream.write(row); + stream.end(); + }); +}; + module.exports.all = function (tape, common) { function test(name, testFunction) {