From d8512186d93d67999166be6d9aa6e5b82805d397 Mon Sep 17 00:00:00 2001
From: Joxit
Date: Wed, 22 Apr 2020 00:04:56 +0200
Subject: [PATCH] feat(sqlite): Drop support for bundle downloads

BREAKING CHANGE: Support for bundle downloads (files ending in `tar.bz2`) has been removed. Only SQLite downloads are supported and the `whosonfirst` importer will now behave as if `imports.whosonfirst.sqlite` is set to true.

fixes #496
fixes #226
closes #460
---
 README.md | 41 ++----
 bin/download | 2 +-
 index.js | 3 -
 package.json | 7 +-
 schema.js | 10 +-
 src/bundleList.js | 107 +--------------
 src/components/loadJSON.js | 47 -------
 src/components/metadataStream.js | 10 --
 src/components/parseMetaFiles.js | 16 ---
 src/readStream.js | 70 +---------
 src/wofIdToPath.js | 15 ---
 test/bundleList.js | 150 ---------------------
 test/components/loadJSONTest.js | 142 --------------------
 test/components/metadataStream.js | 41 ------
 test/components/parseMetaFiles.js | 33 -----
 test/components/wofIdToPath.js | 43 ------
 test/readStreamTest.js | 138 +++++++------------
 test/schema.js | 63 ++-------
 test/test.js | 4 -
 utils/download_data.js | 70 ----------
 utils/download_data_all.js | 72 ----------
 utils/download_sqlite_all.js | 68 ++++------
 utils/sqlite_clean.js | 8 +-
 utils/sqlite_common.js | 40 ------
 utils/sqlite_download.js | 60 ---------
 utils/sqlite_download.sh | 71 ----------
 utils/sqlite_extract_data.js | 212 ------------------------------
 27 files changed, 115 insertions(+), 1428 deletions(-)
 delete mode 100644 src/components/loadJSON.js
 delete mode 100644 src/components/metadataStream.js
 delete mode 100644 src/components/parseMetaFiles.js
 delete mode 100644 src/wofIdToPath.js
 delete mode 100644 test/components/loadJSONTest.js
 delete mode 100644 test/components/metadataStream.js
 delete mode 100644 test/components/parseMetaFiles.js
 delete mode 100644 test/components/wofIdToPath.js
 delete mode 100644 utils/download_data.js
 delete mode 100644 utils/download_data_all.js
 delete mode 100644 utils/sqlite_common.js
 delete mode 100644 utils/sqlite_download.js
 delete mode 100755 utils/sqlite_download.sh
 delete mode 100644 utils/sqlite_extract_data.js

diff --git a/README.md b/README.md
index 2ee40c38..c0876c89 100644
--- a/README.md
+++ b/README.md
@@ -39,26 +39,25 @@ The following configuration options are supported by this importer.

 Full path to where Who's on First data is located (note: the included [downloader script](#downloading-the-data) will automatically place the WOF data here, and is the recommended way to obtain WOF data)

-### `imports.whosonfirst.importPlace`
+### `imports.whosonfirst.countryCode`

 * Required: no
 * Default: ``

-Set to a WOF ID or array of IDs to import data only for descendants of those records, rather than the entire planet.
+Set the country code (or array of country codes) of the SQLite databases to download. Geocode Earth provides two types of SQLite extracts:
+- [combined](https://geocode.earth/data/whosonfirst/combined): databases of the whole planet for `Administrative Boundaries`, `Postal Code` and `Constituencies`
+- [single country](https://geocode.earth/data/whosonfirst): per-country databases for `Administrative Boundaries`, `Postal Code` and `Constituencies`

-You can use the [Who's on First Spelunker](https://spelunker.whosonfirst.org) or the `source_id` field from any WOF result of a Pelias query to determine these values.
-
-Specifying a value for `importPlace` will download the full planet SQLite database (27GB). 
Support for individual country downloads [may be added in the future](https://github.com/pelias/whosonfirst/issues/459) - -### `imports.whosonfirst.importVenues` +### `imports.whosonfirst.importPlace` * Required: no -* Default: `false` +* Default: `` -Set to true to enable importing venue records. There are over 15 million venues so this option will add substantial download and disk usage requirements. +Set to a WOF ID or array of IDs to import data only for descendants of those records, rather than the entire planet. -It is currently [not recommended to import venues](https://github.com/pelias/whosonfirst/issues/94). +You can use the [Who's on First Spelunker](https://spelunker.whosonfirst.org) or the `source_id` field from any WOF result of a Pelias query to determine these values. +Specifying a value for `importPlace` will download the full planet SQLite database (27GB). Support for individual country downloads [may be added in the future](https://github.com/pelias/whosonfirst/issues/459) ### `imports.whosonfirst.importPostalcodes` @@ -67,15 +66,6 @@ It is currently [not recommended to import venues](https://github.com/pelias/who Set to true to enable importing postalcode records. There are over 3 million postal code records. -### `imports.whosonfirst.missingFilesAreFatal` - -* Required: no -* Default: `false` - -Set to `true` for missing files from [Who's on First bundles](https://dist.whosonfirst.org/bundles/) to stop the import process. - -This flag is useful if you consider it vital that all Who's on First data is successfully imported, and can be helpful to guard against incomplete downloads or other types of failure. - ### `imports.whosonfirst.maxDownloads` * Required: no @@ -86,25 +76,21 @@ The maximum number of files to download simultaneously. Higher values can be fas ### `imports.whosonfirst.dataHost` * Required: no -* Default: `https://dist.whosonfirst.org/` +* Default: `https://data.geocode.earth/wof/dist` The location to download Who's on First data from. Changing this can be useful to use custom data, pin data to a specific date, etc. ### `imports.whosonfirst.sqlite` * Required: no -* Default: `false` +* Default: `true` Set to `true` to use Who's on First SQLite databases instead of GeoJSON bundles. SQLite databases take up less space on disk and can be much more efficient to download and extract. -This option may [become the default in the near future](https://github.com/pelias/whosonfirst/issues/460). - -However, both the Who's on First processes to generate -these files and the Pelias code to use them is new and not yet considered -production ready. +This option [is the default](https://github.com/pelias/whosonfirst/issues/460). ## Downloading the Data @@ -169,9 +155,6 @@ Other types may be included in the future. 
This project exposes a number of node streams for dealing with Who's on First data and metadata files: -- `metadataStream`: streams rows from a Who's on First metadata file -- `parseMetaFiles`: CSV parse stream configured for metadata file contents -- `loadJSON`: parallel stream that asynchronously loads GeoJSON files - `recordHasIdAndProperties`: rejects Who's on First records missing id or properties - `isActiveRecord`: rejects records that are superseded, deprecated, or otherwise inactive - `isNotNullIslandRelated`: rejects [Null Island](https://spelunker.whosonfirst.org/id/1) and other records that intersect it (currently just postal codes at 0/0) diff --git a/bin/download b/bin/download index 20859a2a..21f32e70 100755 --- a/bin/download +++ b/bin/download @@ -1,3 +1,3 @@ #!/bin/bash -exec node ./utils/download_data.js +exec node ./utils/download_sqlite_all.js diff --git a/index.js b/index.js index 6a772608..65df9dcf 100644 --- a/index.js +++ b/index.js @@ -1,9 +1,6 @@ module.exports = { - metadataStream: require('./src/components/metadataStream'), isActiveRecord: require('./src/components/isActiveRecord').create, isNotNullIslandRelated: require('./src/components/isNotNullIslandRelated').create, - loadJSON: require('./src/components/loadJSON').create, - parseMetaFiles: require('./src/components/parseMetaFiles').create, recordHasIdAndProperties: require('./src/components/recordHasIdAndProperties').create, recordHasName: require('./src/components/recordHasName').create, conformsTo: require('./src/components/conformsTo').create, diff --git a/package.json b/package.json index 265fd25f..b9acdc06 100644 --- a/package.json +++ b/package.json @@ -29,16 +29,13 @@ "async": "^3.0.1", "better-sqlite3": "^6.0.0", "combined-stream": "^1.0.5", - "command-exists": "^1.2.8", - "csv-stream": "^0.2.0", + "command-exists": "^1.2.9", "download-file-sync": "^1.0.4", "fs-extra": "^8.0.0", "iso3166-1": "^0.5.0", - "klaw-sync": "^6.0.0", "lodash": "^4.5.1", - "parallel-transform": "^1.1.0", "pelias-blacklist-stream": "^1.0.0", - "pelias-config": "^4.9.0", + "pelias-config": "^4.9.1", "pelias-dbclient": "^2.13.0", "pelias-logger": "^1.2.1", "pelias-model": "^7.1.0", diff --git a/schema.js b/schema.js index 88e0f2b1..45d8957c 100644 --- a/schema.js +++ b/schema.js @@ -5,17 +5,17 @@ const Joi = require('@hapi/joi'); // * imports.whosonfirst.datapath (string) // // optional: -// * imports.whosonfirst.importVenues (boolean) (default: false) +// * imports.whosonfirst.countryCode (string OR array[string]) (default: []) // * imports.whosonfirst.importPostalcodes (boolean) (default: false) // * imports.whosonfirst.importConstituencies (boolean) (default: false) // * imports.whosonfirst.importIntersections (boolean) (default: false) // * imports.whosonfirst.importPlace (integer OR array[integer]) (default: none) -// * imports.whosonfirst.missingFilesAreFatal (boolean) (default: false) +// * imports.whosonfirst.sqlite (boolean) (default: true) module.exports = Joi.object().keys({ imports: Joi.object().required().keys({ whosonfirst: Joi.object().required().keys({ - countries: Joi.alternatives().try( + countryCode: Joi.alternatives().try( Joi.string(), Joi.array().items(Joi.string()).default([]) ).default([]), @@ -28,10 +28,8 @@ module.exports = Joi.object().keys({ importVenues: Joi.boolean().default(false).truthy('yes').falsy('no'), importPostalcodes: Joi.boolean().default(false).truthy('yes').falsy('no'), importConstituencies: Joi.boolean().default(false).truthy('yes').falsy('no'), - importIntersections: 
Joi.boolean().default(false).truthy('yes').falsy('no'), - missingFilesAreFatal: Joi.boolean().default(false).truthy('yes').falsy('no'), maxDownloads: Joi.number().integer(), - sqlite: Joi.boolean().default(false).truthy('yes').falsy('no') + sqlite: Joi.boolean().default(true).truthy('yes').falsy('no') }).unknown(false) }).unknown(true) }).unknown(true); diff --git a/src/bundleList.js b/src/bundleList.js index ecd8ceed..36f66be1 100644 --- a/src/bundleList.js +++ b/src/bundleList.js @@ -1,9 +1,6 @@ -const readline = require('readline'); const fs = require('fs-extra'); const path = require('path'); -const downloadFileSync = require('download-file-sync'); const _ = require('lodash'); -const klawSync = require('klaw-sync'); const peliasConfig = require( 'pelias-config' ).generate(require('../schema')); @@ -37,21 +34,11 @@ const postalcodeRoles = [ 'postalcode' ]; -const venueRoles = [ - 'venue' -]; - const SQLITE_REGEX = /whosonfirst-data-[a-z0-9-]+\.db$/; function getPlacetypes() { let roles = hierarchyRoles; - // admin-only env var should override the config setting since the hierarchy bundles are useful - // on their own to allow other importers to start when using admin lookup - if (peliasConfig.imports.whosonfirst.importVenues && process.argv[2] !== '--admin-only') { - roles = roles.concat(venueRoles); - } - if (peliasConfig.imports.whosonfirst.importPostalcodes && process.argv[2] !== '--admin-only') { roles = roles.concat(postalcodeRoles); } @@ -59,69 +46,6 @@ function getPlacetypes() { return roles; } -function ensureBundleIndexExists(metaDataPath) { - const wofDataHost = peliasConfig.get('imports.whosonfirst.dataHost') || 'https://dist.whosonfirst.org'; - const bundleIndexFile = path.join(metaDataPath, 'whosonfirst_bundle_index.txt'); - const bundleIndexUrl = `${wofDataHost}/bundles/index.txt`; - - //ensure required directory structure exists - fs.ensureDirSync(metaDataPath); - - if (!fs.existsSync(bundleIndexFile)) { - - const klawOptions = { - nodir: true, - filter: (f) => (f.path.indexOf('-latest.csv') !== -1) - }; - const metaFiles = _.map(klawSync(metaDataPath, klawOptions), - (f) => (path.basename(f.path))); - - // if there are no existing meta files and the bundle index file is not found, - // download bundle index - if (_.isEmpty(metaFiles)) { - fs.writeFileSync(bundleIndexFile, downloadFileSync(bundleIndexUrl)); - } - else { - fs.writeFileSync(bundleIndexFile, metaFiles.join('\n')); - } - } -} - -function getBundleList(callback) { - const metaDataPath = path.join(peliasConfig.imports.whosonfirst.datapath, 'meta'); - const bundleIndexFile = path.join(metaDataPath, 'whosonfirst_bundle_index.txt'); - - ensureBundleIndexExists(metaDataPath); - - const roles = getPlacetypes(); - - // the order in which the bundles are listed is critical to the correct execution - // of the admin hierarchy lookup code in whosonfirst importer, - // so in order to preserve the order specified by the roles list - // we must collect the bundles from the index files by buckets - // and then at the end merge all the buckets into a single ordered array - const bundleBuckets = initBundleBuckets(roles); - - const rl = readline.createInterface({ - input: fs.createReadStream(bundleIndexFile) - }); - - rl.on('line', (line) => { - - const parts = line.split(' '); - const record = parts[parts.length - 1]; - - sortBundleByBuckets(roles, record, bundleBuckets); - - }).on('close', () => { - - const bundles = _.sortedUniq(combineBundleBuckets(roles, bundleBuckets)); - - callback(null, bundles); - - }); -} - function 
getDBList(callback) { const databasesPath = path.join(peliasConfig.imports.whosonfirst.datapath, 'sqlite'); //ensure required directory structure exists @@ -138,36 +62,7 @@ function getList(callback) { if (peliasConfig.imports.whosonfirst.sqlite) { return getDBList(callback); } - getBundleList(callback); -} - -function initBundleBuckets(roles) { - const bundleBuckets = {}; - roles.forEach( (role) => { - bundleBuckets[role] = []; - }); - return bundleBuckets; -} - -function sortBundleByBuckets(roles, bundle, bundleBuckets) { - roles.forEach((role) => { - // search for the occurrence of role-latest-bundle, like region-latest-bundle - // timestamped bundles should be skipped as they are of the format role-timestamp-bundle - const validBundleRegex = new RegExp(`${role}-[\\w-]*latest`); - if (validBundleRegex.test( bundle ) ) { - bundleBuckets[role].push(bundle); - } - }); -} - -function combineBundleBuckets(roles, bundleBuckets) { - let bundles = []; - - roles.forEach( (role) => { - bundles = _.concat(bundles, _.get(bundleBuckets, role, [])); - }); - - return bundles; + callback('Bundles are no longer supported!'); } module.exports.getPlacetypes = getPlacetypes; diff --git a/src/components/loadJSON.js b/src/components/loadJSON.js deleted file mode 100644 index 4681a2ee..00000000 --- a/src/components/loadJSON.js +++ /dev/null @@ -1,47 +0,0 @@ -const path = require('path'); -const fs = require('fs'); -const parallelTransform = require('parallel-transform'); -const wofIdToPath = require('../wofIdToPath'); - -const maxInFlight = 10; - -const logger = require( 'pelias-logger' ).get( 'whosonfirst' ); - -module.exports.create = function create(wofRoot, missingFilesAreFatal) { - return parallelTransform(maxInFlight, function(record, next) { - - if (!record.path || record.path === 'path') { - // we can generate the record path if column not present in metadata - record.path = wofIdToPath(record.id).concat(record.id+'.geojson').join(path.sep); - - // failed to infer the data disk path - if(!path.length){ - logger.warn('WOF record has no path', record); - return next(); - } - } - - const full_file_path = path.join(wofRoot, 'data', record.path); - - fs.readFile(full_file_path, (err, data) => { - if (err) { - logger.error(err.message); - - // only forward the error is missing files should be fatal - return next(missingFilesAreFatal ? 
err : null); - - } - - try { - next(null, JSON.parse(data)); - - } catch (parse_err) { - logger.error(`exception parsing JSON for id ${record.id} in file ${record.path}: ${parse_err}`); - next(parse_err); - - } - - }); - - }); -}; diff --git a/src/components/metadataStream.js b/src/components/metadataStream.js deleted file mode 100644 index c6b237af..00000000 --- a/src/components/metadataStream.js +++ /dev/null @@ -1,10 +0,0 @@ -const fs = require('fs'); -const path = require('path'); - -module.exports = (wofRoot) => { - return { - create: (placetype) => { - return fs.createReadStream(path.join(wofRoot, 'meta', `whosonfirst-data-${placetype}-latest.csv`)); - } - }; -}; diff --git a/src/components/parseMetaFiles.js b/src/components/parseMetaFiles.js deleted file mode 100644 index aed4d61c..00000000 --- a/src/components/parseMetaFiles.js +++ /dev/null @@ -1,16 +0,0 @@ -const csv_stream = require('csv-stream'); -const EOL = require('os').EOL; - -// this CSV parser assumes that: -// - the first line contains column names -// - the delimiter is a comma - -const options = { - escapeChar : '"', // default is an empty string - enclosedChar : '"', // default is an empty string - endLine: EOL -}; - -module.exports.create = function create() { - return csv_stream.createStream(options); -}; diff --git a/src/readStream.js b/src/readStream.js index daee2dbd..6c78965e 100644 --- a/src/readStream.js +++ b/src/readStream.js @@ -1,14 +1,10 @@ -var combinedStream = require('combined-stream'); -var fs = require('fs'); -var through2 = require('through2'); -var path = require('path'); +const combinedStream = require('combined-stream'); +const through2 = require('through2'); +const path = require('path'); const logger = require( 'pelias-logger' ).get( 'whosonfirst' ); const getPlacetypes = require('./bundleList').getPlacetypes; -const parseMetaFiles = require('./components/parseMetaFiles'); -const isNotNullIslandRelated = require('./components/isNotNullIslandRelated'); -const loadJSON = require('./components/loadJSON'); const recordHasIdAndProperties = require('./components/recordHasIdAndProperties'); const isActiveRecord = require('./components/isActiveRecord'); const extractFields = require('./components/extractFields'); @@ -16,15 +12,6 @@ const recordHasName = require('./components/recordHasName'); const SQLiteStream = require('./components/sqliteStream'); const toJSONStream = require('./components/toJSONStream'); -/* - * Convert a base directory and list of types into a list of meta file paths - */ -function getMetaFilePaths(wofRoot, bundles) { - return bundles.map((bundle) => { - return path.join(wofRoot, 'meta', bundle); - }); -} - /* * Convert a base directory and list of databases names into a list of sqlite file paths */ @@ -34,45 +21,12 @@ function getSqliteFilePaths(wofRoot, databases) { }); } -/* - * Given the path to a meta CSV file, return a stream of the individual records - * within that CSV file. - */ -function createOneMetaRecordStream(metaFilePath) { - // All of these arguments are optional. 
- const options = { - escapeChar : '"', // default is an empty string - enclosedChar : '"' // default is an empty string - }; - - return fs.createReadStream(metaFilePath) - .pipe(parseMetaFiles.create()); -} - -/* - * given a list of meta file paths, create a combined stream that reads all the - * records via the csv parser - */ -function createMetaRecordStream(metaFilePaths, types) { - const metaRecordStream = combinedStream.create(); - - metaFilePaths.forEach((metaFilePath) => { - metaRecordStream.append( (next) => { - logger.info( `Loading ${path.basename(metaFilePath)} records from ${path.dirname(metaFilePath)}` ); - next(createOneMetaRecordStream(metaFilePath)); - }); - }); - - return metaRecordStream; -} - /* * given a list of databases file paths, create a combined stream that reads all the * records via the SQLite reader stream */ function createSQLiteRecordStream(dbPaths, importPlace) { const sqliteStream = combinedStream.create(); - dbPaths.forEach((dbPath) => { getPlacetypes().forEach(placetype => { sqliteStream.append( (next) => { @@ -89,24 +43,14 @@ function createSQLiteRecordStream(dbPaths, importPlace) { } /* - This function creates a stream that processes files in `meta/`: - CSV parses them, extracts the required fields, stores only admin records for - later, and passes all records on for further processing + This function creates a stream that processes files in `sqlite/`: + It will load all geojson in all sqlite in the folder */ function createReadStream(wofConfig, types, wofAdminRecords) { const wofRoot = wofConfig.datapath; - const metaFilePaths = getMetaFilePaths(wofRoot, types); - - // Select correct stream between meta and SQLite based on config and do specialized stuff - const stream = wofConfig.sqlite === true ? - createSQLiteRecordStream(getSqliteFilePaths(wofRoot, types), wofConfig.importPlace) - .pipe(toJSONStream.create()) : - createMetaRecordStream(metaFilePaths, types) - .pipe(isNotNullIslandRelated.create()) - .pipe(loadJSON.create(wofRoot, wofConfig.missingFilesAreFatal)); - // All the pipeline is the same for both meta and SQLite streams - return stream + return createSQLiteRecordStream(getSqliteFilePaths(wofRoot, types), wofConfig.importPlace) + .pipe(toJSONStream.create()) .pipe(recordHasIdAndProperties.create()) .pipe(isActiveRecord.create()) .pipe(extractFields.create()) diff --git a/src/wofIdToPath.js b/src/wofIdToPath.js deleted file mode 100644 index 7a759ae9..00000000 --- a/src/wofIdToPath.js +++ /dev/null @@ -1,15 +0,0 @@ -'use strict'; - -// convert wofid integer to array of path components -function wofIdToPath( id ){ - let strId = id.toString(); - let parts = []; - while( strId.length ){ - let part = strId.substr(0, 3); - parts.push(part); - strId = strId.substr(3); - } - return parts; -} - -module.exports = wofIdToPath; diff --git a/test/bundleList.js b/test/bundleList.js index 501e21f5..979804bd 100644 --- a/test/bundleList.js +++ b/test/bundleList.js @@ -31,10 +31,6 @@ const POSTALCODES = [ 'postalcode' ]; -const VENUES = [ - 'venue' -]; - const SQLITE_EXAMPLE = [ 'whosonfirst-data-constituency-us-ct-1481486175.db', 'whosonfirst-data-latest.db', @@ -43,152 +39,6 @@ const SQLITE_EXAMPLE = [ ]; tape('bundlesList tests', (test) => { - test.test('all bundles', (t) => { - - const config = { - generate: () => { - return peliasConfig.generateCustom({ - imports: { - whosonfirst: { - datapath: 'foo', - importVenues: true, - importPostalcodes: true - } - } - }); - } - }; - - const bundles = proxyquire('../src/bundleList', { 'pelias-config': config }); 
- - const expected = ADMIN.concat(POSTALCODES).concat(VENUES); - - bundles.generateBundleList((err, bundlesList) => { - expected.every((type) => { - const found = bundlesList.some((bundle) => { - return bundle.indexOf(type) !== -1; - }); - t.assert(found, type + ' bundle(s) missing'); - return found; - }); - fs.removeSync('foo'); - t.end(); - }); - }); - - test.test('region venue bundles', (t) => { - const config = { - generate: () => { - return peliasConfig.generateCustom({ - imports: { - whosonfirst: { - datapath: 'foo', - importVenues: true, - importPostalcodes: true - } - } - }); - } - }; - - const bundles = proxyquire('../src/bundleList', { 'pelias-config': config }); - - bundles.generateBundleList((err, bundlesList) => { - t.assert(bundlesList.includes('whosonfirst-data-venue-us-ca-latest.tar.bz2'), 'venue bundle for regions are included'); - fs.removeSync('foo'); - t.end(); - }); - }); - - test.test('admin only bundles', (t) => { - - const config = { - generate: () => { - return peliasConfig.generateCustom({ - imports: { - whosonfirst: { - datapath: 'foo', - importPostalcodes: false - } - } - }); - } - }; - - const bundles = proxyquire('../src/bundleList', { 'pelias-config': config }); - - const expected = ADMIN; - const unexpected = POSTALCODES.concat(VENUES); - - bundles.generateBundleList((err, bundlesList) => { - expected.every((type) => { - const found = bundlesList.some((bundle) => { - return bundle.indexOf(type) !== -1; - }); - t.assert(found, type + ' bundle(s) missing'); - return found; - }); - - unexpected.every((type) => { - const found = bundlesList.some((bundle) => { - return bundle.indexOf(type) !== -1; - }); - t.assert(!found, type + ' bundle(s) should not be there'); - return !found; - }); - fs.removeSync('foo'); - t.end(); - }); - }); - - test.test('--admin-only flag', (t) => { - - const config = { - generate: () => { - return peliasConfig.generateCustom({ - imports: { - whosonfirst: { - datapath: 'foo', - importVenues: true, - importPostalcodes: true - } - } - }); - } - }; - - const previousValue = process.argv[2]; - process.argv[2] = '--admin-only'; - - const bundles = proxyquire('../src/bundleList', { 'pelias-config': config }); - - const expected = ADMIN; - const unexpected = POSTALCODES.concat(VENUES); - - bundles.generateBundleList((err, bundlesList) => { - expected.every((type) => { - const found = bundlesList.some((bundle) => { - return bundle.indexOf(type) !== -1; - }); - t.assert(found, type + ' bundle(s) missing'); - return found; - }); - - t.deepEquals(bundlesList, _.sortedUniq(bundlesList), 'no duplicates should exist in the bundle list'); - - unexpected.every((type) => { - const found = bundlesList.some((bundle) => { - return bundle.indexOf(type) !== -1; - }); - t.assert(!found, type + ' bundle(s) should not be there'); - return !found; - }); - fs.removeSync('foo'); - t.end(); - - process.argv[2] = previousValue; - }); - }); test.test('supports sqlite', (t) => { temp.mkdir('supports_sqlite', (err, temp_dir) => { diff --git a/test/components/loadJSONTest.js b/test/components/loadJSONTest.js deleted file mode 100644 index f99655aa..00000000 --- a/test/components/loadJSONTest.js +++ /dev/null @@ -1,142 +0,0 @@ -const tape = require('tape'); -const event_stream = require('event-stream'); -const path = require('path'); -const fs = require('fs'); -const temp = require('temp').track(); -const proxyquire = require('proxyquire').noCallThru(); - -function test_stream(input, testedStream, callback, error_callback) { - if (!error_callback) { - error_callback = () 
=> {}; - } - - if (!callback) { - callback = function() {}; - } - - const input_stream = event_stream.readArray(input); - const destination_stream = event_stream.writeArray(callback); - - input_stream - .pipe(testedStream) - .on('error', error_callback) - .pipe(destination_stream); - -} - -tape('loadJSON tests', (test) => { - test.test('json should be loaded from file', (t) => { - temp.mkdir('tmp_wof_data', (err, temp_dir) => { - fs.mkdirSync(path.join(temp_dir, 'data')); - - // write the contents to a file - const filename = path.join(temp_dir, 'data', 'datafile.geojson'); - fs.writeFileSync(filename, '{ "a": 1, "b": 2 }\n'); - - const loadJSON = require('../../src/components/loadJSON'); - - const input = { - path: path.basename(filename) - }; - - test_stream([input], loadJSON.create(temp_dir), (err, actual) => { - temp.cleanupSync(); - t.deepEqual(actual, [{ a: 1, b: 2 }], 'should be equal'); - t.end(); - }); - - }); - - }); - - test.test('invalid JSON should log an error and not pass along anything', (t) => { - const logger = require('pelias-mock-logger')(); - - temp.mkdir('tmp_wof_data', (err, temp_dir) => { - fs.mkdirSync(path.join(temp_dir, 'data')); - - const loadJSON = proxyquire('../../src/components/loadJSON', { - 'pelias-logger': logger - }); - - // write the contents to a file - const filename = path.join(temp_dir, 'data', 'datafile.geojson'); - fs.writeFileSync(filename, 'this is not json\n'); - - const input = { - id: '17', - path: path.basename(filename) - }; - - test_stream([input], loadJSON.create(temp_dir), undefined, (err, actual) => { - temp.cleanupSync(); - t.deepEqual(actual, undefined, 'an error should be thrown'); - t.ok(logger.isErrorMessage(new RegExp( - `exception parsing JSON for id 17 in file ${path.basename(filename)}: SyntaxError: Unexpected token h.*`))); - t.end(); - }); - - }); - - }); - - test.test('missing file should be non-fatal by default', (t) => { - const logger = require('pelias-mock-logger')(); - - temp.mkdir('tmp_wof_data', (err, temp_dir) => { - fs.mkdirSync(path.join(temp_dir, 'data')); - - const loadJSON = proxyquire('../../src/components/loadJSON', { - 'pelias-logger': logger - }); - - // non-existent file - const input = { - path: 'datafile.geojson' - }; - - test_stream([input], loadJSON.create(temp_dir), (err, actual) => { - temp.cleanupSync(); - t.ok(logger.isErrorMessage(/ENOENT: no such file or directory/), 'error output present'); - t.end(); - }, (err) => { - // because loadJSON uses parallelStream internally, the only way to test - // that the error wasn't passed to next() is to handle in the error callback - t.fail('error callback should not have been called since missing files are non-fatal'); - }); - - }); - - }); - - test.test('missing file should be fatal when specified by parameter', (t) => { - const logger = require('pelias-mock-logger')(); - - temp.mkdir('tmp_wof_data', (err, temp_dir) => { - fs.mkdirSync(path.join(temp_dir, 'data')); - - const loadJSON = proxyquire('../../src/components/loadJSON', { - 'pelias-logger': logger - }); - - // non-existent file - const input = { - path: 'datafile.geojson' - }; - - test_stream([input], loadJSON.create(temp_dir, true), (err, actual) => { - t.ok(logger.isErrorMessage(/ENOENT: no such file or directory/), 'error output present'); - temp.cleanupSync(); - t.end(); - }, (err) => { - // because loadJSON uses parallelStream internally, the only way to test - // that the error was passed to next() is to handle in the error callback - t.ok(err); - }); - - }); - - }); - test.end(); - -}); diff 
--git a/test/components/metadataStream.js b/test/components/metadataStream.js deleted file mode 100644 index 11102f1e..00000000 --- a/test/components/metadataStream.js +++ /dev/null @@ -1,41 +0,0 @@ - -const tape = require('tape'); -const fs = require('fs'); -const path = require('path'); -const temp = require('temp').track(); -const through2 = require('through2'); - -tape('metadataStream tests', (test) => { - test.test('should read all data', (t) => { - temp.mkdir('tmp_wof_root', (err, temp_dir) => { - fs.mkdirSync(path.join(temp_dir, 'meta')); - - // write some data to a file that will be read - fs.writeFileSync( - path.join(temp_dir, 'meta', 'whosonfirst-data-my_placetype-latest.csv'), - 'some metadata'); - - const metadataStream = require('../../src/components/metadataStream')(temp_dir); - const stream = metadataStream.create('my_placetype'); - - let contents = ''; - - stream.pipe(through2.obj(function(data, enc, next) { - contents += data; - next(); - - })).on('finish', () => { - temp.cleanup((err, stats) => { - t.notOk(err); - t.deepEqual(contents, 'some metadata', 'should be equal'); - t.end(); - }); - - }); - - }); - - }); - test.end(); - -}); diff --git a/test/components/parseMetaFiles.js b/test/components/parseMetaFiles.js deleted file mode 100644 index 0bb1159b..00000000 --- a/test/components/parseMetaFiles.js +++ /dev/null @@ -1,33 +0,0 @@ -const tape = require('tape'); -const Readable = require('stream').Readable; -const through2 = require('through2'); -const EOL = require('os').EOL; - -const parseMetaFiles = require('../../src/components/parseMetaFiles'); - -tape('parseMetaFiles tests', (test) => { - test.test('first row should be column names with " for escape/enclosing', (t) => { - const s = new Readable(); - s.push(`id,name${EOL}`); - s.push(`1,name1${EOL}`); - s.push(`2,name","2${EOL}`); - s.push(`3,"name,3"${EOL}`); - s.push(null); - - const expected = [ - { id: '1', name: 'name1'}, - { id: '2', name: 'name,2'}, - { id: '3', name: 'name,3'} - ]; - - s.pipe(parseMetaFiles.create()).pipe(through2.obj(function(record, enc, next) { - t.deepEquals(record, expected.shift()); - next(); - })).on('finish', () => { - t.end(); - }); - - }); - test.end(); - -}); diff --git a/test/components/wofIdToPath.js b/test/components/wofIdToPath.js deleted file mode 100644 index a06d0b31..00000000 --- a/test/components/wofIdToPath.js +++ /dev/null @@ -1,43 +0,0 @@ -const tape = require('tape'); -const wofIdToPath = require('../../src/wofIdToPath'); - -tape('wofIdToPath', (t) => { - t.test('invalid path', (t) => { - t.deepEqual(wofIdToPath(''), [], 'should be empty'); - t.end(); - }); - t.test('9 digit string', (t) => { - t.deepEqual(wofIdToPath('123456789'), ['123', '456', '789']); - t.end(); - }); - t.test('9 digit integer', (t) => { - t.deepEqual(wofIdToPath(123456789), ['123', '456', '789']); - t.end(); - }); - t.test('10 digit string', (t) => { - t.deepEqual(wofIdToPath('1234567890'), ['123', '456', '789', '0']); - t.end(); - }); - t.test('10 digit integer', (t) => { - t.deepEqual(wofIdToPath(1234567890), ['123', '456', '789', '0']); - t.end(); - }); - t.test('1 digit string', (t) => { - t.deepEqual(wofIdToPath('1'), ['1']); - t.end(); - }); - t.test('1 digit integer', (t) => { - t.deepEqual(wofIdToPath(1), ['1']); - t.end(); - }); - t.test('0 string', (t) => { - t.deepEqual(wofIdToPath('0'), ['0']); - t.end(); - }); - t.test('0 integer', (t) => { - t.deepEqual(wofIdToPath(0), ['0']); - t.end(); - }); - t.end(); - -}); diff --git a/test/readStreamTest.js b/test/readStreamTest.js index 
d20facb6..b4774bb7 100644 --- a/test/readStreamTest.js +++ b/test/readStreamTest.js @@ -14,16 +14,29 @@ tape('readStream', (test) => { 'pelias-logger': logger }); - temp.mkdir('tmp_wof', (err, temp_dir) => { - fs.mkdirSync(path.join(temp_dir, 'meta')); - fs.mkdirSync(path.join(temp_dir, 'data')); + temp.mkdir('tmp_sqlite', (err, temp_dir) => { + generateWOFDB(path.join(temp_dir, 'sqlite', 'whosonfirst-data-admin-xy-latest.db'), [ + { + id: 123, + properties: { + 'wof:name': 'name 1', + 'wof:placetype': 'region', + 'geom:latitude': 12.121212, + 'geom:longitude': 21.212121, + 'wof:abbreviation': 'XY', + 'geom:bbox': '-13.691314,49.909613,1.771169,60.847886', + 'gn:population': 98765, + 'misc:photo_sum': 87654 + } + } + ]); - fs.writeFileSync(path.join(temp_dir, 'meta', 'whosonfirst-data-type1-latest.csv'), 'id,path\n123,123.geojson\n'); - fs.writeFileSync(path.join(temp_dir, 'data', '123.geojson'), JSON.stringify({ + generateWOFDB(path.join(temp_dir, 'sqlite', 'whosonfirst-data-admin-xx-latest.db'), [ + { id: 123, properties: { 'wof:name': 'name 1', - 'wof:placetype': 'place type 1', + 'wof:placetype': 'region', 'geom:latitude': 12.121212, 'geom:longitude': 21.212121, 'wof:abbreviation': 'XY', @@ -31,27 +44,19 @@ tape('readStream', (test) => { 'gn:population': 98765, 'misc:photo_sum': 87654 } - })); - - // write out second meta and data files - fs.writeFileSync(path.join(temp_dir, 'meta', 'whosonfirst-data-type2-latest.csv'), 'id,path\n456,456.geojson\n'); - fs.writeFileSync(path.join(temp_dir, 'data', '456.geojson'), JSON.stringify({ + }, + { id: 456, properties: { 'wof:name': 'name 2', - 'wof:placetype': 'place type 2', + 'wof:placetype': 'localadmin', 'geom:latitude': 13.131313, 'geom:longitude': 31.313131, 'wof:abbreviation': 'XY', 'geom:bbox': '-24.539906,34.815009,69.033946,81.85871' } - })); - - // write out third meta and data files that are ignored - // it will be ignored since 'type3' is not passed as a supported type - // this shows that types are supported instead of all files being globbed - fs.writeFileSync(path.join(temp_dir, 'meta', 'whosonfirst-data-type3-latest.csv'), 'id,path\n789,789.geojson\n'); - fs.writeFileSync(path.join(temp_dir, 'data', '789.geojson'), JSON.stringify({ + }, + { id: 789, properties: { 'wof:name': 'name 3', @@ -60,15 +65,16 @@ tape('readStream', (test) => { 'geom:longitude': 41.414141, 'geom:bbox': '-24.539906,34.815009,69.033946,81.85871' } - })); + } + ]); const wofConfig = { datapath: temp_dir, - missingFilesAreFatal: false + sqlite: true }; const wofAdminRecords = {}; - const filenames = ['whosonfirst-data-type1-latest.csv', 'whosonfirst-data-type2-latest.csv']; + const filenames = ['whosonfirst-data-admin-xy-latest.db', 'whosonfirst-data-admin-xx-latest.db']; const stream = readStream.create(wofConfig, filenames, wofAdminRecords); stream.on('finish', _ => { @@ -80,7 +86,7 @@ tape('readStream', (test) => { name: 'name 1', name_aliases: [], name_langs: {}, - place_type: 'place type 1', + place_type: 'region', lat: 12.121212, lon: 21.212121, abbreviation: 'XY', @@ -88,7 +94,7 @@ tape('readStream', (test) => { population: 98765, popularity: 87654, hierarchies: [ - { 'place type 1_id': 123 } + { 'region_id': 123 } ] }, '456': { @@ -96,7 +102,7 @@ tape('readStream', (test) => { name: 'name 2', name_aliases: [], name_langs: {}, - place_type: 'place type 2', + place_type: 'localadmin', lat: 13.131313, lon: 31.313131, abbreviation: 'XY', @@ -104,85 +110,28 @@ tape('readStream', (test) => { population: undefined, popularity: undefined, hierarchies: [ - { 
'place type 2_id': 456 } + { 'localadmin_id': 456 } ] } }); - t.deepEquals(logger.getInfoMessages(), [ - `Loading whosonfirst-data-type1-latest.csv records from ${temp_dir}/meta`, - `Loading whosonfirst-data-type2-latest.csv records from ${temp_dir}/meta` - ]); - t.end(); - - }); - - }); + const xyMessages = logger.getDebugMessages().filter(m => m.indexOf('whosonfirst-data-admin-xy-latest.db') >= 0); + const xxMessages = logger.getDebugMessages().filter(m => m.indexOf('whosonfirst-data-admin-xx-latest.db') >= 0); - }); - - test.test('missingFilesAreFatal=false from config should be passed to loadJSON', (t) => { - temp.mkdir('tmp_wof', (err, temp_dir) => { - t.plan(2, 'plan for 2 tests so we know that loadJSON was actually used'); - - const readStream = proxyquire('../src/readStream', { - './components/loadJSON': { - create: (wofRoot, missingFilesAreFatal) => { - t.equals(wofRoot, temp_dir); - t.equals(missingFilesAreFatal, false); - return through2.obj(); - } - } - }); - - const wofConfig = { - datapath: temp_dir, - missingFilesAreFatal: false - }; - - const wofAdminRecords = {}; - const stream = readStream.create(wofConfig, [], wofAdminRecords); - - stream.on('finish', _ => { - temp.cleanupSync(); + t.deepEquals(xyMessages.length, 17); + t.deepEquals(xyMessages.length, xxMessages.length); t.end(); }); - }); - }); - - test.test('missingFilesAreFatal=true from config should be passed to loadJSON', (t) => { - temp.mkdir('tmp_wof', (err, temp_dir) => { - t.plan(2, 'plan for 2 tests so we know that loadJSON was actually used'); - - const readStream = proxyquire('../src/readStream', { - './components/loadJSON': { - create: (wofRoot, missingFilesAreFatal) => { - t.equals(wofRoot, temp_dir); - t.equals(missingFilesAreFatal, true); - return through2.obj(); - } - } - }); - - const wofConfig = { - datapath: temp_dir, - missingFilesAreFatal: true - }; - - const wofAdminRecords = {}; - const stream = readStream.create(wofConfig, [], wofAdminRecords); - stream.on('finish', _ => { - temp.cleanupSync(); - t.end(); - }); - - }); }); - test.end(); test.test('load sqlite', t => { + const logger = require('pelias-mock-logger')(); + + const readStream = proxyquire('../src/readStream', { + 'pelias-logger': logger + }); temp.mkdir('tmp_sqlite', (err, temp_dir) => { generateWOFDB(path.join(temp_dir, 'sqlite', 'whosonfirst-data-latest.db'), [ { @@ -227,7 +176,7 @@ tape('readStream', (test) => { } ]); const records = {}; - require('../src/readStream') + readStream .create({datapath: temp_dir, sqlite: true}, ['whosonfirst-data-latest.db'], records) .on('finish', (err) => { t.notOk(err); @@ -247,9 +196,10 @@ tape('readStream', (test) => { hierarchies: [ { 'region_id': 421302191 } ] } }); + t.deepEqual(logger.getDebugMessages().length, 17); t.end(); }); }); }); - + test.end(); }); diff --git a/test/schema.js b/test/schema.js index 2e57c324..5ccd59fe 100644 --- a/test/schema.js +++ b/test/schema.js @@ -16,7 +16,7 @@ tape('tests for looking up hierarchies', function(test) { var config = { imports: { whosonfirst: { - importVenues: true + importPostalcodes: true } } }; @@ -26,7 +26,7 @@ tape('tests for looking up hierarchies', function(test) { }); - test.test('missing importVenues, importPostalcodes, and missingFilesAreFatal should not throw error', function(t) { + test.test('missing importPostalcodes, and missingFilesAreFatal should not throw error', function(t) { var config = { imports: { whosonfirst: { @@ -45,7 +45,7 @@ tape('tests for looking up hierarchies', function(test) { imports: { whosonfirst: { datapath: 
'/path/to/data', - importVenues: true, + importPostalcodes: true, spurious_key: 'value' } } @@ -74,18 +74,18 @@ tape('tests for looking up hierarchies', function(test) { }); - test.test('non-boolean importVenues should throw error', function(t) { + test.test('non-boolean sqlite should throw error', function(t) { [null, 17, {}, [], 'string'].forEach((value) => { var config = { imports: { whosonfirst: { datapath: '/path/to/data', - importVenues: value + sqlite: value } } }; - t.throws(validate.bind(null, config), /"imports.whosonfirst.importVenues" must be a boolean/); + t.throws(validate.bind(null, config), /"imports.whosonfirst.sqlite" must be a boolean/); }); t.end(); @@ -110,31 +110,31 @@ tape('tests for looking up hierarchies', function(test) { }); - test.test('non-boolean missingFilesAreFatal should throw error', function(t) { - [null, 17, {}, [], 'string'].forEach((value) => { + test.test('non-string/array countryCode should throw error', function(t) { + [null, 17, {}, true].forEach((value) => { var config = { imports: { whosonfirst: { datapath: '/path/to/data', - missingFilesAreFatal: value + countryCode: value } } }; - t.throws(validate.bind(null, config), /"imports.whosonfirst.missingFilesAreFatal" must be a boolean/); + t.throws(validate.bind(null, config), `"imports.whosonfirst.countryCode" must be one of [string, array]`); }); t.end(); }); - test.test('case-insensitive \'yes\' and true should be valid importVenues values', function(t) { + test.test('case-insensitive \'yes\' and true should be valid sqlite values', function(t) { [true, 'YeS', 'yEs'].forEach((value) => { var config = { imports: { whosonfirst: { datapath: '/path/to/data', - importVenues: value + sqlite: value } } }; @@ -146,13 +146,13 @@ tape('tests for looking up hierarchies', function(test) { }); - test.test('case-insensitive \'no\' and false should be valid importVenues values', function(t) { + test.test('case-insensitive \'no\' and false should be valid sqlite values', function(t) { [false, 'nO', 'No'].forEach((value) => { var config = { imports: { whosonfirst: { datapath: '/path/to/data', - importVenues: value + sqlite: value } } }; @@ -200,41 +200,6 @@ tape('tests for looking up hierarchies', function(test) { }); - test.test('case-insensitive \'yes\' and true should be valid missingFilesAreFatal values', function(t) { - [true, 'YeS', 'yEs'].forEach((value) => { - var config = { - imports: { - whosonfirst: { - datapath: '/path/to/data', - missingFilesAreFatal: value - } - } - }; - - t.doesNotThrow(validate.bind(null, config)); - }); - - t.end(); - - }); - - test.test('case-insensitive \'no\' and false should be valid missingFilesAreFatal values', function(t) { - [false, 'nO', 'No'].forEach((value) => { - var config = { - imports: { - whosonfirst: { - datapath: '/path/to/data', - missingFilesAreFatal: value - } - } - }; - - t.doesNotThrow(validate.bind(null, config)); - }); - - t.end(); - - }); test.end(); }); diff --git a/test/test.js b/test/test.js index 82a999b2..c9cc59cb 100644 --- a/test/test.js +++ b/test/test.js @@ -2,13 +2,9 @@ require ('./components/conformsTo.js'); require ('./components/extractFieldsTest.js'); require ('./components/isActiveRecordTest.js'); require ('./components/isNotNullIslandRelated.js'); -require ('./components/loadJSONTest.js'); -require ('./components/metadataStream.js'); -require ('./components/parseMetaFiles.js'); require ('./components/recordHasIdAndPropertiesTest.js'); require ('./components/recordHasNameTest.js'); require ('./components/sqliteStream.js'); -require 
('./components/wofIdToPath.js'); require ('./hierarchyFinderTest.js'); require ('./importStreamTest.js'); require ('./peliasDocGeneratorsTest.js'); diff --git a/utils/download_data.js b/utils/download_data.js deleted file mode 100644 index 66d7777f..00000000 --- a/utils/download_data.js +++ /dev/null @@ -1,70 +0,0 @@ - -const config = require( 'pelias-config' ).generate(require('../schema')).imports.whosonfirst; - -function on_done() { - console.log('All done!'); -} - -if( config.importPlace ) { - const download = require('./sqlite_download').download; - const extract = require('./sqlite_extract_data').extract; - const findSubdivisions = require('./sqlite_extract_data').findSubdivisions; - const mainDataDB = 'whosonfirst-data-latest.db'; - - // download main sqlite database file - download({ databases: [ mainDataDB ] }, () => { - - // enumerate additional sqlite databases required - let databases = []; - const subdivisions = findSubdivisions( mainDataDB ); - subdivisions.forEach( subdivision => { - let parts = subdivision.split('-'); - if( 'xx' === parts[0] ){ return; } // ignore 'xx' unspecified subdivisions - if( parts.length > 1 ){ - if( true === config.importVenues ){ - if( 'us' === parts[0] ){ - databases.push(`whosonfirst-data-venue-${subdivision}-latest.db`); - } else { - databases.push(`whosonfirst-data-venue-${parts[0]}-latest.db`); - } - } - if( true === config.importIntersections ){ - if( 'us' === parts[0] ){ - databases.push(`whosonfirst-data-intersection-${subdivision}-latest.db`); - } else { - databases.push(`whosonfirst-data-intersection-${parts[0]}-latest.db`); - } - } - } - else { - if( true === config.importPostalcodes ){ - databases.push(`whosonfirst-data-postalcode-${subdivision}-latest.db`); - } - if( true === config.importConstituencies ){ - databases.push(`whosonfirst-data-constituency-${subdivision}-latest.db`); - } - } - }); - - // dedupe array - databases = databases.filter((item, pos) => databases.indexOf(item) === pos); - - // download additonal database files - download({ databases: databases }, () => { - - // extract all files - console.error('extracting data...'); - extract({ - unlink: true, - databases: [ mainDataDB ].concat( databases ) - }, on_done); - }); - }); -} -else { - if ( config.sqlite ) { - require('./download_sqlite_all').download(on_done); - } else { - require('./download_data_all').download(on_done); - } -} diff --git a/utils/download_data_all.js b/utils/download_data_all.js deleted file mode 100644 index 1409566e..00000000 --- a/utils/download_data_all.js +++ /dev/null @@ -1,72 +0,0 @@ -const child_process = require('child_process'); -const async = require('async'); -const fs = require('fs-extra'); -const os = require('os'); -const path = require('path'); -const commandExistsSync = require('command-exists').sync; - -const bundles = require('../src/bundleList'); -const config = require('pelias-config').generate(require('../schema')); - -const wofDataHost = config.get('imports.whosonfirst.dataHost') || 'https://dist.whosonfirst.org'; - -function download(callback) { - // ensure required directory structure exists - fs.ensureDirSync(path.join(config.imports.whosonfirst.datapath, 'meta')); - - // download one bundle for every other CPU (tar and bzip2 can both max out one core) - // (the maximum is configurable, to keep things from getting too intense, and defaults to 4) - // lower this number to make the downloader more CPU friendly - // raise this number to (possibly) make it faster - const maxSimultaneousDownloads = 
config.get('imports.whosonfirst.maxDownloads') || 4; - const cpuCount = os.cpus().length; - const simultaneousDownloads = Math.max(maxSimultaneousDownloads, Math.min(1, cpuCount / 2)); - - // generate a shell command that does the following: - // 1.) use curl to download the bundle, piping directly to tar (this avoids the - // need for intermediate storage of the archive file) - // 2.) extract the archive so that the data directory goes in the right place and - // the README file is ignored (it just would get overridden by subsequent bundles) - // 3.) move the meta file to the meta files directory - function generateCommand(bundle, directory) { - let extract; - // check if we have lbzip2 installed - if (commandExistsSync('lbzip2')) { - extract = `tar -x --use-compress-program=lbzip2`; - } else { - extract = `tar -xj`; - } - - const csvFilename = bundle - .replace(/-\d{8}T\d{6}-/, '-latest-') // support timestamped downloads - .replace('.tar.bz2', '.csv'); - - return `curl -s ${wofDataHost}/bundles/${bundle} | ${extract} --strip-components=1 --exclude=README.txt -C ` + - `${directory} && mv ${path.join(directory, csvFilename)} ${path.join(directory, 'meta')}`; - } - - bundles.generateBundleList((err, bundlesToDownload) => { - if (err) { - throw new Error(err.message); - } - - const downloadFunctions = bundlesToDownload.map(function (type) { - return function downloadABundle(callback) { - const cmd = generateCommand(type, config.imports.whosonfirst.datapath); - console.log('Downloading ' + type + ' bundle'); - child_process.exec(cmd, function commandCallback(error, stdout, stderr) { - console.log('done downloading ' + type + ' bundle'); - if (error) { - console.error('error downloading ' + type + ' bundle: ' + error); - console.log(stderr); - } - callback(); - }); - }; - }); - - async.parallelLimit(downloadFunctions, simultaneousDownloads, callback); - }); -} - -module.exports.download = download; diff --git a/utils/download_sqlite_all.js b/utils/download_sqlite_all.js index 27d9ff6a..55827bcd 100644 --- a/utils/download_sqlite_all.js +++ b/utils/download_sqlite_all.js @@ -9,10 +9,20 @@ const commandExistsSync = require('command-exists').sync; const config = require('pelias-config').generate(require('../schema')); const DATA_GEOCODE_EARTH_URL = 'https://data.geocode.earth/wof/dist'; -const DATA_WOF_URL = 'https://dist.whosonfirst.org'; -const wofDataHost = config.get('imports.whosonfirst.dataHost') || DATA_WOF_URL; -const COMBINED_REGEX = /^whosonfirst-data-(admin|postalcode|venue)-latest/; -const CONTRY_REGEX = /^whosonfirst-data-(admin|postalcode|venue)-[a-z]{2}-latest/; +const wofDataHost = config.get('imports.whosonfirst.dataHost') || DATA_GEOCODE_EARTH_URL; +const COMBINED_REGEX = /^whosonfirst-data-(admin|postalcode)-latest/; +const COUNTRY_REGEX = /^whosonfirst-data-(admin|postalcode)-[a-z]{2}-latest/; + +function on_done() { + console.log('All done!'); +} + +function getCountriesToDownload() { + const countries = Array.isArray(config.imports.whosonfirst.countryCode) ? + config.imports.whosonfirst.countryCode : [config.imports.whosonfirst.countryCode]; + + return countries.map((c) => { return c.toLowerCase(); }); +} function download(callback) { //ensure required directory structure exists @@ -26,33 +36,20 @@ function download(callback) { const cpuCount = os.cpus().length; const simultaneousDownloads = Math.max(maxSimultaneousDownloads, Math.min(1, cpuCount / 2)); const countryFilter = () => { - const countries = Array.isArray(config.imports.whosonfirst.countries) ? 
- config.imports.whosonfirst.countries : [config.imports.whosonfirst.countries]; + const countries = getCountriesToDownload(); return (e) => { if (countries.length === 0) { - // This is specific to geocode earth + // This is specific to geocode earth, it will select global sqlites return COMBINED_REGEX.test(e.name_compressed) || wofDataHost !== DATA_GEOCODE_EARTH_URL; } - return countries.some(c => e.name_compressed.indexOf(`-${c}-latest`) >= 0); - }; - }; - - const importVenues = () => { - return config.imports.whosonfirst.importVenues && process.argv[2] !== '--admin-only'; - }; - - const venueFilter = (venuesOnly) => { - return (e) => { - if (venuesOnly) { - return e.name_compressed.indexOf('venue') >= 0; - } - return e.name_compressed.indexOf('venue') < 0 || importVenues(); + // This will download sqlites with the selected country code + return COUNTRY_REGEX.test(e.name_compressed) && countries.some(c => e.name_compressed.indexOf(`-${c}-latest`) >= 0); }; }; - const generateSQLites = (url, venuesOnly) => { + const generateSQLites = () => { const files = {}; - const content = JSON.parse(downloadFileSync(`${url}/sqlite/inventory.json`)) + JSON.parse(downloadFileSync(`${wofDataHost}/sqlite/inventory.json`)) // Only latest compressed files .filter(e => e.name_compressed.indexOf('latest') >= 0) // Only wanted countries @@ -60,10 +57,9 @@ function download(callback) { // Postalcodes only when importPostalcodes is ture and without --admin-only arg .filter(e => e.name_compressed.indexOf('postalcode') < 0 || (config.imports.whosonfirst.importPostalcodes && process.argv[2] !== '--admin-only')) - // Venues only when importVenues is true and without --admin-only arg - .filter(venueFilter(venuesOnly)) - // We don't need constituency and intersection ? - .filter(e => e.name_compressed.indexOf('constituency') < 0 && e.name_compressed.indexOf('intersection') < 0) + // We don't need constituency and intersection and venue + .filter(e => e.name_compressed.indexOf('constituency') < 0 && + e.name_compressed.indexOf('intersection') < 0 && e.name_compressed.indexOf('venue') < 0) // Remove duplicates based on name, we can have differents name_compressed // (for example whosonfirst-data-latest.db.bz2 and whosonfirst-data-latest.db.tar.bz2) // but with the same name... We will take the newer version. @@ -73,13 +69,6 @@ function download(callback) { } else if (!files[e.name]) { files[e.name] = e; } - // Remove old combined database when per country exists and are newer - if (CONTRY_REGEX.test(files[e.name].name) && - files['whosonfirst-data-latest.db'] && - new Date(files[e.name].last_modified) > new Date(files['whosonfirst-data-latest.db'].last_modified)) { - delete files['whosonfirst-data-latest.db']; - } - e.downloadUrl = `${url}/sqlite/${e.name_compressed}`; }); return Object.values(files); }; @@ -104,15 +93,10 @@ function download(callback) { throw new Error('What is this extension ?!?'); } - return `curl -s ${sqlite.downloadUrl} | ${extract} > ${path.join(directory, 'sqlite', sqlite.name)}`; + return `curl -s ${wofDataHost}/sqlite/${sqlite.name_compressed} | ${extract} > ${path.join(directory, 'sqlite', sqlite.name)}`; }; - // All SQLites to download, if Venues are activated, we add some download from WOF. - const generatedSQLites = generateSQLites(wofDataHost).concat( - wofDataHost === DATA_GEOCODE_EARTH_URL && importVenues() ? 
generateSQLites(DATA_WOF_URL, true) : [] - ); - - const downloadFunctions = generatedSQLites.map(function (sqlite) { + const downloadFunctions = generateSQLites().map(function (sqlite) { return function downloadABundle(callback) { const cmd = generateCommand(sqlite, config.imports.whosonfirst.datapath); console.log('Downloading ' + sqlite.name_compressed); @@ -130,4 +114,4 @@ function download(callback) { async.parallelLimit(downloadFunctions, simultaneousDownloads, callback); } -module.exports.download = download; +download(on_done); diff --git a/utils/sqlite_clean.js b/utils/sqlite_clean.js index 97943c63..71a63308 100644 --- a/utils/sqlite_clean.js +++ b/utils/sqlite_clean.js @@ -6,11 +6,11 @@ const config = require('pelias-config').generate(require('../schema')).imports.w if (config.sqlite) { const filters = ` id = 1 OR name = '' OR is_deprecated != 0 OR is_superseded != 0 - OR (spr.latitude != 0 AND spr.longitude != 0)` + + OR (spr.latitude != 0 AND spr.longitude != 0) + OR placetype = 'venue' + OR placetype = 'intersection'` + (config.importPostalcodes ? '' : ` OR placetype = 'postalcode' `) + - (config.importVenues ? '' : ` OR placetype = 'venue' `) + - (config.importConstituencies ? '' : ` OR placetype = 'constituency' `) + - (config.importIntersections ? '' : ` OR placetype = 'intersection' `); + (config.importConstituencies ? '' : ` OR placetype = 'constituency' `); generateBundleList((e, dbList) => { if (e) { console.error(e); diff --git a/utils/sqlite_common.js b/utils/sqlite_common.js deleted file mode 100644 index 52bae37a..00000000 --- a/utils/sqlite_common.js +++ /dev/null @@ -1,40 +0,0 @@ - -const fs = require('fs'); -const path = require('path'); - -// handler for all metatdata streams -module.exports.MetaDataFiles = function MetaDataFiles( metaDir ){ - let streams = {}; - this.stats = {}; - this.write = function( row ){ - let keys = Object.keys(row); - - // first time writing to this meta file - if( !streams.hasOwnProperty( row.placetype ) ){ - - // create write stream - streams[row.placetype] = fs.createWriteStream( - path.join( metaDir, `whosonfirst-data-${row.placetype}-latest.csv` ) - ); - - // init stats - this.stats[row.placetype] = 0; - - // write csv header - streams[row.placetype].write( keys.join(',') + '\n' ); - } - - // write csv row - streams[row.placetype].write( keys.map(key => { - // quote fields containing comma or newline, escape internal quotes - // https://gist.github.com/getify/3667624 - if ( /[",\n]/.test( row[key] ) ) { - return '"' + row[key].replace(/""([\s\S])|(")/g, '"$1$2') + '"'; - } - return row[key]; - }).join(',') + '\n' ); - - // increment stats - this.stats[row.placetype]++; - }; -}; diff --git a/utils/sqlite_download.js b/utils/sqlite_download.js deleted file mode 100644 index fb7f52cc..00000000 --- a/utils/sqlite_download.js +++ /dev/null @@ -1,60 +0,0 @@ - -const os = require('os'); -const util = require('util'); -const path = require('path'); -const fs = require('fs-extra'); -const child_process = require('child_process'); -const async = require('async'); -const common = require('./sqlite_common'); - -function download(options, callback){ - - // load configuration variables - const config = require('pelias-config').generate(require('../schema')).imports.whosonfirst; - const sqliteDir = path.join(config.datapath, 'sqlite'); - fs.ensureDirSync(sqliteDir); - - // generate a download function per database listed in config - const downloadFunctions = options.databases.map(filename => { - return (done) => { - - // build shell command - 
const options = { cwd: __dirname }; - const cmd = './sqlite_download.sh'; - const args = [ filename, path.join( sqliteDir, filename ) ]; - const child = child_process.spawn(cmd, args, options); - - // handle stdio - function stdio( ioname, buffer ){ - child[ioname].on('data', data => { - buffer += data; - let line = data.toString().trim(); - if( line.length ){ console.log( line ); } - }); - } - - var stdout, stderr; - stdio( 'stdout', stdout ); - stdio( 'stderr', stderr ); - - // handle exit code - child.on('exit', code => { - if( '0' !== code.toString() ){ - console.error('error downloading: ' + filename); - } - done(); - }); - }; - }); - - // download one database for every other CPU (tar and bzip2 can both max out one core) - // (but not more than 4, to keep things from getting too intense) - // lower this number to make the downloader more CPU friendly - // raise this number to (possibly) make it faster - const simultaneousDownloads = Math.max(4, Math.min(1, os.cpus().length / 2)); - - // download all files - async.parallelLimit(downloadFunctions, simultaneousDownloads, callback); -} - -module.exports.download = download; diff --git a/utils/sqlite_download.sh b/utils/sqlite_download.sh deleted file mode 100755 index 0177456b..00000000 --- a/utils/sqlite_download.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash -set -euo pipefail -IFS=$'\n\t' - -# whosonfirst sqlite database downloader -# this script handles the download & extract of whosonfirst bundles. -# an additonal '.timestamp' file is saved next to the extracted database -# in order to avoid re-downloading the same file on subsequent executions. - -# input params: -# $1) name of database without path or bz2 suffix -# eg. 'whosonfirst-data-postalcode-ad-latest.db' -# $2) absolute path of database without bz2 suffix -# eg. '/tmp/whosonfirst-data-postalcode-ad-latest.db' - -# you can find a list of available bundles at: -# https://dist.whosonfirst.org/sqlite/ - -DB_FILENAME="$1" -LOCAL_DB_PATH="$2" -LOCAL_BZ2_PATH="$2.bz2" -LOCAL_TS_PATH="${LOCAL_DB_PATH}.timestamp" -REMOTE='https://dist.whosonfirst.org/sqlite' -REMOTE_PATH="${REMOTE}/${DB_FILENAME}.bz2" - -info() { echo -e "\e[33m[$1]\t\e[0m $2" >&2; } -err() { echo -e "\e[31m[$1]\t\e[0m \e[91m$2\e[0m" >&2; } - -# Check if we have lbzip2 (https://lbzip2.org/) installed -decompress_utility() { - if hash lbunzip2 2>/dev/null; then - lbunzip2 -d -f "${LOCAL_BZ2_PATH}" > "${LOCAL_DB_PATH}" - else - bunzip2 -f "${LOCAL_BZ2_PATH}" > "${LOCAL_DB_PATH}" - fi -} -extract_file() { - info 'whosonfirst-sqlite-decompress' "${LOCAL_BZ2_PATH}" - decompress_utility -} -generate_timestamp() { - printf "@" > "${LOCAL_TS_PATH}" # date command requires @ prefix - stat -c %Y "${LOCAL_DB_PATH}" >> "${LOCAL_TS_PATH}" -} -download_handler() { - HTTP_STATUS="${1}" - if [[ "${HTTP_STATUS}" == "200" ]]; then - extract_file && generate_timestamp - elif [[ "${HTTP_STATUS}" == "304" ]]; then - info 'not modified' "${DB_FILENAME}" - else - rm -f "${LOCAL_BZ2_PATH}" - err "status ${HTTP_STATUS}" "${REMOTE_PATH}" - fi -} -download_sqlite_db() { - info 'whosonfirst-sqlite-download' "${REMOTE_PATH}" - if [[ ! 
-f "${LOCAL_TS_PATH}" ]]; then - # first download - download_handler $(curl "${REMOTE_PATH}" \ - -o "${LOCAL_BZ2_PATH}" -s -L -w %{http_code}) - else - # subsequent download - LAST_MODIFIED=$(date --rfc-2822 -f "${LOCAL_TS_PATH}") - download_handler $(curl -s "${REMOTE_PATH}" \ - -z "${LAST_MODIFIED}" \ - -o "${LOCAL_BZ2_PATH}" -s -L -w %{http_code}) - fi -} - -download_sqlite_db; diff --git a/utils/sqlite_extract_data.js b/utils/sqlite_extract_data.js deleted file mode 100644 index 8d6b711f..00000000 --- a/utils/sqlite_extract_data.js +++ /dev/null @@ -1,212 +0,0 @@ - -const fs = require('fs-extra'); -const path = require('path'); -const util = require('util'); -const Sqlite3 = require('better-sqlite3'); -const common = require('./sqlite_common'); -const wofIdToPath = require('../src/wofIdToPath'); - -// sql statements -const sql = { - data: `SELECT spr.id, spr.placetype, geojson.body FROM geojson - JOIN spr ON geojson.id = spr.id - WHERE spr.id @placefilter;`, - meta: `SELECT - json_extract(body, '$.bbox[0]') || ',' || - json_extract(body, '$.bbox[1]') || ',' || - json_extract(body, '$.bbox[2]') || ',' || - json_extract(body, '$.bbox[3]') AS bbox, - json_extract(body, '$.properties.edtf:cessation') AS cessation, - json_extract(body, '$.properties.wof:hierarchy[0].country_id') AS country_id, - json_extract(body, '$.properties.edtf:deprecated') AS deprecated, - '' AS file_hash, - '' AS fullname, - json_extract(body, '$.properties.geom:hash') AS geom_hash, - json_extract(body, '$.properties.geom:latitude') AS geom_latitude, - json_extract(body, '$.properties.geom:longitude') AS geom_longitude, - json_extract(body, '$.properties.wof:id') AS id, - json_extract(body, '$.properties.edtf:inception') AS inception, - json_extract(body, '$.properties.iso:country') AS iso, - json_extract(body, '$.properties.iso:country') AS iso_country, - json_extract(body, '$.properties.wof:lastmodified') AS lastmodified, - json_extract(body, '$.properties.lbl:latitude') AS lbl_latitude, - json_extract(body, '$.properties.lbl:longitude') AS lbl_longitude, - json_extract(body, '$.properties.wof:hierarchy[0].locality_id') AS locality_id, - json_extract(body, '$.properties.wof:name') AS name, - json_extract(body, '$.properties.wof:parent_id') AS parent_id, - REPLACE( - REPLACE( - SUBSTR(json_extract(body, '$.properties.wof:id'),1,3) ||'/'|| - SUBSTR(json_extract(body, '$.properties.wof:id'),4,3) ||'/'|| - SUBSTR(json_extract(body, '$.properties.wof:id'),7,3) ||'/'|| - SUBSTR(json_extract(body, '$.properties.wof:id'),10) ||'/'|| - json_extract(body, '$.properties.wof:id') || '.geojson', - '//', '/'), - '//','/') AS path, - json_extract(body, '$.properties.wof:placetype') AS placetype, - json_extract(body, '$.properties.wof:hierarchy[0].region_id') AS region_id, - json_extract(body, '$.properties.src:geom') AS source, - json_extract(body, '$.properties.wof:superseded_by[0]') AS superseded_by, - json_extract(body, '$.properties.wof:supersedes[0]') AS supersedes, - json_extract(body, '$.properties.wof:country') AS wof_country - FROM geojson - WHERE id @placefilter;`, - subdiv: `SELECT DISTINCT LOWER( IFNULL( - json_extract(body, '$.properties."wof:subdivision"'), - json_extract(body, '$.properties."iso:country"') - )) AS subdivision - FROM geojson - WHERE id @placefilter - AND subdivision != '';`, - placefilter: `IN ( - SELECT DISTINCT id - FROM ancestors - WHERE id IN (@wofids) - UNION - SELECT DISTINCT id - FROM ancestors - WHERE ancestor_id IN (@wofids) - UNION - SELECT DISTINCT ancestor_id - FROM ancestors - WHERE id 
IN (@wofids) - )` -}; - -function extract(options, callback){ - - // load configuration variables - const config = require('pelias-config').generate(require('../schema')).imports.whosonfirst; - - // location of data and meta dirs - const metaDir = path.join(config.datapath, 'meta'); - const dataDir = path.join(config.datapath, 'data'); - const sqliteDir = path.join(config.datapath, 'sqlite'); - - // unlink (truncate meta and data dirs) - if( options && true === options.unlink ){ - fs.removeSync(metaDir); - fs.removeSync(dataDir); - } - - // ensure required directory structure exists - fs.ensureDirSync(metaDir); - fs.ensureDirSync(dataDir); - fs.ensureDirSync(sqliteDir); - - // open one write stream per metadata file - // note: important for to ensure meta files are written correctly - // with only one header per import run - const metafiles = new common.MetaDataFiles( metaDir ); - - // extract from a single db file - function extractDB( dbpath ){ - let targetWofIds = Array.isArray(config.importPlace) ? config.importPlace: [config.importPlace]; - - // connect to sql db - let db = new Sqlite3( dbpath, { readonly: true } ); - - // convert ids to integers and remove any which fail to convert - let cleanIds = targetWofIds.map(id => parseInt(id, 10)).filter(id => !isNaN(id)); - - // placefilter is used to select only records targeted by the 'importPlace' config option - // note: if no 'importPlace' ids are provided then we process all ids which aren't 0 - let placefilter = (cleanIds.length > 0) ? sql.placefilter : '!= 0'; - - // note: we need to use replace instead of bound params in order to be able - // to query an array of values using IN. - let dataQuery = sql.data.replace(/@placefilter/g, placefilter).replace(/@wofids/g, cleanIds.join(',')); - let metaQuery = sql.meta.replace(/@placefilter/g, placefilter).replace(/@wofids/g, cleanIds.join(',')); - - // extract all data to disk - for( let row of db.prepare(dataQuery).iterate() ){ - if( 'postalcode' === row.placetype && true !== config.importPostalcodes ){ return; } - if( 'venue' === row.placetype && true !== config.importVenues ){ return; } - if( 'constituency' === row.placetype && true !== config.importConstituencies ){ return; } - if( 'intersection' === row.placetype && true !== config.importIntersections ){ return; } - writeJson( row ); - } - - // write meta data to disk - for( let row of db.prepare(metaQuery).iterate() ){ - if( 'postalcode' === row.placetype && true !== config.importPostalcodes ){ return; } - if( 'venue' === row.placetype && true !== config.importVenues ){ return; } - if( 'constituency' === row.placetype && true !== config.importConstituencies ){ return; } - if( 'intersection' === row.placetype && true !== config.importIntersections ){ return; } - if( !row.hasOwnProperty('path') ){ - // ensure path property is present (required by some importers) - row.path = wofIdToPath(row.id).concat(row.id+'.geojson').join(path.sep); - } - metafiles.write( row ); - } - - // close connection - db.close(); - } - - // extract from all database files - options.databases.forEach( filename => { - - let dbpath = path.join( sqliteDir, filename ); - if( !fs.existsSync( dbpath ) ){ - console.error('not found:', dbpath); - return; - } - - extractDB( dbpath ); - }); - - // print stats - if( Object.keys( metafiles.stats ).length ){ - Object.keys( metafiles.stats ).forEach( key => { - console.error( util.format('extracted %d %s(s)', - metafiles.stats[key], key - )); - }); - } else { - console.error('failed to extract any records!'); - } - - 
callback(); - - // ---------------------------------------------------------------------------- - - // write json to disk - function writeJson( row ){ - let targetDir = path.join(dataDir, wofIdToPath(row.id).join(path.sep)); - try { - fs.ensureDirSync(targetDir); - fs.writeFileSync( path.join(targetDir, `${row.id}.geojson`), row.body, 'utf8' ); - } catch( error ){ - if( error ){ console.error(`error making directory ${targetDir}`); } - } - } -} - -// return all distinct subdivisions of the data -function findSubdivisions( filename ){ - - // load configuration variables - const config = require('pelias-config').generate(require('../schema')).imports.whosonfirst; - const sqliteDir = path.join(config.datapath, 'sqlite'); - let targetWofIds = Array.isArray(config.importPlace) ? config.importPlace: [config.importPlace]; - - // connect to sql db - let db = new Sqlite3( path.join( sqliteDir, filename ), { readonly: true } ); - - // convert ids to integers and remove any which fail to convert - let cleanIds = targetWofIds.map(id => parseInt(id, 10)).filter(id => !isNaN(id)); - - // placefilter is used to select only records targeted by the 'importPlace' config option - // note: if no 'importPlace' ids are provided then we process all ids which aren't 0 - let placefilter = (cleanIds.length > 0) ? sql.placefilter : '!= 0'; - - // query db - // note: we need to use replace instead of using bound params in order to - // be able to query an array of values using IN. - let query = sql.subdiv.replace(/@placefilter/g, placefilter).replace(/@wofids/g, cleanIds.join(',')); - return db.prepare(query).all().map( row => row.subdivision.toLowerCase()); -} - -module.exports.extract = extract; -module.exports.findSubdivisions = findSubdivisions; \ No newline at end of file
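
For reference, the selection logic that the patched utils/download_sqlite_all.js applies to the SQLite inventory.json can be sketched as a small standalone script. This is only an illustration: the host, datapath, flags and the two regular expressions below are placeholder values (the importer defines its own constants and reads its settings from pelias-config), the Geocode Earth host check is simplified, and the download command assumes bzip2-compressed databases.

    const { execSync } = require('child_process');
    const path = require('path');
    const fs = require('fs');

    // placeholder values -- the importer reads these from pelias-config
    const wofDataHost = 'https://data.geocode.earth/wof/dist';
    const datapath = '/tmp/whosonfirst';
    const importPostalcodes = true;
    const countries = [];                                             // e.g. ['fr', 'de']; empty selects the combined databases
    const COMBINED_REGEX = /whosonfirst-data-[a-z]+-latest/;          // placeholder pattern
    const COUNTRY_REGEX = /whosonfirst-data-[a-z]+-[a-z]{2}-latest/;  // placeholder pattern

    function selectSqlites(inventory) {
      const files = {};
      inventory
        // only the latest compressed databases
        .filter(e => e.name_compressed.indexOf('latest') >= 0)
        // combined databases when no country is configured, per-country databases otherwise
        // (the importer also bypasses the combined check for non Geocode Earth hosts)
        .filter(e => countries.length === 0
          ? COMBINED_REGEX.test(e.name_compressed)
          : COUNTRY_REGEX.test(e.name_compressed) &&
            countries.some(c => e.name_compressed.indexOf(`-${c}-latest`) >= 0))
        // postal codes only when enabled
        .filter(e => e.name_compressed.indexOf('postalcode') < 0 || importPostalcodes)
        // constituency, intersection and venue databases are never imported
        .filter(e => ['constituency', 'intersection', 'venue']
          .every(t => e.name_compressed.indexOf(t) < 0))
        // de-duplicate on `name`, keeping the newer of two compressed variants
        .forEach(e => {
          const existing = files[e.name];
          if (!existing || new Date(existing.last_modified) < new Date(e.last_modified)) {
            files[e.name] = e;
          }
        });
      return Object.values(files);
    }

    function downloadOne(sqlite) {
      const target = path.join(datapath, 'sqlite', sqlite.name);
      fs.mkdirSync(path.dirname(target), { recursive: true });
      // assumes a .bz2 archive; the importer picks the decompressor from the file extension
      execSync(`curl -s ${wofDataHost}/sqlite/${sqlite.name_compressed} | bunzip2 -c > ${target}`);
    }

Feeding the parsed inventory.json into selectSqlites() and passing each entry to downloadOne() mirrors the curl-and-decompress pipeline generated above, minus the parallelism the importer handles with async.parallelLimit.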
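Similarly, the cleanup predicate that utils/sqlite_clean.js now builds can be previewed in isolation. The flag values below are placeholders: venue and intersection records are always matched, while postal codes and constituencies are matched only when their import flags are off. How the predicate is applied to each downloaded database is defined elsewhere in the script.

    // placeholder flags -- the importer reads these from pelias-config
    const importPostalcodes = false;
    const importConstituencies = false;

    const filters = ` id = 1 OR name = '' OR is_deprecated != 0 OR is_superseded != 0
      OR (spr.latitude != 0 AND spr.longitude != 0)
      OR placetype = 'venue'
      OR placetype = 'intersection'` +
      (importPostalcodes ? '' : ` OR placetype = 'postalcode' `) +
      (importConstituencies ? '' : ` OR placetype = 'constituency' `);

    // with both flags false, the composed predicate also matches
    // postalcode and constituency records
    console.log(filters);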