Skip to content

Commit

Permalink
Merge pull request #448 from rinigus/docker-import
Browse files Browse the repository at this point in the history
Switch to import using Docker containers
  • Loading branch information
rinigus authored Oct 6, 2024
2 parents 14f92e7 + e6a5792 commit 39bcb96
Show file tree
Hide file tree
Showing 33 changed files with 668 additions and 664 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@
/.clickable/
/clickable.yaml
/click_release/

# import
/scripts/import/data
/scripts/import/.env
41 changes: 41 additions & 0 deletions scripts/import/.env.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Area to import: "planet" for the whole world, or a region name;
# PBF is the corresponding OpenStreetMap extract file name.
AREA=monaco
PBF=${AREA}.osm.pbf

# UID/GID the import containers run as.
# NOTE(review): per the README, prepare_docker.sh rewrites these to the
# current user's IDs — the values below are only placeholders.
USER_ID=1000
GROUP_ID=1000

# storage: where the downloaded planet data and import output are kept
STORE_PLANET=./data/planet
STORE_OUTPUT=./data/output

# final import: destination of the finished datasets
STORE_IMPORTED=${STORE_OUTPUT}/Imported

# intermediate imports: per-backend working output
STORE_MBTILES=${STORE_OUTPUT}/mbtiles-planet
STORE_NOMINATIM=${STORE_OUTPUT}/nominatim
STORE_VALHALLA=${STORE_OUTPUT}/valhalla

STORE_MISC=${STORE_OUTPUT}/misc

# planetiler: memory options
JAVA_TOOL_OPTIONS=-Xmx32g # use -Xmx130g for planet
PLANETILER_STORAGE_TMP=mmap # comment out to use ram which is faster but needs more ram

# RAM: per-container memory limits
# NOTE(review): "DEFALT" looks like a typo for "DEFAULT"; the name is
# presumably referenced by the compose file, so rename it in both places
# or not at all.
RAM_DEFALT_LIMIT=10g
RAM_NOMINATIM_LIMIT=32g
RAM_PLANETILER_LIMIT=40g
RAM_VALHALLA_LIMIT=32g

# number of parallel imports for geocoder-nlp
GEOCODER_JOBS=8

# password for the database - set it to some random value
NOMINATIM_PASSWORD=randomPassword

### do not change below
# versions
VALHALLA_VERSION=3.4.0
98 changes: 98 additions & 0 deletions scripts/import/Dockerfile.postprocess
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# syntax=docker/dockerfile:1

############################################################
# Build stage: compile libpostal and geocoder-nlp from source
############################################################
FROM debian:bookworm AS build

# Toolchain and development headers in a single layer so the apt index
# cannot go stale between `update` and `install`; lists are removed in
# the same layer to keep it small (this stage is discarded anyway, but
# it keeps the pattern consistent).
RUN apt-get update && apt-get install -y --no-install-recommends \
        autoconf \
        ca-certificates \
        cmake \
        g++ \
        gcc \
        git \
        libboost-program-options-dev \
        libkyotocabinet-dev \
        libmarisa-dev \
        libpqxx-dev \
        libsqlite3-dev \
        libtool \
        make \
        nlohmann-json3-dev \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# libpostal — pinned to a known-good commit for reproducibility.
RUN git clone https://github.com/openvenues/libpostal.git
WORKDIR /app/libpostal
RUN git checkout 8f2066b1d30f4290adf59cacc429980f139b8545

RUN ./bootstrap.sh
# --datadir points into the mounted planet volume; the actual data
# download is deferred to runtime (libpostal_data is shipped below).
RUN ./configure \
    --prefix=/usr \
    --datadir=/planet_pbf/libpostal \
    --disable-data-download
# BUG FIX: the original used ${nproc}, an undefined *variable* that
# expands to nothing, turning this into `make -j` (unbounded
# parallelism). $(nproc) is the intended command substitution.
RUN make -j"$(nproc)" && make install

WORKDIR /app

# geocoder-nlp — pinned commit, submodules included.
RUN git clone https://github.com/rinigus/geocoder-nlp.git
WORKDIR /app/geocoder-nlp
RUN git checkout 2c9fc2bd2f51ca2c36b1266bf5b58db593b5dd4d
RUN git submodule init && \
    git submodule update

RUN cmake -B build -DCMAKE_INSTALL_PREFIX:PATH=/usr
RUN make -C build -j"$(nproc)" && \
    make -C build install

WORKDIR /app

############################################################
# Runtime stage: minimal image carrying only the built artifacts
############################################################
FROM debian:bookworm

# All runtime tools and shared libraries in one layer, apt lists removed
# so they do not bloat the final image (the original left them behind
# and split update/install across layers, risking a stale index).
RUN apt-get update && apt-get install -y --no-install-recommends \
        bzip2 \
        curl \
        gzip \
        libboost-program-options1.74.0 \
        libkyotocabinet16v5 \
        libmarisa0 \
        libpqxx-6.4 \
        make \
        python3 \
        python3-venv \
    && rm -rf /var/lib/apt/lists/*

# Artifacts produced by the build stage.
COPY --from=build /usr/bin/geocoder-importer /usr/bin/
COPY --from=build /app/geocoder-nlp/importer/data /app/geocoder-nlp
COPY --from=build /usr/bin/libpostal_data /usr/bin/
COPY --from=build /usr/lib/libpostal.so* /usr/lib/

WORKDIR /app

# Create a virtualenv and "activate" it by putting it first on PATH.
RUN python3 -m venv venv
ENV PATH=/app/venv/bin:$PATH

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir \
    numpy \
    mercantile \
    psutil \
    shapely \
    tabulate

# Scripts used during post-processing.
# BUG FIX: when COPY has multiple sources the destination must be a
# directory and end with "/" — the original bare "/app" fails the build.
COPY ./run_postprocess.sh ./pack.sh ./check_import.py ./prepare_distribution.py /app/
COPY ./build_geocoder.sh /app/build_geocoder.sh

COPY ./mapbox_planetiler_split.py ./hierarchy.py /app/

RUN mkdir mapbox_scripts
COPY ./mapbox/make_packs.py /app/mapbox_scripts/

RUN mkdir valhalla_scripts
COPY ./valhalla/make_packs.py /app/valhalla_scripts/

COPY \
    ./prepare_countries.py \
    ./mapbox_country_pack.py \
    ./valhalla_country_pack.py \
    /app/

# Exec form so the script runs as PID 1 and receives SIGTERM on
# `docker stop` (shell form wraps it in `/bin/sh -c`).
CMD ["/app/run_postprocess.sh"]
2 changes: 2 additions & 0 deletions scripts/import/Dockerfile.wget
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Tiny helper image providing wget/openssl for downloading files.
# Pin the base tag instead of relying on the implicit :latest.
FROM alpine:3
# `apk add --no-cache` fetches a fresh index without storing it,
# replacing the legacy `--update … && rm -rf /var/cache/apk/*` dance.
RUN apk add --no-cache openssl wget
113 changes: 57 additions & 56 deletions scripts/import/README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@

# Importing maps and their distribution

These scripts are used to download PBF files, split them into regions
and import into the formats recognized by OSM Scout Server.

Import is implemented using Docker containers. Containers are used to download
PBF files, split them into regions and import into the formats recognized
by OSM Scout Server.

## Hierarchy

Expand All @@ -30,73 +30,77 @@ POLY file. It also allows you to simplify the polygon; use it to reduce
processing on import.


## Scripts
## Import

Information below is for developers and is not required to add or
change imported regions.
At the moment of writing, all datasets used by OSM Scout Server are imported as described
in this README. Exceptions are libpostal (old training datasets are used) and Mapnik (not
imported actively anymore, but old data is distributed).

### On every import
### Prepare containers

* Increase URL id for geocoder-nlp, and others in
`prepare_distribution.py`

* Check that bucket_name is correct
Configure the containers by creating a file `.env`, using `.env.template` as a
template. Among other settings, it specifies the directories where the data is
stored and whether to import the full planet or a single region.

* If coastlines have to be updated, see corresponding instructions
below and run them before the next instruction.
When directories are determined in `.env`, run a script that will create them
for you and will set user IDs in `.env` to the current user:
```
./prepare_docker.sh
```

* Set environment variable `GEOCODER_IMPORTER_POSTGRES` pointing to
Postgres database with Nominatim tables. For example,
"postgresql://nominatim:pwd@host:port/nominatim"
Next, build the containers:
```
docker compose build
```

* Remove `distribution/geocoder-nlp/.directory` to ensure that
Geocoder NLP files will be imported. This is needed as there is no
time-dependent file reflecting data time stamp:
`rm distribution/geocoder-nlp/.directory`
### Import

* Run `./import_master.sh`
Import the maps by running:
```
docker compose up
```

This will start containers defined in the
[docker compose file](docker-compose.yaml) and trigger the import into the
different formats used by the server. To reduce the stress on the server, the
imports are performed one after another. Expect the process to take a
few days.

### On backend upgrade
If the import finishes successfully, the last container will shut down the
Nominatim database used during one of the phases of the import.

In case if there is a new backend version:
Note that the required storage during the import is approximately:

* run ./prepare_backends.sh
- 1.5 TB for Nominatim
- 170 GB for MBTiles
- 80 GB for Planet.pbf
- 35 GB for Valhalla
- For distribution, about 200 GB

* update versions in the end of build_mapnik.sh script
### Distribution

* update version for Valhalla in valhalla_country_pack.py
To distribute maps:

* update version for vector tiles in mapbox/make_packs.py
* Increase URL id for geocoder-nlp, and others in `prepare_distribution.py`

* check for compatibility in mapmanagerfeature.cpp of the main tree
* Check that `countries_json` in `prepare_distribution.py` points to correct location

* Check that bucket_name is correct

* Run preparation scripts:
```
./prepare_distribution.py
./check_import.py
```

### On coastline update
* Test the maps by serving them using Python:
```
(cd public_http && python -m http.server 8231)
```
and downloading them with OSM Scout Server (you will have to adjust the server URL used by
the application in the file `url.json` of your maps directory and set `development_disable_url_update=1` in the application configuration file).

* Download data covering coastlines from http://openstreetmapdata.com/data/land-polygons :
http://data.openstreetmapdata.com/simplified-land-polygons-complete-3857.zip
and http://data.openstreetmapdata.com/land-polygons-split-3857.zip

* Unzip the data:
```
unzip simplified-land-polygons-complete-3857.zip
unzip land-polygons-split-3857.zip
```

* Index the polygons by using `shapeindex` to create `.index` files:
```
shapeindex simplified-land-polygons-complete-3857/simplified_land_polygons.shp
shapeindex land-polygons-split-3857/land_polygons.shp
```
* Place the directories `simplified-land-polygons-complete-3857` and
`land-polygons-split-3857` under `mapnik/global` of the distribution.
* Package the `mapnik/global` distribution by running `./pack.sh
distribution/mapnik/global 1` . Here, `1` stands for database
version.
* If all is good, upload it using `./uploader.sh`

### Other scripts

Expand All @@ -109,7 +113,7 @@ In case if there is a new backend version:

* poly2json-hierarchy.sh - generates poly.json from poly in hierarchy

* prepare_countries.py, prepare_distribution.py, prepare_splitter.py - used by importer_master.sh
* prepare_countries.py - used by one of the containers

* prepare_postal.sh - rsyncs postal global and postal country datasets
into distribution. Split into countries for geocoder-nlp is done in
Expand All @@ -118,9 +122,6 @@ In case if there is a new backend version:
* prepare_postal_tags.py - used for distribution of initial
postal_country files in hierarchy

* prepare_osmium.sh - installs new version of osmium and libosmium
used to split planet into regions
* update_distribution.py - used on the distribution server to update
maps distribution from S3 cloud / HTTP server. See note on the top
of the script on how to specify mirror path and URL
Expand Down
11 changes: 3 additions & 8 deletions scripts/import/build_geocoder.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,8 @@

set -e

BNDSRC=`pwd`/backends-source
BNDINSTALL=`pwd`/backends-install

PRIORITY=$BNDINSTALL/share/geocoder-nlp/data/priority.list
IMPORTER=$BNDINSTALL/bin/geocoder-importer

export LD_LIBRARY_PATH=$BNDINSTALL/lib
PRIORITY=/app/geocoder-nlp/priority.list
IMPORTER=geocoder-importer

if [ $# -lt 5 ] ; then
echo "Usage: $0 poly.json base_dir continent_country country_2_letter_code external_postcodes"
Expand All @@ -29,4 +24,4 @@ mkdir -p "$SQLDIR"

"$IMPORTER" --poly "$POLYJSON" --priority "$PRIORITY" --postal-country "$COUNTRY_CODE" --log-errors-to-file="$ERRORLOG" "$SQLDIR"

./pack.sh "$SQLDIR" `backends-install/bin/geocoder-importer --version`
./pack.sh "$SQLDIR" `$IMPORTER --version-data`
23 changes: 0 additions & 23 deletions scripts/import/build_mapnik.sh

This file was deleted.

Loading

0 comments on commit 39bcb96

Please sign in to comment.