Skip to content

Commit

Permalink
Fix Qleverfile for OSM Planet (#135)
Browse files Browse the repository at this point in the history
Add `ULIMIT = 10000`. Update memory settings to more modest ones. Replace the error-prone `curl` by the more robust `wget`. Update meta information in comments at the top of the `Qleverfile`. Fixes #129
  • Loading branch information
hannahbast authored Feb 13, 2025
1 parent ba2823d commit d65235b
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 12 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "qlever"
description = "Script for using the QLever SPARQL engine."
version = "0.5.17"
version = "0.5.18"
authors = [
{ name = "Hannah Bast", email = "[email protected]" }
]
Expand Down
22 changes: 11 additions & 11 deletions src/qlever/Qleverfiles/Qleverfile.osm-planet
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
# Qleverfile for OSM Planet, use with the qlever script (pip install qlever)
# Qleverfile for OSM Planet, use with the QLever CLI (`pip install qlever`)
#
# qlever get-data # downloads .ttl.bz2 file of ~ 400 GB with ~ 100 B triples
# qlever index # takes ~12 hours and ~20 GB RAM (on an AMD Ryzen 9 5900X)
# qlever start # takes a few seconds
# qlever get-data # downloads ~400 GB (ttl.bz2), ~100 B triples
# qlever index # ~20 hours, ~60 GB RAM, ~1.5 TB index size on disk
# qlever start # a few seconds, adjust MEMORY_FOR_QUERIES as needed
#
# For the OSM data of a single country, do `qlever setup-config osm-country`
# and edit the Qleverfile to specify the country.
# Measured on an AMD Ryzen 9 7950X with 128 GB RAM and 2 x 8 TB NVMe (04.01.2025)

[data]
NAME = osm-planet
DATA_URL = https://osm2rdf.cs.uni-freiburg.de/ttl/planet.osm.ttl.bz2
GET_DATA_CMD = curl --location --fail --continue-at - --remote-time --output ${NAME}.ttl.bz2 ${DATA_URL}
GET_DATA_CMD = unbuffer wget -O ${NAME}.ttl.bz2 ${DATA_URL} | tee ${NAME}.download-log.txt
VERSION = $$(date -r ${NAME}.ttl.bz2 +"%d.%m.%Y" || echo "NO_DATE")
DESCRIPTION = OSM Planet, data from ${DATA_URL} version ${VERSION} (complete OSM data, with GeoSPARQL predicates ogc:sfContains and ogc:sfIntersects)

Expand All @@ -21,14 +20,15 @@ PARALLEL_PARSING = true
PARSER_BUFFER_SIZE = 100M
STXXL_MEMORY = 40G
SETTINGS_JSON = { "num-triples-per-batch": 10000000 }
ULIMIT = 10000

[server]
PORT = 7007
ACCESS_TOKEN = ${data:NAME}
MEMORY_FOR_QUERIES = 60G
CACHE_MAX_SIZE = 40G
CACHE_MAX_SIZE_SINGLE_ENTRY = 30G
TIMEOUT = 300s
MEMORY_FOR_QUERIES = 40G
CACHE_MAX_SIZE = 20G
CACHE_MAX_SIZE_SINGLE_ENTRY = 20G
TIMEOUT = 600s

[runtime]
SYSTEM = docker
Expand Down

0 comments on commit d65235b

Please sign in to comment.