Add options --ulimit and --parser-buffer-size for the `qlever index` command (#132)

So far, the `ulimit` was set to `1048576` whenever the total size of the input files was larger than `10 GB`, which is a simplistic and rough heuristic. In particular, it does not work when the RDF data is produced from the input file and the latter is relatively small compared to the former, or when the OS allows increasing the `ulimit`, but not by that much. Now the `ulimit` can also be set explicitly, via the option `--ulimit` of the `qlever index` command or via the variable `ULIMIT` in the `[index]` section of the Qleverfile.
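
For example, to set the limit explicitly (a sketch; `1048576` is just the value the old heuristic used, any limit the OS supports works):

# Via the command line:
qlever index --ulimit 1048576

# Or via the Qleverfile:
[index]
ULIMIT = 1048576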

Further, ad-freiburg/qlever#1698 introduced an option `--parser-buffer-size` to `IndexBuilderMain`; see there for details on its effect. This can now also be set explicitly, via the option `--parser-buffer-size` of the `qlever index` command or via the variable `PARSER_BUFFER_SIZE` in the `[index]` section of the Qleverfile.
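
For example (a sketch; `100M` is the value used in the updated Qleverfiles below, the default being `10M`):

# Via the command line:
qlever index --parser-buffer-size 100M

# Or via the Qleverfile:
[index]
PARSER_BUFFER_SIZE = 100M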
hannahbast authored Feb 12, 2025
1 parent f1797ef commit e5a4990
Showing 6 changed files with 77 additions and 23 deletions.
8 changes: 5 additions & 3 deletions src/qlever/Qleverfiles/Qleverfile.ohm-planet
@@ -17,9 +17,11 @@ VERSION = $$(date -r ${NAME}.pbf +%d.%m.%Y || echo "NO_DATE")
DESCRIPTION = OHM Planet, data from ${GET_DATA_URL} version ${VERSION} (with GeoSPARQL predicates ogc:sfContains and ogc:sfIntersects)

[index]
INPUT_FILES = ${data:NAME}.ttl.bz2
CAT_INPUT_FILES = bzcat -f ${INPUT_FILES}
SETTINGS_JSON = { "prefixes-external": [""], "ascii-prefixes-only": false, "parallel-parsing": true, "num-triples-per-batch": 5000000 }
INPUT_FILES = ${data:NAME}.ttl.bz2
CAT_INPUT_FILES = lbzcat -n 2 ${INPUT_FILES}
PARALLEL_PARSING = true
PARSER_BUFFER_SIZE = 100M
SETTINGS_JSON = { "num-triples-per-batch": 5000000 }

[server]
PORT = 7037
14 changes: 8 additions & 6 deletions src/qlever/Qleverfiles/Qleverfile.osm-planet
@@ -1,6 +1,6 @@
# Qleverfile for OSM Planet, use with the qlever script (pip install qlever)
#
# qlever get-data # takes ~50 mins to download .ttl.bz2 file of ~ 300 GB
# qlever get-data # downloads .ttl.bz2 file of ~ 400 GB with ~ 100 B triples
# qlever index # takes ~12 hours and ~20 GB RAM (on an AMD Ryzen 9 5900X)
# qlever start # takes a few seconds
#
@@ -15,15 +15,17 @@ VERSION = $$(date -r ${NAME}.ttl.bz2 +"%d.%m.%Y" || echo "NO_DATE")
DESCRIPTION = OSM Planet, data from ${DATA_URL} version ${VERSION} (complete OSM data, with GeoSPARQL predicates ogc:sfContains and ogc:sfIntersects)

[index]
INPUT_FILES = ${data:NAME}.ttl.bz2
CAT_INPUT_FILES = lbzcat -f -n 2 ${INPUT_FILES}
STXXL_MEMORY = 20G
SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "ascii-prefixes-only": false, "num-triples-per-batch": 5000000 }
INPUT_FILES = ${data:NAME}.ttl.bz2
CAT_INPUT_FILES = lbzcat -n 2 ${INPUT_FILES}
PARALLEL_PARSING = true
PARSER_BUFFER_SIZE = 100M
STXXL_MEMORY = 40G
SETTINGS_JSON = { "num-triples-per-batch": 10000000 }

[server]
PORT = 7007
ACCESS_TOKEN = ${data:NAME}
MEMORY_FOR_QUERIES = 90G
MEMORY_FOR_QUERIES = 60G
CACHE_MAX_SIZE = 40G
CACHE_MAX_SIZE_SINGLE_ENTRY = 30G
TIMEOUT = 300s
53 changes: 40 additions & 13 deletions src/qlever/commands/index.py
@@ -2,13 +2,17 @@

import glob
import json
import shlex
import re
import shlex

from qlever.command import QleverCommand
from qlever.containerize import Containerize
from qlever.log import log
from qlever.util import get_existing_index_files, get_total_file_size, run_command
from qlever.util import (
get_existing_index_files,
get_total_file_size,
run_command,
)


class IndexCommand(QleverCommand):
@@ -36,9 +40,11 @@ def relevant_qleverfile_arguments(self) -> dict[str : list[str]]:
"settings_json",
"index_binary",
"only_pso_and_pos_permutations",
"ulimit",
"use_patterns",
"text_index",
"stxxl_memory",
"parser_buffer_size",
],
"runtime": ["system", "image", "index_container"],
}
@@ -48,7 +54,7 @@ def additional_arguments(self, subparser) -> None:
"--overwrite-existing",
action="store_true",
default=False,
help="Overwrite an existing index, think twice before using.",
help="Overwrite an existing index, think twice before using this",
)

# Exception for invalid JSON.
@@ -76,7 +82,8 @@ def get_input_options_for_json(self, args) -> str:
# Check that it is an array of length at least one.
if not isinstance(input_specs, list):
raise self.InvalidInputJson(
"`MULTI_INPUT_JSON` must be a JSON array", args.multi_input_json
"`MULTI_INPUT_JSON` must be a JSON array",
args.multi_input_json,
)
if len(input_specs) == 0:
raise self.InvalidInputJson(
@@ -90,13 +97,15 @@
# Check that `input_spec` is a dictionary.
if not isinstance(input_spec, dict):
raise self.InvalidInputJson(
f"Element {i} in `MULTI_INPUT_JSON` must be a JSON " "object",
f"Element {i} in `MULTI_INPUT_JSON` must be a JSON "
"object",
input_spec,
)
# For each `input_spec`, we must have a command.
if "cmd" not in input_spec:
raise self.InvalidInputJson(
f"Element {i} in `MULTI_INPUT_JSON` must contain a " "key `cmd`",
f"Element {i} in `MULTI_INPUT_JSON` must contain a "
"key `cmd`",
input_spec,
)
# If the command contains a `{}` placeholder, we need a `for-each`
@@ -204,20 +213,31 @@ def execute(self, args) -> bool:
index_cmd += " --only-pso-and-pos-permutations --no-patterns"
if not args.use_patterns:
index_cmd += " --no-patterns"
if args.text_index in ["from_text_records", "from_text_records_and_literals"]:
if args.text_index in [
"from_text_records",
"from_text_records_and_literals",
]:
index_cmd += (
f" -w {args.name}.wordsfile.tsv" f" -d {args.name}.docsfile.tsv"
f" -w {args.name}.wordsfile.tsv"
f" -d {args.name}.docsfile.tsv"
)
if args.text_index in ["from_literals", "from_text_records_and_literals"]:
if args.text_index in [
"from_literals",
"from_text_records_and_literals",
]:
index_cmd += " --text-words-from-literals"
if args.stxxl_memory:
index_cmd += f" --stxxl-memory {args.stxxl_memory}"
if args.parser_buffer_size:
index_cmd += f" --parser-buffer-size {args.parser_buffer_size}"
index_cmd += f" | tee {args.name}.index-log.txt"

# If the total file size is larger than 10 GB, set ulimit (such that a
# large number of open files is allowed).
total_file_size = get_total_file_size(shlex.split(args.input_files))
if total_file_size > 1e10:
if args.ulimit is not None:
index_cmd = f"ulimit -Sn {args.ulimit}; {index_cmd}"
elif total_file_size > 1e10:
index_cmd = f"ulimit -Sn 1048576; {index_cmd}"

# Run the command in a container (if so desired).
@@ -234,7 +254,8 @@

# Command for writing the settings JSON to a file.
settings_json_cmd = (
f"echo {shlex.quote(args.settings_json)} " f"> {args.name}.settings.json"
f"echo {shlex.quote(args.settings_json)} "
f"> {args.name}.settings.json"
)

# Show the command line.
@@ -279,9 +300,15 @@ def execute(self, args) -> bool:
return False

# Remove already existing container.
if args.system in Containerize.supported_systems() and args.overwrite_existing:
if (
args.system in Containerize.supported_systems()
and args.overwrite_existing
):
if Containerize.is_running(args.system, args.index_container):
log.info("Another index process is running, trying to stop " "it ...")
log.info(
"Another index process is running, trying to stop "
"it ..."
)
log.info("")
try:
run_command(f"{args.system} rm -f {args.index_container}")
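
To illustrate the new `ulimit` handling in `execute` above (a sketch; `500000` is a hypothetical explicit limit and `IndexBuilderMain ...` stands for the rest of the index command):

# With --ulimit 500000 (or ULIMIT = 500000 in the Qleverfile):
ulimit -Sn 500000; IndexBuilderMain ...

# Without --ulimit, total input file size > 10 GB:
ulimit -Sn 1048576; IndexBuilderMain ...

# Without --ulimit, total input file size <= 10 GB, no prefix:
IndexBuilderMain ...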
15 changes: 15 additions & 0 deletions src/qlever/qleverfile.py
@@ -105,6 +105,14 @@ def arg(*args, **kwargs):
default="{}",
help="The `.settings.json` file for the index",
)
index_args["ulimit"] = arg(
"--ulimit",
type=int,
default=None,
help="Explicitly set the limit for the maximal number of open "
"files (default: 1048576 when the total size of the input files "
"is larger than 10 GB)",
)
index_args["index_binary"] = arg(
"--index-binary",
type=str,
@@ -119,6 +127,13 @@ def arg(*args, **kwargs):
help="The amount of memory to use for the index build "
"(the name of the option has historical reasons)",
)
index_args["parser_buffer_size"] = arg(
"--parser-buffer-size",
type=str,
default=None,
help="The size of the buffer used for parsing (no single RDF "
"statement must be larger than this size; default: 10M)",
)
index_args["only_pso_and_pos_permutations"] = arg(
"--only-pso-and-pos-permutations",
action="store_true",
6 changes: 6 additions & 0 deletions test/qlever/commands/test_index_execute.py
@@ -41,6 +41,8 @@ def test_execute_successful_indexing_without_extras(
args.index_container = "test_container"
args.image = "test_image"
args.multi_input_json = False
args.ulimit = None
args.parser_buffer_size = None

# Mock glob, get_total_file_size, get_existing_index_files,
# run_command and containerize
@@ -238,6 +240,8 @@
args.index_container = "test_container"
args.image = "test_image"
args.multi_input_json = False
args.ulimit = None
args.parser_buffer_size = None

# Mock glob, get_total_file_size, get_existing_index_files,
# run_command and containerize
@@ -336,6 +340,8 @@
args.system = "native"
args.settings_json = '{"example": "settings"}'
args.show = True
args.ulimit = None
args.parser_buffer_size = None

# Mock get_input_options_for_json
mock_input_json.return_value = "test_input_stream"
4 changes: 3 additions & 1 deletion test/qlever/commands/test_index_other_methods.py
@@ -38,9 +38,11 @@ def test_relevant_qleverfile_arguments(self):
"settings_json",
"index_binary",
"only_pso_and_pos_permutations",
"ulimit",
"use_patterns",
"text_index",
"stxxl_memory",
"parser_buffer_size",
],
"runtime": ["system", "image", "index_container"],
},
@@ -63,7 +65,7 @@ def test_additional_arguments(self):
argument_help = subparser._group_actions[-1].help
self.assertEqual(
argument_help,
"Overwrite an existing index, " "think twice before using.",
"Overwrite an existing index, " "think twice before using this",
)

def test_get_input_options_for_json_valid_input(self):
