Add options --ulimit and --parser-buffer-size for the `qlever index` command (#132)

So far, the `ulimit` was set to `1048576` whenever the total size of the input files was larger than `10 GB`, which is a simplistic and rough heuristic. In particular, it does not work when the RDF data is produced from the input file and the latter is relatively small compared to the former, or when the OS allows increasing the `ulimit`, but not by that much. Now the `ulimit` can also be set explicitly, via the option `--ulimit` of the `qlever index` command or via the variable `ULIMIT` in the `[index]` section of the Qleverfile.
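
For example, to set the limit explicitly (a sketch; `1048576` is just the value the old heuristic used, any limit the OS supports works):

# Via the command line:
qlever index --ulimit 1048576

# Or via the Qleverfile:
[index]
ULIMIT = 1048576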

Further, ad-freiburg/qlever#1698 introduced an option `--parser-buffer-size` to `IndexBuilderMain`; see there for details on its effect. This can now also be set explicitly, via the option `--parser-buffer-size` of the `qlever index` command or via the variable `PARSER_BUFFER_SIZE` in the `[index]` section of the Qleverfile.
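
For example (a sketch; `100M` is the value used in the updated Qleverfiles below, the default being `10M`):

# Via the command line:
qlever index --parser-buffer-size 100M

# Or via the Qleverfile:
[index]
PARSER_BUFFER_SIZE = 100M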
hannahbast authored Feb 12, 2025
1 parent f1797ef commit e5a4990
Showing 6 changed files with 77 additions and 23 deletions.
8 changes: 5 additions & 3 deletions src/qlever/Qleverfiles/Qleverfile.ohm-planet
@@ -17,9 +17,11 @@ VERSION = $$(date -r ${NAME}.pbf +%d.%m.%Y || echo "NO_DATE")
DESCRIPTION = OHM Planet, data from ${GET_DATA_URL} version ${VERSION} (with GeoSPARQL predicates ogc:sfContains and ogc:sfIntersects)

[index]
INPUT_FILES = ${data:NAME}.ttl.bz2
CAT_INPUT_FILES = bzcat -f ${INPUT_FILES}
SETTINGS_JSON = { "prefixes-external": [""], "ascii-prefixes-only": false, "parallel-parsing": true, "num-triples-per-batch": 5000000 }
INPUT_FILES = ${data:NAME}.ttl.bz2
CAT_INPUT_FILES = lbzcat -n 2 ${INPUT_FILES}
PARALLEL_PARSING = true
PARSER_BUFFER_SIZE = 100M
SETTINGS_JSON = { "num-triples-per-batch": 5000000 }

[server]
PORT = 7037
14 changes: 8 additions & 6 deletions src/qlever/Qleverfiles/Qleverfile.osm-planet
@@ -1,6 +1,6 @@
# Qleverfile for OSM Planet, use with the qlever script (pip install qlever)
#
# qlever get-data # takes ~50 mins to download .ttl.bz2 file of ~ 300 GB
# qlever get-data # downloads .ttl.bz2 file of ~ 400 GB with ~ 100 B triples
# qlever index # takes ~12 hours and ~20 GB RAM (on an AMD Ryzen 9 5900X)
# qlever start # takes a few seconds
#
@@ -15,15 +15,17 @@ VERSION = $$(date -r ${NAME}.ttl.bz2 +"%d.%m.%Y" || echo "NO_DATE")
DESCRIPTION = OSM Planet, data from ${DATA_URL} version ${VERSION} (complete OSM data, with GeoSPARQL predicates ogc:sfContains and ogc:sfIntersects)

[index]
INPUT_FILES = ${data:NAME}.ttl.bz2
CAT_INPUT_FILES = lbzcat -f -n 2 ${INPUT_FILES}
STXXL_MEMORY = 20G
SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "ascii-prefixes-only": false, "num-triples-per-batch": 5000000 }
INPUT_FILES = ${data:NAME}.ttl.bz2
CAT_INPUT_FILES = lbzcat -n 2 ${INPUT_FILES}
PARALLEL_PARSING = true
PARSER_BUFFER_SIZE = 100M
STXXL_MEMORY = 40G
SETTINGS_JSON = { "num-triples-per-batch": 10000000 }

[server]
PORT = 7007
ACCESS_TOKEN = ${data:NAME}
MEMORY_FOR_QUERIES = 90G
MEMORY_FOR_QUERIES = 60G
CACHE_MAX_SIZE = 40G
CACHE_MAX_SIZE_SINGLE_ENTRY = 30G
TIMEOUT = 300s
53 changes: 40 additions & 13 deletions src/qlever/commands/index.py
@@ -2,13 +2,17 @@

import glob
import json
import shlex
import re
import shlex

from qlever.command import QleverCommand
from qlever.containerize import Containerize
from qlever.log import log
from qlever.util import get_existing_index_files, get_total_file_size, run_command
from qlever.util import (
get_existing_index_files,
get_total_file_size,
run_command,
)


class IndexCommand(QleverCommand):
@@ -36,9 +40,11 @@ def relevant_qleverfile_arguments(self) -> dict[str : list[str]]:
"settings_json",
"index_binary",
"only_pso_and_pos_permutations",
"ulimit",
"use_patterns",
"text_index",
"stxxl_memory",
"parser_buffer_size",
],
"runtime": ["system", "image", "index_container"],
}
@@ -48,7 +54,7 @@ def additional_arguments(self, subparser) -> None:
"--overwrite-existing",
action="store_true",
default=False,
help="Overwrite an existing index, think twice before using.",
help="Overwrite an existing index, think twice before using this",
)

# Exception for invalid JSON.
@@ -76,7 +82,8 @@ def get_input_options_for_json(self, args) -> str:
# Check that it is an array of length at least one.
if not isinstance(input_specs, list):
raise self.InvalidInputJson(
"`MULTI_INPUT_JSON` must be a JSON array", args.multi_input_json
"`MULTI_INPUT_JSON` must be a JSON array",
args.multi_input_json,
)
if len(input_specs) == 0:
raise self.InvalidInputJson(
@@ -90,13 +97,15 @@
# Check that `input_spec` is a dictionary.
if not isinstance(input_spec, dict):
raise self.InvalidInputJson(
f"Element {i} in `MULTI_INPUT_JSON` must be a JSON " "object",
f"Element {i} in `MULTI_INPUT_JSON` must be a JSON "
"object",
input_spec,
)
# For each `input_spec`, we must have a command.
if "cmd" not in input_spec:
raise self.InvalidInputJson(
f"Element {i} in `MULTI_INPUT_JSON` must contain a " "key `cmd`",
f"Element {i} in `MULTI_INPUT_JSON` must contain a "
"key `cmd`",
input_spec,
)
# If the command contains a `{}` placeholder, we need a `for-each`
@@ -204,20 +213,31 @@ def execute(self, args) -> bool:
index_cmd += " --only-pso-and-pos-permutations --no-patterns"
if not args.use_patterns:
index_cmd += " --no-patterns"
if args.text_index in ["from_text_records", "from_text_records_and_literals"]:
if args.text_index in [
"from_text_records",
"from_text_records_and_literals",
]:
index_cmd += (
f" -w {args.name}.wordsfile.tsv" f" -d {args.name}.docsfile.tsv"
f" -w {args.name}.wordsfile.tsv"
f" -d {args.name}.docsfile.tsv"
)
if args.text_index in ["from_literals", "from_text_records_and_literals"]:
if args.text_index in [
"from_literals",
"from_text_records_and_literals",
]:
index_cmd += " --text-words-from-literals"
if args.stxxl_memory:
index_cmd += f" --stxxl-memory {args.stxxl_memory}"
if args.parser_buffer_size:
index_cmd += f" --parser-buffer-size {args.parser_buffer_size}"
index_cmd += f" | tee {args.name}.index-log.txt"

# If the total file size is larger than 10 GB, set ulimit (such that a
# large number of open files is allowed).
total_file_size = get_total_file_size(shlex.split(args.input_files))
if total_file_size > 1e10:
if args.ulimit is not None:
index_cmd = f"ulimit -Sn {args.ulimit}; {index_cmd}"
elif total_file_size > 1e10:
index_cmd = f"ulimit -Sn 1048576; {index_cmd}"

# Run the command in a container (if so desired).
@@ -234,7 +254,8 @@

# Command for writing the settings JSON to a file.
settings_json_cmd = (
f"echo {shlex.quote(args.settings_json)} " f"> {args.name}.settings.json"
f"echo {shlex.quote(args.settings_json)} "
f"> {args.name}.settings.json"
)

# Show the command line.
@@ -279,9 +300,15 @@ def execute(self, args) -> bool:
return False

# Remove already existing container.
if args.system in Containerize.supported_systems() and args.overwrite_existing:
if (
args.system in Containerize.supported_systems()
and args.overwrite_existing
):
if Containerize.is_running(args.system, args.index_container):
log.info("Another index process is running, trying to stop " "it ...")
log.info(
"Another index process is running, trying to stop "
"it ..."
)
log.info("")
try:
run_command(f"{args.system} rm -f {args.index_container}")
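
To illustrate the new `ulimit` handling in `execute` above (a sketch; `500000` is a hypothetical explicit limit and `IndexBuilderMain ...` stands for the rest of the index command):

# With --ulimit 500000 (or ULIMIT = 500000 in the Qleverfile):
ulimit -Sn 500000; IndexBuilderMain ...

# Without --ulimit, total input file size > 10 GB:
ulimit -Sn 1048576; IndexBuilderMain ...

# Without --ulimit, total input file size <= 10 GB, no prefix:
IndexBuilderMain ...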
15 changes: 15 additions & 0 deletions src/qlever/qleverfile.py
@@ -105,6 +105,14 @@ def arg(*args, **kwargs):
default="{}",
help="The `.settings.json` file for the index",
)
index_args["ulimit"] = arg(
"--ulimit",
type=int,
default=None,
help="Explicitly set the limit for the maximal number of open "
"files (default: 1048576 when the total size of the input files "
"is larger than 10 GB)",
)
index_args["index_binary"] = arg(
"--index-binary",
type=str,
@@ -119,6 +127,13 @@ def arg(*args, **kwargs):
help="The amount of memory to use for the index build "
"(the name of the option has historical reasons)",
)
index_args["parser_buffer_size"] = arg(
"--parser-buffer-size",
type=str,
default=None,
help="The size of the buffer used for parsing (no single RDF "
"statement must be larger than this size; default: 10M)",
)
index_args["only_pso_and_pos_permutations"] = arg(
"--only-pso-and-pos-permutations",
action="store_true",
6 changes: 6 additions & 0 deletions test/qlever/commands/test_index_execute.py
@@ -41,6 +41,8 @@ def test_execute_successful_indexing_without_extras(
args.index_container = "test_container"
args.image = "test_image"
args.multi_input_json = False
args.ulimit = None
args.parser_buffer_size = None

# Mock glob, get_total_file_size, get_existing_index_files,
# run_command and containerize
@@ -238,6 +240,8 @@
args.index_container = "test_container"
args.image = "test_image"
args.multi_input_json = False
args.ulimit = None
args.parser_buffer_size = None

# Mock glob, get_total_file_size, get_existing_index_files,
# run_command and containerize
@@ -336,6 +340,8 @@
args.system = "native"
args.settings_json = '{"example": "settings"}'
args.show = True
args.ulimit = None
args.parser_buffer_size = None

# Mock get_input_options_for_json
mock_input_json.return_value = "test_input_stream"
4 changes: 3 additions & 1 deletion test/qlever/commands/test_index_other_methods.py
@@ -38,9 +38,11 @@ def test_relevant_qleverfile_arguments(self):
"settings_json",
"index_binary",
"only_pso_and_pos_permutations",
"ulimit",
"use_patterns",
"text_index",
"stxxl_memory",
"parser_buffer_size",
],
"runtime": ["system", "image", "index_container"],
},
@@ -63,7 +65,7 @@ def test_additional_arguments(self):
argument_help = subparser._group_actions[-1].help
self.assertEqual(
argument_help,
"Overwrite an existing index, " "think twice before using.",
"Overwrite an existing index, " "think twice before using this",
)

def test_get_input_options_for_json_valid_input(self):
