From 1999b6446c097a8e6adc6c6c37ecc7c7f5213521 Mon Sep 17 00:00:00 2001 From: mrks Date: Sun, 1 Nov 2020 07:19:27 +0100 Subject: [PATCH] Add the JCC-H benchmark (#2249) Adds support for the JCC-H benchmark. This... erm... gives us more options to improve our statistics components ;) Query 9 does not finish for SF 1 right now. As it works with smaller SFs, I do not consider it an issue of this PR, but of the query plan. Something to fix later. --- .gitmodules | 3 + CMakeLists.txt | 2 +- Jenkinsfile | 2 + README.md | 3 +- resources/benchmark/jcch/customer.csv.json | 54 +++ resources/benchmark/jcch/lineitem.csv.json | 94 +++++ resources/benchmark/jcch/nation.csv.json | 34 ++ resources/benchmark/jcch/orders.csv.json | 59 +++ resources/benchmark/jcch/part.csv.json | 59 +++ resources/benchmark/jcch/partsupp.csv.json | 39 ++ resources/benchmark/jcch/region.csv.json | 29 ++ resources/benchmark/jcch/supplier.csv.json | 49 +++ scripts/test/hyriseBenchmarkJCCH_test.py | 49 +++ scripts/test/hyriseBenchmarkTPCH_test.py | 4 +- src/benchmark/tpch_benchmark.cpp | 75 +++- src/benchmarklib/CMakeLists.txt | 9 + .../file_based_table_generator.cpp | 2 +- .../file_based_table_generator.hpp | 4 +- .../jcch/jcch_benchmark_item_runner.cpp | 337 ++++++++++++++++++ .../jcch/jcch_benchmark_item_runner.hpp | 37 ++ .../jcch/jcch_table_generator.cpp | 89 +++++ .../jcch/jcch_table_generator.hpp | 36 ++ .../tpch/tpch_benchmark_item_runner.cpp | 78 ++-- .../tpch/tpch_benchmark_item_runner.hpp | 5 + src/benchmarklib/tpch/tpch_queries.cpp | 6 +- .../tpch/tpch_table_generator.cpp | 2 +- .../tpch/tpch_table_generator.hpp | 3 +- third_party/CMakeLists.txt | 22 ++ third_party/jcch-dbgen | 1 + 29 files changed, 1131 insertions(+), 55 deletions(-) create mode 100644 resources/benchmark/jcch/customer.csv.json create mode 100644 resources/benchmark/jcch/lineitem.csv.json create mode 100644 resources/benchmark/jcch/nation.csv.json create mode 100644 resources/benchmark/jcch/orders.csv.json create mode 100644 
resources/benchmark/jcch/part.csv.json create mode 100644 resources/benchmark/jcch/partsupp.csv.json create mode 100644 resources/benchmark/jcch/region.csv.json create mode 100644 resources/benchmark/jcch/supplier.csv.json create mode 100755 scripts/test/hyriseBenchmarkJCCH_test.py create mode 100644 src/benchmarklib/jcch/jcch_benchmark_item_runner.cpp create mode 100644 src/benchmarklib/jcch/jcch_benchmark_item_runner.hpp create mode 100644 src/benchmarklib/jcch/jcch_table_generator.cpp create mode 100644 src/benchmarklib/jcch/jcch_table_generator.hpp create mode 160000 third_party/jcch-dbgen diff --git a/.gitmodules b/.gitmodules index d4521270ec..13fddd61f9 100644 --- a/.gitmodules +++ b/.gitmodules @@ -50,3 +50,6 @@ [submodule "third_party/robin-map"] path = third_party/robin-map url = https://github.com/Tessil/robin-map.git +[submodule "third_party/jcch-dbgen"] + path = third_party/jcch-dbgen + url = https://github.com/mrks/dbgen.JCC-H.git diff --git a/CMakeLists.txt b/CMakeLists.txt index b9a228e7f6..2b62c040c9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -113,7 +113,7 @@ find_package(Tbb REQUIRED) find_package(Readline REQUIRED) find_package(Curses REQUIRED) find_package(Sqlite3 REQUIRED) -find_package(Boost REQUIRED COMPONENTS container system) +find_package(Boost REQUIRED COMPONENTS container system date_time) add_definitions(-DBOOST_THREAD_VERSION=5) diff --git a/Jenkinsfile b/Jenkinsfile index bbb8ce0a81..199bc93bec 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -143,10 +143,12 @@ try { sh "./scripts/test/hyriseBenchmarkJoinOrder_test.py clang-debug" sh "./scripts/test/hyriseBenchmarkFileBased_test.py clang-debug" sh "cd clang-debug && ../scripts/test/hyriseBenchmarkTPCH_test.py ." // Own folder to isolate visualization + sh "cd clang-debug && ../scripts/test/hyriseBenchmarkJCCH_test.py ." 
// Own folder to isolate visualization sh "./scripts/test/hyriseConsole_test.py gcc-debug" sh "./scripts/test/hyriseBenchmarkJoinOrder_test.py gcc-debug" sh "./scripts/test/hyriseBenchmarkFileBased_test.py gcc-debug" sh "cd gcc-debug && ../scripts/test/hyriseBenchmarkTPCH_test.py ." // Own folder to isolate visualization + sh "cd gcc-debug && ../scripts/test/hyriseBenchmarkJCCH_test.py ." // Own folder to isolate visualization } else { Utils.markStageSkippedForConditional("debugSystemTests") diff --git a/README.md b/README.md index a025aa799e..bf6d966c3c 100644 --- a/README.md +++ b/README.md @@ -16,9 +16,10 @@ We support a number of benchmarks out of the box. This makes it easy to generate | Benchmark | Notes | | ---------- | ------------------------------------------------------------------------------------------------------------------------ | -| TPC-C | In development, no proper optimization done yet | | TPC-DS | [Query Plans](https://hyrise-ci.epic-hpi.de/job/hyrise/job/hyrise/job/master/lastStableBuild/artifact/query_plans/tpcds) | | TPC-H | [Query Plans](https://hyrise-ci.epic-hpi.de/job/hyrise/job/hyrise/job/master/lastStableBuild/artifact/query_plans/tpch) | +| JCC-H | Call the hyriseBenchmarkTPCH binary with the -j flag. 
| +| TPC-C | In development, no proper optimization done yet | | Join Order | | # Getting started diff --git a/resources/benchmark/jcch/customer.csv.json b/resources/benchmark/jcch/customer.csv.json new file mode 100644 index 0000000000..ae1c61a97c --- /dev/null +++ b/resources/benchmark/jcch/customer.csv.json @@ -0,0 +1,54 @@ +{ + "columns": [ + { + "name": "c_custkey", + "nullable": false, + "type": "int" + }, + { + "name": "c_name", + "nullable": false, + "type": "string" + }, + { + "name": "c_address", + "nullable": false, + "type": "string" + }, + { + "name": "c_nationkey", + "nullable": false, + "type": "int" + }, + { + "name": "c_phone", + "nullable": false, + "type": "string" + }, + { + "name": "c_acctbal", + "nullable": false, + "type": "float" + }, + { + "name": "c_mktsegment", + "nullable": false, + "type": "string" + }, + { + "name": "c_comment", + "nullable": false, + "type": "string" + } + ], + "config": { + "delimiter": "\n", + "delimiter_escape": "\\", + "escape": "\"", + "null_handling": "reject_null_strings", + "quote": "\"", + "reject_quoted_nonstrings": true, + "rfc_mode": true, + "separator": "|" + } +} diff --git a/resources/benchmark/jcch/lineitem.csv.json b/resources/benchmark/jcch/lineitem.csv.json new file mode 100644 index 0000000000..16326c8338 --- /dev/null +++ b/resources/benchmark/jcch/lineitem.csv.json @@ -0,0 +1,94 @@ +{ + "columns": [ + { + "name": "l_orderkey", + "nullable": false, + "type": "int" + }, + { + "name": "l_partkey", + "nullable": false, + "type": "int" + }, + { + "name": "l_suppkey", + "nullable": false, + "type": "int" + }, + { + "name": "l_linenumber", + "nullable": false, + "type": "int" + }, + { + "name": "l_quantity", + "nullable": false, + "type": "float" + }, + { + "name": "l_extendedprice", + "nullable": false, + "type": "float" + }, + { + "name": "l_discount", + "nullable": false, + "type": "float" + }, + { + "name": "l_tax", + "nullable": false, + "type": "float" + }, + { + "name": "l_returnflag", + 
"nullable": false, + "type": "string" + }, + { + "name": "l_linestatus", + "nullable": false, + "type": "string" + }, + { + "name": "l_shipdate", + "nullable": false, + "type": "string" + }, + { + "name": "l_commitdate", + "nullable": false, + "type": "string" + }, + { + "name": "l_receiptdate", + "nullable": false, + "type": "string" + }, + { + "name": "l_shipinstruct", + "nullable": false, + "type": "string" + }, + { + "name": "l_shipmode", + "nullable": false, + "type": "string" + }, + { + "name": "l_comment", + "nullable": false, + "type": "string" + } + ], + "config": { + "delimiter": "\n", + "delimiter_escape": "\\", + "escape": "\"", + "null_handling": "reject_null_strings", + "quote": "\"", + "reject_quoted_nonstrings": true, + "rfc_mode": true, + "separator": "|" + } +} diff --git a/resources/benchmark/jcch/nation.csv.json b/resources/benchmark/jcch/nation.csv.json new file mode 100644 index 0000000000..b15b7aa713 --- /dev/null +++ b/resources/benchmark/jcch/nation.csv.json @@ -0,0 +1,34 @@ +{ + "columns": [ + { + "name": "n_nationkey", + "nullable": false, + "type": "int" + }, + { + "name": "n_name", + "nullable": false, + "type": "string" + }, + { + "name": "n_regionkey", + "nullable": false, + "type": "int" + }, + { + "name": "n_comment", + "nullable": false, + "type": "string" + } + ], + "config": { + "delimiter": "\n", + "delimiter_escape": "\\", + "escape": "\"", + "null_handling": "reject_null_strings", + "quote": "\"", + "reject_quoted_nonstrings": true, + "rfc_mode": true, + "separator": "|" + } +} diff --git a/resources/benchmark/jcch/orders.csv.json b/resources/benchmark/jcch/orders.csv.json new file mode 100644 index 0000000000..eab2193db6 --- /dev/null +++ b/resources/benchmark/jcch/orders.csv.json @@ -0,0 +1,59 @@ +{ + "columns": [ + { + "name": "o_orderkey", + "nullable": false, + "type": "int" + }, + { + "name": "o_custkey", + "nullable": false, + "type": "int" + }, + { + "name": "o_orderstatus", + "nullable": false, + "type": "string" + }, 
+ { + "name": "o_totalprice", + "nullable": false, + "type": "float" + }, + { + "name": "o_orderdate", + "nullable": false, + "type": "string" + }, + { + "name": "o_orderpriority", + "nullable": false, + "type": "string" + }, + { + "name": "o_clerk", + "nullable": false, + "type": "string" + }, + { + "name": "o_shippriority", + "nullable": false, + "type": "int" + }, + { + "name": "o_comment", + "nullable": false, + "type": "string" + } + ], + "config": { + "delimiter": "\n", + "delimiter_escape": "\\", + "escape": "\"", + "null_handling": "reject_null_strings", + "quote": "\"", + "reject_quoted_nonstrings": true, + "rfc_mode": true, + "separator": "|" + } +} diff --git a/resources/benchmark/jcch/part.csv.json b/resources/benchmark/jcch/part.csv.json new file mode 100644 index 0000000000..25e5877590 --- /dev/null +++ b/resources/benchmark/jcch/part.csv.json @@ -0,0 +1,59 @@ +{ + "columns": [ + { + "name": "p_partkey", + "nullable": false, + "type": "int" + }, + { + "name": "p_name", + "nullable": false, + "type": "string" + }, + { + "name": "p_mfgr", + "nullable": false, + "type": "string" + }, + { + "name": "p_brand", + "nullable": false, + "type": "string" + }, + { + "name": "p_type", + "nullable": false, + "type": "string" + }, + { + "name": "p_size", + "nullable": false, + "type": "int" + }, + { + "name": "p_container", + "nullable": false, + "type": "string" + }, + { + "name": "p_retailprice", + "nullable": false, + "type": "float" + }, + { + "name": "p_comment", + "nullable": false, + "type": "string" + } + ], + "config": { + "delimiter": "\n", + "delimiter_escape": "\\", + "escape": "\"", + "null_handling": "reject_null_strings", + "quote": "\"", + "reject_quoted_nonstrings": true, + "rfc_mode": true, + "separator": "|" + } +} diff --git a/resources/benchmark/jcch/partsupp.csv.json b/resources/benchmark/jcch/partsupp.csv.json new file mode 100644 index 0000000000..fcabcd0c62 --- /dev/null +++ b/resources/benchmark/jcch/partsupp.csv.json @@ -0,0 +1,39 @@ +{ +
"columns": [ + { + "name": "ps_partkey", + "nullable": false, + "type": "int" + }, + { + "name": "ps_suppkey", + "nullable": false, + "type": "int" + }, + { + "name": "ps_availqty", + "nullable": false, + "type": "int" + }, + { + "name": "ps_supplycost", + "nullable": false, + "type": "float" + }, + { + "name": "ps_comment", + "nullable": false, + "type": "string" + } + ], + "config": { + "delimiter": "\n", + "delimiter_escape": "\\", + "escape": "\"", + "null_handling": "reject_null_strings", + "quote": "\"", + "reject_quoted_nonstrings": true, + "rfc_mode": true, + "separator": "|" + } +} diff --git a/resources/benchmark/jcch/region.csv.json b/resources/benchmark/jcch/region.csv.json new file mode 100644 index 0000000000..5fb2e3199a --- /dev/null +++ b/resources/benchmark/jcch/region.csv.json @@ -0,0 +1,29 @@ +{ + "columns": [ + { + "name": "r_regionkey", + "nullable": false, + "type": "int" + }, + { + "name": "r_name", + "nullable": false, + "type": "string" + }, + { + "name": "r_comment", + "nullable": false, + "type": "string" + } + ], + "config": { + "delimiter": "\n", + "delimiter_escape": "\\", + "escape": "\"", + "null_handling": "reject_null_strings", + "quote": "\"", + "reject_quoted_nonstrings": true, + "rfc_mode": true, + "separator": "|" + } +} diff --git a/resources/benchmark/jcch/supplier.csv.json b/resources/benchmark/jcch/supplier.csv.json new file mode 100644 index 0000000000..cb22b45593 --- /dev/null +++ b/resources/benchmark/jcch/supplier.csv.json @@ -0,0 +1,49 @@ +{ + "columns": [ + { + "name": "s_suppkey", + "nullable": false, + "type": "int" + }, + { + "name": "s_name", + "nullable": false, + "type": "string" + }, + { + "name": "s_address", + "nullable": false, + "type": "string" + }, + { + "name": "s_nationkey", + "nullable": false, + "type": "int" + }, + { + "name": "s_phone", + "nullable": false, + "type": "string" + }, + { + "name": "s_acctbal", + "nullable": false, + "type": "float" + }, + { + "name": "s_comment", + "nullable": false, + 
"type": "string" + } + ], + "config": { + "delimiter": "\n", + "delimiter_escape": "\\", + "escape": "\"", + "null_handling": "reject_null_strings", + "quote": "\"", + "reject_quoted_nonstrings": true, + "rfc_mode": true, + "separator": "|" + } +} diff --git a/scripts/test/hyriseBenchmarkJCCH_test.py b/scripts/test/hyriseBenchmarkJCCH_test.py new file mode 100755 index 0000000000..ad63f0d610 --- /dev/null +++ b/scripts/test/hyriseBenchmarkJCCH_test.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 + +from hyriseBenchmarkCore import close_benchmark, check_exit_status, initialize, run_benchmark + + +def main(): + build_dir = initialize() + + # Run JCC-H and validate its output using pexpect and check if all queries were successfully verified with sqlite. + arguments = {} + arguments["--scale"] = ".01" + arguments["--chunk_size"] = "10000" + arguments["--queries"] = "'2,4,6'" + arguments["--time"] = "10" + arguments["--runs"] = "100" + arguments["--warmup"] = "10" + arguments["--encoding"] = "'LZ4'" + arguments["--compression"] = "'SIMD-BP128'" + arguments["--indexes"] = "false" + arguments["--scheduler"] = "true" + arguments["--clients"] = "4" + arguments["--jcch"] = "skewed" + arguments["--verify"] = "true" + arguments["--dont_cache_binary_tables"] = "true" + + benchmark = run_benchmark(build_dir, arguments, "hyriseBenchmarkTPCH", True) + + benchmark.expect_exact("Running in multi-threaded mode using all available cores") + benchmark.expect_exact("4 simulated clients are scheduling items in parallel") + benchmark.expect_exact("Running benchmark in 'Ordered' mode") + benchmark.expect_exact("Encoding is 'LZ4'") + benchmark.expect_exact("Chunk size is 10000") + benchmark.expect_exact("Max runs per item is 100") + benchmark.expect_exact("Max duration per item is 10 seconds") + benchmark.expect_exact("Warmup duration per item is 10 seconds") + benchmark.expect_exact("Benchmarking Queries: [ 2, 4, 6 ]") + benchmark.expect_exact("JCC-H scale factor is 0.01") + 
benchmark.expect_exact("Using prepared statements: no") + benchmark.expect_exact("Using JCC-H dbgen from") + benchmark.expect_exact("JCC-H query parameters are skewed") + benchmark.expect_exact("calling external qgen") + benchmark.expect_exact("Multi-threaded Topology:") + + close_benchmark(benchmark) + check_exit_status(benchmark) + + +if __name__ == "__main__": + main() diff --git a/scripts/test/hyriseBenchmarkTPCH_test.py b/scripts/test/hyriseBenchmarkTPCH_test.py index a1cd18d87d..8aeb730208 100755 --- a/scripts/test/hyriseBenchmarkTPCH_test.py +++ b/scripts/test/hyriseBenchmarkTPCH_test.py @@ -43,7 +43,7 @@ def main(): benchmark.expect_exact("No warmup runs are performed") benchmark.expect_exact("Not caching tables as binary files") benchmark.expect_exact("Benchmarking Queries: [ 1, 13, 19 ]") - benchmark.expect_exact("TPCH scale factor is 0.01") + benchmark.expect_exact("TPC-H scale factor is 0.01") benchmark.expect_exact("Using prepared statements: yes") benchmark.expect_exact("Creating index on customer [ c_custkey ]") benchmark.expect_exact("Preparing queries") @@ -139,7 +139,7 @@ def main(): benchmark.expect_exact("Max duration per item is 10 seconds") benchmark.expect_exact("Warmup duration per item is 10 seconds") benchmark.expect_exact("Benchmarking Queries: [ 2, 4, 6 ]") - benchmark.expect_exact("TPCH scale factor is 0.01") + benchmark.expect_exact("TPC-H scale factor is 0.01") benchmark.expect_exact("Using prepared statements: no") benchmark.expect_exact("Multi-threaded Topology:") diff --git a/src/benchmark/tpch_benchmark.cpp b/src/benchmark/tpch_benchmark.cpp index 9a76c455f8..71d9f57ab4 100644 --- a/src/benchmark/tpch_benchmark.cpp +++ b/src/benchmark/tpch_benchmark.cpp @@ -10,6 +10,8 @@ #include "cli_config_parser.hpp" #include "cxxopts.hpp" #include "hyrise.hpp" +#include "jcch/jcch_benchmark_item_runner.hpp" +#include "jcch/jcch_table_generator.hpp" #include "tpch/tpch_benchmark_item_runner.hpp" #include "tpch/tpch_queries.hpp" #include 
"tpch/tpch_table_generator.hpp" @@ -29,22 +31,29 @@ using namespace opossum; // NOLINT * * main() is mostly concerned with parsing the CLI options while BenchmarkRunner.run() performs the actual benchmark * logic. + * + * The same binary is used to run the JCC-H benchmark. For this, simply use the -j flag. */ int main(int argc, char* argv[]) { - auto cli_options = BenchmarkRunner::get_basic_cli_options("TPC-H Benchmark"); + auto cli_options = BenchmarkRunner::get_basic_cli_options("TPC-H/JCC-H Benchmark"); // clang-format off cli_options.add_options() ("s,scale", "Database scale factor (1.0 ~ 1GB)", cxxopts::value()->default_value("1")) ("q,queries", "Specify queries to run (comma-separated query ids, e.g. \"--queries 1,3,19\"), default is all", cxxopts::value()) // NOLINT - ("use_prepared_statements", "Use prepared statements instead of random SQL strings", cxxopts::value()->default_value("false")); // NOLINT + ("use_prepared_statements", "Use prepared statements instead of random SQL strings", cxxopts::value()->default_value("false")) // NOLINT + ("j,jcch", "Use JCC-H data and query generators instead of TPC-H. If this parameter is used, table data always " + "contains skew. With --jcch=skewed, queries are generated to be affected by this skew. 
With " + "--jcch=normal, query parameters access the unskewed part of the tables ", cxxopts::value()->default_value("")); // NOLINT // clang-format on std::shared_ptr config; std::string comma_separated_queries; float scale_factor; bool use_prepared_statements; + bool jcch; + auto jcch_skewed = false; // Parse command line args const auto cli_parse_result = cli_options.parse(argc, argv); @@ -60,6 +69,17 @@ int main(int argc, char* argv[]) { config = std::make_shared(CLIConfigParser::parse_cli_options(cli_parse_result)); use_prepared_statements = cli_parse_result["use_prepared_statements"].as(); + jcch = cli_parse_result.count("jcch"); + if (jcch) { + const auto jcch_mode = cli_parse_result["jcch"].as(); + if (jcch_mode == "skewed") { + jcch_skewed = true; + } else if (jcch_mode == "normal") { // NOLINT + jcch_skewed = false; + } else { + Fail("Invalid jcch mode, use skewed or normal"); + } + } std::vector item_ids; @@ -75,7 +95,7 @@ int main(int argc, char* argv[]) { std::transform(item_ids_str.begin(), item_ids_str.end(), std::back_inserter(item_ids), [](const auto& item_id_str) { const auto item_id = BenchmarkItemID{boost::lexical_cast(item_id_str) - 1}; - DebugAssert(item_id < 22, "There are only 22 TPC-H queries"); + DebugAssert(item_id < 22, "There are only 22 queries"); return item_id; }); } @@ -91,7 +111,7 @@ int main(int argc, char* argv[]) { Assert(!use_prepared_statements || !config->verify, "SQLite validation does not work with prepared statements"); if (config->verify) { - // Hack: We cannot verify TPC-H Q15, thus we remove it from the list of queries + // Hack: We cannot verify Q15, thus we remove it from the list of queries auto it = std::remove(item_ids.begin(), item_ids.end(), 15 - 1); if (it != item_ids.end()) { // The problem is that the last part of the query, "DROP VIEW", does not return a table. 
Since we also have @@ -102,16 +122,55 @@ int main(int argc, char* argv[]) { } } - std::cout << "- TPCH scale factor is " << scale_factor << std::endl; + std::cout << "- " << (jcch ? "JCC-H" : "TPC-H") << " scale factor is " << scale_factor << std::endl; std::cout << "- Using prepared statements: " << (use_prepared_statements ? "yes" : "no") << std::endl; // Add TPCH-specific information context.emplace("scale_factor", scale_factor); context.emplace("use_prepared_statements", use_prepared_statements); - auto item_runner = std::make_unique(config, use_prepared_statements, scale_factor, item_ids); - auto benchmark_runner = std::make_shared( - *config, std::move(item_runner), std::make_unique(scale_factor, config), context); + auto table_generator = std::unique_ptr{}; + auto item_runner = std::unique_ptr{}; + + if (jcch) { + // Different from the TPC-H benchmark, where the table and query generators are immediately embedded in Hyrise, the + // JCC-H implementation calls those generators externally. This is because we would get linking conflicts if we were + // to include both generators. Unfortunately, this approach is somewhat slower (30s to start SF1 with TPC-H, 1m18s + // with JCC-H). + // + // JCC-H has both a skewed and a "normal" (i.e., unskewed) mode. The unskewed mode is not the same as TPC-H. You can + // find details in the JCC-H paper: https://ir.cwi.nl/pub/27429 + + // Try to find dbgen/qgen binaries + auto jcch_dbgen_path = + std::filesystem::canonical(std::string{argv[0]}).remove_filename() / "third_party/jcch-dbgen"; + Assert(std::filesystem::exists(jcch_dbgen_path / "dbgen"), + std::string{"JCC-H dbgen not found at "} + jcch_dbgen_path.c_str()); + Assert(std::filesystem::exists(jcch_dbgen_path / "qgen"), + std::string{"JCC-H qgen not found at "} + jcch_dbgen_path.c_str()); + + // Create the jcch_data directory (if needed) and generate the jcch_data/sf-... 
path + auto jcch_data_path_str = std::ostringstream{}; + jcch_data_path_str << "jcch_data/sf-" << std::noshowpoint << scale_factor; + std::filesystem::create_directories(jcch_data_path_str.str()); + // Success of create_directories is guaranteed by the call to fs::canonical, which fails on invalid paths: + auto jcch_data_path = std::filesystem::canonical(jcch_data_path_str.str()); + + std::cout << "- Using JCC-H dbgen from " << jcch_dbgen_path << std::endl; + std::cout << "- Storing JCC-H tables and query parameters in " << jcch_data_path << std::endl; + std::cout << "- JCC-H query parameters are " << (jcch_skewed ? "skewed" : "not skewed") << std::endl; + + // Create the table generator and item runner + table_generator = std::make_unique(jcch_dbgen_path, jcch_data_path, scale_factor, config); + item_runner = std::make_unique(jcch_skewed, jcch_dbgen_path, jcch_data_path, config, + use_prepared_statements, scale_factor, item_ids); + } else { + table_generator = std::make_unique(scale_factor, config); + item_runner = std::make_unique(config, use_prepared_statements, scale_factor, item_ids); + } + + auto benchmark_runner = + std::make_shared(*config, std::move(item_runner), std::move(table_generator), context); Hyrise::get().benchmark_runner = benchmark_runner; if (config->verify) { diff --git a/src/benchmarklib/CMakeLists.txt b/src/benchmarklib/CMakeLists.txt index ea8bb0728e..684ce76470 100644 --- a/src/benchmarklib/CMakeLists.txt +++ b/src/benchmarklib/CMakeLists.txt @@ -37,6 +37,11 @@ set( tpch/tpch_table_generator.cpp tpch/tpch_table_generator.hpp + jcch/jcch_benchmark_item_runner.cpp + jcch/jcch_benchmark_item_runner.hpp + jcch/jcch_table_generator.cpp + jcch/jcch_table_generator.hpp + tpcds/tpcds_table_generator.cpp tpcds/tpcds_table_generator.hpp @@ -75,10 +80,14 @@ set( # Configure the regular opossum library used for tests/server/playground... 
add_library(hyriseBenchmarkLib STATIC ${SOURCES}) +add_dependencies(hyriseBenchmarkLib jcchDbgen-build) # *-build is auto-generated target_link_libraries( hyriseBenchmarkLib + PUBLIC + hyrise + ${Boost_DATE_TIME_LIBRARY} tpch_dbgen tpcds_dbgen ) diff --git a/src/benchmarklib/file_based_table_generator.cpp b/src/benchmarklib/file_based_table_generator.cpp index b646b984d4..5a0fdc3224 100644 --- a/src/benchmarklib/file_based_table_generator.cpp +++ b/src/benchmarklib/file_based_table_generator.cpp @@ -20,7 +20,7 @@ FileBasedTableGenerator::FileBasedTableGenerator(const std::shared_ptr FileBasedTableGenerator::generate() { - Assert(std::filesystem::is_directory(_path), "Table path must be a directory"); + Assert(std::filesystem::is_directory(_path), std::string{"Table path "} + _path + " must be a directory"); auto table_info_by_name = std::unordered_map{}; const auto table_extensions = std::unordered_set{".csv", ".tbl", ".bin"}; diff --git a/src/benchmarklib/file_based_table_generator.hpp b/src/benchmarklib/file_based_table_generator.hpp index 13037c063b..65aa005de7 100644 --- a/src/benchmarklib/file_based_table_generator.hpp +++ b/src/benchmarklib/file_based_table_generator.hpp @@ -7,13 +7,13 @@ namespace opossum { -class FileBasedTableGenerator : public AbstractTableGenerator { +class FileBasedTableGenerator : virtual public AbstractTableGenerator { public: FileBasedTableGenerator(const std::shared_ptr& benchmark_config, const std::string& path); std::unordered_map generate() override; - private: + protected: const std::string _path; }; diff --git a/src/benchmarklib/jcch/jcch_benchmark_item_runner.cpp b/src/benchmarklib/jcch/jcch_benchmark_item_runner.cpp new file mode 100644 index 0000000000..89f9d47428 --- /dev/null +++ b/src/benchmarklib/jcch/jcch_benchmark_item_runner.cpp @@ -0,0 +1,337 @@ +#include "jcch_benchmark_item_runner.hpp" + +#include +#include + +#include "tpch/tpch_queries.hpp" +#include "utils/string_utils.hpp" +#include "utils/timer.hpp" + 
+namespace opossum { + +JCCHBenchmarkItemRunner::JCCHBenchmarkItemRunner(const bool skewed, const std::string& dbgen_path, + const std::string& data_path, + const std::shared_ptr& config, + bool use_prepared_statements, float scale_factor) + : TPCHBenchmarkItemRunner(config, use_prepared_statements, scale_factor), + _skewed(skewed), + _dbgen_path(dbgen_path), + _data_path(data_path) { + _load_params(); +} + +JCCHBenchmarkItemRunner::JCCHBenchmarkItemRunner(const bool skewed, const std::string& dbgen_path, + const std::string& data_path, + const std::shared_ptr& config, + bool use_prepared_statements, float scale_factor, + const std::vector& items) + : TPCHBenchmarkItemRunner(config, use_prepared_statements, scale_factor, items), + _skewed(skewed), + _dbgen_path(dbgen_path), + _data_path(data_path) { + _load_params(); +} + +std::string JCCHBenchmarkItemRunner::item_name(const BenchmarkItemID item_id) const { + Assert(item_id < 22u, "item_id out of range"); + return std::string("JCC-H ") + (_skewed ? "(skewed) " : "(normal) ") + (item_id + 1 < 10 ? "0" : "") + + std::to_string(item_id + 1); +} + +void JCCHBenchmarkItemRunner::_load_params() { + const auto local_queries_path = _data_path + "/queries/"; + const auto params_path = local_queries_path + "params-" + (_skewed ? "skewed" : "normal"); + + // Check if the query parameters have already been generated + if (!std::filesystem::exists(params_path)) { + Timer timer; + + std::cout << "- Creating query parameters by calling external qgen" << std::flush; + + // Check for the existence of dbgen's query templates (1.sql etc.) 
at the expected location + const auto dbgen_queries_path = _dbgen_path + "/queries/"; + Assert(std::filesystem::exists(dbgen_queries_path), + std::string{"Query templates not found at "} + dbgen_queries_path); + + // Create local directory and symlink query templates if needed + const auto local_queries_dir_created = std::filesystem::create_directory(local_queries_path); + Assert(std::filesystem::exists(local_queries_path), "Creating JCC-H queries folder failed"); + if (local_queries_dir_created) { + auto cmd = std::stringstream{}; + cmd << "cd " << local_queries_path << " && ln -s " << _dbgen_path << "/queries/*.sql ."; + auto ret = system(cmd.str().c_str()); + Assert(!ret, "Creating symlinks to query templates failed"); + } + + // Call qgen a couple of times with different PRNG seeds and store the resulting query parameters in queries/params. + // dbgen doesn't like `-r 0`, so we start at 1. + for (auto seed = 1; seed <= (_config->max_runs > 0 ? _config->max_runs : 100'000); ++seed) { + auto cmd = std::stringstream{}; + cmd << "cd " << local_queries_path << " && " << _dbgen_path << "/qgen " << (_skewed ?
"-k" : "") << " -s " + << _scale_factor << " -b " << _dbgen_path << "/dists.dss -r " << seed << " -l " << params_path + << " >/dev/null"; + auto ret = system(cmd.str().c_str()); + Assert(!ret, "Calling qgen failed"); + } + + std::cout << " (" << timer.lap_formatted() << ")" << std::endl; + } + + // Open the params file, which looks like this (fields are tab-separated): + // query_id<TAB>param0<TAB>param1 + auto file = std::ifstream(params_path); + Assert(file.is_open(), std::string{"Could not open JCC-H parameters at "} + params_path); + + std::string line; + while (std::getline(file, line)) { + // Load the parameter into the corresponding entry in _all_params + auto string_values = split_string_by_delimiter(line, '\t'); + const auto query_id = std::stoi(string_values[0]); + Assert(query_id >= 1 && query_id <= 22, "Invalid query_id"); + string_values.erase(string_values.begin()); + _all_params[query_id - 1].emplace_back(string_values); + } +} + +bool JCCHBenchmarkItemRunner::_on_execute_item(const BenchmarkItemID item_id, BenchmarkSQLExecutor& sql_executor) { + using namespace std::string_literals; // NOLINT + + const auto& this_item_params = _all_params[item_id]; + + // Choose a random parameterization from _all_params + static thread_local std::minstd_rand random_engine{_random_seed++}; + std::uniform_int_distribution<> params_dist{0, static_cast(this_item_params.size() - 1)}; + const auto raw_params_iter = this_item_params.begin() + params_dist(random_engine); + + std::vector parameters; + auto sql = std::string{}; + + // This mirrors TPCHBenchmarkItemRunner::_on_execute_item.
Instead of generating random parameters according to the + // TPC-H specifications, it uses the ones generated by JCC-H's qgen + switch (item_id) { + // Writing `1-1` to make people aware that this is zero-indexed while TPC-H query names are not + case 1 - 1: { + // In some cases, we still need to do the date calculations that Hyrise does not support yet + const auto date = _calculate_date(boost::gregorian::date{1998, 12, 01}, 0, -std::stoi(raw_params_iter->at(0))); + parameters.emplace_back("'"s + date + "'"); + break; + } + + case 2 - 1: { + parameters.emplace_back(raw_params_iter->at(0)); + parameters.emplace_back("'%"s + raw_params_iter->at(1) + "'"); + parameters.emplace_back("'"s + raw_params_iter->at(2) + "'"); + parameters.emplace_back("'"s + raw_params_iter->at(2) + "'"); + break; + } + + case 3 - 1: { + parameters.emplace_back("'"s + raw_params_iter->at(0) + "'"); + parameters.emplace_back("'"s + raw_params_iter->at(1) + "'"); + parameters.emplace_back("'"s + raw_params_iter->at(1) + "'"); + break; + } + + case 4 - 1: { + const auto begin_date = boost::gregorian::from_string(raw_params_iter->at(0)); + const auto end_date_str = _calculate_date(begin_date, 3); + + // Cannot use begin_date here, as we would have to convert it into a string first. 
+ parameters.emplace_back("'"s + raw_params_iter->at(0) + "'"); + parameters.emplace_back("'"s + end_date_str + "'"); + break; + } + + case 5 - 1: { + const auto begin_date = boost::gregorian::from_string(raw_params_iter->at(1)); + const auto end_date_str = _calculate_date(begin_date, 12); + + parameters.emplace_back("'"s + raw_params_iter->at(0) + "'"); + parameters.emplace_back("'"s + raw_params_iter->at(1) + "'"); + parameters.emplace_back("'"s + end_date_str + "'"); + break; + } + + case 6 - 1: { + const auto begin_date = boost::gregorian::from_string(raw_params_iter->at(0)); + const auto end_date_str = _calculate_date(begin_date, 12); + + parameters.emplace_back("'"s + raw_params_iter->at(0) + "'"); + parameters.emplace_back("'"s + end_date_str + "'"); + parameters.emplace_back(raw_params_iter->at(1)); + parameters.emplace_back(raw_params_iter->at(1)); + parameters.emplace_back(raw_params_iter->at(2)); + break; + } + + case 7 - 1: { + parameters.emplace_back("'"s + raw_params_iter->at(0) + "'"); + parameters.emplace_back("'"s + raw_params_iter->at(1) + "'"); + parameters.emplace_back("'"s + raw_params_iter->at(1) + "'"); + parameters.emplace_back("'"s + raw_params_iter->at(0) + "'"); + parameters.emplace_back("'"s + raw_params_iter->at(2) + "'"); + parameters.emplace_back("'"s + raw_params_iter->at(3) + "'"); + break; + } + + case 8 - 1: { + parameters.emplace_back("'"s + raw_params_iter->at(0) + "'"); + parameters.emplace_back("'"s + raw_params_iter->at(1) + "'"); + parameters.emplace_back("'"s + raw_params_iter->at(2) + "'"); + parameters.emplace_back("'"s + raw_params_iter->at(3) + "'"); + parameters.emplace_back("'"s + raw_params_iter->at(4) + "'"); + break; + } + + case 9 - 1: { + static auto warned_performance = false; + if (!warned_performance) { + std::cerr << "\nWarning: JCC-H Query 9 needs optimization. 
Consider skipping it using -q\n\n"; + warned_performance = true; + } + + parameters.emplace_back("'%"s + raw_params_iter->at(0) + "%'"); + break; + } + + case 10 - 1: { + parameters.emplace_back("'"s + raw_params_iter->at(0) + "'"); + parameters.emplace_back("'"s + raw_params_iter->at(1) + "'"); + break; + } + + case 11 - 1: { + parameters.emplace_back("'"s + raw_params_iter->at(0) + "'"); + parameters.emplace_back(raw_params_iter->at(1)); + parameters.emplace_back("'"s + raw_params_iter->at(0) + "'"); + break; + } + + case 12 - 1: { + const auto begin_date = boost::gregorian::from_string(raw_params_iter->at(2)); + const auto end_date_str = _calculate_date(begin_date, 12); + + parameters.emplace_back("'"s + raw_params_iter->at(0) + "'"); + parameters.emplace_back("'"s + raw_params_iter->at(1) + "'"); + parameters.emplace_back("'"s + raw_params_iter->at(2) + "'"); + parameters.emplace_back("'"s + end_date_str + "'"); + break; + } + + case 13 - 1: { + parameters.emplace_back("'%"s + raw_params_iter->at(0) + '%' + raw_params_iter->at(1) + "%'"); + break; + } + + case 14 - 1: { + const auto begin_date = boost::gregorian::from_string(raw_params_iter->at(0)); + const auto end_date_str = _calculate_date(begin_date, 1); + + parameters.emplace_back("'"s + raw_params_iter->at(0) + "'"); + parameters.emplace_back("'"s + end_date_str + "'"); + break; + } + + case 15 - 1: { + auto query_15 = std::string{tpch_queries.at(15)}; + + const auto begin_date = boost::gregorian::from_string(raw_params_iter->at(0)); + const auto end_date_str = _calculate_date(begin_date, 3); + + // Hack: We cannot use prepared statements in TPC-H 15. Thus, we need to build the SQL string by hand. + // By manually replacing the `?` from tpch_queries.cpp, we can keep all queries in a readable form there. + // This is ugly, but at least we can assert that nobody tampered with the string over there. 
+ static constexpr auto BEGIN_DATE_OFFSET = 156; + static constexpr auto END_DATE_OFFSET = 192; + DebugAssert((std::string_view{&query_15[BEGIN_DATE_OFFSET], 10} == "1996-01-01" && + std::string_view{&query_15[END_DATE_OFFSET], 10} == "1996-04-01"), + "TPC-H 15 string has been modified"); + query_15.replace(BEGIN_DATE_OFFSET, 10, raw_params_iter->at(0)); + query_15.replace(END_DATE_OFFSET, 10, end_date_str); + + const auto view_id = std::atomic_fetch_add(&_q15_view_id, size_t{1}); + boost::replace_all(query_15, std::string("revenue_view"), std::string("revenue") + std::to_string(view_id)); + + // Not using _substitute_placeholders here + sql = query_15; + break; + } + + case 16 - 1: { + parameters.emplace_back("'"s + raw_params_iter->at(0) + "'"); + parameters.emplace_back("'"s + raw_params_iter->at(1) + "'"); + for (auto i = 0; i < 8; ++i) parameters.emplace_back(raw_params_iter->at(2 + i)); + break; + } + + case 17 - 1: { + parameters.emplace_back("'"s + raw_params_iter->at(0) + "'"); + parameters.emplace_back("'"s + raw_params_iter->at(1) + "'"); + break; + } + + case 18 - 1: { + static auto warned_compliance = false; + if (!warned_compliance) { + std::cerr << "\nWarning: JCC-H Query 18 as used by Hyrise slightly diverges from the specification.\n"; + std::cerr << " See jcch_benchmark_item_runner.cpp for details.\n\n"; + warned_compliance = true; + } + + // JCC-H has a second parameter to this query: + // https://github.com/ldbc/dbgen.JCC-H/commit/d42a7ebc2617ec31de55b00425c23ab7885beeeb#diff-c448b6246f882ef1a5fd8e7ded77b8134addba8443ce2b43425e563045895fc4 // NOLINT + // We do not use this parameter as it would bring a structural change to the SQL query template, which is also + // used for TPC-H. 
+ parameters.emplace_back(raw_params_iter->at(0)); + break; + } + + case 19 - 1: { + parameters.emplace_back("'"s + raw_params_iter->at(0) + "'"); + parameters.emplace_back(raw_params_iter->at(3)); + parameters.emplace_back(raw_params_iter->at(3)); + parameters.emplace_back("'"s + raw_params_iter->at(1) + "'"); + parameters.emplace_back(raw_params_iter->at(4)); + parameters.emplace_back(raw_params_iter->at(4)); + parameters.emplace_back("'"s + raw_params_iter->at(2) + "'"); + parameters.emplace_back(raw_params_iter->at(5)); + parameters.emplace_back(raw_params_iter->at(5)); + + break; + } + + case 20 - 1: { + const auto begin_date = boost::gregorian::from_string(raw_params_iter->at(1)); + const auto end_date_str = _calculate_date(begin_date, 12); + + parameters.emplace_back("'"s + raw_params_iter->at(0) + "%'"); + parameters.emplace_back("'"s + raw_params_iter->at(1) + "'"); + parameters.emplace_back("'"s + end_date_str + "'"); + parameters.emplace_back("'"s + raw_params_iter->at(2) + "'"); + break; + } + + case 21 - 1: { + parameters.emplace_back("'"s + raw_params_iter->at(0) + "'"); + break; + } + + case 22 - 1: { + // We need the same country code twice - have a look at the query + for (auto i = 0; i < 7; ++i) parameters.emplace_back("'"s + raw_params_iter->at(i) + "'"); + for (auto i = 0; i < 7; ++i) parameters.emplace_back("'"s + raw_params_iter->at(i) + "'"); + break; + } + + default: + Fail("There are only 22 JCC-H queries"); + } + + if (sql.empty()) sql = _substitute_placeholders(item_id, parameters); + + const auto [status, table] = sql_executor.execute(sql, nullptr); + Assert(status == SQLPipelineStatus::Success, "JCC-H items should not fail"); + return true; +} + +} // namespace opossum diff --git a/src/benchmarklib/jcch/jcch_benchmark_item_runner.hpp b/src/benchmarklib/jcch/jcch_benchmark_item_runner.hpp new file mode 100644 index 0000000000..34a2dcbe70 --- /dev/null +++ b/src/benchmarklib/jcch/jcch_benchmark_item_runner.hpp @@ -0,0 +1,37 @@ +#pragma 
once + +#include + +#include "tpch/tpch_benchmark_item_runner.hpp" + +namespace opossum { + +// The generation of JCC-H items is based on top of that of the TPC-H. Instead of creating random values in the C++ +// code, we use those generated by JCC-H's qgen. See _on_execute_item for details. + +class JCCHBenchmarkItemRunner : public TPCHBenchmarkItemRunner { + public: + // Constructor for a JCCHBenchmarkItemRunner containing all TPC-H queries + JCCHBenchmarkItemRunner(const bool skewed, const std::string& dbgen_path, const std::string& data_path, + const std::shared_ptr& config, bool use_prepared_statements, + float scale_factor); + + // Constructor for a JCCHBenchmarkItemRunner containing a subset of TPC-H queries + JCCHBenchmarkItemRunner(const bool skewed, const std::string& dbgen_path, const std::string& data_path, + const std::shared_ptr& config, bool use_prepared_statements, + float scale_factor, const std::vector& items); + + std::string item_name(const BenchmarkItemID item_id) const override; + + protected: + bool _on_execute_item(const BenchmarkItemID item_id, BenchmarkSQLExecutor& sql_executor) override; + + void _load_params(); + + const bool _skewed; + const std::string _dbgen_path; + const std::string _data_path; + std::array>, 22> _all_params; +}; + +} // namespace opossum diff --git a/src/benchmarklib/jcch/jcch_table_generator.cpp b/src/benchmarklib/jcch/jcch_table_generator.cpp new file mode 100644 index 0000000000..73bb1f9d5f --- /dev/null +++ b/src/benchmarklib/jcch/jcch_table_generator.cpp @@ -0,0 +1,89 @@ +#include "jcch_table_generator.hpp" + +#include +#include + +#include "utils/timer.hpp" + +namespace opossum { + +JCCHTableGenerator::JCCHTableGenerator(const std::string& dbgen_path, const std::string& data_path, float scale_factor, + uint32_t chunk_size) + : JCCHTableGenerator(dbgen_path, data_path, scale_factor, create_benchmark_config_with_chunk_size(chunk_size)) {} + +JCCHTableGenerator::JCCHTableGenerator(const std::string& dbgen_path, 
const std::string& data_path, float scale_factor, + const std::shared_ptr& benchmark_config) + : AbstractTableGenerator(benchmark_config), + TPCHTableGenerator(scale_factor, benchmark_config), + FileBasedTableGenerator(benchmark_config, data_path), + _dbgen_path(dbgen_path) {} + +std::unordered_map JCCHTableGenerator::generate() { + const auto tables_path = _path + "/tables/"; + + // Check if table data has already been generated (and converted to .bin by the FileBasedTableGenerator) + if (!std::filesystem::exists(tables_path + "/customer.bin")) { + Timer timer; + std::cout << "- Creating table data by calling external dbgen" << std::flush; + + std::filesystem::create_directory(tables_path); + Assert(std::filesystem::exists(tables_path), "Creating JCC-H tables folder failed"); + + { + // Call JCC-H's dbgen + auto cmd = std::stringstream{}; + // `2>` in a string seems to break Sublime Text's formatter, so it's split into two strings + cmd << "cd " << tables_path << " && " << _dbgen_path << "/dbgen -f -k -s " << _scale_factor << " -b " + << _dbgen_path << "/dists.dss >/dev/null 2" + << ">/dev/null"; + auto ret = system(cmd.str().c_str()); + Assert(!ret, "Calling dbgen failed"); + } + + for (const auto& [_, table_name] : tpch_table_names) { + // Rename tbl files generated by dbgen to csv so that the correct importer is used + std::filesystem::rename(tables_path + table_name + ".tbl", tables_path + table_name + ".csv"); + + // Remove the trailing separator from each line as the CSVReader does not like them + { + // sed on Mac requires a space between -i and '', on Linux it doesn't like it... 
+#ifdef __APPLE__ + const auto* const sed_inplace = "-i ''"; +#else + const auto* const sed_inplace = "-i''"; +#endif + + auto cmd = std::stringstream{}; + cmd << "sed -Ee 's/\\|$//' " << sed_inplace << " " << tables_path << table_name << ".csv"; + auto ret = system(cmd.str().c_str()); + Assert(!ret, "Removing trailing separators using sed failed"); + } + + // std::filesystem::copy does not seem to work. We could use symlinks here, but those would make reading the file + // via ifstream more complicated. + { + auto cmd = std::stringstream{}; + cmd << "cp resources/benchmark/jcch/" << table_name << ".csv.json " << tables_path << table_name << ".csv.json"; + auto ret = system(cmd.str().c_str()); + Assert(!ret, "Copying csv.json files failed"); + } + } + + std::cout << " (" << timer.lap_formatted() << ")" << std::endl; + } + + // Having generated the .csv files, call the FileBasedTableGenerator just as if those files were user-provided + auto generated_tables = FileBasedTableGenerator::generate(); + + // FileBasedTableGenerator automatically stores a binary file. Remove the CSV data to save some space. + if (std::filesystem::exists(tables_path + "/customer.csv")) { + auto cmd = std::stringstream{}; + cmd << "rm " << tables_path << "*.csv*"; + auto ret = system(cmd.str().c_str()); + Assert(!ret, "Removing csv/csv.json files failed"); + } + + return generated_tables; +} + +} // namespace opossum diff --git a/src/benchmarklib/jcch/jcch_table_generator.hpp b/src/benchmarklib/jcch/jcch_table_generator.hpp new file mode 100644 index 0000000000..079ca8c732 --- /dev/null +++ b/src/benchmarklib/jcch/jcch_table_generator.hpp @@ -0,0 +1,36 @@ +#pragma once + +#include "file_based_table_generator.hpp" +#include "tpch/tpch_table_generator.hpp" + +namespace opossum { + +// Generates the JCC-H data by calling JCC-H's dbgen binary. See jcch_benchmark.cpp for details. 
+// This uses multiple inheritance from TPCHTableGenerator (for the sort order, indexes, and constraints) and from +// FileBasedTableGenerator (for the csv loading part). One could argue if composition would be more appropriate +// here. The relationship between FileBasedTableGenerator and JCCHTableGenerator does not really satisfy the Liskov +// substitution principle. However, it makes reusing the TPC-H definitions much easier. + +class JCCHTableGenerator : virtual public AbstractTableGenerator, + private TPCHTableGenerator, + private FileBasedTableGenerator { + public: + // Convenience constructor for creating a JCCHTableGenerator without a benchmarking context + explicit JCCHTableGenerator(const std::string& dbgen_path, const std::string& data_path, float scale_factor, + uint32_t chunk_size = Chunk::DEFAULT_SIZE); + + // Constructor for creating a JCCHTableGenerator in a benchmark + explicit JCCHTableGenerator(const std::string& dbgen_path, const std::string& data_path, float scale_factor, + const std::shared_ptr& benchmark_config); + + std::unordered_map generate() override; + + protected: + using TPCHTableGenerator::_add_constraints; + using TPCHTableGenerator::_indexes_by_table; + using TPCHTableGenerator::_sort_order_by_table; + + std::string _dbgen_path; +}; + +} // namespace opossum diff --git a/src/benchmarklib/tpch/tpch_benchmark_item_runner.cpp b/src/benchmarklib/tpch/tpch_benchmark_item_runner.cpp index 505370a924..91c192eb50 100644 --- a/src/benchmarklib/tpch/tpch_benchmark_item_runner.cpp +++ b/src/benchmarklib/tpch/tpch_benchmark_item_runner.cpp @@ -11,7 +11,6 @@ extern "C" { #include #include -#include #include #include "hyrise.hpp" @@ -19,18 +18,6 @@ extern "C" { #include "tpch_queries.hpp" #include "utils/assert.hpp" -namespace { -// adds (or subtracts) specified number of months and days -std::string calculate_date(boost::gregorian::date date, int months, int days = 0) { - date = date + boost::gregorian::months(months) + 
boost::gregorian::days(days); - - std::stringstream output; - output << date.year() << "-" << std::setw(2) << std::setfill('0') << date.month().as_number() << "-" << std::setw(2) - << std::setfill('0') << date.day(); - return output.str(); -} -} // namespace - namespace opossum { TPCHBenchmarkItemRunner::TPCHBenchmarkItemRunner(const std::shared_ptr& config, @@ -70,6 +57,16 @@ bool TPCHBenchmarkItemRunner::_on_execute_item(const BenchmarkItemID item_id, Be return true; } +std::string TPCHBenchmarkItemRunner::_calculate_date(boost::gregorian::date date, int months, int days) { + date = date + boost::gregorian::months(months) + boost::gregorian::days(days); + + std::stringstream output; + output << static_cast(date.year()) << "-" << std::setw(2) << std::setfill('0') // NOLINT + << static_cast(date.month()) // NOLINT + << "-" << std::setw(2) << std::setfill('0') << static_cast(date.day()); // NOLINT + return output.str(); +} + void TPCHBenchmarkItemRunner::on_tables_loaded() { // Make sure that sort order, indexes, and constraints have made it all the way up to here const auto orders_table = Hyrise::get().storage_manager.get_table("orders"); @@ -147,7 +144,7 @@ std::string TPCHBenchmarkItemRunner::_build_query(const BenchmarkItemID item_id) // Writing `1-1` to make people aware that this is zero-indexed while TPC-H query names are not case 1 - 1: { std::uniform_int_distribution<> date_diff_dist{60, 120}; - const auto date = calculate_date(boost::gregorian::date{1998, 12, 01}, 0, -date_diff_dist(random_engine)); + const auto date = _calculate_date(boost::gregorian::date{1998, 12, 01}, 0, -date_diff_dist(random_engine)); parameters.emplace_back("'"s + date + "'"); break; @@ -169,7 +166,7 @@ std::string TPCHBenchmarkItemRunner::_build_query(const BenchmarkItemID item_id) case 3 - 1: { const auto* const segment = c_mseg_set.list[segment_dist(random_engine)].text; std::uniform_int_distribution<> date_diff_dist{0, 30}; - const auto date = 
calculate_date(boost::gregorian::date{1995, 03, 01}, 0, date_diff_dist(random_engine)); + const auto date = _calculate_date(boost::gregorian::date{1995, 03, 01}, 0, date_diff_dist(random_engine)); parameters.emplace_back("'"s + segment + "'"); parameters.emplace_back("'"s + date + "'"); @@ -180,8 +177,8 @@ std::string TPCHBenchmarkItemRunner::_build_query(const BenchmarkItemID item_id) case 4 - 1: { std::uniform_int_distribution<> date_diff_dist{0, 4 * 12 + 9}; const auto diff = date_diff_dist(random_engine); - const auto begin_date = calculate_date(boost::gregorian::date{1993, 01, 01}, diff); - const auto end_date = calculate_date(boost::gregorian::date{1993, 01, 01}, diff + 3); + const auto begin_date = _calculate_date(boost::gregorian::date{1993, 01, 01}, diff); + const auto end_date = _calculate_date(boost::gregorian::date{1993, 01, 01}, diff + 3); parameters.emplace_back("'"s + begin_date + "'"); parameters.emplace_back("'"s + end_date + "'"); @@ -193,8 +190,8 @@ std::string TPCHBenchmarkItemRunner::_build_query(const BenchmarkItemID item_id) std::uniform_int_distribution<> date_diff_dist{0, 4}; const auto diff = date_diff_dist(random_engine); - const auto begin_date = calculate_date(boost::gregorian::date{1993, 01, 01}, diff * 12); - const auto end_date = calculate_date(boost::gregorian::date{1993, 01, 01}, (diff + 1) * 12); + const auto begin_date = _calculate_date(boost::gregorian::date{1993, 01, 01}, diff * 12); + const auto end_date = _calculate_date(boost::gregorian::date{1993, 01, 01}, (diff + 1) * 12); parameters.emplace_back("'"s + region + "'"); parameters.emplace_back("'"s + begin_date + "'"); @@ -205,8 +202,8 @@ std::string TPCHBenchmarkItemRunner::_build_query(const BenchmarkItemID item_id) case 6 - 1: { std::uniform_int_distribution<> date_diff_dist{0, 4}; const auto diff = date_diff_dist(random_engine); - const auto begin_date = calculate_date(boost::gregorian::date{1993, 01, 01}, diff * 12); - const auto end_date = 
calculate_date(boost::gregorian::date{1993, 01, 01}, (diff + 1) * 12); + const auto begin_date = _calculate_date(boost::gregorian::date{1993, 01, 01}, diff * 12); + const auto end_date = _calculate_date(boost::gregorian::date{1993, 01, 01}, (diff + 1) * 12); static std::uniform_int_distribution<> discount_dist{2, 9}; const auto discount = 0.01f * static_cast(discount_dist(random_engine)); @@ -233,6 +230,11 @@ std::string TPCHBenchmarkItemRunner::_build_query(const BenchmarkItemID item_id) parameters.emplace_back("'"s + nation2 + "'"); parameters.emplace_back("'"s + nation2 + "'"); parameters.emplace_back("'"s + nation1 + "'"); + + // Hard-coded in TPC-H, but used in JCC-H + parameters.emplace_back("'1995-01-01'"); + parameters.emplace_back("'1996-12-31'"); + break; } @@ -247,7 +249,13 @@ std::string TPCHBenchmarkItemRunner::_build_query(const BenchmarkItemID item_id) parameters.emplace_back("'"s + nation + "'"); parameters.emplace_back("'"s + region + "'"); + + // Hard-coded in TPC-H, but used in JCC-H + parameters.emplace_back("'1995-01-01'"); + parameters.emplace_back("'1996-12-31'"); + parameters.emplace_back("'"s + type + "'"); + break; } @@ -261,8 +269,8 @@ std::string TPCHBenchmarkItemRunner::_build_query(const BenchmarkItemID item_id) case 10 - 1: { std::uniform_int_distribution<> date_diff_dist{0, 23}; const auto diff = date_diff_dist(random_engine); - const auto begin_date = calculate_date(boost::gregorian::date{1993, 01, 01}, diff); - const auto end_date = calculate_date(boost::gregorian::date{1993, 01, 01}, (diff + 3)); + const auto begin_date = _calculate_date(boost::gregorian::date{1993, 01, 01}, diff); + const auto end_date = _calculate_date(boost::gregorian::date{1993, 01, 01}, (diff + 3)); parameters.emplace_back("'"s + begin_date + "'"); parameters.emplace_back("'"s + end_date + "'"); @@ -288,8 +296,8 @@ std::string TPCHBenchmarkItemRunner::_build_query(const BenchmarkItemID item_id) std::uniform_int_distribution<> date_diff_dist{0, 4}; const auto 
diff = date_diff_dist(random_engine); - const auto begin_date = calculate_date(boost::gregorian::date{1993, 01, 01}, diff * 12); - const auto end_date = calculate_date(boost::gregorian::date{1993, 01, 01}, (diff + 1) * 12); + const auto begin_date = _calculate_date(boost::gregorian::date{1993, 01, 01}, diff * 12); + const auto end_date = _calculate_date(boost::gregorian::date{1993, 01, 01}, (diff + 1) * 12); parameters.emplace_back("'"s + shipmode1 + "'"); parameters.emplace_back("'"s + shipmode2 + "'"); @@ -311,8 +319,8 @@ std::string TPCHBenchmarkItemRunner::_build_query(const BenchmarkItemID item_id) case 14 - 1: { std::uniform_int_distribution<> date_diff_dist{0, 5 * 12}; const auto diff = date_diff_dist(random_engine); - const auto begin_date = calculate_date(boost::gregorian::date{1993, 01, 01}, diff); - const auto end_date = calculate_date(boost::gregorian::date{1993, 01, 01}, diff + 1); + const auto begin_date = _calculate_date(boost::gregorian::date{1993, 01, 01}, diff); + const auto end_date = _calculate_date(boost::gregorian::date{1993, 01, 01}, diff + 1); parameters.emplace_back("'"s + begin_date + "'"); parameters.emplace_back("'"s + end_date + "'"); @@ -324,8 +332,8 @@ std::string TPCHBenchmarkItemRunner::_build_query(const BenchmarkItemID item_id) std::uniform_int_distribution<> date_diff_dist{0, 4 * 12 + 9}; const auto diff = date_diff_dist(random_engine); - const auto begin_date = calculate_date(boost::gregorian::date{1993, 01, 01}, diff); - const auto end_date = calculate_date(boost::gregorian::date{1993, 01, 01}, diff + 3); + const auto begin_date = _calculate_date(boost::gregorian::date{1993, 01, 01}, diff); + const auto end_date = _calculate_date(boost::gregorian::date{1993, 01, 01}, diff + 3); // Hack: We cannot use prepared statements in TPC-H 15. Thus, we need to build the SQL string by hand. // By manually replacing the `?` from tpch_queries.cpp, we can keep all queries in a readable form there. 
@@ -405,8 +413,8 @@ std::string TPCHBenchmarkItemRunner::_build_query(const BenchmarkItemID item_id) const auto* const color = colors.list[color_dist(random_engine)].text; std::uniform_int_distribution<> date_diff_dist{0, 4}; const auto diff = date_diff_dist(random_engine); - const auto begin_date = calculate_date(boost::gregorian::date{1993, 01, 01}, diff * 12); - const auto end_date = calculate_date(boost::gregorian::date{1993, 01, 01}, (diff + 1) * 12); + const auto begin_date = _calculate_date(boost::gregorian::date{1993, 01, 01}, diff * 12); + const auto end_date = _calculate_date(boost::gregorian::date{1993, 01, 01}, (diff + 1) * 12); const auto* const nation = nations.list[nation_dist(random_engine)].text; parameters.emplace_back("'"s + color + "%'"); @@ -438,7 +446,7 @@ std::string TPCHBenchmarkItemRunner::_build_query(const BenchmarkItemID item_id) } return _substitute_placeholders(item_id, parameters); -} +} // NOLINT std::string TPCHBenchmarkItemRunner::_build_deterministic_query(const BenchmarkItemID item_id) { DebugAssert(item_id < 22, "There are only 22 TPC-H queries"); @@ -464,8 +472,8 @@ std::string TPCHBenchmarkItemRunner::_build_deterministic_query(const BenchmarkI {"'1993-07-01'", "'1993-10-01'"}, {"'ASIA'", "'1994-01-01'", "'1995-01-01'"}, {"'1994-01-01'", "'1995-01-01'", ".06", ".06", "24"}, - {"'FRANCE'", "'GERMANY'", "'GERMANY'", "'FRANCE'"}, - {"'BRAZIL'", "'AMERICA'", "'ECONOMY ANODIZED STEEL'"}, + {"'FRANCE'", "'GERMANY'", "'GERMANY'", "'FRANCE'", "'1995-01-01'", "'1996-12-31'"}, + {"'BRAZIL'", "'AMERICA'", "'1995-01-01'", "'1996-12-31'", "'ECONOMY ANODIZED STEEL'"}, {"'%green%'"}, {"'1993-10-01'", "'1994-01-01'"}, {"'GERMANY'", "0.0001", "'GERMANY'"}, @@ -504,6 +512,8 @@ std::string TPCHBenchmarkItemRunner::_substitute_placeholders(const BenchmarkIte boost::replace_first(query_template, "?", parameter_value); } + Assert(query_template.find('?') == std::string::npos, "Unreplaced Placeholder"); + return query_template; } } diff --git 
a/src/benchmarklib/tpch/tpch_benchmark_item_runner.hpp b/src/benchmarklib/tpch/tpch_benchmark_item_runner.hpp index dce1238e44..223ff20651 100644 --- a/src/benchmarklib/tpch/tpch_benchmark_item_runner.hpp +++ b/src/benchmarklib/tpch/tpch_benchmark_item_runner.hpp @@ -2,6 +2,8 @@ #include +#include + #include "abstract_benchmark_item_runner.hpp" namespace opossum { @@ -30,6 +32,9 @@ class TPCHBenchmarkItemRunner : public AbstractBenchmarkItemRunner { // Runs the PREPARE queries if _use_prepared_statements is set, otherwise does nothing void _prepare_queries() const; + // Adds (or subtracts) specified number of months and days + static std::string _calculate_date(boost::gregorian::date date, int months, int days = 0); + // Returns an SQL query with random parameters for a given (zero-indexed) benchmark item (i.e., 0 -> TPC-H 1) std::string _build_query(const BenchmarkItemID item_id); diff --git a/src/benchmarklib/tpch/tpch_queries.cpp b/src/benchmarklib/tpch/tpch_queries.cpp index 481845685c..6e9f21321c 100644 --- a/src/benchmarklib/tpch/tpch_queries.cpp +++ b/src/benchmarklib/tpch/tpch_queries.cpp @@ -268,7 +268,7 @@ const char* const tpch_query_7 = c_nationkey = n2.n_nationkey AND ((n1.n_name = ? AND n2.n_name = ?) OR (n1.n_name = ? AND n2.n_name = ?)) AND - l_shipdate BETWEEN '1995-01-01' AND '1996-12-31' + l_shipdate BETWEEN ? AND ? ) as shipping GROUP BY supp_nation, cust_nation, l_year @@ -328,8 +328,8 @@ const char* const tpch_query_8 = n2.n_name as nation FROM part, supplier, lineitem, orders, customer, nation n1, nation n2, region WHERE p_partkey = l_partkey AND s_suppkey = l_suppkey AND l_orderkey = o_orderkey AND o_custkey = c_custkey AND c_nationkey = n1.n_nationkey AND n1.n_regionkey = r_regionkey AND - r_name = ? AND s_nationkey = n2.n_nationkey AND o_orderdate between '1995-01-01' - AND '1996-12-31' AND p_type = ?) as all_nations GROUP BY o_year ORDER BY o_year;)"; + r_name = ? AND s_nationkey = n2.n_nationkey AND o_orderdate between ? + AND ? 
AND p_type = ?) as all_nations GROUP BY o_year ORDER BY o_year;)"; /** * TPC-H 9 diff --git a/src/benchmarklib/tpch/tpch_table_generator.cpp b/src/benchmarklib/tpch/tpch_table_generator.cpp index 23fc7cee61..385a66016c 100644 --- a/src/benchmarklib/tpch/tpch_table_generator.cpp +++ b/src/benchmarklib/tpch/tpch_table_generator.cpp @@ -116,7 +116,7 @@ std::unordered_map tpch_table_names = { {TPCHTable::Nation, "nation"}, {TPCHTable::Region, "region"}}; TPCHTableGenerator::TPCHTableGenerator(float scale_factor, uint32_t chunk_size) - : AbstractTableGenerator(create_benchmark_config_with_chunk_size(chunk_size)), _scale_factor(scale_factor) {} + : TPCHTableGenerator(scale_factor, create_benchmark_config_with_chunk_size(chunk_size)) {} TPCHTableGenerator::TPCHTableGenerator(float scale_factor, const std::shared_ptr& benchmark_config) : AbstractTableGenerator(benchmark_config), _scale_factor(scale_factor) {} diff --git a/src/benchmarklib/tpch/tpch_table_generator.hpp b/src/benchmarklib/tpch/tpch_table_generator.hpp index 01119a2eb8..d3508620d0 100644 --- a/src/benchmarklib/tpch/tpch_table_generator.hpp +++ b/src/benchmarklib/tpch/tpch_table_generator.hpp @@ -27,7 +27,7 @@ extern std::unordered_map tpch_table_names; * * NOT thread safe because the underlying tpch-dbgen is not (since it has global data and malloc races). 
*/ -class TPCHTableGenerator final : public AbstractTableGenerator { +class TPCHTableGenerator : virtual public AbstractTableGenerator { public: // Convenience constructor for creating a TPCHTableGenerator without a benchmarking context explicit TPCHTableGenerator(float scale_factor, uint32_t chunk_size = Chunk::DEFAULT_SIZE); @@ -42,7 +42,6 @@ class TPCHTableGenerator final : public AbstractTableGenerator { SortOrderByTable _sort_order_by_table() const override; void _add_constraints(std::unordered_map& table_info_by_name) const override; - private: float _scale_factor; }; } // namespace opossum diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index f558d33ab1..15d36f3088 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -357,3 +357,25 @@ target_compile_options( -Wno-parentheses-equality ) endif() + +include(ExternalProject) +externalproject_add( + jcchDbgen + + SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/jcch-dbgen + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/jcch-dbgen + CONFIGURE_COMMAND bash -c "cp -r ${CMAKE_CURRENT_SOURCE_DIR}/jcch-dbgen/* ." + BUILD_COMMAND bash -c "output=$(make 2>&1) || (printf \"$output\\n\" && false)" + INSTALL_COMMAND false # install should never be called, this is a safe guard that fails if it is + STEP_TARGETS build +) + +file(GLOB_RECURSE JCCH_DBGEN_FILES ${CMAKE_CURRENT_SOURCE_DIR}/jcch-dbgen/*) +externalproject_add_step( + jcchDbgen + check_for_changes + # Make sure that we rebuild jcch-dbgen when a file changes + DEPENDERS configure + DEPENDS "${JCCH_DBGEN_FILES}" + COMMAND bash -c "(cd ${CMAKE_CURRENT_BINARY_DIR}/jcch-dbgen && make distclean >/dev/null 2>/dev/null) || true" +) \ No newline at end of file diff --git a/third_party/jcch-dbgen b/third_party/jcch-dbgen new file mode 160000 index 0000000000..083a9e48fd --- /dev/null +++ b/third_party/jcch-dbgen @@ -0,0 +1 @@ +Subproject commit 083a9e48fd18661d6cdae0f0fa9b15188809aca7