Join Order Benchmark (hyrise#1391)
* moapjmeon

* add script

* nicer print

* nicer print

* fullci

* things

* HOAÖ
Moritz Eyssen authored and mrks committed Jan 10, 2019
1 parent 1bdbae1 commit a6c9d5f
Showing 14 changed files with 356 additions and 56 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,4 +1,5 @@
obj/*
imdb_data
.DS_STORE
build/*
build-*/*
3 changes: 3 additions & 0 deletions .gitmodules
@@ -25,3 +25,6 @@
[submodule "flat_hash_map"]
path = third_party/flat_hash_map
url = https://github.com/skarupke/flat_hash_map.git
[submodule "third_party/join-order-benchmark"]
path = third_party/join-order-benchmark
url = https://github.com/gregrahn/join-order-benchmark.git
106 changes: 106 additions & 0 deletions scripts/setup_imdb.py
@@ -0,0 +1,106 @@
#!/usr/bin/python3

# This script is meant to be called by hyriseBenchmarkJoinOrder, but nothing stops you from calling it yourself.
# It downloads the IMDB dataset used by the JoinOrderBenchmark and unzips it. We do this in Python and not in C++
# because downloading and unzipping is straightforward in Python.
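#
# A manual invocation from the project root would look like this (the single argument is the target table
# directory; "imdb_data" is the default used by hyriseBenchmarkJoinOrder):
#   python3 scripts/setup_imdb.py imdb_data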

import hashlib
import os
import sys
import urllib.request
import zipfile


def clean_up(including_table_dir=False):
if os.path.exists(FILE_NAME):
os.remove(FILE_NAME)

if including_table_dir and os.path.exists(table_dir):
for file in os.listdir(table_dir):
os.remove("./%s/%s" % (table_dir, file))
os.rmdir(table_dir)


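# The dataset counts as set up once every table has both its .csv file and the matching .csv.json
# metadata file in table_dir.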
def is_setup():
for table_name in TABLE_NAMES:
if not os.path.exists(os.path.join(table_dir, table_name + ".csv")):
return False
if not os.path.exists(os.path.join(table_dir, table_name + ".csv.json")):
return False

return True


# [cmd, table_dir]
assert len(sys.argv) == 2
table_dir = sys.argv[1]

LOCATION = "https://www.dropbox.com/s/ckh4nyqpol70ri3/imdb.zip?dl=1"
FILE_NAME = "imdb.zip"
TABLE_NAMES = ["aka_name", "aka_title", "cast_info", "char_name", "company_name", "company_type", "comp_cast_type", "complete_cast", "info_type",
"keyword", "kind_type", "link_type", "movie_companies", "movie_info", "movie_info_idx", "movie_keyword", "movie_link", "name",
"person_info", "role_type", "title"]

print("Retrieving the IMDB dataset.")

if is_setup():
print(" IMDB setup already complete, no setup action required")
sys.exit(0)

# We are going to calculate the md5 hash later, on-the-fly while downloading
hash_md5 = hashlib.md5()

url = urllib.request.urlopen(LOCATION)
meta = url.info()
file_size = int(meta['Content-Length'])

file = open(FILE_NAME, 'wb')

print(" Downloading: %s (%.2f GB)" % (FILE_NAME, file_size / 1000 / 1000 / 1000))

already_retrieved = 0
block_size = 8192
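# Download in blocks, updating the MD5 hash and an in-place progress line as we go.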
try:
while True:
buffer = url.read(block_size)
if not buffer:
break

hash_md5.update(buffer)

already_retrieved += len(buffer)
file.write(buffer)
status = r" Retrieved %3.2f%% of the data" % (already_retrieved * 100. / file_size)
status = status + chr(8) * (len(status) + 1)
print(status, end='\r')
except:
print(" Aborting. Something went wrong during the download. Cleaning up.")
clean_up()
sys.exit(1)

file.close()
print()
print(" Validating integrity...")

hash_dl = hash_md5.hexdigest()

if hash_dl != "79e4c71f8ec0dae17d6aa9182fdab835":
print(" Aborting. MD5 checksum mismatch. Cleaning up.")
clean_up()
sys.exit(2)

print(" Downloaded file is valid.")
print(" Unzipping the file...")

try:
zip = zipfile.ZipFile("imdb.zip", "r")
zip.extractall(table_dir)
zip.close()
except:
print(" Aborting. Something went wrong during unzipping. Cleaning up.")
clean_up(including_table_dir=True)
sys.exit(3)

print(" Deleting the archive file.")
clean_up()
print(" imdb_setup.py ran sucessfully.")
16 changes: 15 additions & 1 deletion src/benchmark/CMakeLists.txt
@@ -64,4 +64,18 @@ target_link_libraries(

hyrise
hyriseBenchmarkLib
)
)

# Configure hyriseBenchmarkJoinOrder
add_executable(
hyriseBenchmarkJoinOrder

join_order_benchmark.cpp
)

target_link_libraries(
hyriseBenchmarkJoinOrder

hyrise
hyriseBenchmarkLib
)
44 changes: 35 additions & 9 deletions src/benchmark/file_based_benchmark.cpp
@@ -23,19 +23,23 @@ int main(int argc, char* argv[]) {

// clang-format off
cli_options.add_options()
("tables", "Specify directory from which tables are loaded", cxxopts::value<std::string>()->default_value("")) // NOLINT
("queries", "Specify queries to run, either a single .sql file or a directory with these files", cxxopts::value<std::string>()->default_value("")); // NOLINT
("table_path", "Directory containing the Tables", cxxopts::value<std::string>()->default_value("")) // NOLINT
("query_path", "Directory/file containing the queries", cxxopts::value<std::string>()->default_value("")) // NOLINT
("queries", "Subset of queries to run as a comma separated list", cxxopts::value<std::string>()->default_value("all")); // NOLINT
// clang-format on

std::shared_ptr<BenchmarkConfig> benchmark_config;
std::string query_path;
std::string table_path;
// Comma-separated query names or "all"
std::string queries_str;

if (CLIConfigParser::cli_has_json_config(argc, argv)) {
// JSON config file was passed in
const auto json_config = CLIConfigParser::parse_json_config_file(argv[1]);
table_path = json_config.value("tables", "");
query_path = json_config.value("queries", "");
table_path = json_config.value("table_path", "");
query_path = json_config.value("query_path", "");
queries_str = json_config.value("queries", "all");

benchmark_config = std::make_shared<BenchmarkConfig>(CLIConfigParser::parse_basic_options_json_config(json_config));

@@ -49,26 +53,48 @@
return 0;
}

query_path = cli_parse_result["queries"].as<std::string>();
table_path = cli_parse_result["tables"].as<std::string>();
query_path = cli_parse_result["query_path"].as<std::string>();
table_path = cli_parse_result["table_path"].as<std::string>();
queries_str = cli_parse_result["queries"].as<std::string>();

benchmark_config = std::make_shared<BenchmarkConfig>(CLIConfigParser::parse_basic_cli_options(cli_parse_result));
}

// Check that the options 'queries' and 'tables' were specifiedc
// Check that the options "query_path" and "table_path" were specified
if (query_path.empty() || table_path.empty()) {
std::cerr << "Need to specify --queries=path/to/queries and --tables=path/to/tables" << std::endl;
std::cerr << "Need to specify --query_path=path/to/queries and --table_path=path/to/tables" << std::endl;
std::cerr << cli_options.help({}) << std::endl;
return 1;
}

benchmark_config->out << "- Benchmarking queries from " << query_path << std::endl;
benchmark_config->out << "- Running on tables from " << table_path << std::endl;

std::optional<std::unordered_set<std::string>> query_subset;
if (queries_str == "all") {
benchmark_config->out << "- Running all queries from specified path" << std::endl;
} else {
benchmark_config->out << "- Running subset of queries: " << queries_str << std::endl;

// "a, b, c, d" -> ["a", " b", " c", " d"]
auto query_subset_untrimmed = std::vector<std::string>{};
boost::algorithm::split(query_subset_untrimmed, queries_str, boost::is_any_of(","));

// ["a", " b", " c", " d"] -> ["a", "b", "c", "d"]
query_subset.emplace();
for (auto& query_name : query_subset_untrimmed) {
query_subset->emplace(boost::trim_copy(query_name));
}
}

// Do not ignore any .sql files in the directory for now (TODO(anybody): add a CLI option if required)
const auto query_filename_blacklist = std::unordered_set<std::string>{};

// Run the benchmark
auto context = BenchmarkRunner::create_context(*benchmark_config);
auto table_generator = std::make_unique<FileBasedTableGenerator>(benchmark_config, table_path);
auto query_generator = std::make_unique<FileBasedQueryGenerator>(*benchmark_config, query_path);
auto query_generator =
std::make_unique<FileBasedQueryGenerator>(*benchmark_config, query_path, query_filename_blacklist, query_subset);

BenchmarkRunner{*benchmark_config, std::move(query_generator), std::move(table_generator), context}.run();
}
120 changes: 120 additions & 0 deletions src/benchmark/join_order_benchmark.cpp
@@ -0,0 +1,120 @@
#include <stdlib.h>

#include <boost/algorithm/string.hpp>
#include <cxxopts.hpp>

#include "benchmark_runner.hpp"
#include "cli_config_parser.hpp"
#include "file_based_query_generator.hpp"
#include "file_based_table_generator.hpp"
#include "import_export/csv_parser.hpp"
#include "scheduler/current_scheduler.hpp"
#include "scheduler/node_queue_scheduler.hpp"
#include "scheduler/topology.hpp"
#include "storage/storage_manager.hpp"
#include "storage/table.hpp"
#include "types.hpp"
#include "utils/filesystem.hpp"
#include "utils/load_table.hpp"
#include "utils/performance_warning.hpp"

/**
* The Join Order Benchmark was introduced by Leis et al. in "How good are query optimizers, really?".
* It runs on an IMDB data set from ~2013 that is downloaded, if necessary, as part of running this benchmark.
* Its 113 queries are obtained from the "third_party/join-order-benchmark" submodule.
*/
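// A hypothetical invocation from the project root (the exact binary location depends on your build setup;
// query names refer to the .sql files shipped in third_party/join-order-benchmark, e.g. "1a"):
//   ./hyriseBenchmarkJoinOrder --table_path=imdb_data --queries=1a,2b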

using namespace opossum; // NOLINT
using namespace std::string_literals; // NOLINT

int main(int argc, char* argv[]) {
auto cli_options = BenchmarkRunner::get_basic_cli_options("Hyrise Join Order Benchmark");

const auto DEFAULT_TABLE_PATH = "imdb_data";
const auto DEFAULT_QUERY_PATH = "third_party/join-order-benchmark";

// clang-format off
cli_options.add_options()
("table_path", "Directory containing the Tables", cxxopts::value<std::string>()->default_value(DEFAULT_TABLE_PATH)) // NOLINT
("query_path", "Directory/file containing the queries", cxxopts::value<std::string>()->default_value(DEFAULT_QUERY_PATH)) // NOLINT
("queries", "Subset of queries to run as a comma separated list", cxxopts::value<std::string>()->default_value("all")); // NOLINT
// clang-format on

std::shared_ptr<BenchmarkConfig> benchmark_config;
std::string query_path;
std::string table_path;
// Comma-separated query names or "all"
std::string queries_str;

if (CLIConfigParser::cli_has_json_config(argc, argv)) {
// JSON config file was passed in
const auto json_config = CLIConfigParser::parse_json_config_file(argv[1]);
table_path = json_config.value("table_path", DEFAULT_TABLE_PATH);
query_path = json_config.value("query_path", DEFAULT_QUERY_PATH);
queries_str = json_config.value("queries", "all");

benchmark_config = std::make_shared<BenchmarkConfig>(CLIConfigParser::parse_basic_options_json_config(json_config));

} else {
// Parse regular command line args
const auto cli_parse_result = cli_options.parse(argc, argv);

// Display usage and quit
if (cli_parse_result.count("help")) {
std::cout << CLIConfigParser::detailed_help(cli_options) << std::endl;
return 0;
}

query_path = cli_parse_result["query_path"].as<std::string>();
table_path = cli_parse_result["table_path"].as<std::string>();
queries_str = cli_parse_result["queries"].as<std::string>();

benchmark_config = std::make_shared<BenchmarkConfig>(CLIConfigParser::parse_basic_cli_options(cli_parse_result));
}

// Check that the options "query_path" and "table_path" were specified
if (query_path.empty() || table_path.empty()) {
std::cerr << "Need to specify --query_path=path/to/queries and --table_path=path/to/tables" << std::endl;
std::cerr << cli_options.help({}) << std::endl;
return 1;
}

/**
* Use a Python script to download and unzip the IMDB. We do this in Python and not in C++ because downloading and
* unzipping is straightforward in Python (and we suspect in C++ it might be... cumbersome).
*/
const auto setup_imdb_command = "python3 scripts/setup_imdb.py "s + table_path;
const auto setup_imdb_return_code = system(setup_imdb_command.c_str());
Assert(setup_imdb_return_code == 0, "setup_imdb.py failed. Did you run the benchmark from the project root dir?");

// The join-order-benchmark ships with these two .sql scripts, but we do not want to run them as part of the benchmark
// as they do not contain actual queries.
const auto non_query_file_names = std::unordered_set<std::string>{"fkindexes.sql", "schema.sql"};

benchmark_config->out << "- Benchmarking queries from " << query_path << std::endl;
benchmark_config->out << "- Running on tables from " << table_path << std::endl;

std::optional<std::unordered_set<std::string>> query_subset;
if (queries_str == "all") {
benchmark_config->out << "- Running all queries from specified path" << std::endl;
} else {
benchmark_config->out << "- Running subset of queries: " << queries_str << std::endl;

// "a, b, c, d" -> ["a", " b", " c", " d"]
auto query_subset_untrimmed = std::vector<std::string>{};
boost::algorithm::split(query_subset_untrimmed, queries_str, boost::is_any_of(","));

// ["a", " b", " c", " d"] -> ["a", "b", "c", "d"]
query_subset.emplace();
for (auto& query_name : query_subset_untrimmed) {
query_subset->emplace(boost::trim_copy(query_name));
}
}

// Run the benchmark
auto context = BenchmarkRunner::create_context(*benchmark_config);
auto table_generator = std::make_unique<FileBasedTableGenerator>(benchmark_config, table_path);
auto query_generator = std::make_unique<FileBasedQueryGenerator>(*benchmark_config, query_path, non_query_file_names);

BenchmarkRunner{*benchmark_config, std::move(query_generator), std::move(table_generator), context}.run();
}
4 changes: 0 additions & 4 deletions src/benchmarklib/abstract_query_generator.cpp
@@ -4,10 +4,6 @@ namespace opossum {

std::string AbstractQueryGenerator::get_preparation_queries() const { return ""; }

const std::vector<std::string>& AbstractQueryGenerator::query_names() const { return _query_names; }

size_t AbstractQueryGenerator::available_query_count() const { return _query_names.size(); }

size_t AbstractQueryGenerator::selected_query_count() const { return _selected_queries.size(); }

const std::vector<QueryID>& AbstractQueryGenerator::selected_queries() const { return _selected_queries; }
7 changes: 2 additions & 5 deletions src/benchmarklib/abstract_query_generator.hpp
@@ -24,10 +24,10 @@ class AbstractQueryGenerator {
virtual std::string build_query(const QueryID query_id) = 0;

// Returns the names of the individual queries (e.g., "TPC-H 1")
const std::vector<std::string>& query_names() const;
virtual std::string query_name(const QueryID query_id) const = 0;

// Returns the number of queries supported by the benchmark
size_t available_query_count() const;
virtual size_t available_query_count() const = 0;

// Returns the number of queries selected for execution
size_t selected_query_count() const;
@@ -40,9 +40,6 @@

// PREPARE and other statements that should be executed first
std::vector<std::string> _preparation_queries;

// Contains ALL query names, not only the selected ones
std::vector<std::string> _query_names;
};

} // namespace opossum