Join Order Benchmark (hyrise#1391)
* moapjmeon

* add script

* nicer print

* nicer print

* fullci

* things

* HOAÖ
Moritz Eyssen authored and mrks committed Jan 10, 2019
1 parent 1bdbae1 commit a6c9d5f
Showing 14 changed files with 356 additions and 56 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,4 +1,5 @@
obj/*
imdb_data
.DS_STORE
build/*
build-*/*
3 changes: 3 additions & 0 deletions .gitmodules
@@ -25,3 +25,6 @@
[submodule "flat_hash_map"]
path = third_party/flat_hash_map
url = https://github.com/skarupke/flat_hash_map.git
[submodule "third_party/join-order-benchmark"]
path = third_party/join-order-benchmark
url = https://github.com/gregrahn/join-order-benchmark.git
106 changes: 106 additions & 0 deletions scripts/setup_imdb.py
@@ -0,0 +1,106 @@
#!/usr/bin/python3

# This script is meant to be called by hyriseBenchmarkJoinOrder, but nothing stops you from calling it yourself.
# It downloads the IMDB dataset used by the JoinOrderBenchmark and unzips it. We do this in Python and not in C++
# because downloading and unzipping is straightforward in Python.
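#
# A manual invocation from the project root would look like this (the single argument is the target table
# directory; "imdb_data" is the default used by hyriseBenchmarkJoinOrder):
#   python3 scripts/setup_imdb.py imdb_data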

import hashlib
import os
import sys
import urllib.request
import zipfile


def clean_up(including_table_dir=False):
if os.path.exists(FILE_NAME):
os.remove(FILE_NAME)

if including_table_dir and os.path.exists(table_dir):
for file in os.listdir(table_dir):
os.remove("./%s/%s" % (table_dir, file))
os.rmdir(table_dir)


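# The dataset counts as set up once every table has both its .csv file and the matching .csv.json
# metadata file in table_dir.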
def is_setup():
for table_name in TABLE_NAMES:
if not os.path.exists(os.path.join(table_dir, table_name + ".csv")):
return False
if not os.path.exists(os.path.join(table_dir, table_name + ".csv.json")):
return False

return True


# [cmd, table_dir]
assert len(sys.argv) == 2
table_dir = sys.argv[1]

LOCATION = "https://www.dropbox.com/s/ckh4nyqpol70ri3/imdb.zip?dl=1"
FILE_NAME = "imdb.zip"
TABLE_NAMES = ["aka_name", "aka_title", "cast_info", "char_name", "company_name", "company_type", "comp_cast_type", "complete_cast", "info_type",
"keyword", "kind_type", "link_type", "movie_companies", "movie_info", "movie_info_idx", "movie_keyword", "movie_link", "name",
"person_info", "role_type", "title"]

print("Retrieving the IMDB dataset.")

if is_setup():
print(" IMDB setup already complete, no setup action required")
sys.exit(0)

# We are going to calculate the md5 hash later, on-the-fly while downloading
hash_md5 = hashlib.md5()

url = urllib.request.urlopen(LOCATION)
meta = url.info()
file_size = int(meta['Content-Length'])

file = open(FILE_NAME, 'wb')

print(" Downloading: %s (%.2f GB)" % (FILE_NAME, file_size / 1000 / 1000 / 1000))

already_retrieved = 0
block_size = 8192
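# Download in blocks, updating the MD5 hash and an in-place progress line as we go.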
try:
while True:
buffer = url.read(block_size)
if not buffer:
break

hash_md5.update(buffer)

already_retrieved += len(buffer)
file.write(buffer)
status = r" Retrieved %3.2f%% of the data" % (already_retrieved * 100. / file_size)
status = status + chr(8) * (len(status) + 1)
print(status, end='\r')
except:
print(" Aborting. Something went wrong during the download. Cleaning up.")
clean_up()
sys.exit(1)

file.close()
print()
print(" Validating integrity...")

hash_dl = hash_md5.hexdigest()

if hash_dl != "79e4c71f8ec0dae17d6aa9182fdab835":
print(" Aborting. MD5 checksum mismatch. Cleaning up.")
clean_up()
sys.exit(2)

print(" Downloaded file is valid.")
print(" Unzipping the file...")

try:
zip = zipfile.ZipFile("imdb.zip", "r")
zip.extractall(table_dir)
zip.close()
except:
print(" Aborting. Something went wrong during unzipping. Cleaning up.")
clean_up(including_table_dir=True)
sys.exit(3)

print(" Deleting the archive file.")
clean_up()
print(" imdb_setup.py ran sucessfully.")
16 changes: 15 additions & 1 deletion src/benchmark/CMakeLists.txt
@@ -64,4 +64,18 @@ target_link_libraries(

hyrise
hyriseBenchmarkLib
)
)

# Configure hyriseBenchmarkJoinOrder
add_executable(
hyriseBenchmarkJoinOrder

join_order_benchmark.cpp
)

target_link_libraries(
hyriseBenchmarkJoinOrder

hyrise
hyriseBenchmarkLib
)
44 changes: 35 additions & 9 deletions src/benchmark/file_based_benchmark.cpp
@@ -23,19 +23,23 @@ int main(int argc, char* argv[]) {

// clang-format off
cli_options.add_options()
("tables", "Specify directory from which tables are loaded", cxxopts::value<std::string>()->default_value("")) // NOLINT
("queries", "Specify queries to run, either a single .sql file or a directory with these files", cxxopts::value<std::string>()->default_value("")); // NOLINT
("table_path", "Directory containing the Tables", cxxopts::value<std::string>()->default_value("")) // NOLINT
("query_path", "Directory/file containing the queries", cxxopts::value<std::string>()->default_value("")) // NOLINT
("queries", "Subset of queries to run as a comma separated list", cxxopts::value<std::string>()->default_value("all")); // NOLINT
// clang-format on

std::shared_ptr<BenchmarkConfig> benchmark_config;
std::string query_path;
std::string table_path;
// Comma-separated query names or "all"
std::string queries_str;

if (CLIConfigParser::cli_has_json_config(argc, argv)) {
// JSON config file was passed in
const auto json_config = CLIConfigParser::parse_json_config_file(argv[1]);
table_path = json_config.value("tables", "");
query_path = json_config.value("queries", "");
table_path = json_config.value("table_path", "");
query_path = json_config.value("query_path", "");
queries_str = json_config.value("queries", "all");

benchmark_config = std::make_shared<BenchmarkConfig>(CLIConfigParser::parse_basic_options_json_config(json_config));

@@ -49,26 +53,48 @@
return 0;
}

query_path = cli_parse_result["queries"].as<std::string>();
table_path = cli_parse_result["tables"].as<std::string>();
query_path = cli_parse_result["query_path"].as<std::string>();
table_path = cli_parse_result["table_path"].as<std::string>();
queries_str = cli_parse_result["queries"].as<std::string>();

benchmark_config = std::make_shared<BenchmarkConfig>(CLIConfigParser::parse_basic_cli_options(cli_parse_result));
}

// Check that the options 'queries' and 'tables' were specifiedc
// Check that the options "query_path" and "table_path" were specified
if (query_path.empty() || table_path.empty()) {
std::cerr << "Need to specify --queries=path/to/queries and --tables=path/to/tables" << std::endl;
std::cerr << "Need to specify --query_path=path/to/queries and --table_path=path/to/tables" << std::endl;
std::cerr << cli_options.help({}) << std::endl;
return 1;
}

benchmark_config->out << "- Benchmarking queries from " << query_path << std::endl;
benchmark_config->out << "- Running on tables from " << table_path << std::endl;

std::optional<std::unordered_set<std::string>> query_subset;
if (queries_str == "all") {
benchmark_config->out << "- Running all queries from specified path" << std::endl;
} else {
benchmark_config->out << "- Running subset of queries: " << queries_str << std::endl;

// "a, b, c, d" -> ["a", " b", " c", " d"]
auto query_subset_untrimmed = std::vector<std::string>{};
boost::algorithm::split(query_subset_untrimmed, queries_str, boost::is_any_of(","));

// ["a", " b", " c", " d"] -> ["a", "b", "c", "d"]
query_subset.emplace();
for (auto& query_name : query_subset_untrimmed) {
query_subset->emplace(boost::trim_copy(query_name));
}
}

// Do not ignore any .sql files in the directory for now (TODO(anybody): add a CLI option if required)
const auto query_filename_blacklist = std::unordered_set<std::string>{};

// Run the benchmark
auto context = BenchmarkRunner::create_context(*benchmark_config);
auto table_generator = std::make_unique<FileBasedTableGenerator>(benchmark_config, table_path);
auto query_generator = std::make_unique<FileBasedQueryGenerator>(*benchmark_config, query_path);
auto query_generator =
std::make_unique<FileBasedQueryGenerator>(*benchmark_config, query_path, query_filename_blacklist, query_subset);

BenchmarkRunner{*benchmark_config, std::move(query_generator), std::move(table_generator), context}.run();
}
120 changes: 120 additions & 0 deletions src/benchmark/join_order_benchmark.cpp
@@ -0,0 +1,120 @@
#include <stdlib.h>

#include <boost/algorithm/string.hpp>
#include <cxxopts.hpp>

#include "benchmark_runner.hpp"
#include "cli_config_parser.hpp"
#include "file_based_query_generator.hpp"
#include "file_based_table_generator.hpp"
#include "import_export/csv_parser.hpp"
#include "scheduler/current_scheduler.hpp"
#include "scheduler/node_queue_scheduler.hpp"
#include "scheduler/topology.hpp"
#include "storage/storage_manager.hpp"
#include "storage/table.hpp"
#include "types.hpp"
#include "utils/filesystem.hpp"
#include "utils/load_table.hpp"
#include "utils/performance_warning.hpp"

/**
* The Join Order Benchmark was introduced by Leis et al. in "How good are query optimizers, really?".
* It runs on an IMDB data set from ~2013 that is downloaded, if necessary, as part of running this benchmark.
* Its 113 queries are obtained from the "third_party/join-order-benchmark" submodule.
*/
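// A hypothetical invocation from the project root (the exact binary location depends on your build setup;
// query names refer to the .sql files shipped in third_party/join-order-benchmark, e.g. "1a"):
//   ./hyriseBenchmarkJoinOrder --table_path=imdb_data --queries=1a,2b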

using namespace opossum; // NOLINT
using namespace std::string_literals; // NOLINT

int main(int argc, char* argv[]) {
auto cli_options = BenchmarkRunner::get_basic_cli_options("Hyrise Join Order Benchmark");

const auto DEFAULT_TABLE_PATH = "imdb_data";
const auto DEFAULT_QUERY_PATH = "third_party/join-order-benchmark";

// clang-format off
cli_options.add_options()
("table_path", "Directory containing the Tables", cxxopts::value<std::string>()->default_value(DEFAULT_TABLE_PATH)) // NOLINT
("query_path", "Directory/file containing the queries", cxxopts::value<std::string>()->default_value(DEFAULT_QUERY_PATH)) // NOLINT
("queries", "Subset of queries to run as a comma separated list", cxxopts::value<std::string>()->default_value("all")); // NOLINT
// clang-format on

std::shared_ptr<BenchmarkConfig> benchmark_config;
std::string query_path;
std::string table_path;
// Comma-separated query names or "all"
std::string queries_str;

if (CLIConfigParser::cli_has_json_config(argc, argv)) {
// JSON config file was passed in
const auto json_config = CLIConfigParser::parse_json_config_file(argv[1]);
table_path = json_config.value("table_path", DEFAULT_TABLE_PATH);
query_path = json_config.value("query_path", DEFAULT_QUERY_PATH);
queries_str = json_config.value("queries", "all");

benchmark_config = std::make_shared<BenchmarkConfig>(CLIConfigParser::parse_basic_options_json_config(json_config));

} else {
// Parse regular command line args
const auto cli_parse_result = cli_options.parse(argc, argv);

// Display usage and quit
if (cli_parse_result.count("help")) {
std::cout << CLIConfigParser::detailed_help(cli_options) << std::endl;
return 0;
}

query_path = cli_parse_result["query_path"].as<std::string>();
table_path = cli_parse_result["table_path"].as<std::string>();
queries_str = cli_parse_result["queries"].as<std::string>();

benchmark_config = std::make_shared<BenchmarkConfig>(CLIConfigParser::parse_basic_cli_options(cli_parse_result));
}

// Check that the options "query_path" and "table_path" were specified
if (query_path.empty() || table_path.empty()) {
std::cerr << "Need to specify --query_path=path/to/queries and --table_path=path/to/tables" << std::endl;
std::cerr << cli_options.help({}) << std::endl;
return 1;
}

/**
* Use a Python script to download and unzip the IMDB. We do this in Python and not in C++ because downloading and
* unzipping is straightforward in Python (and we suspect in C++ it might be... cumbersome).
*/
const auto setup_imdb_command = "python3 scripts/setup_imdb.py "s + table_path;
const auto setup_imdb_return_code = system(setup_imdb_command.c_str());
Assert(setup_imdb_return_code == 0, "setup_imdb.py failed. Did you run the benchmark from the project root dir?");

// The join-order-benchmark ships with these two .sql scripts, but we do not want to run them as part of the benchmark
// as they do not contain actual queries.
const auto non_query_file_names = std::unordered_set<std::string>{"fkindexes.sql", "schema.sql"};

benchmark_config->out << "- Benchmarking queries from " << query_path << std::endl;
benchmark_config->out << "- Running on tables from " << table_path << std::endl;

std::optional<std::unordered_set<std::string>> query_subset;
if (queries_str == "all") {
benchmark_config->out << "- Running all queries from specified path" << std::endl;
} else {
benchmark_config->out << "- Running subset of queries: " << queries_str << std::endl;

// "a, b, c, d" -> ["a", " b", " c", " d"]
auto query_subset_untrimmed = std::vector<std::string>{};
boost::algorithm::split(query_subset_untrimmed, queries_str, boost::is_any_of(","));

// ["a", " b", " c", " d"] -> ["a", "b", "c", "d"]
query_subset.emplace();
for (auto& query_name : query_subset_untrimmed) {
query_subset->emplace(boost::trim_copy(query_name));
}
}

// Run the benchmark
auto context = BenchmarkRunner::create_context(*benchmark_config);
auto table_generator = std::make_unique<FileBasedTableGenerator>(benchmark_config, table_path);
auto query_generator = std::make_unique<FileBasedQueryGenerator>(*benchmark_config, query_path, non_query_file_names);

BenchmarkRunner{*benchmark_config, std::move(query_generator), std::move(table_generator), context}.run();
}
4 changes: 0 additions & 4 deletions src/benchmarklib/abstract_query_generator.cpp
@@ -4,10 +4,6 @@ namespace opossum {

std::string AbstractQueryGenerator::get_preparation_queries() const { return ""; }

const std::vector<std::string>& AbstractQueryGenerator::query_names() const { return _query_names; }

size_t AbstractQueryGenerator::available_query_count() const { return _query_names.size(); }

size_t AbstractQueryGenerator::selected_query_count() const { return _selected_queries.size(); }

const std::vector<QueryID>& AbstractQueryGenerator::selected_queries() const { return _selected_queries; }
7 changes: 2 additions & 5 deletions src/benchmarklib/abstract_query_generator.hpp
@@ -24,10 +24,10 @@ class AbstractQueryGenerator {
virtual std::string build_query(const QueryID query_id) = 0;

// Returns the names of the individual queries (e.g., "TPC-H 1")
const std::vector<std::string>& query_names() const;
virtual std::string query_name(const QueryID query_id) const = 0;

// Returns the number of queries supported by the benchmark
size_t available_query_count() const;
virtual size_t available_query_count() const = 0;

// Returns the number of queries selected for execution
size_t selected_query_count() const;
@@ -40,9 +40,6 @@

// PREPARE and other statements that should be executed first
std::vector<std::string> _preparation_queries;

// Contains ALL query names, not only the selected ones
std::vector<std::string> _query_names;
};

} // namespace opossum