forked from hyrise/hyrise
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* moapjmeon * add script * nicer print * nicer print * fullci * things * HOAÖ
- Loading branch information
Showing
14 changed files
with
356 additions
and
56 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
obj/* | ||
imdb_data | ||
.DS_STORE | ||
build/* | ||
build-*/* | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
#!/usr/bin/python3 | ||
|
||
# This script is meant to be called by hyriseBenchmarkJoinOrder, but nothing stops you from calling it yourself. | ||
# It downloads the IMDB used by the JoinOrderBenchmark and unzips it. We do this in Python and not in C++ because | ||
# downloading and unzipping is straightforward in Python. | ||
|
||
import hashlib | ||
import os | ||
import sys | ||
import urllib.request | ||
import zipfile | ||
|
||
|
||
def clean_up(including_table_dir=False):
    """Remove the downloaded zip archive and, optionally, the extracted tables.

    Args:
        including_table_dir: When True, also delete every file inside
            ``table_dir`` and remove the directory itself (used when an
            extraction attempt failed and left partial data behind).
    """
    if os.path.exists(FILE_NAME):
        os.remove(FILE_NAME)

    if including_table_dir and os.path.exists(table_dir):
        for file_name in os.listdir(table_dir):
            # os.path.join handles absolute table_dir paths and separators
            # portably; the previous "./%s/%s" format broke for absolute paths.
            os.remove(os.path.join(table_dir, file_name))
        os.rmdir(table_dir)
|
||
|
||
def is_setup():
    """Return True iff every benchmark table already has its .csv and .csv.json file in table_dir."""
    expected_files = (
        os.path.join(table_dir, table_name + suffix)
        for table_name in TABLE_NAMES
        for suffix in (".csv", ".csv.json")
    )
    return all(os.path.exists(path) for path in expected_files)
|
||
|
||
# [cmd, table_dir]
assert len(sys.argv) == 2
table_dir = sys.argv[1]

LOCATION = "https://www.dropbox.com/s/ckh4nyqpol70ri3/imdb.zip?dl=1"
FILE_NAME = "imdb.zip"
TABLE_NAMES = ["aka_name", "aka_title", "cast_info", "char_name", "company_name", "company_type", "comp_cast_type", "complete_cast", "info_type",
               "keyword", "kind_type", "link_type", "movie_companies", "movie_info", "movie_info_idx", "movie_keyword", "movie_link", "name",
               "person_info", "role_type", "title"]

print("Retrieving the IMDB dataset.")

if is_setup():
    print(" IMDB setup already complete, no setup action required")
    sys.exit(0)

# We are going to calculate the md5 hash later, on-the-fly while downloading
hash_md5 = hashlib.md5()

url = urllib.request.urlopen(LOCATION)
meta = url.info()
file_size = int(meta['Content-Length'])

print(" Downloading: %s (%.2f GB)" % (FILE_NAME, file_size / 1000 / 1000 / 1000))

already_retrieved = 0
block_size = 8192
try:
    # The context manager guarantees the file handle is closed even when the
    # download aborts half-way (the old code leaked it on the error path).
    with open(FILE_NAME, 'wb') as archive_file:
        while True:
            buffer = url.read(block_size)
            if not buffer:
                break

            hash_md5.update(buffer)

            already_retrieved += len(buffer)
            archive_file.write(buffer)
            status = r" Retrieved %3.2f%% of the data" % (already_retrieved * 100. / file_size)
            status = status + chr(8) * (len(status) + 1)
            print(status, end='\r')
except Exception:
    # `except Exception` (not bare `except:`) so that Ctrl+C still interrupts
    # the script instead of being reported as a download failure.
    print(" Aborting. Something went wrong during the download. Cleaning up.")
    clean_up()
    sys.exit(1)

print()
print(" Validating integrity...")

hash_dl = hash_md5.hexdigest()

if hash_dl != "79e4c71f8ec0dae17d6aa9182fdab835":
    print(" Aborting. MD5 checksum mismatch. Cleaning up.")
    clean_up()
    sys.exit(2)

print(" Downloaded file is valid.")
print(" Unzipping the file...")

try:
    # Use FILE_NAME (not a duplicated literal) and a context manager; the name
    # `archive` avoids shadowing the builtin `zip`.
    with zipfile.ZipFile(FILE_NAME, "r") as archive:
        archive.extractall(table_dir)
except Exception:
    print(" Aborting. Something went wrong during unzipping. Cleaning up.")
    clean_up(including_table_dir=True)
    sys.exit(3)

print(" Deleting the archive file.")
clean_up()
print(" imdb_setup.py ran successfully.")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
#include <stdlib.h> | ||
|
||
#include <boost/algorithm/string.hpp> | ||
#include <cxxopts.hpp> | ||
|
||
#include "benchmark_runner.hpp" | ||
#include "cli_config_parser.hpp" | ||
#include "file_based_query_generator.hpp" | ||
#include "file_based_table_generator.hpp" | ||
#include "import_export/csv_parser.hpp" | ||
#include "scheduler/current_scheduler.hpp" | ||
#include "scheduler/node_queue_scheduler.hpp" | ||
#include "scheduler/topology.hpp" | ||
#include "storage/storage_manager.hpp" | ||
#include "storage/table.hpp" | ||
#include "types.hpp" | ||
#include "utils/filesystem.hpp" | ||
#include "utils/load_table.hpp" | ||
#include "utils/performance_warning.hpp" | ||
|
||
/** | ||
* The Join Order Benchmark was introduced by Leis et al. "How good are query optimizers, really?". | ||
* It runs on an IMDB database from ~2013 that gets downloaded if necessary as part of running this benchmark. | ||
* Its 113 queries are obtained from the "third_party/join-order-benchmark" submodule | ||
*/ | ||
|
||
using namespace opossum; // NOLINT | ||
using namespace std::string_literals; // NOLINT | ||
|
||
int main(int argc, char* argv[]) { | ||
auto cli_options = BenchmarkRunner::get_basic_cli_options("Hyrise Join Order Benchmark"); | ||
|
||
const auto DEFAULT_TABLE_PATH = "imdb_data"; | ||
const auto DEFAULT_QUERY_PATH = "third_party/join-order-benchmark"; | ||
|
||
// clang-format off | ||
cli_options.add_options() | ||
("table_path", "Directory containing the Tables", cxxopts::value<std::string>()->default_value(DEFAULT_TABLE_PATH)) // NOLINT | ||
("query_path", "Directory/file containing the queries", cxxopts::value<std::string>()->default_value(DEFAULT_QUERY_PATH)) // NOLINT | ||
("queries", "Subset of queries to run as a comma separated list", cxxopts::value<std::string>()->default_value("all")); // NOLINT | ||
// clang-format on | ||
|
||
std::shared_ptr<BenchmarkConfig> benchmark_config; | ||
std::string query_path; | ||
std::string table_path; | ||
// Comma-separated query names or "all" | ||
std::string queries_str; | ||
|
||
if (CLIConfigParser::cli_has_json_config(argc, argv)) { | ||
// JSON config file was passed in | ||
const auto json_config = CLIConfigParser::parse_json_config_file(argv[1]); | ||
table_path = json_config.value("table_path", DEFAULT_TABLE_PATH); | ||
query_path = json_config.value("query_path", DEFAULT_QUERY_PATH); | ||
queries_str = json_config.value("queries", "all"); | ||
|
||
benchmark_config = std::make_shared<BenchmarkConfig>(CLIConfigParser::parse_basic_options_json_config(json_config)); | ||
|
||
} else { | ||
// Parse regular command line args | ||
const auto cli_parse_result = cli_options.parse(argc, argv); | ||
|
||
// Display usage and quit | ||
if (cli_parse_result.count("help")) { | ||
std::cout << CLIConfigParser::detailed_help(cli_options) << std::endl; | ||
return 0; | ||
} | ||
|
||
query_path = cli_parse_result["query_path"].as<std::string>(); | ||
table_path = cli_parse_result["table_path"].as<std::string>(); | ||
queries_str = cli_parse_result["queries"].as<std::string>(); | ||
|
||
benchmark_config = std::make_shared<BenchmarkConfig>(CLIConfigParser::parse_basic_cli_options(cli_parse_result)); | ||
} | ||
|
||
// Check that the options "query_path" and "table_path" were specified | ||
if (query_path.empty() || table_path.empty()) { | ||
std::cerr << "Need to specify --query_path=path/to/queries and --table_path=path/to/tables" << std::endl; | ||
std::cerr << cli_options.help({}) << std::endl; | ||
return 1; | ||
} | ||
|
||
/** | ||
* Use a Python script to download and unzip the IMDB. We do this in Python and not in C++ because downloading and | ||
* unzipping is straight forward in Python (and we suspect in C++ it might be... cumbersome). | ||
*/ | ||
const auto setup_imdb_command = "python3 scripts/setup_imdb.py "s + table_path; | ||
const auto setup_imdb_return_code = system(setup_imdb_command.c_str()); | ||
Assert(setup_imdb_return_code == 0, "setup_imdb.py failed. Did you run the benchmark from the project root dir?"); | ||
|
||
// The join-order-benchmark ships with these two .sql scripts, but we do not want to run them as part of the benchmark | ||
// as they do not contains actual queries | ||
const auto non_query_file_names = std::unordered_set<std::string>{"fkindexes.sql", "schema.sql"}; | ||
|
||
benchmark_config->out << "- Benchmarking queries from " << query_path << std::endl; | ||
benchmark_config->out << "- Running on tables from " << table_path << std::endl; | ||
|
||
std::optional<std::unordered_set<std::string>> query_subset; | ||
if (queries_str == "all") { | ||
benchmark_config->out << "- Running all queries from specified path" << std::endl; | ||
} else { | ||
benchmark_config->out << "- Running subset of queries: " << queries_str << std::endl; | ||
|
||
// "a, b, c, d" -> ["a", " b", " c", " d"] | ||
auto query_subset_untrimmed = std::vector<std::string>{}; | ||
boost::algorithm::split(query_subset_untrimmed, queries_str, boost::is_any_of(",")); | ||
|
||
// ["a", " b", " c", " d"] -> ["a", "b", "c", "d"] | ||
query_subset.emplace(); | ||
for (auto& query_name : query_subset_untrimmed) { | ||
query_subset->emplace(boost::trim_copy(query_name)); | ||
} | ||
} | ||
|
||
// Run the benchmark | ||
auto context = BenchmarkRunner::create_context(*benchmark_config); | ||
auto table_generator = std::make_unique<FileBasedTableGenerator>(benchmark_config, table_path); | ||
auto query_generator = std::make_unique<FileBasedQueryGenerator>(*benchmark_config, query_path, non_query_file_names); | ||
|
||
BenchmarkRunner{*benchmark_config, std::move(query_generator), std::move(table_generator), context}.run(); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.