Skip to content

Commit

Permalink
Complement InRewriteRule's auto approach (hyrise#2408)
Browse files Browse the repository at this point in the history
Now, we heuristically take the number of input rows into account for InRewriting instead of purely relying on the number of items in the IN. At some point a proper cost model would be best.
  • Loading branch information
Bensk1 authored Sep 29, 2021
1 parent 9500af5 commit a750ae9
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 46 deletions.
19 changes: 16 additions & 3 deletions src/lib/optimizer/strategy/in_expression_rewrite_rule.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "in_expression_rewrite_rule.hpp"

#include "cost_estimation/abstract_cost_estimator.hpp"
#include "expression/binary_predicate_expression.hpp"
#include "expression/expression_functional.hpp"
#include "expression/in_expression.hpp"
Expand All @@ -12,6 +13,7 @@
#include "logical_query_plan/predicate_node.hpp"
#include "logical_query_plan/static_table_node.hpp"
#include "logical_query_plan/union_node.hpp"
#include "statistics/cardinality_estimator.hpp"
#include "statistics/table_statistics.hpp"
#include "storage/table.hpp"

Expand Down Expand Up @@ -104,6 +106,13 @@ std::string InExpressionRewriteRule::name() const {
return name;
}

// Returns the cardinality estimator used by this rule. The estimator is created on first use by requesting a new
// instance from the cost estimator's cardinality estimator and is then kept in the mutable member
// `_cardinality_estimator_internal`, so later calls return the same object.
std::shared_ptr<AbstractCardinalityEstimator> InExpressionRewriteRule::_cardinality_estimator() const {
  if (_cardinality_estimator_internal) {
    return _cardinality_estimator_internal;
  }

  _cardinality_estimator_internal = cost_estimator->cardinality_estimator->new_instance();
  return _cardinality_estimator_internal;
}

void InExpressionRewriteRule::_apply_to_plan_without_subqueries(
const std::shared_ptr<AbstractLQPNode>& lqp_root) const {
if (strategy == Strategy::ExpressionEvaluator) {
Expand Down Expand Up @@ -162,11 +171,15 @@ void InExpressionRewriteRule::_apply_to_plan_without_subqueries(
Assert(!in_expression->is_negated(), "Disjunctions cannot handle NOT IN");
rewrite_to_disjunction(sub_node, left_expression, right_side_expressions, *common_data_type);
} else if (strategy == Strategy::Auto) {
if (right_side_expressions.size() <= MAX_ELEMENTS_FOR_DISJUNCTION && !in_expression->is_negated()) {
rewrite_to_disjunction(sub_node, left_expression, right_side_expressions, *common_data_type);
} else if (common_data_type && right_side_expressions.size() >= MIN_ELEMENTS_FOR_JOIN) {
if (right_side_expressions.size() >= MIN_ELEMENTS_FOR_JOIN) {
rewrite_to_join(sub_node, left_expression, right_side_expressions, *common_data_type,
in_expression->is_negated());
} else if ((right_side_expressions.size() <= MAX_ELEMENTS_FOR_DISJUNCTION ||
_cardinality_estimator()->estimate_cardinality(sub_node->left_input()) >=
MIN_INPUT_ROWS_FOR_DISJUNCTION) &&
!in_expression->is_negated() &&
!std::dynamic_pointer_cast<FunctionExpression>(in_expression->value())) {
rewrite_to_disjunction(sub_node, left_expression, right_side_expressions, *common_data_type);
} else {
// Stick with the ExpressionEvaluator
}
Expand Down
18 changes: 14 additions & 4 deletions src/lib/optimizer/strategy/in_expression_rewrite_rule.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@ namespace opossum {
class AbstractLQPNode;
class PredicateNode;

// Depending on the size and type of an IN expression `a IN (...)`, this rule rewrites the expression to
// - a bunch of disjunctive predicate and union nodes (equivalent to `a = 1 OR a = 2`) if the right side has up to
// MAX_ELEMENTS_FOR_DISJUNCTION elements and the elements are of the same type.
// Depending on the size and type of an IN expression `a IN (...)` and its input's size, this rule rewrites the
// expression to
// - a bunch of disjunctive predicate and union nodes (equivalent to `a = 1 OR a = 2`) if the elements are of the same
//   type, the IN is not negated, and the left operand `a` is not a `FunctionExpression`. In addition, either the
//   right side must have at most MAX_ELEMENTS_FOR_DISJUNCTION elements or the input must be estimated to have at
//   least MIN_INPUT_ROWS_FOR_DISJUNCTION rows.
// - a semi/anti join (with the list of IN values being stored in a temporary table) if the right side has at least
// MIN_ELEMENTS_FOR_JOIN elements and the elements are of the same type. The exact value of MIN_ELEMENTS_FOR_JOIN
// also depends on the size of the input data (see #1817). Once this becomes relevant, we might want to add a cost
Expand All @@ -21,9 +23,13 @@ class InExpressionRewriteRule : public AbstractRule {
std::string name() const override;

// With the auto strategy, IN expressions with up to MAX_ELEMENTS_FOR_DISJUNCTION on the right side are rewritten
// into disjunctive predicates. This value was chosen conservatively, also to keep the LQPs easy to read.
// into disjunctive predicates.
constexpr static auto MAX_ELEMENTS_FOR_DISJUNCTION = 3;

// With the auto strategy, IN expressions with fewer than MIN_ELEMENTS_FOR_JOIN elements whose input is estimated to
// have at least MIN_INPUT_ROWS_FOR_DISJUNCTION rows can be rewritten into disjunctive predicates.
constexpr static auto MIN_INPUT_ROWS_FOR_DISJUNCTION = 1'000'000.f;

// With the auto strategy, IN expressions with MIN_ELEMENTS_FOR_JOIN or more are rewritten into semi joins.
constexpr static auto MIN_ELEMENTS_FOR_JOIN = 20;

Expand All @@ -34,6 +40,10 @@ class InExpressionRewriteRule : public AbstractRule {

protected:
void _apply_to_plan_without_subqueries(const std::shared_ptr<AbstractLQPNode>& lqp_root) const override;

std::shared_ptr<AbstractCardinalityEstimator> _cardinality_estimator() const;

mutable std::shared_ptr<AbstractCardinalityEstimator> _cardinality_estimator_internal;
};

} // namespace opossum
129 changes: 90 additions & 39 deletions src/test/lib/optimizer/strategy/in_expression_rewrite_rule_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,22 @@ class InExpressionRewriteRuleTest : public StrategyBaseTest {
void SetUp() override {
// col_a has 1000 entries across 200 values linearly distributed between 1 and 200
node = create_mock_node_with_statistics(
MockNode::ColumnDefinitions{{DataType::Int, "col_a"}, {DataType::Float, "col_b"}}, 1000,
MockNode::ColumnDefinitions{{DataType::Int, "col_a"}, {DataType::Float, "col_b"}, {DataType::String, "col_c"}},
1000,
{{GenericHistogram<int32_t>::with_single_bin(1, 200, 1000, 200),
GenericHistogram<float>::with_single_bin(1.0f, 50.0f, 100, 10)}});
GenericHistogram<float>::with_single_bin(1.0f, 50.0f, 100, 10),
GenericHistogram<pmr_string>::with_single_bin("a", "z", 1, 1000)}});
col_a = node->get_column("col_a");
col_b = node->get_column("col_b");
col_c = node->get_column("col_c");

many_row_node =
create_mock_node_with_statistics(MockNode::ColumnDefinitions{{DataType::Int, "col_large"}}, 10'000'000,
{GenericHistogram<int32_t>::with_single_bin(1, 10'000'000, 1, 10'000'000)});
col_large = many_row_node->get_column("col_large");

single_element_in_expression = in_(col_a, list_(1));
two_element_functional_in_expression = in_(substr_(col_c, 1, 5), list_("85669", "86197"));
five_element_in_expression = in_(col_a, list_(1, 2, 3, 4, 5));
five_element_not_in_expression = not_in_(col_a, list_(1, 2, 3, 4, 5));
duplicate_element_in_expression = in_(col_a, list_(1, 2, 1));
Expand All @@ -34,12 +43,54 @@ class InExpressionRewriteRuleTest : public StrategyBaseTest {
for (auto i = 0; i < 100; ++i) hundred_elements.emplace_back(value_(i));
hundred_element_in_expression = std::make_shared<InExpression>(PredicateCondition::In, col_a,
std::make_shared<ListExpression>(hundred_elements));
hundred_element_in_expression_large_input = std::make_shared<InExpression>(
PredicateCondition::In, col_large, std::make_shared<ListExpression>(hundred_elements));
}

public:
std::shared_ptr<MockNode> node;
std::shared_ptr<AbstractExpression> col_a, col_b, single_element_in_expression, five_element_in_expression,
five_element_not_in_expression, hundred_element_in_expression, duplicate_element_in_expression,
// Can't use EXPECT_LQP_EQ for disjunction rewrites for multiple elements, because ExpressionUnorderedSet produces
// a non-deterministic order of predicates
bool check_disjunction(std::shared_ptr<AbstractLQPNode> result_lqp, std::vector<int> expected_values) {
auto values_found_in_predicates = std::vector<int>{};

// Checks that a given node is a predicate of the form `col_a = x` where x is an int and will be added to
// values_found_in_predicates
const auto verify_predicate_node = [&](const auto& node) {
EXPECT_EQ(node->type, LQPNodeType::Predicate);
auto predicate_node = std::dynamic_pointer_cast<PredicateNode>(node);
EXPECT_TRUE(predicate_node);
auto predicate = std::dynamic_pointer_cast<BinaryPredicateExpression>(predicate_node->predicate());
EXPECT_TRUE(predicate);
EXPECT_EQ(predicate->left_operand(), col_a);
EXPECT_EQ(predicate->right_operand()->type, ExpressionType::Value);
values_found_in_predicates.emplace_back(
boost::get<int>(dynamic_cast<ValueExpression&>(*predicate->right_operand()).value));
};

for (auto union_node_idx = size_t{0}; union_node_idx < expected_values.size() - 1; ++union_node_idx) {
EXPECT_EQ(result_lqp->type, LQPNodeType::Union);
auto union_node = std::dynamic_pointer_cast<UnionNode>(result_lqp);
EXPECT_TRUE(union_node);
EXPECT_EQ(union_node->set_operation_mode, SetOperationMode::All);

verify_predicate_node(union_node->right_input());

result_lqp = union_node->left_input();
}
// After checking expected_values.size() - 1 union nodes, the last node has predicates on both sides
verify_predicate_node(result_lqp);

std::sort(values_found_in_predicates.begin(), values_found_in_predicates.end());

if (values_found_in_predicates == expected_values) return true;

return false;
}

std::shared_ptr<MockNode> node, many_row_node;
std::shared_ptr<AbstractExpression> col_a, col_b, col_c, col_large, single_element_in_expression,
two_element_functional_in_expression, five_element_in_expression, five_element_not_in_expression,
hundred_element_in_expression, hundred_element_in_expression_large_input, duplicate_element_in_expression,
different_types_on_left_and_right_side_expression, different_types_on_right_side_expression, null_in_expression;
};

Expand Down Expand Up @@ -88,40 +139,7 @@ TEST_F(InExpressionRewriteRuleTest, DisjunctionStrategy) {
const auto input_lqp = PredicateNode::make(five_element_in_expression, node);
const auto result_lqp = StrategyBaseTest::apply_rule(rule, input_lqp);

// Can't use EXPECT_LQP_EQ here, because ExpressionUnorderedSet produces a non-deterministic order of predicates
auto values_found_in_predicates = std::vector<int>{};

// Checks that a given node is a predicate of the form `col_a = x` where x is an int and will be added to
// values_found_in_predicates
const auto verify_predicate_node = [&](const auto& node) {
ASSERT_EQ(node->type, LQPNodeType::Predicate);
auto predicate_node = std::dynamic_pointer_cast<PredicateNode>(node);
ASSERT_TRUE(predicate_node);
auto predicate = std::dynamic_pointer_cast<BinaryPredicateExpression>(predicate_node->predicate());
ASSERT_TRUE(predicate);
EXPECT_EQ(predicate->left_operand(), col_a);
ASSERT_EQ(predicate->right_operand()->type, ExpressionType::Value);
values_found_in_predicates.emplace_back(
boost::get<int>(dynamic_cast<ValueExpression&>(*predicate->right_operand()).value));
};

auto current_node = result_lqp;
for (auto union_node_idx = 0; union_node_idx < 4; ++union_node_idx) {
ASSERT_EQ(current_node->type, LQPNodeType::Union);
auto union_node = std::dynamic_pointer_cast<UnionNode>(current_node);
ASSERT_TRUE(union_node);
EXPECT_EQ(union_node->set_operation_mode, SetOperationMode::All);

verify_predicate_node(union_node->right_input());

current_node = union_node->left_input();
}
// After checking four union nodes, the last node has predicates on both sides
verify_predicate_node(current_node);

std::sort(values_found_in_predicates.begin(), values_found_in_predicates.end());
const auto expected_values = std::vector<int>{1, 2, 3, 4, 5};
EXPECT_EQ(values_found_in_predicates, expected_values);
EXPECT_TRUE(check_disjunction(result_lqp, {1, 2, 3, 4, 5}));
}

{
Expand Down Expand Up @@ -304,6 +322,23 @@ TEST_F(InExpressionRewriteRuleTest, AutoStrategy) {
EXPECT_NEAR(cardinality_estimator.estimate_cardinality(result_lqp), 1000.f / 200 * 100, 10);
}

{
// Join for 100 elements even if table is large
const auto input_lqp = PredicateNode::make(hundred_element_in_expression_large_input, many_row_node);
const auto result_lqp = StrategyBaseTest::apply_rule(rule, input_lqp);

const auto column_definitions = TableColumnDefinitions{{"right_values", DataType::Int, false}};
auto table = std::make_shared<Table>(column_definitions, TableType::Data);
for (auto i = 0; i < 100; ++i) table->append({i});
const auto static_table_node = StaticTableNode::make(table);
const auto right_col = lqp_column_(static_table_node, ColumnID{0});
const auto expected_lqp =
JoinNode::make(JoinMode::Semi, equals_(col_large, right_col), many_row_node, static_table_node);

EXPECT_LQP_EQ(result_lqp, expected_lqp);
EXPECT_TABLE_EQ_UNORDERED(static_cast<StaticTableNode&>(*result_lqp->right_input()).table, table);
}

{
// Disjunction for two elements, even if one is NULL
const auto input_lqp = PredicateNode::make(null_in_expression, node);
Expand All @@ -316,6 +351,22 @@ TEST_F(InExpressionRewriteRuleTest, AutoStrategy) {
// clang-format on
EXPECT_LQP_EQ(result_lqp, expected_lqp);
}

{
// Disjunction for five elements, if table is large
const auto input_lqp = PredicateNode::make(five_element_in_expression, many_row_node);
const auto result_lqp = StrategyBaseTest::apply_rule(rule, input_lqp);

EXPECT_TRUE(check_disjunction(result_lqp, {1, 2, 3, 4, 5}));
}

{
// Stick with the ExpressionEvaluator if the IN's left operand is a FunctionExpression, even though the table is
// large and the number of elements is below the disjunction threshold
const auto input_lqp = PredicateNode::make(two_element_functional_in_expression, many_row_node);
const auto result_lqp = StrategyBaseTest::apply_rule(rule, input_lqp);

EXPECT_EQ(result_lqp, input_lqp);
}
}

} // namespace opossum

0 comments on commit a750ae9

Please sign in to comment.