Skip to content

Commit 0f0ec74

Browse files
committed
Bug#36734772: Hypergraph optimizer fails to use index for t1 INNER
JOIN t2 RIGHT JOIN t3

The hypergraph optimizer did not use indexes in some queries where an inner join was on the inner side of an outer join or on the outer side of a semijoin. This happened if both tables in the inner join could use index lookups, and one of the index lookups was parameterized on the other table in the inner join and the other index lookup was parameterized on the table on the other side of the outer join or semijoin.

The problem was that DisallowParameterizedJoinPath() was too restrictive, and rejected all good plans for the query. It tries to avoid creating parameterized join paths if the parameterization could have been resolved earlier, so that the optimizer doesn't have to spend time unnecessarily on plans that are unlikely to be any better than the alternatives. In these cases, the parameterization couldn't have been resolved earlier, so it is too strict to reject these plans.

This code is now changed so that it also allows creating a parameterized join if the left side of the join fully resolves the parameterization of the right side.

Change-Id: I20728c24fe20128a76ae96772255a648842e65c3
1 parent 03bebe5 commit 0f0ec74

File tree

5 files changed

+246
-45
lines changed

5 files changed

+246
-45
lines changed

mysql-test/r/explain_tree_hypergraph.result

+4-2
Original file line numberDiff line numberDiff line change
@@ -393,7 +393,8 @@ JSON_SEARCH(@var, 'one', '%FirstMatch%', NULL, '$**.operation') IS NOT NULL
393393
AS operation_contains_firstmatch;
394394
first_match operation_contains_firstmatch
395395
["firstmatch"] 1
396-
EXPLAIN FORMAT=tree SELECT * FROM t WHERE a IN (SELECT t1.a FROM t AS t1 JOIN t AS t2 ON t1.a=t2.b);
396+
EXPLAIN FORMAT=tree SELECT * FROM t WHERE a IN
397+
(SELECT /*+ SEMIJOIN(LOOSESCAN) */ t1.a FROM t AS t1 JOIN t AS t2 ON t1.a=t2.b);
397398
EXPLAIN
398399
-> Nested loop inner join (LooseScan) (rows=1.41)
399400
-> Remove duplicates from input grouped on t2.b (rows=1.41)
@@ -403,7 +404,8 @@ EXPLAIN
403404
-> Covering index lookup on t2 using b (b = t1.a) (rows=1)
404405
-> Covering index lookup on t using a (a = t2.b) (rows=1)
405406

406-
EXPLAIN FORMAT=JSON INTO @var SELECT * FROM t WHERE a IN (SELECT t1.a FROM t AS t1 JOIN t AS t2 ON t1.a=t2.b);
407+
EXPLAIN FORMAT=JSON INTO @var SELECT * FROM t WHERE a IN
408+
(SELECT /*+ SEMIJOIN(LOOSESCAN) */ t1.a FROM t AS t1 JOIN t AS t2 ON t1.a=t2.b);
407409
SELECT
408410
JSON_EXTRACT(@var, '$**.semijoin_strategy') AS loosescan,
409411
JSON_SEARCH(@var, 'one', '%LooseScan%', NULL, '$**.operation') IS NOT NULL

mysql-test/r/opt_hints_subquery_hypergraph.result

+8-10
Original file line numberDiff line numberDiff line change
@@ -173,12 +173,11 @@ SELECT * FROM t3
173173
WHERE t3.a IN (SELECT /*+ QB_NAME(subq1) */ a FROM t1 tx
174174
WHERE tx.b IN (SELECT /*+ QB_NAME(subq2) */ a FROM t1 ty));
175175
EXPLAIN
176-
-> Hash semijoin (FirstMatch) (t3.a = tx.a) (rows=1.5)
176+
-> Nested loop semijoin (FirstMatch) (rows=1.5)
177177
-> Table scan on t3 (rows=3)
178-
-> Hash
179-
-> Nested loop inner join (rows=4)
180-
-> Table scan on tx (rows=4)
181-
-> Single-row covering index lookup on ty using PRIMARY (a = tx.b) (rows=1)
178+
-> Nested loop inner join (rows=1)
179+
-> Single-row index lookup on tx using PRIMARY (a = t3.a) (rows=1)
180+
-> Single-row covering index lookup on ty using PRIMARY (a = tx.b) (rows=1)
182181

183182
No SEMIJOIN transformation for outer subquery
184183
EXPLAIN
@@ -1428,12 +1427,11 @@ SELECT /*+ SEMIJOIN(@subq1) SEMIJOIN(@subq2) */ * FROM t3
14281427
WHERE t3.a IN (SELECT /*+ QB_NAME(subq1) */ a FROM t1 tx
14291428
WHERE tx.b IN (SELECT /*+ QB_NAME(subq2) */ a FROM t1 ty));
14301429
EXPLAIN
1431-
-> Hash semijoin (FirstMatch) (t3.a = tx.a) (rows=1.5)
1430+
-> Nested loop semijoin (FirstMatch) (rows=1.5)
14321431
-> Table scan on t3 (rows=3)
1433-
-> Hash
1434-
-> Nested loop inner join (rows=4)
1435-
-> Table scan on tx (rows=4)
1436-
-> Single-row covering index lookup on ty using PRIMARY (a = tx.b) (rows=1)
1432+
-> Nested loop inner join (rows=1)
1433+
-> Single-row index lookup on tx using PRIMARY (a = t3.a) (rows=1)
1434+
-> Single-row covering index lookup on ty using PRIMARY (a = tx.b) (rows=1)
14371435

14381436
Test strategies when some are disabled by optimizer_switch
14391437
SET optimizer_switch='semijoin=on';

mysql-test/t/explain_tree_hypergraph.test

+2-1
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,8 @@ JSON_EXTRACT(@var, '$**.semijoin_strategy') AS first_match,
256256
JSON_SEARCH(@var, 'one', '%FirstMatch%', NULL, '$**.operation') IS NOT NULL
257257
AS operation_contains_firstmatch;
258258
# LooseScan strategy
259-
let $query = SELECT * FROM t WHERE a IN (SELECT t1.a FROM t AS t1 JOIN t AS t2 ON t1.a=t2.b);
259+
let $query = SELECT * FROM t WHERE a IN
260+
(SELECT /*+ SEMIJOIN(LOOSESCAN) */ t1.a FROM t AS t1 JOIN t AS t2 ON t1.a=t2.b);
260261
--replace_regex $elide_costs
261262
eval EXPLAIN FORMAT=tree $query;
262263
eval EXPLAIN FORMAT=JSON INTO @var $query;

sql/join_optimizer/join_optimizer.cc

+113-32
Original file line numberDiff line numberDiff line change
@@ -4228,12 +4228,49 @@ NodeMap FindReachableTablesFrom(NodeMap tables, const JoinHypergraph &graph) {
42284228
return reachable;
42294229
}
42304230

4231-
// Returns whether the given set of parameter tables is partially, but not
4232-
// fully, resolved by joining towards the other side.
4233-
bool PartiallyResolvedParameterization(NodeMap parameter_tables,
4234-
NodeMap other_side) {
4235-
return (parameter_tables & ~other_side) != 0 &&
4236-
(parameter_tables & ~other_side) != parameter_tables;
4231+
/**
4232+
Is it possible to resolve more parameter tables before performing a nested
4233+
loop join between "outer" and "inner", or will the join have to be performed
4234+
first?
4235+
4236+
In more precise terms:
4237+
4238+
Consider the set of parameters (a set of tables) that are left unresolved
4239+
after joining inner and outer. This function returns true if this set is
4240+
non-empty and at least one of these unresolved parameter tables, denoted by t,
4241+
can be joined directly into either outer or inner such that the result of
4242+
joining either {outer, t} with {inner} or {outer} with {inner, t} would end up
4243+
with more resolved parameters (fewer unresolved parameters) than simply
4244+
joining {outer} and {inner}.
4245+
*/
4246+
bool CanResolveMoreParameterTables(NodeMap outer, NodeMap inner,
4247+
NodeMap outer_parameters,
4248+
NodeMap inner_parameters,
4249+
NodeMap outer_reachable,
4250+
NodeMap inner_reachable) {
4251+
const NodeMap unresolved_parameters =
4252+
(outer_parameters | inner_parameters) & ~(outer | inner);
4253+
4254+
if (unresolved_parameters == 0) {
4255+
// No unresolved parameters after joining outer and inner (so we cannot
4256+
// resolve more parameters by first joining in parameter tables).
4257+
return false;
4258+
}
4259+
4260+
// Unresolved parameterizations on either side of the join can be resolved by
4261+
// joining a parameter table into the outer path first, if it's reachable.
4262+
if (Overlaps(unresolved_parameters, outer_reachable)) {
4263+
return true;
4264+
}
4265+
4266+
// Unresolved parameterizations that are only on the inner path can also be
4267+
// resolved by joining a parameter table to the inner path first, if it's
4268+
// reachable.
4269+
if (Overlaps(unresolved_parameters & ~outer_parameters, inner_reachable)) {
4270+
return true;
4271+
}
4272+
4273+
return false;
42374274
}
42384275

42394276
/**
@@ -4245,40 +4282,72 @@ bool PartiallyResolvedParameterization(NodeMap parameter_tables,
42454282
plans to be left-deep (since such plans never gain anything from being
42464283
bushy), reducing the search space significantly without compromising
42474284
plan quality.
4285+
4286+
@param left_path An access path which joins together a superset of all the
4287+
tables on the left-hand side of the hyperedge for which we are creating a
4288+
join.
4289+
4290+
@param right_path An access path which joins together a superset of all the
4291+
tables on the right-hand side of the hyperedge for which we are creating a
4292+
join.
4293+
4294+
@param left The set of tables joined together in "left_path".
4295+
4296+
@param right The set of tables joined together in "right_path".
4297+
4298+
@param left_reachable The set of tables that can be joined directly with
4299+
"left_path", with no intermediate join being performed first. If a table is in
4300+
this set, it is possible to construct a nested loop join between an access
4301+
path accessing only that table and the access path pointed to by "left_path".
4302+
4303+
@param right_reachable The set of tables that can be joined directly with
4304+
"right_path", with no intermediate join being performed first. If a table is
4305+
in this set, it is possible to construct a nested loop join between an access
4306+
path accessing only that table and the access path pointed to by "right_path".
4307+
4308+
@param is_reorderable True if the optimizer may try to construct a nested loop
4309+
join between "left_path" and "right_path" in either direction. False if the
4310+
optimizer will consider nested loop joins in only one direction, with
4311+
"left_path" as the outer table and "right_path" as the inner table. When it is
4312+
true, we disallow a parameterized join path only if it is possible to resolve
4313+
more parameter tables first in both join orders. This is slightly more lenient
4314+
than it has to be, as it will allow parameterized join paths with both join
4315+
orders, even though one of the orders can join with a parameter table first.
4316+
Since all of these joins will be parameterized on the same set of tables, this
4317+
extra leniency is not believed to contribute much to the explosion of plans
4318+
with different parameterizations.
42484319
*/
42494320
bool DisallowParameterizedJoinPath(AccessPath *left_path,
42504321
AccessPath *right_path, NodeMap left,
42514322
NodeMap right, NodeMap left_reachable,
4252-
NodeMap right_reachable) {
4323+
NodeMap right_reachable,
4324+
bool is_reorderable) {
42534325
const NodeMap left_parameters = left_path->parameter_tables & ~RAND_TABLE_BIT;
42544326
const NodeMap right_parameters =
42554327
right_path->parameter_tables & ~RAND_TABLE_BIT;
42564328

4257-
if (IsSubset(left_parameters | right_parameters, left | right)) {
4258-
// Not creating a parameterized path, so it's always fine.
4329+
if (!CanResolveMoreParameterTables(left, right, left_parameters,
4330+
right_parameters, left_reachable,
4331+
right_reachable)) {
4332+
// Neither left nor right can resolve parameterization that is left
4333+
// unresolved by this join by first joining in one of the parameter tables.
4334+
// E.g., we're still on the inside of an outer join, and the parameter
4335+
// tables are outside the outer join, and we still need to join together
4336+
// more tables on the inner side of the outer join before we're allowed to
4337+
// do the outer join. We have to allow creation of a parameterized join path
4338+
// if we want to use index lookups here at all.
42594339
return false;
42604340
}
42614341

4262-
if (!Overlaps(right_parameters, right_reachable) &&
4263-
!Overlaps(left_parameters, left_reachable)) {
4264-
// Either left or right cannot resolve any of their parameterizations yet
4265-
// (e.g., we're still on the inside of an outer join that we cannot
4266-
// finish yet), so we cannot avoid keeping them if we want to use index
4267-
// lookups here at all.
4268-
return false;
4269-
}
4270-
4271-
// If the outer table partially, but not fully, resolves the inner table's
4272-
// parameterization, we still allow it (otherwise, we could not have
4273-
// multi-part index lookups where the keyparts come from different tables).
4274-
// This is the so-called “star-schema exception”.
4275-
//
4276-
// We need to check both ways, in case we try to swap them for a hash join.
4277-
// Only one of these will ever be true in any given join anyway (joins where
4278-
// we try to resolve the outer path's parameterizations with the inner one
4279-
// are disallowed), so we do not allow more than is required.
4280-
if (PartiallyResolvedParameterization(left_parameters, right) ||
4281-
PartiallyResolvedParameterization(right_parameters, left)) {
4342+
// If the join can be performed both ways (such as a commutable join
4343+
// operation, or a semijoin that can be rewritten to an inner join), we're a
4344+
// bit more lenient and allow creation of a parameterized join path even
4345+
// though a parameter table can be resolved first, if it is not possible to
4346+
// resolve any parameter tables first in the reordered join. Otherwise, we
4347+
// might not be able to use indexes in the reordered join.
4348+
if (is_reorderable && !CanResolveMoreParameterTables(
4349+
right, left, right_parameters, left_parameters,
4350+
right_reachable, left_reachable)) {
42824351
return false;
42834352
}
42844353

@@ -4565,9 +4634,21 @@ bool CostingReceiver::FoundSubgraphPair(NodeMap left, NodeMap right,
45654634
zero_path->delayed_predicates = right_path->delayed_predicates;
45664635
right_path = zero_path;
45674636
}
4637+
4638+
// Can this join be performed in both left-right and right-left order? It
4639+
// can if the join operation is commutative (or rewritable to one) and
4640+
// right_path's parameterization doesn't force it to be on the right side.
4641+
// If this condition is true, the right-left join will be attempted
4642+
// in addition to the left-right join, but the additional checks in
4643+
// AllowNestedLoopJoin() and AllowHashJoin() decide if they are actually
4644+
// proposed.
4645+
const bool is_reorderable = (is_commutative || can_rewrite_semi_to_inner) &&
4646+
!Overlaps(right_path->parameter_tables, left);
4647+
45684648
for (AccessPath *left_path : left_it->second.paths) {
45694649
if (DisallowParameterizedJoinPath(left_path, right_path, left, right,
4570-
left_reachable, right_reachable)) {
4650+
left_reachable, right_reachable,
4651+
is_reorderable)) {
45714652
continue;
45724653
}
45734654

@@ -4606,7 +4687,7 @@ bool CostingReceiver::FoundSubgraphPair(NodeMap left, NodeMap right,
46064687
new_obsolete_orderings,
46074688
/*rewrite_semi_to_inner=*/false, &wrote_trace);
46084689
}
4609-
if (is_commutative || can_rewrite_semi_to_inner) {
4690+
if (is_reorderable) {
46104691
ProposeHashJoin(right, left, right_path, left_path, edge, new_fd_set,
46114692
new_obsolete_orderings,
46124693
/*rewrite_semi_to_inner=*/can_rewrite_semi_to_inner,
@@ -4617,7 +4698,7 @@ bool CostingReceiver::FoundSubgraphPair(NodeMap left, NodeMap right,
46174698
ProposeNestedLoopJoin(left, right, left_path, right_path, edge,
46184699
/*rewrite_semi_to_inner=*/false, new_fd_set,
46194700
new_obsolete_orderings, &wrote_trace);
4620-
if (is_commutative || can_rewrite_semi_to_inner) {
4701+
if (is_reorderable) {
46214702
ProposeNestedLoopJoin(
46224703
right, left, right_path, left_path, edge,
46234704
/*rewrite_semi_to_inner=*/can_rewrite_semi_to_inner, new_fd_set,

unittest/gunit/hypergraph_optimizer-t.cc

+119
Original file line numberDiff line numberDiff line change
@@ -2726,6 +2726,125 @@ TEST_F(HypergraphOptimizerTest, InnerNestloopShouldBeLeftDeep) {
27262726
// We don't verify the plan in itself.
27272727
}
27282728

2729+
// Verify that we can produce plans of this form for an inner join inside a left
2730+
// outer join:
2731+
//
2732+
// -> Nested loop left join
2733+
// -> Table scan on t1
2734+
// -> Nested loop inner join
2735+
// -> Single-row index lookup on t2 using key0 (x = t1.x)
2736+
// -> Single-row index lookup on t3 using key0 (x = t2.y)
2737+
//
2738+
// We should be able to use index lookups for both tables in the inner join.
2739+
TEST_F(HypergraphOptimizerTest, UseIndexesInInnerJoinInsideOuterJoin) {
2740+
Query_block *query_block = ParseAndResolve(
2741+
"SELECT 1 FROM t1 LEFT JOIN t2 INNER JOIN t3 ON t2.y=t3.x ON t1.x=t2.x",
2742+
/*nullable=*/true);
2743+
2744+
// Make the outer table small, so that it looks attractive with a nested loop
2745+
// with t1 on the left side and index lookups on t2 and t3 on the right side.
2746+
Fake_TABLE *t1 = m_fake_tables["t1"];
2747+
t1->file->stats.records = 10;
2748+
t1->file->stats.data_file_length = 1000;
2749+
2750+
// Make t2 and t3 big, so that using index lookups looks more attractive
2751+
// than scanning the tables, and create unique indexes on t2(x) and t3(x).
2752+
for (string table_name : {"t2", "t3"}) {
2753+
Fake_TABLE *t23 = m_fake_tables[table_name];
2754+
t23->file->stats.records = 1e6;
2755+
t23->file->stats.data_file_length = 1e9;
2756+
t23->create_index(t23->field[0], HA_NOSAME);
2757+
}
2758+
2759+
TraceGuard trace(m_thd);
2760+
AccessPath *root = FindBestQueryPlan(m_thd, query_block);
2761+
SCOPED_TRACE(trace.contents()); // Prints out the trace on failure.
2762+
ASSERT_NE(nullptr, root);
2763+
// Prints out the query plan on failure.
2764+
SCOPED_TRACE(PrintQueryPlan(0, root, query_block->join,
2765+
/*is_root_of_join=*/true));
2766+
2767+
// Expect the plan to be NLJ(t1, NLJ(INDEX_LOOKUP(t2), INDEX_LOOKUP(t3))). It
2768+
// used to do full table scans on t2 and t3 instead of index lookups.
2769+
ASSERT_EQ(AccessPath::NESTED_LOOP_JOIN, root->type);
2770+
const auto &outer_join = root->nested_loop_join();
2771+
2772+
ASSERT_EQ(AccessPath::NESTED_LOOP_JOIN, outer_join.inner->type);
2773+
const auto &inner_join = outer_join.inner->nested_loop_join();
2774+
2775+
ASSERT_EQ(AccessPath::EQ_REF, inner_join.outer->type);
2776+
EXPECT_STREQ("t2", inner_join.outer->eq_ref().table->alias);
2777+
2778+
ASSERT_EQ(AccessPath::EQ_REF, inner_join.inner->type);
2779+
EXPECT_STREQ("t3", inner_join.inner->eq_ref().table->alias);
2780+
}
2781+
2782+
// Verify that we can produce plans of this form for a semijoin with an inner
2783+
// join on the outer side.
2784+
//
2785+
// -> Nested loop inner join (LooseScan)
2786+
// -> Remove duplicates from input grouped on t3.x, t3.y
2787+
// -> Sort: t3.x, t3.y
2788+
// -> Table scan on t3
2789+
// -> Filter: (t1.y = t3.y)
2790+
// -> Nested loop inner join
2791+
// -> Single-row index lookup on t2 using key0 (x = t3.x)
2792+
// -> Single-row index lookup on t1 using key0 (x = t2.y)
2793+
//
2794+
// We should be able to put the inner join on the right hand side of a nested
2795+
// loop join, so that we can use index lookups on both the tables that are outer
2796+
// to the semijoin.
2797+
TEST_F(HypergraphOptimizerTest, UseIndexesInInnerJoinOutsideSemijoin) {
2798+
Query_block *query_block = ParseAndResolve(
2799+
"SELECT 1 FROM t1, t2 WHERE t1.x = t2.y AND "
2800+
"(t2.x, t1.y) IN (SELECT t3.x, t3.y FROM t3)",
2801+
/*nullable=*/true);
2802+
2803+
// Make t1 and t2 big, so that using index lookups looks more attractive
2804+
// than scanning the tables, and create unique indexes on t1(x) and t2(x).
2805+
for (string table_name : {"t1", "t2"}) {
2806+
Fake_TABLE *t12 = m_fake_tables[table_name];
2807+
t12->file->stats.records = 1e6;
2808+
t12->file->stats.data_file_length = 1e9;
2809+
t12->create_index(t12->field[0], HA_NOSAME);
2810+
}
2811+
2812+
// Make t3 small, so that it looks attractive with a nested loop with t3 on
2813+
// the left side and index lookups on t1 and t2 on the right side.
2814+
Fake_TABLE *t3 = m_fake_tables["t3"];
2815+
t3->file->stats.records = 10;
2816+
t3->file->stats.data_file_length = 1000;
2817+
2818+
TraceGuard trace(m_thd);
2819+
AccessPath *root = FindBestQueryPlan(m_thd, query_block);
2820+
SCOPED_TRACE(trace.contents()); // Prints out the trace on failure.
2821+
ASSERT_NE(nullptr, root);
2822+
// Prints out the query plan on failure.
2823+
SCOPED_TRACE(PrintQueryPlan(0, root, query_block->join,
2824+
/*is_root_of_join=*/true));
2825+
2826+
// Expect the plan to be
2827+
// NLJ(REMOVE_DUPS(t3), FILTER(NLJ(INDEX_LOOKUP(t2), INDEX_LOOKUP(t1)))).
2828+
// It used to do a full table scan on t1 instead of an index lookup.
2829+
ASSERT_EQ(AccessPath::NESTED_LOOP_JOIN, root->type);
2830+
const auto &outer_join = root->nested_loop_join();
2831+
EXPECT_EQ(AccessPath::REMOVE_DUPLICATES, outer_join.outer->type);
2832+
2833+
// The exact placement of the t1.y=t3.y filter is not important. It could also
2834+
// have been pushed down directly on top of the index lookup on t1(x). See
2835+
// bug#33477822.
2836+
ASSERT_EQ(AccessPath::FILTER, outer_join.inner->type);
2837+
ASSERT_EQ(AccessPath::NESTED_LOOP_JOIN,
2838+
outer_join.inner->filter().child->type);
2839+
const auto &inner_join = outer_join.inner->filter().child->nested_loop_join();
2840+
2841+
ASSERT_EQ(AccessPath::EQ_REF, inner_join.outer->type);
2842+
EXPECT_STREQ("t2", inner_join.outer->eq_ref().table->alias);
2843+
2844+
ASSERT_EQ(AccessPath::EQ_REF, inner_join.inner->type);
2845+
EXPECT_STREQ("t1", inner_join.inner->eq_ref().table->alias);
2846+
}
2847+
27292848
TEST_F(HypergraphOptimizerTest, CombineFilters) {
27302849
Query_block *query_block = ParseAndResolve(
27312850
"SELECT 1 FROM t1 WHERE t1.x = 1 HAVING RAND() > 0.5", /*nullable=*/true);

0 commit comments

Comments
 (0)