
Commit eb86b40
Author: Dag Wanvik
WL#15257 Enhanced performance for set operations
Implements hashing using a Robin Hood hash (the same hashing method as for hashed JOINs) to de-duplicate set operands of INTERSECT and EXCEPT. The space used for this is controlled by setop_hash_buffer_size (default 256 * 1024 bytes), which is the same default as for join_buffer_size.

If this space is exceeded, we spill to disk using "chunk" files, cf. the algorithm described in the WL, similar to what is employed for hashed JOIN overflow. Essentially, this is a divide-and-conquer approach in which we partition the rows according to another hash of the row (a tertiary hash function), allowing us to process one partition (chunk pair) at a time with the in-memory hash table.

But even this can fail if the table statistics are not up to date or row sizes are skewed (secondary overflow). In this case, we abort the hashing approach and fall back on the result set's materialized temporary table, using its non-unique index on the primary hash of the row for de-duplication of the rows.

Asymptotically, with large table sizes and (too) small values of setop_hash_buffer_size, we need several *sets* of chunk files, because we don't want to have more than a certain number of temporary files open at a time (i.e. 128). This will eventually be slower than using the key of the materialized table (InnoDB). The code does *not* detect this and fall back on using the key of the materialized table when that would be faster. To get optimal performance, it behoves the user to ascertain that the buffer is large enough for the volume of (distinct) rows in the left operand. The work log contains measurements that give an indication of the performance.

Testing the operation of spill to disk is straightforward: just have enough rows in the left operand. The spill can be detected by seeing temporary files being used by the query, cf. SHOW STATUS LIKE '%tmp%'; before and after, or by inspecting the optimizer trace for evidence ("spill to disk initiated").

Testing secondary overflow is possible by error injection at those locations where space overflow may happen, cf. the debug-only variable debug_setop_secondary_overflow_at. If this variable is set, it will inject a secondary overflow event during hashing. Three integers must be given to indicate where to inject the overflow:

a) set index, cf. the explanation in the comments for class SpillState
b) chunk index
c) row number

Syntax: <set-idx:integer 0-based> <chunk-idx:integer 0-based> <row_no:integer 1-based>

Example:

  SET SESSION debug_setop_secondary_overflow_at = '1 5 7';

If the numbers given are out of range on the high side, they will never trigger any secondary spill. Secondary overflow can also be seen in the optimizer trace ("spill handling overflow, reverting to index").

By default, INTERSECT and EXCEPT use hashing with this patch. This may be disabled in favour of the old approach (i.e. tmp table key) for testing purposes (e.g. to compare results), via the optimizer switch hash_set_operations {true|false}.

We also added optimizer trace output so the user can detect spill actions, cf. the tests for this in query_expression.test.

Change-Id: I364b29f515cacd2c8f70f73d34b2fd1e0b8de190
1 parent 683c9bc commit eb86b40
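For illustration, the behavior described above can be exercised from a client session roughly as follows (a minimal sketch: the table t1 and its row volume are hypothetical, while the switch, variable, and status names are the ones introduced or cited in the commit message):

  -- Hashing is on by default; shrink the buffer so the left operand overflows it.
  SET SESSION setop_hash_buffer_size = 16384;

  FLUSH STATUS;  -- reset temporary-file counters
  SELECT * FROM t1 INTERSECT SELECT * FROM t1;
  SHOW STATUS LIKE '%tmp%';  -- newly created temporary files indicate a spill

  -- Alternatively, look for "spill to disk initiated" in the optimizer trace:
  SET SESSION optimizer_trace = 'enabled=on';
  SELECT * FROM t1 INTERSECT SELECT * FROM t1;
  SELECT trace FROM information_schema.optimizer_trace;

  -- Revert to the pre-patch de-duplication (tmp table key) for comparison:
  SET SESSION optimizer_switch = 'hash_set_operations=off';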

File tree: 97 files changed, +9929 -2521 lines changed

include/my_sys.h

Lines changed: 2 additions & 0 deletions

@@ -496,6 +496,7 @@ inline bool my_b_inited(const IO_CACHE *info) {
 constexpr int my_b_EOF = INT_MIN;
 
 inline int my_b_read(IO_CACHE *info, uchar *buffer, size_t count) {
+  assert(info->type != WRITE_CACHE);
   if (info->read_pos + count <= info->read_end) {
     memcpy(buffer, info->read_pos, count);
     info->read_pos += count;
@@ -505,6 +506,7 @@ inline int my_b_read(IO_CACHE *info, uchar *buffer, size_t count) {
 }
 
 inline int my_b_write(IO_CACHE *info, const uchar *buffer, size_t count) {
+  assert(info->type != READ_CACHE);
   if (info->write_pos + count <= info->write_end) {
     memcpy(info->write_pos, buffer, count);
     info->write_pos += count;

mysql-test/include/elide_costs.inc

Lines changed: 16 additions & 0 deletions

@@ -44,3 +44,19 @@ let $elide_costs_and_rows = /[(]cost=[0-9.e+-]+ rows=[0-9.e+-]+[)]/(...)/;
 # ellipsis for the elided metrics. This is to make the output the same
 # regardless of whether there actually is some metric to elide.
 let $elide_metrics = / *[(]cost=.*//;
+
+# Usage: --replace_regex $elide_trace_costs_and_rows
+# Removes metrics from optimizer trace (JSON)
+#
+# Filter out unstable cost and row estimates. replace_regex
+# allows only a *single* variable substitution
+# (cf. do_get_replace_regex in mysqltest.cc), so build the
+# entire expression incrementally.
+let $elide_trace_costs_and_rows=/cost\": [0-9.]+/cost\": "elided"/;
+let $elide_trace_costs_and_rows=$elide_trace_costs_and_rows /cost_for_plan\": [0-9.]+/cost_for_plan\": "elided"/;
+let $elide_trace_costs_and_rows=$elide_trace_costs_and_rows /rows\": [0-9.]+/rows\": "elided"/;
+let $elide_trace_costs_and_rows=$elide_trace_costs_and_rows /rows_for_plan\": [0-9.]+/rows_for_plan\": "elided"/;
+let $elide_trace_costs_and_rows=$elide_trace_costs_and_rows /rows_to_scan\": [0-9.]+/rows_to_scan\": "elided"/;
+let $elide_trace_costs_and_rows=$elide_trace_costs_and_rows /num_rows_estimate\": [0-9.]+/num_rows_estimate\": "elided"/;
+
+let $elide_json_costs=/cost": "[0-9.]*"/cost": "elided"/;
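The include file only defines the patterns; a test applies the combined pattern in a single --replace_regex call before printing the trace, roughly like this (a sketch with a hypothetical query, mirroring the usage in the new test file below):

  --source include/elide_costs.inc
  SET SESSION optimizer_trace = 'enabled=on';
  SELECT * FROM t1 INTERSECT SELECT * FROM t1;
  --replace_regex $elide_trace_costs_and_rows
  SELECT JSON_PRETTY(JSON_EXTRACT(trace, "$.steps[*].join_execution"))
    FROM information_schema.optimizer_trace;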
Lines changed: 125 additions & 0 deletions

@@ -0,0 +1,125 @@
+--echo #
+--echo # WL#15257 Enhanced performance for set operations: INTERSECT,
+--echo # EXCEPT i.e. hashed implementation of EXCEPT, INTERSECT with
+--echo # spill to disk (chunk files) if hash table can't fit in memory
+--echo # and fallback to de-duplication via keyed temporary table as
+--echo # last resort. The latter is tested with error injection in
+--echo # query_expression_debug.
+--echo #
+--echo # The size of the VARCHAR column is an argument ($char_type) so
+--echo # we can test short varchar fields as well as blobs.
+--echo #
+eval CREATE TABLE t(i INT, d DATE, c $char_type CHARSET latin1) ENGINE=innodb;
+
+set @@cte_max_recursion_depth = 100000;
+INSERT INTO t
+WITH RECURSIVE cte AS (
+   SELECT 0 AS i, '2022-04-30' AS d, 'abracadabra' as c
+   UNION
+   SELECT 1 AS i, '2022-04-30' AS d, 'rabarbra' as c
+   UNION
+   SELECT i+2, d, c FROM cte
+   WHERE i+2 < 65536/2
+)
+SELECT i,d,c FROM cte;
+set @@cte_max_recursion_depth = default;
+
+# insert one duplicate of each row
+INSERT INTO t select i, d, c FROM t;
+ANALYZE TABLE t;
+
+
+SELECT COUNT(*) FROM t;
+SELECT COUNT(*) FROM (SELECT DISTINCT i,d,c FROM t) derived;
+
+# so we can get reliable Created_tmp_files counts below
+FLUSH STATUS;
+SHOW STATUS LIKE 'Created_tmp_files%';
+
+SET SESSION optimizer_switch = 'hash_set_operations=off';
+SELECT * FROM (SELECT * FROM t INTERSECT SELECT * FROM t) AS derived ORDER BY i LIMIT 20;
+SHOW STATUS LIKE 'Created_tmp_files%';
+
+SET SESSION optimizer_switch = 'hash_set_operations=default';
+SELECT * FROM (SELECT * FROM t INTERSECT SELECT * FROM t) AS derived ORDER BY i LIMIT 20;
+SHOW STATUS LIKE 'Created_tmp_files%';
+FLUSH STATUS;
+SET SESSION setop_hash_buffer_size = 16384;
+--echo # The number of Created_tmp_files will be 386, which is
+--echo # 128*2 (build, probe chunks) for left operand + 128 (probe) for right operand
+--echo # + 2 (REMAININGINPUT for left and right operand) = 386
+--echo # The last 128 (probe chunk files for right operand), could have been avoided
+--echo # if we had a way to reset IO_CACHE files; now we do a close and open, cf.
+--echo # HashJoinChunk::Init.
+SELECT * FROM (SELECT * FROM t INTERSECT SELECT * FROM t) AS derived ORDER BY i LIMIT 20;
+SHOW STATUS LIKE 'Created_tmp_files%';
+
+SET SESSION setop_hash_buffer_size = default;
+
+--echo # Test spill correctness and secondary overflow, the latter
+--echo # using injection. This query with setop_hash_buffer_size ==
+--echo # 16384 will give 128 chunk files. With default setting it does
+--echo # not spill to disk.
+let $query = SELECT * FROM t INTERSECT SELECT * FROM t;
+
+SET SESSION optimizer_switch = 'hash_set_operations=off';
+eval CREATE TABLE no_hashing AS $query;
+
+SET SESSION optimizer_switch = 'hash_set_operations=default';
+eval CREATE TABLE hashing_no_spill AS $query;
+
+--echo # Compare old approach (no hashing) with hashing
+SET SESSION optimizer_switch = 'hash_set_operations=off';
+SELECT COUNT(*) FROM (SELECT * FROM no_hashing EXCEPT ALL SELECT * FROM hashing_no_spill) derived;
+SELECT COUNT(*) FROM (SELECT * FROM hashing_no_spill EXCEPT ALL SELECT * FROM no_hashing) derived;
+SET SESSION optimizer_switch = 'hash_set_operations=default';
+
+SET SESSION setop_hash_buffer_size = 16384;
+eval CREATE TABLE hashing_spill AS $query;
+
+--echo # Compare old approach (no hashing) with hashing w/spill
+SET SESSION optimizer_switch = 'hash_set_operations=off';
+SELECT COUNT(*) FROM (SELECT * FROM no_hashing EXCEPT ALL SELECT * FROM hashing_spill) derived;
+SELECT COUNT(*) FROM (SELECT * FROM hashing_spill EXCEPT ALL SELECT * FROM no_hashing) derived;
+
+SET SESSION optimizer_switch = 'hash_set_operations=default';
+SET SESSION setop_hash_buffer_size = default;
+
+DROP TABLE no_hashing, hashing_no_spill, hashing_spill;
+
+--echo #
+--echo # Test overflow in resulting tmp table
+--echo #
+SET SESSION optimizer_trace="enabled=on";
+
+let $show_trace=
+  SELECT JSON_PRETTY(JSON_EXTRACT(trace,"$.steps[*].join_execution"))
+  FROM information_schema.optimizer_trace;
+
+let $pattern=$elide_trace_costs_and_rows;
+# elide some sorting statistics:
+let $pattern=$pattern /num_initial_chunks_spilled_to_disk\": [0-9.]+/num_initial_chunks_spilled_to_disk\": "elided"/;
+let $pattern=$pattern /peak_memory_used\": [0-9.]+/peak_memory_used\": "elided"/;
+
+--echo # a) When we spill
+SET SESSION tmp_table_size=100000;
+SELECT * FROM (SELECT * FROM t INTERSECT SELECT * FROM t) AS derived ORDER BY i LIMIT 20;
+--replace_regex $pattern
+--skip_if_hypergraph
+eval $show_trace;
+
+--echo # b) With secondary overflow (part of query_expression_debug
+--echo # since it requires error injection)
+
+--echo # c) When we can hash in memory
+SET SESSION setop_hash_buffer_size=2621440;
+SELECT * FROM (SELECT * FROM t INTERSECT SELECT * FROM t) AS derived ORDER BY i LIMIT 20;
+--replace_regex $pattern
+--skip_if_hypergraph
+eval $show_trace;
+
+SET SESSION tmp_table_size=default;
+SET SESSION setop_hash_buffer_size=default;
+SET SESSION optimizer_trace="enabled=default";
+
+DROP TABLE t;
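Step b) is exercised in query_expression_debug, since it requires error injection in a debug build. Going by the commit message, such a test would inject the overflow roughly as follows (a sketch only; the actual debug test file is not shown in this excerpt):

  # Debug builds only: inject a secondary overflow at set 1, chunk 5, row 7
  SET SESSION setop_hash_buffer_size = 16384;
  SET SESSION debug_setop_secondary_overflow_at = '1 5 7';
  SELECT * FROM t INTERSECT SELECT * FROM t;
  # Expect "spill handling overflow, reverting to index" in the optimizer trace
  SET SESSION debug_setop_secondary_overflow_at = default;
  SET SESSION setop_hash_buffer_size = default;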

mysql-test/include/query_expression_big.inc

Lines changed: 0 additions & 83 deletions
This file was deleted.
