Skip to content

Commit 93386d7

Browse files
author
Alexander Barkov
committed
Bug#57737 Character sets: search fails with like, contraction, index
Problem: LIKE over an indexed column optimized away good results, because my_like_range_utf32/utf16 returned wrong ranges for contractions. Contraction-related code was missing in my_like_range_utf32/utf16, but did exist in my_like_range_ucs2/utf8. It was forgotten in utf32/utf16 versions (during mysql-6.0 push/revert mess). Fix: The patch removes individual functions my_like_range_ucs2, my_like_range_utf16, my_like_range_utf32 and introduces a single function my_like_range_generic() instead. The new function handles contractions correctly. It can handle any character set with cs->min_sort_char and cs->max_sort_char represented in Unicode code points. added: @ mysql-test/include/ctype_czech.inc @ mysql-test/include/ctype_like_ignorable.inc @ mysql-test/r/ctype_like_range.result @ mysql-test/t/ctype_like_range.test Adding tests modified: @ include/m_ctype.h - Adding helper functions for contractions. - Prototypes: removing ucs2,utf16,utf32 functions, adding generic function. @ mysql-test/r/ctype_uca.result @ mysql-test/r/ctype_utf16_uca.result @ mysql-test/r/ctype_utf32_uca.result @ mysql-test/t/ctype_uca.test @ mysql-test/t/ctype_utf16_uca.test @ mysql-test/t/ctype_utf32_uca.test - Adding tests. @ strings/ctype-mb.c - Pad function did not put the last character. - Implementing my_like_range_generic() - a universal replacement for three separate functions my_like_range_ucs2(), my_like_range_utf16() and my_like_range_utf32(), with correct contraction handling. @ strings/ctype-ucs2.c - my_fill_mb2 did not put the high byte, as previously it was used to put only characters in ASCII range. Now it puts high byte as well (needed to populate cs->max_sort_char correctly). - Adding DBUG_ASSERT() - Removing character set specific functions: my_like_range_ucs2(), my_like_range_utf16() and my_like_range_utf32(). - Using my_like_range_generic() instead of the old functions. @ strings/ctype-uca.c - Using generic function instead of the old character set specific ones. 
@ sql/item_create.cc @ sql/item_strfunc.cc @ sql/item_strfunc.h - Adding SQL functions LIKE_RANGE_MIN and LIKE_RANGE_MAX, available only in debug build to make sure like_range() works correctly for all character sets and collations.
1 parent 7d27f9c commit 93386d7

17 files changed

+3001
-344
lines changed

include/m_ctype.h

+35-20
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,32 @@ extern CHARSET_INFO my_charset_utf8mb4_unicode_ci;
356356
#define MY_UTF8MB4 "utf8mb4"
357357

358358

359+
/* Helper functions to handle contraction */
360+
/*
  Tell whether a character set has contraction data attached.

  @param cs  Character set to inspect; dereferenced unconditionally.
  @return    Non-zero when cs->contractions is not NULL, zero otherwise.
*/
static inline my_bool
361+
my_cs_have_contractions(CHARSET_INFO *cs)
362+
{
363+
return cs->contractions != NULL;
364+
}
365+
366+
/*
  Look up the "can start a contraction" flag for a character.

  Views cs->contractions as a char array and returns the byte stored at
  offset 0x40*0x40 + (wc & 0xFF); only the low 8 bits of wc select the flag.

  cs->contractions is dereferenced without a NULL check; presumably callers
  test my_cs_have_contractions() first -- TODO confirm.

  @param cs  Character set whose contraction data is consulted.
  @param wc  Character code; only (wc & 0xFF) is used.
  @return    The flag byte read from the table.
*/
static inline my_bool
367+
my_cs_can_be_contraction_head(CHARSET_INFO *cs, my_wc_t wc)
368+
{
369+
return ((const char *)cs->contractions)[0x40*0x40 + (wc & 0xFF)];
370+
}
371+
372+
/*
  Look up the "can end a contraction" flag for a character.

  Views cs->contractions as a char array and returns the byte stored at
  offset 0x40*0x40 + (wc & 0xFF); only the low 8 bits of wc select the flag.

  NOTE(review): the index expression is identical to the one used by
  my_cs_can_be_contraction_head(), so both functions read the same byte for
  a given wc -- confirm that head and tail flags intentionally share one
  table.

  cs->contractions is dereferenced without a NULL check; presumably callers
  test my_cs_have_contractions() first -- TODO confirm.

  @param cs  Character set whose contraction data is consulted.
  @param wc  Character code; only (wc & 0xFF) is used.
  @return    The flag byte read from the table.
*/
static inline my_bool
373+
my_cs_can_be_contraction_tail(CHARSET_INFO *cs, my_wc_t wc)
374+
{
375+
return ((const char *)cs->contractions)[0x40*0x40 + (wc & 0xFF)];
376+
}
377+
378+
/*
  Return the address of the weight slot for a two-character contraction.

  The slot is element (wc1 - 0x40) * 0x40 + wc2 - 0x40 of cs->contractions.
  No range check is performed on wc1 or wc2; presumably both are expected
  to lie in 0x40..0x7F so the index stays inside a 0x40 x 0x40 table --
  TODO confirm against callers.

  @param cs   Character set whose contraction data is indexed.
  @param wc1  First character of the pair.
  @param wc2  Second character of the pair.
  @return     Pointer into cs->contractions (not a copy of the weight).
*/
static inline uint16*
379+
my_cs_contraction2_weight(CHARSET_INFO *cs, my_wc_t wc1, my_wc_t wc2)
380+
{
381+
return &cs->contractions[(wc1 - 0x40) * 0x40 + wc2 - 0x40];
382+
}
383+
384+
359385
/* declarations for simple charsets */
360386
extern size_t my_strnxfrm_simple(CHARSET_INFO *, uchar *, size_t,
361387
const uchar *, size_t);
@@ -430,40 +456,29 @@ ulonglong my_strntoull10rnd_ucs2(CHARSET_INFO *cs,
430456

431457
void my_fill_8bit(CHARSET_INFO *cs, char* to, size_t l, int fill);
432458

459+
/* For 8-bit character set */
433460
my_bool my_like_range_simple(CHARSET_INFO *cs,
434461
const char *ptr, size_t ptr_length,
435462
pbool escape, pbool w_one, pbool w_many,
436463
size_t res_length,
437464
char *min_str, char *max_str,
438465
size_t *min_length, size_t *max_length);
439466

467+
/* For ASCII-based multi-byte character sets with mbminlen=1 */
440468
my_bool my_like_range_mb(CHARSET_INFO *cs,
441469
const char *ptr, size_t ptr_length,
442470
pbool escape, pbool w_one, pbool w_many,
443471
size_t res_length,
444472
char *min_str, char *max_str,
445473
size_t *min_length, size_t *max_length);
446474

447-
my_bool my_like_range_ucs2(CHARSET_INFO *cs,
448-
const char *ptr, size_t ptr_length,
449-
pbool escape, pbool w_one, pbool w_many,
450-
size_t res_length,
451-
char *min_str, char *max_str,
452-
size_t *min_length, size_t *max_length);
453-
454-
my_bool my_like_range_utf16(CHARSET_INFO *cs,
455-
const char *ptr, size_t ptr_length,
456-
pbool escape, pbool w_one, pbool w_many,
457-
size_t res_length,
458-
char *min_str, char *max_str,
459-
size_t *min_length, size_t *max_length);
460-
461-
my_bool my_like_range_utf32(CHARSET_INFO *cs,
462-
const char *ptr, size_t ptr_length,
463-
pbool escape, pbool w_one, pbool w_many,
464-
size_t res_length,
465-
char *min_str, char *max_str,
466-
size_t *min_length, size_t *max_length);
475+
/* For other character sets, with arbitrary mbminlen and mbmaxlen numbers */
476+
my_bool my_like_range_generic(CHARSET_INFO *cs,
477+
const char *ptr, size_t ptr_length,
478+
pbool escape, pbool w_one, pbool w_many,
479+
size_t res_length,
480+
char *min_str, char *max_str,
481+
size_t *min_length, size_t *max_length);
467482

468483
int my_wildcmp_8bit(CHARSET_INFO *,
469484
const char *str,const char *str_end,

mysql-test/include/ctype_czech.inc

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
SELECT @@collation_connection;
2+
--echo #
3+
--echo # Bug#57737 Character sets: search fails with like, contraction, index
4+
--echo #
5+
CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0;
6+
INSERT INTO t1 VALUES ('c'),('ce'),(''),('ch');
7+
SELECT * FROM t1 WHERE s1 LIKE 'c%';
8+
ALTER TABLE t1 ADD KEY s1 (s1);
9+
SELECT * FROM t1 WHERE s1 LIKE 'c%';
10+
ALTER TABLE t1 DROP KEY s1, ADD KEY(s1(1));
11+
SELECT * FROM t1 WHERE s1 LIKE 'ch';
12+
DROP TABLE t1;
mysql-test/include/ctype_like_ignorable.inc

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
SELECT @@collation_connection;
2+
--echo #
3+
--echo # Bug#57737 Character sets: search fails with like, contraction, index
4+
--echo # Part#2 - ignorable characters
5+
--echo #
6+
CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS s1 LIMIT 0;
7+
INSERT INTO t1 VALUES ('a\0\0\0\0\0\t'),('a'),('b'),('c'),('d'),('e');
8+
SELECT HEX(s1) FROM t1 WHERE s1 LIKE 'a%';
9+
ALTER TABLE t1 ADD KEY s1 (s1);
10+
SELECT HEX(s1) FROM t1 WHERE s1 LIKE 'a%';
11+
DROP TABLE t1;

0 commit comments

Comments
 (0)