Skip to content

Commit adcf4a2

Browse files
author
Alexander Barkov
committed
WL#5624 Collation customization improvements
Extension for the original patch to handle "bulk" shift. @ mysql-test/r/ctype_ldml.result @ mysql-test/std_data/Index.xml @ mysql-test/t/ctype_ldml.test Adding tests @ strings/ctype-uca.c @ strings/ctype.c shift-after-method="expand" collation option.
1 parent d0b1660 commit adcf4a2

File tree

5 files changed

+166
-1
lines changed

5 files changed

+166
-1
lines changed

mysql-test/r/ctype_ldml.result

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -468,6 +468,7 @@ utf8_5624_4 utf8 357 8
468468
ucs2_test_ci ucs2 358 8
469469
ucs2_vn_ci ucs2 359 8
470470
ucs2_5624_1 ucs2 360 8
471+
utf8_5624_5 utf8 368 8
471472
utf32_test_ci utf32 391 8
472473
utf8_maxuserid_ci utf8 2047 8
473474
show collation like '%test%';
@@ -1053,5 +1054,82 @@ wa GROUP_CONCAT(HEX(CONVERT(a USING ucs2)) ORDER BY LENGTH(a), BINARY a)
10531054
15D4 09B909CD
10541055
DROP TABLE t1;
10551056
#
1057+
# WL#5624, shift after, using expansion
1058+
#
1059+
SET NAMES utf8 COLLATE utf8_5624_5;
1060+
CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS a LIMIT 0;
1061+
INSERT INTO t1 VALUES ('0'),('1'),('0z'),(_ucs2 0x0030FF9D);
1062+
INSERT INTO t1 VALUES ('a'),('b'),('c'),('d'),('e'),('f'),('g'),('h'),('i');
1063+
INSERT INTO t1 VALUES ('j'),('k'),('l'),('m'),('n'),('o'),('p'),('q'),('r');
1064+
INSERT INTO t1 VALUES ('s'),('t'),('u'),('v'),('w'),('x'),('y'),('z');
1065+
INSERT INTO t1 VALUES ('aa'),('aaa');
1066+
INSERT INTO t1 VALUES ('A'),('B'),('C'),('D'),('E'),('F'),('G'),('H'),('I');
1067+
INSERT INTO t1 VALUES ('J'),('K'),('L'),('M'),('N'),('O'),('P'),('Q'),('R');
1068+
INSERT INTO t1 VALUES ('S'),('T'),('U'),('V'),('W'),('X'),('Y'),('Z');
1069+
INSERT INTO t1 VALUES ('AA'),('AAA');
1070+
SELECT a, HEX(WEIGHT_STRING(a)) FROM t1 ORDER BY a, LENGTH(a), BINARY(a);
1071+
a HEX(WEIGHT_STRING(a))
1072+
0 0E29
1073+
0z 0E290E292357
1074+
0ン 0E291E81
1075+
a 0E29233E
1076+
b 0E29233F
1077+
c 0E292340
1078+
d 0E292341
1079+
e 0E292342
1080+
f 0E292343
1081+
g 0E292344
1082+
h 0E292345
1083+
i 0E292346
1084+
j 0E292347
1085+
k 0E292348
1086+
l 0E292349
1087+
m 0E29234A
1088+
n 0E29234B
1089+
o 0E29234C
1090+
p 0E29234D
1091+
q 0E29234E
1092+
r 0E29234F
1093+
s 0E292350
1094+
t 0E292351
1095+
u 0E292352
1096+
v 0E292353
1097+
w 0E292354
1098+
x 0E292355
1099+
y 0E292356
1100+
z 0E292357
1101+
aa 0E292358
1102+
aaa 0E292359
1103+
A 0E29333E
1104+
B 0E29333F
1105+
C 0E293340
1106+
D 0E293341
1107+
E 0E293342
1108+
F 0E293343
1109+
G 0E293344
1110+
H 0E293345
1111+
I 0E293346
1112+
J 0E293347
1113+
K 0E293348
1114+
L 0E293349
1115+
M 0E29334A
1116+
N 0E29334B
1117+
O 0E29334C
1118+
P 0E29334D
1119+
Q 0E29334E
1120+
R 0E29334F
1121+
S 0E293350
1122+
T 0E293351
1123+
U 0E293352
1124+
V 0E293353
1125+
W 0E293354
1126+
X 0E293355
1127+
Y 0E293356
1128+
Z 0E293357
1129+
AA 0E293358
1130+
AAA 0E293359
1131+
1 0E2A
1132+
DROP TABLE t1;
1133+
#
10561134
# End of WL#5624
10571135
#

mysql-test/std_data/Index.xml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,22 @@
105105
</rules>
106106
</collation>
107107

108+
<!-- shift after using expansion -->
109+
<collation name="utf8_5624_5" id="368" shift-after-method="expand">
110+
<rules>
111+
<!--
112+
Put small basic Latin letters between 0 and 1.
113+
Simple shift method would not work, because there is no
114+
weight space between 0 and 1 in DUCET.
115+
Also, to test it works with contractions, put some after 'z'.
116+
-->
117+
<reset>0</reset>
118+
<pc>abcdefghijklmnopqrstuvwxyz</pc><p>aa</p><p>aaa</p>
119+
<reset before="primary">1</reset>
120+
<pc>ABCDEFGHIJKLMNOPQRSTUVWXYZ</pc><p>AA</p><p>AAA</p>
121+
</rules>
122+
</collation>
123+
108124
<collation name="utf8_hugeid_ci" id="2047000000">
109125
<rules>
110126
<reset>a</reset>

mysql-test/t/ctype_ldml.test

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,25 @@ GROUP_CONCAT(HEX(CONVERT(a USING ucs2)) ORDER BY LENGTH(a), BINARY a)
346346
FROM t1 GROUP BY a ORDER BY a;
347347
DROP TABLE t1;
348348

349+
--echo #
350+
--echo # WL#5624, shift after, using expansion
351+
--echo #
352+
SET NAMES utf8 COLLATE utf8_5624_5;
353+
CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS a LIMIT 0;
354+
INSERT INTO t1 VALUES ('0'),('1'),('0z'),(_ucs2 0x0030FF9D);
355+
INSERT INTO t1 VALUES ('a'),('b'),('c'),('d'),('e'),('f'),('g'),('h'),('i');
356+
INSERT INTO t1 VALUES ('j'),('k'),('l'),('m'),('n'),('o'),('p'),('q'),('r');
357+
INSERT INTO t1 VALUES ('s'),('t'),('u'),('v'),('w'),('x'),('y'),('z');
358+
INSERT INTO t1 VALUES ('aa'),('aaa');
359+
INSERT INTO t1 VALUES ('A'),('B'),('C'),('D'),('E'),('F'),('G'),('H'),('I');
360+
INSERT INTO t1 VALUES ('J'),('K'),('L'),('M'),('N'),('O'),('P'),('Q'),('R');
361+
INSERT INTO t1 VALUES ('S'),('T'),('U'),('V'),('W'),('X'),('Y'),('Z');
362+
INSERT INTO t1 VALUES ('AA'),('AAA');
363+
364+
SELECT a, HEX(WEIGHT_STRING(a)) FROM t1 ORDER BY a, LENGTH(a), BINARY(a);
365+
DROP TABLE t1;
366+
367+
349368
--echo #
350369
--echo # End of WL#5624
351370
--echo #

strings/ctype-uca.c

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20929,6 +20929,18 @@ my_coll_rule_reset(MY_COLL_RULE *r)
2092920929
}
2093020930

2093120931

20932+
/*
20933+
Shift methods:
20934+
Simple: "&B < C" : weight('C') = weight('B') + 1
20935+
Expand: "&B < C" : weight('C') = { weight('B'), weight(last_non_ignorable) + 1 }
20936+
*/
20937+
typedef enum
20938+
{
20939+
my_shift_method_simple= 0,
20940+
my_shift_method_expand
20941+
} my_coll_shift_method;
20942+
20943+
2093220944
typedef struct my_coll_rules_st
2093320945
{
2093420946
uint version; /* Unicode version, e.g. 400 or 520 */
@@ -20937,6 +20949,7 @@ typedef struct my_coll_rules_st
2093720949
size_t mrules; /* Number of allocated rules */
2093820950
MY_COLL_RULE *rule; /* Rule array */
2093920951
MY_CHARSET_LOADER *loader;
20952+
my_coll_shift_method shift_after_method;
2094020953
} MY_COLL_RULES;
2094120954

2094220955

@@ -21204,6 +21217,14 @@ my_coll_parser_scan_setting(MY_COLL_RULE_PARSER *p)
2120421217
rules->version= 520;
2120521218
rules->uca= &my_uca_v520;
2120621219
}
21220+
else if (!lex_cmp(lexem, C_STRING_WITH_LEN("[shift-after-method expand]")))
21221+
{
21222+
rules->shift_after_method= my_shift_method_expand;
21223+
}
21224+
else if (!lex_cmp(lexem, C_STRING_WITH_LEN("[shift-after-method simple]")))
21225+
{
21226+
rules->shift_after_method= my_shift_method_simple;
21227+
}
2120721228
else
2120821229
{
2120921230
return 0;
@@ -21415,7 +21436,8 @@ my_coll_parser_scan_reset_sequence(MY_COLL_RULE_PARSER *p)
2141521436
return 0;
2141621437
}
2141721438

21418-
if (p->rule.before_level == 1) /* Apply "before primary" option */
21439+
if (p->rules->shift_after_method == my_shift_method_expand ||
21440+
p->rule.before_level == 1) /* Apply "before primary" option */
2141921441
{
2142021442
/*
2142121443
Suppose we have this rule: &B[before primary] < C
@@ -21435,6 +21457,10 @@ my_coll_parser_scan_reset_sequence(MY_COLL_RULE_PARSER *p)
2143521457

2143621458
We'll compose weight for C as: [BBBB-1][MMMM+1]
2143721459
where [MMMM] is weight for "last_non_ignorable".
21460+
21461+
We also do the same trick for "reset after" if the collation
21462+
option says so. E.g. for the rules "&B < C", weight for
21463+
C will be calculated as: [BBBB][MMMM+1]
2143821464

2143921465
At this point we only need to store codepoints
2144021466
'B' and 'last_non_ignorable'. Actual weights for 'C'
@@ -21924,7 +21950,27 @@ create_tailoring(CHARSET_INFO *cs, MY_CHARSET_LOADER *loader)
2192421950
if (r->before_level == 1) /* Apply "&[before primary]" */
2192521951
{
2192621952
if (nweights >= 2)
21953+
{
2192721954
to[nweights - 2]--; /* Reset before */
21955+
if (rules.shift_after_method == my_shift_method_expand)
21956+
{
21957+
/*
21958+
Special case. Don't let characters shifted after X
21959+
and before next(X) intermix with each other.
21960+
21961+
For example:
21962+
"[shift-after-method expand] &0 < a &[before primary]1 < A".
21963+
I.e. we reorder 'a' after '0', and then 'A' before '1'.
21964+
'a' must be sorted before 'A'.
21965+
21966+
Note, there are no real collations in CLDR which shift
21967+
after and before two neighbouring characters. We need this
21968+
just in case. Reserving 4096 (0x1000) weights for such
21969+
cases is perfectly enough.
21970+
*/
21971+
to[nweights - 1]+= 0x1000;
21972+
}
21973+
}
2192821974
else
2192921975
{
2193021976
my_snprintf(loader->error, sizeof(loader->error),

strings/ctype.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ struct my_cs_file_section_st
8888
#define _CS_UCA_VERSION 100
8989
#define _CS_CL_SUPPRESS_CONTRACTIONS 101
9090
#define _CS_CL_OPTIMIZE 102
91+
#define _CS_CL_SHIFT_AFTER_METHOD 103
9192

9293

9394
/* Collation Settings */
@@ -187,6 +188,7 @@ static struct my_cs_file_section_st sec[] =
187188
{_CS_UCA_VERSION, "charsets/charset/collation/version"},
188189
{_CS_CL_SUPPRESS_CONTRACTIONS, "charsets/charset/collation/suppress_contractions"},
189190
{_CS_CL_OPTIMIZE, "charsets/charset/collation/optimize"},
191+
{_CS_CL_SHIFT_AFTER_METHOD, "charsets/charset/collation/shift-after-method"},
190192

191193
/* Collation Settings */
192194
{_CS_ST_SETTINGS, "charsets/charset/collation/settings"},
@@ -646,6 +648,10 @@ static int cs_value(MY_XML_PARSER *st,const char *attr, size_t len)
646648
rc= tailoring_append(st, "[optimize %.*s]", len, attr);
647649
break;
648650

651+
case _CS_CL_SHIFT_AFTER_METHOD:
652+
rc= tailoring_append(st, "[shift-after-method %.*s]", len, attr);
653+
break;
654+
649655
/* Collation Settings */
650656
case _CS_ST_STRENGTH:
651657
/* 1, 2, 3, 4, 5, or primary, secondary, tertiary, quaternary, identical */

0 commit comments

Comments
 (0)