WL#5624 Collation customization improvements

Alexander Barkov · Alexander Barkov · commit adcf4a2b661b · 2011-02-22T16:25:05.000+03:00
Extension for the original patch to handle "bulk" shift.

  @ mysql-test/r/ctype_ldml.result
  @ mysql-test/std_data/Index.xml
  @ mysql-test/t/ctype_ldml.test
  Adding tests

  @ strings/ctype-uca.c
  @ strings/ctype.c
  shift-after-method="expand" collation option.
diff --git a/mysql-test/r/ctype_ldml.result b/mysql-test/r/ctype_ldml.result
@@ -468,6 +468,7 @@ utf8_5624_4	utf8	357			8
 ucs2_test_ci	ucs2	358			8
 ucs2_vn_ci	ucs2	359			8
 ucs2_5624_1	ucs2	360			8
+utf8_5624_5	utf8	368			8
 utf32_test_ci	utf32	391			8
 utf8_maxuserid_ci	utf8	2047			8
 show collation like '%test%';
@@ -1053,5 +1054,82 @@ wa	GROUP_CONCAT(HEX(CONVERT(a USING ucs2)) ORDER BY LENGTH(a), BINARY a)
 15D4	09B909CD
 DROP TABLE t1;
 #
+# WL#5624, shift after, using expansion
+#
+SET NAMES utf8 COLLATE utf8_5624_5;
+CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS a LIMIT 0;
+INSERT INTO t1 VALUES ('0'),('1'),('0z'),(_ucs2 0x0030FF9D);
+INSERT INTO t1 VALUES ('a'),('b'),('c'),('d'),('e'),('f'),('g'),('h'),('i');
+INSERT INTO t1 VALUES ('j'),('k'),('l'),('m'),('n'),('o'),('p'),('q'),('r');
+INSERT INTO t1 VALUES ('s'),('t'),('u'),('v'),('w'),('x'),('y'),('z');
+INSERT INTO t1 VALUES ('aa'),('aaa');
+INSERT INTO t1 VALUES ('A'),('B'),('C'),('D'),('E'),('F'),('G'),('H'),('I');
+INSERT INTO t1 VALUES ('J'),('K'),('L'),('M'),('N'),('O'),('P'),('Q'),('R');
+INSERT INTO t1 VALUES ('S'),('T'),('U'),('V'),('W'),('X'),('Y'),('Z');
+INSERT INTO t1 VALUES ('AA'),('AAA');
+SELECT a, HEX(WEIGHT_STRING(a)) FROM t1 ORDER BY a, LENGTH(a), BINARY(a);
+a	HEX(WEIGHT_STRING(a))
+0	0E29
+0z	0E290E292357
+0ﾝ	0E291E81
+a	0E29233E
+b	0E29233F
+c	0E292340
+d	0E292341
+e	0E292342
+f	0E292343
+g	0E292344
+h	0E292345
+i	0E292346
+j	0E292347
+k	0E292348
+l	0E292349
+m	0E29234A
+n	0E29234B
+o	0E29234C
+p	0E29234D
+q	0E29234E
+r	0E29234F
+s	0E292350
+t	0E292351
+u	0E292352
+v	0E292353
+w	0E292354
+x	0E292355
+y	0E292356
+z	0E292357
+aa	0E292358
+aaa	0E292359
+A	0E29333E
+B	0E29333F
+C	0E293340
+D	0E293341
+E	0E293342
+F	0E293343
+G	0E293344
+H	0E293345
+I	0E293346
+J	0E293347
+K	0E293348
+L	0E293349
+M	0E29334A
+N	0E29334B
+O	0E29334C
+P	0E29334D
+Q	0E29334E
+R	0E29334F
+S	0E293350
+T	0E293351
+U	0E293352
+V	0E293353
+W	0E293354
+X	0E293355
+Y	0E293356
+Z	0E293357
+AA	0E293358
+AAA	0E293359
+1	0E2A
+DROP TABLE t1;
+#
 # End of WL#5624
 #
diff --git a/mysql-test/std_data/Index.xml b/mysql-test/std_data/Index.xml
@@ -105,6 +105,22 @@
       </rules>
     </collation>
 
+    <!-- shift after using expansion -->
+    <collation name="utf8_5624_5" id="368" shift-after-method="expand">
+      <rules>
+        <!--
+           Put small basic Latin letters between 0 and 1.
+           Simple shift method would not work, because there is no
+           weight space between 0 and 1 in DUCET.
+           Also, to test it works with contractions, put some after 'z'.
+        -->
+        <reset>0</reset>
+        <pc>abcdefghijklmnopqrstuvwxyz</pc><p>aa</p><p>aaa</p>
+        <reset before="primary">1</reset>
+        <pc>ABCDEFGHIJKLMNOPQRSTUVWXYZ</pc><p>AA</p><p>AAA</p>
+      </rules>
+    </collation>
+
    <collation name="utf8_hugeid_ci" id="2047000000">
       <rules>
         <reset>a</reset>
diff --git a/mysql-test/t/ctype_ldml.test b/mysql-test/t/ctype_ldml.test
@@ -346,6 +346,25 @@ GROUP_CONCAT(HEX(CONVERT(a USING ucs2)) ORDER BY LENGTH(a), BINARY a)
 FROM t1 GROUP BY a ORDER BY a;
 DROP TABLE t1;
 
+--echo #
+--echo # WL#5624, shift after, using expansion
+--echo #
+SET NAMES utf8 COLLATE utf8_5624_5;
+CREATE TABLE t1 AS SELECT REPEAT(' ', 10) AS a LIMIT 0;
+INSERT INTO t1 VALUES ('0'),('1'),('0z'),(_ucs2 0x0030FF9D);
+INSERT INTO t1 VALUES ('a'),('b'),('c'),('d'),('e'),('f'),('g'),('h'),('i');
+INSERT INTO t1 VALUES ('j'),('k'),('l'),('m'),('n'),('o'),('p'),('q'),('r');
+INSERT INTO t1 VALUES ('s'),('t'),('u'),('v'),('w'),('x'),('y'),('z');
+INSERT INTO t1 VALUES ('aa'),('aaa');
+INSERT INTO t1 VALUES ('A'),('B'),('C'),('D'),('E'),('F'),('G'),('H'),('I');
+INSERT INTO t1 VALUES ('J'),('K'),('L'),('M'),('N'),('O'),('P'),('Q'),('R');
+INSERT INTO t1 VALUES ('S'),('T'),('U'),('V'),('W'),('X'),('Y'),('Z');
+INSERT INTO t1 VALUES ('AA'),('AAA');
+
+SELECT a, HEX(WEIGHT_STRING(a)) FROM t1 ORDER BY a, LENGTH(a), BINARY(a);
+DROP TABLE t1;
+
+
 --echo #
 --echo # End of WL#5624
 --echo #
diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c
@@ -20929,6 +20929,18 @@ my_coll_rule_reset(MY_COLL_RULE *r)
 }
 
 
+/*
+  Shift methods:
+  Simple: "&B < C" : weight('C') = weight('B') + 1
+  Expand: weight('C') = { weight('B'), weight(last_non_ignorable) + 1 }
+*/
+typedef enum
+{
+  my_shift_method_simple= 0,
+  my_shift_method_expand
+} my_coll_shift_method;
+
+
 typedef struct my_coll_rules_st
 {
   uint version;              /* Unicode version, e.g. 400 or 520  */
@@ -20937,6 +20949,7 @@ typedef struct my_coll_rules_st
   size_t mrules;             /* Number of allocated rules         */
   MY_COLL_RULE *rule;        /* Rule array                        */
   MY_CHARSET_LOADER *loader;
+  my_coll_shift_method shift_after_method;
 } MY_COLL_RULES;
 
 
@@ -21204,6 +21217,14 @@ my_coll_parser_scan_setting(MY_COLL_RULE_PARSER *p)
     rules->version= 520;
     rules->uca= &my_uca_v520;
   }
+  else if (!lex_cmp(lexem, C_STRING_WITH_LEN("[shift-after-method expand]")))
+  {
+    rules->shift_after_method= my_shift_method_expand;
+  }
+  else if (!lex_cmp(lexem, C_STRING_WITH_LEN("[shift-after-method simple]")))
+  {
+    rules->shift_after_method= my_shift_method_simple;
+  }
   else
   {
     return 0;
@@ -21415,7 +21436,8 @@ my_coll_parser_scan_reset_sequence(MY_COLL_RULE_PARSER *p)
       return 0;
   }
 
-  if (p->rule.before_level == 1) /* Apply "before primary" option  */
+  if (p->rules->shift_after_method == my_shift_method_expand ||
+      p->rule.before_level == 1) /* Apply "before primary" option  */
   {
     /*
       Suppose we have this rule:  &B[before primary] < C
@@ -21435,6 +21457,10 @@ my_coll_parser_scan_reset_sequence(MY_COLL_RULE_PARSER *p)
 
       We'll compose weight for C as: [BBBB-1][MMMM+1]
       where [MMMM] is weight for "last_non_ignorable".
+      
+      We also do the same trick for "reset after" if the collation
+      option says so. E.g. for the rules "&B < C", weight for
+      C will be calculated as: [BBBB][MMMM+1]
 
       At this point we only need to store codepoints
       'B' and 'last_non_ignorable'. Actual weights for 'C'
@@ -21924,7 +21950,27 @@ create_tailoring(CHARSET_INFO *cs, MY_CHARSET_LOADER *loader)
       if (r->before_level == 1) /* Apply "&[before primary]" */
       {
         if (nweights >= 2)
+        {
           to[nweights - 2]--; /* Reset before */
+          if (rules.shift_after_method == my_shift_method_expand)
+          {
+            /*
+              Special case. Don't let characters shifted after X
+              and before next(X) intermix with each other.
+              
+              For example:
+              "[shift-after-method expand] &0 < a &[before primary]1 < A".
+              I.e. we reorder 'a' after '0', and then 'A' before '1'.
+              'a' must be sorted before 'A'.
+              
+              Note, there are no real collations in CLDR which shift
+              after and before two neighbouring characters. We need this
+              just in case. Reserving 4096 (0x1000) weights for such
+              cases is more than enough.
+            */
+            to[nweights - 1]+= 0x1000;
+          }
+        }
         else
         {
           my_snprintf(loader->error, sizeof(loader->error),
diff --git a/strings/ctype.c b/strings/ctype.c
@@ -88,6 +88,7 @@ struct my_cs_file_section_st
 #define _CS_UCA_VERSION                 100
 #define _CS_CL_SUPPRESS_CONTRACTIONS    101
 #define _CS_CL_OPTIMIZE                 102
+#define _CS_CL_SHIFT_AFTER_METHOD       103
 
 
 /* Collation Settings */
@@ -187,6 +188,7 @@ static struct my_cs_file_section_st sec[] =
   {_CS_UCA_VERSION,              "charsets/charset/collation/version"},
   {_CS_CL_SUPPRESS_CONTRACTIONS, "charsets/charset/collation/suppress_contractions"},
   {_CS_CL_OPTIMIZE,              "charsets/charset/collation/optimize"},
+  {_CS_CL_SHIFT_AFTER_METHOD,    "charsets/charset/collation/shift-after-method"},
 
   /* Collation Settings */
   {_CS_ST_SETTINGS,              "charsets/charset/collation/settings"},
@@ -646,6 +648,10 @@ static int cs_value(MY_XML_PARSER *st,const char *attr, size_t len)
     rc= tailoring_append(st, "[optimize %.*s]", len, attr);
     break;
 
+  case _CS_CL_SHIFT_AFTER_METHOD:
+    rc= tailoring_append(st, "[shift-after-method %.*s]", len, attr);
+    break;
+
   /* Collation Settings */
   case _CS_ST_STRENGTH:
     /* 1, 2, 3, 4, 5, or primary, secondary, tertiary, quaternary, identical */