Skip to content

Commit 80ba6c4

Browse files
author
bin.x.su@oracle.com
committed
WL#4024 gb18030 Chinese character set
1 parent f0d7a37 commit 80ba6c4

File tree

73 files changed

+139429
-640
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

73 files changed

+139429
-640
lines changed

client/mysqltest.cc

+19-1
Original file line numberDiff line numberDiff line change
@@ -6317,7 +6317,25 @@ int read_line(char *buf, int size)
63176317
{
63186318
/* Could be a multibyte character */
63196319
/* This code is based on the code in "sql_load.cc" */
6320-
uint charlen= my_mbcharlen(charset_info, (unsigned char) c);
6320+
uint charlen;
6321+
if (my_mbmaxlenlen(charset_info) == 1)
6322+
charlen= my_mbcharlen(charset_info, (unsigned char) c);
6323+
else
6324+
{
6325+
if (!(charlen= my_mbcharlen(charset_info, (unsigned char) c)))
6326+
{
6327+
char c1= my_getc(cur_file->file);
6328+
if (c1 == EOF)
6329+
{
6330+
*p++= c;
6331+
goto found_eof;
6332+
}
6333+
6334+
charlen= my_mbcharlen_2(charset_info, (unsigned char) c,
6335+
(unsigned char) c1);
6336+
my_ungetc(c1);
6337+
}
6338+
}
63216339
if(charlen == 0)
63226340
DBUG_RETURN(1);
63236341
/* We give up if multibyte character is started but not */

cmake/character_sets.cmake

+5-5
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2009, 2013, Oracle and/or its affiliates. All rights reserved.
1+
# Copyright (c) 2009, 2014, Oracle and/or its affiliates. All rights reserved.
22
#
33
# This program is free software; you can redistribute it and/or modify
44
# it under the terms of the GNU General Public License as published by
@@ -24,13 +24,13 @@ ENDIF()
2424

2525
SET(CHARSETS ${DEFAULT_CHARSET} latin1 utf8 utf8mb4)
2626
SET(CHARSETS_COMPLEX
27-
big5 cp1250 cp932 eucjpms euckr gb2312 gbk latin1 latin2
27+
big5 cp1250 cp932 eucjpms euckr gb2312 gbk gb18030 latin1 latin2
2828
sjis tis620 ucs2 ujis utf8 utf8mb4 utf16 utf16le utf32)
2929

30-
SET(CHARSETS_AVAILABLE
30+
SET(CHARSETS_AVAILABLE
3131
binary armscii8 ascii big5 cp1250 cp1251 cp1256 cp1257
32-
cp850 cp852 cp866 cp932 dec8 eucjpms euckr gb2312 gbk geostd8
33-
greek hebrew hp8 keybcs2 koi8r koi8u
32+
cp850 cp852 cp866 cp932 dec8 eucjpms euckr gb2312 gbk gb18030
33+
geostd8 greek hebrew hp8 keybcs2 koi8r koi8u
3434
latin1 latin2 latin5 latin7 macce macroman
3535
sjis swe7 tis620 ucs2 ujis utf8 utf8mb4 utf16 utf16le utf32)
3636

config.h.cmake

+1
Original file line numberDiff line numberDiff line change
@@ -400,6 +400,7 @@
400400
#cmakedefine HAVE_CHARSET_euckr 1
401401
#cmakedefine HAVE_CHARSET_gb2312 1
402402
#cmakedefine HAVE_CHARSET_gbk 1
403+
#cmakedefine HAVE_CHARSET_gb18030 1
403404
#cmakedefine HAVE_CHARSET_geostd8 1
404405
#cmakedefine HAVE_CHARSET_greek 1
405406
#cmakedefine HAVE_CHARSET_hebrew 1

include/m_ctype.h

+37-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved.
1+
/* Copyright (c) 2000, 2014, Oracle and/or its affiliates. All rights reserved.
22
33
This program is free software; you can redistribute it and/or modify
44
it under the terms of the GNU General Public License as published by
@@ -423,6 +423,7 @@ typedef struct charset_info_st
423423
uchar casedn_multiply;
424424
uint mbminlen;
425425
uint mbmaxlen;
426+
uint mbmaxlenlen;
426427
my_wc_t min_sort_char;
427428
my_wc_t max_sort_char; /* For LIKE optimization */
428429
uchar pad_char;
@@ -454,6 +455,8 @@ extern CHARSET_INFO my_charset_gb2312_chinese_ci;
454455
extern CHARSET_INFO my_charset_gb2312_bin;
455456
extern CHARSET_INFO my_charset_gbk_chinese_ci;
456457
extern CHARSET_INFO my_charset_gbk_bin;
458+
extern CHARSET_INFO my_charset_gb18030_chinese_ci;
459+
extern CHARSET_INFO my_charset_gb18030_bin;
457460
extern CHARSET_INFO my_charset_latin1_german2_ci;
458461
extern CHARSET_INFO my_charset_latin1_bin;
459462
extern CHARSET_INFO my_charset_latin2_czech_ci;
@@ -722,6 +725,8 @@ uint32 my_convert(char *to, uint32 to_length, const CHARSET_INFO *to_cs,
722725
const char *from, uint32 from_length,
723726
const CHARSET_INFO *from_cs, uint *errors);
724727

728+
uint my_mbcharlen_ptr(const CHARSET_INFO *cs, const char *s, const char *e);
729+
725730
#define _MY_U 01 /* Upper case */
726731
#define _MY_L 02 /* Lower case */
727732
#define _MY_NMR 04 /* Numeral (digit) */
@@ -769,6 +774,37 @@ uint32 my_convert(char *to, uint32 to_length, const CHARSET_INFO *to_cs,
769774
#define use_mb(s) ((s)->cset->ismbchar != NULL)
770775
#define my_ismbchar(s, a, b) ((s)->cset->ismbchar((s), (a), (b)))
771776
#define my_mbcharlen(s, a) ((s)->cset->mbcharlen((s),(a)))
777+
/**
778+
Get the length of a gb18030 character from its two leading bytes
779+
780+
@param[in] s charset_info
781+
@param[in] a first byte of gb18030 code
782+
@param[in] b second byte of gb18030 code
783+
@return the length of gb18030 code starting with given two bytes,
784+
the length would be 2 or 4
785+
*/
786+
#define my_mbcharlen_2(s, a, b) ((s)->cset->mbcharlen((s),((((a) & 0xFF) << 8) + ((b) & 0xFF))))
787+
/**
788+
Get the maximum length of leading bytes needed to determine the length of a
789+
multi-byte gb18030 code
790+
791+
@param[in] s charset_info
792+
@return number of leading bytes we need, would be 2 for gb18030
793+
and 1 for all other charsets
794+
*/
795+
#define my_mbmaxlenlen(s) ((s)->mbmaxlenlen)
796+
/**
797+
Judge if the given byte is a possible leading byte for a charset.
798+
For gb18030 whose mbmaxlenlen is 2, we can't determine the length of
799+
a multi-byte character by looking at the first byte only. Note that this macro expands its arguments more than once, so they must be free of side effects.
800+
801+
@param[in] s charset_info
802+
@param[in] i possible leading byte
803+
@return true if it is, otherwise false
804+
*/
805+
#define my_ismb1st(s, i) \
806+
(my_mbcharlen((s), (i)) > 1 || \
807+
(my_mbmaxlenlen((s)) == 2 && my_mbcharlen((s), (i)) == 0))
772808

773809
#define my_caseup_str(s, a) ((s)->cset->caseup_str((s), (a)))
774810
#define my_casedn_str(s, a) ((s)->cset->casedn_str((s), (a)))

mysql-test/include/have_gb18030.inc

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
-- require r/have_gb18030.require
2+
disable_query_log;
3+
show collation like 'gb18030_chinese_ci';
4+
enable_query_log;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# The weight of 0x81309D30 is the same as that of its capital letter 0x81309C39
2+
SELECT collation(CAST(0x81309D30 AS CHAR));
3+
SELECT HEX(WEIGHT_STRING(CAST(0x6141 AS CHAR)));
4+
SELECT HEX(WEIGHT_STRING(CAST(0x81309D30 AS CHAR)));
5+
SELECT HEX(WEIGHT_STRING(CAST(0x81309D30 AS CHAR) AS CHAR(1)));
6+
SELECT HEX(WEIGHT_STRING(CAST(0x81309D3081309D30 AS CHAR) AS CHAR(1)));
7+
SELECT HEX(WEIGHT_STRING(CAST(0x81309D30 AS CHAR) AS CHAR(3)));
8+
SELECT HEX(WEIGHT_STRING(CAST(0x81309D3081309D30 AS CHAR) AS CHAR(3)));
9+
SELECT HEX(WEIGHT_STRING(CAST(0x4081309D30 AS CHAR) AS CHAR(3)));
10+
SELECT HEX(WEIGHT_STRING(CAST(0x4081309D3081309D30 AS CHAR) AS CHAR(3)));
11+
SELECT HEX(WEIGHT_STRING(CAST(0x4081309D3081309D3081309D30 AS CHAR) AS CHAR(3)));
12+
SELECT HEX(WEIGHT_STRING(CAST(0x404081309D3081309D3081309D30 AS CHAR) AS CHAR(3)));
13+
14+
# Read 2 characters from the source string (the last character is not used)
15+
SELECT HEX(WEIGHT_STRING(CAST(0x81309D3081309D3081309D30 AS CHAR), 1, 2, 0xC0));
16+
SELECT HEX(WEIGHT_STRING(CAST(0x81309D3081309D3081309D30 AS CHAR), 2, 2, 0xC0));
17+
SELECT HEX(WEIGHT_STRING(CAST(0x81309D3081309D3081309D30 AS CHAR), 3, 2, 0xC0));
18+
SELECT HEX(WEIGHT_STRING(CAST(0x81309D3081309D3081309D30 AS CHAR), 4, 2, 0xC0));
19+
SELECT HEX(WEIGHT_STRING(CAST(0x81309D3081309D3081309D30 AS CHAR), 5, 2, 0xC0));
20+
SELECT HEX(WEIGHT_STRING(CAST(0x81309D3081309D3081309D30 AS CHAR), 25, 2, 0xC0));
21+
22+
# Read 3 characters from the source string (the entire string is used)
23+
SELECT HEX(WEIGHT_STRING(CAST(0x81309D3081309D3081309D30 AS CHAR), 1, 3, 0xC0));
24+
SELECT HEX(WEIGHT_STRING(CAST(0x81309D3081309D3081309D30 AS CHAR), 2, 3, 0xC0));
25+
SELECT HEX(WEIGHT_STRING(CAST(0x81309D3081309D3081309D30 AS CHAR), 3, 3, 0xC0));
26+
SELECT HEX(WEIGHT_STRING(CAST(0x81309D3081309D3081309D30 AS CHAR), 4, 3, 0xC0));
27+
SELECT HEX(WEIGHT_STRING(CAST(0x81309D3081309D3081309D30 AS CHAR), 5, 3, 0xC0));
28+
SELECT HEX(WEIGHT_STRING(CAST(0x81309D3081309D3081309D30 AS CHAR), 25, 3, 0xC0));
29+
30+
# Read 4 characters from the source string (extra space is added)
31+
SELECT HEX(WEIGHT_STRING(CAST(0x81309D3081309D3081309D30 AS CHAR), 1, 4, 0xC0));
32+
SELECT HEX(WEIGHT_STRING(CAST(0x81309D3081309D3081309D30 AS CHAR), 2, 4, 0xC0));
33+
SELECT HEX(WEIGHT_STRING(CAST(0x81309D3081309D3081309D30 AS CHAR), 3, 4, 0xC0));
34+
SELECT HEX(WEIGHT_STRING(CAST(0x81309D3081309D3081309D30 AS CHAR), 4, 4, 0xC0));
35+
SELECT HEX(WEIGHT_STRING(CAST(0x81309D3081309D3081309D30 AS CHAR), 5, 4, 0xC0));
36+
SELECT HEX(WEIGHT_STRING(CAST(0x81309D3081309D3081309D30 AS CHAR), 25, 4, 0xC0));
37+
+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# The weight of 0xA2A9 is the same as that of its capital letter 0xA2F9
2+
select collation(cast(0xA2A9 as char));
3+
select hex(weight_string(cast(0x6141 as char)));
4+
select hex(weight_string(cast(0xA2A9 as char)));
5+
select hex(weight_string(cast(0xA2A9 as char) as char(1)));
6+
select hex(weight_string(cast(0xA2A9A2A9 as char) as char(1)));
7+
select hex(weight_string(cast(0xA2A9 as char) as char(3)));
8+
select hex(weight_string(cast(0xA2A9A2A9 as char) as char(3)));
9+
select hex(weight_string(cast(0x40A2A9 as char) as char(3)));
10+
select hex(weight_string(cast(0x40A2A9A2A9 as char) as char(3)));
11+
select hex(weight_string(cast(0x40A2A9A2A9A2A9 as char) as char(3)));
12+
select hex(weight_string(cast(0x4040A2A9A2A9A2A9 as char) as char(3)));
13+
14+
# Read 2 characters from the source string (the last character is not used)
15+
select hex(weight_string(cast(0xA2A9A2A9A2A9 as char), 1, 2, 0xC0));
16+
select hex(weight_string(cast(0xA2A9A2A9A2A9 as char), 2, 2, 0xC0));
17+
select hex(weight_string(cast(0xA2A9A2A9A2A9 as char), 3, 2, 0xC0));
18+
select hex(weight_string(cast(0xA2A9A2A9A2A9 as char), 4, 2, 0xC0));
19+
select hex(weight_string(cast(0xA2A9A2A9A2A9 as char), 5, 2, 0xC0));
20+
select hex(weight_string(cast(0xA2A9A2A9A2A9 as char),25, 2, 0xC0));
21+
22+
# Read 3 characters from the source string (the entire string is used)
23+
select hex(weight_string(cast(0xA2A9A2A9A2A9 as char), 1, 3, 0xC0));
24+
select hex(weight_string(cast(0xA2A9A2A9A2A9 as char), 2, 3, 0xC0));
25+
select hex(weight_string(cast(0xA2A9A2A9A2A9 as char), 3, 3, 0xC0));
26+
select hex(weight_string(cast(0xA2A9A2A9A2A9 as char), 4, 3, 0xC0));
27+
select hex(weight_string(cast(0xA2A9A2A9A2A9 as char), 5, 3, 0xC0));
28+
select hex(weight_string(cast(0xA2A9A2A9A2A9 as char),25, 3, 0xC0));
29+
30+
# Read 4 characters from the source string (extra space is added)
31+
select hex(weight_string(cast(0xA2A9A2A9A2A9 as char), 1, 4, 0xC0));
32+
select hex(weight_string(cast(0xA2A9A2A9A2A9 as char), 2, 4, 0xC0));
33+
select hex(weight_string(cast(0xA2A9A2A9A2A9 as char), 3, 4, 0xC0));
34+
select hex(weight_string(cast(0xA2A9A2A9A2A9 as char), 4, 4, 0xC0));
35+
select hex(weight_string(cast(0xA2A9A2A9A2A9 as char), 5, 4, 0xC0));
36+
select hex(weight_string(cast(0xA2A9A2A9A2A9 as char),25, 4, 0xC0));
37+

0 commit comments

Comments
 (0)