Skip to content

Commit e939aaf

Browse files
author
Martin Hansson
committed
Merge branch 'mysql-8.0' into mysql-trunk
Change-Id: I10e55c7d22afcbda055e5b89515682f08b6f1b48
2 parents 29905dd + 9591649 commit e939aaf

File tree

8 files changed

+117
-57
lines changed

8 files changed

+117
-57
lines changed

mysql-test/r/regular_expressions_utf-8.result

Lines changed: 40 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -672,7 +672,7 @@ SELECT @buf_sz_utf16;
672672
512.000000000
673673
SELECT length(regexp_replace( repeat('a', @buf_sz_utf16), 'a', 'b' ));
674674
length(regexp_replace( repeat('a', @buf_sz_utf16), 'a', 'b' ))
675-
1024
675+
512
676676
SELECT length(regexp_replace( repeat('a', @buf_sz_utf16 + 1), 'a', 'b' ));
677677
ERROR HY000: The result string is larger than the result buffer.
678678
SELECT length(regexp_replace( repeat('a', @buf_sz_utf16), 'a', 'bb' ));
@@ -907,16 +907,15 @@ regexp_replace( 'abc', 'b', 'x' ) AS c,
907907
regexp_substr( 'a', 'a' ) AS d,
908908
regexp_substr( repeat('a', 512), 'a' ) AS e,
909909
regexp_substr( repeat('a', 513), 'a' ) AS f;
910-
# The actual character set that ICU uses depends on the machine.
911910
SHOW CREATE TABLE t1;
912911
Table Create Table
913912
t1 CREATE TABLE `t1` (
914913
`a` bigint(21) NOT NULL DEFAULT '0',
915914
`b` int(1) NOT NULL DEFAULT '0',
916-
`c` longtext CHARACTER SET utf16x,
917-
`d` varchar(1) CHARACTER SET utf16x NOT NULL DEFAULT '',
918-
`e` varchar(512) CHARACTER SET utf16x DEFAULT NULL,
919-
`f` text CHARACTER SET utf16x
915+
`c` longtext CHARACTER SET latin1,
916+
`d` varchar(1) CHARACTER SET latin1 NOT NULL DEFAULT '',
917+
`e` varchar(512) CHARACTER SET latin1 DEFAULT NULL,
918+
`f` text CHARACTER SET latin1
920919
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci
921920
SELECT * FROM t1;
922921
a b c d e f
@@ -968,3 +967,38 @@ regexp_replace( '???', '.', 'a', 2 )
968967
SELECT regexp_replace( '🍣🍣🍣', '.', 'a', 2, 2 );
969968
regexp_replace( '???', '.', 'a', 2, 2 )
970969
🍣🍣a
970+
#
971+
# REGEXP_REPLACE DOES NOT CONVERT RESULT CHARACTER SET
972+
#
973+
SELECT hex(regexp_replace( convert( 'abcd' using utf8mb4 ), 'c', ''));
974+
hex(regexp_replace( convert( 'abcd' using utf8mb4 ), 'c', ''))
975+
616264
976+
SELECT hex(regexp_replace( convert( 'abcd' using utf16 ), 'c', ''));
977+
hex(regexp_replace( convert( 'abcd' using utf16 ), 'c', ''))
978+
006100620064
979+
SELECT hex(regexp_substr( convert( 'abcd' using utf8mb4 ), 'abc'));
980+
hex(regexp_substr( convert( 'abcd' using utf8mb4 ), 'abc'))
981+
616263
982+
SELECT hex(regexp_substr( convert( 'abcd' using utf16 ), 'abc'));
983+
hex(regexp_substr( convert( 'abcd' using utf16 ), 'abc'))
984+
006100620063
985+
#
986+
# Test of the code path that elides character set conversion when the
987+
# target column has the same character set as ICU produces. This depends
988+
# on the architecture, and so we try both big and little endian.
989+
#
990+
CREATE TABLE t1 (
991+
a CHAR(10) CHARACTER SET utf16le,
992+
b CHAR(10) CHARACTER SET utf16
993+
);
994+
INSERT INTO t1 VALUES (
995+
regexp_substr( convert('abcd' using utf16le), 'abc' ),
996+
regexp_substr( convert('abcd' using utf16), 'abc' ));
997+
INSERT INTO t1 VALUES (
998+
regexp_substr( 'abcd', 'abc' ),
999+
regexp_substr( 'abcd', 'abc' ));
1000+
SELECT * FROM t1;
1001+
a b
1002+
abc abc
1003+
abc abc
1004+
DROP TABLE t1;

mysql-test/t/regular_expressions_utf-8.test

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -777,8 +777,6 @@ CREATE TABLE t1 AS SELECT
777777
regexp_substr( repeat('a', 512), 'a' ) AS e,
778778
regexp_substr( repeat('a', 513), 'a' ) AS f;
779779

780-
--echo # The actual character set that ICU uses depends on the machine.
781-
--replace_regex /(utf16)(le)?/\1x/
782780
SHOW CREATE TABLE t1;
783781
SELECT * FROM t1;
784782
DROP TABLE t1;
@@ -811,3 +809,34 @@ SELECT regexp_replace( '🍣🍣🍣', '.', 'a', 2 );
811809
SELECT regexp_replace( '🍣🍣🍣', '.', 'a', 2, 2 );
812810

813811
--source ../mysql-test/include/cleanup_icu_utils.inc
812+
813+
--echo #
814+
--echo # REGEXP_REPLACE DOES NOT CONVERT RESULT CHARACTER SET
815+
--echo #
816+
817+
SELECT hex(regexp_replace( convert( 'abcd' using utf8mb4 ), 'c', ''));
818+
SELECT hex(regexp_replace( convert( 'abcd' using utf16 ), 'c', ''));
819+
SELECT hex(regexp_substr( convert( 'abcd' using utf8mb4 ), 'abc'));
820+
SELECT hex(regexp_substr( convert( 'abcd' using utf16 ), 'abc'));
821+
822+
--echo #
823+
--echo # Test of the code path that elides character set conversion when the
824+
--echo # target column has the same character set as ICU produces. This depends
825+
--echo # on the architecture, and so we try both big and little endian.
826+
--echo #
827+
CREATE TABLE t1 (
828+
a CHAR(10) CHARACTER SET utf16le,
829+
b CHAR(10) CHARACTER SET utf16
830+
);
831+
832+
INSERT INTO t1 VALUES (
833+
regexp_substr( convert('abcd' using utf16le), 'abc' ),
834+
regexp_substr( convert('abcd' using utf16), 'abc' ));
835+
836+
INSERT INTO t1 VALUES (
837+
regexp_substr( 'abcd', 'abc' ),
838+
regexp_substr( 'abcd', 'abc' ));
839+
840+
SELECT * FROM t1;
841+
842+
DROP TABLE t1;

sql/item_regexp_func.cc

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ static bool ParseRegexpOptions(const std::string &options_string,
9090
}
9191

9292
bool Item_func_regexp::resolve_type(THD *) {
93-
return agg_arg_charsets_for_comparison(m_cmp_collation, args, 2);
93+
return agg_arg_charsets_for_comparison(collation, args, 2);
9494
}
9595

9696
bool Item_func_regexp::fix_fields(THD *thd, Item **arguments) {
@@ -116,8 +116,7 @@ bool Item_func_regexp::set_pattern() {
116116
if (!mp.has_value()) return true;
117117

118118
bool is_case_sensitive =
119-
((m_cmp_collation.collation->state & MY_CS_CSSORT) != 0 ||
120-
(m_cmp_collation.collation->state & MY_CS_BINSORT) != 0);
119+
(((collation.collation->state & (MY_CS_CSSORT | MY_CS_BINSORT)) != 0));
121120

122121
uint32_t icu_flags = 0; // Avoids compiler warning on gcc 4.8.5.
123122
// match_parameter overrides coercion type.
@@ -185,7 +184,7 @@ longlong Item_func_regexp_like::val_int() {
185184

186185
bool Item_func_regexp_replace::resolve_type(THD *thd) {
187186
if (Item_func_regexp::resolve_type(thd)) return true;
188-
set_data_type_string(ulonglong{MAX_BLOB_WIDTH}, regexp::regexp_lib_charset);
187+
set_data_type_string(ulonglong{MAX_BLOB_WIDTH});
189188
return false;
190189
}
191190

@@ -206,6 +205,7 @@ String *Item_func_regexp_replace::val_str(String *buf) {
206205
return nullptr;
207206
}
208207

208+
buf->set_charset(collation.collation);
209209
String *result = m_facade->Replace(subject(), replacement(), pos.value(),
210210
occ.value(), buf);
211211
null_value = (result == nullptr);
@@ -214,8 +214,7 @@ String *Item_func_regexp_replace::val_str(String *buf) {
214214

215215
bool Item_func_regexp_substr::resolve_type(THD *thd) {
216216
if (Item_func_regexp::resolve_type(thd)) return true;
217-
set_data_type_string(subject()->max_char_length(),
218-
regexp::regexp_lib_charset);
217+
set_data_type_string(subject()->max_char_length());
219218
return false;
220219
}
221220

@@ -233,6 +232,7 @@ String *Item_func_regexp_substr::val_str(String *buf) {
233232
null_value = true;
234233
return nullptr;
235234
}
235+
buf->set_charset(collation.collation);
236236
String *result = m_facade->Substr(subject(), pos.value(), occ.value(), buf);
237237
null_value = (result == nullptr);
238238
return result;

sql/item_regexp_func.h

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -214,14 +214,6 @@ class Item_func_regexp : public Item_func {
214214
bool set_pattern();
215215

216216
unique_ptr_destroy_only<regexp::Regexp_facade> m_facade;
217-
218-
private:
219-
/**
220-
The collation that is supposed to be used if you were to compare the
221-
pattern and the subject strings. We use this only for figuring out whether
222-
regular expression matching should be case-sensitive or not.
223-
*/
224-
DTCollation m_cmp_collation;
225217
};
226218

227219
class Item_func_regexp_instr : public Item_func_regexp {

sql/regexp/regexp_engine.cc

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@
3232
#include "sql/regexp/errors.h"
3333
#include "sql/regexp/regexp_facade.h"
3434
#include "sql/sql_class.h"
35-
#include "sql_string.h"
3635
#include "template_utils.h"
3736

3837
namespace regexp {
@@ -105,22 +104,15 @@ const std::u16string &Regexp_engine::Replace(const std::u16string &replacement,
105104
return m_replace_buffer;
106105
}
107106

108-
String *Regexp_engine::MatchedSubstring(String *result) {
107+
std::pair<int, int> Regexp_engine::MatchedSubstring() {
109108
int start = uregex_start(m_re, 0, &m_error_code);
110109
int end = uregex_end(m_re, 0, &m_error_code);
111-
auto text =
112-
pointer_cast<const char *>(uregex_getText(m_re, nullptr, &m_error_code));
113110
int start_in_bytes = start * sizeof(UChar);
114111
int length_in_bytes = (end - start) * sizeof(UChar);
115112

116-
if (U_FAILURE(m_error_code)) return nullptr;
117-
/*
118-
The ownership of the text was with us all along, we can safely pass it to
119-
`result`.
120-
*/
121-
result->set(text + start_in_bytes, length_in_bytes, regexp_lib_charset);
113+
if (U_FAILURE(m_error_code)) return {-1, -1};
122114

123-
return result;
115+
return {start_in_bytes, length_in_bytes};
124116
}
125117

126118
void Regexp_engine::AppendHead(size_t size) {

sql/regexp/regexp_engine.h

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#ifndef SQL_REGEXP_REGEXP_ENGINE_H_
22
#define SQL_REGEXP_REGEXP_ENGINE_H_
33

4-
/* Copyright (c) 2017, 2018, Oracle and/or its affiliates. All rights reserved.
4+
/* Copyright (c) 2017, 2019, Oracle and/or its affiliates. All rights reserved.
55
66
This program is free software; you can redistribute it and/or modify
77
it under the terms of the GNU General Public License, version 2.0,
@@ -27,6 +27,7 @@
2727

2828
#include <stdint.h>
2929
#include <string>
30+
#include <utility>
3031
#include <vector>
3132

3233
#include "m_ctype.h" // CHARSET_INFO.
@@ -35,7 +36,6 @@
3536
#include "sql/current_thd.h"
3637
#include "sql/regexp/errors.h"
3738
#include "sql/sql_class.h" // THD
38-
#include "sql_string.h"
3939
#include "template_utils.h"
4040

4141
extern CHARSET_INFO my_charset_utf16le_general_ci;
@@ -170,15 +170,12 @@ class Regexp_engine {
170170
int occurrence);
171171

172172
/**
173-
Copies the portion of the subject string between the start of the match
174-
and the end of the match into result.
173+
The start of the match and its length.
175174
176-
@param[out] result A string we can write to. The character set
177-
regexp_lib_charset is used.
178-
179-
@return A pointer to @p result.
175+
@return The offset of the start of the match, in bytes, and the length of
176+
the match, also in bytes. Both are -1 if ICU reported an error.
180177
*/
181-
String *MatchedSubstring(String *result);
178+
std::pair<int, int> MatchedSubstring();
182179

183180
bool IsError() const { return U_FAILURE(m_error_code); }
184181
bool CheckError() const { return check_icu_status(m_error_code); }

sql/regexp/regexp_facade.cc

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "sql/regexp/regexp_facade.h"
2424

2525
#include <string>
26+
#include <tuple>
2627

2728
#include "my_pointer_arithmetic.h"
2829
#include "sql/mysqld.h" // make_unique_destroy_only
@@ -184,8 +185,19 @@ String *Regexp_facade::Replace(Item *subject_expr, Item *replacement_expr,
184185

185186
const std::u16string &result_buffer = m_engine->Replace(
186187
replacement, ConvertCodePointToLibPosition(start), occurrence);
187-
result->set(pointer_cast<const char *>(result_buffer.data()),
188-
result_buffer.size() * sizeof(UChar), regexp_lib_charset);
188+
189+
uint conversion_error;
190+
size_t number_unaligned_characters;
191+
if (result->needs_conversion(result->length(), regexp_lib_charset,
192+
result->charset(),
193+
&number_unaligned_characters)) {
194+
if (result->copy(pointer_cast<const char *>(result_buffer.data()),
195+
result_buffer.size() * sizeof(UChar), regexp_lib_charset,
196+
result->charset(), &conversion_error))
197+
return nullptr;
198+
} else
199+
result->set(pointer_cast<const char *>(result_buffer.data()),
200+
result_buffer.size() * sizeof(UChar), regexp_lib_charset);
189201
return result;
190202
}
191203

@@ -196,9 +208,24 @@ String *Regexp_facade::Substr(Item *subject_expr, int start, int occurrence,
196208
m_engine->CheckError();
197209
return nullptr;
198210
}
199-
String *res = m_engine->MatchedSubstring(result);
211+
int substart, sublength;
212+
std::tie(substart, sublength) = m_engine->MatchedSubstring();
200213
if (m_engine->CheckError()) return nullptr;
201-
return res;
214+
215+
uint conversion_error;
216+
217+
auto substartptr =
218+
pointer_cast<const char *>(m_current_subject.c_str()) + substart;
219+
220+
size_t number_unaligned_characters;
221+
if (result->needs_conversion(sublength, regexp_lib_charset, result->charset(),
222+
&number_unaligned_characters)) {
223+
if (result->copy(substartptr, sublength, regexp_lib_charset,
224+
result->charset(), &conversion_error))
225+
return nullptr;
226+
} else
227+
result->set(substartptr, sublength, regexp_lib_charset);
228+
return result;
202229
}
203230

204231
bool Regexp_facade::SetupEngine(Item *pattern_expr, uint flags) {

sql/regexp/regexp_facade.h

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -121,17 +121,6 @@ class Regexp_facade {
121121

122122
String *Substr(Item *subject_expr, int start, int occurrence, String *result);
123123

124-
/**
125-
Returns the substring that was matched by the previous call to find() or
126-
matches().
127-
128-
@param[out] result A string we can write to.
129-
@return A pointer to result.
130-
*/
131-
String *MatchedSubstring(String *result) {
132-
return m_engine->MatchedSubstring(result);
133-
}
134-
135124
private:
136125
/**
137126
Resets the compiled regular expression with a new string.

0 commit comments

Comments
 (0)