Merge branch 'mysql-8.0' into mysql-trunk

Martin Hansson · Martin Hansson · commit e939aafb5e3c · 2019-04-08T15:50:54.000+02:00
Change-Id: I10e55c7d22afcbda055e5b89515682f08b6f1b48
diff --git a/mysql-test/r/regular_expressions_utf-8.result b/mysql-test/r/regular_expressions_utf-8.result
@@ -672,7 +672,7 @@ SELECT @buf_sz_utf16;
 512.000000000
 SELECT length(regexp_replace( repeat('a', @buf_sz_utf16), 'a', 'b' ));
 length(regexp_replace( repeat('a', @buf_sz_utf16), 'a', 'b' ))
-1024
+512
 SELECT length(regexp_replace( repeat('a', @buf_sz_utf16 + 1), 'a', 'b' ));
 ERROR HY000: The result string is larger than the result buffer.
 SELECT length(regexp_replace( repeat('a', @buf_sz_utf16), 'a', 'bb' ));
@@ -907,16 +907,15 @@ regexp_replace( 'abc', 'b', 'x' ) AS c,
 regexp_substr( 'a', 'a' ) AS d,
 regexp_substr( repeat('a', 512), 'a' ) AS e,
 regexp_substr( repeat('a', 513), 'a' ) AS f;
-# The actual character set that ICU uses depends on the machine.
 SHOW CREATE TABLE t1;
 Table	Create Table
 t1	CREATE TABLE `t1` (
   `a` bigint(21) NOT NULL DEFAULT '0',
   `b` int(1) NOT NULL DEFAULT '0',
-  `c` longtext CHARACTER SET utf16x,
-  `d` varchar(1) CHARACTER SET utf16x NOT NULL DEFAULT '',
-  `e` varchar(512) CHARACTER SET utf16x DEFAULT NULL,
-  `f` text CHARACTER SET utf16x
+  `c` longtext CHARACTER SET latin1,
+  `d` varchar(1) CHARACTER SET latin1 NOT NULL DEFAULT '',
+  `e` varchar(512) CHARACTER SET latin1 DEFAULT NULL,
+  `f` text CHARACTER SET latin1
 ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci
 SELECT * FROM t1;
 a	b	c	d	e	f
@@ -968,3 +967,38 @@ regexp_replace( '???', '.', 'a', 2 )
 SELECT regexp_replace( '🍣🍣🍣', '.', 'a', 2, 2 );
 regexp_replace( '???', '.', 'a', 2, 2 )
 🍣🍣a
+#
+# REGEXP_REPLACE DOES NOT CONVERT RESULT CHARACTER SET
+#
+SELECT hex(regexp_replace( convert( 'abcd' using utf8mb4 ), 'c', ''));
+hex(regexp_replace( convert( 'abcd' using utf8mb4 ), 'c', ''))
+616264
+SELECT hex(regexp_replace( convert( 'abcd' using utf16 ), 'c', ''));
+hex(regexp_replace( convert( 'abcd' using utf16 ), 'c', ''))
+006100620064
+SELECT hex(regexp_substr( convert( 'abcd' using utf8mb4 ), 'abc'));
+hex(regexp_substr( convert( 'abcd' using utf8mb4 ), 'abc'))
+616263
+SELECT hex(regexp_substr( convert( 'abcd' using utf16 ), 'abc'));
+hex(regexp_substr( convert( 'abcd' using utf16 ), 'abc'))
+006100620063
+#
+# Test of the code path that elides character set conversion when the
+# target column has the same character set as ICU produces. This depends
+# on the architecture, and so we try both big and little endian.
+#
+CREATE TABLE t1 (
+a CHAR(10) CHARACTER SET utf16le,
+b CHAR(10) CHARACTER SET utf16
+);
+INSERT INTO t1 VALUES (
+regexp_substr( convert('abcd' using utf16le), 'abc' ),
+regexp_substr( convert('abcd' using utf16), 'abc' ));
+INSERT INTO t1 VALUES (
+regexp_substr( 'abcd', 'abc' ),
+regexp_substr( 'abcd', 'abc' ));
+SELECT * FROM t1;
+a	b
+abc	abc
+abc	abc
+DROP TABLE t1;
diff --git a/mysql-test/t/regular_expressions_utf-8.test b/mysql-test/t/regular_expressions_utf-8.test
@@ -777,8 +777,6 @@ CREATE TABLE t1 AS SELECT
   regexp_substr( repeat('a', 512), 'a' ) AS e,
   regexp_substr( repeat('a', 513), 'a' ) AS f;
 
---echo # The actual character set that ICU uses depends on the machine.
---replace_regex /(utf16)(le)?/\1x/
 SHOW CREATE TABLE t1;
 SELECT * FROM t1;
 DROP TABLE t1;
@@ -811,3 +809,34 @@ SELECT regexp_replace( '🍣🍣🍣', '.', 'a', 2 );
 SELECT regexp_replace( '🍣🍣🍣', '.', 'a', 2, 2 );
 
 --source ../mysql-test/include/cleanup_icu_utils.inc
+
+--echo #
+--echo # REGEXP_REPLACE DOES NOT CONVERT RESULT CHARACTER SET
+--echo #
+
+SELECT hex(regexp_replace( convert( 'abcd' using utf8mb4 ), 'c', ''));
+SELECT hex(regexp_replace( convert( 'abcd' using utf16 ), 'c', ''));
+SELECT hex(regexp_substr( convert( 'abcd' using utf8mb4 ), 'abc'));
+SELECT hex(regexp_substr( convert( 'abcd' using utf16 ), 'abc'));
+
+--echo #
+--echo # Test of the code path that elides character set conversion when the
+--echo # target column has the same character set as ICU produces. This depends
+--echo # on the architecture, and so we try both big and little endian.
+--echo #
+CREATE TABLE t1 (
+  a CHAR(10) CHARACTER SET utf16le,
+  b CHAR(10) CHARACTER SET utf16
+);
+
+INSERT INTO t1 VALUES (
+  regexp_substr( convert('abcd' using utf16le), 'abc' ),
+  regexp_substr( convert('abcd' using utf16), 'abc' ));
+
+INSERT INTO t1 VALUES (
+  regexp_substr( 'abcd', 'abc' ),
+  regexp_substr( 'abcd', 'abc' ));
+
+SELECT * FROM t1;
+
+DROP TABLE t1;
diff --git a/sql/item_regexp_func.cc b/sql/item_regexp_func.cc
@@ -90,7 +90,7 @@ static bool ParseRegexpOptions(const std::string &options_string,
 }
 
 bool Item_func_regexp::resolve_type(THD *) {
-  return agg_arg_charsets_for_comparison(m_cmp_collation, args, 2);
+  return agg_arg_charsets_for_comparison(collation, args, 2);
 }
 
 bool Item_func_regexp::fix_fields(THD *thd, Item **arguments) {
@@ -116,8 +116,7 @@ bool Item_func_regexp::set_pattern() {
   if (!mp.has_value()) return true;
 
   bool is_case_sensitive =
-      ((m_cmp_collation.collation->state & MY_CS_CSSORT) != 0 ||
-       (m_cmp_collation.collation->state & MY_CS_BINSORT) != 0);
+      (((collation.collation->state & (MY_CS_CSSORT | MY_CS_BINSORT)) != 0));
 
   uint32_t icu_flags = 0;  // Avoids compiler warning on gcc 4.8.5.
   // match_parameter overrides coercion type.
@@ -185,7 +184,7 @@ longlong Item_func_regexp_like::val_int() {
 
 bool Item_func_regexp_replace::resolve_type(THD *thd) {
   if (Item_func_regexp::resolve_type(thd)) return true;
-  set_data_type_string(ulonglong{MAX_BLOB_WIDTH}, regexp::regexp_lib_charset);
+  set_data_type_string(ulonglong{MAX_BLOB_WIDTH});
   return false;
 }
 
@@ -206,6 +205,7 @@ String *Item_func_regexp_replace::val_str(String *buf) {
     return nullptr;
   }
 
+  buf->set_charset(collation.collation);
   String *result = m_facade->Replace(subject(), replacement(), pos.value(),
                                      occ.value(), buf);
   null_value = (result == nullptr);
@@ -214,8 +214,7 @@ String *Item_func_regexp_replace::val_str(String *buf) {
 
 bool Item_func_regexp_substr::resolve_type(THD *thd) {
   if (Item_func_regexp::resolve_type(thd)) return true;
-  set_data_type_string(subject()->max_char_length(),
-                       regexp::regexp_lib_charset);
+  set_data_type_string(subject()->max_char_length());
   return false;
 }
 
@@ -233,6 +232,7 @@ String *Item_func_regexp_substr::val_str(String *buf) {
     null_value = true;
     return nullptr;
   }
+  buf->set_charset(collation.collation);
   String *result = m_facade->Substr(subject(), pos.value(), occ.value(), buf);
   null_value = (result == nullptr);
   return result;
diff --git a/sql/item_regexp_func.h b/sql/item_regexp_func.h
@@ -214,14 +214,6 @@ class Item_func_regexp : public Item_func {
   bool set_pattern();
 
   unique_ptr_destroy_only<regexp::Regexp_facade> m_facade;
-
- private:
-  /**
-    The collation that is supposed to be used if you were to compare the
-    pattern and the subject strings. We use this only for figuring out whether
-    regular expression matching should be case-sensitive or not.
-  */
-  DTCollation m_cmp_collation;
 };
 
 class Item_func_regexp_instr : public Item_func_regexp {
diff --git a/sql/regexp/regexp_engine.cc b/sql/regexp/regexp_engine.cc
@@ -32,7 +32,6 @@
 #include "sql/regexp/errors.h"
 #include "sql/regexp/regexp_facade.h"
 #include "sql/sql_class.h"
-#include "sql_string.h"
 #include "template_utils.h"
 
 namespace regexp {
@@ -105,22 +104,15 @@ const std::u16string &Regexp_engine::Replace(const std::u16string &replacement,
   return m_replace_buffer;
 }
 
-String *Regexp_engine::MatchedSubstring(String *result) {
+std::pair<int, int> Regexp_engine::MatchedSubstring() {
   int start = uregex_start(m_re, 0, &m_error_code);
   int end = uregex_end(m_re, 0, &m_error_code);
-  auto text =
-      pointer_cast<const char *>(uregex_getText(m_re, nullptr, &m_error_code));
   int start_in_bytes = start * sizeof(UChar);
   int length_in_bytes = (end - start) * sizeof(UChar);
 
-  if (U_FAILURE(m_error_code)) return nullptr;
-  /*
-    The ownership of the text was with us all along, we can safely pass it to
-    `result`.
-  */
-  result->set(text + start_in_bytes, length_in_bytes, regexp_lib_charset);
+  if (U_FAILURE(m_error_code)) return {-1, -1};
 
-  return result;
+  return {start_in_bytes, length_in_bytes};
 }
 
 void Regexp_engine::AppendHead(size_t size) {
diff --git a/sql/regexp/regexp_engine.h b/sql/regexp/regexp_engine.h
@@ -1,7 +1,7 @@
 #ifndef SQL_REGEXP_REGEXP_ENGINE_H_
 #define SQL_REGEXP_REGEXP_ENGINE_H_
 
-/* Copyright (c) 2017, 2018, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2017, 2019, Oracle and/or its affiliates. All rights reserved.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License, version 2.0,
@@ -27,6 +27,7 @@
 
 #include <stdint.h>
 #include <string>
+#include <utility>
 #include <vector>
 
 #include "m_ctype.h"    // CHARSET_INFO.
@@ -35,7 +36,6 @@
 #include "sql/current_thd.h"
 #include "sql/regexp/errors.h"
 #include "sql/sql_class.h"  // THD
-#include "sql_string.h"
 #include "template_utils.h"
 
 extern CHARSET_INFO my_charset_utf16le_general_ci;
@@ -170,15 +170,12 @@ class Regexp_engine {
                                 int occurrence);
 
   /**
-    Copies the portion of the subject string between the start of the match
-    and the end of the match into result.
+    The start of the match and its length.
 
-    @param[out] result A string we can write to. The character set
-    regexp_lib_charset is used.
-
-    @return A pointer to @p result.
+    @return The byte offset of the start of the match within the subject
+    buffer and the match length in bytes, or {-1, -1} on ICU error.
   */
-  String *MatchedSubstring(String *result);
+  std::pair<int, int> MatchedSubstring();
 
   bool IsError() const { return U_FAILURE(m_error_code); }
   bool CheckError() const { return check_icu_status(m_error_code); }
diff --git a/sql/regexp/regexp_facade.cc b/sql/regexp/regexp_facade.cc
@@ -23,6 +23,7 @@
 #include "sql/regexp/regexp_facade.h"
 
 #include <string>
+#include <tuple>
 
 #include "my_pointer_arithmetic.h"
 #include "sql/mysqld.h"  // make_unique_destroy_only
@@ -184,8 +185,19 @@ String *Regexp_facade::Replace(Item *subject_expr, Item *replacement_expr,
 
   const std::u16string &result_buffer = m_engine->Replace(
       replacement, ConvertCodePointToLibPosition(start), occurrence);
-  result->set(pointer_cast<const char *>(result_buffer.data()),
-              result_buffer.size() * sizeof(UChar), regexp_lib_charset);
+
+  uint conversion_error;
+  size_t number_unaligned_characters;
+  if (result->needs_conversion(result->length(), regexp_lib_charset,
+                               result->charset(),
+                               &number_unaligned_characters)) {
+    if (result->copy(pointer_cast<const char *>(result_buffer.data()),
+                     result_buffer.size() * sizeof(UChar), regexp_lib_charset,
+                     result->charset(), &conversion_error))
+      return nullptr;
+  } else
+    result->set(pointer_cast<const char *>(result_buffer.data()),
+                result_buffer.size() * sizeof(UChar), regexp_lib_charset);
   return result;
 }
 
@@ -196,9 +208,24 @@ String *Regexp_facade::Substr(Item *subject_expr, int start, int occurrence,
     m_engine->CheckError();
     return nullptr;
   }
-  String *res = m_engine->MatchedSubstring(result);
+  int substart, sublength;
+  std::tie(substart, sublength) = m_engine->MatchedSubstring();
   if (m_engine->CheckError()) return nullptr;
-  return res;
+
+  uint conversion_error;
+
+  auto substartptr =
+      pointer_cast<const char *>(m_current_subject.c_str()) + substart;
+
+  size_t number_unaligned_characters;
+  if (result->needs_conversion(sublength, regexp_lib_charset, result->charset(),
+                               &number_unaligned_characters)) {
+    if (result->copy(substartptr, sublength, regexp_lib_charset,
+                     result->charset(), &conversion_error))
+      return nullptr;
+  } else
+    result->set(substartptr, sublength, regexp_lib_charset);
+  return result;
 }
 
 bool Regexp_facade::SetupEngine(Item *pattern_expr, uint flags) {
diff --git a/sql/regexp/regexp_facade.h b/sql/regexp/regexp_facade.h
@@ -121,17 +121,6 @@ class Regexp_facade {
 
   String *Substr(Item *subject_expr, int start, int occurrence, String *result);
 
-  /**
-    Returns the substring that was matched by the previous call to find() or
-    matches().
-
-    @param[out] result A string we can write to.
-    @return A pointer to result.
-  */
-  String *MatchedSubstring(String *result) {
-    return m_engine->MatchedSubstring(result);
-  }
-
  private:
   /**
     Resets the compiled regular expression with a new string.