Skip to content

Commit f354611

Browse files
author
Slawomir Maludzinski
committed
BUG#34397106 GR secondary node crashes during relay log purge
Problem ------- Group Replication secondary node crashed while purging of the relay log. Analysis / Root-cause analysis ------------------------------ Server receives SIGSEGV signal when Global_THD_manager iterates over threads using do_for_all_thd method. Solution -------- Threads which execute operations on log files register their operations on global mysql_bin_log object. Next when log_in_use or adjust_linfo_offsets functions are executed all registred LOG_INFO objects are searched or adjusted. When a thread no longer executes operations on a log it unregisters LOG_INFO from mysql_bin_log object. Change-Id: I0e229433278e8997ea4339eefc506dbfc47314d9
1 parent 36111d0 commit f354611

File tree

8 files changed

+146
-125
lines changed

8 files changed

+146
-125
lines changed

sql/binlog.cc

+64-101
Original file line numberDiff line numberDiff line change
@@ -2922,101 +2922,6 @@ static bool binlog_savepoint_rollback_can_release_mdl(handlerton *, THD *thd) {
29222922
return !trans_cannot_safely_rollback(thd);
29232923
}
29242924

2925-
/**
2926-
Adjust log offset in the binary log file for all running slaves
2927-
This class implements call back function for do_for_all_thd().
2928-
It is called for each thd in thd list to adjust offset.
2929-
*/
2930-
class Adjust_offset : public Do_THD_Impl {
2931-
public:
2932-
Adjust_offset(my_off_t value) : m_purge_offset(value) {}
2933-
void operator()(THD *thd) override {
2934-
LOG_INFO *linfo;
2935-
mysql_mutex_lock(&thd->LOCK_thd_data);
2936-
if ((linfo = thd->current_linfo)) {
2937-
/*
2938-
Index file offset can be less that purge offset only if
2939-
we just started reading the index file. In that case
2940-
we have nothing to adjust.
2941-
*/
2942-
if (linfo->index_file_offset < m_purge_offset)
2943-
linfo->fatal = (linfo->index_file_offset != 0);
2944-
else
2945-
linfo->index_file_offset -= m_purge_offset;
2946-
}
2947-
mysql_mutex_unlock(&thd->LOCK_thd_data);
2948-
}
2949-
2950-
private:
2951-
my_off_t m_purge_offset;
2952-
};
2953-
2954-
/*
2955-
Adjust the position pointer in the binary log file for all running slaves.
2956-
2957-
SYNOPSIS
2958-
adjust_linfo_offsets()
2959-
purge_offset Number of bytes removed from start of log index file
2960-
2961-
NOTES
2962-
- This is called when doing a PURGE when we delete lines from the
2963-
index log file.
2964-
2965-
REQUIREMENTS
2966-
- Before calling this function, we have to ensure that no threads are
2967-
using any binary log file before purge_offset.
2968-
2969-
TODO
2970-
- Inform the slave threads that they should sync the position
2971-
in the binary log file with flush_relay_log_info.
2972-
Now they sync is done for next read.
2973-
*/
2974-
static void adjust_linfo_offsets(my_off_t purge_offset) {
2975-
Adjust_offset adjust_offset(purge_offset);
2976-
Global_THD_manager::get_instance()->do_for_all_thd(&adjust_offset);
2977-
}
2978-
2979-
/**
2980-
This class implements Call back function for do_for_all_thd().
2981-
It is called for each thd in thd list to count
2982-
threads using bin log file
2983-
*/
2984-
2985-
class Log_in_use : public Do_THD_Impl {
2986-
public:
2987-
Log_in_use(const char *value) : m_log_name(value), m_count(0) {
2988-
m_log_name_len = strlen(m_log_name) + 1;
2989-
}
2990-
void operator()(THD *thd) override {
2991-
LOG_INFO *linfo;
2992-
mysql_mutex_lock(&thd->LOCK_thd_data);
2993-
if ((linfo = thd->current_linfo)) {
2994-
if (!strncmp(m_log_name, linfo->log_file_name, m_log_name_len)) {
2995-
LogErr(WARNING_LEVEL, ER_BINLOG_FILE_BEING_READ_NOT_PURGED, m_log_name,
2996-
thd->thread_id());
2997-
m_count++;
2998-
}
2999-
}
3000-
mysql_mutex_unlock(&thd->LOCK_thd_data);
3001-
}
3002-
int get_count() { return m_count; }
3003-
3004-
private:
3005-
const char *m_log_name;
3006-
size_t m_log_name_len;
3007-
int m_count;
3008-
};
3009-
3010-
static int log_in_use(const char *log_name) {
3011-
Log_in_use log_in_use(log_name);
3012-
#ifndef NDEBUG
3013-
if (current_thd)
3014-
DEBUG_SYNC(current_thd, "purge_logs_after_lock_index_before_thread_count");
3015-
#endif
3016-
Global_THD_manager::get_instance()->do_for_all_thd(&log_in_use);
3017-
return log_in_use.get_count();
3018-
}
3019-
30202925
static bool purge_error_message(THD *thd, int res) {
30212926
uint errcode;
30222927

@@ -3361,9 +3266,8 @@ bool show_binlog_events(THD *thd, MYSQL_BIN_LOG *binary_log) {
33613266
goto err;
33623267
}
33633268

3364-
mysql_mutex_lock(&thd->LOCK_thd_data);
3365-
thd->current_linfo = &linfo;
3366-
mysql_mutex_unlock(&thd->LOCK_thd_data);
3269+
linfo.thread_id = thd->thread_id();
3270+
binary_log->register_log_info(&linfo);
33673271

33683272
BINLOG_FILE_READER binlog_file_reader(
33693273
opt_source_verify_checksum,
@@ -3470,9 +3374,8 @@ bool show_binlog_events(THD *thd, MYSQL_BIN_LOG *binary_log) {
34703374
} else
34713375
my_eof(thd);
34723376

3473-
mysql_mutex_lock(&thd->LOCK_thd_data);
3474-
thd->current_linfo = nullptr;
3475-
mysql_mutex_unlock(&thd->LOCK_thd_data);
3377+
binary_log->unregister_log_info(&linfo);
3378+
34763379
return !errmsg.empty();
34773380
}
34783381

@@ -3553,6 +3456,7 @@ void MYSQL_BIN_LOG::cleanup() {
35533456
mysql_mutex_destroy(&LOCK_sync);
35543457
mysql_mutex_destroy(&LOCK_binlog_end_pos);
35553458
mysql_mutex_destroy(&LOCK_xids);
3459+
mysql_mutex_destroy(&LOCK_log_info);
35563460
mysql_cond_destroy(&update_cond);
35573461
mysql_cond_destroy(&m_prep_xids_cond);
35583462
if (!is_relay_log) {
@@ -3577,6 +3481,7 @@ void MYSQL_BIN_LOG::init_pthread_objects() {
35773481
mysql_mutex_init(m_key_LOCK_binlog_end_pos, &LOCK_binlog_end_pos,
35783482
MY_MUTEX_INIT_FAST);
35793483
mysql_mutex_init(m_key_LOCK_xids, &LOCK_xids, MY_MUTEX_INIT_FAST);
3484+
mysql_mutex_init(m_key_LOCK_log_info, &LOCK_log_info, MY_MUTEX_INIT_FAST);
35803485
mysql_cond_init(m_key_update_cond, &update_cond);
35813486
mysql_cond_init(m_key_prep_xids_cond, &m_prep_xids_cond);
35823487
if (!is_relay_log) {
@@ -7316,6 +7221,64 @@ int MYSQL_BIN_LOG::get_gtid_executed(Sid_map *sid_map, Gtid_set *gtid_set) {
73167221
return error;
73177222
}
73187223

7224+
void MYSQL_BIN_LOG::register_log_info(LOG_INFO *log_info) {
7225+
DBUG_TRACE;
7226+
MUTEX_LOCK(lock, &LOCK_log_info);
7227+
log_info_set.insert(log_info);
7228+
}
7229+
7230+
void MYSQL_BIN_LOG::unregister_log_info(LOG_INFO *log_info) {
7231+
DBUG_TRACE;
7232+
MUTEX_LOCK(lock, &LOCK_log_info);
7233+
log_info_set.erase(log_info);
7234+
}
7235+
7236+
int MYSQL_BIN_LOG::log_in_use(const char *log_name) {
7237+
DBUG_TRACE;
7238+
#ifndef NDEBUG
7239+
if (current_thd)
7240+
DEBUG_SYNC(current_thd, "purge_logs_after_lock_index_before_thread_count");
7241+
#endif
7242+
7243+
mysql_mutex_assert_owner(&LOCK_index);
7244+
MUTEX_LOCK(lock_log_info, &LOCK_log_info);
7245+
7246+
int count = 0;
7247+
int log_name_len = strlen(log_name) + 1;
7248+
7249+
std::for_each(
7250+
log_info_set.cbegin(), log_info_set.cend(),
7251+
[log_name, log_name_len, &count](LOG_INFO *log_info) {
7252+
if (!strncmp(log_name, log_info->log_file_name, log_name_len)) {
7253+
LogErr(WARNING_LEVEL, ER_BINLOG_FILE_BEING_READ_NOT_PURGED, log_name,
7254+
log_info->thread_id);
7255+
count++;
7256+
}
7257+
});
7258+
7259+
return count;
7260+
}
7261+
7262+
void MYSQL_BIN_LOG::adjust_linfo_offsets(my_off_t purge_offset) {
7263+
DBUG_TRACE;
7264+
7265+
mysql_mutex_assert_owner(&LOCK_index);
7266+
MUTEX_LOCK(lock_log_info, &LOCK_log_info);
7267+
7268+
std::for_each(log_info_set.cbegin(), log_info_set.cend(),
7269+
[purge_offset](LOG_INFO *log_info) {
7270+
/*
7271+
Index file offset can be less that purge offset only if
7272+
we just started reading the index file. In that case
7273+
we have nothing to adjust.
7274+
*/
7275+
if (log_info->index_file_offset < purge_offset)
7276+
log_info->fatal = (log_info->index_file_offset != 0);
7277+
else
7278+
log_info->index_file_offset -= purge_offset;
7279+
});
7280+
}
7281+
73197282
/**
73207283
Write the contents of the given IO_CACHE to the binary log.
73217284

sql/binlog.h

+52-6
Original file line numberDiff line numberDiff line change
@@ -111,25 +111,22 @@ struct Binlog_user_var_event {
111111
#define LOG_CLOSE_TO_BE_OPENED 2
112112
#define LOG_CLOSE_STOP_EVENT 4
113113

114-
/*
115-
Note that we destroy the lock mutex in the destructor here.
116-
This means that object instances cannot be destroyed/go out of scope
117-
until we have reset thd->current_linfo to NULL;
118-
*/
119114
struct LOG_INFO {
120115
char log_file_name[FN_REFLEN] = {0};
121116
my_off_t index_file_offset, index_file_start_offset;
122117
my_off_t pos;
123118
bool fatal; // if the purge happens to give us a negative offset
124119
int entry_index; // used in purge_logs(), calculatd in find_log_pos().
125120
int encrypted_header_size;
121+
my_thread_id thread_id;
126122
LOG_INFO()
127123
: index_file_offset(0),
128124
index_file_start_offset(0),
129125
pos(0),
130126
fatal(false),
131127
entry_index(0),
132-
encrypted_header_size(0) {
128+
encrypted_header_size(0),
129+
thread_id(0) {
133130
memset(log_file_name, 0, FN_REFLEN);
134131
}
135132
};
@@ -185,6 +182,8 @@ class MYSQL_BIN_LOG : public TC_LOG {
185182
PSI_mutex_key m_key_LOCK_sync;
186183
/** The instrumentation key to use for @ LOCK_xids. */
187184
PSI_mutex_key m_key_LOCK_xids;
185+
/** The instrumentation key to use for @ m_key_LOCK_log_info. */
186+
PSI_mutex_key m_key_LOCK_log_info;
188187
/** The instrumentation key to use for @ update_cond. */
189188
PSI_cond_key m_key_update_cond;
190189
/** The instrumentation key to use for @ prep_xids_cond. */
@@ -368,6 +367,7 @@ class MYSQL_BIN_LOG : public TC_LOG {
368367
PSI_mutex_key key_LOCK_flush_queue, PSI_mutex_key key_LOCK_log,
369368
PSI_mutex_key key_LOCK_binlog_end_pos, PSI_mutex_key key_LOCK_sync,
370369
PSI_mutex_key key_LOCK_sync_queue, PSI_mutex_key key_LOCK_xids,
370+
PSI_mutex_key key_LOCK_log_info,
371371
PSI_mutex_key key_LOCK_wait_for_group_turn, PSI_cond_key key_COND_done,
372372
PSI_cond_key key_COND_flush_queue, PSI_cond_key key_update_cond,
373373
PSI_cond_key key_prep_xids_cond,
@@ -390,6 +390,7 @@ class MYSQL_BIN_LOG : public TC_LOG {
390390
m_key_LOCK_after_commit = key_LOCK_after_commit;
391391
m_key_LOCK_sync = key_LOCK_sync;
392392
m_key_LOCK_xids = key_LOCK_xids;
393+
m_key_LOCK_log_info = key_LOCK_log_info;
393394
m_key_update_cond = key_update_cond;
394395
m_key_prep_xids_cond = key_prep_xids_cond;
395396
m_key_file_log = key_file_log;
@@ -1023,6 +1024,51 @@ class MYSQL_BIN_LOG : public TC_LOG {
10231024
True while rotating binlog, which is caused by logging Incident_log_event.
10241025
*/
10251026
bool is_rotating_caused_by_incident;
1027+
1028+
public:
1029+
/**
1030+
Register LOG_INFO so that log_in_use and adjust_linfo_offsets can
1031+
operate on all logs. Note that register_log_info, unregister_log_info,
1032+
log_in_use, adjust_linfo_offsets are is used on global mysql_bin_log object.
1033+
@param log_info pointer to LOG_INFO which is registred
1034+
*/
1035+
void register_log_info(LOG_INFO *log_info);
1036+
/**
1037+
Unregister LOG_INFO when it is no longer needed.
1038+
@param log_info pointer to LOG_INFO which is registred
1039+
*/
1040+
void unregister_log_info(LOG_INFO *log_info);
1041+
/**
1042+
Check if any threads use log name.
1043+
@note This method expects the LOCK_index to be taken so there are no
1044+
concurrent edits against linfo objects being iterated
1045+
@param log_name name of a log which is checked for usage
1046+
1047+
*/
1048+
int log_in_use(const char *log_name);
1049+
/**
1050+
Adjust the position pointer in the binary log file for all running replicas.
1051+
SYNOPSIS
1052+
adjust_linfo_offsets()
1053+
purge_offset Number of bytes removed from start of log index file
1054+
NOTES
1055+
- This is called when doing a PURGE when we delete lines from the
1056+
index log file. This method expects the LOCK_index to be taken so there
1057+
are no concurrent edits against linfo objects being iterated. REQUIREMENTS
1058+
- Before calling this function, we have to ensure that no threads are
1059+
using any binary log file before purge_offset.
1060+
TODO
1061+
- Inform the replica threads that they should sync the position
1062+
in the binary log file with flush_relay_log_info.
1063+
Now they sync is done for next read.
1064+
*/
1065+
void adjust_linfo_offsets(my_off_t purge_offset);
1066+
1067+
private:
1068+
mysql_mutex_t LOCK_log_info;
1069+
// Set of log info objects that are in usage and might prevent some other
1070+
// operations from executing.
1071+
std::set<LOG_INFO *> log_info_set;
10261072
};
10271073

10281074
struct LOAD_FILE_INFO {

sql/mysqld.cc

+3-2
Original file line numberDiff line numberDiff line change
@@ -1089,6 +1089,7 @@ static PSI_mutex_key key_BINLOG_LOCK_binlog_end_pos;
10891089
static PSI_mutex_key key_BINLOG_LOCK_sync;
10901090
static PSI_mutex_key key_BINLOG_LOCK_sync_queue;
10911091
static PSI_mutex_key key_BINLOG_LOCK_xids;
1092+
static PSI_mutex_key key_BINLOG_LOCK_log_info;
10921093
static PSI_mutex_key key_BINLOG_LOCK_wait_for_group_turn;
10931094
static PSI_rwlock_key key_rwlock_global_sid_lock;
10941095
PSI_rwlock_key key_rwlock_gtid_mode_lock;
@@ -4800,8 +4801,8 @@ int init_common_variables() {
48004801
key_BINLOG_LOCK_flush_queue, key_BINLOG_LOCK_log,
48014802
key_BINLOG_LOCK_binlog_end_pos, key_BINLOG_LOCK_sync,
48024803
key_BINLOG_LOCK_sync_queue, key_BINLOG_LOCK_xids,
4803-
key_BINLOG_LOCK_wait_for_group_turn, key_BINLOG_COND_done,
4804-
key_BINLOG_COND_flush_queue, key_BINLOG_update_cond,
4804+
key_BINLOG_LOCK_log_info, key_BINLOG_LOCK_wait_for_group_turn,
4805+
key_BINLOG_COND_done, key_BINLOG_COND_flush_queue, key_BINLOG_update_cond,
48054806
key_BINLOG_prep_xids_cond, key_BINLOG_COND_wait_for_group_turn,
48064807
key_file_binlog, key_file_binlog_index, key_file_binlog_cache,
48074808
key_file_binlog_index_cache);

sql/rpl_binlog_sender.cc

+4-6
Original file line numberDiff line numberDiff line change
@@ -261,9 +261,8 @@ void Binlog_sender::init() {
261261
init_heartbeat_period();
262262
m_last_event_sent_ts = now_in_nanosecs();
263263

264-
mysql_mutex_lock(&thd->LOCK_thd_data);
265-
thd->current_linfo = &m_linfo;
266-
mysql_mutex_unlock(&thd->LOCK_thd_data);
264+
m_linfo.thread_id = thd->thread_id();
265+
mysql_bin_log.register_log_info(&m_linfo);
267266

268267
/* Initialize the buffer only once. */
269268
m_packet.mem_realloc(PACKET_MIN_SIZE); // size of the buffer
@@ -364,9 +363,7 @@ void Binlog_sender::cleanup() {
364363
if (m_transmit_started)
365364
(void)RUN_HOOK(binlog_transmit, transmit_stop, (thd, m_flag));
366365

367-
mysql_mutex_lock(&thd->LOCK_thd_data);
368-
thd->current_linfo = nullptr;
369-
mysql_mutex_unlock(&thd->LOCK_thd_data);
366+
mysql_bin_log.unregister_log_info(&m_linfo);
370367

371368
thd->variables.max_allowed_packet =
372369
global_system_variables.max_allowed_packet;
@@ -380,6 +377,7 @@ void Binlog_sender::cleanup() {
380377

381378
void Binlog_sender::run() {
382379
DBUG_TRACE;
380+
383381
init();
384382

385383
unsigned int max_event_size =

sql/rpl_rli.cc

+3-3
Original file line numberDiff line numberDiff line change
@@ -208,9 +208,9 @@ Relay_log_info::Relay_log_info(bool is_slave_recovery,
208208
PSI_NOT_INSTRUMENTED, key_RELAYLOG_LOCK_log,
209209
key_RELAYLOG_LOCK_log_end_pos, key_RELAYLOG_LOCK_sync,
210210
PSI_NOT_INSTRUMENTED, key_RELAYLOG_LOCK_xids, PSI_NOT_INSTRUMENTED,
211-
PSI_NOT_INSTRUMENTED, PSI_NOT_INSTRUMENTED, key_RELAYLOG_update_cond,
212-
PSI_NOT_INSTRUMENTED, PSI_NOT_INSTRUMENTED, key_file_relaylog,
213-
key_file_relaylog_index, key_file_relaylog_cache,
211+
PSI_NOT_INSTRUMENTED, PSI_NOT_INSTRUMENTED, PSI_NOT_INSTRUMENTED,
212+
key_RELAYLOG_update_cond, PSI_NOT_INSTRUMENTED, PSI_NOT_INSTRUMENTED,
213+
key_file_relaylog, key_file_relaylog_index, key_file_relaylog_cache,
214214
key_file_relaylog_index_cache);
215215
#endif
216216

sql/sql_class.cc

-1
Original file line numberDiff line numberDiff line change
@@ -739,7 +739,6 @@ THD::THD(bool enable_plugins)
739739
lex->thd = nullptr;
740740
lex->set_current_query_block(nullptr);
741741
m_lock_usec = 0L;
742-
current_linfo = nullptr;
743742
slave_thread = false;
744743
memset(&variables, 0, sizeof(variables));
745744
m_thread_id = Global_THD_manager::reserved_thread_id;

sql/sql_class.h

-6
Original file line numberDiff line numberDiff line change
@@ -2729,12 +2729,6 @@ class THD : public MDL_context_owner,
27292729
/** number of name_const() substitutions, see sp_head.cc:subst_spvars() */
27302730
uint query_name_consts;
27312731

2732-
/*
2733-
If we do a purge of binary logs, log index info of the threads
2734-
that are currently reading it needs to be adjusted. To do that
2735-
each thread that is using LOG_INFO needs to adjust the pointer to it
2736-
*/
2737-
LOG_INFO *current_linfo;
27382732
/* Used by the sys_var class to store temporary values */
27392733
union {
27402734
bool bool_value;

0 commit comments

Comments
 (0)