Skip to content

Commit d4be490

Browse files
author
Mikael Ronstrom
committed
BUG#19795152: Wait for LCP start at node restart, add node recovery status table as well
1 parent 3e82905 commit d4be490

22 files changed

+2952
-280
lines changed

mysql-test/suite/ndb/r/ndbinfo.result

+33-2
Original file line numberDiff line numberDiff line change
@@ -65,9 +65,10 @@ table_id table_name comment
6565
18 disk_write_speed_base Actual speed of disk writes per LDM thread, base data
6666
19 disk_write_speed_aggregate Actual speed of disk writes per LDM thread, aggregate data
6767
20 frag_operations Per fragment operational information
68+
21 restart_info Times of restart phases in seconds and current state
6869
SELECT COUNT(*) FROM ndb$tables;
6970
COUNT(*)
70-
21
71+
22
7172
SELECT * FROM ndb$tables WHERE table_id = 2;
7273
table_id table_name comment
7374
2 test for testing
@@ -88,11 +89,12 @@ table_id table_name comment
8889
18 disk_write_speed_base Actual speed of disk writes per LDM thread, base data
8990
19 disk_write_speed_aggregate Actual speed of disk writes per LDM thread, aggregate data
9091
20 frag_operations Per fragment operational information
92+
21 restart_info Times of restart phases in seconds and current state
9193
SELECT * FROM ndb$tables WHERE table_name = 'LOGDESTINATION';
9294
table_id table_name comment
9395
SELECT COUNT(*) FROM ndb$tables t1, ndb$tables t2 WHERE t1.table_id = t1.table_id;
9496
COUNT(*)
95-
441
97+
484
9698

9799
SELECT table_id, table_name, comment from ndb$tables
98100
WHERE table_id > 2 AND table_id <= 5 ORDER BY table_id;
@@ -120,6 +122,7 @@ table_id table_name
120122
14 operations
121123
3 pools
122124
7 resources
125+
21 restart_info
123126
0 tables
124127
2 test
125128
11 threadblocks
@@ -182,6 +185,7 @@ table_id
182185
18
183186
19
184187
20
188+
21
185189

186190
TRUNCATE ndb$tables;
187191
ERROR HY000: Table 'ndb$tables' is read only
@@ -461,6 +465,33 @@ select count(*) from disk_write_speed_aggregate_node;
461465
count(*)
462466
2
463467

468+
desc restart_info;
469+
Field Type Null Key Default Extra
470+
node_id int(10) unsigned YES NULL
471+
node_restart_status varchar(256) YES NULL
472+
node_restart_status_int int(10) unsigned YES NULL
473+
secs_to_complete_node_failure int(10) unsigned YES NULL
474+
secs_to_allocate_node_id int(10) unsigned YES NULL
475+
secs_to_include_in_heartbeat_protocol int(10) unsigned YES NULL
476+
secs_until_wait_for_ndbcntr_master int(10) unsigned YES NULL
477+
secs_wait_for_ndbcntr_master int(10) unsigned YES NULL
478+
secs_to_get_start_permitted int(10) unsigned YES NULL
479+
secs_to_wait_for_lcp_for_copy_meta_data int(10) unsigned YES NULL
480+
secs_to_copy_meta_data int(10) unsigned YES NULL
481+
secs_to_include_node int(10) unsigned YES NULL
482+
secs_starting_node_to_request_local_recovery int(10) unsigned YES NULL
483+
secs_for_local_recovery int(10) unsigned YES NULL
484+
secs_restore_fragments int(10) unsigned YES NULL
485+
secs_undo_disk_data int(10) unsigned YES NULL
486+
secs_exec_redo_log int(10) unsigned YES NULL
487+
secs_index_rebuild int(10) unsigned YES NULL
488+
secs_to_synchronize_starting_node int(10) unsigned YES NULL
489+
secs_wait_lcp_for_restart int(10) unsigned YES NULL
490+
secs_wait_subscription_handover int(10) unsigned YES NULL
491+
total_restart_secs int(10) unsigned YES NULL
492+
select * from restart_info;
493+
node_id node_restart_status node_restart_status_int secs_to_complete_node_failure secs_to_allocate_node_id secs_to_include_in_heartbeat_protocol secs_until_wait_for_ndbcntr_master secs_wait_for_ndbcntr_master secs_to_get_start_permitted secs_to_wait_for_lcp_for_copy_meta_data secs_to_copy_meta_data secs_to_include_node secs_starting_node_to_request_local_recovery secs_for_local_recovery secs_restore_fragments secs_undo_disk_data secs_exec_redo_log secs_index_rebuild secs_to_synchronize_starting_node secs_wait_lcp_for_restart secs_wait_subscription_handover total_restart_secs
494+
464495
desc cluster_transactions;
465496
Field Type Null Key Default Extra
466497
node_id int(10) unsigned YES NULL

mysql-test/suite/ndb/t/ndbinfo.test

+3
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,9 @@ select node_id from disk_write_speed_base group by node_id, thr_no;
232232

233233
select count(*) from disk_write_speed_aggregate_node;
234234

235+
desc restart_info;
236+
select * from restart_info;
237+
235238
desc cluster_transactions;
236239
desc server_transactions;
237240
desc cluster_operations;

scripts/mysql_system_tables.sql

+22
Original file line numberDiff line numberDiff line change
@@ -1803,6 +1803,11 @@ PREPARE stmt FROM @str;
18031803
EXECUTE stmt;
18041804
DROP PREPARE stmt;
18051805

1806+
SET @str=IF(@have_ndbinfo,'DROP VIEW IF EXISTS `ndbinfo`.`restart_info`','SET @dummy = 0');
1807+
PREPARE stmt FROM @str;
1808+
EXECUTE stmt;
1809+
DROP PREPARE stmt;
1810+
18061811
SET @str=IF(@have_ndbinfo,'DROP VIEW IF EXISTS `ndbinfo`.`cluster_transactions`','SET @dummy = 0');
18071812
PREPARE stmt FROM @str;
18081813
EXECUTE stmt;
@@ -2039,6 +2044,17 @@ PREPARE stmt FROM @str;
20392044
EXECUTE stmt;
20402045
DROP PREPARE stmt;
20412046

2047+
# ndbinfo.ndb$restart_info
2048+
SET @str=IF(@have_ndbinfo,'DROP TABLE IF EXISTS `ndbinfo`.`ndb$restart_info`','SET @dummy = 0');
2049+
PREPARE stmt FROM @str;
2050+
EXECUTE stmt;
2051+
DROP PREPARE stmt;
2052+
2053+
SET @str=IF(@have_ndbinfo,'CREATE TABLE `ndbinfo`.`ndb$restart_info` (`node_id` INT UNSIGNED COMMENT "node id",`node_restart_status` VARCHAR(256) COMMENT "Current state of node recovery",`node_restart_status_int` INT UNSIGNED COMMENT "Current state of node recovery as number",`secs_to_complete_node_failure` INT UNSIGNED COMMENT "Seconds to complete node failure handling",`secs_to_allocate_node_id` INT UNSIGNED COMMENT "Seconds from node failure completion to allocation of node id", `secs_to_include_in_heartbeat_protocol` INT UNSIGNED COMMENT "Seconds from allocation of node id to inclusion in HB protocol",`secs_until_wait_for_ndbcntr_master` INT UNSIGNED COMMENT "Seconds from included in HB protocol until we wait for ndbcntr master",`secs_wait_for_ndbcntr_master` INT UNSIGNED COMMENT "Seconds we waited for being accepted by NDBCNTR master to start",`secs_to_get_start_permitted` INT UNSIGNED COMMENT "Seconds from permit by master until all nodes accepted our start",`secs_to_wait_for_lcp_for_copy_meta_data` INT UNSIGNED COMMENT "Seconds waiting for LCP completion before copying meta data",`secs_to_copy_meta_data` INT UNSIGNED COMMENT "Seconds to copy meta data to starting node from master",`secs_to_include_node` INT UNSIGNED COMMENT "Seconds to wait for GCP and inclusion of all nodes into protocols",`secs_starting_node_to_request_local_recovery` INT UNSIGNED COMMENT "Seconds for starting node to request local recovery",`secs_for_local_recovery` INT UNSIGNED COMMENT "Seconds for local recovery in starting node",`secs_restore_fragments` INT UNSIGNED COMMENT "Seconds to restore fragments from LCP files",`secs_undo_disk_data` INT UNSIGNED COMMENT "Seconds to execute UNDO log on disk data part of records",`secs_exec_redo_log` INT UNSIGNED COMMENT "Seconds to execute REDO log on all restored fragments",`secs_index_rebuild` INT UNSIGNED COMMENT "Seconds to rebuild indexes on restored fragments",`secs_to_synchronize_starting_node` INT UNSIGNED COMMENT "Seconds to synchronize starting node from live nodes",`secs_wait_lcp_for_restart` INT UNSIGNED COMMENT "Seconds to wait for LCP start and completion before restart is completed",`secs_wait_subscription_handover` INT UNSIGNED COMMENT "Seconds waiting for handover of replication subscriptions",`total_restart_secs` INT UNSIGNED COMMENT "Total number of seconds from node failure until node is started again") COMMENT="Times of restart phases in seconds and current state" ENGINE=NDBINFO','SET @dummy = 0');
2054+
PREPARE stmt FROM @str;
2055+
EXECUTE stmt;
2056+
DROP PREPARE stmt;
2057+
20422058
# ndbinfo.ndb$transactions
20432059
SET @str=IF(@have_ndbinfo,'DROP TABLE IF EXISTS `ndbinfo`.`ndb$transactions`','SET @dummy = 0');
20442060
PREPARE stmt FROM @str;
@@ -2313,6 +2329,12 @@ PREPARE stmt FROM @str;
23132329
EXECUTE stmt;
23142330
DROP PREPARE stmt;
23152331

2332+
# ndbinfo.restart_info
2333+
SET @str=IF(@have_ndbinfo,'CREATE OR REPLACE DEFINER=`root`@`localhost` SQL SECURITY INVOKER VIEW `ndbinfo`.`restart_info` AS SELECT * from `ndbinfo`.`ndb$restart_info`','SET @dummy = 0');
2334+
PREPARE stmt FROM @str;
2335+
EXECUTE stmt;
2336+
DROP PREPARE stmt;
2337+
23162338
# ndbinfo.cluster_transactions
23172339
SET @str=IF(@have_ndbinfo,'CREATE OR REPLACE DEFINER=`root`@`localhost` SQL SECURITY INVOKER VIEW `ndbinfo`.`cluster_transactions` AS SELECT t.node_id, t.block_instance, t.transid0 + (t.transid1 << 32) as transid, s.state_friendly_name as state, t.c_ops as count_operations, t.outstanding as outstanding_operations, t.timer as inactive_seconds, (t.apiref & 65535) as client_node_id, (t.apiref >> 16) as client_block_ref FROM `ndbinfo`.`ndb$transactions` t LEFT JOIN `ndbinfo`.`ndb$dbtc_apiconnect_state` s ON s.state_int_value = t.state','SET @dummy = 0');
23182340
PREPARE stmt FROM @str;

storage/ndb/include/kernel/GlobalSignalNumbers.h

+10-3
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
*
2525
* When adding a new signal, remember to update MAX_GSN and SignalNames.cpp
2626
*/
27-
const GlobalSignalNumber MAX_GSN = 769;
27+
const GlobalSignalNumber MAX_GSN = 778;
2828

2929
struct GsnName {
3030
GlobalSignalNumber gsn;
@@ -1093,6 +1093,13 @@ extern const GlobalSignalNumber NO_OF_SIGNAL_NAMES;
10931093

10941094
#define GSN_NODE_STARTED_REP 769
10951095

1096-
#define GSN_CHECK_NODE_INCLUDED_REQ 770
1097-
#define GSN_CHECK_NODE_INCLUDED_CONF 771
1096+
#define GSN_ALLOC_NODEID_REP 770
1097+
#define GSN_INCL_NODE_HB_PROTOCOL_REP 771
1098+
#define GSN_NDBCNTR_START_WAIT_REP 772
1099+
#define GSN_NDBCNTR_STARTED_REP 773
1100+
#define GSN_SUMA_HANDOVER_COMPLETE_REP 774
1101+
#define GSN_END_TOREP 775
1102+
#define GSN_LOCAL_RECOVERY_COMP_REP 776
1103+
#define GSN_CHECK_NODE_INCLUDED_REQ 777
1104+
#define GSN_CHECK_NODE_INCLUDED_CONF 778
10981105
#endif

storage/ndb/include/kernel/signaldata/AllocNodeId.hpp

+11-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2006, 2013, Oracle and/or its affiliates. All rights reserved.
2+
Copyright (c) 2006, 2014, Oracle and/or its affiliates. All rights reserved.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License as published by
@@ -76,6 +76,16 @@ class AllocNodeIdRef {
7676
Uint32 masterRef;
7777
};
7878

79+
class AllocNodeIdRep
80+
{
81+
private:
82+
friend class Dbdih;
83+
friend class Qmgr;
84+
STATIC_CONST( SignalLength = 1);
85+
86+
Uint32 nodeId;
87+
};
88+
7989
#undef JAM_FILE_ID
8090

8191
#endif

storage/ndb/include/kernel/signaldata/DihContinueB.hpp

-1
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,6 @@ class DihContinueB {
7373
ZTO_START_FRAGMENTS = 44
7474
,ZWAIT_OLD_SCAN = 46
7575
,ZLCP_TRY_LOCK = 47
76-
,ZDELAY_RELEASE_FRAGMENT_INFO_MUTEX = 48
7776
,ZTO_START_LOGGING = 49
7877
,ZGET_TABINFO = 50
7978
,ZGET_TABINFO_SEND = 51

storage/ndb/include/kernel/signaldata/TakeOver.hpp

+7
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,13 @@ struct EndToConf
191191
Uint32 startingNodeId;
192192
};
193193

194+
struct EndToRep
195+
{
196+
public:
197+
STATIC_CONST ( SignalLength = 1 );
198+
199+
Uint32 nodeId;
200+
};
194201

195202
#undef JAM_FILE_ID
196203

storage/ndb/include/ndb_version.h.in

+6
Original file line numberDiff line numberDiff line change
@@ -873,6 +873,12 @@ ndbd_multi_tc_instance_takeover(Uint32 x)
873873
*/
874874
#define NDBD_EXTRA_PARALLEL_FRAG_LCP NDB_MAKE_VERSION(7,4,1)
875875

876+
/**
877+
* Support for Node Recovery Status module, including sending
878+
* END_TOREP.
879+
*/
880+
#define NDBD_NODE_RECOVERY_STATUS_VERSION NDB_MAKE_VERSION(7,4,3)
881+
876882
/**
877883
* Support for removing EMPTY_LCP_REQ protocol in master takeover
878884
*/

storage/ndb/src/common/debugger/signaldata/DihContinueB.cpp

-4
Original file line numberDiff line numberDiff line change
@@ -195,10 +195,6 @@ printCONTINUEB_DBDIH(FILE * output, const Uint32 * theData,
195195
fprintf(output, " Lcp trylock: attempt %u\n",
196196
theData[1]);
197197
break;
198-
case DihContinueB::ZDELAY_RELEASE_FRAGMENT_INFO_MUTEX:
199-
fprintf(output, " Delay release fragment-info mutex (handle: 0x%x)",
200-
theData[1]);
201-
break;
202198
default:
203199
fprintf(output, " Default system error lab...\n");
204200
break;

storage/ndb/src/common/debugger/signaldata/SignalNames.cpp

+8
Original file line numberDiff line numberDiff line change
@@ -809,6 +809,14 @@ const GsnName SignalNames [] = {
809809
,{ GSN_BUILD_FK_IMPL_REF, "BUILD_FK_IMPL_REF" }
810810
,{ GSN_BUILD_FK_IMPL_CONF, "BUILD_FK_IMPL_CONF" }
811811
,{ GSN_NODE_STARTED_REP, "NODE_STARTED_REP" }
812+
,{ GSN_ALLOC_NODEID_REP, "ALLOC_NODEID_REP" }
813+
,{ GSN_INCL_NODE_HB_PROTOCOL_REP, "INCL_NODE_HB_PROTOCOL_REP" }
814+
,{ GSN_NDBCNTR_START_WAIT_REP, "NDBCNTR_START_WAIT_REP" }
815+
,{ GSN_NDBCNTR_STARTED_REP, "NDBCNTR_STARTED_REP" }
816+
,{ GSN_SUMA_HANDOVER_COMPLETE_REP, "SUMA_HANDOVER_COMPLETE_REP" }
817+
,{ GSN_END_TOREP, "END_TOREP" }
818+
,{ GSN_LOCAL_RECOVERY_COMP_REP, "LOCAL_RECOVERY_COMP_REP" }
819+
,{ GSN_CANCEL_SUBSCRIPTION_REQ, "CANCEL_SUBSCRIPTION_REQ" }
812820
,{ GSN_CHECK_NODE_INCLUDED_REQ, "CHECK_NODE_INCLUDED_REQ" }
813821
,{ GSN_CHECK_NODE_INCLUDED_CONF, "CHECK_NODE_INCLUDED_CONF" }
814822
};

storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp

+18-2
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#include <AttributeHeader.hpp>
3434
#include <KeyDescriptor.hpp>
3535
#include <Checksum.hpp>
36+
#include <signaldata/NodeRecoveryStatusRep.hpp>
3637
#include <signaldata/DictSchemaInfo.hpp>
3738
#include <signaldata/DictTabInfo.hpp>
3839
#include <signaldata/DropTabFile.hpp>
@@ -20428,9 +20429,24 @@ Dbdict::execDICT_UNLOCK_ORD(Signal* signal)
2042820429
if (ord->lockType == DictLockReq::SumaStartMe ||
2042920430
ord->lockType == DictLockReq::SumaHandOver)
2043020431
{
20432+
Uint32 nodeId = refToNode(ord->senderRef);
2043120433
jam();
20432-
g_eventLogger->info("clearing SumaStartMe dict lock for %u", refToNode(ord->senderRef));
20433-
c_sub_startstop_lock.clear(refToNode(ord->senderRef));
20434+
g_eventLogger->info("clearing SumaStartMe dict lock for %u", nodeId);
20435+
c_sub_startstop_lock.clear(nodeId);
20436+
20437+
if (ord->lockType == DictLockReq::SumaHandOver)
20438+
{
20439+
/**
20440+
* Inform the master DIH that the SUMA handover is now completed; this
20441+
* is the very last phase of the node recovery. This code is only
20442+
* executed in the master node.
20443+
*/
20444+
SumaHandoverCompleteRep *rep =
20445+
(SumaHandoverCompleteRep*)signal->getDataPtrSend();
20446+
rep->nodeId = nodeId;
20447+
EXECUTE_DIRECT(DBDIH, GSN_SUMA_HANDOVER_COMPLETE_REP, signal,
20448+
SumaHandoverCompleteRep::SignalLength);
20449+
}
2043420450
return;
2043520451
}
2043620452

0 commit comments

Comments
 (0)