Skip to content

Commit fac6a27

Browse files
committed
This is a tentative fix for pgxc_ctl to deal with the following
failures in the current Postgres-XC: 1) pg_basebackup does not work with a coordinator. 2) ALTER NODE tries to connect to the pooler, which should not happen. Both were caused by an unnecessary call to the pooler. The fix for that will be provided in a separate patch.
1 parent 2c0d2fd commit fac6a27

File tree

2 files changed

+168
-17
lines changed

2 files changed

+168
-17
lines changed

contrib/pgxc_ctl/coord_cmd.c

Lines changed: 161 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,25 @@ int init_coordinator_slave_all(void)
230230

231231
cmd_t *prepare_initCoordinatorSlave(char *nodeName)
232232
{
233-
cmd_t *cmd, *cmdBuildDir, *cmdStartMaster, *cmdBaseBkup, *cmdRecoveryConf, *cmdPgConf;
233+
cmd_t *cmd,
234+
*cmdBuildDir,
235+
*cmdStartMaster,
236+
#if 0
237+
*cmdBaseBkup,
238+
#else
239+
/*
240+
* As of PostgreSQL 9.3 or later, pg_basebackup does not work with coordinator,
241+
* because each coordinator backend needs dbname to read pgxc_node info into
242+
* cache and pg_basebackup does not specify the database name.
243+
* The following uses more primitive means to use pg_start_backup() and pg_stop_backup().
244+
*/
245+
*cmdStartBkup,
246+
*cmdBuildAndSendTar,
247+
*cmdUntar,
248+
*cmdStopBkup,
249+
#endif
250+
*cmdRecoveryConf,
251+
*cmdPgConf;
234252
int idx;
235253
FILE *f;
236254
char localStdin[MAXPATH+1];
@@ -268,11 +286,67 @@ cmd_t *prepare_initCoordinatorSlave(char *nodeName)
268286
/*
269287
* Obtain base backup of the master
270288
*/
289+
#if 0
271290
appendCmdEl(cmdBuildDir, (cmdBaseBkup = initCmd(aval(VAR_coordSlaveServers)[idx])));
272291
snprintf(newCommand(cmdBaseBkup), MAXLINE,
273292
"pg_basebackup -p %s -h %s -D %s -x",
274293
aval(VAR_coordPorts)[idx], aval(VAR_coordMasterServers)[idx], aval(VAR_coordSlaveDirs)[idx]);
275-
294+
#else
295+
/*
296+
* As of PostgreSQL-9.3 or later, pg_basebackup does not run with coordinators, because each coordinator
297+
* needs database name to read pgxc_node info into the cache and pg_basebackup does not specify this.
298+
* Current workaround is to use more primitive pg_start_backup() and pg_stop_backup().
299+
*/
300+
/* Start backup */
301+
appendCmdEl(cmdBuildDir, (cmdStartBkup = initCmd(aval(VAR_coordMasterServers)[idx])));
302+
/*
303+
* Here, we specify "quick and spike" CHECKPOINT because it is coordinator and we do not expect
304+
* much updating transactions against coordinators.
305+
*/
306+
snprintf(newCommand(cmdStartBkup), MAXLINE,
307+
"psql -h localhost -p %s postgres",
308+
aval(VAR_coordPorts)[idx]);
309+
if ((f = prepareLocalStdin((cmdStartBkup->localStdin = Malloc(MAXPATH+1)), MAXPATH, NULL)) == NULL)
310+
{
311+
cleanCmd(cmd);
312+
return(NULL);
313+
}
314+
fprintf(f,
315+
"select pg_start_backup('%s', true);\n\\q\n",
316+
nodeName);
317+
fclose(f);
318+
/* Build tar and send it */
319+
appendCmdEl(cmdBuildDir, (cmdBuildAndSendTar = initCmd(aval(VAR_coordMasterServers)[idx])));
320+
snprintf(newCommand(cmdBuildAndSendTar), MAXLINE,
321+
"rm -f %s/%s.tgz;" /* We remove this just in case the file does not have write privilege */
322+
"cd %s;"
323+
"tar czf %s/%s.tgz . ;"
324+
"scp %s/%s.tgz %s@%s:%s;"
325+
"rm -f %s/%s.tgz",
326+
sval(VAR_tmpDir), nodeName,
327+
aval(VAR_coordMasterDirs)[idx],
328+
sval(VAR_tmpDir), nodeName,
329+
sval(VAR_tmpDir), nodeName, sval(VAR_pgxcUser), aval(VAR_coordSlaveServers)[idx], sval(VAR_tmpDir),
330+
sval(VAR_tmpDir), nodeName);
331+
/* Stop backup */
332+
appendCmdEl(cmdBuildDir, (cmdStopBkup = initCmd(aval(VAR_coordMasterServers)[idx])));
333+
snprintf(newCommand(cmdStopBkup), MAXLINE,
334+
"psql -h localhost -p %s postgres -c 'select pg_stop_backup()'",
335+
aval(VAR_coordPorts)[idx]);
336+
/* Untar */
337+
appendCmdEl(cmdBuildDir, (cmdUntar = initCmd(aval(VAR_coordSlaveServers)[idx])));
338+
snprintf(newCommand(cmdUntar), MAXLINE,
339+
"rm -rf %s;"
340+
"mkdir -p %s;"
341+
"cd %s;"
342+
"tar xzf %s/%s.tgz;"
343+
"rm -rf %s/%s.tgz",
344+
aval(VAR_coordSlaveDirs)[idx],
345+
aval(VAR_coordSlaveDirs)[idx],
346+
aval(VAR_coordSlaveDirs)[idx],
347+
sval(VAR_tmpDir), nodeName,
348+
sval(VAR_tmpDir), nodeName);
349+
#endif
276350
/* Configure recovery.conf file at the slave */
277351
appendCmdEl(cmdBuildDir, (cmdRecoveryConf = initCmd(aval(VAR_coordSlaveServers)[idx])));
278352
if ((f = prepareLocalStdin(localStdin, MAXPATH, NULL)) == NULL)
@@ -940,6 +1014,7 @@ int add_coordinatorMaster(char *name, char *host, int port, int pooler, char *di
9401014
else
9411015
{
9421016
fprintf(f, "ALTER NODE %s WITH (host='%s', PORT=%d);\n", name, host, port);
1017+
fprintf(f, "select pgxc_pool_reload();\n");
9431018
fprintf(f, "\\q\n");
9441019
fclose(f);
9451020
}
@@ -1077,9 +1152,54 @@ int add_coordinatorSlave(char *name, char *host, char *dir, char *archDir)
10771152
"pg_ctl stop -Z coordinator -D %s -m fast", aval(VAR_coordMasterDirs)[idx]);
10781153
doImmediate(aval(VAR_coordMasterServers)[idx], NULL,
10791154
"pg_ctl start -Z coordinator -D %s", aval(VAR_coordMasterDirs)[idx]);
1155+
#if 0
10801156
/* pg_basebackup */
10811157
doImmediate(host, NULL, "pg_basebackup -p %s -h %s -D %s -x",
10821158
aval(VAR_coordPorts)[idx], aval(VAR_coordMasterServers)[idx], dir);
1159+
#else
1160+
/*
1161+
* As of PostgreSQL-9.3 or later, pg_basebackup does not run with coordinators.
1162+
* Now pg_basebackup runs without specifying database name. In each coordinator,
1163+
* we need (at present) database name to load node information into cache.
1164+
* More primitive means (pg_start_backup and pg_stop_backup) works as a work around.
1165+
*/
1166+
/*
1167+
* Start backup
1168+
* we specify quick and spike checkpoint here because this is just after the restart
1169+
* and we expect coordinator is static so there should not be much updates
1170+
*/
1171+
doImmediate(aval(VAR_coordMasterServers)[idx], NULL,
1172+
"psql -h localhost -p %s postgres \"select pg_start_backup\\('%s', true\\)\"",
1173+
aval(VAR_coordPorts)[idx], name);
1174+
/* Build and send it */
1175+
doImmediate(aval(VAR_coordMasterServers)[idx], NULL,
1176+
"rm -f %s/%s.tgz;" /* We remove this just in case the file does not have write privilege */
1177+
"cd %s;"
1178+
"tar czf %s/%s.tgz . ;"
1179+
"scp %s/%s.tgz %s@%s:%s;"
1180+
"rm -f %s/%s.tgz",
1181+
sval(VAR_tmpDir), name,
1182+
aval(VAR_coordMasterDirs)[idx],
1183+
sval(VAR_tmpDir), name,
1184+
sval(VAR_tmpDir), name, sval(VAR_pgxcUser), host, sval(VAR_tmpDir),
1185+
sval(VAR_tmpDir), name);
1186+
/* Stop Backup */
1187+
doImmediate(aval(VAR_coordMasterServers)[idx], NULL,
1188+
"psql -h localhost -p %s postgres -c 'select pg_stop_backup()'",
1189+
aval(VAR_coordPorts)[idx]);
1190+
/* Untar */
1191+
doImmediate(aval(VAR_coordSlaveServers)[idx], NULL,
1192+
"rm -rf %s;"
1193+
"mkdir -p %s;"
1194+
"cd %s;"
1195+
"tar xzf %s/%s.tgz;"
1196+
"rm -rf %s/%s.tgz",
1197+
dir,
1198+
dir,
1199+
dir,
1200+
sval(VAR_tmpDir), name,
1201+
sval(VAR_tmpDir), name);
1202+
#endif
10831203
/* Update the slave configuration with hot standby and port */
10841204
if ((f = pgxc_popen_w(host, "cat >> %s/postgresql.conf", dir)) == NULL)
10851205
{
@@ -1800,23 +1920,48 @@ static int failover_oneCoordinator(int coordIdx)
18001920
aval(VAR_coordNames)[jj]);
18011921
continue;
18021922
}
1803-
if ((f = pgxc_popen_wRaw("psql -p %s -h %s %s %s",
1804-
aval(VAR_coordPorts)[jj],
1805-
aval(VAR_coordMasterServers)[jj],
1806-
sval(VAR_defaultDatabase),
1807-
sval(VAR_pgxcOwner)))
1808-
== NULL)
1923+
if (jj != coordIdx)
18091924
{
1810-
elog(ERROR, "ERROR: failed to start psql for coordinator %s, %s\n", aval(VAR_coordNames)[jj], strerror(errno));
1811-
continue;
1925+
if ((f = pgxc_popen_wRaw("psql -p %s -h %s %s %s",
1926+
aval(VAR_coordPorts)[jj],
1927+
aval(VAR_coordMasterServers)[jj],
1928+
sval(VAR_defaultDatabase),
1929+
sval(VAR_pgxcOwner)))
1930+
== NULL)
1931+
{
1932+
elog(ERROR, "ERROR: failed to start psql for coordinator %s, %s\n", aval(VAR_coordNames)[jj], strerror(errno));
1933+
continue;
1934+
}
1935+
fprintf(f,
1936+
#if 0 /* Now alter node dies not work well in this context. */
1937+
"ALTER NODE %s WITH (HOST='%s', PORT=%s);\n"
1938+
#else
1939+
"DROP NODE %s;\n"
1940+
"CREATE NODE %s WITH (type = coordinator, HOST='%s', PORT=%s);\n"
1941+
#endif
1942+
"select pgxc_pool_reload();\n"
1943+
"\\q\n",
1944+
aval(VAR_coordNames)[coordIdx],
1945+
aval(VAR_coordNames)[coordIdx], aval(VAR_coordMasterServers)[coordIdx], aval(VAR_coordPorts)[coordIdx]);
1946+
fclose(f);
18121947
}
1813-
fprintf(f,
1814-
"ALTER NODE %s WITH (HOST='%s', PORT=%s);\n"
1815-
"select pgxc_pool_reload();\n"
1816-
"\\q\n",
1817-
aval(VAR_coordNames)[coordIdx], aval(VAR_coordMasterServers)[coordIdx], aval(VAR_coordPorts)[coordIdx]);
1818-
fclose(f);
18191948
}
1949+
/* Now update myself */
1950+
if ((f = pgxc_popen_wRaw("psql -p %s -h %s %s %s",
1951+
aval(VAR_coordPorts)[coordIdx],
1952+
aval(VAR_coordMasterServers)[coordIdx],
1953+
sval(VAR_defaultDatabase),
1954+
sval(VAR_pgxcOwner)))
1955+
== NULL)
1956+
{
1957+
elog(ERROR, "ERROR: failed to start psql for coordinator %s, %s\n", aval(VAR_coordNames)[coordIdx], strerror(errno));
1958+
}
1959+
fprintf(f,
1960+
"ALTER NODE %s WITH (HOST='%s', PORT=%s);\n"
1961+
"select pgxc_pool_reload();\n"
1962+
"\\q\n",
1963+
aval(VAR_coordNames)[coordIdx], aval(VAR_coordMasterServers)[coordIdx], aval(VAR_coordPorts)[coordIdx]);
1964+
fclose(f);
18201965
return(rc);
18211966

18221967
# undef checkRc

contrib/pgxc_ctl/datanode_cmd.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ cmd_t *prepare_initDatanodeSlave(char *nodeName)
278278
"#==========================================\n"
279279
"# Added to initialize the slave, %s\n"
280280
"standby_mode = on\n"
281-
"primary_conninfo = 'host = %s port = %sd user = %s application_name = %s'\n"
281+
"primary_conninfo = 'host = %s port = %s user = %s application_name = %s'\n"
282282
"restore_command = 'cp %s/%%f %%p'\n"
283283
"archive_cleanup_command = 'pg_archivecleanup %s %%r'\n",
284284
timeStampString(timestamp, MAXTOKEN),
@@ -803,9 +803,15 @@ static int failover_oneDatanode(int datanodeIdx)
803803
continue;
804804
}
805805
fprintf(f,
806+
#if 0 /* Current alter node does't work well in this context */
806807
"ALTER NODE %s WITH (HOST='%s', PORT=%s);\n"
808+
#else
809+
"DROP NODE %s;\n"
810+
"CREATE NODE %s WITH (type = datanode, HOST='%s', PORT=%s);\n"
811+
#endif
807812
"select pgxc_pool_reload();\n"
808813
"\\q\n",
814+
aval(VAR_datanodeNames)[datanodeIdx],
809815
aval(VAR_datanodeNames)[datanodeIdx], aval(VAR_datanodeMasterServers)[datanodeIdx], aval(VAR_datanodePorts)[datanodeIdx]);
810816
fclose(f);
811817
}

0 commit comments

Comments
 (0)