Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 49 additions & 51 deletions src/catalog.c
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,9 @@ write_backup_status(pgBackup *backup, BackupStatus status,
*
* TODO: lock-timeout as parameter
* TODO: we must think about more fine grain unlock mechanism - separate unlock_backup() function.
* TODO: more accurate naming
 * -> exclusive lock -> acquire HW_LATCH and wait until all LW_LATCHes are clear
* -> shared lock -> acquire HW_LATCH, acquire LW_LATCH, release HW_LATCH
*/
bool
lock_backup(pgBackup *backup, bool strict, bool exclusive)
Expand Down Expand Up @@ -205,7 +208,7 @@ lock_backup(pgBackup *backup, bool strict, bool exclusive)
{
/* release exclusive lock */
if (fio_unlink(lock_file, FIO_BACKUP_HOST) < 0)
elog(ERROR, "Could not remove old lock file \"%s\": %s",
elog(ERROR, "Could not remove exclusive lock file \"%s\": %s",
lock_file, strerror(errno));

/* we are done */
Expand Down Expand Up @@ -261,48 +264,16 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
int fd = 0;
char buffer[MAXPGPATH * 2 + 256];
int ntries = LOCK_TIMEOUT;
int log_freq = ntries / 5;
int empty_tries = LOCK_STALE_TIMEOUT;
int len;
int encoded_pid;
pid_t my_p_pid;

join_path_components(lock_file, backup->root_dir, BACKUP_LOCK_FILE);

/*
* TODO: is this stuff with ppid below is relevant for us ?
*
* If the PID in the lockfile is our own PID or our parent's or
* grandparent's PID, then the file must be stale (probably left over from
* a previous system boot cycle). We need to check this because of the
* likelihood that a reboot will assign exactly the same PID as we had in
* the previous reboot, or one that's only one or two counts larger and
* hence the lockfile's PID now refers to an ancestor shell process. We
* allow pg_ctl to pass down its parent shell PID (our grandparent PID)
* via the environment variable PG_GRANDPARENT_PID; this is so that
* launching the postmaster via pg_ctl can be just as reliable as
* launching it directly. There is no provision for detecting
* further-removed ancestor processes, but if the init script is written
* carefully then all but the immediate parent shell will be root-owned
* processes and so the kill test will fail with EPERM. Note that we
* cannot get a false negative this way, because an existing postmaster
* would surely never launch a competing postmaster or pg_ctl process
* directly.
*/
#ifndef WIN32
my_p_pid = getppid();
#else

/*
* Windows hasn't got getppid(), but doesn't need it since it's not using
* real kill() either...
*/
my_p_pid = 0;
#endif

/*
* We need a loop here because of race conditions. But don't loop forever
* (for example, a non-writable $backup_instance_path directory might cause a failure
* that won't go away). 100 tries seems like plenty.
* that won't go away).
*/
do
{
Expand Down Expand Up @@ -351,13 +322,38 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
fclose(fp_out);

/*
* It should be possible only as a result of system crash,
* so its hypothetical owner should be dead by now
* There are several possible reasons for lock file
* to be empty:
* - system crash
* - process crash
* - race between writer and reader
*
* Consider empty file to be stale after LOCK_STALE_TIMEOUT attempts.
*
* TODO: alternatively we can write into temp file (lock_file_%pid),
* rename it and then re-read the lock file to make sure
* that we have successfully acquired the lock.
*/
if (len == 0)
{
elog(WARNING, "Lock file \"%s\" is empty", lock_file);
goto grab_lock;
if (empty_tries == 0)
{
elog(WARNING, "Lock file \"%s\" is empty", lock_file);
goto grab_lock;
}

if ((empty_tries % LOG_FREQ) == 0)
elog(WARNING, "Waiting %u seconds on empty exclusive lock for backup %s",
empty_tries, base36enc(backup->start_time));

sleep(1);
/*
* waiting on empty lock file should not affect
* the timer for concurrent lockers (ntries).
*/
empty_tries--;
ntries++;
continue;
}

encoded_pid = atoi(buffer);
Expand All @@ -371,24 +367,23 @@ lock_backup_exclusive(pgBackup *backup, bool strict)

/*
* Check to see if the other process still exists
*
* Per discussion above, my_pid, my_p_pid can be
* ignored as false matches.
*
* Normally kill() will fail with ESRCH if the given PID doesn't
* exist.
*/
if (encoded_pid != my_pid && encoded_pid != my_p_pid)
if (encoded_pid == my_pid)
return 0;
else
{
if (kill(encoded_pid, 0) == 0)
{
/* complain every fifth interval */
if ((ntries % log_freq) == 0)
if ((ntries % LOG_FREQ) == 0)
{
elog(WARNING, "Process %d is using backup %s, and is still running",
encoded_pid, base36enc(backup->start_time));

elog(WARNING, "Waiting %u seconds on lock for backup %s", ntries, base36enc(backup->start_time));
elog(WARNING, "Waiting %u seconds on exclusive lock for backup %s",
ntries, base36enc(backup->start_time));
}

sleep(1);
Expand Down Expand Up @@ -435,7 +430,7 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
errno = 0;
if (fio_write(fd, buffer, strlen(buffer)) != strlen(buffer))
{
int save_errno = errno;
int save_errno = errno;

fio_close(fd);
fio_unlink(lock_file, FIO_BACKUP_HOST);
Expand All @@ -453,7 +448,7 @@ lock_backup_exclusive(pgBackup *backup, bool strict)

if (fio_flush(fd) != 0)
{
int save_errno = errno;
int save_errno = errno;

fio_close(fd);
fio_unlink(lock_file, FIO_BACKUP_HOST);
Expand All @@ -471,7 +466,7 @@ lock_backup_exclusive(pgBackup *backup, bool strict)

if (fio_close(fd) != 0)
{
int save_errno = errno;
int save_errno = errno;

fio_unlink(lock_file, FIO_BACKUP_HOST);

Expand All @@ -482,6 +477,10 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
lock_file, strerror(save_errno));
}

// elog(LOG, "Acquired exclusive lock for backup %s after %ds",
// base36enc(backup->start_time),
// LOCK_TIMEOUT - ntries + LOCK_STALE_TIMEOUT - empty_tries);

return 0;
}

Expand All @@ -493,7 +492,6 @@ wait_read_only_owners(pgBackup *backup)
char buffer[256];
pid_t encoded_pid;
int ntries = LOCK_TIMEOUT;
int log_freq = ntries / 5;
char lock_file[MAXPGPATH];

join_path_components(lock_file, backup->root_dir, BACKUP_RO_LOCK_FILE);
Expand Down Expand Up @@ -523,7 +521,7 @@ wait_read_only_owners(pgBackup *backup)
{
if (kill(encoded_pid, 0) == 0)
{
if ((ntries % log_freq) == 0)
if ((ntries % LOG_FREQ) == 0)
{
elog(WARNING, "Process %d is using backup %s in read only mode, and is still running",
encoded_pid, base36enc(backup->start_time));
Expand Down
4 changes: 3 additions & 1 deletion src/pg_probackup.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,9 @@ extern const char *PROGRAM_EMAIL;
/* Timeout defaults */
#define ARCHIVE_TIMEOUT_DEFAULT 300
#define REPLICA_TIMEOUT_DEFAULT 300
#define LOCK_TIMEOUT 30
#define LOCK_TIMEOUT 60
#define LOCK_STALE_TIMEOUT 30
#define LOG_FREQ 10

/* Directory/File permission */
#define DIR_PERMISSION (0700)
Expand Down
14 changes: 7 additions & 7 deletions tests/helpers/ptrack_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -757,7 +757,7 @@ def run_pb(self, command, asynchronous=False, gdb=False, old_binary=False, retur
return GDBobj([binary_path] + command, self.verbose)
if asynchronous:
return subprocess.Popen(
self.cmd,
[binary_path] + command,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=env
Expand Down Expand Up @@ -1133,8 +1133,8 @@ def show_archive(
exit(1)

def validate_pb(
self, backup_dir, instance=None,
backup_id=None, options=[], old_binary=False, gdb=False
self, backup_dir, instance=None, backup_id=None,
options=[], old_binary=False, gdb=False, asynchronous=False
):

cmd_list = [
Expand All @@ -1146,11 +1146,11 @@ def validate_pb(
if backup_id:
cmd_list += ['-i', backup_id]

return self.run_pb(cmd_list + options, old_binary=old_binary, gdb=gdb)
return self.run_pb(cmd_list + options, old_binary=old_binary, gdb=gdb, asynchronous=asynchronous)

def delete_pb(
self, backup_dir, instance,
backup_id=None, options=[], old_binary=False, gdb=False):
self, backup_dir, instance, backup_id=None,
options=[], old_binary=False, gdb=False, asynchronous=False):
cmd_list = [
'delete',
'-B', backup_dir
Expand All @@ -1160,7 +1160,7 @@ def delete_pb(
if backup_id:
cmd_list += ['-i', backup_id]

return self.run_pb(cmd_list + options, old_binary=old_binary, gdb=gdb)
return self.run_pb(cmd_list + options, old_binary=old_binary, gdb=gdb, asynchronous=asynchronous)

def delete_expired(
self, backup_dir, instance, options=[], old_binary=False):
Expand Down
46 changes: 46 additions & 0 deletions tests/locking.py
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,52 @@ def test_backup_directory_name(self):
# Clean after yourself
self.del_test_dir(module_name, fname)

def test_empty_lock_file(self):
    """
    An empty exclusive lock file (left behind by a crash or by a
    writer/reader race) must not permanently block validation: the
    locker should wait out the stale-lock timeout and then reclaim it.

    https://github.com/postgrespro/pg_probackup/issues/308
    """
    test_name = self.id().split('.')[3]
    node = self.make_simple_node(
        base_dir=os.path.join(module_name, test_name, 'node'),
        initdb_params=['--data-checksums'])

    backup_dir = os.path.join(self.tmp_path, module_name, test_name, 'backup')
    self.init_pb(backup_dir)
    self.add_instance(backup_dir, 'node', node)
    self.set_archiving(backup_dir, 'node', node)
    node.slow_start()

    # Load the instance with some data so the backup is non-trivial.
    node.pgbench_init(scale=100)

    # Take a FULL backup; this creates the backup directory that
    # holds the exclusive lock file.
    backup_id = self.backup_node(backup_dir, 'node', node)

    # Simulate a crashed lock holder: truncate backup.pid to zero bytes.
    lock_path = os.path.join(backup_dir, 'backups', 'node', backup_id, 'backup.pid')
    with open(lock_path, "w+") as f:
        f.truncate()

    # Validation should stall on the empty lock, warn, and then proceed.
    output = self.validate_pb(backup_dir, 'node', backup_id)
    self.assertIn(
        "Waiting 30 seconds on empty exclusive lock for backup", output)

    # TODO(review): also cover concurrent validate + delete racing over an
    # empty lock file (asynchronous=True with per-process log files), once
    # the async runner output capture is reliable.

    # Clean after yourself
    self.del_test_dir(module_name, test_name)

# TODO:
# test that concurrent validation and restore are not locking each other
# check that quick exclusive lock, when taking RO-lock, is really quick