Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ADBDEV-6936 Avoid reusing timeline for interrupted promotion #1173

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 49 additions & 46 deletions src/backend/access/transam/xlog.c
Original file line number Diff line number Diff line change
Expand Up @@ -905,7 +905,7 @@ static MemoryContext walDebugCxt = NULL;

static void readRecoverySignalFile(void);
static void validateRecoveryParameters(void);
static void exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog);
static void XLogInitNewTimeline(TimeLineID endTLI, XLogRecPtr endOfLog);
static bool recoveryStopsBefore(XLogReaderState *record);
static bool recoveryStopsAfter(XLogReaderState *record);
static void recoveryPausesHere(void);
Expand Down Expand Up @@ -5617,10 +5617,10 @@ validateRecoveryParameters(void)
}

/*
* Exit archive-recovery state
* Initialize the first WAL segment on the new timeline.
*/
static void
exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
XLogInitNewTimeline(TimeLineID endTLI, XLogRecPtr endOfLog)
{
char xlogfname[MAXFNAMELEN];
XLogSegNo endLogSegNo;
Expand All @@ -5629,26 +5629,11 @@ exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
/* we always switch to a new timeline after archive recovery */
Assert(endTLI != ThisTimeLineID);

/*
* We are no longer in archive recovery state.
*/
InArchiveRecovery = false;

/*
* Update min recovery point one last time.
*/
UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);

/*
* If the ending log segment is still open, close it (to avoid problems on
* Windows with trying to rename or delete an open file).
*/
if (readFile >= 0)
{
close(readFile);
readFile = -1;
}

/*
* Calculate the last segment on the old timeline, and the first segment
* on the new timeline. If the switch happens in the middle of a segment,
Expand Down Expand Up @@ -5700,27 +5685,6 @@ exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
*/
XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo, wal_segment_size);
XLogArchiveCleanup(xlogfname);

/*
* Remove the signal files out of the way, so that we don't accidentally
* re-enter archive recovery mode in a subsequent crash.
*/
if (standby_signal_file_found)
durable_unlink(STANDBY_SIGNAL_FILE, FATAL);

if (recovery_signal_file_found)
durable_unlink(RECOVERY_SIGNAL_FILE, FATAL);

/*
* Response to FTS probes after this point will not indicate that we are a
* mirror because the am_mirror flag is set based on existence of
* RECOVERY_COMMAND_FILE. New libpq connections to the postmaster should
* no longer return CAC_MIRROR_READY as response because we are no longer a
* mirror.
*/
ResetMirrorReadyFlag();
ereport(LOG,
(errmsg("archive recovery complete")));
}

/*
Expand Down Expand Up @@ -7848,6 +7812,25 @@ StartupXLOG(void)
record = ReadRecord(xlogreader, LastRec, PANIC, false);
EndOfLog = EndRecPtr;

if (ArchiveRecoveryRequested)
{
/*
* We are no longer in archive recovery state.
*/
Assert(InArchiveRecovery);
InArchiveRecovery = false;

/*
* If the ending log segment is still open, close it (to avoid problems on
* Windows with trying to rename or delete an open file).
*/
if (readFile >= 0)
{
close(readFile);
readFile = -1;
}
}

/*
* EndOfLogTLI is the TLI in the filename of the XLOG segment containing
* the end-of-log. It could be different from the timeline that EndOfLog
Expand Down Expand Up @@ -7924,8 +7907,6 @@ StartupXLOG(void)
char reason[200];
char recoveryPath[MAXPGPATH];

Assert(InArchiveRecovery);

ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
ereport(LOG,
(errmsg("selected new timeline ID: %u", ThisTimeLineID)));
Expand Down Expand Up @@ -7960,12 +7941,11 @@ StartupXLOG(void)
snprintf(reason, sizeof(reason), "no recovery target specified");

/*
* We are now done reading the old WAL. Turn off archive fetching if
* it was active, and make a writable copy of the last WAL segment.
* (Note that we also have a copy of the last block of the old WAL in
* readBuf; we will use that below.)
* Make a writable copy of the last WAL segment. (Note that we also
* have a copy of the last block of the old WAL in readBuf; we will
* use that below.)
*/
exitArchiveRecovery(EndOfLogTLI, EndOfLog);
XLogInitNewTimeline(EndOfLogTLI, EndOfLog);

/*
* Write the timeline history file, and have it archived. After this
Expand Down Expand Up @@ -8080,6 +8060,8 @@ StartupXLOG(void)
UpdateFullPageWrites();
LocalXLogInsertAllowed = -1;

SIMPLE_FAULT_INJECTOR("before_persisting_new_tli");

if (InRecovery)
{
/*
Expand Down Expand Up @@ -8137,6 +8119,27 @@ StartupXLOG(void)

if (ArchiveRecoveryRequested)
{
/*
* Remove the signal files, so that we don't accidentally re-enter
* archive recovery mode after a subsequent crash.
*/
if (standby_signal_file_found)
durable_unlink(STANDBY_SIGNAL_FILE, FATAL);

if (recovery_signal_file_found)
durable_unlink(RECOVERY_SIGNAL_FILE, FATAL);

/*
* Response to FTS probes after this point will not indicate that we are a
* mirror because the am_mirror flag is set based on existence of
* RECOVERY_COMMAND_FILE. New libpq connections to the postmaster should
* no longer return CAC_MIRROR_READY as response because we are no longer a
* mirror.
*/
ResetMirrorReadyFlag();
ereport(LOG,
(errmsg("archive recovery complete")));

/*
* And finally, execute the recovery_end_command, if any.
*/
Expand Down
7 changes: 7 additions & 0 deletions src/test/isolation2/sql/fts_doublefault.sql
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ select gp_request_fts_probe_scan();
-1U: create table last_timestamp as select time from gp_configuration_history order by time desc limit 1;

-- stop primary in order to promote mirror for content 0
-- start_ignore
create EXTENSION if not exists gp_inject_fault;
SELECT gp_inject_fault('before_persisting_new_tli', 'suspend', 5);
-- end_ignore
select pg_ctl((select datadir from gp_segment_configuration c
where c.role='p' and c.content=0), 'stop');

Expand All @@ -34,6 +38,9 @@ select gp_request_fts_probe_scan();
-- primary is down, and mirror has now been promoted to primary. Verify
-1U: select wait_until_segments_are_down(1);
-1U: select dbid, description from gp_configuration_history where time > (select time from last_timestamp) order by time;
-- start_ignore
select pg_sleep(5);
-- end_ignore

-- stop acting primary in order to trigger double fault for content 0
select pg_ctl((select datadir from gp_segment_configuration c
Expand Down
Loading