Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

configurable upper bound on locking attempt timeout #275

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 28 additions & 10 deletions bin/pg_repack.c
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ static bool rebuild_indexes(const repack_table *table);
static char *getstr(PGresult *res, int row, int col);
static Oid getoid(PGresult *res, int row, int col);
static bool advisory_lock(PGconn *conn, const char *relid);
static bool lock_exclusive(PGconn *conn, const char *relid, const char *lock_query, bool start_xact);
static bool lock_exclusive(PGconn *conn, const char *relid, const char *lock_query, bool start_xact, const repack_table *apply_log_table);
static bool kill_ddl(PGconn *conn, Oid relid, bool terminate);
static bool lock_access_share(PGconn *conn, Oid relid, const char *target_name);

Expand All @@ -252,6 +252,7 @@ static bool moveidx = false;
static SimpleStringList r_index = {NULL, NULL};
static bool only_indexes = false;
static int wait_timeout = 60; /* in seconds */
static int max_lock_timeout_msec = 1000;
static int jobs = 0; /* number of concurrent worker conns. */
static bool dryrun = false;
static unsigned int temp_obj_num = 0; /* temporary objects counter */
Expand Down Expand Up @@ -285,6 +286,7 @@ static pgut_option options[] =
{ 'l', 'i', "index", &r_index },
{ 'b', 'x', "only-indexes", &only_indexes },
{ 'i', 'T', "wait-timeout", &wait_timeout },
{ 'i', 'm', "max-lock-timeout", &max_lock_timeout_msec },
{ 'B', 'Z', "no-analyze", &analyze },
{ 'i', 'j', "jobs", &jobs },
{ 'b', 'D', "no-kill-backend", &no_kill_backend },
Expand Down Expand Up @@ -320,6 +322,10 @@ main(int argc, char *argv[])
if (dryrun)
elog(INFO, "Dry run enabled, not executing repack");

if (max_lock_timeout_msec < 1 || max_lock_timeout_msec > 1000)
ereport(ERROR, (errcode(EINVAL),
errmsg("--max-lock-timeout must be between 1 and 1000")));

if (r_index.head || only_indexes)
{
if (r_index.head && table_list.head)
Expand Down Expand Up @@ -1292,7 +1298,7 @@ repack_one_table(repack_table *table, const char *orderby)
if (!advisory_lock(connection, buffer))
goto cleanup;

if (!(lock_exclusive(connection, buffer, table->lock_table, true)))
if (!(lock_exclusive(connection, buffer, table->lock_table, true, NULL)))
{
if (no_kill_backend)
elog(INFO, "Skipping repack %s due to timeout", table->target_name);
Expand Down Expand Up @@ -1615,7 +1621,7 @@ repack_one_table(repack_table *table, const char *orderby)
/* Bump our existing AccessShare lock to AccessExclusive */

if (!(lock_exclusive(conn2, utoa(table->target_oid, buffer),
table->lock_table, false)))
table->lock_table, false, table)))
{
elog(WARNING, "lock_exclusive() failed in conn2 for %s",
table->target_name);
Expand All @@ -1629,7 +1635,7 @@ repack_one_table(repack_table *table, const char *orderby)
printfStringInfo(&sql, "LOCK TABLE repack.table_%u IN ACCESS EXCLUSIVE MODE",
table->target_oid);
if (!(lock_exclusive(conn2, utoa(table->temp_oid, buffer),
sql.data, false)))
sql.data, false, NULL)))
{
elog(WARNING, "lock_exclusive() failed in conn2 for table_%u",
table->target_oid);
Expand All @@ -1648,7 +1654,7 @@ repack_one_table(repack_table *table, const char *orderby)

command("BEGIN ISOLATION LEVEL READ COMMITTED", 0, NULL);
if (!(lock_exclusive(connection, utoa(table->target_oid, buffer),
table->lock_table, false)))
table->lock_table, false, NULL)))
{
elog(WARNING, "lock_exclusive() failed in connection for %s",
table->target_name);
Expand Down Expand Up @@ -1902,9 +1908,14 @@ static bool advisory_lock(PGconn *conn, const char *relid)
* start_xact: whether we will issue a BEGIN ourselves. If not, we will
* use a SAVEPOINT and ROLLBACK TO SAVEPOINT if our query
* times out, to avoid leaving the transaction in error state.
* apply_log_table: if non-NULL, call apply_log() on this table between
* locking attempts. This prevents lots of log from
* building up if getting the exclusive lock takes many
* retries over a long period.
*/
static bool
lock_exclusive(PGconn *conn, const char *relid, const char *lock_query, bool start_xact)
lock_exclusive(PGconn *conn, const char *relid, const char *lock_query,
bool start_xact, const repack_table *apply_log_table)
{
time_t start = time(NULL);
int i;
Expand All @@ -1916,6 +1927,7 @@ lock_exclusive(PGconn *conn, const char *relid, const char *lock_query, bool sta
char sql[1024];
PGresult *res;
int wait_msec;
int timeout_msec;

if (start_xact)
pgut_command(conn, "BEGIN ISOLATION LEVEL READ COMMITTED", 0, NULL);
Expand Down Expand Up @@ -1964,7 +1976,8 @@ lock_exclusive(PGconn *conn, const char *relid, const char *lock_query, bool sta

/* wait for a while to lock the table. */
wait_msec = Min(1000, i * 100);
snprintf(sql, lengthof(sql), "SET LOCAL lock_timeout = %d", wait_msec);
timeout_msec = Min(wait_msec, max_lock_timeout_msec);
snprintf(sql, lengthof(sql), "SET LOCAL lock_timeout = %d", timeout_msec);
pgut_command(conn, sql, 0, NULL);

res = pgut_execute_elevel(conn, lock_query, 0, NULL, DEBUG2);
Expand All @@ -1981,6 +1994,10 @@ lock_exclusive(PGconn *conn, const char *relid, const char *lock_query, bool sta
pgut_rollback(conn);
else
pgut_command(conn, "ROLLBACK TO SAVEPOINT repack_sp1", 0, NULL);
if (timeout_msec < wait_msec)
usleep(1000 * (wait_msec - timeout_msec));
if (apply_log_table)
apply_log(conn, apply_log_table, 0);
ncleaton marked this conversation as resolved.
Show resolved Hide resolved
continue;
}
else
Expand Down Expand Up @@ -2021,7 +2038,7 @@ repack_cleanup_callback(bool fatal, void *userdata)
reconnect(ERROR);

command("BEGIN ISOLATION LEVEL READ COMMITTED", 0, NULL);
if (!(lock_exclusive(connection, params[0], table->lock_table, false)))
if (!(lock_exclusive(connection, params[0], table->lock_table, false, NULL)))
{
pgut_rollback(connection);
elog(ERROR, "lock_exclusive() failed in connection for %s during cleanup callback",
Expand Down Expand Up @@ -2061,7 +2078,7 @@ repack_cleanup(bool fatal, const repack_table *table)
params[1] = utoa(temp_obj_num, num_buff);

command("BEGIN ISOLATION LEVEL READ COMMITTED", 0, NULL);
if (!(lock_exclusive(connection, params[0], table->lock_table, false)))
if (!(lock_exclusive(connection, params[0], table->lock_table, false, NULL)))
{
pgut_rollback(connection);
elog(ERROR, "lock_exclusive() failed in connection for %s during cleanup",
Expand Down Expand Up @@ -2215,7 +2232,7 @@ repack_table_indexes(PGresult *index_details)
resetStringInfo(&sql);
appendStringInfo(&sql, "LOCK TABLE %s IN ACCESS EXCLUSIVE MODE",
table_name);
if (!(lock_exclusive(connection, params[1], sql.data, true)))
if (!(lock_exclusive(connection, params[1], sql.data, true, NULL)))
{
elog(WARNING, "lock_exclusive() failed in connection for %s",
table_name);
Expand Down Expand Up @@ -2415,6 +2432,7 @@ pgut_help(bool details)
printf(" -i, --index=INDEX move only the specified index\n");
printf(" -x, --only-indexes move only indexes of the specified table\n");
printf(" -T, --wait-timeout=SECS timeout to cancel other backends on conflict\n");
printf(" -m, --max-lock-timeout=MS max millisecond timeout for a strong lock attempt\n");
printf(" -D, --no-kill-backend don't kill other backends when timed out\n");
printf(" -Z, --no-analyze don't analyze at end\n");
printf(" -k, --no-superuser-check skip superuser checks in client\n");
Expand Down
15 changes: 15 additions & 0 deletions doc/pg_repack.rst
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ Options:
-i, --index=INDEX move only the specified index
-x, --only-indexes move only indexes of the specified table
-T, --wait-timeout=SECS timeout to cancel other backends on conflict
-m, --max-lock-timeout=MS max millisecond timeout for a strong lock attempt
-D, --no-kill-backend don't kill other backends when timed out
-Z, --no-analyze don't analyze at end
-k, --no-superuser-check skip superuser checks in client
Expand Down Expand Up @@ -212,6 +213,20 @@ Reorg Options
backends after twice this timeout has passed.
The default is 60 seconds.

``-m MS``, ``--max-lock-timeout=MS``
When attempting to take an exclusive lock, pg_repack sets a short timeout
for the lock command and makes repeated attempts to get the lock. This
avoids blocking all access to the table for up to ``--wait-timeout``
seconds if a long-running query is in the way. The lock timeout is
increased with each attempt, up to maximum of ``--max-lock-timeout``
milliseconds.
The default is 1000, i.e. 1 second.
If you want to avoid undue delay to queries on timescales much less than 1
second, setting a lower ``--max-lock-timeout`` will help. It comes at the
cost of making each lock attempt less likely to succeed, which may cause
the pg_repack run to take much longer.
Currently, values larger than 1000 are not supported.

``-D``, ``--no-kill-backend``
Skip to repack table if the lock cannot be taken for duration specified
``--wait-timeout``, instead of cancelling conflicting queries. The default
Expand Down
Loading