From 9d9ae7cd3210608e326b81570284108449ca5721 Mon Sep 17 00:00:00 2001 From: Sami Loone Date: Mon, 27 Jul 2020 15:35:58 +0300 Subject: [PATCH] resource: Retry helper exits before callback finishes On conditions where callback supplied to resource.Retry() takes a long time to execute, a timeout error is returned to caller while the callback is still running. To remind the reader, WaitForState creates a separate goroutine for calling Refresh() callback from StateChangeConf. The execution of Refresh() goroutine is monitored in the main thread with a timeout. However, while timeout does issue a cancellation for the Refresh() goroutine, it does not interrupt the already started Refresh() calls. In addition, cancellation mechanism itself gave up after grace period expired on timeouts. This potentially left the Refresh() callback running in the background while returning TimeoutError to the caller. When timeout error is returned, plugin(s) incorrectly assume that it is safe to rerun the Refresh() content. In reality, the state after timeout is completely unknown. Annotated example from AWS plugin: var createResp *iam.CreateRoleOutput err := resource.Retry(30*time.Second, func() *resource.RetryError { var err error createResp, err = iamconn.CreateRole(request) <-- Has internally a retry loop, can block more then 30 seconds // IAM users (referenced in Principal field of assume policy) // can take ~30 seconds to propagate in AWS if isAWSErr(err, "MalformedPolicyDocument", "Invalid principal in policy") { return resource.RetryableError(err) } return resource.NonRetryableError(err) }) if isResourceTimeoutError(err) { <-- Goroutine started in Retry (WaitForState) can still be running createResp, err = iamconn.CreateRole(request) <-- Issues another blocking CreateRole } The fix is simply to remove the grace period and the matching select timeout. This makes WaitForState() block until Refresh() goroutine returns something to result channel. --- helper/resource/state.go | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/helper/resource/state.go b/helper/resource/state.go index 6eda1993f1b..089a246fda1 100644 --- a/helper/resource/state.go +++ b/helper/resource/state.go @@ -225,12 +225,9 @@ func (conf *StateChangeConf) WaitForStateContext(ctx context.Context) (interface return nil, ctx.Err() case <-timeout: log.Printf("[WARN] WaitForState timeout after %s", conf.Timeout) - log.Printf("[WARN] WaitForState starting %s refresh grace period", refreshGracePeriod) - // cancel the goroutine and start our grace period timer + // cancel the goroutine close(cancelCh) - timeout := time.After(refreshGracePeriod) - // we need a for loop and a label to break on, because we may have // an extra response value to read, but still want to wait for the // channel to close. @@ -254,9 +251,6 @@ func (conf *StateChangeConf) WaitForStateContext(ctx context.Context) (interface case <-ctx.Done(): log.Println("[ERROR] Context cancelation detected, abandoning grace period") break forSelect - case <-timeout: - log.Println("[ERROR] WaitForState exceeded refresh grace period") - break forSelect } }