test: increase wait on e2e tests

Some of these tests fail quite often when run in GitHub Actions, so we increase the timeouts to one minute.
ctrox · Jun 23, 2024 · de1d9ac · de1d9ac
1 parent b58414d
commit de1d9ac
Show file tree

Hide file tree

Showing 2 changed files with 59 additions and 25 deletions.
diff --git a/e2e/e2e_test.go b/e2e/e2e_test.go
@@ -167,14 +167,26 @@ func TestE2E(t *testing.T) {
 		defer cleanupPod()
 
 		require.Eventually(t, func() bool {
-			return isCheckpointed(t, client, cfg, pod)
-		}, time.Second*10, time.Second)
+			checkpointed, err := isCheckpointed(t, client, cfg, pod)
+			if err != nil {
+				t.Logf("error checking if checkpointed: %s", err)
+				return false
+			}
+			return checkpointed
+		}, time.Minute, time.Second)
 
 		stdout, stderr, err := podExec(cfg, pod, "date")
 		require.NoError(t, err)
 		t.Log(stdout, stderr)
 
-		assert.GreaterOrEqual(t, restoreCount(t, client, cfg, pod), 1, "pod should have been restored at least once")
+		require.Eventually(t, func() bool {
+			count, err := restoreCount(t, client, cfg, pod)
+			if err != nil {
+				t.Logf("error checking if restored: %s", err)
+				return false
+			}
+			return count >= 1 // plain comparison: an assert.* call here would fail the test on every unsuccessful poll
+		}, time.Minute, time.Second, "pod should have been restored at least once")
 	})
 
 	t.Run("delete in restored state", func(t *testing.T) {
@@ -185,8 +197,13 @@ func TestE2E(t *testing.T) {
 		defer cleanupPod()
 
 		require.Eventually(t, func() bool {
-			return isCheckpointed(t, client, cfg, pod)
-		}, time.Second*10, time.Second)
+			checkpointed, err := isCheckpointed(t, client, cfg, pod)
+			if err != nil {
+				t.Logf("error checking if checkpointed: %s", err)
+				return false
+			}
+			return checkpointed
+		}, time.Minute, time.Second)
 
 		stdout, stderr, err := podExec(cfg, pod, "date")
 		require.NoError(t, err)
@@ -239,7 +256,16 @@ func TestE2E(t *testing.T) {
 		// exec into pod to ensure it has been restored at least once
 		require.Eventually(t, func() bool {
 			_, _, err := podExec(cfg, restoredPod, "date")
-			return err == nil && isCheckpointed(t, client, cfg, restoredPod)
+			if err != nil {
+				t.Logf("error during pod exec: %s", err)
+				return false
+			}
+			checkpointed, err := isCheckpointed(t, client, cfg, restoredPod)
+			if err != nil {
+				t.Logf("error checking if checkpointed: %s", err)
+				return false
+			}
+			return checkpointed
 		}, time.Minute, time.Second)
 
 		mfs := getNodeMetrics(t, client, cfg)

diff --git a/e2e/setup_test.go b/e2e/setup_test.go
@@ -483,7 +483,7 @@ func createServiceAndWait(t testing.TB, ctx context.Context, client client.Clien
 		}
 
 		return len(endpoints.Subsets[0].Addresses) == replicas
-	}, time.Second*30, time.Second, "waiting for service endpoints to be ready") {
+	}, time.Minute, time.Second, "waiting for service endpoints to be ready") {
 		t.Log("service did not get ready")
 	}
 
@@ -541,79 +541,87 @@ func podExec(cfg *rest.Config, pod *corev1.Pod, command string) (string, string,
 	return buf.String(), errBuf.String(), nil
 }
 
-func restoreCount(t testing.TB, client client.Client, cfg *rest.Config, pod *corev1.Pod) int {
+func restoreCount(t testing.TB, client client.Client, cfg *rest.Config, pod *corev1.Pod) (int, error) {
 	val, err := getNodeMetric(t, client, cfg, zeropod.MetricRestoreDuration)
 	if err != nil {
-		t.Fatal(err)
+		return 0, err
 	}
 
 	metric, ok := findMetricByLabelMatch(val.Metric, map[string]string{
 		zeropod.LabelPodName:      pod.Name,
 		zeropod.LabelPodNamespace: pod.Namespace,
 	})
 	if !ok {
-		t.Fatalf("could not find running metric that matches pod: %s/%s", pod.Name, pod.Namespace)
+		// err is always nil on this path, so it must not be wrapped with %w (it would print "%!w(<nil>)").
+		return 0, fmt.Errorf("could not find restore duration metric that matches pod: %s/%s", pod.Name, pod.Namespace)
 	}
 
 	if metric.Histogram == nil {
-		t.Fatalf("found metric that is not a histogram")
+		return 0, fmt.Errorf("found metric that is not a histogram")
 	}
 
 	if metric.Histogram.SampleCount == nil {
-		t.Fatalf("histogram sample count is nil")
+		return 0, fmt.Errorf("histogram sample count is nil")
 	}
 
-	return int(*metric.Histogram.SampleCount)
+	return int(*metric.Histogram.SampleCount), nil
 }
 
-func checkpointCount(t testing.TB, client client.Client, cfg *rest.Config, pod *corev1.Pod) int {
+func checkpointCount(t testing.TB, client client.Client, cfg *rest.Config, pod *corev1.Pod) (int, error) {
 	val, err := getNodeMetric(t, client, cfg, zeropod.MetricCheckPointDuration)
 	if err != nil {
-		t.Fatal(err)
+		return 0, err
 	}
 
 	metric, ok := findMetricByLabelMatch(val.Metric, map[string]string{
 		zeropod.LabelPodName:      pod.Name,
 		zeropod.LabelPodNamespace: pod.Namespace,
 	})
 	if !ok {
-		t.Fatalf("could not find running metric that matches pod: %s/%s", pod.Name, pod.Namespace)
+		// err is always nil on this path, so it must not be wrapped with %w (it would print "%!w(<nil>)").
+		return 0, fmt.Errorf("could not find checkpoint duration metric that matches pod: %s/%s", pod.Name, pod.Namespace)
 	}
 
 	if metric.Histogram == nil {
-		t.Fatalf("found metric that is not a histogram")
+		return 0, fmt.Errorf("found metric that is not a histogram")
 	}
 
 	if metric.Histogram.SampleCount == nil {
-		t.Fatalf("histogram sample count is nil")
+		return 0, fmt.Errorf("histogram sample count is nil")
 	}
 
-	return int(*metric.Histogram.SampleCount)
+	return int(*metric.Histogram.SampleCount), nil
 }
 
-func isCheckpointed(t testing.TB, client client.Client, cfg *rest.Config, pod *corev1.Pod) bool {
+func isCheckpointed(t testing.TB, client client.Client, cfg *rest.Config, pod *corev1.Pod) (bool, error) {
 	val, err := getNodeMetric(t, client, cfg, zeropod.MetricRunning)
 	if err != nil {
-		t.Fatal(err)
+		return false, err
 	}
 
 	metric, ok := findMetricByLabelMatch(val.Metric, map[string]string{
 		zeropod.LabelPodName:      pod.Name,
 		zeropod.LabelPodNamespace: pod.Namespace,
 	})
 	if !ok {
-		t.Fatalf("could not find running metric that matches pod: %s/%s", pod.Name, pod.Namespace)
+		// err is always nil on this path, so it must not be wrapped with %w (it would print "%!w(<nil>)").
+		return false, fmt.Errorf("could not find running metric that matches pod: %s/%s", pod.Name, pod.Namespace)
 	}
 
 	if metric.Gauge == nil {
-		t.Fatalf("found metric that is not a gauge")
+		return false, fmt.Errorf("found metric that is not a gauge")
 	}
 
 	if metric.Gauge.Value == nil {
-		t.Fatalf("gauge value is nil")
+		return false, fmt.Errorf("gauge value is nil")
+	}
+
+	count, err := checkpointCount(t, client, cfg, pod)
+	if err != nil {
+		return false, err
 	}
 
-	return *metric.Gauge.Value == 0 && checkpointCount(t, client, cfg, pod) >= 1
+	return *metric.Gauge.Value == 0 && count >= 1, nil
 }
 
 func findMetricByLabelMatch(metrics []*dto.Metric, labels map[string]string) (*dto.Metric, bool) {