Commit
Merge pull request #10 from ctrox/release-v0.2.0
Prepare Release v0.2.0
ctrox authored May 9, 2024
2 parents 5004678 + 7cb11f4 commit f2d618f
Showing 4 changed files with 74 additions and 25 deletions.
4 changes: 2 additions & 2 deletions config/production/kustomization.yaml
@@ -3,10 +3,10 @@ resources:
images:
- name: manager
newName: ghcr.io/ctrox/zeropod-manager
newTag: v0.1.0
newTag: v0.2.0
- name: installer
newName: ghcr.io/ctrox/zeropod-installer
newTag: v0.1.0
newTag: v0.2.0
patches:
- patch: |-
- op: add
25 changes: 11 additions & 14 deletions e2e/e2e_test.go
@@ -164,31 +164,28 @@ func TestE2E(t *testing.T) {
cleanupPod := createPodAndWait(t, ctx, client, pod)
defer cleanupPod()

require.Eventually(t, func() bool {
return isCheckpointed(t, client, cfg, pod)
}, time.Second*10, time.Second)

stdout, stderr, err := podExec(cfg, pod, "date")
require.NoError(t, err)
t.Log(stdout, stderr)

// as we can't yet reliably check if the pod is fully checkpointed and
// ready for another exec, we simply retry
require.Eventually(t, func() bool {
stdout, stderr, err = podExec(cfg, pod, "date")
t.Log(stdout, stderr)
return err == nil
}, time.Second*10, time.Second)

assert.GreaterOrEqual(t, restoreCount(t, client, cfg, pod), 2, "pod should have been restored 2 times")
assert.GreaterOrEqual(t, restoreCount(t, client, cfg, pod), 1, "pod should have been restored at least once")
})

t.Run("delete in restored state", func(t *testing.T) {
// as we want to delete the pod when it is in a restored state, we
// first need to make sure it has checkpointed at least once. We give
// it 2 seconds to checkpoint initially and wait 5 seconds to ensure
// it has finished checkpointing.
pod := testPod(scaleDownAfter(time.Second * 2))
// first need to make sure it has checkpointed at least once.
pod := testPod(scaleDownAfter(0))
cleanupPod := createPodAndWait(t, ctx, client, pod)
defer cleanupPod()

time.Sleep(time.Second * 5)
require.Eventually(t, func() bool {
return isCheckpointed(t, client, cfg, pod)
}, time.Second*10, time.Second)

stdout, stderr, err := podExec(cfg, pod, "date")
require.NoError(t, err)
t.Log(stdout, stderr)
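The test change above replaces fixed sleeps with polling: testify's require.Eventually re-runs a condition function until it returns true or the timeout expires, here using isCheckpointed to wait for the pod to be scaled down. A minimal, self-contained sketch of that pattern (not taken from this repository; the readiness condition is a stand-in) looks roughly like this:

package e2e

import (
	"testing"
	"time"

	"github.com/stretchr/testify/require"
)

func TestPollInsteadOfSleep(t *testing.T) {
	ready := time.Now().Add(2 * time.Second)

	// condition stands in for a check such as isCheckpointed(t, client, cfg, pod);
	// it is re-evaluated on every tick until it returns true or the timeout is hit
	condition := func() bool {
		return time.Now().After(ready)
	}

	// poll every second for at most 10 seconds, failing the test on timeout
	require.Eventually(t, condition, 10*time.Second, time.Second)
}

Compared to a fixed time.Sleep, this waits only as long as the slowest run actually needs while still bounding the total wait.
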
67 changes: 61 additions & 6 deletions e2e/setup_test.go
@@ -533,30 +533,85 @@ func podExec(cfg *rest.Config, pod *corev1.Pod, command string) (string, string,
func restoreCount(t testing.TB, client client.Client, cfg *rest.Config, pod *corev1.Pod) int {
mfs := getNodeMetrics(t, client, cfg)

running := prometheus.BuildFQName(zeropod.MetricsNamespace, "", zeropod.MetricRestoreDuration)
val, ok := mfs[running]
restoreDuration := prometheus.BuildFQName(zeropod.MetricsNamespace, "", zeropod.MetricRestoreDuration)
val, ok := mfs[restoreDuration]
if !ok {
t.Errorf("could not find expected metric: %s", restoreDuration)
}

metric, ok := findMetricByLabelMatch(val.Metric, map[string]string{
zeropod.LabelPodName: pod.Name,
zeropod.LabelPodNamespace: pod.Namespace,
})
if !ok {
t.Errorf("could not find running metric that matches pod: %s/%s", pod.Name, pod.Namespace)
}

if metric.Histogram == nil {
t.Errorf("found metric that is not a histogram")
}

if metric.Histogram.SampleCount == nil {
t.Errorf("histogram sample count is nil")
}

return int(*metric.Histogram.SampleCount)
}

func checkpointCount(t testing.TB, client client.Client, cfg *rest.Config, pod *corev1.Pod) int {
mfs := getNodeMetrics(t, client, cfg)

checkpointDuration := prometheus.BuildFQName(zeropod.MetricsNamespace, "", zeropod.MetricCheckPointDuration)
val, ok := mfs[checkpointDuration]
if !ok {
t.Fatalf("could not find expected metric: %s", running)
t.Errorf("could not find expected metric: %s", checkpointDuration)
}

metric, ok := findMetricByLabelMatch(val.Metric, map[string]string{
zeropod.LabelPodName: pod.Name,
zeropod.LabelPodNamespace: pod.Namespace,
})
if !ok {
t.Fatalf("could not find running metric that matches pod: %s/%s", pod.Name, pod.Namespace)
t.Errorf("could not find running metric that matches pod: %s/%s", pod.Name, pod.Namespace)
}

if metric.Histogram == nil {
t.Fatalf("found metric that is not a histogram")
t.Errorf("found metric that is not a histogram")
}

if metric.Histogram.SampleCount == nil {
t.Fatalf("histogram sample count is nil")
t.Errorf("histogram sample count is nil")
}

return int(*metric.Histogram.SampleCount)
}

func isCheckpointed(t testing.TB, client client.Client, cfg *rest.Config, pod *corev1.Pod) bool {
mfs := getNodeMetrics(t, client, cfg)

running := prometheus.BuildFQName(zeropod.MetricsNamespace, "", zeropod.MetricRunning)
val, ok := mfs[running]
if !ok {
t.Errorf("could not find expected metric: %s", running)
}

metric, ok := findMetricByLabelMatch(val.Metric, map[string]string{
zeropod.LabelPodName: pod.Name,
zeropod.LabelPodNamespace: pod.Namespace,
})
if !ok {
t.Errorf("could not find running metric that matches pod: %s/%s", pod.Name, pod.Namespace)
}

if metric.Gauge == nil {
t.Errorf("found metric that is not a gauge")
}

if metric.Gauge.Value == nil {
t.Errorf("gauge value is nil")
}

return *metric.Gauge.Value == 0 && checkpointCount(t, client, cfg, pod) >= 1
}

func findMetricByLabelMatch(metrics []*dto.Metric, labels map[string]string) (*dto.Metric, bool) {
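isCheckpointed combines two signals from the node metrics: the running gauge must read 0 and the checkpoint-duration histogram must have at least one sample. The metric handling it relies on can be sketched with the standard Prometheus client libraries; the parsing helper and label-matching helper below are illustrative assumptions, not the repository's actual getNodeMetrics or findMetricByLabelMatch implementations:

package e2e

import (
	"strings"

	dto "github.com/prometheus/client_model/go"
	"github.com/prometheus/common/expfmt"
)

// parseMetricFamilies turns text-format metrics (as scraped from a metrics
// endpoint) into a map keyed by fully qualified metric name.
func parseMetricFamilies(text string) (map[string]*dto.MetricFamily, error) {
	var parser expfmt.TextParser
	return parser.TextToMetricFamilies(strings.NewReader(text))
}

// matchByLabels returns the first metric whose labels contain every key/value
// pair in want; a hypothetical stand-in for findMetricByLabelMatch.
func matchByLabels(metrics []*dto.Metric, want map[string]string) (*dto.Metric, bool) {
	for _, m := range metrics {
		matched := 0
		for _, lp := range m.Label {
			if v, ok := want[lp.GetName()]; ok && v == lp.GetValue() {
				matched++
			}
		}
		if matched == len(want) {
			return m, true
		}
	}
	return nil, false
}
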
3 changes: 0 additions & 3 deletions runc/task/service_zeropod.go
@@ -156,9 +156,6 @@ func (w *wrapper) Start(ctx context.Context, r *taskAPI.StartRequest) (*taskAPI.
return nil
})

// TODO: this is not a good idea (the 10s). A better idea is probably to
// wait whenever we try to first get the Port from the app (retry until
// the app is listening).
if err := zeropodContainer.ScheduleScaleDown(); err != nil {
return nil, err
}
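The removed TODO suggests replacing a fixed delay with a wait that retries until the application is actually listening on its port. A generic sketch of that idea (not the shim's implementation; the address and intervals are placeholders) could look like this:

package main

import (
	"fmt"
	"net"
	"time"
)

// waitForListen polls addr until a TCP connection succeeds or the timeout expires.
func waitForListen(addr string, timeout time.Duration) error {
	deadline := time.Now().Add(timeout)
	for time.Now().Before(deadline) {
		conn, err := net.DialTimeout("tcp", addr, time.Second)
		if err == nil {
			conn.Close() // the app accepted a connection, so it is listening
			return nil
		}
		time.Sleep(200 * time.Millisecond)
	}
	return fmt.Errorf("%s is not accepting connections after %s", addr, timeout)
}

func main() {
	// example usage with a placeholder address
	if err := waitForListen("127.0.0.1:8080", 5*time.Second); err != nil {
		fmt.Println(err)
	}
}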
