diff --git a/config/production/kustomization.yaml b/config/production/kustomization.yaml
index 6beae6c..f3848d9 100644
--- a/config/production/kustomization.yaml
+++ b/config/production/kustomization.yaml
@@ -3,10 +3,10 @@ resources:
 images:
 - name: manager
   newName: ghcr.io/ctrox/zeropod-manager
-  newTag: v0.1.0
+  newTag: v0.2.0
 - name: installer
   newName: ghcr.io/ctrox/zeropod-installer
-  newTag: v0.1.0
+  newTag: v0.2.0
 patches:
 - patch: |-
     - op: add
diff --git a/e2e/e2e_test.go b/e2e/e2e_test.go
index 22066e7..f4f36df 100644
--- a/e2e/e2e_test.go
+++ b/e2e/e2e_test.go
@@ -164,31 +164,28 @@ func TestE2E(t *testing.T) {
 		cleanupPod := createPodAndWait(t, ctx, client, pod)
 		defer cleanupPod()
 
+		require.Eventually(t, func() bool {
+			return isCheckpointed(t, client, cfg, pod)
+		}, time.Second*10, time.Second)
+
 		stdout, stderr, err := podExec(cfg, pod, "date")
 		require.NoError(t, err)
 		t.Log(stdout, stderr)
 
-		// as we can't yet reliably check if the pod is fully checkpointed and
-		// ready for another exec, we simply retry
-		require.Eventually(t, func() bool {
-			stdout, stderr, err = podExec(cfg, pod, "date")
-			t.Log(stdout, stderr)
-			return err == nil
-		}, time.Second*10, time.Second)
-
-		assert.GreaterOrEqual(t, restoreCount(t, client, cfg, pod), 2, "pod should have been restored 2 times")
+		assert.GreaterOrEqual(t, restoreCount(t, client, cfg, pod), 1, "pod should have been restored at least once")
 	})
 
 	t.Run("delete in restored state", func(t *testing.T) {
 		// as we want to delete the pod when it is in a restored state, we
-		// first need to make sure it has checkpointed at least once. We give
-		// it 2 seconds to checkpoint initially and wait 5 seconds to ensure
-		// it has finished checkpointing.
-		pod := testPod(scaleDownAfter(time.Second * 2))
+		// first need to make sure it has checkpointed at least once.
+		pod := testPod(scaleDownAfter(0))
 		cleanupPod := createPodAndWait(t, ctx, client, pod)
 		defer cleanupPod()
 
-		time.Sleep(time.Second * 5)
+		require.Eventually(t, func() bool {
+			return isCheckpointed(t, client, cfg, pod)
+		}, time.Second*10, time.Second)
+
 		stdout, stderr, err := podExec(cfg, pod, "date")
 		require.NoError(t, err)
 		t.Log(stdout, stderr)
diff --git a/e2e/setup_test.go b/e2e/setup_test.go
index d9b113a..0373aa4 100644
--- a/e2e/setup_test.go
+++ b/e2e/setup_test.go
@@ -533,10 +533,38 @@ func podExec(cfg *rest.Config, pod *corev1.Pod, command string) (string, string,
 func restoreCount(t testing.TB, client client.Client, cfg *rest.Config, pod *corev1.Pod) int {
 	mfs := getNodeMetrics(t, client, cfg)
 
-	running := prometheus.BuildFQName(zeropod.MetricsNamespace, "", zeropod.MetricRestoreDuration)
-	val, ok := mfs[running]
+	restoreDuration := prometheus.BuildFQName(zeropod.MetricsNamespace, "", zeropod.MetricRestoreDuration)
+	val, ok := mfs[restoreDuration]
+	if !ok {
+		t.Errorf("could not find expected metric: %s", restoreDuration)
+	}
+
+	metric, ok := findMetricByLabelMatch(val.Metric, map[string]string{
+		zeropod.LabelPodName:      pod.Name,
+		zeropod.LabelPodNamespace: pod.Namespace,
+	})
+	if !ok {
+		t.Errorf("could not find running metric that matches pod: %s/%s", pod.Name, pod.Namespace)
+	}
+
+	if metric.Histogram == nil {
+		t.Errorf("found metric that is not a histogram")
+	}
+
+	if metric.Histogram.SampleCount == nil {
+		t.Errorf("histogram sample count is nil")
+	}
+
+	return int(*metric.Histogram.SampleCount)
+}
+
+func checkpointCount(t testing.TB, client client.Client, cfg *rest.Config, pod *corev1.Pod) int {
+	mfs := getNodeMetrics(t, client, cfg)
+
+	checkpointDuration := prometheus.BuildFQName(zeropod.MetricsNamespace, "", zeropod.MetricCheckPointDuration)
+	val, ok := mfs[checkpointDuration]
 	if !ok {
-		t.Fatalf("could not find expected metric: %s", running)
+		t.Errorf("could not find expected metric: %s", checkpointDuration)
 	}
 
 	metric, ok := findMetricByLabelMatch(val.Metric, map[string]string{
@@ -544,19 +572,46 @@ func restoreCount(t testing.TB, client client.Client, cfg *rest.Config, pod *cor
 		zeropod.LabelPodNamespace: pod.Namespace,
 	})
 	if !ok {
-		t.Fatalf("could not find running metric that matches pod: %s/%s", pod.Name, pod.Namespace)
+		t.Errorf("could not find running metric that matches pod: %s/%s", pod.Name, pod.Namespace)
 	}
 
 	if metric.Histogram == nil {
-		t.Fatalf("found metric that is not a histogram")
+		t.Errorf("found metric that is not a histogram")
 	}
 
 	if metric.Histogram.SampleCount == nil {
-		t.Fatalf("histogram sample count is nil")
+		t.Errorf("histogram sample count is nil")
 	}
 
 	return int(*metric.Histogram.SampleCount)
+}
+
+func isCheckpointed(t testing.TB, client client.Client, cfg *rest.Config, pod *corev1.Pod) bool {
+	mfs := getNodeMetrics(t, client, cfg)
+
+	running := prometheus.BuildFQName(zeropod.MetricsNamespace, "", zeropod.MetricRunning)
+	val, ok := mfs[running]
+	if !ok {
+		t.Errorf("could not find expected metric: %s", running)
+	}
+
+	metric, ok := findMetricByLabelMatch(val.Metric, map[string]string{
+		zeropod.LabelPodName:      pod.Name,
+		zeropod.LabelPodNamespace: pod.Namespace,
+	})
+	if !ok {
+		t.Errorf("could not find running metric that matches pod: %s/%s", pod.Name, pod.Namespace)
+	}
+
+	if metric.Gauge == nil {
+		t.Errorf("found metric that is not a gauge")
+	}
+
+	if metric.Gauge.Value == nil {
+		t.Errorf("gauge value is nil")
+	}
+
+	return *metric.Gauge.Value == 0 && checkpointCount(t, client, cfg, pod) >= 1
 }
 
 func findMetricByLabelMatch(metrics []*dto.Metric, labels map[string]string) (*dto.Metric, bool) {
diff --git a/runc/task/service_zeropod.go b/runc/task/service_zeropod.go
index 4a7718a..83b3075 100644
--- a/runc/task/service_zeropod.go
+++ b/runc/task/service_zeropod.go
@@ -156,9 +156,6 @@ func (w *wrapper) Start(ctx context.Context, r *taskAPI.StartRequest) (*taskAPI.
 		return nil
 	})
 
-	// TODO: this is not a good idea (the 10s). A better idea is probably to
-	// wait whenever we try to first get the Port from the app (retry until
-	// the app is listening).
 	if err := zeropodContainer.ScheduleScaleDown(); err != nil {
 		return nil, err
 	}