diff --git a/config/production/kustomization.yaml b/config/production/kustomization.yaml
index 6beae6c..f3848d9 100644
--- a/config/production/kustomization.yaml
+++ b/config/production/kustomization.yaml
@@ -3,10 +3,10 @@ resources:
 images:
 - name: manager
   newName: ghcr.io/ctrox/zeropod-manager
-  newTag: v0.1.0
+  newTag: v0.2.0
 - name: installer
   newName: ghcr.io/ctrox/zeropod-installer
-  newTag: v0.1.0
+  newTag: v0.2.0
 patches:
 - patch: |-
     - op: add
diff --git a/e2e/e2e_test.go b/e2e/e2e_test.go
index 22066e7..f4f36df 100644
--- a/e2e/e2e_test.go
+++ b/e2e/e2e_test.go
@@ -164,31 +164,28 @@ func TestE2E(t *testing.T) {
 		cleanupPod := createPodAndWait(t, ctx, client, pod)
 		defer cleanupPod()
 
+		require.Eventually(t, func() bool {
+			return isCheckpointed(t, client, cfg, pod)
+		}, time.Second*10, time.Second)
+
 		stdout, stderr, err := podExec(cfg, pod, "date")
 		require.NoError(t, err)
 		t.Log(stdout, stderr)
 
-		// as we can't yet reliably check if the pod is fully checkpointed and
-		// ready for another exec, we simply retry
-		require.Eventually(t, func() bool {
-			stdout, stderr, err = podExec(cfg, pod, "date")
-			t.Log(stdout, stderr)
-			return err == nil
-		}, time.Second*10, time.Second)
-
-		assert.GreaterOrEqual(t, restoreCount(t, client, cfg, pod), 2, "pod should have been restored 2 times")
+		assert.GreaterOrEqual(t, restoreCount(t, client, cfg, pod), 1, "pod should have been restored at least once")
 	})
 
 	t.Run("delete in restored state", func(t *testing.T) {
 		// as we want to delete the pod when it is in a restored state, we
-		// first need to make sure it has checkpointed at least once. We give
-		// it 2 seconds to checkpoint initially and wait 5 seconds to ensure
-		// it has finished checkpointing.
-		pod := testPod(scaleDownAfter(time.Second * 2))
+		// first need to make sure it has checkpointed at least once.
+		pod := testPod(scaleDownAfter(0))
 		cleanupPod := createPodAndWait(t, ctx, client, pod)
 		defer cleanupPod()
 
-		time.Sleep(time.Second * 5)
+		require.Eventually(t, func() bool {
+			return isCheckpointed(t, client, cfg, pod)
+		}, time.Second*10, time.Second)
+
 		stdout, stderr, err := podExec(cfg, pod, "date")
 		require.NoError(t, err)
 		t.Log(stdout, stderr)
diff --git a/e2e/setup_test.go b/e2e/setup_test.go
index d9b113a..0373aa4 100644
--- a/e2e/setup_test.go
+++ b/e2e/setup_test.go
@@ -533,10 +533,38 @@ func podExec(cfg *rest.Config, pod *corev1.Pod, command string) (string, string,
 func restoreCount(t testing.TB, client client.Client, cfg *rest.Config, pod *corev1.Pod) int {
 	mfs := getNodeMetrics(t, client, cfg)
 
-	running := prometheus.BuildFQName(zeropod.MetricsNamespace, "", zeropod.MetricRestoreDuration)
-	val, ok := mfs[running]
+	restoreDuration := prometheus.BuildFQName(zeropod.MetricsNamespace, "", zeropod.MetricRestoreDuration)
+	val, ok := mfs[restoreDuration]
+	if !ok {
+		t.Errorf("could not find expected metric: %s", restoreDuration)
+	}
+
+	metric, ok := findMetricByLabelMatch(val.Metric, map[string]string{
+		zeropod.LabelPodName:      pod.Name,
+		zeropod.LabelPodNamespace: pod.Namespace,
+	})
+	if !ok {
+		t.Errorf("could not find running metric that matches pod: %s/%s", pod.Name, pod.Namespace)
+	}
+
+	if metric.Histogram == nil {
+		t.Errorf("found metric that is not a histogram")
+	}
+
+	if metric.Histogram.SampleCount == nil {
+		t.Errorf("histogram sample count is nil")
+	}
+
+	return int(*metric.Histogram.SampleCount)
+}
+
+func checkpointCount(t testing.TB, client client.Client, cfg *rest.Config, pod *corev1.Pod) int {
+	mfs := getNodeMetrics(t, client, cfg)
+
+	checkpointDuration := prometheus.BuildFQName(zeropod.MetricsNamespace, "", zeropod.MetricCheckPointDuration)
+	val, ok := mfs[checkpointDuration]
 	if !ok {
-		t.Fatalf("could not find expected metric: %s", running)
+		t.Errorf("could not find expected metric: %s", checkpointDuration)
 	}
 
 	metric, ok := findMetricByLabelMatch(val.Metric, map[string]string{
@@ -544,19 +572,46 @@ func restoreCount(t testing.TB, client client.Client, cfg *rest.Config, pod *cor
 		zeropod.LabelPodNamespace: pod.Namespace,
 	})
 	if !ok {
-		t.Fatalf("could not find running metric that matches pod: %s/%s", pod.Name, pod.Namespace)
+		t.Errorf("could not find running metric that matches pod: %s/%s", pod.Name, pod.Namespace)
 	}
 
 	if metric.Histogram == nil {
-		t.Fatalf("found metric that is not a histogram")
+		t.Errorf("found metric that is not a histogram")
 	}
 
 	if metric.Histogram.SampleCount == nil {
-		t.Fatalf("histogram sample count is nil")
+		t.Errorf("histogram sample count is nil")
 	}
 
 	return int(*metric.Histogram.SampleCount)
+}
+
+func isCheckpointed(t testing.TB, client client.Client, cfg *rest.Config, pod *corev1.Pod) bool {
+	mfs := getNodeMetrics(t, client, cfg)
+
+	running := prometheus.BuildFQName(zeropod.MetricsNamespace, "", zeropod.MetricRunning)
+	val, ok := mfs[running]
+	if !ok {
+		t.Errorf("could not find expected metric: %s", running)
+	}
+
+	metric, ok := findMetricByLabelMatch(val.Metric, map[string]string{
+		zeropod.LabelPodName:      pod.Name,
+		zeropod.LabelPodNamespace: pod.Namespace,
+	})
+	if !ok {
+		t.Errorf("could not find running metric that matches pod: %s/%s", pod.Name, pod.Namespace)
+	}
+
+	if metric.Gauge == nil {
+		t.Errorf("found metric that is not a gauge")
+	}
+
+	if metric.Gauge.Value == nil {
+		t.Errorf("gauge value is nil")
+	}
+
+	return *metric.Gauge.Value == 0 && checkpointCount(t, client, cfg, pod) >= 1
 }
 
 func findMetricByLabelMatch(metrics []*dto.Metric, labels map[string]string) (*dto.Metric, bool) {
diff --git a/runc/task/service_zeropod.go b/runc/task/service_zeropod.go
index 4a7718a..83b3075 100644
--- a/runc/task/service_zeropod.go
+++ b/runc/task/service_zeropod.go
@@ -156,9 +156,6 @@ func (w *wrapper) Start(ctx context.Context, r *taskAPI.StartRequest) (*taskAPI.
 		return nil
 	})
 
-	// TODO: this is not a good idea (the 10s). A better idea is probably to
-	// wait whenever we try to first get the Port from the app (retry until
-	// the app is listening).
 	if err := zeropodContainer.ScheduleScaleDown(); err != nil {
 		return nil, err
 	}