From 1c5e609af71bb94dce7ae96ef2ea86b644b52c09 Mon Sep 17 00:00:00 2001 From: Antoni Zawodny Date: Mon, 10 Jul 2023 11:51:25 +0200 Subject: [PATCH 1/7] Take full duration into account in Prometheus-based measurements --- .../common/generic_query_measurement_test.go | 16 ++++++++-------- .../pkg/measurement/util/prometheus.go | 5 +---- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/clusterloader2/pkg/measurement/common/generic_query_measurement_test.go b/clusterloader2/pkg/measurement/common/generic_query_measurement_test.go index c6188cd5ab..c393a01ef8 100644 --- a/clusterloader2/pkg/measurement/common/generic_query_measurement_test.go +++ b/clusterloader2/pkg/measurement/common/generic_query_measurement_test.go @@ -73,9 +73,9 @@ func TestGather(t *testing.T) { }, }, samples: map[string][]*model.Sample{ - "below-threshold-query[1m]": {{Value: model.SampleValue(7)}}, - "no-threshold-query[1m]": {{Value: model.SampleValue(120)}}, - "placeholder-a[1m] + placeholder-b[1m]": {{Value: model.SampleValue(5)}}, + "below-threshold-query[60s]": {{Value: model.SampleValue(7)}}, + "no-threshold-query[60s]": {{Value: model.SampleValue(120)}}, + "placeholder-a[60s] + placeholder-b[60s]": {{Value: model.SampleValue(5)}}, }, wantDataItems: []measurementutil.DataItem{ { @@ -132,7 +132,7 @@ func TestGather(t *testing.T) { }, }, samples: map[string][]*model.Sample{ - "many-samples-query[1m]": { + "many-samples-query[60s]": { {Value: model.SampleValue(1)}, {Value: model.SampleValue(2)}, }, @@ -163,7 +163,7 @@ func TestGather(t *testing.T) { }, }, samples: map[string][]*model.Sample{ - "above-threshold-query[1m]": {{Value: model.SampleValue(123)}}, + "above-threshold-query[60s]": {{Value: model.SampleValue(123)}}, }, wantErr: "sample above threshold: want: less or equal than 60, got: 123", wantDataItems: []measurementutil.DataItem{ @@ -221,7 +221,7 @@ func TestGather(t *testing.T) { }, }, samples: map[string][]*model.Sample{ - "query-perc99[1m]": { + "query-perc99[60s]": { { Metric: model.Metric{ model.LabelName("d1"): model.LabelValue("d1-val1"), @@ -238,7 +238,7 @@ func TestGather(t *testing.T) { Value: model.SampleValue(2), }, }, - "query-perc90[1m]": { + "query-perc90[60s]": { { Metric: model.Metric{ model.LabelName("d1"): model.LabelValue("d1-val1"), @@ -316,7 +316,7 @@ func TestGather(t *testing.T) { }, }, samples: map[string][]*model.Sample{ - "query-perc99[1m]": { + "query-perc99[60s]": { { Metric: model.Metric{ model.LabelName("d1"): model.LabelValue("d1-val1"), diff --git a/clusterloader2/pkg/measurement/util/prometheus.go b/clusterloader2/pkg/measurement/util/prometheus.go index 3602ac5b3f..b63e510ce7 100644 --- a/clusterloader2/pkg/measurement/util/prometheus.go +++ b/clusterloader2/pkg/measurement/util/prometheus.go @@ -174,8 +174,5 @@ func (qr *promResponseData) UnmarshalJSON(b []byte) error { // ToPrometheusTime returns prometheus string representation of given time. func ToPrometheusTime(t time.Duration) string { - if t < time.Minute { - return fmt.Sprintf("%ds", int64(t)/int64(time.Second)) - } - return fmt.Sprintf("%dm", int64(t)/int64(time.Minute)) + return fmt.Sprintf("%ds", int64(t)/int64(time.Second)) } From 6f7a882a37d66657ea9f8576da0f12acc341cfaa Mon Sep 17 00:00:00 2001 From: Antoni Zawodny Date: Fri, 14 Jul 2023 14:16:01 +0200 Subject: [PATCH 2/7] Enable treating threshold as a lower bound in generic measurement --- .../common/generic_query_measurement.go | 30 +++++++--- .../common/generic_query_measurement_test.go | 55 +++++++++++++++++++ 2 files changed, 78 insertions(+), 7 deletions(-) diff --git a/clusterloader2/pkg/measurement/common/generic_query_measurement.go b/clusterloader2/pkg/measurement/common/generic_query_measurement.go index 32454c97dc..1a7b8b3ecf 100644 --- a/clusterloader2/pkg/measurement/common/generic_query_measurement.go +++ b/clusterloader2/pkg/measurement/common/generic_query_measurement.go @@ -83,6 +83,7 @@ type GenericQuery struct { Name string Query string Threshold *float64 + LowerBound bool RequireSamples bool } @@ -132,6 +133,26 @@ func getOrCreate(dataItems map[string]*measurementutil.DataItem, key, unit strin return dataItem } +func (g *genericQueryGatherer) validateSample(q GenericQuery, val float64) error { + thresholdMsg := "none" + if q.Threshold != nil { + thresholdMsg = fmt.Sprintf("%v (upper bound)", *q.Threshold) + if q.LowerBound { + thresholdMsg = fmt.Sprintf("%v (lower bound)", *q.Threshold) + } + } + klog.V(2).Infof("metric: %v: %v, value: %v, threshold: %v", g.MetricName, q.Name, val, thresholdMsg) + if q.Threshold != nil { + if q.LowerBound && val < *q.Threshold { + return errors.NewMetricViolationError(q.Name, fmt.Sprintf("sample below threshold: want: greater or equal than %v, got: %v", *q.Threshold, val)) + } + if !q.LowerBound && val > *q.Threshold { + return errors.NewMetricViolationError(q.Name, fmt.Sprintf("sample above threshold: want: less or equal than %v, got: %v", *q.Threshold, val)) + } + } + return nil +} + func (g *genericQueryGatherer) Gather(executor QueryExecutor, startTime, endTime time.Time, config *measurement.Config) ([]measurement.Summary, error) { var errs []error dataItems := map[string]*measurementutil.DataItem{} @@ -161,13 +182,8 @@ func (g *genericQueryGatherer) Gather(executor QueryExecutor, startTime, endTime dataItem.Data[q.Name] = val } - thresholdMsg := "none" - if q.Threshold != nil { - thresholdMsg = fmt.Sprintf("%v", *q.Threshold) - } - klog.V(2).Infof("metric: %v: %v, value: %v, threshold: %v", g.MetricName, q.Name, val, thresholdMsg) - if q.Threshold != nil && val > *q.Threshold { - errs = append(errs, errors.NewMetricViolationError(q.Name, fmt.Sprintf("sample above threshold: want: less or equal than %v, got: %v", *q.Threshold, val))) + if err := g.validateSample(q, val); err != nil { + errs = append(errs, err) } } } diff --git a/clusterloader2/pkg/measurement/common/generic_query_measurement_test.go b/clusterloader2/pkg/measurement/common/generic_query_measurement_test.go index c393a01ef8..e89fd96fd6 100644 --- a/clusterloader2/pkg/measurement/common/generic_query_measurement_test.go +++ b/clusterloader2/pkg/measurement/common/generic_query_measurement_test.go @@ -175,6 +175,61 @@ func TestGather(t *testing.T) { }, }, }, + { + desc: "sample above lower bound", + params: map[string]interface{}{ + "metricName": "below-threshold", + "metricVersion": "v1", + "unit": "ms", + "queries": []map[string]interface{}{ + { + "name": "above-threshold", + "query": "above-threshold-query[%v]", + "threshold": 60, + "lowerBound": true, + }, + }, + }, + samples: map[string][]*model.Sample{ + "above-threshold-query[60s]": {{Value: model.SampleValue(74)}}, + }, + wantDataItems: []measurementutil.DataItem{ + { + Unit: "ms", + Data: map[string]float64{ + "above-threshold": 74.0, + }, + }, + }, + }, + { + desc: "sample below lower bound", + params: map[string]interface{}{ + "metricName": "below-threshold", + "metricVersion": "v1", + "unit": "ms", + "queries": []map[string]interface{}{ + { + "name": "below-threshold", + "query": "below-threshold-query[%v]", + "threshold": 60, + "lowerBound": true, + }, + }, + }, + samples: map[string][]*model.Sample{ + "below-threshold-query[60s]": {{Value: model.SampleValue(42)}}, + }, + wantErr: "sample below threshold: want: greater or equal than 60, got: 42", + wantDataItems: []measurementutil.DataItem{ + { + Unit: "ms", + Data: map[string]float64{ + "below-threshold": 42.0, + }, + }, + }, + }, { desc: "missing field metricName", params: map[string]interface{}{ From 821b7b7680df2a84880417a6c80f62cbf3a9d6bd Mon Sep 17 00:00:00 2001 From: Marek Siarkowicz Date: Fri, 14 Jul 2023 16:02:21 +0200 Subject: [PATCH 3/7] Improve request benchmark test --- clusterloader2/testing/request-benchmark/config.yaml | 6 ++++-- .../testing/request-benchmark/configmap.yaml | 4 ++-- .../testing/request-benchmark/deployment.yaml | 1 + .../modules/benchmark-deployment.yaml | 2 ++ .../request-benchmark/modules/measurements.yaml | 12 ++++++++++++ util-images/request-benchmark/Makefile | 2 +- util-images/request-benchmark/client.go | 5 +++-- util-images/request-benchmark/main.go | 3 ++- 8 files changed, 27 insertions(+), 8 deletions(-) diff --git a/clusterloader2/testing/request-benchmark/config.yaml b/clusterloader2/testing/request-benchmark/config.yaml index ea8d557bb4..e8cb6eeb5f 100644 --- a/clusterloader2/testing/request-benchmark/config.yaml +++ b/clusterloader2/testing/request-benchmark/config.yaml @@ -5,6 +5,7 @@ {{$benchmarkReplicas := DefaultParam .CL2_BENCHMARK_PODS 1}} {{$inflight := DefaultParam .CL2_BENCHMARK_INFLIGHT 10}} +{{$qps := DefaultParam .CL2_BENCHMARK_QPS -1}} {{$uri := DefaultParam .CL2_BENCHMARK_URI ""}} # URI example: /api/v1/namespaces/%namespace%/pods name: Request benchmark @@ -13,7 +14,7 @@ namespace: tuningSets: - name: Sequence parallelismLimitedLoad: - parallelismLimit: 1 + parallelismLimit: 10 steps: - name: Setup permissions phases: @@ -43,7 +44,7 @@ steps: objectBundle: - basename: {{$configMapGroup}} objectTemplatePath: configmap.yaml - params: + templateFillMap: bytes: {{$configMapBytes}} group: {{$configMapGroup}} - module: @@ -56,6 +57,7 @@ steps: replicas: {{$benchmarkReplicas}} inflight: {{$inflight}} uri: {{$uri}} + qps: {{$qps}} - module: path: modules/measurements.yaml params: diff --git a/clusterloader2/testing/request-benchmark/configmap.yaml b/clusterloader2/testing/request-benchmark/configmap.yaml index 4c4a223ab9..6e0582cb9d 100644 --- a/clusterloader2/testing/request-benchmark/configmap.yaml +++ b/clusterloader2/testing/request-benchmark/configmap.yaml @@ -1,4 +1,4 @@ -{{$bytes := DefaultParam .bytes 0}} +{{$bytes := .bytes}} {{$group := DefaultParam .group .Name}} apiVersion: v1 @@ -8,4 +8,4 @@ metadata: labels: group: {{$group}} data: - key: "{{range $i := Loop $bytes}}{{RandIntRange 0 9}}{{end}}" + key: "{{RandData $bytes}}" diff --git a/clusterloader2/testing/request-benchmark/deployment.yaml b/clusterloader2/testing/request-benchmark/deployment.yaml index a688d9cdca..fb90715002 100644 --- a/clusterloader2/testing/request-benchmark/deployment.yaml +++ b/clusterloader2/testing/request-benchmark/deployment.yaml @@ -27,6 +27,7 @@ spec: - --inflight={{.Inflight}} - --namespace={{.Namespace}} - --uri={{.Uri}} + - --qps={{.QPS}} resources: requests: cpu: {{$cpu}} diff --git a/clusterloader2/testing/request-benchmark/modules/benchmark-deployment.yaml b/clusterloader2/testing/request-benchmark/modules/benchmark-deployment.yaml index afbdccc23d..16bfd209e3 100644 --- a/clusterloader2/testing/request-benchmark/modules/benchmark-deployment.yaml +++ b/clusterloader2/testing/request-benchmark/modules/benchmark-deployment.yaml @@ -1,6 +1,7 @@ {{$replicas := DefaultParam .replicas 0}} {{$inflight := DefaultParam .inflight 0}} {{$uri := DefaultParam .uri "/"}} +{{$qps := DefaultParam .qps -1}} steps: - name: Creating WaitForControlledPodsRunning measurement @@ -28,6 +29,7 @@ steps: Replicas: {{$replicas}} Inflight: {{$inflight}} Uri: {{$uri}} + QPS: {{$qps}} - name: Waiting for WaitForControlledPodsRunning gather measurements: - Identifier: WaitForBenchmarkDeployment diff --git a/clusterloader2/testing/request-benchmark/modules/measurements.yaml b/clusterloader2/testing/request-benchmark/modules/measurements.yaml index 569879f85e..3a0d57518a 100644 --- a/clusterloader2/testing/request-benchmark/modules/measurements.yaml +++ b/clusterloader2/testing/request-benchmark/modules/measurements.yaml @@ -9,6 +9,12 @@ steps: duration: 1m - name: "Starting measurement - {{.name}}" measurements: + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: start + enableViolations: false + useSimpleLatencyQuery: true - Identifier: ContainerCPU-{{.name}} Method: GenericPrometheusQuery Params: @@ -49,3 +55,9 @@ steps: query: quantile_over_time(0.90, sum by (container) (rate(container_cpu_usage_seconds_total[1m]))[%v:]) - name: Perc50 query: quantile_over_time(0.50, sum by (container) (rate(container_cpu_usage_seconds_total[1m]))[%v:]) + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: gather + enableViolations: false + useSimpleLatencyQuery: true diff --git a/util-images/request-benchmark/Makefile b/util-images/request-benchmark/Makefile index a8a95c61ba..6995d684ac 100644 --- a/util-images/request-benchmark/Makefile +++ b/util-images/request-benchmark/Makefile @@ -1,6 +1,6 @@ PROJECT ?= k8s-testimages IMG = gcr.io/$(PROJECT)/perf-tests-util/request-benchmark -TAG = v0.0.5 +TAG = v0.0.6 all: build push diff --git a/util-images/request-benchmark/client.go b/util-images/request-benchmark/client.go index cbb45765f7..ff012dc77f 100644 --- a/util-images/request-benchmark/client.go +++ b/util-images/request-benchmark/client.go @@ -48,13 +48,14 @@ func (c *Client) TimedRequest(ctx context.Context, request *rest.Request) Respon } } -func GetClient() (*Client, error) { +func GetClient(qps float32) (*Client, error) { config, err := getConfig() if err != nil { return nil, err } - config.QPS = -1 // Disable rate limiter + config.QPS = qps + config.Burst = 10 client, err := kubernetes.NewForConfig(config) return (*Client)(client), err diff --git a/util-images/request-benchmark/main.go b/util-images/request-benchmark/main.go index ed07fa9427..ee297e2c0d 100644 --- a/util-images/request-benchmark/main.go +++ b/util-images/request-benchmark/main.go @@ -34,6 +34,7 @@ var ( namespace = flag.String("namespace", "", "Replace %namespace% in URI with provided namespace") URI = flag.String("uri", "", "Request URI") verb = flag.String("verb", "GET", "A verb to be used in requests.") + qps = flag.Float64("qps", -1, "The qps limit for all requests") ) func init() { @@ -41,7 +42,7 @@ func init() { } func main() { - client, err := GetClient() + client, err := GetClient(float32(*qps)) if err != nil { panic(err) } From 5acb50b530817f7cd7773c0e216a825653d38f52 Mon Sep 17 00:00:00 2001 From: Lukasz Szaszkiewicz Date: Thu, 20 Jul 2023 16:24:31 +0200 Subject: [PATCH 4/7] issue:2287: decrease the number of workes to see impact on latency --- clusterloader2/testing/watch-list/config.yaml | 3 ++- clusterloader2/testing/watch-list/job.yaml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/clusterloader2/testing/watch-list/config.yaml b/clusterloader2/testing/watch-list/config.yaml index 74c7dee4ea..e5ee2d5362 100644 --- a/clusterloader2/testing/watch-list/config.yaml +++ b/clusterloader2/testing/watch-list/config.yaml @@ -42,7 +42,8 @@ steps: min: 1 max: 1 basename: watch-list - replicasPerNamespace: 2 + # TODO(p0lyn0mial): bring back 2 replicas + replicasPerNamespace: 1 tuningSet: Uniform10qps objectBundle: - basename: watch-list-secret diff --git a/clusterloader2/testing/watch-list/job.yaml b/clusterloader2/testing/watch-list/job.yaml index b1b947022e..5876216a09 100644 --- a/clusterloader2/testing/watch-list/job.yaml +++ b/clusterloader2/testing/watch-list/job.yaml @@ -21,5 +21,6 @@ spec: memory: "16Gi" cpu: "6" command: [ "watch-list" ] - args: [ "--alsologtostderr=true", "--v=4", "--timeout={{.Duration}}", "--count=16", "--namespace=watch-list-1", "--enableWatchListFeature={{.EnableWatchListFeature}}"] + # TODO(p0lyn0mial): bring back 16 informers + args: [ "--alsologtostderr=true", "--v=4", "--timeout={{.Duration}}", "--count=2", "--namespace=watch-list-1", "--enableWatchListFeature={{.EnableWatchListFeature}}"] restartPolicy: Never From a51454d5ab3af3c41a187c012e5a7dd903288972 Mon Sep 17 00:00:00 2001 From: Maciej Wyrzuc Date: Fri, 21 Jul 2023 07:12:21 +0000 Subject: [PATCH 5/7] Update golang compiler for golang tip job --- golang/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/golang/Makefile b/golang/Makefile index 4b6504b947..b8083aaa07 100644 --- a/golang/Makefile +++ b/golang/Makefile @@ -15,7 +15,7 @@ .PHONY: build-go build push export GCS_BUCKET?=k8s-infra-scale-golang-builds -export GO_COMPILER_PKG?=go1.18.5.linux-amd64.tar.gz +export GO_COMPILER_PKG?=go1.20.6.linux-amd64.tar.gz export GO_COMPILER_URL?=https://dl.google.com/go/$(GO_COMPILER_PKG) export ROOT_DIR?=/home/prow/go/src From 36483110212d729575344acddce16a0d76280e71 Mon Sep 17 00:00:00 2001 From: Lukasz Szaszkiewicz Date: Mon, 24 Jul 2023 10:18:27 +0200 Subject: [PATCH 6/7] issue:2287: slightly increase the number of informers to see impact on latency --- clusterloader2/testing/watch-list/job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clusterloader2/testing/watch-list/job.yaml b/clusterloader2/testing/watch-list/job.yaml index 5876216a09..1f407d6bcf 100644 --- a/clusterloader2/testing/watch-list/job.yaml +++ b/clusterloader2/testing/watch-list/job.yaml @@ -22,5 +22,5 @@ spec: cpu: "6" command: [ "watch-list" ] # TODO(p0lyn0mial): bring back 16 informers - args: [ "--alsologtostderr=true", "--v=4", "--timeout={{.Duration}}", "--count=2", "--namespace=watch-list-1", "--enableWatchListFeature={{.EnableWatchListFeature}}"] + args: [ "--alsologtostderr=true", "--v=4", "--timeout={{.Duration}}", "--count=8", "--namespace=watch-list-1", "--enableWatchListFeature={{.EnableWatchListFeature}}"] restartPolicy: Never From e2fae8ffd2ef9eee14aab92cb28e62039127c691 Mon Sep 17 00:00:00 2001 From: Lukasz Szaszkiewicz Date: Fri, 28 Jul 2023 15:22:23 +0200 Subject: [PATCH 7/7] issue:2287: find maximum number of informers that doesn't impact latency. --- clusterloader2/testing/watch-list/job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clusterloader2/testing/watch-list/job.yaml b/clusterloader2/testing/watch-list/job.yaml index 1f407d6bcf..38500689e5 100644 --- a/clusterloader2/testing/watch-list/job.yaml +++ b/clusterloader2/testing/watch-list/job.yaml @@ -22,5 +22,5 @@ spec: cpu: "6" command: [ "watch-list" ] # TODO(p0lyn0mial): bring back 16 informers - args: [ "--alsologtostderr=true", "--v=4", "--timeout={{.Duration}}", "--count=8", "--namespace=watch-list-1", "--enableWatchListFeature={{.EnableWatchListFeature}}"] + args: [ "--alsologtostderr=true", "--v=4", "--timeout={{.Duration}}", "--count=6", "--namespace=watch-list-1", "--enableWatchListFeature={{.EnableWatchListFeature}}"] restartPolicy: Never