From e9b0a9210b21238566630149190add106ef39f0f Mon Sep 17 00:00:00 2001 From: bjee19 <139261241+bjee19@users.noreply.github.com> Date: Mon, 12 Aug 2024 15:40:25 -0700 Subject: [PATCH] Automate Reconfiguration Performance Test (#2313) Automate the reconfiguration performance test. Problem: We want to automate our reconfiguration performance test so that we don't need to run it manually. Solution: Automated the reconfiguration performance test. Testing: Checked that results are reasonable. --- tests/framework/prometheus.go | 325 +++++++++ tests/framework/resourcemanager.go | 63 +- .../scripts/create-resources-gw-last.sh | 31 - .../scripts/create-resources-routes-last.sh | 29 - tests/reconfig/scripts/delete-multiple.sh | 16 - tests/reconfig/setup.md | 120 ---- .../reconfig}/1.0.0/1.0.0.md | 0 .../reconfig}/1.1.0/1.1.0.md | 0 .../reconfig}/1.2.0/1.2.0.md | 0 .../reconfig}/1.3.0/1.3.0.md | 0 .../manifests/reconfig}/cafe-routes.yaml | 0 .../manifests/reconfig/cafe-secret.yaml} | 6 - .../manifests/reconfig}/cafe.yaml | 0 .../manifests/reconfig}/gateway.yaml | 1 - .../manifests/reconfig}/reference-grant.yaml | 1 - tests/suite/reconfig_test.go | 634 ++++++++++++++++++ tests/suite/scale_test.go | 198 +----- tests/suite/system_suite_test.go | 4 +- 18 files changed, 1030 insertions(+), 398 deletions(-) delete mode 100755 tests/reconfig/scripts/create-resources-gw-last.sh delete mode 100755 tests/reconfig/scripts/create-resources-routes-last.sh delete mode 100755 tests/reconfig/scripts/delete-multiple.sh delete mode 100644 tests/reconfig/setup.md rename tests/{reconfig/results => results/reconfig}/1.0.0/1.0.0.md (100%) rename tests/{reconfig/results => results/reconfig}/1.1.0/1.1.0.md (100%) rename tests/{reconfig/results => results/reconfig}/1.2.0/1.2.0.md (100%) rename tests/{reconfig/results => results/reconfig}/1.3.0/1.3.0.md (100%) rename tests/{reconfig/scripts => suite/manifests/reconfig}/cafe-routes.yaml (100%) rename tests/{reconfig/scripts/certificate-ns-and-cafe-secret.yaml => suite/manifests/reconfig/cafe-secret.yaml} (97%) rename tests/{reconfig/scripts => suite/manifests/reconfig}/cafe.yaml (100%) rename tests/{reconfig/scripts => suite/manifests/reconfig}/gateway.yaml (93%) rename tests/{reconfig/scripts => suite/manifests/reconfig}/reference-grant.yaml (93%) create mode 100644 tests/suite/reconfig_test.go diff --git a/tests/framework/prometheus.go b/tests/framework/prometheus.go index 358ec5d09a..d8794562df 100644 --- a/tests/framework/prometheus.go +++ b/tests/framework/prometheus.go @@ -293,3 +293,328 @@ func WritePrometheusMatrixToCSVFile(fileName string, value model.Value) error { return nil } + +// Bucket represents a data point of a Histogram Bucket. +type Bucket struct { + // Le is the interval Less than or Equal which represents the Bucket's bin. i.e. "500ms". + Le string + // Val is the value for how many instances fall in the Bucket. + Val int +} + +// GetReloadCount gets the total number of nginx reloads. +func GetReloadCount(promInstance PrometheusInstance, ngfPodName string) (float64, error) { + return getFirstValueOfVector( + fmt.Sprintf( + `nginx_gateway_fabric_nginx_reloads_total{pod="%[1]s"}`, + ngfPodName, + ), + promInstance, + ) +} + +// GetReloadCountWithStartTime gets the total number of nginx reloads from a start time to the current time. 
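+// The subtraction relies on the PromQL `@` modifier to pin the second term
+// to startTime, so the result is the number of reloads since that instant
+// (assuming the counter did not reset in between).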
+func GetReloadCountWithStartTime( + promInstance PrometheusInstance, + ngfPodName string, + startTime time.Time, +) (float64, error) { + return getFirstValueOfVector( + fmt.Sprintf( + `nginx_gateway_fabric_nginx_reloads_total{pod="%[1]s"}`+ + ` - `+ + `nginx_gateway_fabric_nginx_reloads_total{pod="%[1]s"} @ %d`, + ngfPodName, + startTime.Unix(), + ), + promInstance, + ) +} + +// GetReloadErrsCountWithStartTime gets the total number of nginx reload errors from a start time to the current time. +func GetReloadErrsCountWithStartTime( + promInstance PrometheusInstance, + ngfPodName string, + startTime time.Time, +) (float64, error) { + return getFirstValueOfVector( + fmt.Sprintf( + `nginx_gateway_fabric_nginx_reload_errors_total{pod="%[1]s"}`+ + ` - `+ + `nginx_gateway_fabric_nginx_reload_errors_total{pod="%[1]s"} @ %d`, + ngfPodName, + startTime.Unix(), + ), + promInstance, + ) +} + +// GetReloadAvgTime gets the average time in milliseconds for nginx to reload. +func GetReloadAvgTime(promInstance PrometheusInstance, ngfPodName string) (float64, error) { + return getFirstValueOfVector( + fmt.Sprintf( + `nginx_gateway_fabric_nginx_reloads_milliseconds_sum{pod="%[1]s"}`+ + ` / `+ + `nginx_gateway_fabric_nginx_reloads_total{pod="%[1]s"}`, + ngfPodName, + ), + promInstance, + ) +} + +// GetReloadAvgTimeWithStartTime gets the average time in milliseconds for nginx to reload using a start time +// to the current time to calculate. +func GetReloadAvgTimeWithStartTime( + promInstance PrometheusInstance, + ngfPodName string, + startTime time.Time, +) (float64, error) { + return getFirstValueOfVector( + fmt.Sprintf( + `(nginx_gateway_fabric_nginx_reloads_milliseconds_sum{pod="%[1]s"}`+ + ` - `+ + `nginx_gateway_fabric_nginx_reloads_milliseconds_sum{pod="%[1]s"} @ %[2]d)`+ + ` / `+ + `(nginx_gateway_fabric_nginx_reloads_total{pod="%[1]s"}`+ + ` - `+ + `nginx_gateway_fabric_nginx_reloads_total{pod="%[1]s"} @ %[2]d)`, + ngfPodName, + startTime.Unix(), + ), + promInstance, + ) +} + +// GetReloadBuckets gets the Buckets in millisecond intervals for nginx reloads. +func GetReloadBuckets(promInstance PrometheusInstance, ngfPodName string) ([]Bucket, error) { + return getBuckets( + fmt.Sprintf( + `nginx_gateway_fabric_nginx_reloads_milliseconds_bucket{pod="%[1]s"}`, + ngfPodName, + ), + promInstance, + ) +} + +// GetReloadBucketsWithStartTime gets the Buckets in millisecond intervals for nginx reloads from a start time +// to the current time. +func GetReloadBucketsWithStartTime( + promInstance PrometheusInstance, + ngfPodName string, + startTime time.Time, +) ([]Bucket, error) { + return getBuckets( + fmt.Sprintf( + `nginx_gateway_fabric_nginx_reloads_milliseconds_bucket{pod="%[1]s"}`+ + ` - `+ + `nginx_gateway_fabric_nginx_reloads_milliseconds_bucket{pod="%[1]s"} @ %d`, + ngfPodName, + startTime.Unix(), + ), + promInstance, + ) +} + +// GetEventsCount gets the NGF event batch processing count. +func GetEventsCount(promInstance PrometheusInstance, ngfPodName string) (float64, error) { + return getFirstValueOfVector( + fmt.Sprintf( + `nginx_gateway_fabric_event_batch_processing_milliseconds_count{pod="%[1]s"}`, + ngfPodName, + ), + promInstance, + ) +} + +// GetEventsCountWithStartTime gets the NGF event batch processing count from a start time to the current time. 
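+// As with GetReloadCountWithStartTime, the value pinned at startTime via the
+// `@` modifier is subtracted from the current value to yield the delta.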
+func GetEventsCountWithStartTime( + promInstance PrometheusInstance, + ngfPodName string, + startTime time.Time, +) (float64, error) { + return getFirstValueOfVector( + fmt.Sprintf( + `nginx_gateway_fabric_event_batch_processing_milliseconds_count{pod="%[1]s"}`+ + ` - `+ + `nginx_gateway_fabric_event_batch_processing_milliseconds_count{pod="%[1]s"} @ %d`, + ngfPodName, + startTime.Unix(), + ), + promInstance, + ) +} + +// GetEventsAvgTime gets the average time in milliseconds it takes for NGF to process a single event batch. +func GetEventsAvgTime(promInstance PrometheusInstance, ngfPodName string) (float64, error) { + return getFirstValueOfVector( + fmt.Sprintf( + `nginx_gateway_fabric_event_batch_processing_milliseconds_sum{pod="%[1]s"}`+ + ` / `+ + `nginx_gateway_fabric_event_batch_processing_milliseconds_count{pod="%[1]s"}`, + ngfPodName, + ), + promInstance, + ) +} + +// GetEventsAvgTimeWithStartTime gets the average time in milliseconds it takes for NGF to process a single event +// batch using a start time to the current time to calculate. +func GetEventsAvgTimeWithStartTime( + promInstance PrometheusInstance, + ngfPodName string, + startTime time.Time, +) (float64, error) { + return getFirstValueOfVector( + fmt.Sprintf( + `(nginx_gateway_fabric_event_batch_processing_milliseconds_sum{pod="%[1]s"}`+ + ` - `+ + `nginx_gateway_fabric_event_batch_processing_milliseconds_sum{pod="%[1]s"} @ %[2]d)`+ + ` / `+ + `(nginx_gateway_fabric_event_batch_processing_milliseconds_count{pod="%[1]s"}`+ + ` - `+ + `nginx_gateway_fabric_event_batch_processing_milliseconds_count{pod="%[1]s"} @ %[2]d)`, + ngfPodName, + startTime.Unix(), + ), + promInstance, + ) +} + +// GetEventsBuckets gets the Buckets in millisecond intervals for NGF event batch processing. +func GetEventsBuckets(promInstance PrometheusInstance, ngfPodName string) ([]Bucket, error) { + return getBuckets( + fmt.Sprintf( + `nginx_gateway_fabric_event_batch_processing_milliseconds_bucket{pod="%[1]s"}`, + ngfPodName, + ), + promInstance, + ) +} + +// GetEventsBucketsWithStartTime gets the Buckets in millisecond intervals for NGF event batch processing from a start +// time to the current time. +func GetEventsBucketsWithStartTime( + promInstance PrometheusInstance, + ngfPodName string, + startTime time.Time, +) ([]Bucket, error) { + return getBuckets( + fmt.Sprintf( + `nginx_gateway_fabric_event_batch_processing_milliseconds_bucket{pod="%[1]s"}`+ + ` - `+ + `nginx_gateway_fabric_event_batch_processing_milliseconds_bucket{pod="%[1]s"} @ %d`, + ngfPodName, + startTime.Unix(), + ), + promInstance, + ) +} + +// CreateMetricExistChecker returns a function that will query Prometheus at a specific timestamp +// and adjust that timestamp if there is no result found. +func CreateMetricExistChecker( + promInstance PrometheusInstance, + query string, + getTime func() time.Time, + modifyTime func(), +) func() error { + return func() error { + queryWithTimestamp := fmt.Sprintf("%s @ %d", query, getTime().Unix()) + + result, err := promInstance.Query(queryWithTimestamp) + if err != nil { + return fmt.Errorf("failed to query Prometheus: %w", err) + } + + if result.String() == "" { + modifyTime() + return errors.New("empty result") + } + + return nil + } +} + +// CreateEndTimeFinder returns a function that will range query Prometheus given a specific startTime and endTime +// and adjust the endTime if there is no result found. 
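+// The returned function is intended to be polled (for example with Gomega's
+// Eventually): on an empty result it advances endTime to time.Now() and
+// returns an error, so polling continues until the range query returns data.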
+func CreateEndTimeFinder( + promInstance PrometheusInstance, + query string, + startTime time.Time, + endTime *time.Time, + queryRangeStep time.Duration, +) func() error { + return func() error { + result, err := promInstance.QueryRange(query, v1.Range{ + Start: startTime, + End: *endTime, + Step: queryRangeStep, + }) + if err != nil { + return fmt.Errorf("failed to query Prometheus: %w", err) + } + + if result.String() == "" { + *endTime = time.Now() + return errors.New("empty result") + } + + return nil + } +} + +// CreateResponseChecker returns a function that checks if there is a successful response from a url. +func CreateResponseChecker(url, address string, requestTimeout time.Duration) func() error { + return func() error { + status, _, err := Get(url, address, requestTimeout) + if err != nil { + return fmt.Errorf("bad response: %w", err) + } + + if status != 200 { + return fmt.Errorf("unexpected status code: %d", status) + } + + return nil + } +} + +func getFirstValueOfVector(query string, promInstance PrometheusInstance) (float64, error) { + result, err := promInstance.Query(query) + if err != nil { + return 0, err + } + + val, err := GetFirstValueOfPrometheusVector(result) + if err != nil { + return 0, err + } + + return val, nil +} + +func getBuckets(query string, promInstance PrometheusInstance) ([]Bucket, error) { + result, err := promInstance.Query(query) + if err != nil { + return nil, err + } + + res, ok := result.(model.Vector) + if !ok { + return nil, errors.New("could not convert result to vector") + } + + buckets := make([]Bucket, 0, len(res)) + + for _, sample := range res { + le := sample.Metric["le"] + val := float64(sample.Value) + bucket := Bucket{ + Le: string(le), + Val: int(val), + } + buckets = append(buckets, bucket) + } + + return buckets, nil +} diff --git a/tests/framework/resourcemanager.go b/tests/framework/resourcemanager.go index e7a585cf79..4ce3fec712 100644 --- a/tests/framework/resourcemanager.go +++ b/tests/framework/resourcemanager.go @@ -115,6 +115,20 @@ func (rm *ResourceManager) Apply(resources []client.Object) error { // ApplyFromFiles creates or updates Kubernetes resources defined within the provided YAML files. func (rm *ResourceManager) ApplyFromFiles(files []string, namespace string) error { + for _, file := range files { + data, err := rm.GetFileContents(file) + if err != nil { + return err + } + + if err = rm.ApplyFromBuffer(data, namespace); err != nil { + return err + } + } + return nil +} + +func (rm *ResourceManager) ApplyFromBuffer(buffer *bytes.Buffer, namespace string) error { ctx, cancel := context.WithTimeout(context.Background(), rm.TimeoutConfig.CreateTimeout) defer cancel() @@ -150,7 +164,7 @@ func (rm *ResourceManager) ApplyFromFiles(files []string, namespace string) erro return nil } - return rm.readAndHandleObjects(handlerFunc, files) + return rm.readAndHandleObject(handlerFunc, buffer) } // Delete deletes Kubernetes resources defined as Go objects. 
@@ -213,36 +227,41 @@ func (rm *ResourceManager) DeleteFromFiles(files []string, namespace string) err return nil } - return rm.readAndHandleObjects(handlerFunc, files) -} - -func (rm *ResourceManager) readAndHandleObjects( - handle func(unstructured.Unstructured) error, - files []string, -) error { for _, file := range files { data, err := rm.GetFileContents(file) if err != nil { return err } - decoder := yaml.NewYAMLOrJSONDecoder(data, 4096) - for { - obj := unstructured.Unstructured{} - if err := decoder.Decode(&obj); err != nil { - if errors.Is(err, io.EOF) { - break - } - return fmt.Errorf("error decoding resource: %w", err) - } + if err = rm.readAndHandleObject(handlerFunc, data); err != nil { + return err + } + } - if len(obj.Object) == 0 { - continue - } + return nil +} - if err := handle(obj); err != nil { - return err +func (rm *ResourceManager) readAndHandleObject( + handle func(unstructured.Unstructured) error, + data *bytes.Buffer, +) error { + decoder := yaml.NewYAMLOrJSONDecoder(data, 4096) + + for { + obj := unstructured.Unstructured{} + if err := decoder.Decode(&obj); err != nil { + if errors.Is(err, io.EOF) { + break } + return fmt.Errorf("error decoding resource: %w", err) + } + + if len(obj.Object) == 0 { + continue + } + + if err := handle(obj); err != nil { + return err } } diff --git a/tests/reconfig/scripts/create-resources-gw-last.sh b/tests/reconfig/scripts/create-resources-gw-last.sh deleted file mode 100755 index a1be0fc9e6..0000000000 --- a/tests/reconfig/scripts/create-resources-gw-last.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash - -num_namespaces=$1 - -# Create namespaces -for ((i = 1; i <= num_namespaces; i++)); do - namespace_name="namespace$i" - kubectl create namespace "$namespace_name" -done - -# Create single instance resources -kubectl create -f certificate-ns-and-cafe-secret.yaml -kubectl create -f reference-grant.yaml - -# Create backend service and apps -for ((i = 1; i <= num_namespaces; i++)); do - namespace_name="namespace$i" - sed -e "s/coffee/coffee${namespace_name}/g" -e "s/tea/tea${namespace_name}/g" cafe.yaml | kubectl apply -n "$namespace_name" -f - -done - -# Create routes -for ((i = 1; i <= num_namespaces; i++)); do - namespace_name="namespace$i" - sed -e "s/coffee/coffee${namespace_name}/g" -e "s/tea/tea${namespace_name}/g" cafe-routes.yaml | kubectl apply -n "$namespace_name" -f - -done - -# Wait for apps to be ready -sleep 60 - -# Create Gateway -kubectl create -f gateway.yaml diff --git a/tests/reconfig/scripts/create-resources-routes-last.sh b/tests/reconfig/scripts/create-resources-routes-last.sh deleted file mode 100755 index be41d9a706..0000000000 --- a/tests/reconfig/scripts/create-resources-routes-last.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env bash - -num_namespaces=$1 - -# Create namespaces -for ((i = 1; i <= num_namespaces; i++)); do - namespace_name="namespace$i" - kubectl create namespace "$namespace_name" -done - -# Create backend service and apps -for ((i = 1; i <= num_namespaces; i++)); do - namespace_name="namespace$i" - sed -e "s/coffee/coffee${namespace_name}/g" -e "s/tea/tea${namespace_name}/g" cafe.yaml | kubectl apply -n "$namespace_name" -f - -done - -# Wait for apps to be ready -sleep 60 - -# Create single instance resources -kubectl create -f certificate-ns-and-cafe-secret.yaml -kubectl create -f reference-grant.yaml -kubectl create -f gateway.yaml - -# Create routes -for ((i = 1; i <= num_namespaces; i++)); do - namespace_name="namespace$i" - sed -e "s/coffee/coffee${namespace_name}/g" -e 
"s/tea/tea${namespace_name}/g" cafe-routes.yaml | kubectl apply -n "$namespace_name" -f - -done diff --git a/tests/reconfig/scripts/delete-multiple.sh b/tests/reconfig/scripts/delete-multiple.sh deleted file mode 100755 index 2f9752e8c9..0000000000 --- a/tests/reconfig/scripts/delete-multiple.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash - -num_namespaces=$1 - -# Delete namespaces -namespaces="" -for ((i = 1; i <= num_namespaces; i++)); do - namespaces+="namespace${i} " -done - -kubectl delete namespace "${namespaces}" - -# Delete single instance resources -kubectl delete -f gateway.yaml -kubectl delete -f reference-grant.yaml -kubectl delete -f certificate-ns-and-cafe-secret.yaml diff --git a/tests/reconfig/setup.md b/tests/reconfig/setup.md deleted file mode 100644 index 8729115544..0000000000 --- a/tests/reconfig/setup.md +++ /dev/null @@ -1,120 +0,0 @@ -# Reconfig tests - - -- [Reconfig tests](#reconfig-tests) - - [Goals](#goals) - - [Test Environment](#test-environment) - - [Setup](#setup) - - [Tests](#tests) - - [Test 1: Resources exist before start-up](#test-1-resources-exist-before-start-up) - - [Test 2: Start NGF, deploy Gateway, create many resources attached to GW](#test-2-start-ngf-deploy-gateway-create-many-resources-attached-to-gw) - - [Test 3: Start NGF, create many resources attached to a Gateway, deploy the Gateway](#test-3-start-ngf-create-many-resources-attached-to-a-gateway-deploy-the-gateway) - - -## Goals - -- Measure how long it takes NGF to reconfigure NGINX and update statuses when a number of Gateway API and - referenced core Kubernetes resources are created at once. -- Two runs of each test should be ran with differing numbers of resources. Each run will deploy: - - a single Gateway, Secret, and ReferenceGrant resources - - `x+1` number of namespaces - - `2x` number of backend apps and services - - `3x` number of HTTPRoutes. -- Where x=30 OR x=150. - -## Test Environment - -The following cluster will be sufficient: - -- A Kubernetes cluster with 4 nodes on GKE - - Node: e2-medium (2 vCPU, 4GB memory) - -## Setup - -1. Create cloud cluster -2. Install Gateway API Resources: - - ```bash - kubectl kustomize config/crd/gateway-api/standard | kubectl apply -f - - ``` - -3. Deploy NGF from edge using Helm install and wait for LoadBalancer Service to be ready - (NOTE: For Test 1, deploy AFTER resources): - - ```console - helm install my-release oci://ghcr.io/nginxinc/charts/nginx-gateway-fabric --version 0.0.0-edge \ - --create-namespace --wait -n nginx-gateway --set nginxGateway.productTelemetry.enable=false - ``` - -4. Run tests: - 1. There are 3 versions of the reconfiguration tests that need to be ran, with a low and high number of resources. - Therefore, a full test suite includes 6 test runs. - 2. There are scripts to generate the required resources and config changes. - 3. Run each test using the provided script (`scripts/create-resources-gw-last.sh` or - `scripts/create-resources-routes-last.sh` depending on the test). - 4. The scripts accept a number parameter to indicate how many resources should be created. Currently, we are running - with 30 or 150. The scripts will create a single Gateway, Secret and ReferenceGrant resources, `x+1` number of - namespaces, `2x` number of backend apps and services, and `3x` number of HTTPRoutes. - - Note: Clean up after each test run for isolated results. 
There's a script provided for removing all the test - fixtures `scripts/delete-multiple.sh` which takes a number (needs to be the same number as what was used in the - create script.) -5. After each individual test: - - - Describe the Gateway resource and make sure the status is correct. - - Check the logs of both NGF containers for errors. - - Parse the logs for TimeToReady numbers (see steps 6-7 below). - - Grab metrics. - Note: You can expose metrics by running the below snippet and then navigating to `127.0.0.1:9113/metrics`: - - ```console - GW_POD=$(kubectl get pods -n nginx-gateway | sed -n '2s/^\([^[:space:]]*\).*$/\1/p') - kubectl port-forward $GW_POD -n nginx-gateway 9113:9113 & - ``` - -6. Measure NGINX Reloads and Time to Ready Results - 1. TimeToReadyTotal as described in each test - NGF logs. - 2. TimeToReadyAvgSingle which is the average time between updating any resource and the - NGINX configuration being reloaded - NGF logs. - 3. NGINX Reload count - metrics. - 4. Average NGINX reload duration - metrics. - 1. The average reload duration can be computed by taking the `nginx_gateway_fabric_nginx_reloads_milliseconds_sum` - metric value and dividing it by the `nginx_gateway_fabric_nginx_reloads_milliseconds_count` metric value. -7. Measure Event Batch Processing Results - 1. Event Batch Total - `nginx_gateway_fabric_event_batch_processing_milliseconds_count` metric. - 2. Average Event Batch Processing duration - metrics. - 1. The average event batch processing duration can be computed by taking the `nginx_gateway_fabric_event_batch_processing_milliseconds_sum` - metric value and dividing it by the `nginx_gateway_fabric_event_batch_processing_milliseconds_count` metric value. -8. For accuracy, repeat the test suite once or twice, take the averages, and look for any anomalies or outliers. - -## Tests - -### Test 1: Resources exist before start-up - -1. Deploy Gateway resources before start-up: - 1. Use either of the provided scripts with the required number of resources, - e.g. `cd scripts && bash create-resources-gw-last.sh 30`. The script will deploy backend apps and services, wait - 60 seconds for them to be ready, and deploy 1 Gateway, 1 RefGrant, 1 Secret, and HTTPRoutes. - 2. Deploy NGF - 3. Measure TimeToReadyTotal as the time it takes from start-up -> final config written and - NGINX reloaded. Measure the other results as described in steps 6-7 of the [Setup](#setup) section. - -### Test 2: Start NGF, deploy Gateway, create many resources attached to GW - -1. Deploy all Gateway resources, NGF running: - 1. Deploy NGF - 2. Run the provided script with the required number of resources, - e.g. `cd scripts && bash create-resources-routes-last.sh 30`. The script will deploy backend apps and services, - wait 60 seconds for them to be ready, and deploy 1 Gateway, 1 Secret, 1 RefGrant, and HTTPRoutes at the same time. - 3. Measure TimeToReadyTotal as the time it takes from NGF receiving the first HTTPRoute resource update (logs will say "reconciling") -> final - config written and NGINX reloaded. Measure the other results as described in steps 6-7 of the [Setup](#setup) section. - -### Test 3: Start NGF, create many resources attached to a Gateway, deploy the Gateway - -1. Deploy HTTPRoute resources, NGF running, Gateway last: - 1. Deploy NGF - 2. Run the provided script with the required number of resources, - e.g. `cd scripts && bash create-resources-gw-last.sh 30`. 
- The script will deploy the namespaces, backend apps and services, 1 Secret, 1 ReferenceGrant, and the HTTPRoutes; - wait 60 seconds for the backend apps to be ready, and then deploy 1 Gateway for all HTTPRoutes. - 3. Measure TimeToReadyTotal as the time it takes from NGF receiving gateway resource -> config written and NGINX reloaded. - Measure the other results as described in steps 6-7 of the [Setup](#setup) section. diff --git a/tests/reconfig/results/1.0.0/1.0.0.md b/tests/results/reconfig/1.0.0/1.0.0.md similarity index 100% rename from tests/reconfig/results/1.0.0/1.0.0.md rename to tests/results/reconfig/1.0.0/1.0.0.md diff --git a/tests/reconfig/results/1.1.0/1.1.0.md b/tests/results/reconfig/1.1.0/1.1.0.md similarity index 100% rename from tests/reconfig/results/1.1.0/1.1.0.md rename to tests/results/reconfig/1.1.0/1.1.0.md diff --git a/tests/reconfig/results/1.2.0/1.2.0.md b/tests/results/reconfig/1.2.0/1.2.0.md similarity index 100% rename from tests/reconfig/results/1.2.0/1.2.0.md rename to tests/results/reconfig/1.2.0/1.2.0.md diff --git a/tests/reconfig/results/1.3.0/1.3.0.md b/tests/results/reconfig/1.3.0/1.3.0.md similarity index 100% rename from tests/reconfig/results/1.3.0/1.3.0.md rename to tests/results/reconfig/1.3.0/1.3.0.md diff --git a/tests/reconfig/scripts/cafe-routes.yaml b/tests/suite/manifests/reconfig/cafe-routes.yaml similarity index 100% rename from tests/reconfig/scripts/cafe-routes.yaml rename to tests/suite/manifests/reconfig/cafe-routes.yaml diff --git a/tests/reconfig/scripts/certificate-ns-and-cafe-secret.yaml b/tests/suite/manifests/reconfig/cafe-secret.yaml similarity index 97% rename from tests/reconfig/scripts/certificate-ns-and-cafe-secret.yaml rename to tests/suite/manifests/reconfig/cafe-secret.yaml index d4037e2d67..4510460bba 100644 --- a/tests/reconfig/scripts/certificate-ns-and-cafe-secret.yaml +++ b/tests/suite/manifests/reconfig/cafe-secret.yaml @@ -1,13 +1,7 @@ apiVersion: v1 -kind: Namespace -metadata: - name: certificate ---- -apiVersion: v1 kind: Secret metadata: name: cafe-secret - namespace: certificate type: kubernetes.io/tls data: tls.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUNzakNDQVpvQ0NRQzdCdVdXdWRtRkNEQU5CZ2txaGtpRzl3MEJBUXNGQURBYk1Sa3dGd1lEVlFRRERCQmoKWVdabExtVjRZVzF3YkdVdVkyOXRNQjRYRFRJeU1EY3hOREl4TlRJek9Wb1hEVEl6TURjeE5ESXhOVEl6T1ZvdwpHekVaTUJjR0ExVUVBd3dRWTJGbVpTNWxlR0Z0Y0d4bExtTnZiVENDQVNJd0RRWUpLb1pJaHZjTkFRRUJCUUFECmdnRVBBRENDQVFvQ2dnRUJBTHFZMnRHNFc5aStFYzJhdnV4Q2prb2tnUUx1ek10U1Rnc1RNaEhuK3ZRUmxIam8KVzFLRnMvQVdlS25UUStyTWVKVWNseis4M3QwRGtyRThwUisxR2NKSE50WlNMb0NEYUlRN0Nhck5nY1daS0o4Qgo1WDNnVS9YeVJHZjI2c1REd2xzU3NkSEQ1U2U3K2Vab3NPcTdHTVF3K25HR2NVZ0VtL1Q1UEMvY05PWE0zZWxGClRPL051MStoMzROVG9BbDNQdTF2QlpMcDNQVERtQ0thaEROV0NWbUJQUWpNNFI4VERsbFhhMHQ5Z1o1MTRSRzUKWHlZWTNtdzZpUzIrR1dYVXllMjFuWVV4UEhZbDV4RHY0c0FXaGRXbElweHlZQlNCRURjczN6QlI2bFF1OWkxZAp0R1k4dGJ3blVmcUVUR3NZdWxzc05qcU95V1VEcFdJelhibHhJZVVDQXdFQUFUQU5CZ2txaGtpRzl3MEJBUXNGCkFBT0NBUUVBcjkrZWJ0U1dzSnhLTGtLZlRkek1ISFhOd2Y5ZXFVbHNtTXZmMGdBdWVKTUpUR215dG1iWjlpbXQKL2RnWlpYVE9hTElHUG9oZ3BpS0l5eVVRZVdGQ2F0NHRxWkNPVWRhbUloOGk0Q1h6QVJYVHNvcUNOenNNLzZMRQphM25XbFZyS2lmZHYrWkxyRi8vblc0VVNvOEoxaCtQeDljY0tpRDZZU0RVUERDRGh1RUtFWXcvbHpoUDJVOXNmCnl6cEJKVGQ4enFyM3paTjNGWWlITmgzYlRhQS82di9jU2lyamNTK1EwQXg4RWpzQzYxRjRVMTc4QzdWNWRCKzQKcmtPTy9QNlA0UFlWNTRZZHMvRjE2WkZJTHFBNENCYnExRExuYWRxamxyN3NPbzl2ZzNnWFNMYXBVVkdtZ2todAp6VlZPWG1mU0Z4OS90MDBHUi95bUdPbERJbWlXMGc9PQotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0tCg== diff --git a/tests/reconfig/scripts/cafe.yaml 
b/tests/suite/manifests/reconfig/cafe.yaml similarity index 100% rename from tests/reconfig/scripts/cafe.yaml rename to tests/suite/manifests/reconfig/cafe.yaml diff --git a/tests/reconfig/scripts/gateway.yaml b/tests/suite/manifests/reconfig/gateway.yaml similarity index 93% rename from tests/reconfig/scripts/gateway.yaml rename to tests/suite/manifests/reconfig/gateway.yaml index fd9d52675b..ed6c91eb6a 100644 --- a/tests/reconfig/scripts/gateway.yaml +++ b/tests/suite/manifests/reconfig/gateway.yaml @@ -22,4 +22,3 @@ spec: certificateRefs: - kind: Secret name: cafe-secret - namespace: certificate diff --git a/tests/reconfig/scripts/reference-grant.yaml b/tests/suite/manifests/reconfig/reference-grant.yaml similarity index 93% rename from tests/reconfig/scripts/reference-grant.yaml rename to tests/suite/manifests/reconfig/reference-grant.yaml index 053bbbdcc2..e01df54009 100644 --- a/tests/reconfig/scripts/reference-grant.yaml +++ b/tests/suite/manifests/reconfig/reference-grant.yaml @@ -2,7 +2,6 @@ apiVersion: gateway.networking.k8s.io/v1beta1 kind: ReferenceGrant metadata: name: access-to-cafe-secret - namespace: certificate spec: to: - group: "" diff --git a/tests/suite/reconfig_test.go b/tests/suite/reconfig_test.go new file mode 100644 index 0000000000..386b69000e --- /dev/null +++ b/tests/suite/reconfig_test.go @@ -0,0 +1,634 @@ +package main + +import ( + "bytes" + "context" + "fmt" + "io" + "os" + "path/filepath" + "strconv" + "strings" + "text/template" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + core "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + ctlr "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + v1 "sigs.k8s.io/gateway-api/apis/v1" + + "github.com/nginxinc/nginx-gateway-fabric/tests/framework" +) + +// Cluster node size must be greater than or equal to 4 for test to perform correctly. +var _ = Describe("Reconfiguration Performance Testing", Ordered, Label("reconfiguration", "nfr"), func() { + const ( + // used for cleaning up resources + maxResourceCount = 150 + + metricExistTimeout = 2 * time.Minute + metricExistPolling = 1 * time.Second + ) + + var ( + scrapeInterval = 15 * time.Second + queryRangeStep = 5 * time.Second + promInstance framework.PrometheusInstance + promPortForwardStopCh = make(chan struct{}) + + reconfigNamespace core.Namespace + + outFile *os.File + ) + + BeforeAll(func() { + resultsDir, err := framework.CreateResultsDir("reconfig", version) + Expect(err).ToNot(HaveOccurred()) + + filename := filepath.Join(resultsDir, framework.CreateResultsFilename("md", version, *plusEnabled)) + outFile, err = framework.CreateResultsFile(filename) + Expect(err).ToNot(HaveOccurred()) + Expect(framework.WriteSystemInfoToFile(outFile, clusterInfo, *plusEnabled)).To(Succeed()) + + promCfg := framework.PrometheusConfig{ + ScrapeInterval: scrapeInterval, + } + + promInstance, err = framework.InstallPrometheus(resourceManager, promCfg) + Expect(err).ToNot(HaveOccurred()) + + k8sConfig := ctlr.GetConfigOrDie() + + if !clusterInfo.IsGKE { + Expect(promInstance.PortForward(k8sConfig, promPortForwardStopCh)).To(Succeed()) + } + }) + + BeforeEach(func() { + output, err := framework.InstallGatewayAPI(getDefaultSetupCfg().gwAPIVersion) + Expect(err).ToNot(HaveOccurred(), string(output)) + + // need to redeclare this variable to reset its resource version. 
The framework has some bugs where + // if we set and declare this as a global variable, even after deleting the namespace, when we try to + // recreate it, it will error saying the resource version has already been set. + reconfigNamespace = core.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: "reconfig", + }, + } + }) + + createUniqueResources := func(resourceCount int, fileName string) error { + for i := 1; i <= resourceCount; i++ { + namespace := "namespace" + strconv.Itoa(i) + + b, err := resourceManager.GetFileContents(fileName) + if err != nil { + return fmt.Errorf("error getting manifest file: %w", err) + } + + fileString := b.String() + fileString = strings.ReplaceAll(fileString, "coffee", "coffee"+namespace) + fileString = strings.ReplaceAll(fileString, "tea", "tea"+namespace) + + data := bytes.NewBufferString(fileString) + + if err := resourceManager.ApplyFromBuffer(data, namespace); err != nil { + return fmt.Errorf("error processing manifest file: %w", err) + } + } + + return nil + } + + createResourcesGWLast := func(resourceCount int) { + ctx, cancel := context.WithTimeout(context.Background(), timeoutConfig.CreateTimeout*5) + defer cancel() + + for i := 1; i <= resourceCount; i++ { + ns := core.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: "namespace" + strconv.Itoa(i), + }, + } + Expect(k8sClient.Create(ctx, &ns)).To(Succeed()) + } + + Expect(resourceManager.Apply([]client.Object{&reconfigNamespace})).To(Succeed()) + Expect(resourceManager.ApplyFromFiles( + []string{ + "reconfig/cafe-secret.yaml", + "reconfig/reference-grant.yaml", + }, + reconfigNamespace.Name)).To(Succeed()) + + Expect(createUniqueResources(resourceCount, "manifests/reconfig/cafe.yaml")).To(Succeed()) + + Expect(createUniqueResources(resourceCount, "manifests/reconfig/cafe-routes.yaml")).To(Succeed()) + + for i := 1; i <= resourceCount; i++ { + ns := core.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: "namespace" + strconv.Itoa(i), + }, + } + Expect(resourceManager.WaitForPodsToBeReady(ctx, ns.Name)).To(Succeed()) + } + + Expect(resourceManager.ApplyFromFiles([]string{"reconfig/gateway.yaml"}, reconfigNamespace.Name)).To(Succeed()) + } + + createResourcesRoutesLast := func(resourceCount int) { + ctx, cancel := context.WithTimeout(context.Background(), timeoutConfig.CreateTimeout*5) + defer cancel() + + for i := 1; i <= resourceCount; i++ { + ns := core.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: "namespace" + strconv.Itoa(i), + }, + } + Expect(k8sClient.Create(ctx, &ns)).To(Succeed()) + } + + Expect(createUniqueResources(resourceCount, "manifests/reconfig/cafe.yaml")).To(Succeed()) + + for i := 1; i <= resourceCount; i++ { + ns := core.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: "namespace" + strconv.Itoa(i), + }, + } + Expect(resourceManager.WaitForPodsToBeReady(ctx, ns.Name)).To(Succeed()) + } + + Expect(resourceManager.Apply([]client.Object{&reconfigNamespace})).To(Succeed()) + Expect(resourceManager.ApplyFromFiles( + []string{ + "reconfig/cafe-secret.yaml", + "reconfig/reference-grant.yaml", + "reconfig/gateway.yaml", + }, + reconfigNamespace.Name)).To(Succeed()) + + Expect(createUniqueResources(resourceCount, "manifests/reconfig/cafe-routes.yaml")).To(Succeed()) + } + + checkResourceCreation := func(resourceCount int) error { + ctx, cancel := context.WithTimeout(context.Background(), timeoutConfig.GetTimeout) + defer cancel() + + var namespaces core.NamespaceList + if err := k8sClient.List(ctx, &namespaces); err != nil { + return fmt.Errorf("error getting namespaces: %w", 
err) + } + Expect(len(namespaces.Items)).To(BeNumerically(">=", resourceCount)) + + var routes v1.HTTPRouteList + if err := k8sClient.List(ctx, &routes); err != nil { + return fmt.Errorf("error getting HTTPRoutes: %w", err) + } + Expect(len(routes.Items)).To(BeNumerically("==", resourceCount*3)) + + var pods core.PodList + if err := k8sClient.List(ctx, &pods); err != nil { + return fmt.Errorf("error getting Pods: %w", err) + } + Expect(len(pods.Items)).To(BeNumerically(">=", resourceCount*2)) + + return nil + } + + cleanupResources := func() error { + var err error + + // FIXME (bjee19): https://github.com/nginxinc/nginx-gateway-fabric/issues/2376 + // Find a way to bulk delete these namespaces. + for i := 1; i <= maxResourceCount; i++ { + nsName := "namespace" + strconv.Itoa(i) + resultError := resourceManager.DeleteNamespace(nsName) + if resultError != nil { + err = resultError + } + } + + Expect(resourceManager.DeleteNamespace(reconfigNamespace.Name)).To(Succeed()) + + return err + } + + getTimeStampFromLogLine := func(logLine string) string { + var timeStamp string + + timeStamp = strings.Split(logLine, "\"ts\":\"")[1] + // sometimes the log message will contain information on a "logger" followed by the "msg" + // while other times the "logger" will be omitted + timeStamp = strings.Split(timeStamp, "\",\"msg\"")[0] + timeStamp = strings.Split(timeStamp, "\",\"logger\"")[0] + + return timeStamp + } + + calculateTimeDifferenceBetweenLogLines := func(firstLine, secondLine string) (int, error) { + layout := time.RFC3339 + + firstTS := getTimeStampFromLogLine(firstLine) + secondTS := getTimeStampFromLogLine(secondLine) + + parsedTS1, err := time.Parse(layout, firstTS) + if err != nil { + return 0, err + } + + parsedTS2, err := time.Parse(layout, secondTS) + if err != nil { + return 0, err + } + + return int(parsedTS2.Sub(parsedTS1).Seconds()), nil + } + + calculateTimeToReadyAverage := func(ngfLogs string) (string, error) { + var reconcilingLine, nginxReloadLine string + const maxCount = 5 + + var times [maxCount]int + var count int + + // parse the logs until it reaches a reconciling log line for a gateway resource, then it compares that + // timestamp to the next NGINX configuration update. When it reaches the NGINX configuration update line, + // it will reset the reconciling log line and set it to the next reconciling log line. 
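+		// Note: this matching is tied to the exact wording of NGF log lines;
+		// see the FIXME in collectMetrics (issue 2374) about removing the
+		// reliance on specific log lines.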
+ for _, line := range strings.Split(ngfLogs, "\n") { + if reconcilingLine == "" && + strings.Contains(line, "Reconciling the resource\",\"controller\"") && + strings.Contains(line, "\"controllerGroup\":\"gateway.networking.k8s.io\"") { + reconcilingLine = line + } + + if strings.Contains(line, "NGINX configuration was successfully updated") && reconcilingLine != "" { + nginxReloadLine = line + + timeDifference, err := calculateTimeDifferenceBetweenLogLines(reconcilingLine, nginxReloadLine) + if err != nil { + return "", err + } + reconcilingLine = "" + + times[count] = timeDifference + count++ + if count == maxCount-1 { + break + } + } + } + + var sum float64 + for _, time := range times { + sum += float64(time) + } + + avgTime := sum / float64(count+1) + + if avgTime < 1 { + return "< 1", nil + } + + return strconv.FormatFloat(avgTime, 'f', -1, 64), nil + } + + calculateTimeToReadyTotal := func(ngfLogs, startingLogSubstring string) (string, error) { + var firstLine, lastLine string + for _, line := range strings.Split(ngfLogs, "\n") { + if firstLine == "" && strings.Contains(line, startingLogSubstring) { + firstLine = line + } + + if strings.Contains(line, "NGINX configuration was successfully updated") { + lastLine = line + } + } + + timeToReadyTotal, err := calculateTimeDifferenceBetweenLogLines(firstLine, lastLine) + if err != nil { + return "", err + } + + stringTimeToReadyTotal := strconv.Itoa(timeToReadyTotal) + if stringTimeToReadyTotal == "0" { + stringTimeToReadyTotal = "< 1" + } + + return stringTimeToReadyTotal, nil + } + + deployNGFReturnsNGFPodNameAndStartTime := func() (string, time.Time) { + var startTime time.Time + + getStartTime := func() time.Time { return startTime } + modifyStartTime := func() { startTime = startTime.Add(500 * time.Millisecond) } + + setup(getDefaultSetupCfg()) + podNames, err := framework.GetReadyNGFPodNames(k8sClient, ngfNamespace, releaseName, timeoutConfig.GetTimeout) + Expect(err).ToNot(HaveOccurred()) + Expect(podNames).To(HaveLen(1)) + ngfPodName := podNames[0] + startTime = time.Now() + + queries := []string{ + fmt.Sprintf(`container_memory_usage_bytes{pod="%s",container="nginx-gateway"}`, ngfPodName), + fmt.Sprintf(`container_cpu_usage_seconds_total{pod="%s",container="nginx-gateway"}`, ngfPodName), + // We don't need to check all nginx_gateway_fabric_* metrics, as they are collected at the same time + fmt.Sprintf(`nginx_gateway_fabric_nginx_reloads_total{pod="%s"}`, ngfPodName), + } + + for _, q := range queries { + Eventually( + framework.CreateMetricExistChecker( + promInstance, + q, + getStartTime, + modifyStartTime, + ), + ).WithTimeout(metricExistTimeout).WithPolling(metricExistPolling).Should(Succeed()) + } + + return ngfPodName, startTime + } + + collectMetrics := func( + testDescription string, + resourceCount int, + timeToReadyStartingLogSubstring string, + ngfPodName string, + startTime time.Time, + ) { + time.Sleep(2 * scrapeInterval) + + endTime := time.Now() + + Eventually( + framework.CreateEndTimeFinder( + promInstance, + fmt.Sprintf(`rate(container_cpu_usage_seconds_total{pod="%s",container="nginx-gateway"}[2m])`, ngfPodName), + startTime, + &endTime, + queryRangeStep, + ), + ).WithTimeout(metricExistTimeout).WithPolling(metricExistPolling).Should(Succeed()) + + getEndTime := func() time.Time { return endTime } + noOpModifier := func() {} + + queries := []string{ + fmt.Sprintf(`container_memory_usage_bytes{pod="%s",container="nginx-gateway"}`, ngfPodName), + // We don't need to check all nginx_gateway_fabric_* metrics, as 
they are collected at the same time + fmt.Sprintf(`nginx_gateway_fabric_nginx_reloads_total{pod="%s"}`, ngfPodName), + } + + for _, q := range queries { + Eventually( + framework.CreateMetricExistChecker( + promInstance, + q, + getEndTime, + noOpModifier, + ), + ).WithTimeout(metricExistTimeout).WithPolling(metricExistPolling).Should(Succeed()) + } + + checkContainerLogsForErrors(ngfPodName, false) + + reloadCount, err := framework.GetReloadCount(promInstance, ngfPodName) + Expect(err).ToNot(HaveOccurred()) + + reloadAvgTime, err := framework.GetReloadAvgTime(promInstance, ngfPodName) + Expect(err).ToNot(HaveOccurred()) + + reloadBuckets, err := framework.GetReloadBuckets(promInstance, ngfPodName) + Expect(err).ToNot(HaveOccurred()) + + eventsCount, err := framework.GetEventsCount(promInstance, ngfPodName) + Expect(err).ToNot(HaveOccurred()) + + eventsAvgTime, err := framework.GetEventsAvgTime(promInstance, ngfPodName) + Expect(err).ToNot(HaveOccurred()) + + eventsBuckets, err := framework.GetEventsBuckets(promInstance, ngfPodName) + Expect(err).ToNot(HaveOccurred()) + + logs, err := resourceManager.GetPodLogs(ngfNamespace, ngfPodName, &core.PodLogOptions{ + Container: "nginx-gateway", + }) + Expect(err).ToNot(HaveOccurred()) + + // FIXME (bjee19): https://github.com/nginxinc/nginx-gateway-fabric/issues/2374 + // Find a way to calculate time to ready metrics without having to rely on specific log lines. + timeToReadyTotal, err := calculateTimeToReadyTotal(logs, timeToReadyStartingLogSubstring) + Expect(err).ToNot(HaveOccurred()) + + timeToReadyAvgSingle, err := calculateTimeToReadyAverage(logs) + Expect(err).ToNot(HaveOccurred()) + + results := reconfigTestResults{ + TestDescription: testDescription, + EventsBuckets: eventsBuckets, + ReloadBuckets: reloadBuckets, + NumResources: resourceCount, + TimeToReadyTotal: timeToReadyTotal, + TimeToReadyAvgSingle: timeToReadyAvgSingle, + NGINXReloads: int(reloadCount), + NGINXReloadAvgTime: int(reloadAvgTime), + EventsCount: int(eventsCount), + EventsAvgTime: int(eventsAvgTime), + } + + err = writeReconfigResults(outFile, results) + Expect(err).ToNot(HaveOccurred()) + } + + When("resources exist before startup", func() { + testDescription := "Test 1: Resources exist before startup" + + It("gathers metrics after creating 30 resources", func() { + resourceCount := 30 + timeToReadyStartingLogSubstring := "Starting NGINX Gateway Fabric" + + createResourcesGWLast(resourceCount) + Expect(checkResourceCreation(resourceCount)).To(Succeed()) + + ngfPodName, startTime := deployNGFReturnsNGFPodNameAndStartTime() + + collectMetrics( + testDescription, + resourceCount, + timeToReadyStartingLogSubstring, + ngfPodName, + startTime, + ) + }) + + It("gathers metrics after creating 150 resources", func() { + resourceCount := 150 + timeToReadyStartingLogSubstring := "Starting NGINX Gateway Fabric" + + createResourcesGWLast(resourceCount) + Expect(checkResourceCreation(resourceCount)).To(Succeed()) + + ngfPodName, startTime := deployNGFReturnsNGFPodNameAndStartTime() + + collectMetrics( + testDescription, + resourceCount, + timeToReadyStartingLogSubstring, + ngfPodName, + startTime, + ) + }) + }) + + When("NGF and Gateway resource are deployed first", func() { + testDescription := "Test 2: Start NGF, deploy Gateway, create many resources attached to GW" + + It("gathers metrics after creating 30 resources", func() { + resourceCount := 30 + timeToReadyStartingLogSubstring := "Reconciling the resource\",\"controller\":\"httproute\"" + + ngfPodName, startTime := 
deployNGFReturnsNGFPodNameAndStartTime() + + createResourcesRoutesLast(resourceCount) + Expect(checkResourceCreation(resourceCount)).To(Succeed()) + + collectMetrics( + testDescription, + resourceCount, + timeToReadyStartingLogSubstring, + ngfPodName, + startTime, + ) + }) + + It("gathers metrics after creating 150 resources", func() { + resourceCount := 150 + timeToReadyStartingLogSubstring := "Reconciling the resource\",\"controller\":\"httproute\"" + + ngfPodName, startTime := deployNGFReturnsNGFPodNameAndStartTime() + + createResourcesRoutesLast(resourceCount) + Expect(checkResourceCreation(resourceCount)).To(Succeed()) + + collectMetrics( + testDescription, + resourceCount, + timeToReadyStartingLogSubstring, + ngfPodName, + startTime, + ) + }) + }) + + When("NGF and resources are deployed first", func() { + testDescription := "Test 3: Start NGF, create many resources attached to a Gateway, deploy the Gateway" + + It("gathers metrics after creating 30 resources", func() { + resourceCount := 30 + timeToReadyStartingLogSubstring := "Reconciling the resource\",\"controller\":\"gateway\"" + + ngfPodName, startTime := deployNGFReturnsNGFPodNameAndStartTime() + + createResourcesGWLast(resourceCount) + Expect(checkResourceCreation(resourceCount)).To(Succeed()) + + collectMetrics( + testDescription, + resourceCount, + timeToReadyStartingLogSubstring, + ngfPodName, + startTime, + ) + }) + + It("gathers metrics after creating 150 resources", func() { + resourceCount := 150 + timeToReadyStartingLogSubstring := "Reconciling the resource\",\"controller\":\"gateway\"" + + ngfPodName, startTime := deployNGFReturnsNGFPodNameAndStartTime() + + createResourcesGWLast(resourceCount) + Expect(checkResourceCreation(resourceCount)).To(Succeed()) + + collectMetrics( + testDescription, + resourceCount, + timeToReadyStartingLogSubstring, + ngfPodName, + startTime, + ) + }) + }) + + AfterEach(func() { + Expect(cleanupResources()).Should(Succeed()) + teardown(releaseName) + }) + + AfterAll(func() { + close(promPortForwardStopCh) + Expect(framework.UninstallPrometheus(resourceManager)).Should(Succeed()) + Expect(outFile.Close()).To(Succeed()) + + // restoring NGF shared among tests in the suite + cfg := getDefaultSetupCfg() + cfg.nfr = true + setup(cfg) + }) +}) + +type reconfigTestResults struct { + TestDescription string + TimeToReadyTotal string + TimeToReadyAvgSingle string + EventsBuckets []framework.Bucket + ReloadBuckets []framework.Bucket + NumResources int + NGINXReloads int + NGINXReloadAvgTime int + EventsCount int + EventsAvgTime int +} + +const reconfigResultTemplate = ` +## {{ .TestDescription }} - NumResources {{ .NumResources }} + +### Reloads and Time to Ready + +- TimeToReadyTotal: {{ .TimeToReadyTotal }}s +- TimeToReadyAvgSingle: {{ .TimeToReadyAvgSingle }}s +- NGINX Reloads: {{ .NGINXReloads }} +- NGINX Reload Average Time: {{ .NGINXReloadAvgTime }}ms +- Reload distribution: +{{- range .ReloadBuckets }} + - {{ .Le }}ms: {{ .Val }} +{{- end }} + +### Event Batch Processing + +- Event Batch Total: {{ .EventsCount }} +- Event Batch Processing Average Time: {{ .EventsAvgTime }}ms +- Event Batch Processing distribution: +{{- range .EventsBuckets }} + - {{ .Le }}ms: {{ .Val }} +{{- end }} + +` + +func writeReconfigResults(dest io.Writer, results reconfigTestResults) error { + tmpl, err := template.New("results").Parse(reconfigResultTemplate) + if err != nil { + return err + } + + return tmpl.Execute(dest, results) +} diff --git a/tests/suite/scale_test.go b/tests/suite/scale_test.go index 
053d575d26..b7a2a6d460 100644 --- a/tests/suite/scale_test.go +++ b/tests/suite/scale_test.go @@ -3,7 +3,6 @@ package main import ( "bytes" "context" - "errors" "fmt" "io" "os" @@ -17,7 +16,6 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" promv1 "github.com/prometheus/client_golang/api/prometheus/v1" - "github.com/prometheus/common/model" core "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -109,15 +107,10 @@ var _ = Describe("Scale test", Ordered, Label("nfr", "scale"), func() { ngfPodName = podNames[0] }) - type bucket struct { - Le string - Val int - } - type scaleTestResults struct { Name string - EventsBuckets []bucket - ReloadBuckets []bucket + EventsBuckets []framework.Bucket + ReloadBuckets []framework.Bucket EventsAvgTime int EventsCount int NGFContainerRestarts int @@ -173,91 +166,6 @@ The logs are attached only if there are errors. return tmpl.Execute(dest, results) } - createResponseChecker := func(url, address string) func() error { - return func() error { - status, _, err := framework.Get(url, address, timeoutConfig.RequestTimeout) - if err != nil { - return fmt.Errorf("bad response: %w", err) - } - - if status != 200 { - return fmt.Errorf("unexpected status code: %d", status) - } - - return nil - } - } - - createMetricExistChecker := func(query string, getTime func() time.Time, modifyTime func()) func() error { - return func() error { - queryWithTimestamp := fmt.Sprintf("%s @ %d", query, getTime().Unix()) - - result, err := promInstance.Query(queryWithTimestamp) - if err != nil { - return fmt.Errorf("failed to query Prometheus: %w", err) - } - - if result.String() == "" { - modifyTime() - return errors.New("empty result") - } - - return nil - } - } - - createEndTimeFinder := func(query string, startTime time.Time, t *time.Time) func() error { - return func() error { - result, err := promInstance.QueryRange(query, promv1.Range{ - Start: startTime, - End: *t, - Step: queryRangeStep, - }) - if err != nil { - return fmt.Errorf("failed to query Prometheus: %w", err) - } - - if result.String() == "" { - *t = time.Now() - return errors.New("empty result") - } - - return nil - } - } - - getFirstValueOfVector := func(query string) float64 { - result, err := promInstance.Query(query) - Expect(err).ToNot(HaveOccurred()) - - val, err := framework.GetFirstValueOfPrometheusVector(result) - Expect(err).ToNot(HaveOccurred()) - - return val - } - - getBuckets := func(query string) []bucket { - result, err := promInstance.Query(query) - Expect(err).ToNot(HaveOccurred()) - - res, ok := result.(model.Vector) - Expect(ok).To(BeTrue()) - - buckets := make([]bucket, 0, len(res)) - - for _, sample := range res { - le := sample.Metric["le"] - val := float64(sample.Value) - bucket := bucket{ - Le: string(le), - Val: int(val), - } - buckets = append(buckets, bucket) - } - - return buckets - } - checkLogErrors := func( containerName string, substrings []string, @@ -323,7 +231,8 @@ The logs are attached only if there are errors. for _, q := range queries { Eventually( - createMetricExistChecker( + framework.CreateMetricExistChecker( + promInstance, q, getStartTime, modifyStartTime, @@ -345,10 +254,12 @@ The logs are attached only if there are errors. // the rate query may not return any data. // To ensure it returns data, we increase the startTime. 
Eventually( - createEndTimeFinder( + framework.CreateEndTimeFinder( + promInstance, fmt.Sprintf(`rate(container_cpu_usage_seconds_total{pod="%s",container="nginx-gateway"}[2m])`, ngfPodName), startTime, &endTime, + queryRangeStep, ), ).WithTimeout(metricExistTimeout).WithPolling(metricExistPolling).Should(Succeed()) @@ -363,7 +274,8 @@ The logs are attached only if there are errors. for _, q := range queries { Eventually( - createMetricExistChecker( + framework.CreateMetricExistChecker( + promInstance, q, getEndTime, noOpModifier, @@ -414,82 +326,26 @@ The logs are attached only if there are errors. Expect(os.Remove(cpuCSV)).To(Succeed()) - reloadCount := getFirstValueOfVector( - fmt.Sprintf( - `nginx_gateway_fabric_nginx_reloads_total{pod="%[1]s"}`+ - ` - `+ - `nginx_gateway_fabric_nginx_reloads_total{pod="%[1]s"} @ %d`, - ngfPodName, - startTime.Unix(), - ), - ) + reloadCount, err := framework.GetReloadCountWithStartTime(promInstance, ngfPodName, startTime) + Expect(err).ToNot(HaveOccurred()) - reloadErrsCount := getFirstValueOfVector( - fmt.Sprintf( - `nginx_gateway_fabric_nginx_reload_errors_total{pod="%[1]s"}`+ - ` - `+ - `nginx_gateway_fabric_nginx_reload_errors_total{pod="%[1]s"} @ %d`, - ngfPodName, - startTime.Unix(), - ), - ) + reloadErrsCount, err := framework.GetReloadErrsCountWithStartTime(promInstance, ngfPodName, startTime) + Expect(err).ToNot(HaveOccurred()) - reloadAvgTime := getFirstValueOfVector( - fmt.Sprintf( - `(nginx_gateway_fabric_nginx_reloads_milliseconds_sum{pod="%[1]s"}`+ - ` - `+ - `nginx_gateway_fabric_nginx_reloads_milliseconds_sum{pod="%[1]s"} @ %[2]d)`+ - ` / `+ - `(nginx_gateway_fabric_nginx_reloads_total{pod="%[1]s"}`+ - ` - `+ - `nginx_gateway_fabric_nginx_reloads_total{pod="%[1]s"} @ %[2]d)`, - ngfPodName, - startTime.Unix(), - )) - - reloadBuckets := getBuckets( - fmt.Sprintf( - `nginx_gateway_fabric_nginx_reloads_milliseconds_bucket{pod="%[1]s"}`+ - ` - `+ - `nginx_gateway_fabric_nginx_reloads_milliseconds_bucket{pod="%[1]s"} @ %d`, - ngfPodName, - startTime.Unix(), - ), - ) + reloadAvgTime, err := framework.GetReloadAvgTimeWithStartTime(promInstance, ngfPodName, startTime) + Expect(err).ToNot(HaveOccurred()) - eventsCount := getFirstValueOfVector( - fmt.Sprintf( - `nginx_gateway_fabric_event_batch_processing_milliseconds_count{pod="%[1]s"}`+ - ` - `+ - `nginx_gateway_fabric_event_batch_processing_milliseconds_count{pod="%[1]s"} @ %d`, - ngfPodName, - startTime.Unix(), - ), - ) + reloadBuckets, err := framework.GetReloadBucketsWithStartTime(promInstance, ngfPodName, startTime) + Expect(err).ToNot(HaveOccurred()) - eventsAvgTime := getFirstValueOfVector( - fmt.Sprintf( - `(nginx_gateway_fabric_event_batch_processing_milliseconds_sum{pod="%[1]s"}`+ - ` - `+ - `nginx_gateway_fabric_event_batch_processing_milliseconds_sum{pod="%[1]s"} @ %[2]d)`+ - ` / `+ - `(nginx_gateway_fabric_event_batch_processing_milliseconds_count{pod="%[1]s"}`+ - ` - `+ - `nginx_gateway_fabric_event_batch_processing_milliseconds_count{pod="%[1]s"} @ %[2]d)`, - ngfPodName, - startTime.Unix(), - ), - ) + eventsCount, err := framework.GetEventsCountWithStartTime(promInstance, ngfPodName, startTime) + Expect(err).ToNot(HaveOccurred()) - eventsBuckets := getBuckets( - fmt.Sprintf( - `nginx_gateway_fabric_event_batch_processing_milliseconds_bucket{pod="%[1]s"}`+ - ` - `+ - `nginx_gateway_fabric_event_batch_processing_milliseconds_bucket{pod="%[1]s"} @ %d`, - ngfPodName, - startTime.Unix(), - ), - ) + eventsAvgTime, err := framework.GetEventsAvgTimeWithStartTime(promInstance, ngfPodName, 
startTime) + Expect(err).ToNot(HaveOccurred()) + + eventsBuckets, err := framework.GetEventsBucketsWithStartTime(promInstance, ngfPodName, startTime) + Expect(err).ToNot(HaveOccurred()) // Check container logs for errors @@ -573,7 +429,7 @@ The logs are attached only if there are errors. startCheck := time.Now() Eventually( - createResponseChecker(url, address), + framework.CreateResponseChecker(url, address, timeoutConfig.RequestTimeout), ).WithTimeout(30 * time.Second).WithPolling(100 * time.Millisecond).Should(Succeed()) ttr := time.Since(startCheck) @@ -607,7 +463,7 @@ The logs are attached only if there are errors. } Eventually( - createResponseChecker(url, address), + framework.CreateResponseChecker(url, address, timeoutConfig.RequestTimeout), ).WithTimeout(5 * time.Second).WithPolling(100 * time.Millisecond).Should(Succeed()) Expect( @@ -620,7 +476,7 @@ The logs are attached only if there are errors. Expect(resourceManager.WaitForPodsToBeReady(ctx, namespace)).To(Succeed()) Eventually( - createResponseChecker(url, address), + framework.CreateResponseChecker(url, address, timeoutConfig.RequestTimeout), ).WithTimeout(5 * time.Second).WithPolling(100 * time.Millisecond).Should(Succeed()) } diff --git a/tests/suite/system_suite_test.go b/tests/suite/system_suite_test.go index f0f3aa6993..bb5168d551 100644 --- a/tests/suite/system_suite_test.go +++ b/tests/suite/system_suite_test.go @@ -274,6 +274,7 @@ var _ = BeforeSuite(func() { "longevity-teardown", // - running longevity teardown (deployment will already exist) "telemetry", // - running telemetry test (NGF will be deployed as part of the test) "scale", // - running scale test (this test will deploy its own version) + "reconfiguration", // - running reconfiguration test (test will deploy its own instances) } for _, s := range skipSubstrings { if strings.Contains(labelFilter, s) { @@ -317,7 +318,8 @@ func isNFR(labelFilter string) bool { strings.Contains(labelFilter, "longevity") || strings.Contains(labelFilter, "performance") || strings.Contains(labelFilter, "upgrade") || - strings.Contains(labelFilter, "scale") + strings.Contains(labelFilter, "scale") || + strings.Contains(labelFilter, "reconfiguration") } var _ = ReportAfterSuite("Print info on failure", func(report Report) {