diff --git a/docker/Dockerfile b/docker/Dockerfile index 8df79c420e..7b89cb6244 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -78,7 +78,7 @@ RUN chmod +x /usr/local/bin/dockerd-entrypoint.sh \ COPY config.properties /home/model-server/config.properties RUN mkdir /home/model-server/model-store && chown -R model-server /home/model-server/model-store -EXPOSE 8080 8081 +EXPOSE 8080 8081 8082 USER model-server WORKDIR /home/model-server diff --git a/docker/Dockerfile.dev b/docker/Dockerfile.dev index 1bb04f2c1c..e009861ede 100644 --- a/docker/Dockerfile.dev +++ b/docker/Dockerfile.dev @@ -58,7 +58,7 @@ RUN if [ "$MACHINE_TYPE" = "gpu" ]; then export USE_CUDA=1; fi \ && cp docker/config.properties /home/model-server/config.properties \ && mkdir /home/model-server/model-store && chown -R model-server /home/model-server/model-store -EXPOSE 8080 8081 +EXPOSE 8080 8081 8082 USER model-server WORKDIR /home/model-server ENV TEMP=/home/model-server/tmp diff --git a/docker/config.properties b/docker/config.properties index 669292b829..2fb279fedb 100644 --- a/docker/config.properties +++ b/docker/config.properties @@ -1,5 +1,6 @@ inference_address=http://0.0.0.0:8080 management_address=http://0.0.0.0:8081 +metrics_address=http://0.0.0.0:8082 number_of_netty_threads=32 job_queue_size=1000 model_store=/home/model-server/model-store \ No newline at end of file diff --git a/kubernetes/README.md b/kubernetes/README.md index fd3a0bc54f..967e63c92a 100644 --- a/kubernetes/README.md +++ b/kubernetes/README.md @@ -482,6 +482,7 @@ ```yaml inference_address=http://0.0.0.0:8080 management_address=http://0.0.0.0:8081 + metrics_address=http://0.0.0.0:8082 NUM_WORKERS=1 number_of_gpu=1 number_of_netty_threads=32 @@ -539,8 +540,9 @@ | Parameter | Description | Default | | ------------------ | ------------------------ | ------------------------------- | | `image` | Torchserve Serving image | `pytorch/torchserve:latest-gpu` | - | `management-port` | TS Inference port | `8080` | - | 
`inference-port` | TS Management port | `8081` | + | `inference_port` | TS Inference port | `8080` | + | `management_port` | TS Management port | `8081` | + | `metrics_port` | TS Metrics port | `8082` | | `replicas` | K8S deployment replicas | `1` | | `model-store` | EFS mountpath | `/home/model-server/shared/` | | `persistence.size` | Storage size to request | `1Gi` | @@ -568,6 +570,7 @@ torchserve: management_port: 8081 inference_port: 8080 + metrics_port: 8082 pvd_mount: /home/model-server/shared/ n_gpu: 1 n_cpu: 1 @@ -647,7 +650,7 @@ } - curl http://your_elb.us-west-2.elb.amazonaws.com.us-west-2.elb.amazonaws.com:8081/models/squeezenet1_1 + curl http://your_elb.us-west-2.elb.amazonaws.com:8081/models/squeezenet1_1 # You should see something similar to the following [ @@ -710,7 +713,56 @@ } ] ``` + ## Metrics + + ## Install prometheus + ``` + helm repo add prometheus-community https://prometheus-community.github.io/helm-charts + helm install prometheus prometheus-community/prometheus + ``` + + ## Install grafana + ``` + helm repo add grafana https://grafana.github.io/helm-charts + helm install grafana grafana/grafana + ``` + Get admin user password by running: + ``` + kubectl get secret --namespace default grafana -o jsonpath="{.data.admin-password}" | base64 --decode ; echo + ``` + + ## Add prometheus as data source in grafana + ``` + kubectl get pods + + NAME READY STATUS RESTARTS AGE + efs-provisioner-1603257008-b6b54d986-gng9g 1/1 Running 0 5h15m + grafana-cbd8775fd-6f8l5 1/1 Running 0 4h12m + model-store-pod 1/1 Running 0 4h35m + prometheus-alertmanager-776df7bfb5-hpsp4 2/2 Running 0 4h42m + prometheus-kube-state-metrics-6df5d44568-zkcm2 1/1 Running 0 4h42m + prometheus-node-exporter-fvsd6 1/1 Running 0 4h42m + prometheus-node-exporter-tmfh8 1/1 Running 0 4h42m + prometheus-pushgateway-85948997f7-4s4bj 1/1 Running 0 4h42m + prometheus-server-f8677599b-xmjbt 2/2 Running 0 4h42m + torchserve-7d468f9894-fvmpj 1/1 Running 0 4h33m + + kubectl get pod 
prometheus-server-f8677599b-xmjbt -o jsonpath='{.status.podIPs[0].ip}' + 192.168.52.141 + ``` + ![Add data source](images/grafana_datasource.png) + + + ## Expose grafana with loadbalancer + ``` + kubectl patch service grafana -p '{"spec": {"type": "LoadBalancer"}}' + + kubectl get svc grafana -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' + ``` + + ## Login to grafana + http://your.grafana.elb.us-west-2.elb.amazonaws.com:3000 ## Troubleshooting diff --git a/kubernetes/config.properties b/kubernetes/config.properties index 7dbc411a58..9f7ad861a7 100644 --- a/kubernetes/config.properties +++ b/kubernetes/config.properties @@ -1,5 +1,8 @@ inference_address=http://0.0.0.0:8080 management_address=http://0.0.0.0:8081 +metrics_address=http://0.0.0.0:8082 +enable_metrics_api=true +metrics_format=prometheus NUM_WORKERS=1 number_of_gpu=1 number_of_netty_threads=32 diff --git a/kubernetes/images/grafana_datasource.png b/kubernetes/images/grafana_datasource.png new file mode 100644 index 0000000000..271f29a00d Binary files /dev/null and b/kubernetes/images/grafana_datasource.png differ diff --git a/kubernetes/templates/torchserve.yaml b/kubernetes/templates/torchserve.yaml index 7dde346928..f87e98104f 100644 --- a/kubernetes/templates/torchserve.yaml +++ b/kubernetes/templates/torchserve.yaml @@ -5,6 +5,9 @@ metadata: name: torchserve labels: app: torchserve + annotations: + prometheus.io/scrape: 'true' + prometheus.io/port: '8082' spec: ports: - name: preds @@ -13,6 +16,9 @@ spec: - name: mdl port: {{ .Values.torchserve.management_port }} targetPort: ts-management + - name: metrics + port: {{ .Values.torchserve.metrics_port }} + targetPort: ts-metrics type: LoadBalancer selector: app: torchserve @@ -46,6 +52,8 @@ spec: containerPort: {{ .Values.torchserve.inference_port }} - name: ts-management containerPort: {{ .Values.torchserve.management_port }} + - name: ts-metrics + containerPort: {{ .Values.torchserve.metrics_port }} imagePullPolicy: IfNotPresent 
volumeMounts: - mountPath: {{ .Values.torchserve.pvd_mount }} diff --git a/kubernetes/values.yaml b/kubernetes/values.yaml index 8e427cc122..8a82b4d085 100644 --- a/kubernetes/values.yaml +++ b/kubernetes/values.yaml @@ -7,6 +7,7 @@ namespace: torchserve torchserve: management_port: 8081 inference_port: 8080 + metrics_port: 8082 pvd_mount: /home/model-server/shared/ n_gpu: 1 n_cpu: 1