From dd6644fa0647191fd90b130b0fa3c13fe346771c Mon Sep 17 00:00:00 2001 From: win5923 Date: Wed, 8 Jan 2025 23:50:22 +0800 Subject: [PATCH] [Grafana] add cluster variable to enable filtering of different RayClusters Signed-off-by: win5923 --- config/grafana/data_grafana_dashboard.json | 9122 ++++++++--------- config/grafana/default_grafana_dashboard.json | 114 +- .../serve_deployment_grafana_dashboard.json | 216 +- config/grafana/serve_grafana_dashboard.json | 1200 ++- config/prometheus/podMonitor.yaml | 39 + config/prometheus/serviceMonitor.yaml | 50 +- 6 files changed, 5800 insertions(+), 4941 deletions(-) diff --git a/config/grafana/data_grafana_dashboard.json b/config/grafana/data_grafana_dashboard.json index 0cedcf315be..d7cf75199dc 100644 --- a/config/grafana/data_grafana_dashboard.json +++ b/config/grafana/data_grafana_dashboard.json @@ -1,4563 +1,4563 @@ { - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "iteration": 1667344411089, - "links": [], - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Amount spilled by dataset operators. 
DataContext.enable_get_object_locations_for_metrics must be set to True to report this metric", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 0 - }, - "hiddenSeries": false, - "id": 1, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_spilled_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Spilled: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Bytes Spilled", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - 
"min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Amount allocated by dataset operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 0 - }, - "hiddenSeries": false, - "id": 2, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_allocated_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Allocated: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Bytes 
Allocated", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Amount freed by dataset operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 1 - }, - "hiddenSeries": false, - "id": 3, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": 
"sum(ray_data_freed_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Freed: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Bytes Freed", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Amount of memory store used by dataset operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 1 - }, - "hiddenSeries": false, - "id": 4, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": 
"/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_current_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Current Usage: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Object Store Memory", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Logical CPUs allocated to dataset operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 2 - }, - "hiddenSeries": false, - "id": 5, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - 
"alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_cpu_usage_cores{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "CPU Usage: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "CPUs (logical slots)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "cores", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Logical GPUs allocated to dataset operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 2 - }, - "hiddenSeries": false, - "id": 6, - "legend": { - 
"alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_gpu_usage_cores{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "GPU Usage: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "GPUs (logical slots)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "cores", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - 
"dashes": false, - "datasource": "${datasource}", - "description": "Bytes output per second by dataset operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 3 - }, - "hiddenSeries": false, - "id": 7, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_output_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}[1m])) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Output / Second: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Bytes Output / Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "Bps", - 
"label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Total rows output per second by dataset operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 3 - }, - "hiddenSeries": false, - "id": 11, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_output_rows{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}[1m])) by (dataset, operator)", - "interval": "", - "legendFormat": "Rows Output / Second: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, 
- "timeRegions": [], - "timeShift": null, - "title": "Rows Output / Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "rows/sec", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of input blocks received by operator per second.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 4 - }, - "hiddenSeries": false, - "id": 17, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - 
"exemplar": true, - "expr": "sum(rate(ray_data_num_inputs_received{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}[1m])) by (dataset, operator)", - "interval": "", - "legendFormat": "Blocks Received / Second: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Input Blocks Received by Operator / Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "blocks/sec", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of input blocks received by operator per second.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 4 - }, - "hiddenSeries": false, - "id": 18, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - 
"color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_bytes_inputs_received{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}[1m])) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Received / Second: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Input Bytes Received by Operator / Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "Bps", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of input blocks that operator's tasks have finished processing per second.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 5 - }, - "hiddenSeries": false, - "id": 19, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - 
"show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_num_task_inputs_processed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}[1m])) by (dataset, operator)", - "interval": "", - "legendFormat": "Blocks Processed / Second: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Input Blocks Processed by Tasks / Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "blocks/sec", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of input blocks 
that operator's tasks have finished processing per second.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 5 - }, - "hiddenSeries": false, - "id": 20, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_bytes_task_inputs_processed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}[1m])) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Processed / Second: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Input Bytes Processed by Tasks / Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "Bps", - "label": "", - 
"logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of input blocks passed to submitted tasks per second.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 6 - }, - "hiddenSeries": false, - "id": 21, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_bytes_inputs_of_submitted_tasks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}[1m])) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Submitted / Second: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], 
- "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Input Bytes Submitted to Tasks / Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "Bps", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of output blocks generated by tasks per second.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 6 - }, - "hiddenSeries": false, - "id": 22, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": 
false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_num_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}[1m])) by (dataset, operator)", - "interval": "", - "legendFormat": "Blocks Generated / Second: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Blocks Generated by Tasks / Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "blocks/sec", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of output blocks generated by tasks per second.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 7 - }, - "hiddenSeries": false, - "id": 23, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", 
- "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_bytes_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}[1m])) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Generated / Second: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Bytes Generated by Tasks / Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "Bps", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of rows in generated output blocks from finished tasks per second.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 7 - }, - "hiddenSeries": false, - "id": 24, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - 
"rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_rows_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}[1m])) by (dataset, operator)", - "interval": "", - "legendFormat": "Rows Generated / Second: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Rows Generated by Tasks / Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "rows/sec", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of 
output blocks taken by downstream operators per second.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 8 - }, - "hiddenSeries": false, - "id": 25, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_num_outputs_taken{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}[1m])) by (dataset, operator)", - "interval": "", - "legendFormat": "Blocks Taken / Second: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Output Blocks Taken by Downstream Operators / Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "blocks/sec", - "label": "", - 
"logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of output blocks taken by downstream operators per second.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 8 - }, - "hiddenSeries": false, - "id": 26, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_data_bytes_outputs_taken{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}[1m])) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Taken / Second: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - 
"timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Output Bytes Taken by Downstream Operators / Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "Bps", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of submitted tasks.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 9 - }, - "hiddenSeries": false, - "id": 29, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ 
- { - "exemplar": true, - "expr": "sum(ray_data_num_tasks_submitted{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Submitted Tasks: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Submitted Tasks", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "tasks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of running tasks.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 9 - }, - "hiddenSeries": false, - "id": 30, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": 
"object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_num_tasks_running{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Running Tasks: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Running Tasks", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "tasks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of tasks that already have output.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 10 - }, - "hiddenSeries": false, - "id": 31, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - 
"options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_num_tasks_have_outputs{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Tasks with output blocks: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Tasks with output blocks", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "tasks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of finished tasks.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 10 - }, - "hiddenSeries": false, - "id": 
32, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_num_tasks_finished{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Finished Tasks: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Finished Tasks", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "tasks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, 
- "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of failed tasks.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 11 - }, - "hiddenSeries": false, - "id": 33, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_num_tasks_failed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Failed Tasks: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Failed Tasks", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "tasks", - "label": "", - 
"logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Time spent generating blocks in tasks.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 11 - }, - "hiddenSeries": false, - "id": 8, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_block_generation_time{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Block Generation Time: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - 
"timeShift": null, - "title": "Block Generation Time", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "seconds", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Time spent in task submission backpressure.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 12 - }, - "hiddenSeries": false, - "id": 37, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": 
"sum(ray_data_task_submission_backpressure_time{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Backpressure Time: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Task Submission Backpressure Time", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "seconds", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of blocks in operator's internal input queue", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 12 - }, - "hiddenSeries": false, - "id": 13, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, 
- { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_obj_store_mem_internal_inqueue_blocks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Number of Blocks: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Operator Internal Inqueue Size (Blocks)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "blocks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of input blocks in the operator's internal input queue.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 13 - }, - "hiddenSeries": false, - "id": 14, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": 
false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_obj_store_mem_internal_inqueue{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Operator Internal Inqueue Size (Bytes)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of blocks in operator's internal output queue", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - 
"fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 13 - }, - "hiddenSeries": false, - "id": 15, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_obj_store_mem_internal_outqueue_blocks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Number of Blocks: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Operator Internal Outqueue Size (Blocks)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "blocks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": 
null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of output blocks in the operator's internal output queue.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 14 - }, - "hiddenSeries": false, - "id": 16, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_obj_store_mem_internal_outqueue{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Operator Internal Outqueue Size (Bytes)", - "tooltip": { - "shared": 
true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of input blocks used by pending tasks.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 14 - }, - "hiddenSeries": false, - "id": 34, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": 
"sum(ray_data_obj_store_mem_pending_task_inputs{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Size of Blocks used in Pending Tasks (Bytes)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of freed memory in object store.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 15 - }, - "hiddenSeries": false, - "id": 35, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - 
"$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_obj_store_mem_freed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Freed Memory in Object Store (Bytes)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of spilled memory in object store.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 15 - }, - "hiddenSeries": false, - "id": 36, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 
1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_obj_store_mem_spilled{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Spilled Memory in Object Store (Bytes)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Seconds spent in iterator initialization code", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - 
"y": 16 - }, - "hiddenSeries": false, - "id": 12, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_iter_initialize_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (dataset)", - "interval": "", - "legendFormat": "Seconds: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Iteration Initialization Time", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "seconds", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null 
- } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Seconds user thread is blocked by iter_batches()", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 16 - }, - "hiddenSeries": false, - "id": 9, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_iter_total_blocked_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (dataset)", - "interval": "", - "legendFormat": "Seconds: {{dataset}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Iteration Blocked Time", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - 
"$$hashKey": "object:628", - "format": "seconds", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Seconds spent in user code", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 17 - }, - "hiddenSeries": false, - "id": 10, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_iter_user_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (dataset)", - "interval": "", - "legendFormat": "Seconds: {{dataset}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - 
"timeRegions": [], - "timeShift": null, - "title": "Iteration User Time", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "seconds", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - } - ], - "refresh": false, - "schemaVersion": 27, - "style": "dark", - "tags": [ - "rayVersion:3.0.0.dev0" - ], - "templating": { - "list": [ - { - "current": { - "selected": false - }, - "description": "Filter queries of a specific Prometheus type.", - "hide": 2, - "includeAll": false, - "multi": false, - "name": "datasource", - "options": [], - "query": "prometheus", - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "type": "datasource" - }, - { - "allValue": ".+", - "current": { - "selected": false - }, - "datasource": "${datasource}", - "definition": "label_values(ray_data_allocated_bytes{}, SessionName)", - "description": "Filter queries to specific ray sessions.", - "error": null, - "hide": 0, - "includeAll": true, - "label": null, - "multi": false, - "name": "SessionName", - "options": [], - "query": { - "query": "label_values(ray_data_allocated_bytes{}, SessionName)", - "refId": "StandardVariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 2, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": ".+", - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, - "datasource": "${datasource}", - "definition": "label_values(ray_data_allocated_bytes{}, dataset)", - "description": null, - "error": null, - "hide": 0, 
- "includeAll": true, - "label": null, - "multi": true, - "name": "DatasetID", - "options": [], - "query": { - "query": "label_values(ray_data_allocated_bytes{}, dataset)", - "refId": "Prometheus-Dataset-Variable-Query" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "current": { - "selected": false - }, - "datasource": "${datasource}", - "definition": "label_values(ray_node_network_receive_speed{}, ray_io_cluster)", - "description": "Filter queries to specific ray cluster for kubernetes.", - "error": null, - "hide": 0, - "includeAll": false, - "label": null, - "multi": false, - "name": "ray_io_cluster", - "options": [], - "query": { - "query": "label_values(ray_node_network_receive_speed{}, ray_io_cluster)", - "refId": "StandardVariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 2, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "rayMeta": [ - "excludesSystemRoutes", - "supportsGlobalFilterOverride" - ], - "time": { - "from": "now-30m", - "to": "now" - }, - "timepicker": {}, - "timezone": "", - "title": "Data Dashboard", - "uid": "rayDataDashboard", - "version": 1 + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "iteration": 1667344411089, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Amount spilled by dataset operators. 
DataContext.enable_get_object_locations_for_metrics must be set to True to report this metric", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 1, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_spilled_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Spilled: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Bytes Spilled", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": 
"0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Amount allocated by dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_allocated_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Allocated: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Bytes Allocated", + "tooltip": 
{ + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Amount freed by dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": 
"sum(ray_data_freed_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Freed: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Bytes Freed", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Amount of memory store used by dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": 
"/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_current_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Current Usage: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Object Store Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Logical CPUs allocated to dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 2 + }, + "hiddenSeries": false, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": 
true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_cpu_usage_cores{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "CPU Usage: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPUs (logical slots)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "cores", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Logical GPUs allocated to dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 2 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": 
false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_gpu_usage_cores{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "GPU Usage: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPUs (logical slots)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "cores", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": 
"${datasource}", + "description": "Bytes output per second by dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 3 + }, + "hiddenSeries": false, + "id": 7, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_data_output_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Output / Second: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Bytes Output / Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, 
+ "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Total rows output per second by dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 3 + }, + "hiddenSeries": false, + "id": 11, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_data_output_rows{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, operator)", + "interval": "", + "legendFormat": "Rows Output / Second: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + 
"title": "Rows Output / Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "rows/sec", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of input blocks received by operator per second.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "hiddenSeries": false, + "id": 17, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": 
"sum(rate(ray_data_num_inputs_received{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, operator)", + "interval": "", + "legendFormat": "Blocks Received / Second: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Input Blocks Received by Operator / Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "blocks/sec", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of input blocks received by operator per second.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 4 + }, + "hiddenSeries": false, + "id": 18, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + 
"stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_data_bytes_inputs_received{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Received / Second: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Input Bytes Received by Operator / Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of input blocks that operator's tasks have finished processing per second.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 5 + }, + "hiddenSeries": false, + "id": 19, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + 
"sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_data_num_task_inputs_processed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, operator)", + "interval": "", + "legendFormat": "Blocks Processed / Second: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Input Blocks Processed by Tasks / Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "blocks/sec", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of input blocks that operator's tasks have finished 
processing per second.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 5 + }, + "hiddenSeries": false, + "id": 20, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_data_bytes_task_inputs_processed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Processed / Second: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Input Bytes Processed by Tasks / Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + 
"show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of input blocks passed to submitted tasks per second.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 6 + }, + "hiddenSeries": false, + "id": 21, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_data_bytes_inputs_of_submitted_tasks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Submitted / Second: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + 
"timeShift": null, + "title": "Input Bytes Submitted to Tasks / Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of output blocks generated by tasks per second.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 6 + }, + "hiddenSeries": false, + "id": 22, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, 
+ "expr": "sum(rate(ray_data_num_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, operator)", + "interval": "", + "legendFormat": "Blocks Generated / Second: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Blocks Generated by Tasks / Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "blocks/sec", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of output blocks generated by tasks per second.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 7 + }, + "hiddenSeries": false, + "id": 23, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, 
+ "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_data_bytes_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Generated / Second: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Bytes Generated by Tasks / Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of rows in generated output blocks from finished tasks per second.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 7 + }, + "hiddenSeries": false, + "id": 24, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": 
true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_data_rows_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, operator)", + "interval": "", + "legendFormat": "Rows Generated / Second: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Rows Generated by Tasks / Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "rows/sec", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of output blocks taken by downstream operators per second.", + "fieldConfig": { + 
"defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "hiddenSeries": false, + "id": 25, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_data_num_outputs_taken{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, operator)", + "interval": "", + "legendFormat": "Blocks Taken / Second: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Output Blocks Taken by Downstream Operators / Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "blocks/sec", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": 
"object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of output blocks taken by downstream operators per second.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "hiddenSeries": false, + "id": 26, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_data_bytes_outputs_taken{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[1m])) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Taken / Second: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Output Bytes Taken by 
Downstream Operators / Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of submitted tasks.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "hiddenSeries": false, + "id": 29, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": 
"sum(ray_data_num_tasks_submitted{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Submitted Tasks: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Submitted Tasks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "tasks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of running tasks.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "hiddenSeries": false, + "id": 30, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": 
"/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_num_tasks_running{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Running Tasks: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Running Tasks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "tasks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of tasks that already have output.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 10 + }, + "hiddenSeries": false, + "id": 31, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": 
true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_num_tasks_have_outputs{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Tasks with output blocks: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Tasks with output blocks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "tasks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of finished tasks.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 10 + }, + "hiddenSeries": false, + "id": 32, + "legend": { + "alignAsTable": true, 
+ "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_num_tasks_finished{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Finished Tasks: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Finished Tasks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "tasks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": "${datasource}", + "description": "Number of failed tasks.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 11 + }, + "hiddenSeries": false, + "id": 33, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_num_tasks_failed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Failed Tasks: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Failed Tasks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "tasks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": 
true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Time spent generating blocks in tasks.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 11 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_block_generation_time{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Block Generation Time: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Block Generation Time", + 
"tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Time spent in task submission backpressure.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 12 + }, + "hiddenSeries": false, + "id": 37, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": 
"sum(ray_data_task_submission_backpressure_time{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Backpressure Time: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Task Submission Backpressure Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of blocks in operator's internal input queue", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 12 + }, + "hiddenSeries": false, + "id": 13, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + 
"$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_obj_store_mem_internal_inqueue_blocks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Number of Blocks: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Operator Internal Inqueue Size (Blocks)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "blocks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of input blocks in the operator's internal input queue.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 13 + }, + "hiddenSeries": false, + "id": 14, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": 
true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_obj_store_mem_internal_inqueue{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Operator Internal Inqueue Size (Bytes)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of blocks in operator's internal output queue", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 
0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 13 + }, + "hiddenSeries": false, + "id": 15, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_obj_store_mem_internal_outqueue_blocks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Number of Blocks: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Operator Internal Outqueue Size (Blocks)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "blocks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + 
"min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of output blocks in the operator's internal output queue.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 14 + }, + "hiddenSeries": false, + "id": 16, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_obj_store_mem_internal_outqueue{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Operator Internal Outqueue Size (Bytes)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + 
}, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of input blocks used by pending tasks.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 14 + }, + "hiddenSeries": false, + "id": 34, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_obj_store_mem_pending_task_inputs{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by 
(dataset, operator)", + "interval": "", + "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Size of Blocks used in Pending Tasks (Bytes)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of freed memory in object store.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 15 + }, + "hiddenSeries": false, + "id": 35, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": 
"object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_obj_store_mem_freed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Freed Memory in Object Store (Bytes)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of spilled memory in object store.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 15 + }, + "hiddenSeries": false, + "id": 36, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + 
"pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_obj_store_mem_spilled{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Spilled Memory in Object Store (Bytes)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Seconds spent in iterator initialization code", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + 
"hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_iter_initialize_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset)", + "interval": "", + "legendFormat": "Seconds: {{dataset}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Iteration Initialization Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + 
"description": "Seconds user thread is blocked by iter_batches()", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_iter_total_blocked_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset)", + "interval": "", + "legendFormat": "Seconds: {{dataset}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Iteration Blocked Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + 
"$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Seconds spent in user code", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 17 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_iter_user_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (dataset)", + "interval": "", + "legendFormat": "Seconds: {{dataset}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Iteration User Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, 
+ "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": false, + "schemaVersion": 27, + "style": "dark", + "tags": [ + "rayVersion:3.0.0.dev0" + ], + "templating": { + "list": [ + { + "current": { + "selected": false + }, + "description": "Filter queries of a specific Prometheus type.", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".+", + "current": { + "selected": false + }, + "datasource": "${datasource}", + "definition": "label_values(ray_data_allocated_bytes{}, SessionName)", + "description": "Filter queries to specific ray sessions.", + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "SessionName", + "options": [], + "query": { + "query": "label_values(ray_data_allocated_bytes{}, SessionName)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 2, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_data_allocated_bytes{}, dataset)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "DatasetID", + "options": [], + "query": { + "query": 
"label_values(ray_data_allocated_bytes{}, dataset)", + "refId": "Prometheus-Dataset-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false + }, + "datasource": "${datasource}", + "definition": "label_values(ray_node_network_receive_speed{}, ray_io_cluster)", + "description": "Filter queries to specific Ray clusters for KubeRay. When ingesting metrics across multiple ray clusters, the ray_io_cluster label should be set per cluster. For KubeRay users, this is done automatically with Prometheus PodMonitor.", + "error": null, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "Cluster", + "options": [], + "query": { + "query": "label_values(ray_node_network_receive_speed{}, ray_io_cluster)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 2, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "rayMeta": [ + "excludesSystemRoutes", + "supportsGlobalFilterOverride" + ], + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Data Dashboard", + "uid": "rayDataDashboard", + "version": 1 } \ No newline at end of file diff --git a/config/grafana/default_grafana_dashboard.json b/config/grafana/default_grafana_dashboard.json index 3e568779224..df39587a8aa 100644 --- a/config/grafana/default_grafana_dashboard.json +++ b/config/grafana/default_grafana_dashboard.json @@ -94,7 +94,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(max_over_time(ray_tasks{IsRetry=\"0\",State=~\"FINISHED|FAILED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}[14d])) by (State) or
clamp_min(sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (State), 0)", + "expr": "sum(max_over_time(ray_tasks{IsRetry=\"0\",State=~\"FINISHED|FAILED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (State), 0)", "interval": "", "legendFormat": "{{State}}", "queryType": "randomWalk", @@ -102,7 +102,7 @@ }, { "exemplar": true, - "expr": "sum(max_over_time(ray_tasks{IsRetry!=\"0\",State=~\"FINISHED|FAILED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (State), 0)", + "expr": "sum(max_over_time(ray_tasks{IsRetry!=\"0\",State=~\"FINISHED|FAILED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (State), 0)", "interval": "", "legendFormat": "{{State}} (retry)", "queryType": "randomWalk", @@ -228,7 +228,7 @@ "targets": [ { "exemplar": true, - "expr": "clamp_min(sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (Name), 0)", + "expr": "clamp_min(sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (Name), 0)", "interval": "", "legendFormat": "{{Name}}", "queryType": "randomWalk", @@ -236,7 +236,7 @@ }, { "exemplar": true, - "expr": 
"clamp_min(sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (Name), 0)", + "expr": "clamp_min(sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (Name), 0)", "interval": "", "legendFormat": "{{Name}} (retry)", "queryType": "randomWalk", @@ -362,7 +362,7 @@ "targets": [ { "exemplar": true, - "expr": "clamp_min(sum(ray_tasks{IsRetry=\"0\",State=~\"RUNNING*\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (Name), 0)", + "expr": "clamp_min(sum(ray_tasks{IsRetry=\"0\",State=~\"RUNNING*\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (Name), 0)", "interval": "", "legendFormat": "{{Name}}", "queryType": "randomWalk", @@ -370,7 +370,7 @@ }, { "exemplar": true, - "expr": "clamp_min(sum(ray_tasks{IsRetry!=\"0\",State=~\"RUNNING*\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (Name), 0)", + "expr": "clamp_min(sum(ray_tasks{IsRetry!=\"0\",State=~\"RUNNING*\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (Name), 0)", "interval": "", "legendFormat": "{{Name}} (retry)", "queryType": "randomWalk", @@ -496,7 +496,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_actors{Source=\"gcs\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (State)", + "expr": "sum(ray_actors{Source=\"gcs\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (State)", "interval": "", "legendFormat": "{{State}}", "queryType": "randomWalk", @@ -622,7 +622,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_actors{Source=\"executor\",NodeAddress=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (State)", + "expr": 
"sum(ray_actors{Source=\"executor\",NodeAddress=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (State)", "interval": "", "legendFormat": "{{State}}", "queryType": "randomWalk", @@ -748,7 +748,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_actors{State!=\"DEAD\",Source=\"executor\",NodeAddress=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (Name)", + "expr": "sum(ray_actors{State!=\"DEAD\",Source=\"executor\",NodeAddress=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (Name)", "interval": "", "legendFormat": "{{Name}}", "queryType": "randomWalk", @@ -874,7 +874,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_resources{Name=\"CPU\",State=\"USED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (instance)", + "expr": "sum(ray_resources{Name=\"CPU\",State=\"USED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (instance)", "interval": "", "legendFormat": "CPU Usage: {{instance}}", "queryType": "randomWalk", @@ -882,7 +882,7 @@ }, { "exemplar": true, - "expr": "sum(ray_resources{Name=\"CPU\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",})", + "expr": "sum(ray_resources{Name=\"CPU\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", @@ -890,7 +890,7 @@ }, { "exemplar": true, - "expr": "((sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) or vector(0)) + 
(sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) or vector(0)))", + "expr": "((sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) or vector(0)))", "interval": "", "legendFormat": "MAX + PENDING", "queryType": "randomWalk", @@ -1016,7 +1016,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_object_store_memory{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (Location)", + "expr": "sum(ray_object_store_memory{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (Location)", "interval": "", "legendFormat": "{{Location}}", "queryType": "randomWalk", @@ -1024,7 +1024,7 @@ }, { "exemplar": true, - "expr": "sum(ray_resources{Name=\"object_store_memory\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",})", + "expr": "sum(ray_resources{Name=\"object_store_memory\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", @@ -1150,7 +1150,7 @@ "targets": [ { "exemplar": true, - "expr": 
"ray_resources{Name=\"GPU\",State=\"USED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}", + "expr": "ray_resources{Name=\"GPU\",State=\"USED\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}", "interval": "", "legendFormat": "GPU Usage: {{instance}}", "queryType": "randomWalk", @@ -1158,7 +1158,7 @@ }, { "exemplar": true, - "expr": "sum(ray_resources{Name=\"GPU\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",})", + "expr": "sum(ray_resources{Name=\"GPU\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", @@ -1166,7 +1166,7 @@ }, { "exemplar": true, - "expr": "((sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) or vector(0)))", + "expr": "((sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) or 
vector(0)) > (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) or vector(0)))", "interval": "", "legendFormat": "MAX + PENDING", "queryType": "randomWalk", @@ -1292,7 +1292,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_placement_groups{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (State)", + "expr": "sum(ray_placement_groups{SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (State)", "interval": "", "legendFormat": "{{State}}", "queryType": "randomWalk", @@ -1418,7 +1418,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_node_cpu_utilization{instance=~\"$Instance\", IsHeadNode=\"false\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",} * ray_node_cpu_count{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",} / 100", + "expr": "ray_node_cpu_utilization{instance=~\"$Instance\", IsHeadNode=\"false\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",} * ray_node_cpu_count{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",} / 100", "interval": "", "legendFormat": "CPU Usage: {{instance}}", "queryType": "randomWalk", @@ -1426,7 +1426,7 @@ }, { "exemplar": true, - "expr": "ray_node_cpu_utilization{instance=~\"$Instance\", IsHeadNode=\"true\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",} * ray_node_cpu_count{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",} / 100", + "expr": "ray_node_cpu_utilization{instance=~\"$Instance\", IsHeadNode=\"true\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",} * ray_node_cpu_count{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",} / 100", "interval": "", "legendFormat": "CPU Usage: {{instance}} (head)", "queryType": "randomWalk", @@ -1434,7 +1434,7 @@ }, { "exemplar": true, - "expr": 
"sum(ray_node_cpu_count{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",})", + "expr": "sum(ray_node_cpu_count{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", @@ -1560,7 +1560,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_node_gpus_utilization{instance=~\"$Instance\", IsHeadNode=\"false\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",} / 100", + "expr": "ray_node_gpus_utilization{instance=~\"$Instance\", IsHeadNode=\"false\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",} / 100", "interval": "", "legendFormat": "GPU Usage: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}", "queryType": "randomWalk", @@ -1568,7 +1568,7 @@ }, { "exemplar": true, - "expr": "ray_node_gpus_utilization{instance=~\"$Instance\", IsHeadNode=\"true\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",} / 100", + "expr": "ray_node_gpus_utilization{instance=~\"$Instance\", IsHeadNode=\"true\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",} / 100", "interval": "", "legendFormat": "GPU Usage: {{instance}} (head), gpu.{{GpuIndex}}, {{GpuDeviceName}}", "queryType": "randomWalk", @@ -1576,7 +1576,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_gpus_available{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",})", + "expr": "sum(ray_node_gpus_available{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", @@ -1702,7 +1702,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_node_disk_usage{instance=~\"$Instance\", IsHeadNode=\"false\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}", + "expr": "ray_node_disk_usage{instance=~\"$Instance\", IsHeadNode=\"false\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}", 
"interval": "", "legendFormat": "Disk Used: {{instance}}", "queryType": "randomWalk", @@ -1710,7 +1710,7 @@ }, { "exemplar": true, - "expr": "ray_node_disk_usage{instance=~\"$Instance\", IsHeadNode=\"true\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}", + "expr": "ray_node_disk_usage{instance=~\"$Instance\", IsHeadNode=\"true\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}", "interval": "", "legendFormat": "Disk Used: {{instance}} (head)", "queryType": "randomWalk", @@ -1718,7 +1718,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_disk_free{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) + sum(ray_node_disk_usage{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",})", + "expr": "sum(ray_node_disk_free{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) + sum(ray_node_disk_usage{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", @@ -1844,7 +1844,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_node_disk_io_write_speed{instance=~\"$Instance\", IsHeadNode=\"false\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}", + "expr": "ray_node_disk_io_write_speed{instance=~\"$Instance\", IsHeadNode=\"false\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}", "interval": "", "legendFormat": "Write: {{instance}}", "queryType": "randomWalk", @@ -1852,7 +1852,7 @@ }, { "exemplar": true, - "expr": "ray_node_disk_io_write_speed{instance=~\"$Instance\", IsHeadNode=\"true\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}", + "expr": "ray_node_disk_io_write_speed{instance=~\"$Instance\", IsHeadNode=\"true\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}", "interval": "", "legendFormat": "Write: {{instance}} (head)", "queryType": "randomWalk", @@ -1860,7 
+1860,7 @@ }, { "exemplar": true, - "expr": "ray_node_disk_io_read_speed{instance=~\"$Instance\", IsHeadNode=\"false\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}", + "expr": "ray_node_disk_io_read_speed{instance=~\"$Instance\", IsHeadNode=\"false\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}", "interval": "", "legendFormat": "Read: {{instance}}", "queryType": "randomWalk", @@ -1868,7 +1868,7 @@ }, { "exemplar": true, - "expr": "ray_node_disk_io_read_speed{instance=~\"$Instance\", IsHeadNode=\"true\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}", + "expr": "ray_node_disk_io_read_speed{instance=~\"$Instance\", IsHeadNode=\"true\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}", "interval": "", "legendFormat": "Read: {{instance}} (head)", "queryType": "randomWalk", @@ -1994,7 +1994,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_node_mem_used{instance=~\"$Instance\", IsHeadNode=\"false\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}", + "expr": "ray_node_mem_used{instance=~\"$Instance\", IsHeadNode=\"false\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}", "interval": "", "legendFormat": "Memory Used: {{instance}}", "queryType": "randomWalk", @@ -2002,7 +2002,7 @@ }, { "exemplar": true, - "expr": "ray_node_mem_used{instance=~\"$Instance\", IsHeadNode=\"true\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}", + "expr": "ray_node_mem_used{instance=~\"$Instance\", IsHeadNode=\"true\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}", "interval": "", "legendFormat": "Memory Used: {{instance}} (head)", "queryType": "randomWalk", @@ -2010,7 +2010,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_mem_total{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",})", + "expr": 
"sum(ray_node_mem_total{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", @@ -2136,7 +2136,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_node_mem_used{instance=~\"$Instance\", IsHeadNode=\"false\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}/ray_node_mem_total{instance=~\"$Instance\", IsHeadNode=\"false\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",} * 100", + "expr": "ray_node_mem_used{instance=~\"$Instance\", IsHeadNode=\"false\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}/ray_node_mem_total{instance=~\"$Instance\", IsHeadNode=\"false\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",} * 100", "interval": "", "legendFormat": "Memory Used: {{instance}}", "queryType": "randomWalk", @@ -2144,7 +2144,7 @@ }, { "exemplar": true, - "expr": "ray_node_mem_used{instance=~\"$Instance\", IsHeadNode=\"true\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}/ray_node_mem_total{instance=~\"$Instance\", IsHeadNode=\"true\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",} * 100", + "expr": "ray_node_mem_used{instance=~\"$Instance\", IsHeadNode=\"true\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}/ray_node_mem_total{instance=~\"$Instance\", IsHeadNode=\"true\", SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",} * 100", "interval": "", "legendFormat": "Memory Used: {{instance}} (head)", "queryType": "randomWalk", @@ -2270,7 +2270,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_memory_manager_worker_eviction_total{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}", + "expr": "ray_memory_manager_worker_eviction_total{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}", "interval": "", "legendFormat": "OOM Killed: {{Name}}, {{instance}}", "queryType": 
"randomWalk", @@ -2396,7 +2396,7 @@ "targets": [ { "exemplar": true, - "expr": "(sum(ray_component_rss_mb{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",} * 1e6) by (Component)) - (sum(ray_component_mem_shared_bytes{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (Component))", + "expr": "(sum(ray_component_rss_mb{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",} * 1e6) by (Component)) - (sum(ray_component_mem_shared_bytes{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (Component))", "interval": "", "legendFormat": "{{Component}}", "queryType": "randomWalk", @@ -2404,7 +2404,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_mem_shared_bytes{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",})", + "expr": "sum(ray_node_mem_shared_bytes{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})", "interval": "", "legendFormat": "shared_memory", "queryType": "randomWalk", @@ -2412,7 +2412,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_mem_total{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",})", + "expr": "sum(ray_node_mem_total{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", @@ -2538,7 +2538,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_component_cpu_percentage{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (Component) / 100", + "expr": "sum(ray_component_cpu_percentage{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (Component) / 100", "interval": "", "legendFormat": "{{Component}}", "queryType": "randomWalk", @@ -2546,7 +2546,7 @@ }, { "exemplar": true, - "expr": 
"sum(ray_node_cpu_count{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",})", + "expr": "sum(ray_node_cpu_count{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", @@ -2672,7 +2672,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_node_gram_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",} * 1024 * 1024", + "expr": "ray_node_gram_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",} * 1024 * 1024", "interval": "", "legendFormat": "Used GRAM: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}", "queryType": "randomWalk", @@ -2680,7 +2680,7 @@ }, { "exemplar": true, - "expr": "(sum(ray_node_gram_available{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) + sum(ray_node_gram_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",})) * 1024 * 1024", + "expr": "(sum(ray_node_gram_available{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) + sum(ray_node_gram_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})) * 1024 * 1024", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", @@ -2806,7 +2806,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_node_network_receive_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}", + "expr": "ray_node_network_receive_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}", "interval": "", "legendFormat": "Recv: {{instance}}", "queryType": "randomWalk", @@ -2814,7 +2814,7 @@ }, { "exemplar": true, - "expr": "ray_node_network_send_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}", + "expr": 
"ray_node_network_send_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}", "interval": "", "legendFormat": "Send: {{instance}}", "queryType": "randomWalk", @@ -2940,7 +2940,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(autoscaler_active_nodes{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (NodeType)", + "expr": "sum(autoscaler_active_nodes{SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (NodeType)", "interval": "", "legendFormat": "Active Nodes: {{NodeType}}", "queryType": "randomWalk", @@ -2948,7 +2948,7 @@ }, { "exemplar": true, - "expr": "sum(autoscaler_recently_failed_nodes{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (NodeType)", + "expr": "sum(autoscaler_recently_failed_nodes{SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (NodeType)", "interval": "", "legendFormat": "Failed Nodes: {{NodeType}}", "queryType": "randomWalk", @@ -2956,7 +2956,7 @@ }, { "exemplar": true, - "expr": "sum(autoscaler_pending_nodes{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) by (NodeType)", + "expr": "sum(autoscaler_pending_nodes{SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) by (NodeType)", "interval": "", "legendFormat": "Pending Nodes: {{NodeType}}", "queryType": "randomWalk", @@ -3082,7 +3082,7 @@ "targets": [ { "exemplar": true, - "expr": "avg(ray_node_cpu_utilization{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",})", + "expr": "avg(ray_node_cpu_utilization{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})", "interval": "", "legendFormat": "CPU (physical)", "queryType": "randomWalk", @@ -3090,7 +3090,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_gpus_utilization{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) / on() 
(sum(autoscaler_cluster_resources{resource=\"GPU\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) or vector(0))", + "expr": "sum(ray_node_gpus_utilization{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) / on() (sum(autoscaler_cluster_resources{resource=\"GPU\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) or vector(0))", "interval": "", "legendFormat": "GPU (physical)", "queryType": "randomWalk", @@ -3098,7 +3098,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_mem_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) / on() (sum(ray_node_mem_total{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",})) * 100", + "expr": "sum(ray_node_mem_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) / on() (sum(ray_node_mem_total{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})) * 100", "interval": "", "legendFormat": "Memory (RAM)", "queryType": "randomWalk", @@ -3106,7 +3106,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_gram_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) / on() (sum(ray_node_gram_available{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) + sum(ray_node_gram_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",})) * 100", + "expr": "sum(ray_node_gram_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) / on() (sum(ray_node_gram_available{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) + sum(ray_node_gram_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})) * 100", "interval": "", "legendFormat": "GRAM", "queryType": 
"randomWalk", @@ -3114,7 +3114,7 @@ }, { "exemplar": true, - "expr": "sum(ray_object_store_memory{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) / on() sum(ray_resources{Name=\"object_store_memory\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) * 100", + "expr": "sum(ray_object_store_memory{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) / on() sum(ray_resources{Name=\"object_store_memory\",instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) * 100", "interval": "", "legendFormat": "Object Store Memory", "queryType": "randomWalk", @@ -3122,7 +3122,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_disk_usage{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) / on() (sum(ray_node_disk_free{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",}) + sum(ray_node_disk_usage{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\",})) * 100", + "expr": "sum(ray_node_disk_usage{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) / on() (sum(ray_node_disk_free{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",}) + sum(ray_node_disk_usage{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$Cluster\",})) * 100", "interval": "", "legendFormat": "Disk", "queryType": "randomWalk", @@ -3267,13 +3267,13 @@ }, "datasource": "${datasource}", "definition": "label_values(ray_node_network_receive_speed{}, ray_io_cluster)", - "description": "Filter queries to specific ray cluster for kubernetes.", + "description": "Filter queries to specific Ray clusters for KubeRay. When ingesting metrics across multiple ray clusters, the ray_io_cluster label should be set per cluster. 
For KubeRay users, this is done automatically with Prometheus PodMonitor.", "error": null, "hide": 0, "includeAll": false, "label": null, "multi": false, - "name": "ray_io_cluster", + "name": "Cluster", "options": [], "query": { "query": "label_values(ray_node_network_receive_speed{}, ray_io_cluster)", diff --git a/config/grafana/serve_deployment_grafana_dashboard.json b/config/grafana/serve_deployment_grafana_dashboard.json index ea80d2a6bd6..7252ca8288c 100644 --- a/config/grafana/serve_deployment_grafana_dashboard.json +++ b/config/grafana/serve_deployment_grafana_dashboard.json @@ -94,7 +94,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_serve_deployment_replica_healthy{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment)", + "expr": "sum(ray_serve_deployment_replica_healthy{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}) by (application, deployment)", "interval": "", "legendFormat": "{{application, deployment}}", "queryType": "randomWalk", @@ -220,7 +220,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(ray_serve_deployment_request_counter{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica)", + "expr": "sum(rate(ray_serve_deployment_request_counter_total{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, deployment, replica)", "interval": "", "legendFormat": "{{replica}}", "queryType": "randomWalk", @@ -346,7 +346,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(ray_serve_deployment_error_counter{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica)", + "expr":
"sum(rate(ray_serve_deployment_error_counter_total{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, deployment, replica)", "interval": "", "legendFormat": "{{replica}}", "queryType": "randomWalk", @@ -472,7 +472,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))", + "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, deployment, replica, le))", "interval": "", "legendFormat": "{{replica}}", "queryType": "randomWalk", @@ -480,7 +480,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (le))", + "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}[5m])) by (le))", "interval": "", "legendFormat": "Total", "queryType": "randomWalk", @@ -606,7 +606,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))", + "expr": "histogram_quantile(0.9, 
sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, deployment, replica, le))", "interval": "", "legendFormat": "{{replica}}", "queryType": "randomWalk", @@ -614,7 +614,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (le))", + "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}[5m])) by (le))", "interval": "", "legendFormat": "Total", "queryType": "randomWalk", @@ -740,7 +740,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))", + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, deployment, replica, le))", "interval": "", "legendFormat": "{{replica}}", "queryType": "randomWalk", @@ -748,7 +748,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (le))", + "expr": "histogram_quantile(0.99, 
sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}[5m])) by (le))", "interval": "", "legendFormat": "Total", "queryType": "randomWalk", @@ -814,7 +814,7 @@ "gridPos": { "x": 0, "y": 2, - "w": 8, + "w": 12, "h": 8 }, "hiddenSeries": false, @@ -874,7 +874,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_serve_deployment_queued_queries{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment)", + "expr": "sum(ray_serve_deployment_queued_queries{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}) by (application, deployment)", "interval": "", "legendFormat": "{{application, deployment}}", "queryType": "randomWalk", @@ -924,132 +924,6 @@ "alignLevel": null } }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Pending requests for each replica.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "x": 8, - "y": 2, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 8, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed 
Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_serve_replica_pending_queries{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)", - "interval": "", - "legendFormat": "{{replica}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Pending requests per replica", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "requests", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, { "aliasColors": {}, "bars": false, @@ -1064,13 +938,13 @@ "fill": 0, "fillGradient": 0, "gridPos": { - "x": 16, + "x": 12, "y": 2, - "w": 8, + "w": 12, "h": 8 }, "hiddenSeries": false, - "id": 9, + "id": 8, "legend": { "alignAsTable": true, "avg": false, @@ -1126,7 +1000,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_serve_replica_processing_queries{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)", + "expr": "sum(ray_serve_replica_processing_queries{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}) by (application, deployment, replica)", "interval": "", "legendFormat": "{{replica}}", "queryType": "randomWalk", 
@@ -1196,7 +1070,7 @@ "h": 8 }, "hiddenSeries": false, - "id": 10, + "id": 9, "legend": { "alignAsTable": true, "avg": false, @@ -1252,7 +1126,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_serve_num_multiplexed_models{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)", + "expr": "sum(ray_serve_num_multiplexed_models{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}) by (application, deployment, replica)", "interval": "", "legendFormat": "{{replica}}", "queryType": "randomWalk", @@ -1322,7 +1196,7 @@ "h": 8 }, "hiddenSeries": false, - "id": 11, + "id": 10, "legend": { "alignAsTable": true, "avg": false, @@ -1378,7 +1252,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_serve_multiplexed_models_load_counter{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)", + "expr": "sum(ray_serve_multiplexed_models_load_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}) by (application, deployment, replica)", "interval": "", "legendFormat": "{{replica}}", "queryType": "randomWalk", @@ -1448,7 +1322,7 @@ "h": 8 }, "hiddenSeries": false, - "id": 12, + "id": 11, "legend": { "alignAsTable": true, "avg": false, @@ -1504,7 +1378,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_serve_multiplexed_models_unload_counter{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)", + "expr": "sum(ray_serve_multiplexed_models_unload_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}) by (application, deployment, replica)", "interval": "", "legendFormat": "{{replica}}", "queryType": "randomWalk", @@ -1574,7 +1448,7 @@ "h": 8 }, "hiddenSeries": false, - "id": 13, 
+ "id": 12, "legend": { "alignAsTable": true, "avg": false, @@ -1630,7 +1504,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_load_latency_ms_bucket{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))", + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_load_latency_ms_bucket{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, deployment, replica, le))", "interval": "", "legendFormat": "{{replica}}", "queryType": "randomWalk", @@ -1700,7 +1574,7 @@ "h": 8 }, "hiddenSeries": false, - "id": 14, + "id": 13, "legend": { "alignAsTable": true, "avg": false, @@ -1756,7 +1630,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_unload_latency_ms_bucket{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))", + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_unload_latency_ms_bucket{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, deployment, replica, le))", "interval": "", "legendFormat": "{{replica}}", "queryType": "randomWalk", @@ -1826,7 +1700,7 @@ "h": 8 }, "hiddenSeries": false, - "id": 15, + "id": 14, "legend": { "alignAsTable": true, "avg": false, @@ -1882,7 +1756,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_serve_registered_multiplexed_model_id{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}", + "expr": "ray_serve_registered_multiplexed_model_id{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}", "interval": "", "legendFormat": "{{replica}}:{{model_id}}", "queryType": "randomWalk", @@ 
-1952,7 +1826,7 @@ "h": 8 }, "hiddenSeries": false, - "id": 16, + "id": 15, "legend": { "alignAsTable": true, "avg": false, @@ -2008,7 +1882,7 @@ "targets": [ { "exemplar": true, - "expr": "(1 - sum(rate(ray_serve_multiplexed_models_load_counter{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m]))/sum(rate(ray_serve_multiplexed_get_model_requests_counter{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])))", + "expr": "(1 - sum(rate(ray_serve_multiplexed_models_load_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}[5m]))/sum(rate(ray_serve_multiplexed_get_model_requests_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",ray_io_cluster=~\"$Cluster\",}[5m])))", "interval": "", "legendFormat": "{{replica}}", "queryType": "randomWalk", @@ -2063,7 +1937,7 @@ "schemaVersion": 27, "style": "dark", "tags": [ - "rayVersion:2.9.0" + "rayVersion:3.0.0.dev0" ], "templating": { "list": [ @@ -2222,6 +2096,34 @@ "tagsQuery": "", "type": "query", "useTags": false + }, + { + "current": { + "selected": false + }, + "datasource": "${datasource}", + "definition": "label_values(ray_node_network_receive_speed{}, ray_io_cluster)", + "description": "Filter queries to specific Ray clusters for KubeRay. When ingesting metrics across multiple ray clusters, the ray_io_cluster label should be set per cluster. 
For KubeRay users, this is done automatically with Prometheus PodMonitor.", + "error": null, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "Cluster", + "options": [], + "query": { + "query": "label_values(ray_node_network_receive_speed{}, ray_io_cluster)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 2, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false } ] }, @@ -2238,4 +2140,4 @@ "title": "Serve Deployment Dashboard", "uid": "rayServeDeploymentDashboard", "version": 1 -} +} \ No newline at end of file diff --git a/config/grafana/serve_grafana_dashboard.json b/config/grafana/serve_grafana_dashboard.json index dec53437cf9..59b9bede3d4 100644 --- a/config/grafana/serve_grafana_dashboard.json +++ b/config/grafana/serve_grafana_dashboard.json @@ -34,7 +34,7 @@ "gridPos": { "x": 0, "y": 0, - "w": 8, + "w": 12, "h": 8 }, "hiddenSeries": false, @@ -94,7 +94,7 @@ "targets": [ { "exemplar": true, - "expr": "avg(ray_node_cpu_utilization{})", + "expr": "avg(ray_node_cpu_utilization{ray_io_cluster=~\"$Cluster\",})", "interval": "", "legendFormat": "CPU (physical)", "queryType": "randomWalk", @@ -102,7 +102,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_gpus_utilization{}) / on() (sum(autoscaler_cluster_resources{resource='GPU',}) or vector(0))", + "expr": "sum(ray_node_gpus_utilization{ray_io_cluster=~\"$Cluster\",}) / on() (sum(autoscaler_cluster_resources{resource='GPU',ray_io_cluster=~\"$Cluster\",}) or vector(0))", "interval": "", "legendFormat": "GPU (physical)", "queryType": "randomWalk", @@ -110,7 +110,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_mem_used{}) / on() (sum(ray_node_mem_total{})) * 100", + "expr": "sum(ray_node_mem_used{ray_io_cluster=~\"$Cluster\",}) / on() (sum(ray_node_mem_total{ray_io_cluster=~\"$Cluster\",})) * 100", "interval": "", "legendFormat": "Memory (RAM)", "queryType": "randomWalk", @@ -118,7
+118,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_gram_used{}) / on() (sum(ray_node_gram_available{}) + sum(ray_node_gram_used{})) * 100", + "expr": "sum(ray_node_gram_used{ray_io_cluster=~\"$Cluster\",}) / on() (sum(ray_node_gram_available{ray_io_cluster=~\"$Cluster\",}) + sum(ray_node_gram_used{ray_io_cluster=~\"$Cluster\",})) * 100", "interval": "", "legendFormat": "GRAM", "queryType": "randomWalk", @@ -126,7 +126,7 @@ }, { "exemplar": true, - "expr": "sum(ray_object_store_memory{}) / on() sum(ray_resources{Name=\"object_store_memory\",}) * 100", + "expr": "sum(ray_object_store_memory{ray_io_cluster=~\"$Cluster\",}) / on() sum(ray_resources{Name=\"object_store_memory\",ray_io_cluster=~\"$Cluster\",}) * 100", "interval": "", "legendFormat": "Object Store Memory", "queryType": "randomWalk", @@ -134,7 +134,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_disk_usage{}) / on() (sum(ray_node_disk_free{}) + sum(ray_node_disk_usage{})) * 100", + "expr": "sum(ray_node_disk_usage{ray_io_cluster=~\"$Cluster\",}) / on() (sum(ray_node_disk_free{ray_io_cluster=~\"$Cluster\",}) + sum(ray_node_disk_usage{ray_io_cluster=~\"$Cluster\",})) * 100", "interval": "", "legendFormat": "Disk", "queryType": "randomWalk", @@ -198,9 +198,9 @@ "fill": 10, "fillGradient": 0, "gridPos": { - "x": 8, + "x": 12, "y": 0, - "w": 8, + "w": 12, "h": 8 }, "hiddenSeries": false, @@ -260,7 +260,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(ray_serve_num_http_requests{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route)", + "expr": "sum(rate(ray_serve_num_http_requests_total{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, route)", "interval": "", "legendFormat": "{{application, route}}", "queryType": "randomWalk", @@ -268,7 +268,7 @@ }, { "exemplar": true, - "expr": 
"sum(rate(ray_serve_num_grpc_requests{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method)", + "expr": "sum(rate(ray_serve_num_grpc_requests_total{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, method)", "interval": "", "legendFormat": "{{application, method}}", "queryType": "randomWalk", @@ -332,9 +332,9 @@ "fill": 10, "fillGradient": 0, "gridPos": { - "x": 16, - "y": 0, - "w": 8, + "x": 0, + "y": 1, + "w": 12, "h": 8 }, "hiddenSeries": false, @@ -394,7 +394,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(ray_serve_num_http_error_requests{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route)", + "expr": "sum(rate(ray_serve_num_http_error_requests_total{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, route)", "interval": "", "legendFormat": "{{application, route}}", "queryType": "randomWalk", @@ -402,7 +402,7 @@ }, { "exemplar": true, - "expr": "sum(rate(ray_serve_num_grpc_error_requests{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method)", + "expr": "sum(rate(ray_serve_num_grpc_error_requests_total{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, method)", "interval": "", "legendFormat": "{{application, method}}", "queryType": "randomWalk", @@ -452,6 +452,140 @@ "alignLevel": null } }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Error QPS for each selected application.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 12, + "y": 1, + "w": 12, + "h": 8 + }, + "hiddenSeries": false, + 
"id": 17, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_serve_num_http_error_requests_total{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, route, error_code)", + "interval": "", + "legendFormat": "{{application, route, error_code}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(rate(ray_serve_num_grpc_error_requests_total{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, method, error_code)", + "interval": "", + "legendFormat": "{{application, method, error_code}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Error QPS per application per error code", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", 
+ "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "qps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { "aliasColors": {}, "bars": false, @@ -467,7 +601,7 @@ "fillGradient": 0, "gridPos": { "x": 0, - "y": 1, + "y": 2, "w": 8, "h": 8 }, @@ -528,7 +662,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.5, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route, le))", + "expr": "histogram_quantile(0.5, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, route, le))", "interval": "", "legendFormat": "{{application, route}}", "queryType": "randomWalk", @@ -536,7 +670,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.5, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method, le))", + "expr": "histogram_quantile(0.5, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, method, le))", "interval": "", "legendFormat": "{{application, method}}", "queryType": "randomWalk", @@ -544,7 +678,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.5, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",}[5m])) by (le))", + "expr": "histogram_quantile(0.5, sum(rate({__name__=~ 
\"ray_serve_(http|grpc)_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\",}[5m])) by (le))", "interval": "", "legendFormat": "Total", "queryType": "randomWalk", @@ -609,7 +743,7 @@ "fillGradient": 0, "gridPos": { "x": 8, - "y": 1, + "y": 2, "w": 8, "h": 8 }, @@ -670,7 +804,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.9, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route, le))", + "expr": "histogram_quantile(0.9, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, route, le))", "interval": "", "legendFormat": "{{application, route}}", "queryType": "randomWalk", @@ -678,7 +812,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.9, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method, le))", + "expr": "histogram_quantile(0.9, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, method, le))", "interval": "", "legendFormat": "{{application, method}}", "queryType": "randomWalk", @@ -686,7 +820,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.9, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket|ray_serve_grpc_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",}[5m])) by (le))", + "expr": "histogram_quantile(0.9, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket|ray_serve_grpc_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\",}[5m])) by (le))", "interval": "", "legendFormat": 
"Total", "queryType": "randomWalk", @@ -751,7 +885,7 @@ "fillGradient": 0, "gridPos": { "x": 16, - "y": 1, + "y": 2, "w": 8, "h": 8 }, @@ -812,7 +946,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route, le))", + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, route, le))", "interval": "", "legendFormat": "{{application, route}}", "queryType": "randomWalk", @@ -820,7 +954,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method, le))", + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, method, le))", "interval": "", "legendFormat": "{{application, method}}", "queryType": "randomWalk", @@ -828,7 +962,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket|ray_serve_grpc_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",}[5m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket|ray_serve_grpc_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\",}[5m])) by (le))", "interval": "", "legendFormat": "Total", "queryType": "randomWalk", @@ -893,7 +1027,7 @@ "fillGradient": 0, "gridPos": { "x": 0, - "y": 2, + "y": 3, "w": 8, "h": 8 }, @@ -954,7 +1088,7 @@ "targets": [ { 
"exemplar": true, - "expr": "sum(ray_serve_deployment_replica_healthy{}) by (application, deployment)", + "expr": "sum(ray_serve_deployment_replica_healthy{ray_io_cluster=~\"$Cluster\",}) by (application, deployment)", "interval": "", "legendFormat": "{{application, deployment}}", "queryType": "randomWalk", @@ -1019,7 +1153,7 @@ "fillGradient": 0, "gridPos": { "x": 8, - "y": 2, + "y": 3, "w": 8, "h": 8 }, @@ -1080,7 +1214,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(ray_serve_deployment_request_counter{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment)", + "expr": "sum(rate(ray_serve_deployment_request_counter_total{application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, deployment)", "interval": "", "legendFormat": "{{application, deployment}}", "queryType": "randomWalk", @@ -1145,7 +1279,7 @@ "fillGradient": 0, "gridPos": { "x": 16, - "y": 2, + "y": 3, "w": 8, "h": 8 }, @@ -1206,7 +1340,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(ray_serve_deployment_error_counter{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment)", + "expr": "sum(rate(ray_serve_deployment_error_counter_total{application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, deployment)", "interval": "", "legendFormat": "{{application, deployment}}", "queryType": "randomWalk", @@ -1271,7 +1405,7 @@ "fillGradient": 0, "gridPos": { "x": 0, - "y": 3, + "y": 4, "w": 8, "h": 8 }, @@ -1332,7 +1466,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment, le))", + "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, deployment, le))", 
"interval": "", "legendFormat": "{{application, deployment}}", "queryType": "randomWalk", @@ -1340,7 +1474,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (le))", + "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\",}[5m])) by (le))", "interval": "", "legendFormat": "Total", "queryType": "randomWalk", @@ -1405,7 +1539,7 @@ "fillGradient": 0, "gridPos": { "x": 8, - "y": 3, + "y": 4, "w": 8, "h": 8 }, @@ -1466,7 +1600,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment, le))", + "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, deployment, le))", "interval": "", "legendFormat": "{{application, deployment}}", "queryType": "randomWalk", @@ -1474,7 +1608,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (le))", + "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\",}[5m])) by (le))", "interval": "", "legendFormat": "Total", "queryType": "randomWalk", @@ -1539,7 +1673,7 @@ "fillGradient": 0, "gridPos": { "x": 16, - "y": 3, + "y": 4, "w": 8, "h": 8 }, @@ -1600,7 +1734,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment, 
le))", + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\",}[5m])) by (application, deployment, le))", "interval": "", "legendFormat": "{{application, deployment}}", "queryType": "randomWalk", @@ -1608,7 +1742,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",ray_io_cluster=~\"$Cluster\",}[5m])) by (le))", "interval": "", "legendFormat": "Total", "queryType": "randomWalk", @@ -1673,7 +1807,7 @@ "fillGradient": 0, "gridPos": { "x": 0, - "y": 4, + "y": 5, "w": 8, "h": 8 }, @@ -1734,7 +1868,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_serve_deployment_queued_queries{}) by (application, deployment)", + "expr": "sum(ray_serve_deployment_queued_queries{ray_io_cluster=~\"$Cluster\",}) by (application, deployment)", "interval": "", "legendFormat": "{{application, deployment}}", "queryType": "randomWalk", @@ -1799,7 +1933,7 @@ "fillGradient": 0, "gridPos": { "x": 8, - "y": 4, + "y": 5, "w": 8, "h": 8 }, @@ -1860,7 +1994,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(autoscaler_active_nodes{}) by (NodeType)", + "expr": "sum(autoscaler_active_nodes{ray_io_cluster=~\"$Cluster\",}) by (NodeType)", "interval": "", "legendFormat": "Active Nodes: {{NodeType}}", "queryType": "randomWalk", @@ -1868,7 +2002,7 @@ }, { "exemplar": true, - "expr": "sum(autoscaler_recently_failed_nodes{}) by (NodeType)", + "expr": "sum(autoscaler_recently_failed_nodes{ray_io_cluster=~\"$Cluster\",}) by (NodeType)", "interval": "", "legendFormat": "Failed Nodes: {{NodeType}}", "queryType": "randomWalk", @@ -1876,7 +2010,7 @@ }, { "exemplar": true, - "expr": "sum(autoscaler_pending_nodes{}) by 
(NodeType)", + "expr": "sum(autoscaler_pending_nodes{ray_io_cluster=~\"$Cluster\",}) by (NodeType)", "interval": "", "legendFormat": "Pending Nodes: {{NodeType}}", "queryType": "randomWalk", @@ -1941,7 +2075,7 @@ "fillGradient": 0, "gridPos": { "x": 16, - "y": 4, + "y": 5, "w": 8, "h": 8 }, @@ -2002,7 +2136,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_node_network_receive_speed{}) by (instance)", + "expr": "sum(ray_node_network_receive_speed{ray_io_cluster=~\"$Cluster\",}) by (instance)", "interval": "", "legendFormat": "Recv: {{instance}}", "queryType": "randomWalk", @@ -2010,7 +2144,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_network_send_speed{}) by (instance)", + "expr": "sum(ray_node_network_send_speed{ray_io_cluster=~\"$Cluster\",}) by (instance)", "interval": "", "legendFormat": "Send: {{instance}}", "queryType": "randomWalk", @@ -2059,96 +2193,852 @@ "align": false, "alignLevel": null } - } - ], - "refresh": false, - "schemaVersion": 27, - "style": "dark", - "tags": [ - "rayVersion:2.9.0" - ], - "templating": { - "list": [ - { - "current": { - "selected": false - }, - "description": "Filter queries of a specific Prometheus type.", - "hide": 2, - "includeAll": false, - "multi": false, - "name": "datasource", - "options": [], - "query": "prometheus", - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "type": "datasource" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The number of ongoing requests in the HTTP Proxy.", + "fieldConfig": { + "defaults": {}, + "overrides": [] }, - { - "allValue": ".*", - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, - "datasource": "${datasource}", - "definition": "label_values(ray_serve_deployment_replica_healthy{}, application)", - "description": null, - "error": null, - "hide": 0, - "includeAll": true, - "label": null, - "multi": true, - "name": "Application", - 
"options": [], - "query": { - "query": "label_values(ray_serve_deployment_replica_healthy{}, application)", - "refId": "Prometheus-Instance-Variable-Query" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 6, + "w": 8, + "h": 8 }, - { - "allValue": ".*", - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, - "datasource": "${datasource}", - "definition": "label_values(ray_serve_num_http_requests{}, route)", - "description": null, - "error": null, - "hide": 0, - "includeAll": true, - "label": "HTTP Route", - "multi": true, - "name": "HTTP_Route", - "options": [], - "query": { - "query": "label_values(ray_serve_num_http_requests{}, route)", - "refId": "Prometheus-Instance-Variable-Query" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 0, + "hiddenSeries": false, + "id": 20, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + 
"targets": [ + { + "exemplar": true, + "expr": "ray_serve_num_ongoing_http_requests{ray_io_cluster=~\"$Cluster\",}", + "interval": "", + "legendFormat": "Ongoing HTTP Requests", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Ongoing HTTP Requests", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "requests", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The number of ongoing requests in the gRPC Proxy.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 8, + "y": 6, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 21, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed 
Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "ray_serve_num_ongoing_grpc_requests{ray_io_cluster=~\"$Cluster\",}", + "interval": "", + "legendFormat": "Ongoing gRPC Requests", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Ongoing gRPC Requests", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "requests", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The number of request scheduling tasks in the router.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 16, + "y": 6, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 22, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + 
"renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "ray_serve_num_scheduling_tasks{ray_io_cluster=~\"$Cluster\",}", + "interval": "", + "legendFormat": "Scheduling Tasks", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Scheduling Tasks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "tasks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The number of request scheduling tasks in the router that are undergoing backoff.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 7, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 23, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + 
"sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "ray_serve_num_scheduling_tasks_in_backoff{ray_io_cluster=~\"$Cluster\",}", + "interval": "", + "legendFormat": "Scheduling Tasks in Backoff", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Scheduling Tasks in Backoff", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "tasks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The duration of the last control loop.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 8, + "y": 7, + "w": 8, + "h": 8 + }, + 
"hiddenSeries": false, + "id": 24, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "ray_serve_controller_control_loop_duration_s{ray_io_cluster=~\"$Cluster\",}", + "interval": "", + "legendFormat": "Control Loop Duration", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Controller Control Loop Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": "${datasource}", + "description": "The number of control loops performed by the controller. Increases monotonically over the controller's lifetime.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 16, + "y": 7, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 25, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "ray_serve_controller_num_control_loops{ray_io_cluster=~\"$Cluster\",}", + "interval": "", + "legendFormat": "Control Loops", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Number of Control Loops", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "loops", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": 
true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": false, + "schemaVersion": 27, + "style": "dark", + "tags": [ + "rayVersion:3.0.0.dev0" + ], + "templating": { + "list": [ + { + "current": { + "selected": false + }, + "description": "Filter queries of a specific Prometheus type.", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_serve_deployment_replica_healthy{}, application)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "Application", + "options": [], + "query": { + "query": "label_values(ray_serve_deployment_replica_healthy{}, application)", + "refId": "Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_serve_num_http_requests_total{}, route)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": "HTTP Route", + "multi": true, + "name": "HTTP_Route", + "options": [], + "query": { + "query": "label_values(ray_serve_num_http_requests_total{}, route)", + "refId": "Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, "tagValuesQuery": "", "tags": [], "tagsQuery": 
"", @@ -2189,6 +3079,34 @@ "tagsQuery": "", "type": "query", "useTags": false + }, + { + "current": { + "selected": false + }, + "datasource": "${datasource}", + "definition": "label_values(ray_node_network_receive_speed{}, ray_io_cluster)", + "description": "Filter queries to specific Ray clusters for KubeRay. When ingesting metrics across multiple ray clusters, the ray_io_cluster label should be set per cluster. For KubeRay users, this is done automaticaly with Prometheus PodMonitor.", + "error": null, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "Cluster", + "options": [], + "query": { + "query": "label_values(ray_node_network_receive_speed{}, ray_io_cluster)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 2, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false } ] }, @@ -2205,4 +3123,4 @@ "title": "Serve Dashboard", "uid": "rayServeDashboard", "version": 1 -} +} \ No newline at end of file diff --git a/config/prometheus/podMonitor.yaml b/config/prometheus/podMonitor.yaml index 29aaf353be1..5af17a3fe12 100644 --- a/config/prometheus/podMonitor.yaml +++ b/config/prometheus/podMonitor.yaml @@ -22,3 +22,42 @@ spec: relabelings: - sourceLabels: [__meta_kubernetes_pod_label_ray_io_cluster] targetLabel: ray_io_cluster +--- +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + labels: + # `release: $HELM_RELEASE`: Prometheus can only detect PodMonitor with this label. + release: prometheus + name: ray-head-monitor + namespace: prometheus-system +spec: + jobLabel: ray-head + # Only select Kubernetes Pods in the "default" namespace. + namespaceSelector: + matchNames: + - default + # Only select Kubernetes Pods with "matchLabels". + selector: + matchLabels: + ray.io/node-type: head + # A list of endpoints allowed as part of this PodMonitor. 
+ podMetricsEndpoints: + - port: metrics + relabelings: + - action: replace + sourceLabels: + - __meta_kubernetes_pod_label_ray_io_cluster + targetLabel: ray_io_cluster + - port: as-metrics # autoscaler metrics + relabelings: + - action: replace + sourceLabels: + - __meta_kubernetes_pod_label_ray_io_cluster + targetLabel: ray_io_cluster + - port: dash-metrics # dashboard metrics + relabelings: + - action: replace + sourceLabels: + - __meta_kubernetes_pod_label_ray_io_cluster + targetLabel: ray_io_cluster diff --git a/config/prometheus/serviceMonitor.yaml b/config/prometheus/serviceMonitor.yaml index cc3eb6905df..bff1e066375 100644 --- a/config/prometheus/serviceMonitor.yaml +++ b/config/prometheus/serviceMonitor.yaml @@ -1,25 +1,25 @@ -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: ray-head-monitor - namespace: prometheus-system - labels: - # `release: $HELM_RELEASE`: Prometheus can only detect ServiceMonitor with this label. - release: prometheus -spec: - jobLabel: ray-head - # Only select Kubernetes Services in the "default" namespace. - namespaceSelector: - matchNames: - - default - # Only select Kubernetes Services with "matchLabels". - selector: - matchLabels: - ray.io/node-type: head - # A list of endpoints allowed as part of this ServiceMonitor. - endpoints: - - port: metrics - - port: as-metrics # autoscaler metrics - - port: dash-metrics # dashboard metrics - targetLabels: - - ray.io/cluster +# apiVersion: monitoring.coreos.com/v1 +# kind: ServiceMonitor +# metadata: +# name: ray-head-monitor +# namespace: prometheus-system +# labels: +# # `release: $HELM_RELEASE`: Prometheus can only detect ServiceMonitor with this label. +# release: prometheus +# spec: +# jobLabel: ray-head +# # Only select Kubernetes Services in the "default" namespace. +# namespaceSelector: +# matchNames: +# - default +# # Only select Kubernetes Services with "matchLabels". 
+# selector: +# matchLabels: +# ray.io/node-type: head +# # A list of endpoints allowed as part of this ServiceMonitor. +# endpoints: +# - port: metrics +# - port: as-metrics # autoscaler metrics +# - port: dash-metrics # dashboard metrics +# targetLabels: +# - ray.io/cluster