diff --git a/config/grafana/data_grafana_dashboard.json b/config/grafana/data_grafana_dashboard.json index 6336dab5753..47d44e5b83c 100644 --- a/config/grafana/data_grafana_dashboard.json +++ b/config/grafana/data_grafana_dashboard.json @@ -1,1511 +1,1512 @@ { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "10.0.2" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph (old)", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] }, "editable": true, - "gnetId": null, + "fiscalYearStartMonth": 0, "graphTooltip": 0, - "iteration": 1667344411089, + "id": null, "links": [], + "liveNow": false, "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Amount spilled by dataset operators. 
DataContext.enable_get_object_locations_for_metrics must be set to True to report this metric", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 0 - }, - "hiddenSeries": false, - "id": 1, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_spilled_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Spilled: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Bytes Spilled", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { 
- "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Amount allocated by dataset operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 0 - }, - "hiddenSeries": false, - "id": 2, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_allocated_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Allocated: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": 
null, - "timeRegions": [], - "timeShift": null, - "title": "Bytes Allocated", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "description": "Amount spilled by dataset operators. DataContext.enable_get_object_locations_for_metrics must be set to True for this metric to be reported.", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Amount freed by dataset operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 1 - }, - "hiddenSeries": false, - "id": 3, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { -
"$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_freed_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Freed: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Bytes Freed", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "hiddenSeries": false, + "id": 1, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Amount of memory store used by dataset operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 1 - }, - "hiddenSeries": false, - "id": 4, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, 
- "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_current_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Current Usage: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Object Store Memory", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Logical CPUs allocated to dataset 
operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 2 - }, - "hiddenSeries": false, - "id": 5, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_cpu_usage_cores{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "CPU Usage: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "CPUs (logical slots)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "cores", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - 
"logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_data_spilled_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Spilled: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Bytes Spilled", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Logical GPUs allocated to dataset operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 2 - }, - "hiddenSeries": false, - "id": 6, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - 
"options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_gpu_usage_cores{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "GPU Usage: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "GPUs (logical slots)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "cores", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Total bytes outputted by dataset operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 3 - }, - 
"hiddenSeries": false, - "id": 7, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_output_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Outputted: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Bytes Outputted", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "yaxes": [ + { + "$$hashKey": "object:628", + "format": 
"bytes", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Total rows outputted by dataset operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 3 - }, - "hiddenSeries": false, - "id": 11, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_output_rows{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Rows Outputted: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], 
- "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Rows Outputted", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "rows", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "description": "Amount allocated by dataset operators.", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Time spent generating blocks.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 4 - }, - "hiddenSeries": false, - "id": 8, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", 
- "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_block_generation_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Block Generation Time: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Block Generation Time", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "seconds", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Seconds user thread is blocked by iter_batches()", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 4 - }, - "hiddenSeries": false, - "id": 9, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - 
"values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_iter_total_blocked_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset)", - "interval": "", - "legendFormat": "Seconds: {{dataset}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Iteration Blocked Time", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "seconds", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Seconds spent in user code", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - 
"fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 5 - }, - "hiddenSeries": false, - "id": 10, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_iter_user_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset)", - "interval": "", - "legendFormat": "Seconds: {{dataset}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Iteration User Time", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "seconds", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null 
- } + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_data_allocated_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Allocated: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Bytes Allocated", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Amount freed by dataset operators.", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + 
"max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_data_freed_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Freed: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Bytes Freed", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Amount of memory store used by dataset 
operators.", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_data_current_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Current Usage: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Object Store Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": 
true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Logical CPUs allocated to dataset operators.", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "hiddenSeries": false, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_data_cpu_usage_cores{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "CPU Usage: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "CPUs (logical slots)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + 
"mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "cores", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Logical GPUs allocated to dataset operators.", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_data_gpu_usage_cores{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "GPU Usage: {{dataset}}, {{operator}}", 
+ "queryType": "randomWalk", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "GPUs (logical slots)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "cores", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Total bytes outputted by dataset operators.", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "hiddenSeries": false, + "id": 7, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": 
"code", + "exemplar": true, + "expr": "sum(ray_data_output_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Outputted: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Bytes Outputted", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Total rows outputted by dataset operators.", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "hiddenSeries": false, + "id": 11, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + 
PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_data_output_rows{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Rows Outputted: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Rows Outputted", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "rows", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Time spent generating blocks.", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + 
"color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_data_block_generation_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (dataset, operator)", + "interval": "", + "legendFormat": "Block Generation Time: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Block Generation Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Seconds user thread is blocked by iter_batches()", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + 
"lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_data_iter_total_blocked_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (dataset)", + "interval": "", + "legendFormat": "Seconds: {{dataset}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Iteration Blocked Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Seconds spent in user code", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "alignAsTable": 
true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_data_iter_user_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (dataset)", + "interval": "", + "legendFormat": "Seconds: {{dataset}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Iteration User Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + } ], - "refresh": false, - "schemaVersion": 27, + "refresh": "", + "schemaVersion": 38, "style": "dark", "tags": [ - "rayVersion:2.9.0" + 
"rayVersion:2.9.0" ], "templating": { - "list": [ - { - "current": { - "selected": false - }, - "description": "Filter queries of a specific Prometheus type.", - "hide": 2, - "includeAll": false, - "multi": false, - "name": "datasource", - "options": [], - "query": "prometheus", - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "type": "datasource" - }, - { - "allValue": ".+", - "current": { - "selected": false - }, - "datasource": "${datasource}", - "definition": "label_values(ray_data_allocated_bytes{}, SessionName)", - "description": "Filter queries to specific ray sessions.", - "error": null, - "hide": 0, - "includeAll": true, - "label": null, - "multi": false, - "name": "SessionName", - "options": [], - "query": { - "query": "label_values(ray_data_allocated_bytes{}, SessionName)", - "refId": "StandardVariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 2, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": ".+", - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, - "datasource": "${datasource}", - "definition": "label_values(ray_data_allocated_bytes{}, dataset)", - "description": null, - "error": null, - "hide": 0, - "includeAll": true, - "label": null, - "multi": true, - "name": "DatasetID", - "options": [], - "query": { - "query": "label_values(ray_data_allocated_bytes{}, dataset)", - "refId": "Prometheus-Dataset-Variable-Query" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "Prometheus" + }, + "description": "Filter queries of a specific Prometheus type.", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": 
"", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".+", + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(ray_data_allocated_bytes{}, SessionName)", + "description": "Filter queries to specific ray sessions.", + "hide": 0, + "includeAll": true, + "multi": false, + "name": "SessionName", + "options": [], + "query": { + "query": "label_values(ray_data_allocated_bytes{}, SessionName)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 2, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(ray_data_allocated_bytes{}, dataset)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "DatasetID", + "options": [], + "query": { + "query": "label_values(ray_data_allocated_bytes{}, dataset)", + "refId": "Prometheus-Dataset-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(ray_node_network_receive_speed,ray_io_cluster)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "ray_io_cluster", + "options": [], + "query": { + "query": "label_values(ray_node_network_receive_speed,ray_io_cluster)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] }, - "rayMeta": [ - "excludesSystemRoutes", - "supportsGlobalFilterOverride" - ], "time": { - "from": "now-30m", - "to": "now" + "from": "now-30m", + "to": "now" }, "timepicker": {}, "timezone": "", "title": "Data Dashboard", "uid": 
"rayDataDashboard", - "version": 1 -} + "version": 2, + "weekStart": "" + } \ No newline at end of file diff --git a/config/grafana/default_grafana_dashboard.json b/config/grafana/default_grafana_dashboard.json index 96603322971..a4291a89d7a 100644 --- a/config/grafana/default_grafana_dashboard.json +++ b/config/grafana/default_grafana_dashboard.json @@ -1,2836 +1,2944 @@ { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "10.0.2" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph (old)", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] }, "editable": true, - "gnetId": null, + "fiscalYearStartMonth": 0, "graphTooltip": 0, - "iteration": 1667344411089, + "id": null, "links": [], + "liveNow": false, "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Current number of tasks in a particular state.\n\nState: the task state, as described by rpc::TaskState proto in common.proto. 
Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 0 - }, - "hiddenSeries": false, - "id": 26, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(max_over_time(ray_tasks{IsRetry=\"0\",State=~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}) by (State), 0)", - "interval": "", - "legendFormat": "{{State}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(max_over_time(ray_tasks{IsRetry!=\"0\",State=~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}) by (State), 0)", - "interval": "", - "legendFormat": "{{State}} (retry)", - "queryType": "randomWalk", - 
"refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Scheduler Task State", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "tasks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Current number of (live) tasks with a particular name. 
Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 0 - }, - "hiddenSeries": false, - "id": 35, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}) by (Name)", - "interval": "", - "legendFormat": "{{Name}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}) by (Name)", - "interval": "", - "legendFormat": "{{Name}} (retry)", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Active Tasks by Name", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": 
"time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "tasks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "description": "Current number of tasks in a particular state.\n\nState: the task state, as described by rpc::TaskState proto in common.proto. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Current number of actors in a particular state.\n\nState: the actor state, as described by rpc::ActorTableData proto in gcs.proto.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 1 - }, - "hiddenSeries": false, - "id": 33, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": 
"MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_actors{SessionName=~\"$SessionName\",}) by (State)", - "interval": "", - "legendFormat": "{{State}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Scheduler Actor State", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "actors", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "hiddenSeries": false, + "id": 26, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Current number of (live) actors with a particular name.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 1 - }, - "hiddenSeries": false, - "id": 36, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": 
true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_actors{State!=\"DEAD\",SessionName=~\"$SessionName\",}) by (Name)", - "interval": "", - "legendFormat": "{{Name}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Active Actors by Name", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "actors", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Logical CPU usage of Ray. The dotted line indicates the total number of CPUs. The logical CPU is allocated by `num_cpus` arguments from tasks and actors. 
PENDING means the number of CPUs that will be available when new nodes are up after the autoscaler scales up.\n\nNOTE: Ray's logical CPU is different from physical CPU usage. Ray's logical CPU is allocated by `num_cpus` arguments.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 2 - }, - "hiddenSeries": false, - "id": 27, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_resources{Name=\"CPU\",State=\"USED\",SessionName=~\"$SessionName\",}) by (instance)", - "interval": "", - "legendFormat": "CPU Usage: {{instance}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_resources{Name=\"CPU\",SessionName=~\"$SessionName\",})", - "interval": "", - "legendFormat": "MAX", - "queryType": "randomWalk", - "refId": "B" - }, - { - "exemplar": true, - "expr": "((sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)) + 
(sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)))", - "interval": "", - "legendFormat": "MAX + PENDING", - "queryType": "randomWalk", - "refId": "C" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Scheduler CPUs (logical slots)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "cores", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": 
"sum(max_over_time(ray_tasks{IsRetry=\"0\",State=~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (State), 0)", + "interval": "", + "legendFormat": "{{State}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(max_over_time(ray_tasks{IsRetry!=\"0\",State=~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (State), 0)", + "interval": "", + "legendFormat": "{{State}} (retry)", + "queryType": "randomWalk", + "range": true, + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Scheduler Task State", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Object store memory usage by location. The dotted line indicates the object store memory capacity.\n\nLocation: where the memory was allocated, which is MMAP_SHM or MMAP_DISK to indicate memory-mapped page, SPILLED to indicate spillage to disk, and WORKER_HEAP for objects small enough to be inlined in worker memory. 
Refer to metric_defs.cc for more information.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 2 - }, - "hiddenSeries": false, - "id": 29, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_object_store_memory{SessionName=~\"$SessionName\",}) by (Location)", - "interval": "", - "legendFormat": "{{Location}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_resources{Name=\"object_store_memory\",SessionName=~\"$SessionName\",})", - "interval": "", - "legendFormat": "MAX", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Object Store Memory", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": 
"object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Logical GPU usage of Ray. The dotted line indicates the total number of GPUs. The logical GPU is allocated by `num_gpus` arguments from tasks and actors. PENDING means the number of GPUs that will be available when new nodes are up after the autoscaler scales up.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 3 - }, - "hiddenSeries": false, - "id": 28, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": 
"ray_resources{Name=\"GPU\",State=\"USED\",SessionName=~\"$SessionName\",}", - "interval": "", - "legendFormat": "GPU Usage: {{instance}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_resources{Name=\"GPU\",SessionName=~\"$SessionName\",})", - "interval": "", - "legendFormat": "MAX", - "queryType": "randomWalk", - "refId": "B" - }, - { - "exemplar": true, - "expr": "((sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)))", - "interval": "", - "legendFormat": "MAX + PENDING", - "queryType": "randomWalk", - "refId": "C" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Scheduler GPUs (logical slots)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "GPUs", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "tasks", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": 
false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Current number of placement groups in a particular state.\n\nState: the placement group state, as described by the rpc::PlacementGroupTable proto in gcs.proto.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 3 - }, - "hiddenSeries": false, - "id": 40, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_placement_groups{SessionName=~\"$SessionName\",}) by (State)", - "interval": "", - "legendFormat": "{{State}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Scheduler Placement Groups", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": 
"graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "placement groups", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "description": "Current number of (live) tasks with a particular name. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 4 - }, - "hiddenSeries": false, - "id": 2, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - 
"steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "ray_node_cpu_utilization{instance=~\"$Instance\",SessionName=~\"$SessionName\",} * ray_node_cpu_count{instance=~\"$Instance\",SessionName=~\"$SessionName\",} / 100", - "interval": "", - "legendFormat": "CPU Usage: {{instance}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_node_cpu_count{SessionName=~\"$SessionName\",})", - "interval": "", - "legendFormat": "MAX", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Node CPU (hardware utilization)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "cores", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "hiddenSeries": false, + "id": 35, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": true }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Node's physical (hardware) GPU usage. The dotted line means the total number of hardware GPUs from the cluster. 
", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 4 - }, - "hiddenSeries": false, - "id": 8, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "ray_node_gpus_utilization{instance=~\"$Instance\",SessionName=~\"$SessionName\",} / 100", - "interval": "", - "legendFormat": "GPU Usage: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_node_gpus_available{SessionName=~\"$SessionName\",})", - "interval": "", - "legendFormat": "MAX", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Node GPU (hardware utilization)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": 
"object:628", - "format": "GPUs", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Node's physical (hardware) disk usage. The dotted line means the total amount of disk space from the cluster.\n\nNOTE: When Ray is deployed within a container, this shows the disk usage from the host machine. ", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 5 - }, - "hiddenSeries": false, - "id": 6, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": 
"ray_node_disk_usage{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", - "interval": "", - "legendFormat": "Disk Used: {{instance}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_node_disk_free{SessionName=~\"$SessionName\",}) + sum(ray_node_disk_usage{SessionName=~\"$SessionName\",})", - "interval": "", - "legendFormat": "MAX", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Node Disk", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (Name)", + "interval": "", + "legendFormat": 
"{{Name}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (Name)", + "interval": "", + "legendFormat": "{{Name}} (retry)", + "queryType": "randomWalk", + "range": true, + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Active Tasks by Name", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Disk IO per node.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 5 - }, - "hiddenSeries": false, - "id": 32, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": 
"ray_node_disk_io_write_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", - "interval": "", - "legendFormat": "Write: {{instance}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "ray_node_disk_io_read_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", - "interval": "", - "legendFormat": "Read: {{instance}}", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Node Disk IO Speed", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "Bps", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "The physical (hardware) memory usage for each node. The dotted line means the total amount of memory from the cluster. 
Node memory is a sum of object store memory (shared memory) and heap memory.\n\nNote: If Ray is deployed within a container, the total memory could be lower than the host machine because Ray may reserve some additional memory space outside the container.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 6 - }, - "hiddenSeries": false, - "id": 4, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "ray_node_mem_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", - "interval": "", - "legendFormat": "Memory Used: {{instance}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_node_mem_total{SessionName=~\"$SessionName\",})", - "interval": "", - "legendFormat": "MAX", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Node Memory (heap + object store)", - "tooltip": { - "shared": true, - 
"sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "tasks", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "The number of tasks and actors killed by the Ray Out of Memory killer due to high memory pressure. Metrics are broken down by IP and the name. 
https://docs.ray.io/en/master/ray-core/scheduling/ray-oom-prevention.html.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 6 - }, - "hiddenSeries": false, - "id": 44, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "ray_memory_manager_worker_eviction_total{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", - "interval": "", - "legendFormat": "OOM Killed: {{Name}}, {{instance}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Node Out of Memory Failures by Name", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "failures", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - 
"$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "description": "Current number of actors in a particular state.\n\nState: the actor state, as described by rpc::ActorTableData proto in gcs.proto.", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "The physical (hardware) memory usage across the cluster, broken down by component. This reports the summed RSS-SHM per Ray component, which corresponds to an approximate memory usage per proc. Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 7 - }, - "hiddenSeries": false, - "id": 34, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": 
false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "(sum(ray_component_rss_mb{SessionName=~\"$SessionName\",} * 1e6) by (Component)) - (sum(ray_component_mem_shared_bytes{SessionName=~\"$SessionName\",}) by (Component))", - "interval": "", - "legendFormat": "{{Component}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_node_mem_shared_bytes{SessionName=~\"$SessionName\",})", - "interval": "", - "legendFormat": "shared_memory", - "queryType": "randomWalk", - "refId": "B" - }, - { - "exemplar": true, - "expr": "sum(ray_node_mem_total{SessionName=~\"$SessionName\",})", - "interval": "", - "legendFormat": "MAX", - "queryType": "randomWalk", - "refId": "C" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Node Memory by Component", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "hiddenSeries": false, + "id": 33, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "The physical (hardware) CPU usage across the cluster, broken down by component. 
This reports the summed CPU usage per Ray component. Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 7 - }, - "hiddenSeries": false, - "id": 37, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_component_cpu_percentage{SessionName=~\"$SessionName\",}) by (Component) / 100", - "interval": "", - "legendFormat": "{{Component}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_node_cpu_count{SessionName=~\"$SessionName\",})", - "interval": "", - "legendFormat": "MAX", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Node CPU by Component", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - 
"type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "cores", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "The physical (hardware) GPU memory usage for each node. The dotted line means the total amount of GPU memory from the cluster.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 8 - }, - "hiddenSeries": false, - "id": 18, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - 
"exemplar": true, - "expr": "ray_node_gram_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",} * 1024 * 1024", - "interval": "", - "legendFormat": "Used GRAM: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "(sum(ray_node_gram_available{SessionName=~\"$SessionName\",}) + sum(ray_node_gram_used{SessionName=~\"$SessionName\",})) * 1024 * 1024", - "interval": "", - "legendFormat": "MAX", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Node GPU Memory (GRAM)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": 
"sum(ray_actors{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (State)", + "interval": "", + "legendFormat": "{{State}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Scheduler Actor State", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Network speed per node", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 8 - }, - "hiddenSeries": false, - "id": 20, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "ray_node_network_receive_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", - "interval": "", - "legendFormat": "Recv: {{instance}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": 
"ray_node_network_send_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", - "interval": "", - "legendFormat": "Send: {{instance}}", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Node Network", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "Bps", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "A total number of active failed, and pending nodes from the cluster. \n\nACTIVE: A node is alive and available.\n\nFAILED: A node is dead and not available. The node is considered dead when the raylet process on the node is terminated. The node will get into the failed state if it cannot be provided (e.g., there's no available node from the cloud provider) or failed to setup (e.g., setup_commands have errors). \n\nPending: A node is being started by the Ray cluster launcher. 
The node is unavailable now because it is being provisioned and initialized.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 9 - }, - "hiddenSeries": false, - "id": 24, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(autoscaler_active_nodes{SessionName=~\"$SessionName\",}) by (NodeType)", - "interval": "", - "legendFormat": "Active Nodes: {{NodeType}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(autoscaler_recently_failed_nodes{SessionName=~\"$SessionName\",}) by (NodeType)", - "interval": "", - "legendFormat": "Failed Nodes: {{NodeType}}", - "queryType": "randomWalk", - "refId": "B" - }, - { - "exemplar": true, - "expr": "sum(autoscaler_pending_nodes{SessionName=~\"$SessionName\",}) by (NodeType)", - "interval": "", - "legendFormat": "Pending Nodes: {{NodeType}}", - "queryType": "randomWalk", - "refId": "C" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": 
[], - "timeShift": null, - "title": "Node Count", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "nodes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "actors", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Aggregated utilization of all physical resources (CPU, GPU, memory, disk, or etc.) 
across the cluster.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, + "description": "Current number of (live) actors with a particular name.", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "hiddenSeries": false, + "id": 36, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 9 - }, - "hiddenSeries": false, - "id": 41, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { 
- "exemplar": true, - "expr": "avg(ray_node_cpu_utilization{SessionName=~\"$SessionName\",})", - "interval": "", - "legendFormat": "CPU (physical)", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_node_gpus_utilization{SessionName=~\"$SessionName\",}) / on() (sum(autoscaler_cluster_resources{resource='GPU',SessionName=~\"$SessionName\",}) or vector(0))", - "interval": "", - "legendFormat": "GPU (physical)", - "queryType": "randomWalk", - "refId": "B" - }, - { - "exemplar": true, - "expr": "sum(ray_node_mem_used{SessionName=~\"$SessionName\",}) / on() (sum(ray_node_mem_total{SessionName=~\"$SessionName\",})) * 100", - "interval": "", - "legendFormat": "Memory (RAM)", - "queryType": "randomWalk", - "refId": "C" - }, - { - "exemplar": true, - "expr": "sum(ray_node_gram_used{SessionName=~\"$SessionName\",}) / on() (sum(ray_node_gram_available{SessionName=~\"$SessionName\",}) + sum(ray_node_gram_used{SessionName=~\"$SessionName\",})) * 100", - "interval": "", - "legendFormat": "GRAM", - "queryType": "randomWalk", - "refId": "D" - }, - { - "exemplar": true, - "expr": "sum(ray_object_store_memory{SessionName=~\"$SessionName\",}) / on() sum(ray_resources{Name=\"object_store_memory\",SessionName=~\"$SessionName\",}) * 100", - "interval": "", - "legendFormat": "Object Store Memory", - "queryType": "randomWalk", - "refId": "E" - }, - { - "exemplar": true, - "expr": "sum(ray_node_disk_usage{SessionName=~\"$SessionName\",}) / on() (sum(ray_node_disk_free{SessionName=~\"$SessionName\",}) + sum(ray_node_disk_usage{SessionName=~\"$SessionName\",})) * 100", - "interval": "", - "legendFormat": "Disk", - "queryType": "randomWalk", - "refId": "F" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Cluster Utilization", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": 
true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "%", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_actors{State!=\"DEAD\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (Name)", + "interval": "", + "legendFormat": "{{Name}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Active Actors by Name", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "actors", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Logical CPU usage of Ray. The dotted line indicates the total number of CPUs. The logical CPU is allocated by `num_cpus` arguments from tasks and actors. 
PENDING means the number of CPUs that will be available when new nodes are up after the autoscaler scales up.\n\nNOTE: Ray's logical CPU is different from physical CPU usage. Ray's logical CPU is allocated by `num_cpus` arguments.", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "hiddenSeries": false, + "id": 27, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_resources{Name=\"CPU\",State=\"USED\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (instance)", + "interval": "", + "legendFormat": "CPU Usage: {{instance}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_resources{Name=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"})", + "interval": "", + 
"legendFormat": "MAX", + "queryType": "randomWalk", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "((sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) or vector(0)))", + "interval": "", + "legendFormat": "MAX + PENDING", + "queryType": "randomWalk", + "range": true, + "refId": "C" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Scheduler CPUs (logical slots)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "cores", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Object store memory usage by location. 
The dotted line indicates the object store memory capacity.\n\nLocation: where the memory was allocated, which is MMAP_SHM or MMAP_DISK to indicate memory-mapped page, SPILLED to indicate spillage to disk, and WORKER_HEAP for objects small enough to be inlined in worker memory. Refer to metric_defs.cc for more information.", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "hiddenSeries": false, + "id": 29, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_object_store_memory{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (Location)", + "interval": "", + "legendFormat": "{{Location}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": 
"sum(ray_resources{Name=\"object_store_memory\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"})", + "interval": "", + "legendFormat": "MAX", + "queryType": "randomWalk", + "range": true, + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Object Store Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Logical GPU usage of Ray. The dotted line indicates the total number of GPUs. The logical GPU is allocated by `num_gpus` arguments from tasks and actors. 
PENDING means the number of GPUs that will be available when new nodes are up after the autoscaler scales up.", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "hiddenSeries": false, + "id": 28, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "ray_resources{Name=\"GPU\",State=\"USED\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}", + "interval": "", + "legendFormat": "GPU Usage: {{instance}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_resources{Name=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"})", + "interval": "", + "legendFormat": "MAX", + "queryType": "randomWalk", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "((sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) or vector(0)))", + "interval": "", + "legendFormat": "MAX + PENDING", + "queryType": "randomWalk", + "range": true, + "refId": "C" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Scheduler GPUs (logical slots)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "GPUs", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Current number of placement groups in a particular state.\n\nState: the placement group state, as described by the rpc::PlacementGroupTable proto in gcs.proto.", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "hiddenSeries": false, + "id": 40, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + 
"min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_placement_groups{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (State)", + "interval": "", + "legendFormat": "{{State}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Scheduler Placement Groups", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "placement groups", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + 
"hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "ray_node_cpu_utilization{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"} * ray_node_cpu_count{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"} / 100", + "interval": "", + "legendFormat": "CPU Usage: {{instance}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_node_cpu_count{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"})", + "interval": "", + "legendFormat": "MAX", + "queryType": "randomWalk", + "range": true, + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Node CPU (hardware utilization)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": 
"individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "cores", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Node's physical (hardware) GPU usage. The dotted line means the total number of hardware GPUs from the cluster. ", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": 
"ray_node_gpus_utilization{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"} / 100", + "interval": "", + "legendFormat": "GPU Usage: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_node_gpus_available{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"})", + "interval": "", + "legendFormat": "MAX", + "queryType": "randomWalk", + "range": true, + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Node GPU (hardware utilization)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "GPUs", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Node's physical (hardware) disk usage. The dotted line means the total amount of disk space from the cluster.\n\nNOTE: When Ray is deployed within a container, this shows the disk usage from the host machine. 
", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "ray_node_disk_usage{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}", + "interval": "", + "legendFormat": "Disk Used: {{instance}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_node_disk_free{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) + sum(ray_node_disk_usage{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"})", + "interval": "", + "legendFormat": "MAX", + "queryType": "randomWalk", + "range": true, + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Node Disk", + "tooltip": { + 
"shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Disk IO per node.", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "hiddenSeries": false, + "id": 32, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "ray_node_disk_io_write_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}", + "interval": "", + 
"legendFormat": "Write: {{instance}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "ray_node_disk_io_read_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}", + "interval": "", + "legendFormat": "Read: {{instance}}", + "queryType": "randomWalk", + "range": true, + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Node Disk IO Speed", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Bps", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The physical (hardware) memory usage for each node. The dotted line means the total amount of memory from the cluster. 
Node memory is a sum of object store memory (shared memory) and heap memory.\n\nNote: If Ray is deployed within a container, the total memory could be lower than the host machine because Ray may reserve some additional memory space outside the container.", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 48 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "ray_node_mem_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}", + "interval": "", + "legendFormat": "Memory Used: {{instance}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_node_mem_total{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"})", + "interval": "", + 
"legendFormat": "MAX", + "queryType": "randomWalk", + "range": true, + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Node Memory (heap + object store)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The number of tasks and actors killed by the Ray Out of Memory killer due to high memory pressure. Metrics are broken down by IP and the name. https://docs.ray.io/en/master/ray-core/scheduling/ray-oom-prevention.html.", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 48 + }, + "hiddenSeries": false, + "id": 44, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + 
"fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "ray_memory_manager_worker_eviction_total{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}", + "interval": "", + "legendFormat": "OOM Killed: {{Name}}, {{instance}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Node Out of Memory Failures by Name", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "failures", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The physical (hardware) memory usage across the cluster, broken down by component. This reports the summed RSS-SHM per Ray component, which corresponds to an approximate memory usage per proc. 
Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process names (which contain method names) of running tasks/actors.", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 56 + }, + "hiddenSeries": false, + "id": 34, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "(sum(ray_component_rss_mb{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"} * 1e6) by (Component)) - (sum(ray_component_mem_shared_bytes{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (Component))", + "interval": "", + "legendFormat": "{{Component}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_node_mem_shared_bytes{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"})", + "interval": "", + "legendFormat": 
"shared_memory", + "queryType": "randomWalk", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_node_mem_total{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"})", + "interval": "", + "legendFormat": "MAX", + "queryType": "randomWalk", + "range": true, + "refId": "C" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Node Memory by Component", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The physical (hardware) CPU usage across the cluster, broken down by component. This reports the summed CPU usage per Ray component. 
Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process names (which contain method names) of running tasks/actors.", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 56 + }, + "hiddenSeries": false, + "id": 37, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_component_cpu_percentage{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (Component) / 100", + "interval": "", + "legendFormat": "{{Component}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_node_cpu_count{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"})", + "interval": "", + "legendFormat": "MAX", + "queryType": "randomWalk", + "range": true, + "refId": "B" + } + ], + 
"thresholds": [], + "timeRegions": [], + "title": "Node CPU by Component", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "cores", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The physical (hardware) GPU memory usage for each node. The dotted line means the total amount of GPU memory from the cluster.", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 64 + }, + "hiddenSeries": false, + "id": 18, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + 
"editorMode": "code", + "exemplar": true, + "expr": "ray_node_gram_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"} * 1024 * 1024", + "interval": "", + "legendFormat": "Used GRAM: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "(sum(ray_node_gram_available{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) + sum(ray_node_gram_used{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"})) * 1024 * 1024", + "interval": "", + "legendFormat": "MAX", + "queryType": "randomWalk", + "range": true, + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Node GPU Memory (GRAM)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Network speed per node", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 64 + }, + "hiddenSeries": false, + "id": 20, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + 
"alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "ray_node_network_receive_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}", + "interval": "", + "legendFormat": "Recv: {{instance}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "ray_node_network_send_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}", + "interval": "", + "legendFormat": "Send: {{instance}}", + "queryType": "randomWalk", + "range": true, + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Node Network", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Bps", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": 
"prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The total number of active, failed, and pending nodes in the cluster.\n\nACTIVE: A node is alive and available.\n\nFAILED: A node is dead and not available. The node is considered dead when the raylet process on the node is terminated. The node will get into the failed state if it cannot be provided (e.g., there's no available node from the cloud provider) or failed to set up (e.g., setup_commands have errors).\n\nPENDING: A node is being started by the Ray cluster launcher. The node is unavailable now because it is being provisioned and initialized.", + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 72 + }, + "hiddenSeries": false, + "id": 24, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(autoscaler_active_nodes{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (NodeType)", + "interval": "", + 
"legendFormat": "Active Nodes: {{NodeType}}", + "queryType": "randomWalk", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(autoscaler_recently_failed_nodes{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (NodeType)", + "interval": "", + "legendFormat": "Failed Nodes: {{NodeType}}", + "queryType": "randomWalk", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(autoscaler_pending_nodes{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) by (NodeType)", + "interval": "", + "legendFormat": "Pending Nodes: {{NodeType}}", + "queryType": "randomWalk", + "range": true, + "refId": "C" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Node Count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "nodes", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Aggregated utilization of all physical resources (CPU, GPU, memory, disk, etc.) 
across the cluster.", + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 72 + }, + "hiddenSeries": false, + "id": 41, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.0.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "color": "#1F60C4", + "dashes": true, + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "color": "#777777", + "dashes": true, + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "avg(ray_node_cpu_utilization{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"})", + "interval": "", + "legendFormat": "CPU (physical)", + "queryType": "randomWalk", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_node_gpus_utilization{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) / on() (sum(autoscaler_cluster_resources{resource='GPU',SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) or vector(0))", + "interval": "", + "legendFormat": "GPU (physical)", + "queryType": "randomWalk", + "range": true, + "refId": "B" + }, + { + "datasource": { + 
"type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_node_mem_used{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) / on() (sum(ray_node_mem_total{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"})) * 100", + "interval": "", + "legendFormat": "Memory (RAM)", + "queryType": "randomWalk", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_node_gram_used{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) / on() (sum(ray_node_gram_available{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) + sum(ray_node_gram_used{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"})) * 100", + "interval": "", + "legendFormat": "GRAM", + "queryType": "randomWalk", + "range": true, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_object_store_memory{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) / on() sum(ray_resources{Name=\"object_store_memory\",SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) * 100", + "interval": "", + "legendFormat": "Object Store Memory", + "queryType": "randomWalk", + "range": true, + "refId": "E" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(ray_node_disk_usage{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) / on() (sum(ray_node_disk_free{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"}) + sum(ray_node_disk_usage{SessionName=~\"$SessionName\",ray_io_cluster=~\"$ray_io_cluster\"})) * 100", + "interval": "", + "legendFormat": "Disk", + "queryType": "randomWalk", + "range": true, + "refId": "F" + } + ], + 
"thresholds": [], + "timeRegions": [], + "title": "Cluster Utilization", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "%", + "label": "", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + } ], - "refresh": false, - "schemaVersion": 27, + "refresh": "", + "schemaVersion": 38, "style": "dark", "tags": [ - "rayVersion:2.9.0" + "rayVersion:2.9.0" ], "templating": { - "list": [ - { - "current": { - "selected": false - }, - "description": "Filter queries of a specific Prometheus type.", - "hide": 2, - "includeAll": false, - "multi": false, - "name": "datasource", - "options": [], - "query": "prometheus", - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "type": "datasource" - }, - { - "allValue": ".+", - "current": { - "selected": false - }, - "datasource": "${datasource}", - "definition": "label_values(ray_node_network_receive_speed{}, SessionName)", - "description": "Filter queries to specific ray sessions.", - "error": null, - "hide": 0, - "includeAll": true, - "label": null, - "multi": false, - "name": "SessionName", - "options": [], - "query": { - "query": "label_values(ray_node_network_receive_speed{}, SessionName)", - "refId": "StandardVariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 2, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": ".+", - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, - "datasource": "${datasource}", - "definition": "label_values(ray_node_network_receive_speed{SessionName=\"$SessionName\",}, instance)", - "description": null, - "error": null, - "hide": 0, - "includeAll": true, - "label": 
null, - "multi": true, - "name": "Instance", - "options": [], - "query": { - "query": "label_values(ray_node_network_receive_speed{SessionName=\"$SessionName\",}, instance)", - "refId": "Prometheus-Instance-Variable-Query" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "Prometheus" + }, + "description": "Filter queries of a specific Prometheus type.", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".+", + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(ray_node_network_receive_speed{},SessionName)", + "description": "Filter queries to specific ray sessions.", + "hide": 0, + "includeAll": true, + "multi": false, + "name": "SessionName", + "options": [], + "query": { + "query": "label_values(ray_node_network_receive_speed{},SessionName)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 2, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(ray_node_network_receive_speed{SessionName=\"$SessionName\",},instance)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "Instance", + "options": [], + "query": { + "query": "label_values(ray_node_network_receive_speed{SessionName=\"$SessionName\",},instance)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": 
"", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(ray_node_network_receive_speed{},ray_io_cluster)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "ray_io_cluster", + "options": [], + "query": { + "query": "label_values(ray_node_network_receive_speed{},ray_io_cluster)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] }, "time": { - "from": "now-30m", - "to": "now" + "from": "now-30m", + "to": "now" }, "timepicker": {}, "timezone": "", "title": "Default Dashboard", "uid": "rayDefaultDashboard", "version": 4, - "rayMeta": [ - "supportsGlobalFilterOverride" - ] -} + "weekStart": "" + } \ No newline at end of file