diff --git a/.editorconfig b/.editorconfig index 4fc952f3..427d5db6 100644 --- a/.editorconfig +++ b/.editorconfig @@ -14,7 +14,7 @@ insert_final_newline = true indent_size = 4 src_paths=nvitop -[*.{yaml,yml}] +[*.{yaml,yml,json,xml}] indent_size = 2 [*.md] diff --git a/CHANGELOG.md b/CHANGELOG.md index 3a921702..a0562e24 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Add Grafana dashboard for `nvitop-exporter` by [@XuehaiPan](https://github.com/XuehaiPan) in [#138](https://github.com/XuehaiPan/nvitop/pull/138). - Handle exceptions for function `getpass.getuser()` by [@XuehaiPan](https://github.com/XuehaiPan) in [#130](https://github.com/XuehaiPan/nvitop/pull/130). Issued by [@landgraf](https://github.com/landgraf). ### Changed diff --git a/README.md b/README.md index b469a9bd..52734a1f 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,15 @@ An interactive NVIDIA-GPU process viewer and beyond, the one-stop solution for G (TERM: GNOME Terminal / OS: Ubuntu 16.04 LTS (over SSH) / Locale: en_US.UTF-8)

+

+ + Filter + +
+ A Grafana dashboard built on top of nvitop-exporter. +

+ + ### Table of Contents - [Features](#features) diff --git a/nvitop-exporter/README.md b/nvitop-exporter/README.md index 3599f625..92a00e7d 100644 --- a/nvitop-exporter/README.md +++ b/nvitop-exporter/README.md @@ -2,10 +2,35 @@ Prometheus exporter built on top of `nvitop`. -## Installation +## Quickstart -Install from PyPI: +Start the exporter with the following command: ```bash -pip3 install --upgrade nvitop-exporter +pipx run nvitop-exporter --bind-address 0.0.0.0 --port 5050 +# or +uvx nvitop-exporter --bind-address 0.0.0.0 --port 5050 ``` + +Then you can access the metrics at `http://localhost:5050/metrics`. + +You will need to configure Prometheus to scrape the metrics from the exporter. + +```yaml +scrape_configs: + - job_name: 'nvitop-exporter' + static_configs: + - targets: ['localhost:5050'] +``` + +## Grafana Dashboard + +A Grafana dashboard is provided to visualize the metrics collected by the exporter. +The source of the dashboard is [`dashboard.json`](./dashboard.json). +The Grafana dashboard can also be imported as by ID [22589](https://grafana.com/grafana/dashboards/22589-nvitop-dashboard). + +

+ Filter +
+ The Grafana dashboard for the exporter. +

diff --git a/nvitop-exporter/dashboard.json b/nvitop-exporter/dashboard.json new file mode 100644 index 00000000..c78cf58c --- /dev/null +++ b/nvitop-exporter/dashboard.json @@ -0,0 +1,2602 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "11.3.1" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Grafana Dashboard built by `nvitop-exporter`.", + "editable": false, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": true, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [], + "title": "Process", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "align": "right", + "cellOptions": { + "type": "auto", + "wrapText": false + }, + "filterable": true, + "inspect": false, + "minWidth": 50 + }, + "fieldMinMax": true, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/^%.*$/" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "mode": "lcd", + "type": "gauge", + "valueDisplayMode": "color" + } + }, + { + "id": "max", + "value": 100 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "RSS MEMORY" + }, + "properties": [ + { + "id": "unit", + "value": "mbytes" + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "gradient", + "type": "gauge", + "valueDisplayMode": "color" + } + }, + { + "id": "max", + "value": 81920 + }, + { + "id": "custom.minWidth", + "value": 150 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "GPU MEMORY" + }, + "properties": [ + { + "id": "unit", + "value": "mbytes" + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "gradient", + "type": "gauge", + "valueDisplayMode": "color" + } + }, + { + "id": "max", + "value": 81920 + }, + { + "id": "custom.minWidth", + "value": 150 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "TIME" + }, + "properties": [ + { + "id": "unit", + "value": "s" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "COMMAND" + }, + "properties": [ + { + "id": "custom.align", + "value": "left" + } + ] + } + ] + }, + "gridPos": { + "h": 13, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "enablePagination": false, + "fields": [ + "Value #gpu memory (lastNotNull)" + ], + "reducer": [ + "sum" + ], + "show": true + }, + "frameIndex": 0, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Hostname" + } + ] + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "process_cpu_percent_Percentage{hostname=~\"$hostname\", username=~\"$username\"}", + "format": "table", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": true, + "interval": "", + "legendFormat": "__auto", + "range": false, + "refId": "cpu percent", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "process_rss_memory_MiB{hostname=~\"$hostname\", username=~\"$username\"}", + "format": "table", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "rss memory", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "process_gpu_memory_MiB{hostname=~\"$hostname\", username=~\"$username\"}", + "format": "table", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "gpu memory", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "process_gpu_sm_utilization_Percentage{hostname=~\"$hostname\", username=~\"$username\"}", + "format": "table", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "gpu sm utilization", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "process_gpu_memory_utilization_Percentage{hostname=~\"$hostname\", username=~\"$username\"}", + "format": "table", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "gpu memory bandwidth utilization", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "process_gpu_encoder_utilization_Percentage{hostname=~\"$hostname\", username=~\"$username\"}", + "format": "table", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "gpu encoder utilization", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "process_gpu_decoder_utilization_Percentage{hostname=~\"$hostname\", username=~\"$username\"}", + "format": "table", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "gpu decoder utilization", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "process_running_time_Second{hostname=~\"$hostname\", username=~\"$username\"}", + "format": "table", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": true, + "interval": "", + "legendFormat": "__auto", + "range": false, + "refId": "running time", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "process_info_info{hostname=~\"$hostname\", username=~\"$username\"}", + "format": "table", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "command", + "useBackend": false + } + ], + "title": "GPU Processes", + "transformations": [ + { + "id": "merge", + "options": {} + }, + { + "id": "groupBy", + "options": { + "fields": { + "Row": { + "aggregations": [], + "operation": "aggregate" + }, + "Value #command": { + "aggregations": [] + }, + "Value #cpu percent": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "Value #gpu decoder utilization": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "Value #gpu encoder utilization": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "Value #gpu memory": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "Value #gpu memory bandwidth utilization": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "Value #gpu sm utilization": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "Value #rss memory": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "Value #running time": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "command": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "hostname": { + "aggregations": [], + "operation": "groupby" + }, + "index": { + "aggregations": [], + "operation": "groupby" + }, + "pid": { + "aggregations": [], + "operation": "groupby" + }, + "status": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "username": { + "aggregations": [], + "operation": "groupby" + }, + "uuid": { + "aggregations": [], + "operation": "groupby" + } + } + } + }, + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "desc": true, + "field": "Value #gpu memory (lastNotNull)" + } + ] + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "uuid": true + }, + "includeByName": {}, + "indexByName": { + "Value #cpu percent (lastNotNull)": 6, + "Value #gpu decoder utilization (lastNotNull)": 12, + "Value #gpu encoder utilization (lastNotNull)": 11, + "Value #gpu memory (lastNotNull)": 8, + "Value #gpu memory bandwidth utilization (lastNotNull)": 10, + "Value #gpu sm utilization (lastNotNull)": 9, + "Value #rss memory (lastNotNull)": 7, + "Value #running time (lastNotNull)": 13, + "command (lastNotNull)": 14, + "hostname": 0, + "index": 3, + "pid": 1, + "status (lastNotNull)": 5, + "username": 2, + "uuid": 4 + }, + "renameByName": { + "Value #cpu percent (lastNotNull)": "%CPU", + "Value #gpu decoder utilization (lastNotNull)": "%DEC", + "Value #gpu encoder utilization (lastNotNull)": "%ENC", + "Value #gpu memory (lastNotNull)": "GPU MEMORY", + "Value #gpu memory bandwidth utilization (lastNotNull)": "%GMBW", + "Value #gpu sm utilization (lastNotNull)": "%SM", + "Value #rss memory (lastNotNull)": "RSS MEMORY", + "Value #running time (lastNotNull)": "TIME", + "command (lastNotNull)": "COMMAND", + "hostname": "HOSTNAME", + "index": "DEVICE", + "pid": "PID", + "status (lastNotNull)": "STATUS", + "username": "USERNAME" + } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 3, + "panels": [], + "title": "Overview", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 3, + "x": 0, + "y": 15 + }, + "id": 4, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "host_cpu_percent_Percentage{hostname=~\"$hostname\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "{{hostname}} CPU (%)", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "CPU Utilization", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 18, + "w": 11, + "x": 3, + "y": 15 + }, + "id": 5, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "gpu_utilization_Percentage{hostname=~\"$hostname\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "{{hostname}} GPU {{index}} (%)", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "GPU Utilization", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 18, + "w": 10, + "x": 14, + "y": 15 + }, + "id": 6, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "gpu_memory_percent_Percentage{hostname=~\"$hostname\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "{{hostname}} GPU {{index}} (%)", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "GPU Memory", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 3, + "x": 0, + "y": 24 + }, + "id": 7, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "host_virtual_memory_percent_Percentage{hostname=~\"$hostname\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "{{hostname}} MEM (%)", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "Host Virtual Memory", + "type": "gauge" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 8, + "panels": [], + "title": "System", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 10, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 2, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 14, + "w": 12, + "x": 0, + "y": 34 + }, + "id": 9, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": false + }, + "tooltip": { + "maxHeight": 600, + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "host_cpu_percent_Percentage{hostname=~\"$hostname\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{hostname}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "CPU Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 10, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 2, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 14, + "w": 12, + "x": 12, + "y": 34 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": false + }, + "tooltip": { + "maxHeight": 600, + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "host_load_average_1m_Percentage{hostname=~\"$hostname\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{hostname}} (1m)", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "host_load_average_5m_Percentage{hostname=~\"$hostname\"}", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{hostname}} (5m)", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "host_load_average_15m_Percentage{hostname=~\"$hostname\"}", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{hostname}} (15m)", + "range": true, + "refId": "C", + "useBackend": false + } + ], + "title": "CPU Load Average", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 1024, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 2, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "mbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 14, + "w": 12, + "x": 0, + "y": 48 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": false + }, + "tooltip": { + "maxHeight": 600, + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "host_virtual_memory_used_MiB{hostname=~\"$hostname\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{hostname}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Host Virtual Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 1024, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 2, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "mbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 14, + "w": 12, + "x": 12, + "y": 48 + }, + "id": 12, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": false + }, + "tooltip": { + "maxHeight": 600, + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "host_swap_memory_used_MiB{hostname=~\"$hostname\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{hostname}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Host Swap Memory", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 62 + }, + "id": 13, + "panels": [], + "title": "System I/O", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 1, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 2, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "MiBs" + }, + "overrides": [] + }, + "gridPos": { + "h": 14, + "w": 12, + "x": 0, + "y": 63 + }, + "id": 14, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": false + }, + "tooltip": { + "maxHeight": 600, + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum by(hostname) (rate(host_net_io_rx_data_MiB{hostname=~\"$hostname\"}[$__rate_interval]))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{hostname}} RX", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum by(hostname) (rate(host_net_io_tx_data_MiB{hostname=~\"$hostname\"}[$__rate_interval]))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{hostname}} TX", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "Host Network I/O", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 1, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 2, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "MiBs" + }, + "overrides": [] + }, + "gridPos": { + "h": 14, + "w": 12, + "x": 12, + "y": 63 + }, + "id": 15, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": false + }, + "tooltip": { + "maxHeight": 600, + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum by(hostname) (rate(host_disk_io_read_data_MiB{hostname=~\"$hostname\"}[$__rate_interval]))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{hostname}} Read", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum by(hostname) (rate(host_disk_io_write_data_MiB{hostname=~\"$hostname\"}[$__rate_interval]))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{hostname}} Write", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "Host Disk I/O", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 77 + }, + "id": 16, + "panels": [], + "title": "Accelerator", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 14, + "w": 12, + "x": 0, + "y": 78 + }, + "id": 17, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": false + }, + "tooltip": { + "maxHeight": 600, + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "gpu_utilization_Percentage{hostname=~\"$hostname\"}", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{hostname}} GPU {{index}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "GPU Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 1024, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "mbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 14, + "w": 12, + "x": 12, + "y": 78 + }, + "id": 18, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": false + }, + "tooltip": { + "maxHeight": 600, + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "gpu_memory_used_MiB{hostname=~\"$hostname\"}", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{hostname}} GPU {{index}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "GPU Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 1, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "MiBs" + }, + "overrides": [] + }, + "gridPos": { + "h": 14, + "w": 12, + "x": 0, + "y": 92 + }, + "id": 19, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": false + }, + "tooltip": { + "maxHeight": 600, + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "gpu_pcie_rx_throughput_MiBps{hostname=~\"$hostname\"}", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{hostname}} GPU {{index}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "GPU PCIe RX Throughput", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 1, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "MiBs" + }, + "overrides": [] + }, + "gridPos": { + "h": 14, + "w": 12, + "x": 12, + "y": 92 + }, + "id": 20, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": false + }, + "tooltip": { + "maxHeight": 600, + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "gpu_pcie_rx_throughput_MiBps{hostname=~\"$hostname\"}", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{hostname}} GPU {{index}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "GPU PCIe TX Throughput", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 1, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "MiBs" + }, + "overrides": [] + }, + "gridPos": { + "h": 14, + "w": 12, + "x": 0, + "y": 106 + }, + "id": 21, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": false + }, + "tooltip": { + "maxHeight": 600, + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "gpu_nvlink_total_rx_throughput_MiBps{hostname=~\"$hostname\"}", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{hostname}} GPU {{index}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "GPU NVLink RX Throughput", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 1, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "MiBs" + }, + "overrides": [] + }, + "gridPos": { + "h": 14, + "w": 12, + "x": 12, + "y": 106 + }, + "id": 22, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": false + }, + "tooltip": { + "maxHeight": 600, + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "gpu_nvlink_total_tx_throughput_MiBps{hostname=~\"$hostname\"}", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{hostname}} GPU {{index}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "GPU NVLink TX Throughput", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, + "gridPos": { + "h": 14, + "w": 12, + "x": 0, + "y": 120 + }, + "id": 23, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": false + }, + "tooltip": { + "maxHeight": 600, + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "gpu_power_usage_W{hostname=~\"$hostname\"}", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{hostname}} GPU {{index}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "GPU Power Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 14, + "w": 12, + "x": 12, + "y": 120 + }, + "id": 24, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": false + }, + "tooltip": { + "maxHeight": 600, + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "gpu_temperature_C{hostname=~\"$hostname\"}", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{hostname}} GPU {{index}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "GPU Temperature", + "type": "timeseries" + } + ], + "preload": true, + "refresh": "5s", + "schemaVersion": 40, + "tags": [ + "nvitop", + "nvitop-exporter", + "prometheus", + "nvidia", + "gpu", + "gpu process", + "gpu monitoring" + ], + "templating": { + "list": [ + { + "current": {}, + "definition": "label_values(hostname)", + "description": "", + "includeAll": true, + "multi": true, + "name": "hostname", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(hostname)" + }, + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "current": {}, + "definition": "label_values(username)", + "description": "", + "includeAll": true, + "multi": true, + "name": "username", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(username)" + }, + "refresh": 1, + "regex": "", + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "nvitop-dashboard", + "uid": "bdl3vqwxprhtsa", + "version": 1, + "weekStart": "" +} diff --git a/nvitop-exporter/nvitop_exporter/exporter.py b/nvitop-exporter/nvitop_exporter/exporter.py index b0861bf0..ceaeb6c5 100644 --- a/nvitop-exporter/nvitop_exporter/exporter.py +++ b/nvitop-exporter/nvitop_exporter/exporter.py @@ -405,6 +405,12 @@ def __init__( # pylint: disable=too-many-statements ) # Create gauges for process metrics + self.process_info = Info( + name='process_info', + documentation='Process information.', + labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'], + registry=self.registry, + ) self.process_running_time = Gauge( name='process_running_time', documentation='Process running time (s).', @@ -592,19 +598,40 @@ def update_device(self, device: Device) -> None: # pylint: disable=too-many-loc alive_pids.clear() with GpuProcess.failsafe(): + host_snapshots = {} for pid, process in device.processes().items(): with process.oneshot(): username = process.username() - running_time = process.running_time() alive_pids.add((pid, username)) + if (pid, username) not in host_snapshots: # noqa: SIM401,RUF100 + host_snapshot = host_snapshots[(pid, username)] = process.host_snapshot() + else: + host_snapshot = host_snapshots[(pid, username)] + self.process_info.labels( + hostname=self.hostname, + index=index, + devicename=name, + uuid=uuid, + pid=pid, + username=username, + ).info( + { + 'status': host_snapshot.status, + 'command': host_snapshot.command, + }, + ) for gauge, value in ( ( self.process_running_time, - running_time.total_seconds() if running_time else math.nan, + ( + host_snapshot.running_time.total_seconds() + if host_snapshot.running_time + else math.nan + ), ), - (self.process_cpu_percent, process.cpu_percent()), - (self.process_rss_memory, process.host_memory() / MiB), - (self.process_memory_percent, float(process.memory_percent())), + (self.process_cpu_percent, host_snapshot.cpu_percent), + (self.process_rss_memory, host_snapshot.host_memory / MiB), + (self.process_memory_percent, float(host_snapshot.memory_percent)), (self.process_gpu_memory, process.gpu_memory() / MiB), ( self.process_gpu_sm_utilization, @@ -633,7 +660,8 @@ def update_device(self, device: Device) -> None: # pylint: disable=too-many-loc ).set(value) for pid, username in previous_alive_pids.difference(alive_pids): - for gauge in ( + for collector in ( + self.process_info, self.process_running_time, self.process_cpu_percent, self.process_rss_memory, @@ -645,7 +673,7 @@ def update_device(self, device: Device) -> None: # pylint: disable=too-many-loc self.process_gpu_decoder_utilization, ): try: - gauge.remove( + collector.remove( self.hostname, index, name,