From 1f76ccca37aadc44e70ca33f2c92f2d6a97e34b7 Mon Sep 17 00:00:00 2001
From: Xuehai Pan
Date: Sun, 29 Dec 2024 21:38:28 +0800
Subject: [PATCH] feat(exporter): add dashboard example (#138)
---
.editorconfig | 2 +-
CHANGELOG.md | 1 +
README.md | 9 +
nvitop-exporter/README.md | 31 +-
nvitop-exporter/dashboard.json | 2602 +++++++++++++++++++
nvitop-exporter/nvitop_exporter/exporter.py | 42 +-
6 files changed, 2676 insertions(+), 11 deletions(-)
create mode 100644 nvitop-exporter/dashboard.json
diff --git a/.editorconfig b/.editorconfig
index 4fc952f3..427d5db6 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -14,7 +14,7 @@ insert_final_newline = true
indent_size = 4
src_paths=nvitop
-[*.{yaml,yml}]
+[*.{yaml,yml,json,xml}]
indent_size = 2
[*.md]
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3a921702..a0562e24 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
+- Add Grafana dashboard for `nvitop-exporter` by [@XuehaiPan](https://github.com/XuehaiPan) in [#138](https://github.com/XuehaiPan/nvitop/pull/138).
- Handle exceptions for function `getpass.getuser()` by [@XuehaiPan](https://github.com/XuehaiPan) in [#130](https://github.com/XuehaiPan/nvitop/pull/130). Issued by [@landgraf](https://github.com/landgraf).
### Changed
diff --git a/README.md b/README.md
index b469a9bd..52734a1f 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,15 @@ An interactive NVIDIA-GPU process viewer and beyond, the one-stop solution for G
(TERM: GNOME Terminal / OS: Ubuntu 16.04 LTS (over SSH) / Locale: en_US.UTF-8
)
+
+
+
+
+
+ A Grafana dashboard built on top of nvitop-exporter
.
+
+
+
### Table of Contents
- [Features](#features)
diff --git a/nvitop-exporter/README.md b/nvitop-exporter/README.md
index 3599f625..92a00e7d 100644
--- a/nvitop-exporter/README.md
+++ b/nvitop-exporter/README.md
@@ -2,10 +2,35 @@
Prometheus exporter built on top of `nvitop`.
-## Installation
+## Quickstart
-Install from PyPI:
+Start the exporter with the following command:
```bash
-pip3 install --upgrade nvitop-exporter
+pipx run nvitop-exporter --bind-address 0.0.0.0 --port 5050
+# or
+uvx nvitop-exporter --bind-address 0.0.0.0 --port 5050
```
+
+Then you can access the metrics at `http://localhost:5050/metrics`.
+
+You will need to configure Prometheus to scrape the metrics from the exporter.
+
+```yaml
+scrape_configs:
+ - job_name: 'nvitop-exporter'
+ static_configs:
+ - targets: ['localhost:5050']
+```
+
+## Grafana Dashboard
+
+A Grafana dashboard is provided to visualize the metrics collected by the exporter.
+The source of the dashboard is [`dashboard.json`](./dashboard.json).
+The Grafana dashboard can also be imported as by ID [22589](https://grafana.com/grafana/dashboards/22589-nvitop-dashboard).
+
+
+
+
+ The Grafana dashboard for the exporter.
+
diff --git a/nvitop-exporter/dashboard.json b/nvitop-exporter/dashboard.json
new file mode 100644
index 00000000..c78cf58c
--- /dev/null
+++ b/nvitop-exporter/dashboard.json
@@ -0,0 +1,2602 @@
+{
+ "__inputs": [
+ {
+ "name": "DS_PROMETHEUS",
+ "label": "prometheus",
+ "description": "",
+ "type": "datasource",
+ "pluginId": "prometheus",
+ "pluginName": "Prometheus"
+ }
+ ],
+ "__elements": {},
+ "__requires": [
+ {
+ "type": "panel",
+ "id": "gauge",
+ "name": "Gauge",
+ "version": ""
+ },
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "11.3.1"
+ },
+ {
+ "type": "datasource",
+ "id": "prometheus",
+ "name": "Prometheus",
+ "version": "1.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "table",
+ "name": "Table",
+ "version": ""
+ },
+ {
+ "type": "panel",
+ "id": "timeseries",
+ "name": "Time series",
+ "version": ""
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": {
+ "type": "grafana",
+ "uid": "-- Grafana --"
+ },
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "Grafana Dashboard built by `nvitop-exporter`.",
+ "editable": false,
+ "fiscalYearStartMonth": 0,
+ "graphTooltip": 0,
+ "id": null,
+ "links": [],
+ "liveNow": true,
+ "panels": [
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 1,
+ "panels": [],
+ "title": "Process",
+ "type": "row"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ "align": "right",
+ "cellOptions": {
+ "type": "auto",
+ "wrapText": false
+ },
+ "filterable": true,
+ "inspect": false,
+ "minWidth": 50
+ },
+ "fieldMinMax": true,
+ "mappings": [],
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byRegexp",
+ "options": "/^%.*$/"
+ },
+ "properties": [
+ {
+ "id": "custom.cellOptions",
+ "value": {
+ "mode": "lcd",
+ "type": "gauge",
+ "valueDisplayMode": "color"
+ }
+ },
+ {
+ "id": "max",
+ "value": 100
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "RSS MEMORY"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "mbytes"
+ },
+ {
+ "id": "custom.cellOptions",
+ "value": {
+ "mode": "gradient",
+ "type": "gauge",
+ "valueDisplayMode": "color"
+ }
+ },
+ {
+ "id": "max",
+ "value": 81920
+ },
+ {
+ "id": "custom.minWidth",
+ "value": 150
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "GPU MEMORY"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "mbytes"
+ },
+ {
+ "id": "custom.cellOptions",
+ "value": {
+ "mode": "gradient",
+ "type": "gauge",
+ "valueDisplayMode": "color"
+ }
+ },
+ {
+ "id": "max",
+ "value": 81920
+ },
+ {
+ "id": "custom.minWidth",
+ "value": 150
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "TIME"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "s"
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "COMMAND"
+ },
+ "properties": [
+ {
+ "id": "custom.align",
+ "value": "left"
+ }
+ ]
+ }
+ ]
+ },
+ "gridPos": {
+ "h": 13,
+ "w": 24,
+ "x": 0,
+ "y": 1
+ },
+ "id": 2,
+ "options": {
+ "cellHeight": "sm",
+ "footer": {
+ "countRows": false,
+ "enablePagination": false,
+ "fields": [
+ "Value #gpu memory (lastNotNull)"
+ ],
+ "reducer": [
+ "sum"
+ ],
+ "show": true
+ },
+ "frameIndex": 0,
+ "showHeader": true,
+ "sortBy": [
+ {
+ "desc": true,
+ "displayName": "Hostname"
+ }
+ ]
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "exemplar": false,
+ "expr": "process_cpu_percent_Percentage{hostname=~\"$hostname\", username=~\"$username\"}",
+ "format": "table",
+ "fullMetaSearch": false,
+ "hide": false,
+ "includeNullMetadata": true,
+ "instant": true,
+ "interval": "",
+ "legendFormat": "__auto",
+ "range": false,
+ "refId": "cpu percent",
+ "useBackend": false
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "exemplar": false,
+ "expr": "process_rss_memory_MiB{hostname=~\"$hostname\", username=~\"$username\"}",
+ "format": "table",
+ "fullMetaSearch": false,
+ "hide": false,
+ "includeNullMetadata": true,
+ "instant": true,
+ "legendFormat": "__auto",
+ "range": false,
+ "refId": "rss memory",
+ "useBackend": false
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "exemplar": false,
+ "expr": "process_gpu_memory_MiB{hostname=~\"$hostname\", username=~\"$username\"}",
+ "format": "table",
+ "fullMetaSearch": false,
+ "hide": false,
+ "includeNullMetadata": true,
+ "instant": true,
+ "legendFormat": "__auto",
+ "range": false,
+ "refId": "gpu memory",
+ "useBackend": false
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "exemplar": false,
+ "expr": "process_gpu_sm_utilization_Percentage{hostname=~\"$hostname\", username=~\"$username\"}",
+ "format": "table",
+ "fullMetaSearch": false,
+ "hide": false,
+ "includeNullMetadata": true,
+ "instant": true,
+ "legendFormat": "__auto",
+ "range": false,
+ "refId": "gpu sm utilization",
+ "useBackend": false
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "exemplar": false,
+ "expr": "process_gpu_memory_utilization_Percentage{hostname=~\"$hostname\", username=~\"$username\"}",
+ "format": "table",
+ "fullMetaSearch": false,
+ "hide": false,
+ "includeNullMetadata": true,
+ "instant": true,
+ "legendFormat": "__auto",
+ "range": false,
+ "refId": "gpu memory bandwidth utilization",
+ "useBackend": false
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "exemplar": false,
+ "expr": "process_gpu_encoder_utilization_Percentage{hostname=~\"$hostname\", username=~\"$username\"}",
+ "format": "table",
+ "fullMetaSearch": false,
+ "hide": false,
+ "includeNullMetadata": true,
+ "instant": true,
+ "legendFormat": "__auto",
+ "range": false,
+ "refId": "gpu encoder utilization",
+ "useBackend": false
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "exemplar": false,
+ "expr": "process_gpu_decoder_utilization_Percentage{hostname=~\"$hostname\", username=~\"$username\"}",
+ "format": "table",
+ "fullMetaSearch": false,
+ "hide": false,
+ "includeNullMetadata": true,
+ "instant": true,
+ "legendFormat": "__auto",
+ "range": false,
+ "refId": "gpu decoder utilization",
+ "useBackend": false
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "exemplar": false,
+ "expr": "process_running_time_Second{hostname=~\"$hostname\", username=~\"$username\"}",
+ "format": "table",
+ "fullMetaSearch": false,
+ "hide": false,
+ "includeNullMetadata": true,
+ "instant": true,
+ "interval": "",
+ "legendFormat": "__auto",
+ "range": false,
+ "refId": "running time",
+ "useBackend": false
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "exemplar": false,
+ "expr": "process_info_info{hostname=~\"$hostname\", username=~\"$username\"}",
+ "format": "table",
+ "fullMetaSearch": false,
+ "hide": false,
+ "includeNullMetadata": true,
+ "instant": true,
+ "legendFormat": "__auto",
+ "range": false,
+ "refId": "command",
+ "useBackend": false
+ }
+ ],
+ "title": "GPU Processes",
+ "transformations": [
+ {
+ "id": "merge",
+ "options": {}
+ },
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Row": {
+ "aggregations": [],
+ "operation": "aggregate"
+ },
+ "Value #command": {
+ "aggregations": []
+ },
+ "Value #cpu percent": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #gpu decoder utilization": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #gpu encoder utilization": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #gpu memory": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #gpu memory bandwidth utilization": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #gpu sm utilization": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #rss memory": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #running time": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "command": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "hostname": {
+ "aggregations": [],
+ "operation": "groupby"
+ },
+ "index": {
+ "aggregations": [],
+ "operation": "groupby"
+ },
+ "pid": {
+ "aggregations": [],
+ "operation": "groupby"
+ },
+ "status": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "username": {
+ "aggregations": [],
+ "operation": "groupby"
+ },
+ "uuid": {
+ "aggregations": [],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {},
+ "sort": [
+ {
+ "desc": true,
+ "field": "Value #gpu memory (lastNotNull)"
+ }
+ ]
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+ "uuid": true
+ },
+ "includeByName": {},
+ "indexByName": {
+ "Value #cpu percent (lastNotNull)": 6,
+ "Value #gpu decoder utilization (lastNotNull)": 12,
+ "Value #gpu encoder utilization (lastNotNull)": 11,
+ "Value #gpu memory (lastNotNull)": 8,
+ "Value #gpu memory bandwidth utilization (lastNotNull)": 10,
+ "Value #gpu sm utilization (lastNotNull)": 9,
+ "Value #rss memory (lastNotNull)": 7,
+ "Value #running time (lastNotNull)": 13,
+ "command (lastNotNull)": 14,
+ "hostname": 0,
+ "index": 3,
+ "pid": 1,
+ "status (lastNotNull)": 5,
+ "username": 2,
+ "uuid": 4
+ },
+ "renameByName": {
+ "Value #cpu percent (lastNotNull)": "%CPU",
+ "Value #gpu decoder utilization (lastNotNull)": "%DEC",
+ "Value #gpu encoder utilization (lastNotNull)": "%ENC",
+ "Value #gpu memory (lastNotNull)": "GPU MEMORY",
+ "Value #gpu memory bandwidth utilization (lastNotNull)": "%GMBW",
+ "Value #gpu sm utilization (lastNotNull)": "%SM",
+ "Value #rss memory (lastNotNull)": "RSS MEMORY",
+ "Value #running time (lastNotNull)": "TIME",
+ "command (lastNotNull)": "COMMAND",
+ "hostname": "HOSTNAME",
+ "index": "DEVICE",
+ "pid": "PID",
+ "status (lastNotNull)": "STATUS",
+ "username": "USERNAME"
+ }
+ }
+ }
+ ],
+ "type": "table"
+ },
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 14
+ },
+ "id": 3,
+ "panels": [],
+ "title": "Overview",
+ "type": "row"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "max": 100,
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "yellow",
+ "value": 70
+ },
+ {
+ "color": "red",
+ "value": 90
+ }
+ ]
+ },
+ "unit": "percent"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 9,
+ "w": 3,
+ "x": 0,
+ "y": 15
+ },
+ "id": 4,
+ "options": {
+ "minVizHeight": 75,
+ "minVizWidth": 75,
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true,
+ "sizing": "auto"
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "exemplar": false,
+ "expr": "host_cpu_percent_Percentage{hostname=~\"$hostname\"}",
+ "fullMetaSearch": false,
+ "includeNullMetadata": true,
+ "instant": true,
+ "legendFormat": "{{hostname}} CPU (%)",
+ "range": false,
+ "refId": "A",
+ "useBackend": false
+ }
+ ],
+ "title": "CPU Utilization",
+ "type": "gauge"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "max": 100,
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "yellow",
+ "value": 70
+ },
+ {
+ "color": "red",
+ "value": 90
+ }
+ ]
+ },
+ "unit": "percent"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 18,
+ "w": 11,
+ "x": 3,
+ "y": 15
+ },
+ "id": 5,
+ "options": {
+ "minVizHeight": 75,
+ "minVizWidth": 75,
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true,
+ "sizing": "auto"
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "exemplar": false,
+ "expr": "gpu_utilization_Percentage{hostname=~\"$hostname\"}",
+ "fullMetaSearch": false,
+ "includeNullMetadata": true,
+ "instant": true,
+ "legendFormat": "{{hostname}} GPU {{index}} (%)",
+ "range": false,
+ "refId": "A",
+ "useBackend": false
+ }
+ ],
+ "title": "GPU Utilization",
+ "type": "gauge"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "max": 100,
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "yellow",
+ "value": 70
+ },
+ {
+ "color": "red",
+ "value": 90
+ }
+ ]
+ },
+ "unit": "percent"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 18,
+ "w": 10,
+ "x": 14,
+ "y": 15
+ },
+ "id": 6,
+ "options": {
+ "minVizHeight": 75,
+ "minVizWidth": 75,
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true,
+ "sizing": "auto"
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "exemplar": false,
+ "expr": "gpu_memory_percent_Percentage{hostname=~\"$hostname\"}",
+ "fullMetaSearch": false,
+ "includeNullMetadata": true,
+ "instant": true,
+ "legendFormat": "{{hostname}} GPU {{index}} (%)",
+ "range": false,
+ "refId": "A",
+ "useBackend": false
+ }
+ ],
+ "title": "GPU Memory",
+ "type": "gauge"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "max": 100,
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 70
+ },
+ {
+ "color": "red",
+ "value": 90
+ }
+ ]
+ },
+ "unit": "percent"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 9,
+ "w": 3,
+ "x": 0,
+ "y": 24
+ },
+ "id": 7,
+ "options": {
+ "minVizHeight": 75,
+ "minVizWidth": 75,
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true,
+ "sizing": "auto"
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "exemplar": false,
+ "expr": "host_virtual_memory_percent_Percentage{hostname=~\"$hostname\"}",
+ "fullMetaSearch": false,
+ "includeNullMetadata": true,
+ "instant": true,
+ "legendFormat": "{{hostname}} MEM (%)",
+ "range": false,
+ "refId": "A",
+ "useBackend": false
+ }
+ ],
+ "title": "Host Virtual Memory",
+ "type": "gauge"
+ },
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 33
+ },
+ "id": 8,
+ "panels": [],
+ "title": "System",
+ "type": "row"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMax": 10,
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 2,
+ "gradientMode": "opacity",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "percent"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 14,
+ "w": 12,
+ "x": 0,
+ "y": 34
+ },
+ "id": 9,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Name",
+ "sortDesc": false
+ },
+ "tooltip": {
+ "maxHeight": 600,
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "expr": "host_cpu_percent_Percentage{hostname=~\"$hostname\"}",
+ "fullMetaSearch": false,
+ "includeNullMetadata": true,
+ "instant": false,
+ "legendFormat": "{{hostname}}",
+ "range": true,
+ "refId": "A",
+ "useBackend": false
+ }
+ ],
+ "title": "CPU Utilization",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMax": 10,
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 2,
+ "gradientMode": "opacity",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "percent"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 14,
+ "w": 12,
+ "x": 12,
+ "y": 34
+ },
+ "id": 10,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Name",
+ "sortDesc": false
+ },
+ "tooltip": {
+ "maxHeight": 600,
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "expr": "host_load_average_1m_Percentage{hostname=~\"$hostname\"}",
+ "fullMetaSearch": false,
+ "includeNullMetadata": true,
+ "instant": false,
+ "legendFormat": "{{hostname}} (1m)",
+ "range": true,
+ "refId": "A",
+ "useBackend": false
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "expr": "host_load_average_5m_Percentage{hostname=~\"$hostname\"}",
+ "fullMetaSearch": false,
+ "hide": false,
+ "includeNullMetadata": true,
+ "instant": false,
+ "legendFormat": "{{hostname}} (5m)",
+ "range": true,
+ "refId": "B",
+ "useBackend": false
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "expr": "host_load_average_15m_Percentage{hostname=~\"$hostname\"}",
+ "fullMetaSearch": false,
+ "hide": false,
+ "includeNullMetadata": true,
+ "instant": false,
+ "legendFormat": "{{hostname}} (15m)",
+ "range": true,
+ "refId": "C",
+ "useBackend": false
+ }
+ ],
+ "title": "CPU Load Average",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMax": 1024,
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 2,
+ "gradientMode": "opacity",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "mbytes"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 14,
+ "w": 12,
+ "x": 0,
+ "y": 48
+ },
+ "id": 11,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Name",
+ "sortDesc": false
+ },
+ "tooltip": {
+ "maxHeight": 600,
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "expr": "host_virtual_memory_used_MiB{hostname=~\"$hostname\"}",
+ "fullMetaSearch": false,
+ "includeNullMetadata": true,
+ "instant": false,
+ "legendFormat": "{{hostname}}",
+ "range": true,
+ "refId": "A",
+ "useBackend": false
+ }
+ ],
+ "title": "Host Virtual Memory",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMax": 1024,
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 2,
+ "gradientMode": "opacity",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "mbytes"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 14,
+ "w": 12,
+ "x": 12,
+ "y": 48
+ },
+ "id": 12,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Name",
+ "sortDesc": false
+ },
+ "tooltip": {
+ "maxHeight": 600,
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "expr": "host_swap_memory_used_MiB{hostname=~\"$hostname\"}",
+ "fullMetaSearch": false,
+ "includeNullMetadata": true,
+ "instant": false,
+ "legendFormat": "{{hostname}}",
+ "range": true,
+ "refId": "A",
+ "useBackend": false
+ }
+ ],
+ "title": "Host Swap Memory",
+ "type": "timeseries"
+ },
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 62
+ },
+ "id": 13,
+ "panels": [],
+ "title": "System I/O",
+ "type": "row"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMax": 1,
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 2,
+ "gradientMode": "opacity",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "MiBs"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 14,
+ "w": 12,
+ "x": 0,
+ "y": 63
+ },
+ "id": 14,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Name",
+ "sortDesc": false
+ },
+ "tooltip": {
+ "maxHeight": 600,
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "expr": "sum by(hostname) (rate(host_net_io_rx_data_MiB{hostname=~\"$hostname\"}[$__rate_interval]))",
+ "fullMetaSearch": false,
+ "hide": false,
+ "includeNullMetadata": true,
+ "instant": false,
+ "legendFormat": "{{hostname}} RX",
+ "range": true,
+ "refId": "A",
+ "useBackend": false
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "expr": "sum by(hostname) (rate(host_net_io_tx_data_MiB{hostname=~\"$hostname\"}[$__rate_interval]))",
+ "fullMetaSearch": false,
+ "hide": false,
+ "includeNullMetadata": true,
+ "instant": false,
+ "legendFormat": "{{hostname}} TX",
+ "range": true,
+ "refId": "B",
+ "useBackend": false
+ }
+ ],
+ "title": "Host Network I/O",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMax": 1,
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 2,
+ "gradientMode": "opacity",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "MiBs"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 14,
+ "w": 12,
+ "x": 12,
+ "y": 63
+ },
+ "id": 15,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Name",
+ "sortDesc": false
+ },
+ "tooltip": {
+ "maxHeight": 600,
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "expr": "sum by(hostname) (rate(host_disk_io_read_data_MiB{hostname=~\"$hostname\"}[$__rate_interval]))",
+ "fullMetaSearch": false,
+ "hide": false,
+ "includeNullMetadata": true,
+ "instant": false,
+ "legendFormat": "{{hostname}} Read",
+ "range": true,
+ "refId": "A",
+ "useBackend": false
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "expr": "sum by(hostname) (rate(host_disk_io_write_data_MiB{hostname=~\"$hostname\"}[$__rate_interval]))",
+ "fullMetaSearch": false,
+ "hide": false,
+ "includeNullMetadata": true,
+ "instant": false,
+ "legendFormat": "{{hostname}} Write",
+ "range": true,
+ "refId": "B",
+ "useBackend": false
+ }
+ ],
+ "title": "Host Disk I/O",
+ "type": "timeseries"
+ },
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 77
+ },
+ "id": 16,
+ "panels": [],
+ "title": "Accelerator",
+ "type": "row"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "max": 100,
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "percent"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 14,
+ "w": 12,
+ "x": 0,
+ "y": 78
+ },
+ "id": 17,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Name",
+ "sortDesc": false
+ },
+ "tooltip": {
+ "maxHeight": 600,
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "expr": "gpu_utilization_Percentage{hostname=~\"$hostname\"}",
+ "fullMetaSearch": false,
+ "hide": false,
+ "includeNullMetadata": true,
+ "instant": false,
+ "legendFormat": "{{hostname}} GPU {{index}}",
+ "range": true,
+ "refId": "A",
+ "useBackend": false
+ }
+ ],
+ "title": "GPU Utilization",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMax": 1024,
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "mbytes"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 14,
+ "w": 12,
+ "x": 12,
+ "y": 78
+ },
+ "id": 18,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Name",
+ "sortDesc": false
+ },
+ "tooltip": {
+ "maxHeight": 600,
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "expr": "gpu_memory_used_MiB{hostname=~\"$hostname\"}",
+ "fullMetaSearch": false,
+ "hide": false,
+ "includeNullMetadata": true,
+ "instant": false,
+ "legendFormat": "{{hostname}} GPU {{index}}",
+ "range": true,
+ "refId": "A",
+ "useBackend": false
+ }
+ ],
+ "title": "GPU Memory",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMax": 1,
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "MiBs"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 14,
+ "w": 12,
+ "x": 0,
+ "y": 92
+ },
+ "id": 19,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Name",
+ "sortDesc": false
+ },
+ "tooltip": {
+ "maxHeight": 600,
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "expr": "gpu_pcie_rx_throughput_MiBps{hostname=~\"$hostname\"}",
+ "fullMetaSearch": false,
+ "hide": false,
+ "includeNullMetadata": true,
+ "instant": false,
+ "legendFormat": "{{hostname}} GPU {{index}}",
+ "range": true,
+ "refId": "A",
+ "useBackend": false
+ }
+ ],
+ "title": "GPU PCIe RX Throughput",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMax": 1,
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "MiBs"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 14,
+ "w": 12,
+ "x": 12,
+ "y": 92
+ },
+ "id": 20,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Name",
+ "sortDesc": false
+ },
+ "tooltip": {
+ "maxHeight": 600,
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "expr": "gpu_pcie_rx_throughput_MiBps{hostname=~\"$hostname\"}",
+ "fullMetaSearch": false,
+ "hide": false,
+ "includeNullMetadata": true,
+ "instant": false,
+ "legendFormat": "{{hostname}} GPU {{index}}",
+ "range": true,
+ "refId": "A",
+ "useBackend": false
+ }
+ ],
+ "title": "GPU PCIe TX Throughput",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMax": 1,
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "MiBs"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 14,
+ "w": 12,
+ "x": 0,
+ "y": 106
+ },
+ "id": 21,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Name",
+ "sortDesc": false
+ },
+ "tooltip": {
+ "maxHeight": 600,
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "expr": "gpu_nvlink_total_rx_throughput_MiBps{hostname=~\"$hostname\"}",
+ "fullMetaSearch": false,
+ "hide": false,
+ "includeNullMetadata": true,
+ "instant": false,
+ "legendFormat": "{{hostname}} GPU {{index}}",
+ "range": true,
+ "refId": "A",
+ "useBackend": false
+ }
+ ],
+ "title": "GPU NVLink RX Throughput",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "axisSoftMax": 1,
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "MiBs"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 14,
+ "w": 12,
+ "x": 12,
+ "y": 106
+ },
+ "id": 22,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Name",
+ "sortDesc": false
+ },
+ "tooltip": {
+ "maxHeight": 600,
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "expr": "gpu_nvlink_total_tx_throughput_MiBps{hostname=~\"$hostname\"}",
+ "fullMetaSearch": false,
+ "hide": false,
+ "includeNullMetadata": true,
+ "instant": false,
+ "legendFormat": "{{hostname}} GPU {{index}}",
+ "range": true,
+ "refId": "A",
+ "useBackend": false
+ }
+ ],
+ "title": "GPU NVLink TX Throughput",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "watt"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 14,
+ "w": 12,
+ "x": 0,
+ "y": 120
+ },
+ "id": 23,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Name",
+ "sortDesc": false
+ },
+ "tooltip": {
+ "maxHeight": 600,
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "expr": "gpu_power_usage_W{hostname=~\"$hostname\"}",
+ "fullMetaSearch": false,
+ "hide": false,
+ "includeNullMetadata": true,
+ "instant": false,
+ "legendFormat": "{{hostname}} GPU {{index}}",
+ "range": true,
+ "refId": "A",
+ "useBackend": false
+ }
+ ],
+ "title": "GPU Power Usage",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "celsius"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 14,
+ "w": 12,
+ "x": 12,
+ "y": 120
+ },
+ "id": 24,
+ "options": {
+ "legend": {
+ "calcs": [
+ "lastNotNull",
+ "mean",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true,
+ "sortBy": "Name",
+ "sortDesc": false
+ },
+ "tooltip": {
+ "maxHeight": 600,
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "expr": "gpu_temperature_C{hostname=~\"$hostname\"}",
+ "fullMetaSearch": false,
+ "hide": false,
+ "includeNullMetadata": true,
+ "instant": false,
+ "legendFormat": "{{hostname}} GPU {{index}}",
+ "range": true,
+ "refId": "A",
+ "useBackend": false
+ }
+ ],
+ "title": "GPU Temperature",
+ "type": "timeseries"
+ }
+ ],
+ "preload": true,
+ "refresh": "5s",
+ "schemaVersion": 40,
+ "tags": [
+ "nvitop",
+ "nvitop-exporter",
+ "prometheus",
+ "nvidia",
+ "gpu",
+ "gpu process",
+ "gpu monitoring"
+ ],
+ "templating": {
+ "list": [
+ {
+ "current": {},
+ "definition": "label_values(hostname)",
+ "description": "",
+ "includeAll": true,
+ "multi": true,
+ "name": "hostname",
+ "options": [],
+ "query": {
+ "qryType": 1,
+ "query": "label_values(hostname)"
+ },
+ "refresh": 1,
+ "regex": "",
+ "type": "query"
+ },
+ {
+ "current": {},
+ "definition": "label_values(username)",
+ "description": "",
+ "includeAll": true,
+ "multi": true,
+ "name": "username",
+ "options": [],
+ "query": {
+ "qryType": 1,
+ "query": "label_values(username)"
+ },
+ "refresh": 1,
+ "regex": "",
+ "type": "query"
+ }
+ ]
+ },
+ "time": {
+ "from": "now-6h",
+ "to": "now"
+ },
+ "timepicker": {},
+ "timezone": "browser",
+ "title": "nvitop-dashboard",
+ "uid": "bdl3vqwxprhtsa",
+ "version": 1,
+ "weekStart": ""
+}
diff --git a/nvitop-exporter/nvitop_exporter/exporter.py b/nvitop-exporter/nvitop_exporter/exporter.py
index b0861bf0..ceaeb6c5 100644
--- a/nvitop-exporter/nvitop_exporter/exporter.py
+++ b/nvitop-exporter/nvitop_exporter/exporter.py
@@ -405,6 +405,12 @@ def __init__( # pylint: disable=too-many-statements
)
# Create gauges for process metrics
+ self.process_info = Info(
+ name='process_info',
+ documentation='Process information.',
+ labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
+ registry=self.registry,
+ )
self.process_running_time = Gauge(
name='process_running_time',
documentation='Process running time (s).',
@@ -592,19 +598,40 @@ def update_device(self, device: Device) -> None: # pylint: disable=too-many-loc
alive_pids.clear()
with GpuProcess.failsafe():
+ host_snapshots = {}
for pid, process in device.processes().items():
with process.oneshot():
username = process.username()
- running_time = process.running_time()
alive_pids.add((pid, username))
+ if (pid, username) not in host_snapshots: # noqa: SIM401,RUF100
+ host_snapshot = host_snapshots[(pid, username)] = process.host_snapshot()
+ else:
+ host_snapshot = host_snapshots[(pid, username)]
+ self.process_info.labels(
+ hostname=self.hostname,
+ index=index,
+ devicename=name,
+ uuid=uuid,
+ pid=pid,
+ username=username,
+ ).info(
+ {
+ 'status': host_snapshot.status,
+ 'command': host_snapshot.command,
+ },
+ )
for gauge, value in (
(
self.process_running_time,
- running_time.total_seconds() if running_time else math.nan,
+ (
+ host_snapshot.running_time.total_seconds()
+ if host_snapshot.running_time
+ else math.nan
+ ),
),
- (self.process_cpu_percent, process.cpu_percent()),
- (self.process_rss_memory, process.host_memory() / MiB),
- (self.process_memory_percent, float(process.memory_percent())),
+ (self.process_cpu_percent, host_snapshot.cpu_percent),
+ (self.process_rss_memory, host_snapshot.host_memory / MiB),
+ (self.process_memory_percent, float(host_snapshot.memory_percent)),
(self.process_gpu_memory, process.gpu_memory() / MiB),
(
self.process_gpu_sm_utilization,
@@ -633,7 +660,8 @@ def update_device(self, device: Device) -> None: # pylint: disable=too-many-loc
).set(value)
for pid, username in previous_alive_pids.difference(alive_pids):
- for gauge in (
+ for collector in (
+ self.process_info,
self.process_running_time,
self.process_cpu_percent,
self.process_rss_memory,
@@ -645,7 +673,7 @@ def update_device(self, device: Device) -> None: # pylint: disable=too-many-loc
self.process_gpu_decoder_utilization,
):
try:
- gauge.remove(
+ collector.remove(
self.hostname,
index,
name,