Skip to content

Commit

Permalink
feat(exporter): add process info
Browse files (browse the repository at this point in the history)
  • Loading branch information
XuehaiPan committed Dec 28, 2024
1 parent 25b6d61 commit ad8a454
Showing 1 changed file with 35 additions and 7 deletions.
42 changes: 35 additions & 7 deletions nvitop-exporter/nvitop_exporter/exporter.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -405,6 +405,12 @@ def __init__( # pylint: disable=too-many-statements
)

# Create gauges for process metrics
self.process_info = Info(
name='process_info',
documentation='Process information.',
labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
registry=self.registry,
)
self.process_running_time = Gauge(
name='process_running_time',
documentation='Process running time (s).',
Expand Down Expand Up @@ -592,19 +598,40 @@ def update_device(self, device: Device) -> None: # pylint: disable=too-many-loc
alive_pids.clear()

with GpuProcess.failsafe():
host_snapshots = {}
for pid, process in device.processes().items():
with process.oneshot():
username = process.username()
running_time = process.running_time()
alive_pids.add((pid, username))
if (pid, username) not in host_snapshots: # noqa: SIM401,RUF100
host_snapshot = host_snapshots[(pid, username)] = process.host_snapshot()
else:
host_snapshot = host_snapshots[(pid, username)]
self.process_info.labels(
hostname=self.hostname,
index=index,
devicename=name,
uuid=uuid,
pid=pid,
username=username,
).info(
{
'status': host_snapshot.status,
'command': host_snapshot.command,
},
)
for gauge, value in (
(
self.process_running_time,
running_time.total_seconds() if running_time else math.nan,
(
host_snapshot.running_time.total_seconds()
if host_snapshot.running_time
else math.nan
),
),
(self.process_cpu_percent, process.cpu_percent()),
(self.process_rss_memory, process.host_memory() / MiB),
(self.process_memory_percent, float(process.memory_percent())),
(self.process_cpu_percent, host_snapshot.cpu_percent),
(self.process_rss_memory, host_snapshot.host_memory / MiB),
(self.process_memory_percent, float(host_snapshot.memory_percent)),
(self.process_gpu_memory, process.gpu_memory() / MiB),
(
self.process_gpu_sm_utilization,
Expand Down Expand Up @@ -633,7 +660,8 @@ def update_device(self, device: Device) -> None: # pylint: disable=too-many-loc
).set(value)

for pid, username in previous_alive_pids.difference(alive_pids):
for gauge in (
for collector in (
self.process_info,
self.process_running_time,
self.process_cpu_percent,
self.process_rss_memory,
Expand All @@ -645,7 +673,7 @@ def update_device(self, device: Device) -> None: # pylint: disable=too-many-loc
self.process_gpu_decoder_utilization,
):
try:
gauge.remove(
collector.remove(
self.hostname,
index,
name,
Expand Down

0 comments on commit ad8a454

Please sign in to comment.