diff --git a/nvitop-exporter/nvitop_exporter/exporter.py b/nvitop-exporter/nvitop_exporter/exporter.py index b0861bf0..ceaeb6c5 100644 --- a/nvitop-exporter/nvitop_exporter/exporter.py +++ b/nvitop-exporter/nvitop_exporter/exporter.py @@ -405,6 +405,12 @@ def __init__( # pylint: disable=too-many-statements ) # Create gauges for process metrics + self.process_info = Info( + name='process_info', + documentation='Process information.', + labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'], + registry=self.registry, + ) self.process_running_time = Gauge( name='process_running_time', documentation='Process running time (s).', @@ -592,19 +598,40 @@ def update_device(self, device: Device) -> None: # pylint: disable=too-many-loc alive_pids.clear() with GpuProcess.failsafe(): + host_snapshots = {} for pid, process in device.processes().items(): with process.oneshot(): username = process.username() - running_time = process.running_time() alive_pids.add((pid, username)) + if (pid, username) not in host_snapshots: # noqa: SIM401,RUF100 + host_snapshot = host_snapshots[(pid, username)] = process.host_snapshot() + else: + host_snapshot = host_snapshots[(pid, username)] + self.process_info.labels( + hostname=self.hostname, + index=index, + devicename=name, + uuid=uuid, + pid=pid, + username=username, + ).info( + { + 'status': host_snapshot.status, + 'command': host_snapshot.command, + }, + ) for gauge, value in ( ( self.process_running_time, - running_time.total_seconds() if running_time else math.nan, + ( + host_snapshot.running_time.total_seconds() + if host_snapshot.running_time + else math.nan + ), ), - (self.process_cpu_percent, process.cpu_percent()), - (self.process_rss_memory, process.host_memory() / MiB), - (self.process_memory_percent, float(process.memory_percent())), + (self.process_cpu_percent, host_snapshot.cpu_percent), + (self.process_rss_memory, host_snapshot.host_memory / MiB), + (self.process_memory_percent, float(host_snapshot.memory_percent)), (self.process_gpu_memory, process.gpu_memory() / MiB), ( self.process_gpu_sm_utilization, @@ -633,7 +660,8 @@ def update_device(self, device: Device) -> None: # pylint: disable=too-many-loc ).set(value) for pid, username in previous_alive_pids.difference(alive_pids): - for gauge in ( + for collector in ( + self.process_info, self.process_running_time, self.process_cpu_percent, self.process_rss_memory, @@ -645,7 +673,7 @@ def update_device(self, device: Device) -> None: # pylint: disable=too-many-loc self.process_gpu_decoder_utilization, ): try: - gauge.remove( + collector.remove( self.hostname, index, name,