First commit

This commit is contained in:
l-nmch
2025-11-10 23:24:42 +01:00
commit 46be4bb4b1
7 changed files with 356 additions and 0 deletions

80
app.py Normal file
View File

@@ -0,0 +1,80 @@
import logging
import time
from typing import Dict
from prometheus_client import Gauge, start_http_server
from gpus import amd
from gpus import nvidia
def setup_metrics() -> Dict[str, Gauge]:
    """Create the per-GPU Prometheus gauges.

    Every gauge is declared with the same four label names: vendor, gpu_id,
    name and pci_bus.

    Returns:
        A dict mapping a short metric key (e.g. "power_w") to its Gauge.
        update_metrics uses each key to look up the matching field on a GPU
        record, so the keys must match the field names in those records.

    NOTE(review): each call constructs new Gauge objects; presumably this is
    meant to be called once per process -- confirm.
    """
    label_names = ["vendor", "gpu_id", "name", "pci_bus"]

    # (dict key, exported metric name, help text)
    specs = (
        ("power_w", "gpu_power_watts", "Power consumption in watts"),
        ("gpu_temp_c", "gpu_temperature_celsius", "GPU temperature in Celsius"),
        ("gpu_clock_mhz", "gpu_clock_mhz", "GPU core clock speed in MHz"),
        ("mem_clock_mhz", "gpu_memory_clock_mhz", "GPU memory clock speed in MHz"),
        ("fan_speed_percent", "gpu_fan_speed_percent", "Fan speed percentage"),
        ("gpu_util_percent", "gpu_utilization_percent", "GPU utilization percent"),
        ("mem_util_percent", "gpu_memory_utilization_percent", "GPU memory utilization percent"),
        ("memory_used_mib", "gpu_memory_used_mib", "Used memory in MiB"),
        ("memory_total_mib", "gpu_memory_total_mib", "Total memory in MiB"),
    )

    return {
        key: Gauge(metric_name, help_text, label_names)
        for key, metric_name, help_text in specs
    }
# Per-process GPU usage metric. Each labelled series is set to 1 to mark the
# process as present; the process metadata (including memory and utilization
# figures) is carried in the label values. update_metrics clears this gauge
# and repopulates it on every collection.
gpu_process_info = Gauge(
    name="gpu_process_info",
    documentation="GPU process usage (1 = present). Labels contain metadata.",
    labelnames=[
        "vendor",
        "gpu_id",
        "gpu_name",
        "pid",
        "proc_name",
        "used_memory_mib",
        "gpu_util_percent",
    ],
)
def update_metrics(gauges: Dict[str, Gauge]) -> None:
    """Refresh all GPU gauges and rebuild the per-process info series.

    Args:
        gauges: Mapping of metric key to Gauge, as returned by setup_metrics.
            Each key is looked up on every GPU record.

    GPU records come from the AMD and NVIDIA extractors (AMD first). A label
    field absent from a record is reported as "unknown"; a reading that is
    absent or None is exported as 0.

    Process records must provide "vendor", "gpu_id", "gpu_name", "pid" and
    "name" (a missing key raises KeyError); "used_memory_mib" and
    "gpu_util_percent" default to 0 when absent.

    NOTE(review): the extractors are presumably returning lists of dicts --
    confirm against gpus.amd / gpus.nvidia.
    """
    devices = amd.extract_amd_gpu_info() + nvidia.extract_nvidia_gpu_info()
    for device in devices:
        identity = {
            field: device.get(field, "unknown")
            for field in ("vendor", "gpu_id", "name", "pci_bus")
        }
        for metric_key, gauge in gauges.items():
            reading = device.get(metric_key)
            if reading is None:
                reading = 0
            gauge.labels(**identity).set(reading)

    # Drop every process series from the previous collection before adding
    # the current ones, so processes that have exited do not linger.
    gpu_process_info.clear()

    processes = amd.extract_amd_processes() + nvidia.extract_nvidia_processes()
    for entry in processes:
        series = gpu_process_info.labels(
            vendor=entry["vendor"],
            gpu_id=entry["gpu_id"],
            gpu_name=entry["gpu_name"],
            pid=str(entry["pid"]),
            proc_name=entry["name"],
            used_memory_mib=str(entry.get("used_memory_mib", 0)),
            gpu_util_percent=str(entry.get("gpu_util_percent", 0)),
        )
        series.set(1)
def main(port: int = 8000, interval: float = 10.0) -> None:
    """Run the exporter: start the metrics HTTP server and refresh forever.

    Args:
        port: Port passed to start_http_server. Defaults to 8000, the value
            that was previously hard-coded.
        interval: Seconds to sleep between two collections. Defaults to 10,
            the value that was previously hard-coded.

    The loop runs until interrupted; a KeyboardInterrupt (Ctrl-C) is caught
    and logged so the process exits without a traceback.
    """
    logging.basicConfig(level=logging.INFO)
    start_http_server(port)
    # Lazy %-style arguments: the message is only formatted if it is emitted.
    logging.info("Starting GPU exporter on http://localhost:%s/metrics", port)
    gauges = setup_metrics()
    try:
        while True:
            update_metrics(gauges)
            time.sleep(interval)
    except KeyboardInterrupt:
        logging.info("Exporter stopped by user")
# Script entry point: start the exporter only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()