First commit
This commit is contained in:
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
# Local virtual environment
.venv/
# Python bytecode caches
__pycache__/
*.pyc
# Editor settings
.vscode/
38
Dockerfile
Normal file
38
Dockerfile
Normal file
@@ -0,0 +1,38 @@
|
||||
FROM python:3.11-slim

# Non-root runtime user; UID overridable at build time
ARG USER=app
ARG UID=1000
# PORT is read by the HEALTHCHECK below; app.py should honor it too
ENV APP_HOME=/app \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PORT=8000

# Build toolchain for any wheels that compile from source; apt lists removed
# to keep the layer small. curl is required by the HEALTHCHECK.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
    build-essential \
    gcc \
    libffi-dev \
    libssl-dev \
    ca-certificates \
    curl \
    && rm -rf /var/lib/apt/lists/*

RUN useradd --create-home --uid ${UID} ${USER}

WORKDIR ${APP_HOME}

# Copy requirements first so dependency install is cached independently
# of source changes.
COPY --chown=${USER}:${USER} requirements.txt ${APP_HOME}/
RUN pip install --upgrade pip setuptools wheel \
    && pip install --no-cache-dir -r requirements.txt

COPY --chown=${USER}:${USER} . ${APP_HOME}/

USER ${USER}

EXPOSE ${PORT}

# Probe the exporter's own /metrics endpoint
HEALTHCHECK --interval=30s --timeout=3s --start-period=10s --retries=3 \
    CMD curl -f http://localhost:${PORT}/metrics || exit 1

CMD ["python", "-u", "app.py"]
|
||||
80
app.py
Normal file
80
app.py
Normal file
@@ -0,0 +1,80 @@
|
||||
import logging
import os
import time
from typing import Dict

from prometheus_client import Gauge, start_http_server

from gpus import amd
from gpus import nvidia
|
||||
|
||||
def setup_metrics() -> Dict[str, Gauge]:
    """Register one Prometheus Gauge per scalar GPU metric.

    Returns a dict keyed by the field name used in the per-GPU info
    dicts produced by the vendor modules, so update_metrics() can look
    up readings by key.
    """
    label_names = ["vendor", "gpu_id", "name", "pci_bus"]

    # (lookup key, prometheus metric name, help text)
    specs = (
        ("power_w", "gpu_power_watts", "Power consumption in watts"),
        ("gpu_temp_c", "gpu_temperature_celsius", "GPU temperature in Celsius"),
        ("gpu_clock_mhz", "gpu_clock_mhz", "GPU core clock speed in MHz"),
        ("mem_clock_mhz", "gpu_memory_clock_mhz", "GPU memory clock speed in MHz"),
        ("fan_speed_percent", "gpu_fan_speed_percent", "Fan speed percentage"),
        ("gpu_util_percent", "gpu_utilization_percent", "GPU utilization percent"),
        ("mem_util_percent", "gpu_memory_utilization_percent", "GPU memory utilization percent"),
        ("memory_used_mib", "gpu_memory_used_mib", "Used memory in MiB"),
        ("memory_total_mib", "gpu_memory_total_mib", "Total memory in MiB"),
    )

    return {
        key: Gauge(metric_name, help_text, label_names)
        for key, metric_name, help_text in specs
    }
|
||||
|
||||
# One gauge per process, updated per collection
# Info-style metric: the sample value is always 1 and all per-process
# details are carried in the labels.
# NOTE(review): used_memory_mib / gpu_util_percent as *label* values mean
# every new reading creates a new time series (high cardinality) —
# confirm this trade-off is intended.
gpu_process_info = Gauge(
    "gpu_process_info",
    "GPU process usage (1 = present). Labels contain metadata.",
    ["vendor", "gpu_id", "gpu_name", "pid", "proc_name", "used_memory_mib", "gpu_util_percent"]
)
|
||||
|
||||
def update_metrics(gauges: Dict[str, Gauge]) -> None:
    """Refresh all gauges from a fresh AMD + NVIDIA scrape.

    Scalar gauges are set per GPU (missing readings become 0), then the
    per-process info metric is rebuilt from scratch.
    """
    for gpu in amd.extract_amd_gpu_info() + nvidia.extract_nvidia_gpu_info():
        label_values = {
            "vendor": gpu.get("vendor", "unknown"),
            "gpu_id": gpu.get("gpu_id", "unknown"),
            "name": gpu.get("name", "unknown"),
            "pci_bus": gpu.get("pci_bus", "unknown"),
        }

        for field, gauge in gauges.items():
            reading = gpu.get(field)
            if reading is None:
                reading = 0  # export 0 rather than skipping the sample
            gauge.labels(**label_values).set(reading)

    # Drop stale per-process children so finished processes disappear
    # instead of lingering with their last value.
    gpu_process_info.clear()

    for proc in amd.extract_amd_processes() + nvidia.extract_nvidia_processes():
        gpu_process_info.labels(
            vendor=proc["vendor"],
            gpu_id=proc["gpu_id"],
            gpu_name=proc["gpu_name"],
            pid=str(proc["pid"]),
            proc_name=proc["name"],
            used_memory_mib=str(proc.get("used_memory_mib", 0)),
            gpu_util_percent=str(proc.get("gpu_util_percent", 0)),
        ).set(1)
|
||||
|
||||
def main() -> None:
    """Start the metrics HTTP server and poll GPU state forever.

    The listen port is taken from the PORT environment variable so it
    stays in sync with the Dockerfile's ENV PORT / EXPOSE / HEALTHCHECK
    (falls back to 8000 when unset).
    """
    logging.basicConfig(level=logging.INFO)
    port = int(os.environ.get("PORT", "8000"))
    start_http_server(port)
    logging.info(f"Starting GPU exporter on http://localhost:{port}/metrics")

    gauges = setup_metrics()

    try:
        while True:
            update_metrics(gauges)
            time.sleep(10)  # scrape-source refresh interval
    except KeyboardInterrupt:
        logging.info("Exporter stopped by user")
|
||||
|
||||
# Run the exporter only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
11
docker-compose.yml
Normal file
11
docker-compose.yml
Normal file
@@ -0,0 +1,11 @@
|
||||
# Prometheus GPU exporter service definition.
services:
  prometheus_exporter_gpu:
    build: .
    container_name: prometheus_exporter_gpu
    restart: unless-stopped
    ports:
      # Must match the PORT env baked into the image (default 8000).
      - "8000:8000"
    volumes:
      # Read-only sysfs for AMD VRAM/power counters.
      - /sys/class/drm/:/sys/class/drm/:ro
      # Host vendor CLIs bind-mounted into the container.
      # NOTE(review): the binaries alone may require matching host
      # driver libraries inside the container — confirm they execute.
      - /usr/bin/nvidia-smi:/usr/bin/nvidia-smi:ro
      - /usr/bin/rocm-smi:/usr/bin/rocm-smi:ro
||||
138
gpus/amd.py
Normal file
138
gpus/amd.py
Normal file
@@ -0,0 +1,138 @@
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import logging
|
||||
|
||||
def get_rocm_smi_data():
    """Invoke rocm-smi and return its parsed JSON output.

    Returns an empty dict when the binary is missing, exits non-zero,
    or emits unparseable JSON, so callers can iterate safely.
    """
    cmd = ["/usr/bin/rocm-smi", "--showallinfo", "--json"]
    try:
        proc = subprocess.run(cmd, stdout=subprocess.PIPE, text=True, check=True)
        return json.loads(proc.stdout)
    except Exception as e:
        logging.error(f"Error running rocm-smi: {e}")
        return {}
|
||||
|
||||
def get_drm_cards_by_pci():
    """Map PCI bus id -> DRM card name (e.g. "0000:03:00.0" -> "card0").

    Scans /sys/class/drm for primary "cardN" nodes (connector entries
    such as "card0-DP-1" contain "-" and are skipped) and resolves each
    card's "device" symlink to its PCI address.

    Returns an empty dict when the sysfs tree is unavailable (host
    without DRM, or container missing the bind mount) instead of
    letting os.listdir raise and kill the whole scrape.
    """
    drm_path = "/sys/class/drm"
    pci_map = {}
    if not os.path.isdir(drm_path):
        return pci_map
    for entry in os.listdir(drm_path):
        if entry.startswith("card") and "-" not in entry:
            device_path = os.path.realpath(os.path.join(drm_path, entry, "device"))
            pci_id = os.path.basename(device_path)
            pci_map[pci_id] = entry
    return pci_map
|
||||
|
||||
def get_power_usage_watts(drm_card):
    """Read the card's hwmon power1_input (microwatts) and return watts.

    Returns None when the hwmon tree or the power file is absent, or on
    any read/parse failure.
    """
    hwmon_root = f"/sys/class/drm/{drm_card}/device/hwmon"
    if not os.path.exists(hwmon_root):
        return None
    try:
        entries = os.listdir(hwmon_root)
        if entries:
            # Kernel exposes a single hwmonN subdir per device.
            power_file = os.path.join(hwmon_root, entries[0], "power1_input")
            if os.path.exists(power_file):
                with open(power_file) as fh:
                    # Sysfs reports microwatts; convert to watts.
                    return int(fh.read().strip()) / 1_000_000
    except Exception:
        return None
    return None
|
||||
|
||||
def get_mem_info_mib(drm_card):
    """Collect VRAM counters (MiB) from sysfs for one DRM card.

    Returns {} when no counters could be read at all; otherwise a dict
    with memory_total_mib / memory_used_mib / mem_util_percent, where a
    missing counter yields None for that entry.
    """
    base_path = f"/sys/class/drm/{drm_card}/device"
    raw = {}
    try:
        for field in ("mem_info_vram_total", "mem_info_vram_used"):
            field_path = os.path.join(base_path, field)
            if os.path.exists(field_path):
                with open(field_path) as fh:
                    # Sysfs values are bytes; convert to MiB.
                    raw[field] = int(fh.read().strip()) / (1024 * 1024)
    except Exception as e:
        logging.warning(f"Could not read AMD memory info: {e}")

    if not raw:
        return {}

    total = raw.get("mem_info_vram_total")
    used = raw.get("mem_info_vram_used")
    utilization = used / total * 100 if used is not None and total is not None else None
    return {
        "memory_total_mib": total,
        "memory_used_mib": used,
        "mem_util_percent": utilization,
    }
|
||||
|
||||
def parse_clock(clock_str):
    """Convert a rocm-smi clock string such as "(800Mhz)" to int MHz.

    Returns None for empty/None input or anything unparseable.
    """
    if not clock_str:
        return None
    try:
        digits = clock_str.replace("(", "").replace("Mhz)", "")
        return int(digits)
    except Exception:
        return None
|
||||
|
||||
def extract_amd_gpu_info():
    """Build one metrics dict per AMD GPU from rocm-smi + sysfs.

    Combines rocm-smi JSON (temps, fan, utilization, clocks) with sysfs
    readings (power, VRAM) looked up via the card's PCI bus id.
    Returns [] when rocm-smi yields nothing.
    """
    def _num(value):
        # rocm-smi may emit non-numeric placeholders (e.g. "N/A");
        # fall back to 0.0 instead of crashing the scrape.
        try:
            return float(value)
        except Exception:
            return 0.0

    rocm_data = get_rocm_smi_data()
    pci_map = get_drm_cards_by_pci()

    gpu_list = []
    for card_id, data in rocm_data.items():
        # rocm-smi JSON also contains a "system" section; only "cardN"
        # entries describe GPUs.
        if not card_id.startswith("card"):
            continue

        # rocm-smi versions differ in key capitalization
        # ("Card Series" vs "Card series") — accept either.
        name = data.get("Card Series") or data.get("Card series")
        pci_id = data.get("PCI Bus")
        drm_card = pci_map.get(pci_id)

        mem_info = get_mem_info_mib(drm_card) if drm_card else {}

        gpu_list.append({
            "vendor": "amd",
            "gpu_id": card_id,
            "name": name,
            "pci_bus": pci_id,
            "gpu_temp_c": _num(data.get("Temperature (Sensor edge) (C)", 0)),
            "fan_speed_percent": _num(data.get("Fan speed (%)", 0)),
            "power_w": get_power_usage_watts(drm_card),
            "gpu_clock_mhz": parse_clock(data.get("sclk clock speed:")),
            "mem_clock_mhz": parse_clock(data.get("mclk clock speed:")),
            "gpu_util_percent": _num(data.get("GPU use (%)", 0)),
            "mem_util_percent": mem_info.get("mem_util_percent"),
            "memory_used_mib": mem_info.get("memory_used_mib"),
            "memory_total_mib": mem_info.get("memory_total_mib"),
        })

    return gpu_list
|
||||
|
||||
def extract_amd_processes():
    """List GPU compute processes from rocm-smi's "system" section.

    Entries look like "PID<pid>": "<name>, <card index>, <vram bytes>,
    <sdma>, <gpu util %>". Malformed or non-numeric entries are skipped
    instead of raising mid-scrape.
    """
    rocm_data = get_rocm_smi_data()
    processes = []

    for key, val in rocm_data.get("system", {}).items():
        if not key.startswith("PID"):
            continue
        parts = val.split(",")
        if len(parts) < 5:
            continue
        try:
            pid = int(key[3:])  # key is "PID<number>"
            mem_bytes = int(parts[2].strip())
            gpu_util = float(parts[4].strip())
        except ValueError:
            # e.g. "N/A" fields — skip rather than crash the exporter.
            continue

        gpu_id = f"card{parts[1].strip()}"
        card_data = rocm_data.get(gpu_id, {})
        processes.append({
            "vendor": "amd",
            "pid": pid,
            "name": parts[0].strip(),
            "gpu_id": gpu_id,
            # rocm-smi versions differ in key capitalization — accept either.
            "gpu_name": card_data.get("Card series") or card_data.get("Card Series", "Unknown"),
            "used_memory_mib": mem_bytes / (1024 * 1024),
            "gpu_util_percent": gpu_util,
        })

    return processes
|
||||
84
gpus/nvidia.py
Normal file
84
gpus/nvidia.py
Normal file
@@ -0,0 +1,84 @@
|
||||
import subprocess
|
||||
import csv
|
||||
import logging
|
||||
|
||||
def safe_float(value):
    """Return value converted to float, or None when conversion fails
    (e.g. nvidia-smi placeholders like "[N/A]", or None input)."""
    try:
        converted = float(value)
    except Exception:
        converted = None
    return converted
|
||||
|
||||
def extract_nvidia_gpu_info():
    """Query nvidia-smi for per-GPU telemetry.

    Returns a list of dicts (one per GPU) using the exporter's common
    metric keys, or [] when nvidia-smi is unavailable or fails.
    """
    query_fields = [
        "index", "uuid", "name", "temperature.gpu", "utilization.gpu",
        "utilization.memory", "pstate", "memory.total", "memory.used",
        "fan.speed", "power.draw", "clocks.current.graphics", "clocks.current.memory",
        "pci.bus_id"
    ]
    cmd = [
        "/usr/bin/nvidia-smi",
        f"--query-gpu={','.join(query_fields)}",
        "--format=csv,noheader,nounits"
    ]
    try:
        output = subprocess.check_output(cmd, encoding="utf-8")
    except Exception as e:
        logging.error(f"Error running nvidia-smi: {e}")
        return []

    # nvidia-smi separates fields with ", "; skipinitialspace drops the
    # leading blank so string fields (uuid, name, pci.bus_id) are clean.
    reader = csv.reader(output.strip().split("\n"), skipinitialspace=True)
    gpu_list = []

    for row in reader:
        # Guard against blank output / malformed lines, which would
        # otherwise raise KeyError on the lookups below.
        if len(row) != len(query_fields):
            continue
        values = dict(zip(query_fields, row))
        mem_total = safe_float(values["memory.total"])
        mem_used = safe_float(values["memory.used"])
        mem_util = (mem_used / mem_total * 100) if mem_used and mem_total else None

        gpu_info = {
            "vendor": "nvidia",
            "gpu_id": f'gpu{values["index"]}',
            "uuid": values["uuid"],
            "name": values["name"],
            "pci_bus": values["pci.bus_id"],
            "gpu_temp_c": safe_float(values["temperature.gpu"]),
            "fan_speed_percent": safe_float(values["fan.speed"]),
            "power_w": safe_float(values["power.draw"]),
            "gpu_clock_mhz": safe_float(values["clocks.current.graphics"]),
            "mem_clock_mhz": safe_float(values["clocks.current.memory"]),
            "gpu_util_percent": safe_float(values["utilization.gpu"]),
            "mem_util_percent": mem_util,
            "memory_used_mib": mem_used,
            "memory_total_mib": mem_total,
        }
        gpu_list.append(gpu_info)

    return gpu_list
|
||||
|
||||
def extract_nvidia_processes():
    """List compute processes reported by nvidia-smi.

    Returns [] when nvidia-smi is missing or fails. Lines whose pid or
    memory field is non-numeric (e.g. "[N/A]") are skipped instead of
    raising mid-scrape.
    """
    cmd = [
        "/usr/bin/nvidia-smi", "--query-compute-apps=pid,process_name,used_gpu_memory,gpu_uuid",
        "--format=csv,noheader,nounits"
    ]
    try:
        output = subprocess.check_output(cmd, encoding="utf-8")
    except Exception as e:
        logging.error(f"Error fetching NVIDIA processes: {e}")
        return []

    processes = []
    for line in output.strip().splitlines():
        # NOTE(review): a process name containing a comma would break
        # this split — assumed comma-free per nvidia-smi field.
        parts = [p.strip() for p in line.split(",")]
        if len(parts) != 4:
            continue
        pid, name, mem, uuid = parts
        try:
            pid_int = int(pid)
            mem_mib = float(mem)
        except ValueError:
            continue  # e.g. used_gpu_memory reported as "[N/A]"
        processes.append({
            "vendor": "nvidia",
            "pid": pid_int,
            "name": name,
            "gpu_id": uuid,
            "gpu_name": "N/A",  # Optionally map UUID to name
            "used_memory_mib": mem_mib,
            "gpu_util_percent": 0  # NVIDIA does not expose per-process utilization directly
        })

    return processes
|
||||
1
requirements.txt
Normal file
1
requirements.txt
Normal file
@@ -0,0 +1 @@
|
||||
# Prometheus Python client (Gauge, start_http_server)
prometheus_client==0.23.1
|
||||
Reference in New Issue
Block a user