commit 46be4bb4b1b926ce53ebb57a504e8c74bdfc590e
Author: l-nmch
Date:   Mon Nov 10 23:24:42 2025 +0100

    First commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5c0cb07
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.venv/
+__pycache__/
+*.pyc
+.vscode/
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..ed7ca5d
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,38 @@
+FROM python:3.11-slim
+
+ARG USER=app
+ARG UID=1000
+ENV APP_HOME=/app \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1 \
+    PORT=8000
+
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        build-essential \
+        gcc \
+        libffi-dev \
+        libssl-dev \
+        ca-certificates \
+        curl \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN useradd --create-home --uid ${UID} ${USER}
+
+WORKDIR ${APP_HOME}
+
+COPY --chown=${USER}:${USER} requirements.txt ${APP_HOME}/
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install --no-cache-dir -r requirements.txt
+
+COPY --chown=${USER}:${USER} . ${APP_HOME}/
+
+USER ${USER}
+
+EXPOSE ${PORT}
+
+HEALTHCHECK --interval=30s --timeout=3s --start-period=10s --retries=3 \
+    CMD curl -f http://localhost:${PORT}/metrics || exit 1
+
+CMD ["python", "-u", "app.py"]
\ No newline at end of file
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..886f14f
--- /dev/null
+++ b/app.py
@@ -0,0 +1,80 @@
+import logging
+import time
+from typing import Dict
+
+from prometheus_client import Gauge, start_http_server
+
+from gpus import amd
+from gpus import nvidia
+
+def setup_metrics() -> Dict[str, Gauge]:
+    labels = ["vendor", "gpu_id", "name", "pci_bus"]
+
+    gauges = {
+        "power_w": Gauge("gpu_power_watts", "Power consumption in watts", labels),
+        "gpu_temp_c": Gauge("gpu_temperature_celsius", "GPU temperature in Celsius", labels),
+        "gpu_clock_mhz": Gauge("gpu_clock_mhz", "GPU core clock speed in MHz", labels),
+        "mem_clock_mhz": Gauge("gpu_memory_clock_mhz", "GPU memory clock speed in MHz", labels),
+        "fan_speed_percent": Gauge("gpu_fan_speed_percent", "Fan speed percentage", labels),
+        "gpu_util_percent": Gauge("gpu_utilization_percent", "GPU utilization percent", labels),
+        "mem_util_percent": Gauge("gpu_memory_utilization_percent", "GPU memory utilization percent", labels),
+        "memory_used_mib": Gauge("gpu_memory_used_mib", "Used memory in MiB", labels),
+        "memory_total_mib": Gauge("gpu_memory_total_mib", "Total memory in MiB", labels),
+    }
+
+    return gauges
+
+# One gauge per process, updated per collection
+gpu_process_info = Gauge(
+    "gpu_process_info",
+    "GPU process usage (1 = present). Labels contain metadata.",
+    ["vendor", "gpu_id", "gpu_name", "pid", "proc_name", "used_memory_mib", "gpu_util_percent"]
+)
+
+def update_metrics(gauges: Dict[str, Gauge]) -> None:
+    all_gpus = amd.extract_amd_gpu_info() + nvidia.extract_nvidia_gpu_info()
+
+    for gpu in all_gpus:
+        labels = {
+            "vendor": gpu.get("vendor", "unknown"),
+            "gpu_id": gpu.get("gpu_id", "unknown"),
+            "name": gpu.get("name", "unknown"),
+            "pci_bus": gpu.get("pci_bus", "unknown"),
+        }
+
+        for key, gauge in gauges.items():
+            value = gpu.get(key)
+            gauge.labels(**labels).set(value if value is not None else 0)
+
+    # Clear all previous process metrics to avoid duplication
+    gpu_process_info.clear()
+
+    # Add fresh process info
+    for proc in amd.extract_amd_processes() + nvidia.extract_nvidia_processes():
+        gpu_process_info.labels(
+            vendor=proc["vendor"],
+            gpu_id=proc["gpu_id"],
+            gpu_name=proc["gpu_name"],
+            pid=str(proc["pid"]),
+            proc_name=proc["name"],
+            used_memory_mib=str(proc.get("used_memory_mib", 0)),
+            gpu_util_percent=str(proc.get("gpu_util_percent", 0)),
+        ).set(1)
+
+def main() -> None:
+    logging.basicConfig(level=logging.INFO)
+    port = 8000
+    start_http_server(port)
+    logging.info(f"Starting GPU exporter on http://localhost:{port}/metrics")
+
+    gauges = setup_metrics()
+
+    try:
+        while True:
+            update_metrics(gauges)
+            time.sleep(10)
+    except KeyboardInterrupt:
+        logging.info("Exporter stopped by user")
+
+if __name__ == "__main__":
+    main()
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..8d617d6
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,11 @@
+services:
+  prometheus_exporter_gpu:
+    build: .
+    container_name: prometheus_exporter_gpu
+    restart: unless-stopped
+    ports:
+      - "8000:8000"
+    volumes:
+      - /sys/class/drm/:/sys/class/drm/:ro
+      - /usr/bin/nvidia-smi:/usr/bin/nvidia-smi:ro
+      - /usr/bin/rocm-smi:/usr/bin/rocm-smi:ro
\ No newline at end of file
diff --git a/gpus/amd.py b/gpus/amd.py
new file mode 100644
index 0000000..cc8630c
--- /dev/null
+++ b/gpus/amd.py
@@ -0,0 +1,138 @@
+import json
+import os
+import subprocess
+import logging
+
+def get_rocm_smi_data():
+    try:
+        result = subprocess.run(
+            ["/usr/bin/rocm-smi", "--showallinfo", "--json"],
+            check=True,
+            stdout=subprocess.PIPE,
+            text=True
+        )
+        return json.loads(result.stdout)
+    except Exception as e:
+        logging.error(f"Error running rocm-smi: {e}")
+        return {}
+
+def get_drm_cards_by_pci():
+    pci_map = {}
+    drm_path = "/sys/class/drm"
+    for entry in os.listdir(drm_path):
+        if entry.startswith("card") and "-" not in entry:
+            device_path = os.path.realpath(os.path.join(drm_path, entry, "device"))
+            pci_id = os.path.basename(device_path)
+            pci_map[pci_id] = entry
+    return pci_map
+
+def get_power_usage_watts(drm_card):
+    hwmon_path = f"/sys/class/drm/{drm_card}/device/hwmon"
+    if not os.path.exists(hwmon_path):
+        return None
+    try:
+        hwmon_dirs = os.listdir(hwmon_path)
+        if not hwmon_dirs:
+            return None
+        power_path = os.path.join(hwmon_path, hwmon_dirs[0], "power1_input")
+        if os.path.exists(power_path):
+            with open(power_path) as f:
+                return int(f.read().strip()) / 1_000_000
+    except Exception:
+        return None
+    return None
+
+def get_mem_info_mib(drm_card):
+    base_path = f"/sys/class/drm/{drm_card}/device"
+    result = {}
+    try:
+        for field in ["mem_info_vram_total", "mem_info_vram_used"]:
+            path = os.path.join(base_path, field)
+            if os.path.exists(path):
+                with open(path) as f:
+                    result[field] = int(f.read().strip()) / (1024 * 1024)
+    except Exception as e:
+        logging.warning(f"Could not read AMD memory info: {e}")
+    return {
+        "memory_total_mib": result.get("mem_info_vram_total"),
+        "memory_used_mib": result.get("mem_info_vram_used"),
+        "mem_util_percent": (
+            result["mem_info_vram_used"] / result["mem_info_vram_total"] * 100
+            if "mem_info_vram_used" in result and "mem_info_vram_total" in result
+            else None
+        ),
+    } if result else {}
+
+def parse_clock(clock_str):
+    if not clock_str:
+        return None
+    try:
+        return int(clock_str.replace("(", "").replace("Mhz)", ""))
+    except Exception:
+        return None
+
+def extract_amd_gpu_info():
+    rocm_data = get_rocm_smi_data()
+    pci_map = get_drm_cards_by_pci()
+
+    gpu_list = []
+    for card_id, data in rocm_data.items():
+        if not card_id.startswith("card"):
+            continue
+
+        name = data.get("Card Series")
+        pci_id = data.get("PCI Bus")
+        drm_card = pci_map.get(pci_id)
+
+        clocks = {
+            "gpu_clock_mhz": parse_clock(data.get("sclk clock speed:")),
+            "mem_clock_mhz": parse_clock(data.get("mclk clock speed:")),
+        }
+
+        mem_info = get_mem_info_mib(drm_card) if drm_card else {}
+
+        gpu_info = {
+            "vendor": "amd",
+            "gpu_id": card_id,
+            "name": name,
+            "pci_bus": pci_id,
+            "gpu_temp_c": float(data.get("Temperature (Sensor edge) (C)", 0)),
+            "fan_speed_percent": float(data.get("Fan speed (%)", 0)),
+            "power_w": get_power_usage_watts(drm_card),
+            "gpu_clock_mhz": clocks["gpu_clock_mhz"],
+            "mem_clock_mhz": clocks["mem_clock_mhz"],
+            "gpu_util_percent": float(data.get("GPU use (%)", 0)),
+            "mem_util_percent": mem_info.get("mem_util_percent"),
+            "memory_used_mib": mem_info.get("memory_used_mib"),
+            "memory_total_mib": mem_info.get("memory_total_mib"),
+        }
+
+        gpu_list.append(gpu_info)
+
+    return gpu_list
+
+def extract_amd_processes():
+    rocm_data = get_rocm_smi_data()
+    processes = []
+
+    for key, val in rocm_data.get("system", {}).items():
+        if key.startswith("PID"):
+            pid = int(key[3:])
+            parts = val.split(",")
+            if len(parts) >= 5:
+                name = parts[0].strip()
+                gpu_id = f"card{parts[1].strip()}"
+                mem_bytes = int(parts[2].strip())
+                gpu_util = float(parts[4].strip())
+
+                processes.append({
+                    "vendor": "amd",
+                    "pid": pid,
+                    "name": name,
+                    "gpu_id": gpu_id,
+                    "gpu_name": rocm_data.get(gpu_id, {}).get("Card series", "Unknown"),
+                    "used_memory_mib": mem_bytes / (1024 * 1024),
+                    "gpu_util_percent": gpu_util,
+                })
+
+    return processes
diff --git a/gpus/nvidia.py b/gpus/nvidia.py
new file mode 100644
index 0000000..69c7c4f
--- /dev/null
+++ b/gpus/nvidia.py
@@ -0,0 +1,84 @@
+import subprocess
+import csv
+import logging
+
+def safe_float(value):
+    try:
+        return float(value)
+    except Exception:
+        return None
+
+def extract_nvidia_gpu_info():
+    query_fields = [
+        "index", "uuid", "name", "temperature.gpu", "utilization.gpu",
+        "utilization.memory", "pstate", "memory.total", "memory.used",
+        "fan.speed", "power.draw", "clocks.current.graphics", "clocks.current.memory",
+        "pci.bus_id"
+    ]
+    cmd = [
+        "/usr/bin/nvidia-smi",
+        f"--query-gpu={','.join(query_fields)}",
+        "--format=csv,noheader,nounits"
+    ]
+    try:
+        output = subprocess.check_output(cmd, encoding="utf-8")
+    except Exception as e:
+        logging.error(f"Error running nvidia-smi: {e}")
+        return []
+
+    reader = csv.reader(output.strip().split("\n"))
+    gpu_list = []
+
+    for row in reader:
+        values = dict(zip(query_fields, row))
+        mem_total = safe_float(values["memory.total"])
+        mem_used = safe_float(values["memory.used"])
+        mem_util = (mem_used / mem_total * 100) if mem_used and mem_total else None
+
+        gpu_info = {
+            "vendor": "nvidia",
+            "gpu_id": f'gpu{values["index"]}',
+            "uuid": values["uuid"],
+            "name": values["name"],
+            "pci_bus": values["pci.bus_id"],
+            "gpu_temp_c": safe_float(values["temperature.gpu"]),
+            "fan_speed_percent": safe_float(values["fan.speed"]),
+            "power_w": safe_float(values["power.draw"]),
+            "gpu_clock_mhz": safe_float(values["clocks.current.graphics"]),
+            "mem_clock_mhz": safe_float(values["clocks.current.memory"]),
+            "gpu_util_percent": safe_float(values["utilization.gpu"]),
+            "mem_util_percent": mem_util,
+            "memory_used_mib": mem_used,
+            "memory_total_mib": mem_total,
+        }
+        gpu_list.append(gpu_info)
+
+    return gpu_list
+
+def extract_nvidia_processes():
+    try:
+        output = subprocess.check_output([
+            "/usr/bin/nvidia-smi", "--query-compute-apps=pid,process_name,used_gpu_memory,gpu_uuid",
+            "--format=csv,noheader,nounits"
+        ], encoding="utf-8")
+    except Exception as e:
+        logging.error(f"Error fetching NVIDIA processes: {e}")
+        return []
+
+    processes = []
+    for line in output.strip().splitlines():
+        parts = [p.strip() for p in line.split(",")]
+        if len(parts) != 4:
+            continue
+        pid, name, mem, uuid = parts
+        processes.append({
+            "vendor": "nvidia",
+            "pid": int(pid),
+            "name": name,
+            "gpu_id": uuid,
+            "gpu_name": "N/A",  # Optionally map UUID to name
+            "used_memory_mib": float(mem),
+            "gpu_util_percent": 0  # NVIDIA does not expose per-process utilization directly
+        })
+
+    return processes
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c4df80e
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+prometheus_client==0.23.1