First commit
This commit is contained in:
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
.venv/
|
||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
.vscode/
|
||||||
38
Dockerfile
Normal file
38
Dockerfile
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
# Container image for the GPU Prometheus exporter (runs app.py).
FROM python:3.11-slim

# Name and numeric id of the unprivileged runtime user; overridable at build time.
ARG USER=app
ARG UID=1000

# APP_HOME is the working directory that receives the application files.
# PORT feeds EXPOSE and HEALTHCHECK below.
# NOTE(review): confirm app.py listens on this same port, otherwise the
# health check probes the wrong port.
ENV APP_HOME=/app \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PORT=8000

# System packages. curl is used by the HEALTHCHECK below.
# NOTE(review): requirements.txt lists only prometheus_client; confirm whether
# the compiler toolchain and the -dev headers are still needed.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        gcc \
        libffi-dev \
        libssl-dev \
        ca-certificates \
        curl \
    && rm -rf /var/lib/apt/lists/*

# Create the unprivileged user the container switches to further down.
RUN useradd --create-home --uid ${UID} ${USER}

WORKDIR ${APP_HOME}

# Copy requirements.txt on its own first so the dependency layer can be reused
# from the build cache when only application code changes.
COPY --chown=${USER}:${USER} requirements.txt ${APP_HOME}/
RUN pip install --upgrade pip setuptools wheel \
    && pip install --no-cache-dir -r requirements.txt

# Copy the rest of the build context (application code).
COPY --chown=${USER}:${USER} . ${APP_HOME}/

# Drop root privileges for everything that runs from here on.
USER ${USER}

EXPOSE ${PORT}

# Mark the container unhealthy when the /metrics endpoint stops answering.
HEALTHCHECK --interval=30s --timeout=3s --start-period=10s --retries=3 \
    CMD curl -f http://localhost:${PORT}/metrics || exit 1

CMD ["python", "-u", "app.py"]
|
||||||
80
app.py
Normal file
80
app.py
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
from prometheus_client import Gauge, start_http_server
|
||||||
|
|
||||||
|
from gpus import amd
|
||||||
|
from gpus import nvidia
|
||||||
|
|
||||||
|
def setup_metrics() -> Dict[str, Gauge]:
    """Create one Prometheus gauge per exported GPU measurement.

    Returns:
        A dict mapping the field name used in the GPU info dicts (for
        example ``"power_w"``) to the ``Gauge`` that exports that field.
        Every gauge carries the labels ``vendor``, ``gpu_id``, ``name`` and
        ``pci_bus``.
    """
    label_names = ["vendor", "gpu_id", "name", "pci_bus"]

    # (GPU info field, Prometheus metric name, help text)
    metric_specs = (
        ("power_w", "gpu_power_watts", "Power consumption in watts"),
        ("gpu_temp_c", "gpu_temperature_celsius", "GPU temperature in Celsius"),
        ("gpu_clock_mhz", "gpu_clock_mhz", "GPU core clock speed in MHz"),
        ("mem_clock_mhz", "gpu_memory_clock_mhz", "GPU memory clock speed in MHz"),
        ("fan_speed_percent", "gpu_fan_speed_percent", "Fan speed percentage"),
        ("gpu_util_percent", "gpu_utilization_percent", "GPU utilization percent"),
        ("mem_util_percent", "gpu_memory_utilization_percent", "GPU memory utilization percent"),
        ("memory_used_mib", "gpu_memory_used_mib", "Used memory in MiB"),
        ("memory_total_mib", "gpu_memory_total_mib", "Total memory in MiB"),
    )

    return {
        field: Gauge(metric_name, help_text, label_names)
        for field, metric_name, help_text in metric_specs
    }
|
||||||
|
|
||||||
|
# Presence gauge for GPU processes: each label combination describes one
# process and is set to 1 while that process is reported by the collectors.
# NOTE(review): used_memory_mib and gpu_util_percent are exported as label
# values, so every change in those numbers creates a new time series.
gpu_process_info = Gauge(
    name="gpu_process_info",
    documentation="GPU process usage (1 = present). Labels contain metadata.",
    labelnames=[
        "vendor",
        "gpu_id",
        "gpu_name",
        "pid",
        "proc_name",
        "used_memory_mib",
        "gpu_util_percent",
    ],
)
|
||||||
|
|
||||||
|
def update_metrics(gauges: Dict[str, Gauge]) -> None:
    """Collect GPU and process data from all vendors and publish it.

    Args:
        gauges: Mapping of GPU info field name to the gauge exporting it, as
            returned by ``setup_metrics``.

    Per-GPU fields that are missing or ``None`` are exported as 0. Label
    values that are missing or ``None`` are exported as ``"unknown"``.
    """
    all_gpus = amd.extract_amd_gpu_info() + nvidia.extract_nvidia_gpu_info()
    # Gather the process lists BEFORE clearing the process gauge: collection
    # shells out to vendor tools and can be slow, and a scrape arriving in
    # between would otherwise see no processes at all.
    all_procs = amd.extract_amd_processes() + nvidia.extract_nvidia_processes()

    # NOTE(review): per-GPU series are never removed here, so a GPU that
    # stops being reported keeps exporting its last values.
    for gpu in all_gpus:
        labels = {}
        for label_name in ("vendor", "gpu_id", "name", "pci_bus"):
            label_value = gpu.get(label_name)
            # dict.get's default only covers a missing key; a key that is
            # present with value None would otherwise become the label "None".
            labels[label_name] = "unknown" if label_value is None else label_value

        for key, gauge in gauges.items():
            value = gpu.get(key)
            gauge.labels(**labels).set(value if value is not None else 0)

    # Drop every previous process series so exited processes disappear.
    gpu_process_info.clear()

    for proc in all_procs:
        gpu_process_info.labels(
            vendor=proc["vendor"],
            gpu_id=proc["gpu_id"],
            gpu_name=proc["gpu_name"],
            pid=str(proc["pid"]),
            proc_name=proc["name"],
            used_memory_mib=str(proc.get("used_memory_mib", 0)),
            gpu_util_percent=str(proc.get("gpu_util_percent", 0)),
        ).set(1)
|
||||||
|
|
||||||
|
def main() -> None:
    """Start the metrics HTTP server and refresh the gauges every 10 seconds.

    The listening port is read from the ``PORT`` environment variable and
    falls back to 8000, so the port configured for the container and the
    port the exporter binds to cannot drift apart.

    Raises:
        ValueError: If ``PORT`` is set but is not an integer.
    """
    # Local import: the surrounding module does not import os at file level.
    import os

    logging.basicConfig(level=logging.INFO)
    port = int(os.environ.get("PORT", "8000"))
    start_http_server(port)
    logging.info(f"Starting GPU exporter on http://localhost:{port}/metrics")

    gauges = setup_metrics()

    try:
        while True:
            try:
                update_metrics(gauges)
            except Exception:
                # One failed collection cycle must not take the exporter
                # down; log the traceback and retry on the next cycle.
                logging.exception("GPU metrics collection failed; retrying next cycle")
            time.sleep(10)
    except KeyboardInterrupt:
        logging.info("Exporter stopped by user")


if __name__ == "__main__":
    main()
|
||||||
11
docker-compose.yml
Normal file
11
docker-compose.yml
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
# Runs the GPU Prometheus exporter built from the Dockerfile in this directory.
services:
  prometheus_exporter_gpu:
    build: .
    container_name: prometheus_exporter_gpu
    restart: unless-stopped
    ports:
      # host:container port of the /metrics endpoint
      - "8000:8000"
    volumes:
      # Read-only view of the host DRM sysfs tree, read by gpus/amd.py.
      - /sys/class/drm/:/sys/class/drm/:ro
      # Host vendor CLIs invoked by the collectors.
      # NOTE(review): only the executables are mounted; confirm the container
      # also has whatever shared libraries, runtimes and device nodes these
      # tools need in order to run.
      - /usr/bin/nvidia-smi:/usr/bin/nvidia-smi:ro
      - /usr/bin/rocm-smi:/usr/bin/rocm-smi:ro
|
||||||
138
gpus/amd.py
Normal file
138
gpus/amd.py
Normal file
@@ -0,0 +1,138 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import logging
|
||||||
|
|
||||||
|
def get_rocm_smi_data():
    """Run ``rocm-smi --showallinfo --json`` and return its parsed output.

    Returns:
        The decoded JSON object as a dict, or an empty dict when the tool is
        missing, fails, times out, or prints something that is not a JSON
        object. Errors are logged, never raised.
    """
    try:
        result = subprocess.run(
            ["/usr/bin/rocm-smi", "--showallinfo", "--json"],
            check=True,
            stdout=subprocess.PIPE,
            text=True,
            # Without a timeout a hung rocm-smi would block collection forever.
            timeout=30,
        )
        data = json.loads(result.stdout)
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError: binary missing or not executable.
        # SubprocessError: non-zero exit status or timeout.
        # ValueError: undecodable output or invalid JSON.
        logging.error(f"Error running rocm-smi: {e}")
        return {}

    if not isinstance(data, dict):
        # Callers iterate with .items() / .get(); anything else would crash them.
        logging.error("Error running rocm-smi: output is not a JSON object")
        return {}
    return data
|
||||||
|
|
||||||
|
def get_drm_cards_by_pci(drm_path="/sys/class/drm"):
    """Map PCI device ids to DRM card names.

    Args:
        drm_path: Directory holding the ``cardN`` entries. Defaults to the
            kernel's DRM sysfs directory.

    Returns:
        A dict such as ``{"0000:03:00.0": "card0"}``. The key is the last
        path component of the resolved ``<card>/device`` link. Returns an
        empty dict when ``drm_path`` cannot be listed.
    """
    try:
        entries = os.listdir(drm_path)
    except OSError as e:
        # Hosts or containers without the DRM sysfs tree must not crash the
        # collection; they simply have no cards to map.
        logging.warning(f"Could not list DRM cards in {drm_path}: {e}")
        return {}

    pci_map = {}
    for entry in entries:
        # Keep "cardN" only; names containing "-" (e.g. "card0-DP-1") are skipped.
        if not entry.startswith("card") or "-" in entry:
            continue
        device_path = os.path.realpath(os.path.join(drm_path, entry, "device"))
        pci_map[os.path.basename(device_path)] = entry
    return pci_map
|
||||||
|
|
||||||
|
def get_power_usage_watts(drm_card, drm_root="/sys/class/drm"):
    """Read the current power draw of a DRM card from its hwmon sensors.

    Args:
        drm_card: DRM card name such as ``"card0"``; falsy values yield None.
        drm_root: Directory holding the card entries. Defaults to the
            kernel's DRM sysfs directory.

    Returns:
        The sensor reading divided by 1,000,000 as a float (the hwmon value
        is treated as microwatts, giving watts), or None when no readable
        power sensor exists.
    """
    if not drm_card:
        return None

    hwmon_path = os.path.join(drm_root, drm_card, "device", "hwmon")
    try:
        # Sorted so the choice is deterministic when several hwmon dirs exist.
        hwmon_dirs = sorted(os.listdir(hwmon_path))
    except OSError:
        return None

    for hwmon_dir in hwmon_dirs:
        # Which power file exists depends on the driver and kernel version;
        # prefer power1_input and fall back to power1_average.
        for sensor in ("power1_input", "power1_average"):
            sensor_path = os.path.join(hwmon_path, hwmon_dir, sensor)
            try:
                with open(sensor_path) as f:
                    return int(f.read().strip()) / 1_000_000
            except (OSError, ValueError):
                # Missing, unreadable or non-numeric: try the next candidate.
                continue
    return None
|
||||||
|
|
||||||
|
def get_mem_info_mib(drm_card, drm_root="/sys/class/drm"):
    """Read VRAM totals for a DRM card from sysfs and convert them to MiB.

    Args:
        drm_card: DRM card name such as ``"card0"``; falsy values yield ``{}``.
        drm_root: Directory holding the card entries. Defaults to the
            kernel's DRM sysfs directory.

    Returns:
        An empty dict when neither ``mem_info_vram_total`` nor
        ``mem_info_vram_used`` could be read. Otherwise a dict with
        ``memory_total_mib``, ``memory_used_mib`` and ``mem_util_percent``;
        a value is None when it could not be determined.
    """
    if not drm_card:
        return {}

    base_path = os.path.join(drm_root, drm_card, "device")
    readings = {}
    for field in ("mem_info_vram_total", "mem_info_vram_used"):
        try:
            with open(os.path.join(base_path, field)) as f:
                # The file value is divided by 1024 * 1024 to get MiB.
                readings[field] = int(f.read().strip()) / (1024 * 1024)
        except FileNotFoundError:
            # The file is simply not exposed for this card.
            continue
        except (OSError, ValueError) as e:
            # Keep going so one bad file does not hide the other value.
            logging.warning(f"Could not read AMD memory info: {e}")

    if not readings:
        return {}

    total = readings.get("mem_info_vram_total")
    used = readings.get("mem_info_vram_used")
    # A missing or zero total would make the ratio meaningless (and divide
    # by zero), so utilisation is reported as unknown in that case.
    mem_util = used / total * 100 if used is not None and total else None

    return {
        "memory_total_mib": total,
        "memory_used_mib": used,
        "mem_util_percent": mem_util,
    }
|
||||||
|
|
||||||
|
def parse_clock(clock_str):
    """Convert a clock string such as ``"(1000Mhz)"`` to an integer.

    Returns None for empty input or when the text left after removing
    ``"("`` and ``"Mhz)"`` is not an integer.
    """
    if clock_str:
        try:
            without_paren = clock_str.replace("(", "")
            digits = without_paren.replace("Mhz)", "")
            return int(digits)
        except Exception:
            pass
    return None
|
||||||
|
|
||||||
|
def extract_amd_gpu_info(rocm_data=None):
    """Build one info dict per AMD GPU from rocm-smi output and sysfs.

    Args:
        rocm_data: Optional, already parsed rocm-smi JSON. When omitted,
            ``get_rocm_smi_data()`` is called. Passing it lets a caller
            reuse one rocm-smi snapshot for several extractions.

    Returns:
        A list of dicts with the keys ``vendor``, ``gpu_id``, ``name``,
        ``pci_bus``, ``gpu_temp_c``, ``fan_speed_percent``, ``power_w``,
        ``gpu_clock_mhz``, ``mem_clock_mhz``, ``gpu_util_percent``,
        ``mem_util_percent``, ``memory_used_mib`` and ``memory_total_mib``.
        Numeric values are None when they cannot be determined.
    """

    def _as_float(raw):
        # rocm-smi reports values as strings and may emit non-numeric text;
        # a bad value must not abort the whole collection.
        try:
            return float(raw)
        except (TypeError, ValueError):
            return None

    if rocm_data is None:
        rocm_data = get_rocm_smi_data()
    pci_map = get_drm_cards_by_pci()

    gpu_list = []
    for card_id, data in rocm_data.items():
        # Only "cardN" objects describe GPUs; skip other top-level keys.
        if not card_id.startswith("card") or not isinstance(data, dict):
            continue

        # The key is spelled "Card Series" here but "Card series" in
        # extract_amd_processes, so accept both spellings.
        name = data.get("Card Series", data.get("Card series"))
        pci_id = data.get("PCI Bus")
        drm_card = pci_map.get(pci_id)

        mem_info = get_mem_info_mib(drm_card) if drm_card else {}

        gpu_list.append({
            "vendor": "amd",
            "gpu_id": card_id,
            "name": name,
            "pci_bus": pci_id,
            "gpu_temp_c": _as_float(data.get("Temperature (Sensor edge) (C)", 0)),
            "fan_speed_percent": _as_float(data.get("Fan speed (%)", 0)),
            "power_w": get_power_usage_watts(drm_card),
            "gpu_clock_mhz": parse_clock(data.get("sclk clock speed:")),
            "mem_clock_mhz": parse_clock(data.get("mclk clock speed:")),
            "gpu_util_percent": _as_float(data.get("GPU use (%)", 0)),
            "mem_util_percent": mem_info.get("mem_util_percent"),
            "memory_used_mib": mem_info.get("memory_used_mib"),
            "memory_total_mib": mem_info.get("memory_total_mib"),
        })

    return gpu_list
|
||||||
|
|
||||||
|
def extract_amd_processes(rocm_data=None):
    """List processes reported in the ``system`` section of rocm-smi output.

    Args:
        rocm_data: Optional, already parsed rocm-smi JSON. When omitted,
            ``get_rocm_smi_data()`` is called. Passing it lets a caller
            reuse one rocm-smi snapshot for several extractions.

    Returns:
        A list of dicts with the keys ``vendor``, ``pid``, ``name``,
        ``gpu_id``, ``gpu_name``, ``used_memory_mib`` and
        ``gpu_util_percent``. Entries whose PID cannot be parsed or that
        have fewer than five comma-separated fields are skipped. Numeric
        fields that cannot be parsed are reported as 0.
    """

    def _number(raw, convert):
        # Non-numeric placeholders must not abort the collection.
        try:
            return convert(raw)
        except ValueError:
            return 0

    if rocm_data is None:
        rocm_data = get_rocm_smi_data()

    system = rocm_data.get("system", {})
    if not isinstance(system, dict):
        return []

    processes = []
    for key, val in system.items():
        if not key.startswith("PID") or not isinstance(val, str):
            continue
        try:
            pid = int(key[3:])
        except ValueError:
            logging.warning(f"Skipping rocm-smi process entry with bad PID: {key}")
            continue

        parts = [part.strip() for part in val.split(",")]
        if len(parts) < 5:
            continue

        # NOTE(review): the second field is used as the card index here;
        # confirm against rocm-smi's PID output format.
        gpu_id = f"card{parts[1]}"
        card = rocm_data.get(gpu_id, {})
        if not isinstance(card, dict):
            card = {}
        # extract_amd_gpu_info reads "Card Series" while this function read
        # "Card series"; accept both spellings.
        gpu_name = card.get("Card series", card.get("Card Series", "Unknown"))

        processes.append({
            "vendor": "amd",
            "pid": pid,
            "name": parts[0],
            "gpu_id": gpu_id,
            "gpu_name": gpu_name,
            # The third field is divided by 1024 * 1024 to get MiB.
            "used_memory_mib": _number(parts[2], int) / (1024 * 1024),
            "gpu_util_percent": float(_number(parts[4], float)),
        })

    return processes
|
||||||
84
gpus/nvidia.py
Normal file
84
gpus/nvidia.py
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
import subprocess
|
||||||
|
import csv
|
||||||
|
import logging
|
||||||
|
|
||||||
|
def safe_float(value):
    """Return ``float(value)``, or None when the conversion raises."""
    try:
        number = float(value)
    except Exception:
        number = None
    return number
|
||||||
|
|
||||||
|
def extract_nvidia_gpu_info():
    """Query nvidia-smi and build one info dict per NVIDIA GPU.

    Returns:
        A list of dicts with the keys ``vendor``, ``gpu_id``, ``uuid``,
        ``name``, ``pci_bus``, ``gpu_temp_c``, ``fan_speed_percent``,
        ``power_w``, ``gpu_clock_mhz``, ``mem_clock_mhz``,
        ``gpu_util_percent``, ``mem_util_percent``, ``memory_used_mib`` and
        ``memory_total_mib``. Numeric values are None when nvidia-smi does
        not report a number. Returns an empty list when nvidia-smi cannot be
        run; the error is logged.
    """
    query_fields = [
        "index", "uuid", "name", "temperature.gpu", "utilization.gpu",
        "utilization.memory", "pstate", "memory.total", "memory.used",
        "fan.speed", "power.draw", "clocks.current.graphics", "clocks.current.memory",
        "pci.bus_id",
    ]
    cmd = [
        "/usr/bin/nvidia-smi",
        f"--query-gpu={','.join(query_fields)}",
        "--format=csv,noheader,nounits",
    ]
    try:
        # Without a timeout a hung nvidia-smi would block collection forever.
        output = subprocess.check_output(cmd, encoding="utf-8", timeout=30)
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        logging.error(f"Error running nvidia-smi: {e}")
        return []

    # nvidia-smi separates cells with ", "; skipinitialspace keeps that
    # blank out of string values such as the uuid, name and PCI bus id.
    reader = csv.reader(output.strip().splitlines(), skipinitialspace=True)
    gpu_list = []

    for row in reader:
        if len(row) != len(query_fields):
            # Empty or malformed line: indexing by field name would raise.
            if row:
                logging.warning(f"Skipping unexpected nvidia-smi row: {row}")
            continue

        values = dict(zip(query_fields, (cell.strip() for cell in row)))
        mem_total = safe_float(values["memory.total"])
        mem_used = safe_float(values["memory.used"])
        # 0 MiB used is a valid reading (0 %); only a missing value or a
        # missing/zero total makes the ratio unknown.
        mem_util = (
            mem_used / mem_total * 100
            if mem_used is not None and mem_total
            else None
        )

        gpu_list.append({
            "vendor": "nvidia",
            "gpu_id": f'gpu{values["index"]}',
            "uuid": values["uuid"],
            "name": values["name"],
            "pci_bus": values["pci.bus_id"],
            "gpu_temp_c": safe_float(values["temperature.gpu"]),
            "fan_speed_percent": safe_float(values["fan.speed"]),
            "power_w": safe_float(values["power.draw"]),
            "gpu_clock_mhz": safe_float(values["clocks.current.graphics"]),
            "mem_clock_mhz": safe_float(values["clocks.current.memory"]),
            "gpu_util_percent": safe_float(values["utilization.gpu"]),
            "mem_util_percent": mem_util,
            "memory_used_mib": mem_used,
            "memory_total_mib": mem_total,
        })

    return gpu_list
|
||||||
|
|
||||||
|
def extract_nvidia_processes():
    """List compute processes reported by nvidia-smi.

    Returns:
        A list of dicts with the keys ``vendor``, ``pid``, ``name``,
        ``gpu_id`` (the GPU uuid), ``gpu_name`` (always ``"N/A"``),
        ``used_memory_mib`` and ``gpu_util_percent`` (always 0). Lines with
        an unparseable PID are skipped; memory that is not a number is
        reported as 0.0. Returns an empty list when nvidia-smi cannot be
        run; the error is logged.
    """
    try:
        output = subprocess.check_output([
            "/usr/bin/nvidia-smi", "--query-compute-apps=pid,process_name,used_gpu_memory,gpu_uuid",
            "--format=csv,noheader,nounits"
        ], encoding="utf-8", timeout=30)  # a hung nvidia-smi must not block forever
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        logging.error(f"Error fetching NVIDIA processes: {e}")
        return []

    processes = []
    for line in output.strip().splitlines():
        parts = line.split(",")
        if len(parts) < 4:
            continue

        # pid is the first cell, memory and uuid are the last two. Whatever
        # lies between is the process name, which may itself contain commas.
        pid_text = parts[0].strip()
        name = ",".join(parts[1:-2]).strip()
        mem_text = parts[-2].strip()
        uuid = parts[-1].strip()

        try:
            pid = int(pid_text)
        except ValueError:
            logging.warning(f"Skipping nvidia-smi process line with bad PID: {line}")
            continue

        # Memory may be reported as a non-numeric placeholder.
        used_memory = safe_float(mem_text)

        processes.append({
            "vendor": "nvidia",
            "pid": pid,
            "name": name,
            "gpu_id": uuid,
            "gpu_name": "N/A",  # Optionally map UUID to name
            "used_memory_mib": used_memory if used_memory is not None else 0.0,
            "gpu_util_percent": 0  # this query does not request per-process utilization
        })

    return processes
|
||||||
1
requirements.txt
Normal file
1
requirements.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
prometheus_client==0.23.1
|
||||||
Reference in New Issue
Block a user