First commit

This commit is contained in:
l-nmch
2025-11-10 23:24:42 +01:00
commit 46be4bb4b1
7 changed files with 356 additions and 0 deletions

138
gpus/amd.py Normal file
View File

@@ -0,0 +1,138 @@
import json
import os
import subprocess
import logging
def get_rocm_smi_data(timeout=30):
    """Run ``rocm-smi --showallinfo --json`` and return its parsed output.

    Args:
        timeout: Seconds to wait for ``rocm-smi`` before giving up. Pass
            ``None`` to wait indefinitely.

    Returns:
        The decoded JSON object as a dict. An empty dict is returned (and the
        problem is logged) when the command cannot be started, exits with a
        non-zero status, times out, or prints something that is not a JSON
        object.
    """
    try:
        result = subprocess.run(
            ["/usr/bin/rocm-smi", "--showallinfo", "--json"],
            check=True,
            stdout=subprocess.PIPE,
            text=True,
            timeout=timeout,
        )
        data = json.loads(result.stdout)
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError: binary missing / not executable.
        # SubprocessError: non-zero exit status or timeout.
        # ValueError: undecodable or invalid JSON output.
        logging.error(f"Error running rocm-smi: {e}")
        return {}

    # Callers iterate with .items()/.get(); anything but a dict would crash them.
    if not isinstance(data, dict):
        logging.error(
            "Error running rocm-smi: expected a JSON object, got %s",
            type(data).__name__,
        )
        return {}
    return data
def get_drm_cards_by_pci(drm_path="/sys/class/drm"):
    """Map each DRM card's device directory name to its ``cardN`` entry.

    For every ``cardN`` entry under *drm_path*, the ``device`` symlink is
    resolved and the basename of the target (the PCI address for PCI
    devices) becomes the key.

    Args:
        drm_path: Directory holding the DRM class entries.

    Returns:
        A dict such as ``{"0000:03:00.0": "card0"}``. It is empty when
        *drm_path* cannot be listed (for example on hosts without DRM).
    """
    pci_map = {}
    try:
        entries = os.listdir(drm_path)
    except OSError as e:
        logging.warning("Could not list DRM cards in %s: %s", drm_path, e)
        return pci_map

    # Sorted so the result does not depend on directory listing order.
    for entry in sorted(entries):
        # Keep plain "cardN" nodes only; names containing "-" are
        # connector nodes such as "card0-DP-1".
        if not entry.startswith("card") or "-" in entry:
            continue
        device_path = os.path.realpath(os.path.join(drm_path, entry, "device"))
        pci_map[os.path.basename(device_path)] = entry
    return pci_map
def get_power_usage_watts(drm_card, drm_path="/sys/class/drm"):
    """Read the current power draw of a DRM card from its hwmon sensors.

    Every hwmon directory of the card is checked. ``power1_input`` is
    preferred; ``power1_average`` is used when the former is absent or
    unreadable. The raw sensor value is divided by 1,000,000 (microwatts to
    watts).

    Args:
        drm_card: DRM entry name such as ``"card0"``; falsy values yield
            ``None``.
        drm_path: Directory holding the DRM class entries.

    Returns:
        Power in watts as a float, or ``None`` when no sensor could be read.
    """
    if not drm_card:
        return None

    hwmon_path = os.path.join(drm_path, drm_card, "device", "hwmon")
    try:
        # Sorted so the chosen sensor does not depend on listing order.
        hwmon_dirs = sorted(os.listdir(hwmon_path))
    except OSError:
        return None

    for hwmon_dir in hwmon_dirs:
        for sensor in ("power1_input", "power1_average"):
            sensor_path = os.path.join(hwmon_path, hwmon_dir, sensor)
            try:
                with open(sensor_path) as f:
                    return int(f.read().strip()) / 1_000_000
            except (OSError, ValueError):
                # Missing, unreadable or non-numeric sensor: try the next one.
                continue
    return None
def get_mem_info_mib(drm_card, drm_path="/sys/class/drm"):
    """Read VRAM totals for a DRM card and convert them to MiB.

    The values come from ``mem_info_vram_total`` and ``mem_info_vram_used``
    in the card's ``device`` directory; each raw value is divided by
    1024 * 1024.

    Args:
        drm_card: DRM entry name such as ``"card0"``.
        drm_path: Directory holding the DRM class entries.

    Returns:
        An empty dict when neither file could be read. Otherwise a dict with
        ``memory_total_mib``, ``memory_used_mib`` and ``mem_util_percent``;
        a value is ``None`` when it is unavailable, and ``mem_util_percent``
        is also ``None`` when the reported total is zero.
    """
    base_path = os.path.join(drm_path, str(drm_card), "device")
    result = {}
    for field in ("mem_info_vram_total", "mem_info_vram_used"):
        try:
            with open(os.path.join(base_path, field)) as f:
                result[field] = int(f.read().strip()) / (1024 * 1024)
        except FileNotFoundError:
            # The attribute is simply not exposed for this device.
            continue
        except (OSError, ValueError) as e:
            # Keep going so one bad file does not hide the other value.
            logging.warning(f"Could not read AMD memory info: {e}")

    if not result:
        return {}

    total = result.get("mem_info_vram_total")
    used = result.get("mem_info_vram_used")
    # A zero (or missing) total would make the ratio undefined.
    mem_util = used / total * 100 if used is not None and total else None
    return {
        "memory_total_mib": total,
        "memory_used_mib": used,
        "mem_util_percent": mem_util,
    }
def parse_clock(clock_str):
    """Extract the integer MHz value from a clock string such as ``"(300Mhz)"``.

    Surrounding whitespace and parentheses are optional, and the unit
    suffix is matched case-insensitively (``Mhz``, ``MHz``, ``mhz``) or may
    be absent altogether.

    Args:
        clock_str: The raw clock value; falsy values yield ``None``.

    Returns:
        The clock speed as an int, or ``None`` when no integer can be parsed.
    """
    if not clock_str:
        return None

    text = str(clock_str).strip().strip("()").strip()
    if text.lower().endswith("mhz"):
        text = text[:-3]
    try:
        return int(text.strip())
    except ValueError:
        return None
def _rocm_float(data, key):
    """Return ``data[key]`` as a float: 0.0 if absent, ``None`` if non-numeric."""
    try:
        return float(data.get(key, 0))
    except (TypeError, ValueError):
        return None


def extract_amd_gpu_info():
    """Collect one metrics dict per AMD GPU reported by rocm-smi.

    rocm-smi supplies name, PCI bus, temperature, fan speed, clocks and GPU
    utilisation. Power draw and VRAM figures are read through the DRM card
    whose PCI id matches the rocm-smi ``PCI Bus`` value; they are ``None``
    when no matching DRM card is found.

    Returns:
        A list of dicts, one per ``card*`` entry in the rocm-smi output. The
        list is empty when rocm-smi returned nothing.
    """
    rocm_data = get_rocm_smi_data()
    pci_map = get_drm_cards_by_pci()

    gpu_list = []
    for card_id, data in rocm_data.items():
        # Only "cardN" objects describe GPUs; other keys (e.g. "system") do not.
        if not card_id.startswith("card") or not isinstance(data, dict):
            continue

        pci_id = data.get("PCI Bus")
        drm_card = pci_map.get(pci_id)
        mem_info = get_mem_info_mib(drm_card) if drm_card else {}

        gpu_list.append({
            "vendor": "amd",
            "gpu_id": card_id,
            # This file reads the product name under two casings
            # ("Card Series" here, "Card series" in the process reader);
            # accept either so both agree.
            "name": data.get("Card Series") or data.get("Card series"),
            "pci_bus": pci_id,
            "gpu_temp_c": _rocm_float(data, "Temperature (Sensor edge) (C)"),
            "fan_speed_percent": _rocm_float(data, "Fan speed (%)"),
            "power_w": get_power_usage_watts(drm_card) if drm_card else None,
            "gpu_clock_mhz": parse_clock(data.get("sclk clock speed:")),
            "mem_clock_mhz": parse_clock(data.get("mclk clock speed:")),
            "gpu_util_percent": _rocm_float(data, "GPU use (%)"),
            "mem_util_percent": mem_info.get("mem_util_percent"),
            "memory_used_mib": mem_info.get("memory_used_mib"),
            "memory_total_mib": mem_info.get("memory_total_mib"),
        })
    return gpu_list
def extract_amd_processes():
    """List the GPU processes found in the rocm-smi ``system`` section.

    Each ``PID<n>`` key holds a comma-separated string; field 0 is used as
    the process name, field 1 as the card index, field 2 as the memory use
    in bytes and field 4 as the GPU utilisation. Entries with fewer than
    five fields are ignored, and entries whose numeric fields cannot be
    parsed are skipped with a warning instead of aborting the whole scan.

    Returns:
        A list of process dicts; empty when rocm-smi reported none.
    """
    rocm_data = get_rocm_smi_data()
    system = rocm_data.get("system", {})
    if not isinstance(system, dict):
        return []

    processes = []
    for key, val in system.items():
        if not key.startswith("PID") or not isinstance(val, str):
            continue
        parts = [part.strip() for part in val.split(",")]
        if len(parts) < 5:
            continue

        try:
            pid = int(key[3:])
            mem_bytes = int(parts[2])
            gpu_util = float(parts[4])
        except ValueError:
            logging.warning(
                "Skipping unparsable rocm-smi process entry %s: %r", key, val
            )
            continue

        # NOTE(review): field 1 is treated as a card index; confirm against
        # the rocm-smi process table that it is not a GPU count.
        gpu_id = f"card{parts[1]}"
        gpu_data = rocm_data.get(gpu_id)
        if not isinstance(gpu_data, dict):
            gpu_data = {}

        processes.append({
            "vendor": "amd",
            "pid": pid,
            "name": parts[0],
            "gpu_id": gpu_id,
            # This file reads the product name under two casings
            # ("Card series" here, "Card Series" in the GPU reader);
            # accept either so both agree.
            "gpu_name": (
                gpu_data.get("Card series")
                or gpu_data.get("Card Series")
                or "Unknown"
            ),
            "used_memory_mib": mem_bytes / (1024 * 1024),
            "gpu_util_percent": gpu_util,
        })
    return processes

84
gpus/nvidia.py Normal file
View File

@@ -0,0 +1,84 @@
import subprocess
import csv
import logging
def safe_float(value):
    """Convert *value* to a float, yielding ``None`` if conversion fails."""
    try:
        number = float(value)
    except Exception:
        # Any failure (bad text, None, unsupported type) means "no value".
        number = None
    return number
def extract_nvidia_gpu_info(timeout=30):
    """Collect one metrics dict per NVIDIA GPU via ``nvidia-smi --query-gpu``.

    Args:
        timeout: Seconds to wait for ``nvidia-smi`` before giving up. Pass
            ``None`` to wait indefinitely.

    Returns:
        A list of dicts, one per well-formed CSV row. Numeric fields that
        nvidia-smi reports as non-numeric text become ``None``. The list is
        empty when the command fails, times out or prints no usable rows.
    """
    query_fields = [
        "index", "uuid", "name", "temperature.gpu", "utilization.gpu",
        "utilization.memory", "pstate", "memory.total", "memory.used",
        "fan.speed", "power.draw", "clocks.current.graphics",
        "clocks.current.memory", "pci.bus_id",
    ]
    cmd = [
        "/usr/bin/nvidia-smi",
        f"--query-gpu={','.join(query_fields)}",
        "--format=csv,noheader,nounits",
    ]
    try:
        output = subprocess.check_output(cmd, encoding="utf-8", timeout=timeout)
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        logging.error(f"Error running nvidia-smi: {e}")
        return []

    # Fields are separated by ", "; without skipinitialspace every text
    # field after the first would keep a leading blank.
    reader = csv.reader(output.strip().splitlines(), skipinitialspace=True)

    gpu_list = []
    for row in reader:
        # Blank output yields an empty row; short rows cannot be mapped.
        if len(row) != len(query_fields):
            continue
        values = dict(zip(query_fields, (cell.strip() for cell in row)))

        mem_total = safe_float(values["memory.total"])
        mem_used = safe_float(values["memory.used"])
        # 0 MiB used is a valid reading (0 %), only a missing value or a
        # zero/missing total makes the ratio undefined.
        mem_util = (
            mem_used / mem_total * 100
            if mem_used is not None and mem_total
            else None
        )

        gpu_list.append({
            "vendor": "nvidia",
            "gpu_id": f'gpu{values["index"]}',
            "uuid": values["uuid"],
            "name": values["name"],
            "pci_bus": values["pci.bus_id"],
            "gpu_temp_c": safe_float(values["temperature.gpu"]),
            "fan_speed_percent": safe_float(values["fan.speed"]),
            "power_w": safe_float(values["power.draw"]),
            "gpu_clock_mhz": safe_float(values["clocks.current.graphics"]),
            "mem_clock_mhz": safe_float(values["clocks.current.memory"]),
            "gpu_util_percent": safe_float(values["utilization.gpu"]),
            "mem_util_percent": mem_util,
            "memory_used_mib": mem_used,
            "memory_total_mib": mem_total,
        })
    return gpu_list
def extract_nvidia_processes(timeout=30):
    """List compute processes via ``nvidia-smi --query-compute-apps``.

    Args:
        timeout: Seconds to wait for ``nvidia-smi`` before giving up. Pass
            ``None`` to wait indefinitely.

    Returns:
        A list of process dicts. ``gpu_id`` holds the GPU UUID,
        ``used_memory_mib`` is ``None`` when the reported value is not
        numeric, and ``gpu_name`` / ``gpu_util_percent`` are placeholders.
        The list is empty when the command fails or times out.
    """
    try:
        output = subprocess.check_output([
            "/usr/bin/nvidia-smi",
            "--query-compute-apps=pid,process_name,used_gpu_memory,gpu_uuid",
            "--format=csv,noheader,nounits",
        ], encoding="utf-8", timeout=timeout)
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        logging.error(f"Error fetching NVIDIA processes: {e}")
        return []

    processes = []
    for line in output.strip().splitlines():
        # Take the pid from the left and memory/uuid from the right so a
        # process name that itself contains commas stays intact.
        pid_text, sep, rest = line.partition(",")
        tail = rest.rsplit(",", 2)
        if not sep or len(tail) != 3:
            continue
        name, mem_text, uuid = (part.strip() for part in tail)

        try:
            pid = int(pid_text.strip())
        except ValueError:
            continue  # not a data row
        try:
            used_mib = float(mem_text)
        except ValueError:
            used_mib = None  # non-numeric placeholder instead of a number

        processes.append({
            "vendor": "nvidia",
            "pid": pid,
            "name": name,
            "gpu_id": uuid,
            "gpu_name": "N/A",  # Optionally map UUID to name
            "used_memory_mib": used_mib,
            # Per-process utilisation is not part of this query.
            "gpu_util_percent": 0,
        })
    return processes