276 lines
9.6 KiB
Python
276 lines
9.6 KiB
Python
"""
|
|
GPU Backend Manager — detects and manages compute backends.
|
|
|
|
Supports:
|
|
- CUDA via CuPy
|
|
- OpenCL via PyOpenCL (future)
|
|
- CPU via NumPy (always available)
|
|
|
|
Usage:
|
|
from app.services.gpu_backend import gpu_manager
|
|
xp = gpu_manager.get_array_module() # cupy or numpy
|
|
status = gpu_manager.get_status()
|
|
"""
|
|
|
|
import logging
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Optional

import numpy as np

# Module-level logger shared by everything in this module.
logger = logging.getLogger(__name__)
|
class GPUBackend(str, Enum):
    """Compute backend identifiers (str-valued so `.value` drops straight into API dicts)."""

    CUDA = "cuda"
    OPENCL = "opencl"
    CPU = "cpu"
|
|
|
|
|
|
@dataclass
class GPUDevice:
    """One detected compute device (CPU fallback, CUDA, or OpenCL)."""

    backend: GPUBackend
    # Device index *within its backend* (OpenCL indices count prior OpenCL devices only).
    index: int
    name: str
    # Total device memory in MiB; 0 for the CPU pseudo-device.
    memory_mb: int
    # Backend-specific details (e.g. CUDA runtime version, OpenCL platform name).
    extra: dict = field(default_factory=dict)
|
|
|
|
|
class GPUManager:
    """Singleton GPU manager with device detection and selection.

    Probes CUDA (via CuPy) and OpenCL (via PyOpenCL — detection only, no
    compute path yet) at construction time. A CPU/NumPy device is always
    registered as a fallback.
    """

    def __init__(self):
        # All detected devices; the CPU pseudo-device is always first.
        self._devices: list[GPUDevice] = []
        self._active_backend: GPUBackend = GPUBackend.CPU
        self._active_device: Optional[GPUDevice] = None
        # Cached cupy module when CUDA is usable, else None.
        self._cupy = None
        self._detect_devices()

    # ------------------------------------------------------------------ #
    # Detection                                                          #
    # ------------------------------------------------------------------ #

    def _detect_devices(self) -> None:
        """Probe available GPU backends and auto-select the active one."""
        # CPU is always available and serves as the fallback device.
        cpu_device = GPUDevice(
            backend=GPUBackend.CPU,
            index=0,
            name="CPU (NumPy)",
            memory_mb=0,
        )
        self._devices.append(cpu_device)

        self._probe_cuda()
        self._probe_opencl()
        self._select_backend(cpu_device)

    def _probe_cuda(self) -> None:
        """Register CUDA devices via CuPy; degrade gracefully when unavailable."""
        try:
            import cupy as cp
            device_count = cp.cuda.runtime.getDeviceCount()
            for i in range(device_count):
                props = cp.cuda.runtime.getDeviceProperties(i)
                name = props["name"]
                # Some CuPy versions return the device name as bytes.
                if isinstance(name, bytes):
                    name = name.decode()
                mem_mb = props["totalGlobalMem"] // (1024 * 1024)
                cuda_ver = cp.cuda.runtime.runtimeGetVersion()
                device = GPUDevice(
                    backend=GPUBackend.CUDA,
                    index=i,
                    name=str(name),
                    memory_mb=mem_mb,
                    extra={"cuda_version": cuda_ver},
                )
                self._devices.append(device)
                logger.info(f"[GPU] CUDA device {i}: {name} ({mem_mb} MB)")
            if device_count > 0:
                self._cupy = cp
        except ImportError:
            logger.info("[GPU] CuPy not installed — CUDA unavailable")
        except Exception as e:
            # CuPy can be installed with a broken driver/runtime; keep going on CPU.
            logger.warning(f"[GPU] CuPy probe error: {e}")

    def _probe_opencl(self) -> None:
        """Register OpenCL devices (future — detection only, never auto-selected)."""
        try:
            import pyopencl as cl
            platforms = cl.get_platforms()
            for plat in platforms:
                for dev in plat.get_devices():
                    mem_mb = dev.global_mem_size // (1024 * 1024)
                    device = GPUDevice(
                        backend=GPUBackend.OPENCL,
                        # Index counts previously registered OpenCL devices only.
                        index=len([d for d in self._devices if d.backend == GPUBackend.OPENCL]),
                        name=dev.name.strip(),
                        memory_mb=mem_mb,
                        extra={"platform": plat.name.strip()},
                    )
                    self._devices.append(device)
                    logger.info(f"[GPU] OpenCL device: {device.name} ({mem_mb} MB)")
        except ImportError:
            pass
        except Exception as e:
            logger.debug(f"[GPU] OpenCL probe error: {e}")

    def _select_backend(self, cpu_device: GPUDevice) -> None:
        """Auto-select the active backend: first CUDA device if any, else CPU.

        NOTE: OpenCL devices are reported but never auto-selected, because
        get_array_module() has no OpenCL compute path yet.
        """
        cuda_devices = [d for d in self._devices if d.backend == GPUBackend.CUDA]
        if cuda_devices:
            self._active_backend = GPUBackend.CUDA
            self._active_device = cuda_devices[0]
            logger.info(f"[GPU] Active backend: CUDA — {self._active_device.name}")
        else:
            self._active_backend = GPUBackend.CPU
            self._active_device = cpu_device
            logger.info("[GPU] Active backend: CPU (NumPy)")

    # ------------------------------------------------------------------ #
    # Compute API                                                        #
    # ------------------------------------------------------------------ #

    @property
    def gpu_available(self) -> bool:
        """True when the active backend is a real GPU (not the CPU fallback)."""
        return self._active_backend != GPUBackend.CPU

    def get_array_module(self) -> Any:
        """Return cupy (if CUDA active) or numpy."""
        if self._active_backend == GPUBackend.CUDA and self._cupy is not None:
            return self._cupy
        return np

    def to_cpu(self, arr: Any) -> np.ndarray:
        """Transfer an array to host memory as a numpy array.

        Fix: only call ``.get()`` on actual CuPy arrays. The previous
        ``hasattr(arr, 'get')`` check also matched unrelated objects such as
        dicts, which raised TypeError on ``dict.get()``.
        """
        if self._cupy is not None and isinstance(arr, self._cupy.ndarray):
            return arr.get()
        return np.asarray(arr)

    # ------------------------------------------------------------------ #
    # Status / API serialization                                         #
    # ------------------------------------------------------------------ #

    @staticmethod
    def _device_to_dict(d: GPUDevice) -> dict:
        """Serialize one device for the API (shared by get_status/get_devices)."""
        return {
            "backend": d.backend.value,
            "index": d.index,
            "name": d.name,
            "memory_mb": d.memory_mb,
        }

    def get_status(self) -> dict:
        """Full status dict for API."""
        return {
            "active_backend": self._active_backend.value,
            "active_device": (
                self._device_to_dict(self._active_device)
                if self._active_device else None
            ),
            "gpu_available": self.gpu_available,
            "available_devices": [self._device_to_dict(d) for d in self._devices],
        }

    def get_devices(self) -> list[dict]:
        """Device list for API."""
        return [self._device_to_dict(d) for d in self._devices]

    def get_diagnostics(self) -> dict:
        """Full diagnostic info for troubleshooting GPU detection.

        Collected best-effort: each probe records an ``error`` key instead of
        raising, so this is always safe to call from an API endpoint.
        """
        import sys
        import platform
        import subprocess

        # WSL detection drives the pip install hints below.
        is_wsl = "microsoft" in platform.release().lower()

        diag = {
            "python_version": sys.version,
            "python_executable": sys.executable,
            "platform": platform.platform(),
            "is_wsl": is_wsl,
            "numpy": {"version": np.__version__},
            "cuda": {},
            "opencl": {},
            "nvidia_smi": None,
            "detected_devices": len(self._devices),
            "active_backend": self._active_backend.value,
        }

        # Check nvidia-smi (works even without CuPy).
        try:
            result = subprocess.run(
                ["nvidia-smi", "--query-gpu=name,memory.total,driver_version", "--format=csv,noheader"],
                capture_output=True, text=True, timeout=5
            )
            if result.returncode == 0 and result.stdout.strip():
                diag["nvidia_smi"] = result.stdout.strip()
        except Exception:
            diag["nvidia_smi"] = "not found or error"

        # Check CuPy/CUDA.
        try:
            import cupy as cp
            diag["cuda"]["cupy_version"] = cp.__version__
            diag["cuda"]["cuda_runtime_version"] = cp.cuda.runtime.runtimeGetVersion()
            diag["cuda"]["device_count"] = cp.cuda.runtime.getDeviceCount()
            for i in range(diag["cuda"]["device_count"]):
                props = cp.cuda.runtime.getDeviceProperties(i)
                name = props["name"]
                if isinstance(name, bytes):
                    name = name.decode()
                diag["cuda"][f"device_{i}"] = {
                    "name": str(name),
                    "memory_mb": props["totalGlobalMem"] // (1024 * 1024),
                    "compute_capability": f"{props['major']}.{props['minor']}",
                }
        except ImportError:
            diag["cuda"]["error"] = "CuPy not installed"
            if is_wsl:
                diag["cuda"]["install_hint"] = "pip3 install cupy-cuda12x --break-system-packages"
            else:
                diag["cuda"]["install_hint"] = "pip install cupy-cuda12x"
        except Exception as e:
            diag["cuda"]["error"] = str(e)

        # Check PyOpenCL.
        try:
            import pyopencl as cl
            diag["opencl"]["pyopencl_version"] = cl.VERSION_TEXT
            diag["opencl"]["platforms"] = []
            for p in cl.get_platforms():
                platform_info = {"name": p.name.strip(), "devices": []}
                for d in p.get_devices():
                    platform_info["devices"].append({
                        "name": d.name.strip(),
                        "type": cl.device_type.to_string(d.type),
                        "memory_mb": d.global_mem_size // (1024 * 1024),
                        "compute_units": d.max_compute_units,
                    })
                diag["opencl"]["platforms"].append(platform_info)
        except ImportError:
            diag["opencl"]["error"] = "PyOpenCL not installed"
            if is_wsl:
                diag["opencl"]["install_hint"] = "pip3 install pyopencl --break-system-packages"
            else:
                diag["opencl"]["install_hint"] = "pip install pyopencl"
        except Exception as e:
            diag["opencl"]["error"] = str(e)

        return diag

    # ------------------------------------------------------------------ #
    # Device selection                                                   #
    # ------------------------------------------------------------------ #

    def set_device(self, backend: str, index: int = 0) -> dict:
        """Switch active compute device.

        Args:
            backend: one of the GPUBackend values ("cuda", "opencl", "cpu").
            index: per-backend device index.

        Returns:
            {"backend": ..., "device": ...} describing the new active device.

        Raises:
            ValueError: if ``backend`` is not a valid GPUBackend value, or no
                device matches (backend, index).
        """
        target_backend = GPUBackend(backend)
        candidates = [d for d in self._devices
                      if d.backend == target_backend and d.index == index]
        if not candidates:
            raise ValueError(f"No device found: backend={backend}, index={index}")

        self._active_device = candidates[0]
        self._active_backend = target_backend

        # Bind the CUDA context to the requested device for subsequent cupy calls.
        if target_backend == GPUBackend.CUDA and self._cupy is not None:
            self._cupy.cuda.Device(index).use()

        logger.info(f"[GPU] Switched to: {self._active_device.name} ({target_backend.value})")
        return {
            "backend": self._active_backend.value,
            "device": self._active_device.name,
        }
|
|
|
|
|
|
# Module-level singleton: device probing runs once here, at import time.
gpu_manager = GPUManager()
|