# Source: rfcp/backend/app/services/gpu_backend.py
# (253 lines, 8.7 KiB, Python)
"""
GPU Backend Manager — detects and manages compute backends.

Supports:
- CUDA via CuPy
- OpenCL via PyOpenCL (future)
- CPU via NumPy (always available)

Usage:
    from app.services.gpu_backend import gpu_manager

    xp = gpu_manager.get_array_module()  # cupy or numpy
    status = gpu_manager.get_status()
"""
import logging
from enum import Enum
from dataclasses import dataclass, field
from typing import Any, Optional
import numpy as np
logger = logging.getLogger(__name__)
class GPUBackend(str, Enum):
    """Identifier for a compute backend.

    Inherits from ``str`` so members compare and serialize as plain
    strings (convenient for API payloads, see ``.value`` usage below).
    """

    CUDA = "cuda"      # NVIDIA CUDA via CuPy
    OPENCL = "opencl"  # OpenCL via PyOpenCL (detection-only stub for now)
    CPU = "cpu"        # NumPy fallback, always available
@dataclass
class GPUDevice:
    """A single detected compute device."""

    backend: GPUBackend  # which backend exposes this device
    index: int           # per-backend device index
    name: str            # human-readable device name
    memory_mb: int       # total device memory in MiB (0 for the CPU entry)
    extra: dict[str, Any] = field(default_factory=dict)  # backend-specific details
class GPUManager:
    """Singleton GPU manager with device detection and selection.

    Detection runs once in ``__init__``: a CPU/NumPy entry is always
    registered, then CUDA (CuPy) and OpenCL (PyOpenCL) are probed.
    CUDA is auto-selected when present; OpenCL devices are reported
    but there is no OpenCL compute path yet.
    """

    def __init__(self) -> None:
        self._devices: list[GPUDevice] = []
        self._active_backend: GPUBackend = GPUBackend.CPU
        self._active_device: Optional[GPUDevice] = None
        # Bound to the cupy module when CUDA devices are found; else None.
        self._cupy = None
        self._detect_devices()

    @staticmethod
    def _device_dict(device: GPUDevice) -> dict:
        """Serialize a GPUDevice into the API-facing dict shape."""
        return {
            "backend": device.backend.value,
            "index": device.index,
            "name": device.name,
            "memory_mb": device.memory_mb,
        }

    def _detect_devices(self) -> None:
        """Probe available backends and auto-select the best one."""
        # CPU via NumPy is always available as a fallback.
        cpu_device = GPUDevice(
            backend=GPUBackend.CPU,
            index=0,
            name="CPU (NumPy)",
            memory_mb=0,
        )
        self._devices.append(cpu_device)
        self._probe_cuda()
        self._probe_opencl()
        # Auto-select: prefer CUDA, otherwise CPU. OpenCL devices are
        # detection-only for now, so they never become the active backend.
        cuda_devices = [d for d in self._devices if d.backend == GPUBackend.CUDA]
        if cuda_devices:
            self._active_backend = GPUBackend.CUDA
            self._active_device = cuda_devices[0]
            logger.info(f"[GPU] Active backend: CUDA — {self._active_device.name}")
        else:
            self._active_backend = GPUBackend.CPU
            self._active_device = cpu_device
            logger.info("[GPU] Active backend: CPU (NumPy)")

    def _probe_cuda(self) -> None:
        """Register CUDA devices via CuPy, if installed and functional."""
        try:
            import cupy as cp
            device_count = cp.cuda.runtime.getDeviceCount()
            if device_count > 0:
                # The runtime version is device-independent; query it once
                # instead of once per device.
                cuda_ver = cp.cuda.runtime.runtimeGetVersion()
            for i in range(device_count):
                props = cp.cuda.runtime.getDeviceProperties(i)
                name = props["name"]
                if isinstance(name, bytes):  # runtime may return raw bytes
                    name = name.decode()
                mem_mb = props["totalGlobalMem"] // (1024 * 1024)
                device = GPUDevice(
                    backend=GPUBackend.CUDA,
                    index=i,
                    name=str(name),
                    memory_mb=mem_mb,
                    extra={"cuda_version": cuda_ver},
                )
                self._devices.append(device)
                logger.info(f"[GPU] CUDA device {i}: {name} ({mem_mb} MB)")
            if device_count > 0:
                self._cupy = cp
        except ImportError:
            logger.info("[GPU] CuPy not installed — CUDA unavailable")
        except Exception as e:
            # CuPy importable but CUDA runtime broken/absent — degrade to CPU.
            logger.warning(f"[GPU] CuPy probe error: {e}")

    def _probe_opencl(self) -> None:
        """Register OpenCL devices via PyOpenCL (detection only, no compute)."""
        try:
            import pyopencl as cl
            for plat in cl.get_platforms():
                for dev in plat.get_devices():
                    mem_mb = dev.global_mem_size // (1024 * 1024)
                    device = GPUDevice(
                        backend=GPUBackend.OPENCL,
                        # Sequential index across all OpenCL devices seen so far.
                        index=len([d for d in self._devices
                                   if d.backend == GPUBackend.OPENCL]),
                        name=dev.name.strip(),
                        memory_mb=mem_mb,
                        extra={"platform": plat.name.strip()},
                    )
                    self._devices.append(device)
                    logger.info(f"[GPU] OpenCL device: {device.name} ({mem_mb} MB)")
        except ImportError:
            pass  # PyOpenCL is optional; silence is acceptable for a stub backend
        except Exception as e:
            logger.debug(f"[GPU] OpenCL probe error: {e}")

    @property
    def gpu_available(self) -> bool:
        """True when the active backend is not the CPU fallback."""
        return self._active_backend != GPUBackend.CPU

    def get_array_module(self) -> Any:
        """Return ``cupy`` when CUDA is active, otherwise ``numpy``."""
        if self._active_backend == GPUBackend.CUDA and self._cupy is not None:
            return self._cupy
        return np

    def to_cpu(self, arr: Any) -> np.ndarray:
        """Transfer an array to host memory as a numpy ndarray.

        CuPy arrays expose ``.get()`` for device-to-host copies; any other
        input is coerced with ``np.asarray``. NOTE(review): the ``.get``
        duck-check also matches mappings — callers must pass array-likes.
        """
        if hasattr(arr, 'get'):
            return arr.get()
        return np.asarray(arr)

    def get_status(self) -> dict:
        """Full status dict for the API (active backend/device + device list)."""
        active = (self._device_dict(self._active_device)
                  if self._active_device else None)
        return {
            "active_backend": self._active_backend.value,
            "active_device": active,
            "gpu_available": self.gpu_available,
            "available_devices": [self._device_dict(d) for d in self._devices],
        }

    def get_devices(self) -> list[dict]:
        """Device list for the API."""
        return [self._device_dict(d) for d in self._devices]

    def get_diagnostics(self) -> dict:
        """Full diagnostic info for troubleshooting GPU detection."""
        import sys
        import platform
        diag = {
            "python_version": sys.version,
            "platform": platform.platform(),
            "numpy": {"version": np.__version__},
            "cuda": {},
            "opencl": {},
            "detected_devices": len(self._devices),
            "active_backend": self._active_backend.value,
        }
        # CuPy / CUDA details: versions plus per-device properties.
        try:
            import cupy as cp
            diag["cuda"]["cupy_version"] = cp.__version__
            diag["cuda"]["cuda_runtime_version"] = cp.cuda.runtime.runtimeGetVersion()
            diag["cuda"]["device_count"] = cp.cuda.runtime.getDeviceCount()
            for i in range(diag["cuda"]["device_count"]):
                props = cp.cuda.runtime.getDeviceProperties(i)
                name = props["name"]
                if isinstance(name, bytes):
                    name = name.decode()
                diag["cuda"][f"device_{i}"] = {
                    "name": str(name),
                    "memory_mb": props["totalGlobalMem"] // (1024 * 1024),
                    "compute_capability": f"{props['major']}.{props['minor']}",
                }
        except ImportError:
            diag["cuda"]["error"] = "CuPy not installed"
            diag["cuda"]["install_hint"] = "pip install cupy-cuda12x"
        except Exception as e:
            diag["cuda"]["error"] = str(e)
        # PyOpenCL details: platforms and their devices.
        try:
            import pyopencl as cl
            diag["opencl"]["pyopencl_version"] = cl.VERSION_TEXT
            diag["opencl"]["platforms"] = []
            for p in cl.get_platforms():
                platform_info = {"name": p.name.strip(), "devices": []}
                for d in p.get_devices():
                    platform_info["devices"].append({
                        "name": d.name.strip(),
                        "type": cl.device_type.to_string(d.type),
                        "memory_mb": d.global_mem_size // (1024 * 1024),
                        "compute_units": d.max_compute_units,
                    })
                diag["opencl"]["platforms"].append(platform_info)
        except ImportError:
            diag["opencl"]["error"] = "PyOpenCL not installed"
            diag["opencl"]["install_hint"] = "pip install pyopencl"
        except Exception as e:
            diag["opencl"]["error"] = str(e)
        return diag

    def set_device(self, backend: str, index: int = 0) -> dict:
        """Switch the active compute device.

        Args:
            backend: one of "cuda", "opencl", "cpu".
            index: per-backend device index.

        Returns:
            Dict with the new active backend value and device name.

        Raises:
            ValueError: if ``backend`` is not a known backend name, or no
                matching device was detected.
        """
        target_backend = GPUBackend(backend)  # raises ValueError on bad name
        candidates = [d for d in self._devices
                      if d.backend == target_backend and d.index == index]
        if not candidates:
            raise ValueError(f"No device found: backend={backend}, index={index}")
        self._active_device = candidates[0]
        self._active_backend = target_backend
        if target_backend == GPUBackend.CUDA and self._cupy is not None:
            # Make the chosen CUDA device current for subsequent cupy calls.
            self._cupy.cuda.Device(index).use()
        logger.info(f"[GPU] Switched to: {self._active_device.name} ({target_backend.value})")
        return {
            "backend": self._active_backend.value,
            "device": self._active_device.name,
        }
# Module-level singleton: device detection runs exactly once, at import time.
gpu_manager = GPUManager()