Files
rfcp/backend/app/services/gpu_backend.py

193 lines
6.3 KiB
Python

"""
GPU Backend Manager — detects and manages compute backends.
Supports:
- CUDA via CuPy
- OpenCL via PyOpenCL (future)
- CPU via NumPy (always available)
Usage:
from app.services.gpu_backend import gpu_manager
xp = gpu_manager.get_array_module() # cupy or numpy
status = gpu_manager.get_status()
"""
import logging
from enum import Enum
from dataclasses import dataclass, field
from typing import Any, Optional
import numpy as np
logger = logging.getLogger(__name__)
class GPUBackend(str, Enum):
CUDA = "cuda"
OPENCL = "opencl"
CPU = "cpu"
@dataclass
class GPUDevice:
backend: GPUBackend
index: int
name: str
memory_mb: int
extra: dict = field(default_factory=dict)
class GPUManager:
"""Singleton GPU manager with device detection and selection."""
def __init__(self):
self._devices: list[GPUDevice] = []
self._active_backend: GPUBackend = GPUBackend.CPU
self._active_device: Optional[GPUDevice] = None
self._cupy = None
self._detect_devices()
def _detect_devices(self):
"""Probe available GPU backends."""
# Always add CPU
cpu_device = GPUDevice(
backend=GPUBackend.CPU,
index=0,
name="CPU (NumPy)",
memory_mb=0,
)
self._devices.append(cpu_device)
# Try CuPy / CUDA
try:
import cupy as cp
device_count = cp.cuda.runtime.getDeviceCount()
for i in range(device_count):
props = cp.cuda.runtime.getDeviceProperties(i)
name = props["name"]
if isinstance(name, bytes):
name = name.decode()
mem_mb = props["totalGlobalMem"] // (1024 * 1024)
cuda_ver = cp.cuda.runtime.runtimeGetVersion()
device = GPUDevice(
backend=GPUBackend.CUDA,
index=i,
name=str(name),
memory_mb=mem_mb,
extra={"cuda_version": cuda_ver},
)
self._devices.append(device)
logger.info(f"[GPU] CUDA device {i}: {name} ({mem_mb} MB)")
if device_count > 0:
self._cupy = cp
except ImportError:
logger.info("[GPU] CuPy not installed — CUDA unavailable")
except Exception as e:
logger.warning(f"[GPU] CuPy probe error: {e}")
# Try PyOpenCL (future — stub for detection only)
try:
import pyopencl as cl
platforms = cl.get_platforms()
for plat in platforms:
for dev in plat.get_devices():
mem_mb = dev.global_mem_size // (1024 * 1024)
device = GPUDevice(
backend=GPUBackend.OPENCL,
index=len([d for d in self._devices if d.backend == GPUBackend.OPENCL]),
name=dev.name.strip(),
memory_mb=mem_mb,
extra={"platform": plat.name.strip()},
)
self._devices.append(device)
logger.info(f"[GPU] OpenCL device: {device.name} ({mem_mb} MB)")
except ImportError:
pass
except Exception as e:
logger.debug(f"[GPU] OpenCL probe error: {e}")
# Auto-select best: prefer CUDA > OpenCL > CPU
cuda_devices = [d for d in self._devices if d.backend == GPUBackend.CUDA]
if cuda_devices:
self._active_backend = GPUBackend.CUDA
self._active_device = cuda_devices[0]
logger.info(f"[GPU] Active backend: CUDA — {self._active_device.name}")
else:
self._active_backend = GPUBackend.CPU
self._active_device = cpu_device
logger.info("[GPU] Active backend: CPU (NumPy)")
@property
def gpu_available(self) -> bool:
return self._active_backend != GPUBackend.CPU
def get_array_module(self) -> Any:
"""Return cupy (if CUDA active) or numpy."""
if self._active_backend == GPUBackend.CUDA and self._cupy is not None:
return self._cupy
return np
def to_cpu(self, arr: Any) -> np.ndarray:
"""Transfer array to CPU numpy."""
if hasattr(arr, 'get'):
return arr.get()
return np.asarray(arr)
def get_status(self) -> dict:
"""Full status dict for API."""
return {
"active_backend": self._active_backend.value,
"active_device": {
"backend": self._active_device.backend.value,
"index": self._active_device.index,
"name": self._active_device.name,
"memory_mb": self._active_device.memory_mb,
} if self._active_device else None,
"gpu_available": self.gpu_available,
"available_devices": [
{
"backend": d.backend.value,
"index": d.index,
"name": d.name,
"memory_mb": d.memory_mb,
}
for d in self._devices
],
}
def get_devices(self) -> list[dict]:
"""Device list for API."""
return [
{
"backend": d.backend.value,
"index": d.index,
"name": d.name,
"memory_mb": d.memory_mb,
}
for d in self._devices
]
def set_device(self, backend: str, index: int = 0) -> dict:
"""Switch active compute device."""
target_backend = GPUBackend(backend)
candidates = [d for d in self._devices
if d.backend == target_backend and d.index == index]
if not candidates:
raise ValueError(f"No device found: backend={backend}, index={index}")
self._active_device = candidates[0]
self._active_backend = target_backend
if target_backend == GPUBackend.CUDA and self._cupy is not None:
self._cupy.cuda.Device(index).use()
logger.info(f"[GPU] Switched to: {self._active_device.name} ({target_backend.value})")
return {
"backend": self._active_backend.value,
"device": self._active_device.name,
}
# Singleton
gpu_manager = GPUManager()