@mytec: iter3.5.0 ready for testing
This commit is contained in:
192
backend/app/services/gpu_backend.py
Normal file
192
backend/app/services/gpu_backend.py
Normal file
@@ -0,0 +1,192 @@
|
||||
"""
|
||||
GPU Backend Manager — detects and manages compute backends.
|
||||
|
||||
Supports:
|
||||
- CUDA via CuPy
|
||||
- OpenCL via PyOpenCL (future)
|
||||
- CPU via NumPy (always available)
|
||||
|
||||
Usage:
|
||||
from app.services.gpu_backend import gpu_manager
|
||||
xp = gpu_manager.get_array_module() # cupy or numpy
|
||||
status = gpu_manager.get_status()
|
||||
"""
|
||||
|
||||
import logging
from enum import Enum
from dataclasses import dataclass, field
from typing import Any, Optional

import numpy as np

# Module-level logger; handler/level configuration is owned by the app.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class GPUBackend(str, Enum):
    """Identifier for a compute backend (str-valued so it serializes cleanly)."""

    CUDA = "cuda"      # NVIDIA GPUs via CuPy
    OPENCL = "opencl"  # OpenCL devices via PyOpenCL (detection-only for now)
    CPU = "cpu"        # NumPy fallback — always available
|
||||
|
||||
|
||||
@dataclass
class GPUDevice:
    """One detected compute device, as reported by backend probing."""

    backend: GPUBackend  # backend exposing this device
    index: int           # device index within that backend
    name: str            # human-readable device name
    memory_mb: int       # total device memory in MB (0 when not applicable, e.g. CPU)
    # Backend-specific metadata (e.g. CUDA runtime version, OpenCL platform name).
    extra: dict = field(default_factory=dict)
|
||||
|
||||
|
||||
class GPUManager:
    """Singleton GPU manager with device detection and selection.

    Probes CUDA (CuPy) and OpenCL (PyOpenCL) once at construction time,
    always registers a CPU/NumPy fallback, and exposes the active backend's
    array module so callers can write numpy-style code against either.
    """

    def __init__(self):
        self._devices: list[GPUDevice] = []
        self._active_backend: GPUBackend = GPUBackend.CPU
        self._active_device: Optional[GPUDevice] = None
        # The imported cupy module; kept only when at least one CUDA device exists.
        self._cupy = None
        self._detect_devices()

    def _detect_devices(self) -> None:
        """Probe available GPU backends and auto-select the best device."""
        # Always add CPU — it is the guaranteed fallback.
        cpu_device = GPUDevice(
            backend=GPUBackend.CPU,
            index=0,
            name="CPU (NumPy)",
            memory_mb=0,
        )
        self._devices.append(cpu_device)

        # Try CuPy / CUDA
        try:
            import cupy as cp

            device_count = cp.cuda.runtime.getDeviceCount()
            # Runtime version is process-wide, not per-device — query it once.
            cuda_ver = cp.cuda.runtime.runtimeGetVersion() if device_count > 0 else None
            for i in range(device_count):
                props = cp.cuda.runtime.getDeviceProperties(i)
                name = props["name"]
                if isinstance(name, bytes):
                    name = name.decode()
                mem_mb = props["totalGlobalMem"] // (1024 * 1024)
                device = GPUDevice(
                    backend=GPUBackend.CUDA,
                    index=i,
                    name=str(name),
                    memory_mb=mem_mb,
                    extra={"cuda_version": cuda_ver},
                )
                self._devices.append(device)
                logger.info(f"[GPU] CUDA device {i}: {name} ({mem_mb} MB)")
            if device_count > 0:
                self._cupy = cp
        except ImportError:
            logger.info("[GPU] CuPy not installed — CUDA unavailable")
        except Exception as e:
            # Probe failures (driver mismatch, no GPU in container, ...) must
            # not prevent startup — fall through to the CPU backend.
            logger.warning(f"[GPU] CuPy probe error: {e}")

        # Try PyOpenCL (future — stub for detection only)
        try:
            import pyopencl as cl
            platforms = cl.get_platforms()
            for plat in platforms:
                for dev in plat.get_devices():
                    mem_mb = dev.global_mem_size // (1024 * 1024)
                    device = GPUDevice(
                        backend=GPUBackend.OPENCL,
                        # Index OpenCL devices by order of discovery across platforms.
                        index=len([d for d in self._devices if d.backend == GPUBackend.OPENCL]),
                        name=dev.name.strip(),
                        memory_mb=mem_mb,
                        extra={"platform": plat.name.strip()},
                    )
                    self._devices.append(device)
                    logger.info(f"[GPU] OpenCL device: {device.name} ({mem_mb} MB)")
        except ImportError:
            pass
        except Exception as e:
            logger.debug(f"[GPU] OpenCL probe error: {e}")

        # Auto-select: CUDA when present, otherwise CPU. OpenCL devices are
        # detected and reported, but never auto-selected — OpenCL compute is
        # not implemented yet (detection-only stub above).
        cuda_devices = [d for d in self._devices if d.backend == GPUBackend.CUDA]
        if cuda_devices:
            self._active_backend = GPUBackend.CUDA
            self._active_device = cuda_devices[0]
            logger.info(f"[GPU] Active backend: CUDA — {self._active_device.name}")
        else:
            self._active_backend = GPUBackend.CPU
            self._active_device = cpu_device
            logger.info("[GPU] Active backend: CPU (NumPy)")

    @property
    def gpu_available(self) -> bool:
        """True when the active backend is a GPU (i.e. not the CPU fallback)."""
        return self._active_backend != GPUBackend.CPU

    def get_array_module(self) -> Any:
        """Return cupy (if CUDA active) or numpy."""
        if self._active_backend == GPUBackend.CUDA and self._cupy is not None:
            return self._cupy
        return np

    def to_cpu(self, arr: Any) -> np.ndarray:
        """Transfer *arr* to a CPU numpy array.

        Only CuPy arrays need an explicit device→host transfer. Checking the
        concrete type (rather than `hasattr(arr, 'get')`) avoids false
        positives on unrelated objects that happen to define `.get`, e.g. dicts.
        """
        if self._cupy is not None and isinstance(arr, self._cupy.ndarray):
            return arr.get()
        return np.asarray(arr)

    @staticmethod
    def _device_dict(d: GPUDevice) -> dict:
        """Serialize one device for API responses (shared by status/list endpoints)."""
        return {
            "backend": d.backend.value,
            "index": d.index,
            "name": d.name,
            "memory_mb": d.memory_mb,
        }

    def get_status(self) -> dict:
        """Full status dict for API."""
        return {
            "active_backend": self._active_backend.value,
            "active_device": self._device_dict(self._active_device)
            if self._active_device else None,
            "gpu_available": self.gpu_available,
            "available_devices": self.get_devices(),
        }

    def get_devices(self) -> list[dict]:
        """Device list for API."""
        return [self._device_dict(d) for d in self._devices]

    def set_device(self, backend: str, index: int = 0) -> dict:
        """Switch active compute device.

        Args:
            backend: backend name ("cuda" / "opencl" / "cpu").
            index: device index within that backend.

        Returns:
            Dict with the newly active backend and device name.

        Raises:
            ValueError: unknown backend name, or no device matches.
        """
        target_backend = GPUBackend(backend)
        candidates = [d for d in self._devices
                      if d.backend == target_backend and d.index == index]
        if not candidates:
            raise ValueError(f"No device found: backend={backend}, index={index}")

        self._active_device = candidates[0]
        self._active_backend = target_backend

        if target_backend == GPUBackend.CUDA and self._cupy is not None:
            self._cupy.cuda.Device(index).use()
        elif target_backend == GPUBackend.OPENCL:
            # OpenCL compute is not implemented; selection affects reporting only.
            logger.warning("[GPU] OpenCL selected but compute falls back to NumPy")

        logger.info(f"[GPU] Switched to: {self._active_device.name} ({target_backend.value})")
        return {
            "backend": self._active_backend.value,
            "device": self._active_device.name,
        }
|
||||
|
||||
|
||||
# Module-level singleton: device probing runs once here at import time;
# all consumers share this instance.
gpu_manager = GPUManager()
|
||||
Reference in New Issue
Block a user