@mytec: refactor to ray ready for testing

This commit is contained in:
2026-01-31 21:09:10 +02:00
parent 3b010fed83
commit 221000d5b3
6 changed files with 278 additions and 154 deletions

View File

@@ -6,9 +6,22 @@ router = APIRouter()
@router.get("/info")
async def get_system_info():
"""Return system info: CPU cores, GPU availability, parallel support."""
"""Return system info: CPU cores, GPU availability, parallel backend."""
cpu_cores = mp.cpu_count() or 1
# Check Ray
ray_available = False
ray_initialized = False
try:
from app.services.parallel_coverage_service import RAY_AVAILABLE
ray_available = RAY_AVAILABLE
if ray_available:
import ray
ray_initialized = ray.is_initialized()
except Exception:
pass
# Check GPU
gpu_info = None
try:
import cupy as cp
@@ -24,7 +37,9 @@ async def get_system_info():
return {
"cpu_cores": cpu_cores,
"parallel_workers": min(cpu_cores, 14),
"parallel_enabled": True,
"parallel_backend": "ray" if ray_available else "sequential",
"ray_available": ray_available,
"ray_initialized": ray_initialized,
"gpu": gpu_info,
"gpu_enabled": gpu_info is not None,
}

View File

@@ -54,7 +54,7 @@ from app.services.weather_service import weather_service
from app.services.indoor_service import indoor_service
from app.services.atmospheric_service import atmospheric_service
from app.services.parallel_coverage_service import (
calculate_coverage_parallel, get_cpu_count
calculate_coverage_parallel, get_cpu_count, get_parallel_backend,
)
@@ -360,8 +360,9 @@ class CoverageService:
num_workers = get_cpu_count()
if use_parallel:
backend = get_parallel_backend()
_clog(f"━━━ PHASE 3: Calculating {len(grid)} points "
f"(PARALLEL, {num_workers} workers) ━━━")
f"(PARALLEL/{backend}, {num_workers} workers) ━━━")
try:
loop = asyncio.get_event_loop()

View File

@@ -1,93 +1,71 @@
"""
Parallel coverage calculation using ProcessPoolExecutor.
Parallel coverage calculation.
Workers receive pre-loaded terrain cache, buildings, and OSM data
via a shared pickle file. Each worker initializes module-level
service singletons with the cached data, then processes point chunks.
Primary backend: Ray (shared-memory object store, zero-copy numpy arrays)
Fallback: Sequential (single-threaded, no extra dependencies)
Ray advantages over ProcessPoolExecutor:
- ray.put() stores terrain cache ONCE in shared memory
- Workers access numpy arrays via zero-copy (no per-worker pickle/copy)
- Eliminates MemoryError on Detailed preset with large terrain + buildings
Usage:
from app.services.parallel_coverage_service import calculate_coverage_parallel
from app.services.parallel_coverage_service import (
calculate_coverage_parallel, get_cpu_count, RAY_AVAILABLE,
)
"""
import os
import sys
import time
import pickle
import tempfile
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor
from typing import List, Dict, Tuple, Any, Optional, Callable
import numpy as np
# ── Module-level worker state (set once per process by _init_worker) ──
# ── Try to import Ray ──
_worker_data: Dict[str, Any] = {}
_worker_initialized = False
RAY_AVAILABLE = False
try:
import ray
RAY_AVAILABLE = True
except ImportError:
ray = None # type: ignore
def _init_worker(shared_data_path: str):
"""Initialize a worker process with shared data from temp file.
# ── Worker-level spatial index cache (persists across tasks in same worker) ──
Injects terrain cache into the module-level terrain_service singleton
so that all other services (LOS, dominant path, etc.) automatically
see the cached tiles.
_worker_spatial_idx = None
_worker_cache_key: Optional[str] = None
def _ray_process_chunk_impl(chunk, terrain_cache, buildings, osm_data, config):
"""Implementation: process a chunk of (lat, lon, elevation) tuples.
Called inside a Ray remote function. terrain_cache numpy arrays come
from the Ray object store via zero-copy.
"""
global _worker_data, _worker_initialized
global _worker_spatial_idx, _worker_cache_key
if _worker_initialized:
return
t0 = time.time()
pid = os.getpid()
# Load shared data
with open(shared_data_path, 'rb') as f:
data = pickle.load(f)
# Inject terrain cache into the global singleton —
# this automatically fixes los_service, dominant_path_service, etc.
# because they hold references to the same terrain_service object.
# Inject terrain cache into the module-level singleton.
# For numpy arrays, Ray gives us a read-only view into shared memory.
from app.services.terrain_service import terrain_service
terrain_service._tile_cache = data['terrain_cache']
terrain_service._tile_cache = terrain_cache
# Build spatial index from buildings
from app.services.spatial_index import SpatialIndex
spatial_idx = SpatialIndex()
if data['buildings']:
spatial_idx.build(data['buildings'])
# Build or reuse spatial index (expensive — ~1s for 350K buildings).
cache_key = config.get('cache_key', '')
if _worker_cache_key != cache_key:
from app.services.spatial_index import SpatialIndex
_worker_spatial_idx = SpatialIndex()
if buildings:
_worker_spatial_idx.build(buildings)
_worker_cache_key = cache_key
_worker_data = {
'buildings': data['buildings'],
'streets': data['streets'],
'water_bodies': data['water_bodies'],
'vegetation_areas': data['vegetation_areas'],
'spatial_idx': spatial_idx,
'site_dict': data['site_dict'],
'settings_dict': data['settings_dict'],
'site_elevation': data['site_elevation'],
}
_worker_initialized = True
dt = time.time() - t0
print(f"[WORKER {pid}] Initialized in {dt:.1f}s — "
f"{len(data['terrain_cache'])} tiles, "
f"{len(data['buildings'])} buildings, "
f"{len(data.get('vegetation_areas', []))} vegetation",
flush=True)
def _process_chunk(chunk: List[Tuple[float, float, float]]) -> List[Dict]:
"""Process a chunk of (lat, lon, point_elevation) tuples.
Returns list of CoveragePoint dicts for points above min_signal.
"""
# Process points
from app.services.coverage_service import CoverageService, SiteParams, CoverageSettings
data = _worker_data
site = SiteParams(**data['site_dict'])
settings = CoverageSettings(**data['settings_dict'])
site = SiteParams(**config['site_dict'])
settings = CoverageSettings(**config['settings_dict'])
svc = CoverageService()
timing = {
@@ -100,10 +78,10 @@ def _process_chunk(chunk: List[Tuple[float, float, float]]) -> List[Dict]:
for lat, lon, point_elev in chunk:
point = svc._calculate_point_sync(
site, lat, lon, settings,
data['buildings'], data['streets'],
data['spatial_idx'], data['water_bodies'],
data['vegetation_areas'],
data['site_elevation'], point_elev, timing,
buildings, osm_data.get('streets', []),
_worker_spatial_idx, osm_data.get('water_bodies', []),
osm_data.get('vegetation_areas', []),
config['site_elevation'], point_elev, timing,
)
if point.rsrp >= settings.min_signal:
results.append(point.model_dump())
@@ -111,6 +89,13 @@ def _process_chunk(chunk: List[Tuple[float, float, float]]) -> List[Dict]:
return results
# ── Register the Ray remote function (only if Ray is available) ──
_ray_process_chunk = None
if RAY_AVAILABLE:
_ray_process_chunk = ray.remote(_ray_process_chunk_impl)
# ── Public API ──
@@ -122,6 +107,42 @@ def get_cpu_count() -> int:
return 4
def get_parallel_backend() -> str:
"""Return which parallel backend is available."""
if RAY_AVAILABLE:
return "ray"
return "sequential"
def _try_init_ray(num_cpus: int) -> bool:
"""Initialize Ray lazily. Returns True if Ray is ready."""
if not RAY_AVAILABLE:
return False
if ray.is_initialized():
return True
try:
data_path = os.environ.get('RFCP_DATA_PATH', './data')
ray_tmp = os.path.join(data_path, 'ray_tmp')
os.makedirs(ray_tmp, exist_ok=True)
ray.init(
num_cpus=num_cpus,
include_dashboard=False,
log_to_driver=True,
_temp_dir=ray_tmp,
)
print(f"[PARALLEL] Ray initialized: {num_cpus} CPUs, "
f"object store ~{ray.cluster_resources().get('object_store_memory', 0) / 1e9:.1f}GB",
flush=True)
return True
except Exception as e:
print(f"[PARALLEL] Ray init failed: {e}", flush=True)
return False
def calculate_coverage_parallel(
grid: List[Tuple[float, float]],
point_elevations: Dict[Tuple[float, float], float],
@@ -136,21 +157,12 @@ def calculate_coverage_parallel(
num_workers: Optional[int] = None,
log_fn: Optional[Callable[[str], None]] = None,
) -> Tuple[List[Dict], Dict[str, float]]:
"""Calculate coverage points in parallel using ProcessPoolExecutor.
"""Calculate coverage points in parallel.
Args:
grid: List of (lat, lon) tuples.
point_elevations: Pre-computed {(lat, lon): elevation} dict.
site_dict: SiteParams as a dict (for pickling).
settings_dict: CoverageSettings as a dict (for pickling).
terrain_cache: {tile_name: np.ndarray} — pre-loaded SRTM tiles.
buildings, streets, water_bodies, vegetation_areas: OSM data.
site_elevation: Elevation at site location (meters).
num_workers: Override worker count (default: auto-detect).
log_fn: Logging function (receives string messages).
Uses Ray if available (shared memory, zero-copy numpy), otherwise
falls back to sequential single-threaded calculation.
Returns:
(results, timing) where results is list of CoveragePoint dicts.
Same signature as before — drop-in replacement.
"""
if log_fn is None:
log_fn = lambda msg: print(f"[PARALLEL] {msg}", flush=True)
@@ -159,92 +171,170 @@ def calculate_coverage_parallel(
num_workers = get_cpu_count()
total_points = len(grid)
log_fn(f"Parallel mode: {total_points} points, {num_workers} workers")
# Prepare items with pre-computed elevations
# Try Ray
if RAY_AVAILABLE and _try_init_ray(num_workers):
try:
return _calculate_with_ray(
grid, point_elevations, site_dict, settings_dict,
terrain_cache, buildings, streets, water_bodies,
vegetation_areas, site_elevation,
num_workers, log_fn,
)
except Exception as e:
log_fn(f"Ray execution failed: {e} — falling back to sequential")
# Fallback: sequential
log_fn(f"Sequential fallback: {total_points} points")
return _calculate_sequential(
grid, point_elevations, site_dict, settings_dict,
buildings, streets, water_bodies, vegetation_areas,
site_elevation, log_fn,
)
# ── Ray backend ──
def _calculate_with_ray(
grid, point_elevations, site_dict, settings_dict,
terrain_cache, buildings, streets, water_bodies,
vegetation_areas, site_elevation,
num_workers, log_fn,
):
"""Execute using Ray shared-memory object store."""
total_points = len(grid)
log_fn(f"Ray mode: {total_points} points, {num_workers} workers")
# ── Put large data into Ray object store ──
# Numpy arrays (terrain tiles) get zero-copy shared memory.
# Python objects (buildings) get serialized once, stored in plasma.
t_put = time.time()
terrain_ref = ray.put(terrain_cache)
buildings_ref = ray.put(buildings)
osm_ref = ray.put({
'streets': streets,
'water_bodies': water_bodies,
'vegetation_areas': vegetation_areas,
})
cache_key = f"{site_dict['lat']:.4f},{site_dict['lon']:.4f},{len(buildings)}"
config_ref = ray.put({
'site_dict': site_dict,
'settings_dict': settings_dict,
'site_elevation': site_elevation,
'cache_key': cache_key,
})
put_time = time.time() - t_put
log_fn(f"ray.put() done in {put_time:.1f}s")
# ── Prepare and submit chunks ──
items = [
(lat, lon, point_elevations.get((lat, lon), 0.0))
for lat, lon in grid
]
# Split into chunks — ~4 chunks per worker for granular progress
chunks_per_worker = 4
chunk_size = max(1, len(items) // (num_workers * chunks_per_worker))
# ~4 chunks per worker for granular progress
chunk_size = max(1, len(items) // (num_workers * 4))
chunks = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]
log_fn(f"Split into {len(chunks)} chunks of ~{chunk_size} points")
log_fn(f"Submitting {len(chunks)} chunks of ~{chunk_size} points")
# ── Serialize shared data to temp file (once, not per-worker) ──
t_serial = time.time()
shared_data = {
'terrain_cache': terrain_cache,
'buildings': buildings,
'streets': streets,
'water_bodies': water_bodies,
'vegetation_areas': vegetation_areas,
'site_dict': site_dict,
'settings_dict': settings_dict,
'site_elevation': site_elevation,
}
tmpfile = tempfile.NamedTemporaryFile(delete=False, suffix='.pkl')
try:
pickle.dump(shared_data, tmpfile, protocol=pickle.HIGHEST_PROTOCOL)
finally:
tmpfile.close()
shared_data_path = tmpfile.name
file_size_mb = os.path.getsize(shared_data_path) / (1024 * 1024)
serial_time = time.time() - t_serial
log_fn(f"Serialized shared data: {file_size_mb:.1f}MB in {serial_time:.1f}s")
# Free main-process memory for the duplicate
del shared_data
# ── Run in process pool ──
t_calc = time.time()
pending = [
_ray_process_chunk.remote(chunk, terrain_ref, buildings_ref, osm_ref, config_ref)
for chunk in chunks
]
# ── Collect results with progress via ray.wait() ──
all_results: List[Dict] = []
completed_points = 0
total_chunks = len(pending)
remaining = list(pending)
completed_chunks = 0
try:
with ProcessPoolExecutor(
max_workers=num_workers,
initializer=_init_worker,
initargs=(shared_data_path,),
) as executor:
futures = [executor.submit(_process_chunk, chunk) for chunk in chunks]
while remaining:
# Wait for at least 1 result, batch up to ~10% for progress logging
batch = max(1, min(len(remaining), total_chunks // 10 or 1))
done, remaining = ray.wait(remaining, num_returns=batch, timeout=600)
for i, future in enumerate(futures):
try:
chunk_results = future.result(timeout=600) # 10 min max per chunk
all_results.extend(chunk_results)
except Exception as e:
log_fn(f"Chunk {i} failed: {e}")
for ref in done:
try:
chunk_results = ray.get(ref)
all_results.extend(chunk_results)
except Exception as e:
log_fn(f"Chunk error: {e}")
completed_points += len(chunks[i])
pct = min(100, completed_points * 100 // total_points)
elapsed = time.time() - t_calc
rate = completed_points / elapsed if elapsed > 0 else 0
# Log every ~10% or on last chunk
if (i + 1) % max(1, len(chunks) // 10) == 0 or i == len(chunks) - 1:
eta = (total_points - completed_points) / rate if rate > 0 else 0
log_fn(f"Progress: {completed_points}/{total_points} ({pct}%) — "
f"{rate:.0f} pts/s, ETA {eta:.0f}s")
finally:
# Clean up temp file
try:
os.unlink(shared_data_path)
except Exception:
pass
completed_chunks += len(done)
pct = completed_chunks * 100 // total_chunks
elapsed = time.time() - t_calc
pts = len(all_results)
rate = pts / elapsed if elapsed > 0 else 0
eta = (total_points - pts) / rate if rate > 0 else 0
log_fn(f"Progress: {completed_chunks}/{total_chunks} chunks ({pct}%) — "
f"{pts} pts, {rate:.0f} pts/s, ETA {eta:.0f}s")
calc_time = time.time() - t_calc
log_fn(f"Parallel done: {calc_time:.1f}s, {len(all_results)} results "
log_fn(f"Ray done: {calc_time:.1f}s, {len(all_results)} results "
f"({calc_time / max(1, total_points) * 1000:.1f}ms/point)")
timing = {
"parallel_total": calc_time,
"serialize": serial_time,
"ray_put": put_time,
"workers": num_workers,
"backend": "ray",
}
return all_results, timing
# ── Sequential fallback ──
def _calculate_sequential(
grid, point_elevations, site_dict, settings_dict,
buildings, streets, water_bodies, vegetation_areas,
site_elevation, log_fn,
):
"""Sequential fallback — no extra dependencies, runs in calling thread."""
from app.services.coverage_service import CoverageService, SiteParams, CoverageSettings
from app.services.spatial_index import SpatialIndex
site = SiteParams(**site_dict)
settings = CoverageSettings(**settings_dict)
svc = CoverageService()
spatial_idx = SpatialIndex()
if buildings:
spatial_idx.build(buildings)
total = len(grid)
log_interval = max(1, total // 20)
timing = {
"los": 0.0, "buildings": 0.0, "antenna": 0.0,
"dominant_path": 0.0, "street_canyon": 0.0,
"reflection": 0.0, "vegetation": 0.0,
}
t0 = time.time()
results = []
for i, (lat, lon) in enumerate(grid):
if i % log_interval == 0:
log_fn(f"Sequential: {i}/{total} ({i * 100 // total}%)")
point_elev = point_elevations.get((lat, lon), 0.0)
point = svc._calculate_point_sync(
site, lat, lon, settings,
buildings, streets, spatial_idx,
water_bodies, vegetation_areas,
site_elevation, point_elev, timing,
)
if point.rsrp >= settings.min_signal:
results.append(point.model_dump())
calc_time = time.time() - t0
log_fn(f"Sequential done: {calc_time:.1f}s, {len(results)} results "
f"({calc_time / max(1, total) * 1000:.1f}ms/point)")
timing["sequential_total"] = calc_time
timing["backend"] = "sequential"
return results, timing