@mytec: refactor to ray ready for testing

This commit is contained in:
2026-01-31 21:09:10 +02:00
parent 3b010fed83
commit 221000d5b3
6 changed files with 278 additions and 154 deletions

View File

@@ -20,7 +20,9 @@
"Bash(mv:*)", "Bash(mv:*)",
"Read(*)", "Read(*)",
"Write(*)", "Write(*)",
"Bash(python3:*)" "Bash(python3:*)",
"Bash(source:*)",
"Bash(/mnt/d/root/rfcp/venv/bin/python3:*)"
] ]
} }
} }

View File

@@ -6,9 +6,22 @@ router = APIRouter()
@router.get("/info") @router.get("/info")
async def get_system_info(): async def get_system_info():
"""Return system info: CPU cores, GPU availability, parallel support.""" """Return system info: CPU cores, GPU availability, parallel backend."""
cpu_cores = mp.cpu_count() or 1 cpu_cores = mp.cpu_count() or 1
# Check Ray
ray_available = False
ray_initialized = False
try:
from app.services.parallel_coverage_service import RAY_AVAILABLE
ray_available = RAY_AVAILABLE
if ray_available:
import ray
ray_initialized = ray.is_initialized()
except Exception:
pass
# Check GPU
gpu_info = None gpu_info = None
try: try:
import cupy as cp import cupy as cp
@@ -24,7 +37,9 @@ async def get_system_info():
return { return {
"cpu_cores": cpu_cores, "cpu_cores": cpu_cores,
"parallel_workers": min(cpu_cores, 14), "parallel_workers": min(cpu_cores, 14),
"parallel_enabled": True, "parallel_backend": "ray" if ray_available else "sequential",
"ray_available": ray_available,
"ray_initialized": ray_initialized,
"gpu": gpu_info, "gpu": gpu_info,
"gpu_enabled": gpu_info is not None, "gpu_enabled": gpu_info is not None,
} }

View File

@@ -54,7 +54,7 @@ from app.services.weather_service import weather_service
from app.services.indoor_service import indoor_service from app.services.indoor_service import indoor_service
from app.services.atmospheric_service import atmospheric_service from app.services.atmospheric_service import atmospheric_service
from app.services.parallel_coverage_service import ( from app.services.parallel_coverage_service import (
calculate_coverage_parallel, get_cpu_count calculate_coverage_parallel, get_cpu_count, get_parallel_backend,
) )
@@ -360,8 +360,9 @@ class CoverageService:
num_workers = get_cpu_count() num_workers = get_cpu_count()
if use_parallel: if use_parallel:
backend = get_parallel_backend()
_clog(f"━━━ PHASE 3: Calculating {len(grid)} points " _clog(f"━━━ PHASE 3: Calculating {len(grid)} points "
f"(PARALLEL, {num_workers} workers) ━━━") f"(PARALLEL/{backend}, {num_workers} workers) ━━━")
try: try:
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()

View File

@@ -1,93 +1,71 @@
""" """
Parallel coverage calculation using ProcessPoolExecutor. Parallel coverage calculation.
Workers receive pre-loaded terrain cache, buildings, and OSM data Primary backend: Ray (shared-memory object store, zero-copy numpy arrays)
via a shared pickle file. Each worker initializes module-level Fallback: Sequential (single-threaded, no extra dependencies)
service singletons with the cached data, then processes point chunks.
Ray advantages over ProcessPoolExecutor:
- ray.put() stores terrain cache ONCE in shared memory
- Workers access numpy arrays via zero-copy (no per-worker pickle/copy)
- Eliminates MemoryError on Detailed preset with large terrain + buildings
Usage: Usage:
from app.services.parallel_coverage_service import calculate_coverage_parallel from app.services.parallel_coverage_service import (
calculate_coverage_parallel, get_cpu_count, RAY_AVAILABLE,
)
""" """
import os import os
import sys import sys
import time import time
import pickle
import tempfile
import multiprocessing as mp import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor
from typing import List, Dict, Tuple, Any, Optional, Callable from typing import List, Dict, Tuple, Any, Optional, Callable
import numpy as np import numpy as np
# ── Module-level worker state (set once per process by _init_worker) ── # ── Try to import Ray ──
_worker_data: Dict[str, Any] = {} RAY_AVAILABLE = False
_worker_initialized = False try:
import ray
RAY_AVAILABLE = True
except ImportError:
ray = None # type: ignore
def _init_worker(shared_data_path: str): # ── Worker-level spatial index cache (persists across tasks in same worker) ──
"""Initialize a worker process with shared data from temp file.
Injects terrain cache into the module-level terrain_service singleton _worker_spatial_idx = None
so that all other services (LOS, dominant path, etc.) automatically _worker_cache_key: Optional[str] = None
see the cached tiles.
def _ray_process_chunk_impl(chunk, terrain_cache, buildings, osm_data, config):
"""Implementation: process a chunk of (lat, lon, elevation) tuples.
Called inside a Ray remote function. terrain_cache numpy arrays come
from the Ray object store via zero-copy.
""" """
global _worker_data, _worker_initialized global _worker_spatial_idx, _worker_cache_key
if _worker_initialized: # Inject terrain cache into the module-level singleton.
return # For numpy arrays, Ray gives us a read-only view into shared memory.
t0 = time.time()
pid = os.getpid()
# Load shared data
with open(shared_data_path, 'rb') as f:
data = pickle.load(f)
# Inject terrain cache into the global singleton —
# this automatically fixes los_service, dominant_path_service, etc.
# because they hold references to the same terrain_service object.
from app.services.terrain_service import terrain_service from app.services.terrain_service import terrain_service
terrain_service._tile_cache = data['terrain_cache'] terrain_service._tile_cache = terrain_cache
# Build spatial index from buildings # Build or reuse spatial index (expensive — ~1s for 350K buildings).
from app.services.spatial_index import SpatialIndex cache_key = config.get('cache_key', '')
spatial_idx = SpatialIndex() if _worker_cache_key != cache_key:
if data['buildings']: from app.services.spatial_index import SpatialIndex
spatial_idx.build(data['buildings']) _worker_spatial_idx = SpatialIndex()
if buildings:
_worker_spatial_idx.build(buildings)
_worker_cache_key = cache_key
_worker_data = { # Process points
'buildings': data['buildings'],
'streets': data['streets'],
'water_bodies': data['water_bodies'],
'vegetation_areas': data['vegetation_areas'],
'spatial_idx': spatial_idx,
'site_dict': data['site_dict'],
'settings_dict': data['settings_dict'],
'site_elevation': data['site_elevation'],
}
_worker_initialized = True
dt = time.time() - t0
print(f"[WORKER {pid}] Initialized in {dt:.1f}s — "
f"{len(data['terrain_cache'])} tiles, "
f"{len(data['buildings'])} buildings, "
f"{len(data.get('vegetation_areas', []))} vegetation",
flush=True)
def _process_chunk(chunk: List[Tuple[float, float, float]]) -> List[Dict]:
"""Process a chunk of (lat, lon, point_elevation) tuples.
Returns list of CoveragePoint dicts for points above min_signal.
"""
from app.services.coverage_service import CoverageService, SiteParams, CoverageSettings from app.services.coverage_service import CoverageService, SiteParams, CoverageSettings
data = _worker_data site = SiteParams(**config['site_dict'])
site = SiteParams(**data['site_dict']) settings = CoverageSettings(**config['settings_dict'])
settings = CoverageSettings(**data['settings_dict'])
svc = CoverageService() svc = CoverageService()
timing = { timing = {
@@ -100,10 +78,10 @@ def _process_chunk(chunk: List[Tuple[float, float, float]]) -> List[Dict]:
for lat, lon, point_elev in chunk: for lat, lon, point_elev in chunk:
point = svc._calculate_point_sync( point = svc._calculate_point_sync(
site, lat, lon, settings, site, lat, lon, settings,
data['buildings'], data['streets'], buildings, osm_data.get('streets', []),
data['spatial_idx'], data['water_bodies'], _worker_spatial_idx, osm_data.get('water_bodies', []),
data['vegetation_areas'], osm_data.get('vegetation_areas', []),
data['site_elevation'], point_elev, timing, config['site_elevation'], point_elev, timing,
) )
if point.rsrp >= settings.min_signal: if point.rsrp >= settings.min_signal:
results.append(point.model_dump()) results.append(point.model_dump())
@@ -111,6 +89,13 @@ def _process_chunk(chunk: List[Tuple[float, float, float]]) -> List[Dict]:
return results return results
# ── Ray remote handle for the chunk worker ──
# Wrapped with ray.remote() only when the Ray import succeeded; otherwise the
# handle stays None.
_ray_process_chunk = ray.remote(_ray_process_chunk_impl) if RAY_AVAILABLE else None
# ── Public API ── # ── Public API ──
@@ -122,6 +107,42 @@ def get_cpu_count() -> int:
return 4 return 4
def get_parallel_backend() -> str:
    """Name the parallel backend in use: "ray" when Ray imported, else "sequential"."""
    return "ray" if RAY_AVAILABLE else "sequential"
def _try_init_ray(num_cpus: int) -> bool:
    """Initialize Ray lazily. Returns True if Ray is ready.

    Args:
        num_cpus: Number of CPUs passed to ``ray.init``.

    Returns:
        True when Ray was already initialized or has just been initialized
        successfully; False when Ray is not installed or ``ray.init`` (or the
        creation of its temp directory) raised.
    """
    if not RAY_AVAILABLE:
        return False
    if ray.is_initialized():
        return True

    try:
        data_path = os.environ.get('RFCP_DATA_PATH', './data')
        # Ray's documentation requires _temp_dir to be an absolute path, and
        # the default data path is relative, so resolve it before use.
        ray_tmp = os.path.abspath(os.path.join(data_path, 'ray_tmp'))
        os.makedirs(ray_tmp, exist_ok=True)
        ray.init(
            num_cpus=num_cpus,
            include_dashboard=False,
            log_to_driver=True,
            _temp_dir=ray_tmp,
        )
    except Exception as e:
        print(f"[PARALLEL] Ray init failed: {e}", flush=True)
        return False

    # Reporting is best-effort: Ray is already up at this point, so a failure
    # to query cluster resources must not be reported as an init failure.
    try:
        store_gb = ray.cluster_resources().get('object_store_memory', 0) / 1e9
    except Exception:
        store_gb = 0.0
    print(f"[PARALLEL] Ray initialized: {num_cpus} CPUs, "
          f"object store ~{store_gb:.1f}GB",
          flush=True)
    return True
def calculate_coverage_parallel( def calculate_coverage_parallel(
grid: List[Tuple[float, float]], grid: List[Tuple[float, float]],
point_elevations: Dict[Tuple[float, float], float], point_elevations: Dict[Tuple[float, float], float],
@@ -136,21 +157,12 @@ def calculate_coverage_parallel(
num_workers: Optional[int] = None, num_workers: Optional[int] = None,
log_fn: Optional[Callable[[str], None]] = None, log_fn: Optional[Callable[[str], None]] = None,
) -> Tuple[List[Dict], Dict[str, float]]: ) -> Tuple[List[Dict], Dict[str, float]]:
"""Calculate coverage points in parallel using ProcessPoolExecutor. """Calculate coverage points in parallel.
Args: Uses Ray if available (shared memory, zero-copy numpy), otherwise
grid: List of (lat, lon) tuples. falls back to sequential single-threaded calculation.
point_elevations: Pre-computed {(lat, lon): elevation} dict.
site_dict: SiteParams as a dict (for pickling).
settings_dict: CoverageSettings as a dict (for pickling).
terrain_cache: {tile_name: np.ndarray} — pre-loaded SRTM tiles.
buildings, streets, water_bodies, vegetation_areas: OSM data.
site_elevation: Elevation at site location (meters).
num_workers: Override worker count (default: auto-detect).
log_fn: Logging function (receives string messages).
Returns: Same signature as before — drop-in replacement.
(results, timing) where results is list of CoveragePoint dicts.
""" """
if log_fn is None: if log_fn is None:
log_fn = lambda msg: print(f"[PARALLEL] {msg}", flush=True) log_fn = lambda msg: print(f"[PARALLEL] {msg}", flush=True)
@@ -159,92 +171,170 @@ def calculate_coverage_parallel(
num_workers = get_cpu_count() num_workers = get_cpu_count()
total_points = len(grid) total_points = len(grid)
log_fn(f"Parallel mode: {total_points} points, {num_workers} workers")
# Prepare items with pre-computed elevations # Try Ray
if RAY_AVAILABLE and _try_init_ray(num_workers):
try:
return _calculate_with_ray(
grid, point_elevations, site_dict, settings_dict,
terrain_cache, buildings, streets, water_bodies,
vegetation_areas, site_elevation,
num_workers, log_fn,
)
except Exception as e:
log_fn(f"Ray execution failed: {e} — falling back to sequential")
# Fallback: sequential
log_fn(f"Sequential fallback: {total_points} points")
return _calculate_sequential(
grid, point_elevations, site_dict, settings_dict,
buildings, streets, water_bodies, vegetation_areas,
site_elevation, log_fn,
)
# ── Ray backend ──
def _calculate_with_ray(
    grid, point_elevations, site_dict, settings_dict,
    terrain_cache, buildings, streets, water_bodies,
    vegetation_areas, site_elevation,
    num_workers, log_fn,
):
    """Execute the coverage calculation on Ray workers.

    The shared inputs (terrain cache, buildings, OSM layers, config) are put
    into the Ray object store once; the grid is split into chunks and each
    chunk is submitted as one remote task.

    Args:
        grid: Iterable of (lat, lon) tuples.
        point_elevations: Mapping (lat, lon) -> elevation; missing points
            default to 0.0.
        site_dict, settings_dict: Keyword dicts forwarded to the workers.
        terrain_cache, buildings, streets, water_bodies, vegetation_areas:
            Shared data stored via ``ray.put``.
        site_elevation: Forwarded to the workers inside the config dict.
        num_workers: Used to size the chunks (~4 chunks per worker).
        log_fn: Callable receiving progress messages.

    Returns:
        (results, timing): the concatenated per-chunk result lists and a dict
        with "parallel_total", "ray_put", "workers" and "backend".

    Raises:
        TimeoutError: if no chunk finishes within 600 s; the outstanding
            tasks are cancelled first.
    """
    total_points = len(grid)
    log_fn(f"Ray mode: {total_points} points, {num_workers} workers")

    # ── Put large data into Ray object store ──
    # Numpy arrays (terrain tiles) get zero-copy shared memory.
    # Python objects (buildings) get serialized once, stored in plasma.
    t_put = time.time()
    terrain_ref = ray.put(terrain_cache)
    buildings_ref = ray.put(buildings)
    osm_ref = ray.put({
        'streets': streets,
        'water_bodies': water_bodies,
        'vegetation_areas': vegetation_areas,
    })

    # The worker treats a falsy `buildings` as "no buildings", so do the same
    # here instead of failing on len(None).
    building_count = len(buildings) if buildings else 0
    cache_key = f"{site_dict['lat']:.4f},{site_dict['lon']:.4f},{building_count}"
    config_ref = ray.put({
        'site_dict': site_dict,
        'settings_dict': settings_dict,
        'site_elevation': site_elevation,
        'cache_key': cache_key,
    })
    put_time = time.time() - t_put
    log_fn(f"ray.put() done in {put_time:.1f}s")

    # ── Prepare and submit chunks ──
    items = [
        (lat, lon, point_elevations.get((lat, lon), 0.0))
        for lat, lon in grid
    ]

    # ~4 chunks per worker for granular progress (guard against 0 workers)
    chunk_size = max(1, len(items) // max(1, num_workers * 4))
    chunks = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]
    log_fn(f"Submitting {len(chunks)} chunks of ~{chunk_size} points")

    t_calc = time.time()
    # Remember how many points each task covers so progress can be measured
    # in processed points rather than in points that passed the signal filter.
    points_per_ref = {}
    for chunk in chunks:
        ref = _ray_process_chunk.remote(
            chunk, terrain_ref, buildings_ref, osm_ref, config_ref,
        )
        points_per_ref[ref] = len(chunk)

    # ── Collect results with progress via ray.wait() ──
    all_results: List[Dict] = []
    total_chunks = len(points_per_ref)
    remaining = list(points_per_ref)
    completed_chunks = 0
    processed_points = 0

    while remaining:
        # Wait for at least 1 result, batch up to ~10% for progress logging
        batch = max(1, min(len(remaining), total_chunks // 10 or 1))
        done, remaining = ray.wait(remaining, num_returns=batch, timeout=600)

        if not done:
            # Nothing finished within the timeout: stop instead of looping
            # forever on hung workers. Cancellation is best-effort.
            for ref in remaining:
                try:
                    ray.cancel(ref, force=True)
                except Exception as e:
                    log_fn(f"Cancel failed: {e}")
            raise TimeoutError(
                f"No Ray chunk completed within 600s "
                f"({len(remaining)} of {total_chunks} chunks outstanding)"
            )

        for ref in done:
            processed_points += points_per_ref[ref]
            try:
                chunk_results = ray.get(ref)
                all_results.extend(chunk_results)
            except Exception as e:
                log_fn(f"Chunk error: {e}")

        completed_chunks += len(done)
        pct = completed_chunks * 100 // total_chunks
        elapsed = time.time() - t_calc
        pts = processed_points
        rate = pts / elapsed if elapsed > 0 else 0
        eta = (total_points - pts) / rate if rate > 0 else 0
        log_fn(f"Progress: {completed_chunks}/{total_chunks} chunks ({pct}%) — "
               f"{pts} pts, {rate:.0f} pts/s, ETA {eta:.0f}s")

    calc_time = time.time() - t_calc
    log_fn(f"Ray done: {calc_time:.1f}s, {len(all_results)} results "
           f"({calc_time / max(1, total_points) * 1000:.1f}ms/point)")

    timing = {
        "parallel_total": calc_time,
        "ray_put": put_time,
        "workers": num_workers,
        "backend": "ray",
    }
    return all_results, timing
# ── Sequential fallback ──
def _calculate_sequential(
    grid, point_elevations, site_dict, settings_dict,
    buildings, streets, water_bodies, vegetation_areas,
    site_elevation, log_fn,
):
    """Compute every grid point one after another in the calling thread.

    Returns:
        (results, timing): ``results`` holds the ``model_dump()`` dict of each
        point whose ``rsrp`` is at least ``settings.min_signal``; ``timing``
        is the accumulator handed to ``_calculate_point_sync`` plus the
        "sequential_total" and "backend" entries.
    """
    from app.services.coverage_service import CoverageService, SiteParams, CoverageSettings
    from app.services.spatial_index import SpatialIndex

    site_params = SiteParams(**site_dict)
    coverage_settings = CoverageSettings(**settings_dict)
    service = CoverageService()

    # The index is always created; it is only populated when buildings exist.
    building_index = SpatialIndex()
    if buildings:
        building_index.build(buildings)

    total = len(grid)
    report_every = max(1, total // 20)  # roughly 20 progress messages
    timing = dict.fromkeys(
        (
            "los", "buildings", "antenna",
            "dominant_path", "street_canyon",
            "reflection", "vegetation",
        ),
        0.0,
    )

    started_at = time.time()
    results = []
    for i, (lat, lon) in enumerate(grid):
        if i % report_every == 0:
            log_fn(f"Sequential: {i}/{total} ({i * 100 // total}%)")

        elevation = point_elevations.get((lat, lon), 0.0)
        computed = service._calculate_point_sync(
            site_params, lat, lon, coverage_settings,
            buildings, streets, building_index,
            water_bodies, vegetation_areas,
            site_elevation, elevation, timing,
        )
        if computed.rsrp >= coverage_settings.min_signal:
            results.append(computed.model_dump())

    calc_time = time.time() - started_at
    log_fn(f"Sequential done: {calc_time:.1f}s, {len(results)} results "
           f"({calc_time / max(1, total) * 1000:.1f}ms/point)")

    timing.update({"sequential_total": calc_time, "backend": "sequential"})
    return results, timing

View File

@@ -11,3 +11,4 @@ requests==2.31.0
httpx==0.27.0 httpx==0.27.0
aiosqlite>=0.19.0 aiosqlite>=0.19.0
sqlalchemy>=2.0.0 sqlalchemy>=2.0.0
ray[default]>=2.9.0

View File

@@ -83,6 +83,21 @@ a = Analysis(
# Encoding # Encoding
'email.mime', 'email.mime',
'email.mime.multipart', 'email.mime.multipart',
# Ray (parallel processing) — graceful fallback if missing
'ray',
'ray._private',
'ray._private.worker',
'ray._private.node',
'ray._private.services',
'ray._private.utils',
'ray._raylet',
'ray.runtime_context',
'ray.util',
# Multiprocessing (fallback)
'multiprocessing',
'multiprocessing.pool',
'multiprocessing.queues',
'concurrent.futures',
], ],
hookspath=[], hookspath=[],
hooksconfig={}, hooksconfig={},