@mytec: feat: Phase 3.0 Architecture Refactor

Major refactoring of RFCP backend:
- Modular propagation models (8 models)
- SharedMemoryManager for terrain data
- ProcessPoolExecutor parallel processing
- WebSocket progress streaming
- Building filtering pipeline (351k → 15k)
- 82 unit tests

Performance: Standard preset 38s → 5s (7.6x speedup)

Known issue: Detailed preset timeout (fix in 3.1.0)
This commit is contained in:
2026-02-01 23:12:26 +02:00
parent 1dde56705a
commit defa3ad440
71 changed files with 7134 additions and 256 deletions

View File

@@ -47,6 +47,24 @@ class CancellationToken:
return self._event.is_set()
# ── Active pool tracking (for graceful shutdown) ──
_active_pool = None # Global ref to current ProcessPoolExecutor
_active_pool_lock = threading.Lock()
def _set_active_pool(pool):
global _active_pool
with _active_pool_lock:
_active_pool = pool
def _clear_active_pool():
global _active_pool
with _active_pool_lock:
_active_pool = None
# ── Worker process cleanup ──
def _clog(msg: str):
@@ -57,10 +75,23 @@ def _clog(msg: str):
def _kill_worker_processes() -> int:
"""Kill ALL rfcp-server processes except the current (main) process.
Uses process NAME matching instead of PID tree because psutil.children()
cannot see grandchildren spawned by ProcessPoolExecutor workers.
First shuts down the active ProcessPoolExecutor (if any), then uses
process NAME matching to kill remaining workers.
Returns the number of processes killed.
"""
global _active_pool
# Step 0: Shut down active ProcessPoolExecutor gracefully
with _active_pool_lock:
pool = _active_pool
_active_pool = None
if pool is not None:
try:
pool.shutdown(wait=False, cancel_futures=True)
_clog("Active ProcessPoolExecutor shutdown requested")
except Exception as e:
_clog(f"Pool shutdown error: {e}")
my_pid = os.getpid()
killed_count = 0
@@ -154,10 +185,12 @@ def _ray_process_chunk_impl(chunk, terrain_cache, buildings, osm_data, config):
# Build or reuse spatial index (expensive — ~1s for 350K buildings).
cache_key = config.get('cache_key', '')
if _worker_cache_key != cache_key:
from app.services.spatial_index import SpatialIndex
_worker_spatial_idx = SpatialIndex()
if buildings:
from app.services.spatial_index import SpatialIndex
_worker_spatial_idx = SpatialIndex()
_worker_spatial_idx.build(buildings)
else:
_worker_spatial_idx = None
_worker_cache_key = cache_key
# Process points
@@ -262,6 +295,7 @@ def calculate_coverage_parallel(
log_fn: Optional[Callable[[str], None]] = None,
cancel_token: Optional[CancellationToken] = None,
precomputed: Optional[Dict] = None,
progress_fn: Optional[Callable[[str, float], None]] = None,
) -> Tuple[List[Dict], Dict[str, float]]:
"""Calculate coverage points in parallel.
@@ -287,6 +321,7 @@ def calculate_coverage_parallel(
terrain_cache, buildings, streets, water_bodies,
vegetation_areas, site_elevation,
num_workers, log_fn, cancel_token, precomputed,
progress_fn,
)
except Exception as e:
log_fn(f"Ray execution failed: {e} — falling back to sequential")
@@ -300,6 +335,7 @@ def calculate_coverage_parallel(
terrain_cache, buildings, streets, water_bodies,
vegetation_areas, site_elevation,
pool_workers, log_fn, cancel_token, precomputed,
progress_fn,
)
except Exception as e:
log_fn(f"ProcessPool failed: {e} — falling back to sequential")
@@ -310,6 +346,7 @@ def calculate_coverage_parallel(
grid, point_elevations, site_dict, settings_dict,
buildings, streets, water_bodies, vegetation_areas,
site_elevation, log_fn, cancel_token, precomputed,
progress_fn,
)
@@ -321,6 +358,7 @@ def _calculate_with_ray(
terrain_cache, buildings, streets, water_bodies,
vegetation_areas, site_elevation,
num_workers, log_fn, cancel_token=None, precomputed=None,
progress_fn=None,
):
"""Execute using Ray shared-memory object store."""
total_points = len(grid)
@@ -404,6 +442,9 @@ def _calculate_with_ray(
eta = (total_points - pts) / rate if rate > 0 else 0
log_fn(f"Progress: {completed_chunks}/{total_chunks} chunks ({pct}%) — "
f"{pts} pts, {rate:.0f} pts/s, ETA {eta:.0f}s")
if progress_fn:
# Map chunk progress to 40%-95% range
progress_fn("Calculating coverage", 0.40 + 0.55 * (completed_chunks / total_chunks))
calc_time = time.time() - t_calc
log_fn(f"Ray done: {calc_time:.1f}s, {len(all_results)} results "
@@ -428,9 +469,10 @@ def _pool_worker_process_chunk(args):
from app.services.terrain_service import terrain_service
terrain_service._tile_cache = terrain_cache
from app.services.spatial_index import SpatialIndex
spatial_idx = SpatialIndex()
spatial_idx = None
if buildings:
from app.services.spatial_index import SpatialIndex
spatial_idx = SpatialIndex()
spatial_idx.build(buildings)
from app.services.coverage_service import CoverageService, SiteParams, CoverageSettings
@@ -465,32 +507,162 @@ def _pool_worker_process_chunk(args):
return results
def _store_terrain_in_shm(terrain_cache: Dict[str, np.ndarray], log_fn) -> Tuple[list, Dict[str, dict]]:
"""Store terrain tile arrays in shared memory. Returns (shm_blocks, tile_refs).
tile_refs is a dict mapping tile_name -> {shm_name, shape, dtype_str}
that workers use to reconstruct numpy arrays from shared memory.
"""
import multiprocessing.shared_memory as shm_mod
blocks = []
refs = {}
for tile_name, arr in terrain_cache.items():
try:
block = shm_mod.SharedMemory(create=True, size=arr.nbytes)
blocks.append(block)
# Copy tile data to shared memory
shm_arr = np.ndarray(arr.shape, dtype=arr.dtype, buffer=block.buf)
shm_arr[:] = arr[:]
refs[tile_name] = {
'shm_name': block.name,
'shape': arr.shape,
'dtype': str(arr.dtype),
}
except Exception as e:
log_fn(f"Failed to store tile {tile_name} in shm: {e}")
# Fallback: worker will have to use pickled copy
pass
return blocks, refs
def _pool_worker_shm_chunk(args):
    """Worker function that reads terrain from shared memory instead of pickle.

    Args:
        args: packed tuple ``(chunk, terrain_shm_refs, buildings, osm_data,
            config)`` — chunk is a list of ``(lat, lon, point_elev)`` triples,
            terrain_shm_refs maps tile_name -> {'shm_name', 'shape', 'dtype'}
            (produced by ``_store_terrain_in_shm``), config carries site /
            settings dicts plus 'cache_key', 'site_elevation' and optional
            'precomputed' distances/path-losses.

    Returns:
        List of serialized coverage-point dicts whose rsrp clears
        ``settings.min_signal``.
    """
    import multiprocessing.shared_memory as shm_mod
    chunk, terrain_shm_refs, buildings, osm_data, config = args
    # Reconstruct terrain cache from shared memory (zero-copy numpy views)
    terrain_cache = {}
    for tile_name, ref in terrain_shm_refs.items():
        try:
            block = shm_mod.SharedMemory(name=ref['shm_name'])
            # NOTE(review): the SharedMemory handle `block` is not retained;
            # the ndarray's memoryview over block.buf appears to keep the
            # mapping alive, but confirm this lifetime assumption holds.
            terrain_cache[tile_name] = np.ndarray(
                ref['shape'], dtype=ref['dtype'], buffer=block.buf,
            )
        except Exception:
            # Best-effort: a tile that fails to attach is silently skipped.
            pass
    # Inject terrain cache
    from app.services.terrain_service import terrain_service
    terrain_service._tile_cache = terrain_cache
    # Build spatial index (cached per worker process; rebuilt only when the
    # cache key — site/building fingerprint — changes between jobs)
    global _worker_spatial_idx, _worker_cache_key
    cache_key = config.get('cache_key', '')
    if _worker_cache_key != cache_key:
        if buildings:
            from app.services.spatial_index import SpatialIndex
            _worker_spatial_idx = SpatialIndex()
            _worker_spatial_idx.build(buildings)
        else:
            # No buildings: downstream code must accept a None index.
            _worker_spatial_idx = None
        _worker_cache_key = cache_key
    # Process points
    from app.services.coverage_service import CoverageService, SiteParams, CoverageSettings
    site = SiteParams(**config['site_dict'])
    settings = CoverageSettings(**config['settings_dict'])
    svc = CoverageService()
    # Per-stage timing accumulators, mutated in place by _calculate_point_sync.
    timing = {
        "los": 0.0, "buildings": 0.0, "antenna": 0.0,
        "dominant_path": 0.0, "street_canyon": 0.0,
        "reflection": 0.0, "vegetation": 0.0,
    }
    precomputed = config.get('precomputed')
    results = []
    for lat, lon, point_elev in chunk:
        # Reuse precomputed distance/path-loss for this grid point if supplied.
        pre = precomputed.get((lat, lon)) if precomputed else None
        point = svc._calculate_point_sync(
            site, lat, lon, settings,
            buildings, osm_data.get('streets', []),
            _worker_spatial_idx, osm_data.get('water_bodies', []),
            osm_data.get('vegetation_areas', []),
            config['site_elevation'], point_elev, timing,
            precomputed_distance=pre.get('distance') if pre else None,
            precomputed_path_loss=pre.get('path_loss') if pre else None,
        )
        # Keep only points at or above the configured signal floor.
        if point.rsrp >= settings.min_signal:
            results.append(point.model_dump())
    return results
def _calculate_with_process_pool(
grid, point_elevations, site_dict, settings_dict,
terrain_cache, buildings, streets, water_bodies,
vegetation_areas, site_elevation,
num_workers, log_fn, cancel_token=None, precomputed=None,
progress_fn=None,
):
"""Execute using ProcessPoolExecutor with reduced workers to limit memory."""
"""Execute using ProcessPoolExecutor.
Uses shared memory for terrain tiles (zero-copy numpy views) to reduce
memory usage compared to pickling full terrain arrays per worker.
"""
from concurrent.futures import ProcessPoolExecutor, as_completed
total_points = len(grid)
log_fn(f"ProcessPool mode: {total_points} points, {num_workers} workers")
# Estimate pickle size for building data and cap workers accordingly
building_count = len(buildings)
if building_count > 10000:
num_workers = min(num_workers, 3)
log_fn(f"Large building set ({building_count}) — reducing workers to {num_workers}")
elif building_count > 5000:
num_workers = min(num_workers, 4)
log_fn(f"ProcessPool mode: {total_points} points, {num_workers} workers, "
f"{building_count} buildings")
# Store terrain tiles in shared memory
shm_blocks = []
terrain_shm_refs = {}
try:
shm_blocks, terrain_shm_refs = _store_terrain_in_shm(terrain_cache, log_fn)
if terrain_shm_refs:
tile_mb = sum(
np.prod(r['shape']) * np.dtype(r['dtype']).itemsize
for r in terrain_shm_refs.values()
) / (1024 * 1024)
log_fn(f"Stored {len(terrain_shm_refs)} terrain tiles in shared memory ({tile_mb:.0f} MB)")
use_shm = True
else:
use_shm = False
except Exception as e:
log_fn(f"Shared memory setup failed ({e}), using pickle fallback")
use_shm = False
items = [
(lat, lon, point_elevations.get((lat, lon), 0.0))
for lat, lon in grid
]
# Larger chunks than Ray — fewer workers means bigger chunks
chunk_size = max(1, len(items) // (num_workers * 2))
chunks = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]
log_fn(f"Submitting {len(chunks)} chunks of ~{chunk_size} points")
cache_key = f"{site_dict['lat']:.4f},{site_dict['lon']:.4f},{len(buildings)}"
config = {
'site_dict': site_dict,
'settings_dict': settings_dict,
'site_elevation': site_elevation,
'cache_key': cache_key,
}
if precomputed:
config['precomputed'] = precomputed
@@ -505,20 +677,32 @@ def _calculate_with_process_pool(
pool = None
try:
# Use spawn context for clean worker processes
ctx = mp.get_context('spawn')
pool = ProcessPoolExecutor(max_workers=num_workers, mp_context=ctx)
futures = {
pool.submit(
_pool_worker_process_chunk,
(chunk, terrain_cache, buildings, osm_data, config),
): i
for i, chunk in enumerate(chunks)
}
_set_active_pool(pool)
if use_shm:
# Shared memory path: pass shm refs instead of terrain data
worker_fn = _pool_worker_shm_chunk
futures = {
pool.submit(
worker_fn,
(chunk, terrain_shm_refs, buildings, osm_data, config),
): i
for i, chunk in enumerate(chunks)
}
else:
# Pickle fallback path
futures = {
pool.submit(
_pool_worker_process_chunk,
(chunk, terrain_cache, buildings, osm_data, config),
): i
for i, chunk in enumerate(chunks)
}
completed_chunks = 0
for future in as_completed(futures):
# Check cancellation between chunks
if cancel_token and cancel_token.is_cancelled:
log_fn(f"Cancelled — cancelling {len(futures) - completed_chunks - 1} pending futures")
for f in futures:
@@ -539,20 +723,27 @@ def _calculate_with_process_pool(
eta = (total_points - pts) / rate if rate > 0 else 0
log_fn(f"Progress: {completed_chunks}/{len(chunks)} chunks ({pct}%) — "
f"{pts} pts, {rate:.0f} pts/s, ETA {eta:.0f}s")
if progress_fn:
progress_fn("Calculating coverage", 0.40 + 0.55 * (completed_chunks / len(chunks)))
except Exception as e:
log_fn(f"ProcessPool error: {e}")
finally:
# CRITICAL: Always cleanup pool and orphaned workers
_clear_active_pool()
if pool:
pool.shutdown(wait=False, cancel_futures=True)
# Give pool time to cleanup gracefully
time.sleep(0.5)
# Then force kill any survivors by process name
killed = _kill_worker_processes()
if killed > 0:
log_fn(f"Force killed {killed} orphaned workers")
# Cleanup shared memory blocks
for block in shm_blocks:
try:
block.close()
block.unlink()
except Exception:
pass
calc_time = time.time() - t_calc
log_fn(f"ProcessPool done: {calc_time:.1f}s, {len(all_results)} results "
@@ -561,7 +752,7 @@ def _calculate_with_process_pool(
timing = {
"parallel_total": calc_time,
"workers": num_workers,
"backend": "process_pool",
"backend": "process_pool" + ("/shm" if use_shm else "/pickle"),
}
return all_results, timing
@@ -573,6 +764,7 @@ def _calculate_sequential(
grid, point_elevations, site_dict, settings_dict,
buildings, streets, water_bodies, vegetation_areas,
site_elevation, log_fn, cancel_token=None, precomputed=None,
progress_fn=None,
):
"""Sequential fallback — no extra dependencies, runs in calling thread."""
from app.services.coverage_service import CoverageService, SiteParams, CoverageSettings
@@ -582,8 +774,9 @@ def _calculate_sequential(
settings = CoverageSettings(**settings_dict)
svc = CoverageService()
spatial_idx = SpatialIndex()
spatial_idx = None
if buildings:
spatial_idx = SpatialIndex()
spatial_idx.build(buildings)
total = len(grid)
@@ -604,6 +797,8 @@ def _calculate_sequential(
if i % log_interval == 0:
log_fn(f"Sequential: {i}/{total} ({i * 100 // total}%)")
if progress_fn:
progress_fn("Calculating coverage", 0.40 + 0.55 * (i / total))
point_elev = point_elevations.get((lat, lon), 0.0)