@mytec: feat: Phase 3.0 Architecture Refactor ✅
Major refactoring of RFCP backend:
- Modular propagation models (8 models)
- SharedMemoryManager for terrain data
- ProcessPoolExecutor parallel processing
- WebSocket progress streaming
- Building filtering pipeline (351k → 15k)
- 82 unit tests

Performance: Standard preset 38s → 5s (7.6x speedup)
Known issue: Detailed preset timeout (fix in 3.1.0)
This commit is contained in:
@@ -47,6 +47,24 @@ class CancellationToken:
|
||||
return self._event.is_set()
|
||||
|
||||
|
||||
# ── Active pool tracking (for graceful shutdown) ──

# Global handle to the ProcessPoolExecutor currently in flight, so a
# shutdown path elsewhere in this module can cancel it by name.
_active_pool = None
# Guards every read/write of _active_pool across threads.
_active_pool_lock = threading.Lock()


def _set_active_pool(pool):
    """Record *pool* as the currently active executor (thread-safe)."""
    global _active_pool
    with _active_pool_lock:
        _active_pool = pool


def _clear_active_pool():
    """Drop the reference to the active executor (thread-safe)."""
    # Delegates to the setter so the locking discipline lives in one place.
    _set_active_pool(None)
|
||||
|
||||
|
||||
# ── Worker process cleanup ──
|
||||
|
||||
def _clog(msg: str):
|
||||
@@ -57,10 +75,23 @@ def _clog(msg: str):
|
||||
def _kill_worker_processes() -> int:
|
||||
"""Kill ALL rfcp-server processes except the current (main) process.
|
||||
|
||||
Uses process NAME matching instead of PID tree because psutil.children()
|
||||
cannot see grandchildren spawned by ProcessPoolExecutor workers.
|
||||
First shuts down the active ProcessPoolExecutor (if any), then uses
|
||||
process NAME matching to kill remaining workers.
|
||||
Returns the number of processes killed.
|
||||
"""
|
||||
global _active_pool
|
||||
|
||||
# Step 0: Shut down active ProcessPoolExecutor gracefully
|
||||
with _active_pool_lock:
|
||||
pool = _active_pool
|
||||
_active_pool = None
|
||||
if pool is not None:
|
||||
try:
|
||||
pool.shutdown(wait=False, cancel_futures=True)
|
||||
_clog("Active ProcessPoolExecutor shutdown requested")
|
||||
except Exception as e:
|
||||
_clog(f"Pool shutdown error: {e}")
|
||||
|
||||
my_pid = os.getpid()
|
||||
killed_count = 0
|
||||
|
||||
@@ -154,10 +185,12 @@ def _ray_process_chunk_impl(chunk, terrain_cache, buildings, osm_data, config):
|
||||
# Build or reuse spatial index (expensive — ~1s for 350K buildings).
|
||||
cache_key = config.get('cache_key', '')
|
||||
if _worker_cache_key != cache_key:
|
||||
from app.services.spatial_index import SpatialIndex
|
||||
_worker_spatial_idx = SpatialIndex()
|
||||
if buildings:
|
||||
from app.services.spatial_index import SpatialIndex
|
||||
_worker_spatial_idx = SpatialIndex()
|
||||
_worker_spatial_idx.build(buildings)
|
||||
else:
|
||||
_worker_spatial_idx = None
|
||||
_worker_cache_key = cache_key
|
||||
|
||||
# Process points
|
||||
@@ -262,6 +295,7 @@ def calculate_coverage_parallel(
|
||||
log_fn: Optional[Callable[[str], None]] = None,
|
||||
cancel_token: Optional[CancellationToken] = None,
|
||||
precomputed: Optional[Dict] = None,
|
||||
progress_fn: Optional[Callable[[str, float], None]] = None,
|
||||
) -> Tuple[List[Dict], Dict[str, float]]:
|
||||
"""Calculate coverage points in parallel.
|
||||
|
||||
@@ -287,6 +321,7 @@ def calculate_coverage_parallel(
|
||||
terrain_cache, buildings, streets, water_bodies,
|
||||
vegetation_areas, site_elevation,
|
||||
num_workers, log_fn, cancel_token, precomputed,
|
||||
progress_fn,
|
||||
)
|
||||
except Exception as e:
|
||||
log_fn(f"Ray execution failed: {e} — falling back to sequential")
|
||||
@@ -300,6 +335,7 @@ def calculate_coverage_parallel(
|
||||
terrain_cache, buildings, streets, water_bodies,
|
||||
vegetation_areas, site_elevation,
|
||||
pool_workers, log_fn, cancel_token, precomputed,
|
||||
progress_fn,
|
||||
)
|
||||
except Exception as e:
|
||||
log_fn(f"ProcessPool failed: {e} — falling back to sequential")
|
||||
@@ -310,6 +346,7 @@ def calculate_coverage_parallel(
|
||||
grid, point_elevations, site_dict, settings_dict,
|
||||
buildings, streets, water_bodies, vegetation_areas,
|
||||
site_elevation, log_fn, cancel_token, precomputed,
|
||||
progress_fn,
|
||||
)
|
||||
|
||||
|
||||
@@ -321,6 +358,7 @@ def _calculate_with_ray(
|
||||
terrain_cache, buildings, streets, water_bodies,
|
||||
vegetation_areas, site_elevation,
|
||||
num_workers, log_fn, cancel_token=None, precomputed=None,
|
||||
progress_fn=None,
|
||||
):
|
||||
"""Execute using Ray shared-memory object store."""
|
||||
total_points = len(grid)
|
||||
@@ -404,6 +442,9 @@ def _calculate_with_ray(
|
||||
eta = (total_points - pts) / rate if rate > 0 else 0
|
||||
log_fn(f"Progress: {completed_chunks}/{total_chunks} chunks ({pct}%) — "
|
||||
f"{pts} pts, {rate:.0f} pts/s, ETA {eta:.0f}s")
|
||||
if progress_fn:
|
||||
# Map chunk progress to 40%-95% range
|
||||
progress_fn("Calculating coverage", 0.40 + 0.55 * (completed_chunks / total_chunks))
|
||||
|
||||
calc_time = time.time() - t_calc
|
||||
log_fn(f"Ray done: {calc_time:.1f}s, {len(all_results)} results "
|
||||
@@ -428,9 +469,10 @@ def _pool_worker_process_chunk(args):
|
||||
from app.services.terrain_service import terrain_service
|
||||
terrain_service._tile_cache = terrain_cache
|
||||
|
||||
from app.services.spatial_index import SpatialIndex
|
||||
spatial_idx = SpatialIndex()
|
||||
spatial_idx = None
|
||||
if buildings:
|
||||
from app.services.spatial_index import SpatialIndex
|
||||
spatial_idx = SpatialIndex()
|
||||
spatial_idx.build(buildings)
|
||||
|
||||
from app.services.coverage_service import CoverageService, SiteParams, CoverageSettings
|
||||
@@ -465,32 +507,162 @@ def _pool_worker_process_chunk(args):
|
||||
return results
|
||||
|
||||
|
||||
def _store_terrain_in_shm(terrain_cache: Dict[str, np.ndarray], log_fn) -> Tuple[list, Dict[str, dict]]:
    """Copy each terrain tile array into a shared-memory segment.

    Returns ``(segments, tile_refs)``:

    * ``segments`` — the live ``SharedMemory`` blocks; the caller owns their
      lifetime and must ``close()``/``unlink()`` them when done.
    * ``tile_refs`` — maps tile_name -> {shm_name, shape, dtype} so worker
      processes can rebuild zero-copy numpy views over the same memory.

    Tiles that fail to copy are logged and simply omitted from ``tile_refs``
    (the worker then falls back to whatever pickled copy it received).
    """
    import multiprocessing.shared_memory as shm_mod

    segments = []
    tile_refs = {}

    for name, tile in terrain_cache.items():
        try:
            seg = shm_mod.SharedMemory(create=True, size=tile.nbytes)
            # Track the segment immediately so the caller can clean it up
            # even if the copy below raises.
            segments.append(seg)
            # Copy the tile into the segment through a numpy view.
            view = np.ndarray(tile.shape, dtype=tile.dtype, buffer=seg.buf)
            view[:] = tile[:]
            tile_refs[name] = {
                'shm_name': seg.name,
                'shape': tile.shape,
                'dtype': str(tile.dtype),
            }
        except Exception as e:
            # Best effort: this tile is skipped, worker uses the pickled copy.
            log_fn(f"Failed to store tile {name} in shm: {e}")

    return segments, tile_refs
|
||||
|
||||
|
||||
def _pool_worker_shm_chunk(args):
    """Worker entry point that reads terrain tiles from shared memory.

    Same contract as the pickle-based chunk worker, except terrain arrives
    as ``{tile_name: {shm_name, shape, dtype}}`` references instead of full
    arrays, so each worker maps the tiles zero-copy instead of unpickling.
    """
    import multiprocessing.shared_memory as shm_mod

    chunk, terrain_shm_refs, buildings, osm_data, config = args

    # Rebuild the terrain cache as numpy views onto the shared segments.
    # NOTE(review): the SharedMemory handles are not kept explicitly; the
    # ndarray's buffer reference keeps each mapping alive — confirm this
    # holds for the worker's lifetime on the target platform.
    tiles = {}
    for name, ref in terrain_shm_refs.items():
        try:
            seg = shm_mod.SharedMemory(name=ref['shm_name'])
            tiles[name] = np.ndarray(ref['shape'], dtype=ref['dtype'], buffer=seg.buf)
        except Exception:
            # Best effort: a tile that cannot be attached is simply absent.
            pass

    # Inject the reconstructed cache into the terrain service singleton.
    from app.services.terrain_service import terrain_service
    terrain_service._tile_cache = tiles

    # (Re)build the building spatial index only when the cache key changes,
    # so repeated chunks for the same job reuse the index.
    global _worker_spatial_idx, _worker_cache_key
    key = config.get('cache_key', '')
    if _worker_cache_key != key:
        if buildings:
            from app.services.spatial_index import SpatialIndex
            _worker_spatial_idx = SpatialIndex()
            _worker_spatial_idx.build(buildings)
        else:
            _worker_spatial_idx = None
        _worker_cache_key = key

    # Per-point coverage calculation.
    from app.services.coverage_service import CoverageService, SiteParams, CoverageSettings
    site = SiteParams(**config['site_dict'])
    settings = CoverageSettings(**config['settings_dict'])
    svc = CoverageService()

    # Timing accumulators, filled in by _calculate_point_sync.
    timing = {
        "los": 0.0, "buildings": 0.0, "antenna": 0.0,
        "dominant_path": 0.0, "street_canyon": 0.0,
        "reflection": 0.0, "vegetation": 0.0,
    }

    precomputed = config.get('precomputed')

    results = []
    for lat, lon, point_elev in chunk:
        pre = precomputed.get((lat, lon)) if precomputed else None
        point = svc._calculate_point_sync(
            site, lat, lon, settings,
            buildings, osm_data.get('streets', []),
            _worker_spatial_idx, osm_data.get('water_bodies', []),
            osm_data.get('vegetation_areas', []),
            config['site_elevation'], point_elev, timing,
            precomputed_distance=pre.get('distance') if pre else None,
            precomputed_path_loss=pre.get('path_loss') if pre else None,
        )
        # Only keep points above the configured signal floor.
        if point.rsrp >= settings.min_signal:
            results.append(point.model_dump())

    return results
|
||||
|
||||
|
||||
def _calculate_with_process_pool(
|
||||
grid, point_elevations, site_dict, settings_dict,
|
||||
terrain_cache, buildings, streets, water_bodies,
|
||||
vegetation_areas, site_elevation,
|
||||
num_workers, log_fn, cancel_token=None, precomputed=None,
|
||||
progress_fn=None,
|
||||
):
|
||||
"""Execute using ProcessPoolExecutor with reduced workers to limit memory."""
|
||||
"""Execute using ProcessPoolExecutor.
|
||||
|
||||
Uses shared memory for terrain tiles (zero-copy numpy views) to reduce
|
||||
memory usage compared to pickling full terrain arrays per worker.
|
||||
"""
|
||||
from concurrent.futures import ProcessPoolExecutor, as_completed
|
||||
|
||||
total_points = len(grid)
|
||||
log_fn(f"ProcessPool mode: {total_points} points, {num_workers} workers")
|
||||
|
||||
# Estimate pickle size for building data and cap workers accordingly
|
||||
building_count = len(buildings)
|
||||
if building_count > 10000:
|
||||
num_workers = min(num_workers, 3)
|
||||
log_fn(f"Large building set ({building_count}) — reducing workers to {num_workers}")
|
||||
elif building_count > 5000:
|
||||
num_workers = min(num_workers, 4)
|
||||
|
||||
log_fn(f"ProcessPool mode: {total_points} points, {num_workers} workers, "
|
||||
f"{building_count} buildings")
|
||||
|
||||
# Store terrain tiles in shared memory
|
||||
shm_blocks = []
|
||||
terrain_shm_refs = {}
|
||||
try:
|
||||
shm_blocks, terrain_shm_refs = _store_terrain_in_shm(terrain_cache, log_fn)
|
||||
if terrain_shm_refs:
|
||||
tile_mb = sum(
|
||||
np.prod(r['shape']) * np.dtype(r['dtype']).itemsize
|
||||
for r in terrain_shm_refs.values()
|
||||
) / (1024 * 1024)
|
||||
log_fn(f"Stored {len(terrain_shm_refs)} terrain tiles in shared memory ({tile_mb:.0f} MB)")
|
||||
use_shm = True
|
||||
else:
|
||||
use_shm = False
|
||||
except Exception as e:
|
||||
log_fn(f"Shared memory setup failed ({e}), using pickle fallback")
|
||||
use_shm = False
|
||||
|
||||
items = [
|
||||
(lat, lon, point_elevations.get((lat, lon), 0.0))
|
||||
for lat, lon in grid
|
||||
]
|
||||
|
||||
# Larger chunks than Ray — fewer workers means bigger chunks
|
||||
chunk_size = max(1, len(items) // (num_workers * 2))
|
||||
chunks = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]
|
||||
log_fn(f"Submitting {len(chunks)} chunks of ~{chunk_size} points")
|
||||
|
||||
cache_key = f"{site_dict['lat']:.4f},{site_dict['lon']:.4f},{len(buildings)}"
|
||||
config = {
|
||||
'site_dict': site_dict,
|
||||
'settings_dict': settings_dict,
|
||||
'site_elevation': site_elevation,
|
||||
'cache_key': cache_key,
|
||||
}
|
||||
if precomputed:
|
||||
config['precomputed'] = precomputed
|
||||
@@ -505,20 +677,32 @@ def _calculate_with_process_pool(
|
||||
pool = None
|
||||
|
||||
try:
|
||||
# Use spawn context for clean worker processes
|
||||
ctx = mp.get_context('spawn')
|
||||
pool = ProcessPoolExecutor(max_workers=num_workers, mp_context=ctx)
|
||||
futures = {
|
||||
pool.submit(
|
||||
_pool_worker_process_chunk,
|
||||
(chunk, terrain_cache, buildings, osm_data, config),
|
||||
): i
|
||||
for i, chunk in enumerate(chunks)
|
||||
}
|
||||
_set_active_pool(pool)
|
||||
|
||||
if use_shm:
|
||||
# Shared memory path: pass shm refs instead of terrain data
|
||||
worker_fn = _pool_worker_shm_chunk
|
||||
futures = {
|
||||
pool.submit(
|
||||
worker_fn,
|
||||
(chunk, terrain_shm_refs, buildings, osm_data, config),
|
||||
): i
|
||||
for i, chunk in enumerate(chunks)
|
||||
}
|
||||
else:
|
||||
# Pickle fallback path
|
||||
futures = {
|
||||
pool.submit(
|
||||
_pool_worker_process_chunk,
|
||||
(chunk, terrain_cache, buildings, osm_data, config),
|
||||
): i
|
||||
for i, chunk in enumerate(chunks)
|
||||
}
|
||||
|
||||
completed_chunks = 0
|
||||
for future in as_completed(futures):
|
||||
# Check cancellation between chunks
|
||||
if cancel_token and cancel_token.is_cancelled:
|
||||
log_fn(f"Cancelled — cancelling {len(futures) - completed_chunks - 1} pending futures")
|
||||
for f in futures:
|
||||
@@ -539,20 +723,27 @@ def _calculate_with_process_pool(
|
||||
eta = (total_points - pts) / rate if rate > 0 else 0
|
||||
log_fn(f"Progress: {completed_chunks}/{len(chunks)} chunks ({pct}%) — "
|
||||
f"{pts} pts, {rate:.0f} pts/s, ETA {eta:.0f}s")
|
||||
if progress_fn:
|
||||
progress_fn("Calculating coverage", 0.40 + 0.55 * (completed_chunks / len(chunks)))
|
||||
|
||||
except Exception as e:
|
||||
log_fn(f"ProcessPool error: {e}")
|
||||
|
||||
finally:
|
||||
# CRITICAL: Always cleanup pool and orphaned workers
|
||||
_clear_active_pool()
|
||||
if pool:
|
||||
pool.shutdown(wait=False, cancel_futures=True)
|
||||
# Give pool time to cleanup gracefully
|
||||
time.sleep(0.5)
|
||||
# Then force kill any survivors by process name
|
||||
killed = _kill_worker_processes()
|
||||
if killed > 0:
|
||||
log_fn(f"Force killed {killed} orphaned workers")
|
||||
# Cleanup shared memory blocks
|
||||
for block in shm_blocks:
|
||||
try:
|
||||
block.close()
|
||||
block.unlink()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
calc_time = time.time() - t_calc
|
||||
log_fn(f"ProcessPool done: {calc_time:.1f}s, {len(all_results)} results "
|
||||
@@ -561,7 +752,7 @@ def _calculate_with_process_pool(
|
||||
timing = {
|
||||
"parallel_total": calc_time,
|
||||
"workers": num_workers,
|
||||
"backend": "process_pool",
|
||||
"backend": "process_pool" + ("/shm" if use_shm else "/pickle"),
|
||||
}
|
||||
return all_results, timing
|
||||
|
||||
@@ -573,6 +764,7 @@ def _calculate_sequential(
|
||||
grid, point_elevations, site_dict, settings_dict,
|
||||
buildings, streets, water_bodies, vegetation_areas,
|
||||
site_elevation, log_fn, cancel_token=None, precomputed=None,
|
||||
progress_fn=None,
|
||||
):
|
||||
"""Sequential fallback — no extra dependencies, runs in calling thread."""
|
||||
from app.services.coverage_service import CoverageService, SiteParams, CoverageSettings
|
||||
@@ -582,8 +774,9 @@ def _calculate_sequential(
|
||||
settings = CoverageSettings(**settings_dict)
|
||||
svc = CoverageService()
|
||||
|
||||
spatial_idx = SpatialIndex()
|
||||
spatial_idx = None
|
||||
if buildings:
|
||||
spatial_idx = SpatialIndex()
|
||||
spatial_idx.build(buildings)
|
||||
|
||||
total = len(grid)
|
||||
@@ -604,6 +797,8 @@ def _calculate_sequential(
|
||||
|
||||
if i % log_interval == 0:
|
||||
log_fn(f"Sequential: {i}/{total} ({i * 100 // total}%)")
|
||||
if progress_fn:
|
||||
progress_fn("Calculating coverage", 0.40 + 0.55 * (i / total))
|
||||
|
||||
point_elev = point_elevations.get((lat, lon), 0.0)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user