@mytec: feat: Phase 3.0 Architecture Refactor

Major refactoring of RFCP backend:
- Modular propagation models (8 models)
- SharedMemoryManager for terrain data
- ProcessPoolExecutor parallel processing
- WebSocket progress streaming
- Building filtering pipeline (351k → 15k)
- 82 unit tests

Performance: Standard preset 38s → 5s (7.6x speedup)

Known issue: Detailed preset timeout (fix in 3.1.0)
This commit is contained in:
2026-02-01 23:12:26 +02:00
parent 1dde56705a
commit defa3ad440
71 changed files with 7134 additions and 256 deletions

View File

@@ -47,6 +47,24 @@ class CancellationToken:
return self._event.is_set()
# ── Active pool tracking (for graceful shutdown) ──
_active_pool = None # Global ref to current ProcessPoolExecutor
_active_pool_lock = threading.Lock()
def _set_active_pool(pool):
global _active_pool
with _active_pool_lock:
_active_pool = pool
def _clear_active_pool():
global _active_pool
with _active_pool_lock:
_active_pool = None
# ── Worker process cleanup ──
def _clog(msg: str):
@@ -57,10 +75,23 @@ def _clog(msg: str):
def _kill_worker_processes() -> int:
"""Kill ALL rfcp-server processes except the current (main) process.
Uses process NAME matching instead of PID tree because psutil.children()
cannot see grandchildren spawned by ProcessPoolExecutor workers.
First shuts down the active ProcessPoolExecutor (if any), then uses
process NAME matching to kill remaining workers.
Returns the number of processes killed.
"""
global _active_pool
# Step 0: Shut down active ProcessPoolExecutor gracefully
with _active_pool_lock:
pool = _active_pool
_active_pool = None
if pool is not None:
try:
pool.shutdown(wait=False, cancel_futures=True)
_clog("Active ProcessPoolExecutor shutdown requested")
except Exception as e:
_clog(f"Pool shutdown error: {e}")
my_pid = os.getpid()
killed_count = 0
@@ -154,10 +185,12 @@ def _ray_process_chunk_impl(chunk, terrain_cache, buildings, osm_data, config):
# Build or reuse spatial index (expensive — ~1s for 350K buildings).
cache_key = config.get('cache_key', '')
if _worker_cache_key != cache_key:
from app.services.spatial_index import SpatialIndex
_worker_spatial_idx = SpatialIndex()
if buildings:
from app.services.spatial_index import SpatialIndex
_worker_spatial_idx = SpatialIndex()
_worker_spatial_idx.build(buildings)
else:
_worker_spatial_idx = None
_worker_cache_key = cache_key
# Process points
@@ -262,6 +295,7 @@ def calculate_coverage_parallel(
log_fn: Optional[Callable[[str], None]] = None,
cancel_token: Optional[CancellationToken] = None,
precomputed: Optional[Dict] = None,
progress_fn: Optional[Callable[[str, float], None]] = None,
) -> Tuple[List[Dict], Dict[str, float]]:
"""Calculate coverage points in parallel.
@@ -287,6 +321,7 @@ def calculate_coverage_parallel(
terrain_cache, buildings, streets, water_bodies,
vegetation_areas, site_elevation,
num_workers, log_fn, cancel_token, precomputed,
progress_fn,
)
except Exception as e:
log_fn(f"Ray execution failed: {e} — falling back to sequential")
@@ -300,6 +335,7 @@ def calculate_coverage_parallel(
terrain_cache, buildings, streets, water_bodies,
vegetation_areas, site_elevation,
pool_workers, log_fn, cancel_token, precomputed,
progress_fn,
)
except Exception as e:
log_fn(f"ProcessPool failed: {e} — falling back to sequential")
@@ -310,6 +346,7 @@ def calculate_coverage_parallel(
grid, point_elevations, site_dict, settings_dict,
buildings, streets, water_bodies, vegetation_areas,
site_elevation, log_fn, cancel_token, precomputed,
progress_fn,
)
@@ -321,6 +358,7 @@ def _calculate_with_ray(
terrain_cache, buildings, streets, water_bodies,
vegetation_areas, site_elevation,
num_workers, log_fn, cancel_token=None, precomputed=None,
progress_fn=None,
):
"""Execute using Ray shared-memory object store."""
total_points = len(grid)
@@ -404,6 +442,9 @@ def _calculate_with_ray(
eta = (total_points - pts) / rate if rate > 0 else 0
log_fn(f"Progress: {completed_chunks}/{total_chunks} chunks ({pct}%) — "
f"{pts} pts, {rate:.0f} pts/s, ETA {eta:.0f}s")
if progress_fn:
# Map chunk progress to 40%-95% range
progress_fn("Calculating coverage", 0.40 + 0.55 * (completed_chunks / total_chunks))
calc_time = time.time() - t_calc
log_fn(f"Ray done: {calc_time:.1f}s, {len(all_results)} results "
@@ -428,9 +469,10 @@ def _pool_worker_process_chunk(args):
from app.services.terrain_service import terrain_service
terrain_service._tile_cache = terrain_cache
from app.services.spatial_index import SpatialIndex
spatial_idx = SpatialIndex()
spatial_idx = None
if buildings:
from app.services.spatial_index import SpatialIndex
spatial_idx = SpatialIndex()
spatial_idx.build(buildings)
from app.services.coverage_service import CoverageService, SiteParams, CoverageSettings
@@ -465,32 +507,162 @@ def _pool_worker_process_chunk(args):
return results
def _store_terrain_in_shm(terrain_cache: Dict[str, np.ndarray], log_fn) -> Tuple[list, Dict[str, dict]]:
"""Store terrain tile arrays in shared memory. Returns (shm_blocks, tile_refs).
tile_refs is a dict mapping tile_name -> {shm_name, shape, dtype_str}
that workers use to reconstruct numpy arrays from shared memory.
"""
import multiprocessing.shared_memory as shm_mod
blocks = []
refs = {}
for tile_name, arr in terrain_cache.items():
try:
block = shm_mod.SharedMemory(create=True, size=arr.nbytes)
blocks.append(block)
# Copy tile data to shared memory
shm_arr = np.ndarray(arr.shape, dtype=arr.dtype, buffer=block.buf)
shm_arr[:] = arr[:]
refs[tile_name] = {
'shm_name': block.name,
'shape': arr.shape,
'dtype': str(arr.dtype),
}
except Exception as e:
log_fn(f"Failed to store tile {tile_name} in shm: {e}")
# Fallback: worker will have to use pickled copy
pass
return blocks, refs
def _pool_worker_shm_chunk(args):
    """Worker function that reads terrain from shared memory instead of pickle.

    Args:
        args: packed tuple ``(chunk, terrain_shm_refs, buildings, osm_data,
            config)`` — chunk is a list of ``(lat, lon, point_elev)`` triples,
            terrain_shm_refs maps tile_name -> {'shm_name', 'shape', 'dtype'}
            (produced by ``_store_terrain_in_shm``), config carries site /
            settings dicts plus 'cache_key', 'site_elevation' and optional
            'precomputed' distances/path-losses.

    Returns:
        List of serialized coverage-point dicts whose rsrp clears
        ``settings.min_signal``.
    """
    import multiprocessing.shared_memory as shm_mod
    chunk, terrain_shm_refs, buildings, osm_data, config = args
    # Reconstruct terrain cache from shared memory (zero-copy numpy views)
    terrain_cache = {}
    for tile_name, ref in terrain_shm_refs.items():
        try:
            block = shm_mod.SharedMemory(name=ref['shm_name'])
            # NOTE(review): the SharedMemory handle `block` is not retained;
            # the ndarray's memoryview over block.buf appears to keep the
            # mapping alive, but confirm this lifetime assumption holds.
            terrain_cache[tile_name] = np.ndarray(
                ref['shape'], dtype=ref['dtype'], buffer=block.buf,
            )
        except Exception:
            # Best-effort: a tile that fails to attach is silently skipped.
            pass
    # Inject terrain cache
    from app.services.terrain_service import terrain_service
    terrain_service._tile_cache = terrain_cache
    # Build spatial index (cached per worker process; rebuilt only when the
    # cache key — site/building fingerprint — changes between jobs)
    global _worker_spatial_idx, _worker_cache_key
    cache_key = config.get('cache_key', '')
    if _worker_cache_key != cache_key:
        if buildings:
            from app.services.spatial_index import SpatialIndex
            _worker_spatial_idx = SpatialIndex()
            _worker_spatial_idx.build(buildings)
        else:
            # No buildings: downstream code must accept a None index.
            _worker_spatial_idx = None
        _worker_cache_key = cache_key
    # Process points
    from app.services.coverage_service import CoverageService, SiteParams, CoverageSettings
    site = SiteParams(**config['site_dict'])
    settings = CoverageSettings(**config['settings_dict'])
    svc = CoverageService()
    # Per-stage timing accumulators, mutated in place by _calculate_point_sync.
    timing = {
        "los": 0.0, "buildings": 0.0, "antenna": 0.0,
        "dominant_path": 0.0, "street_canyon": 0.0,
        "reflection": 0.0, "vegetation": 0.0,
    }
    precomputed = config.get('precomputed')
    results = []
    for lat, lon, point_elev in chunk:
        # Reuse precomputed distance/path-loss for this grid point if supplied.
        pre = precomputed.get((lat, lon)) if precomputed else None
        point = svc._calculate_point_sync(
            site, lat, lon, settings,
            buildings, osm_data.get('streets', []),
            _worker_spatial_idx, osm_data.get('water_bodies', []),
            osm_data.get('vegetation_areas', []),
            config['site_elevation'], point_elev, timing,
            precomputed_distance=pre.get('distance') if pre else None,
            precomputed_path_loss=pre.get('path_loss') if pre else None,
        )
        # Keep only points at or above the configured signal floor.
        if point.rsrp >= settings.min_signal:
            results.append(point.model_dump())
    return results
def _calculate_with_process_pool(
grid, point_elevations, site_dict, settings_dict,
terrain_cache, buildings, streets, water_bodies,
vegetation_areas, site_elevation,
num_workers, log_fn, cancel_token=None, precomputed=None,
progress_fn=None,
):
"""Execute using ProcessPoolExecutor with reduced workers to limit memory."""
"""Execute using ProcessPoolExecutor.
Uses shared memory for terrain tiles (zero-copy numpy views) to reduce
memory usage compared to pickling full terrain arrays per worker.
"""
from concurrent.futures import ProcessPoolExecutor, as_completed
total_points = len(grid)
log_fn(f"ProcessPool mode: {total_points} points, {num_workers} workers")
# Estimate pickle size for building data and cap workers accordingly
building_count = len(buildings)
if building_count > 10000:
num_workers = min(num_workers, 3)
log_fn(f"Large building set ({building_count}) — reducing workers to {num_workers}")
elif building_count > 5000:
num_workers = min(num_workers, 4)
log_fn(f"ProcessPool mode: {total_points} points, {num_workers} workers, "
f"{building_count} buildings")
# Store terrain tiles in shared memory
shm_blocks = []
terrain_shm_refs = {}
try:
shm_blocks, terrain_shm_refs = _store_terrain_in_shm(terrain_cache, log_fn)
if terrain_shm_refs:
tile_mb = sum(
np.prod(r['shape']) * np.dtype(r['dtype']).itemsize
for r in terrain_shm_refs.values()
) / (1024 * 1024)
log_fn(f"Stored {len(terrain_shm_refs)} terrain tiles in shared memory ({tile_mb:.0f} MB)")
use_shm = True
else:
use_shm = False
except Exception as e:
log_fn(f"Shared memory setup failed ({e}), using pickle fallback")
use_shm = False
items = [
(lat, lon, point_elevations.get((lat, lon), 0.0))
for lat, lon in grid
]
# Larger chunks than Ray — fewer workers means bigger chunks
chunk_size = max(1, len(items) // (num_workers * 2))
chunks = [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]
log_fn(f"Submitting {len(chunks)} chunks of ~{chunk_size} points")
cache_key = f"{site_dict['lat']:.4f},{site_dict['lon']:.4f},{len(buildings)}"
config = {
'site_dict': site_dict,
'settings_dict': settings_dict,
'site_elevation': site_elevation,
'cache_key': cache_key,
}
if precomputed:
config['precomputed'] = precomputed
@@ -505,20 +677,32 @@ def _calculate_with_process_pool(
pool = None
try:
# Use spawn context for clean worker processes
ctx = mp.get_context('spawn')
pool = ProcessPoolExecutor(max_workers=num_workers, mp_context=ctx)
futures = {
pool.submit(
_pool_worker_process_chunk,
(chunk, terrain_cache, buildings, osm_data, config),
): i
for i, chunk in enumerate(chunks)
}
_set_active_pool(pool)
if use_shm:
# Shared memory path: pass shm refs instead of terrain data
worker_fn = _pool_worker_shm_chunk
futures = {
pool.submit(
worker_fn,
(chunk, terrain_shm_refs, buildings, osm_data, config),
): i
for i, chunk in enumerate(chunks)
}
else:
# Pickle fallback path
futures = {
pool.submit(
_pool_worker_process_chunk,
(chunk, terrain_cache, buildings, osm_data, config),
): i
for i, chunk in enumerate(chunks)
}
completed_chunks = 0
for future in as_completed(futures):
# Check cancellation between chunks
if cancel_token and cancel_token.is_cancelled:
log_fn(f"Cancelled — cancelling {len(futures) - completed_chunks - 1} pending futures")
for f in futures:
@@ -539,20 +723,27 @@ def _calculate_with_process_pool(
eta = (total_points - pts) / rate if rate > 0 else 0
log_fn(f"Progress: {completed_chunks}/{len(chunks)} chunks ({pct}%) — "
f"{pts} pts, {rate:.0f} pts/s, ETA {eta:.0f}s")
if progress_fn:
progress_fn("Calculating coverage", 0.40 + 0.55 * (completed_chunks / len(chunks)))
except Exception as e:
log_fn(f"ProcessPool error: {e}")
finally:
# CRITICAL: Always cleanup pool and orphaned workers
_clear_active_pool()
if pool:
pool.shutdown(wait=False, cancel_futures=True)
# Give pool time to cleanup gracefully
time.sleep(0.5)
# Then force kill any survivors by process name
killed = _kill_worker_processes()
if killed > 0:
log_fn(f"Force killed {killed} orphaned workers")
# Cleanup shared memory blocks
for block in shm_blocks:
try:
block.close()
block.unlink()
except Exception:
pass
calc_time = time.time() - t_calc
log_fn(f"ProcessPool done: {calc_time:.1f}s, {len(all_results)} results "
@@ -561,7 +752,7 @@ def _calculate_with_process_pool(
timing = {
"parallel_total": calc_time,
"workers": num_workers,
"backend": "process_pool",
"backend": "process_pool" + ("/shm" if use_shm else "/pickle"),
}
return all_results, timing
@@ -573,6 +764,7 @@ def _calculate_sequential(
grid, point_elevations, site_dict, settings_dict,
buildings, streets, water_bodies, vegetation_areas,
site_elevation, log_fn, cancel_token=None, precomputed=None,
progress_fn=None,
):
"""Sequential fallback — no extra dependencies, runs in calling thread."""
from app.services.coverage_service import CoverageService, SiteParams, CoverageSettings
@@ -582,8 +774,9 @@ def _calculate_sequential(
settings = CoverageSettings(**settings_dict)
svc = CoverageService()
spatial_idx = SpatialIndex()
spatial_idx = None
if buildings:
spatial_idx = SpatialIndex()
spatial_idx.build(buildings)
total = len(grid)
@@ -604,6 +797,8 @@ def _calculate_sequential(
if i % log_interval == 0:
log_fn(f"Sequential: {i}/{total} ({i * 100 // total}%)")
if progress_fn:
progress_fn("Calculating coverage", 0.40 + 0.55 * (i / total))
point_elev = point_elevations.get((lat, lon), 0.0)