# rfcp/backend/app/api/routes/system.py

import os
import json
import asyncio
import multiprocessing as mp
from pathlib import Path
from fastapi import APIRouter

# Router that collects the system endpoints declared in this module.
# NOTE(review): where and under which prefix it is mounted is not visible here.
router = APIRouter()

# Valid SRTM tile sizes (bytes): side * side samples, 2 bytes per sample.
# The /diagnostics endpoint accepts a .hgt file only if its size matches one
# of these values exactly.
_SRTM1_SIZE = 3601 * 3601 * 2  # 25,934,402
_SRTM3_SIZE = 1201 * 1201 * 2  #  2,884,802


@router.get("/info")
async def get_system_info():
    """Return system info: CPU cores, GPU availability, parallel backend.

    Returns:
        A dict with the keys:

        - ``cpu_cores``: detected core count (1 if it cannot be determined).
        - ``parallel_workers``: ``cpu_cores`` capped at 14.
        - ``parallel_backend``: ``"ray"`` when Ray is reported available,
          else ``"process_pool"`` with more than one core, else
          ``"sequential"``.
        - ``ray_available`` / ``ray_initialized``: result of the Ray probe.
        - ``gpu``: whatever ``gpu_service.get_info()`` returns.
        - ``gpu_available``: the ``"available"`` entry of ``gpu``
          (``False`` when absent).
    """
    # multiprocessing.cpu_count() raises NotImplementedError instead of
    # returning None when the count is unknown, so ``or 1`` alone never acts
    # as a fallback. Catch the exception to keep the endpoint working.
    try:
        cpu_cores = mp.cpu_count() or 1
    except NotImplementedError:
        cpu_cores = 1

    # Check Ray. This is a best-effort probe: any failure (service module
    # missing, ray not installed, ray erroring) leaves the flags at whatever
    # was established before the failure.
    ray_available = False
    ray_initialized = False
    try:
        from app.services.parallel_coverage_service import RAY_AVAILABLE
        ray_available = RAY_AVAILABLE
        if ray_available:
            import ray
            ray_initialized = ray.is_initialized()
    except Exception:
        pass

    # Check GPU via gpu_service
    from app.services.gpu_service import gpu_service
    gpu_info = gpu_service.get_info()

    # Determine parallel backend
    if ray_available:
        parallel_backend = "ray"
    elif cpu_cores > 1:
        parallel_backend = "process_pool"
    else:
        parallel_backend = "sequential"

    return {
        "cpu_cores": cpu_cores,
        "parallel_workers": min(cpu_cores, 14),
        "parallel_backend": parallel_backend,
        "ray_available": ray_available,
        "ray_initialized": ray_initialized,
        "gpu": gpu_info,
        "gpu_available": gpu_info.get("available", False),
    }


@router.get("/models")
async def get_propagation_models():
    """List the propagation models the engine reports as available.

    The response is a dict whose single ``"models"`` entry holds the value
    returned by ``engine.get_available_models()``.
    NOTE(review): the shape of that value (e.g. per-model valid ranges) is
    defined by the engine and is not visible from this module.
    """
    # Imported lazily, at call time rather than at module import.
    from app.core.engine import engine

    available_models = engine.get_available_models()
    return {"models": available_models}


@router.post("/shutdown")
async def shutdown():
    """Graceful shutdown endpoint: kill worker processes, then self-terminate.

    Intended sequence: Electron calls this endpoint first, waits briefly,
    then performs a PID-tree kill. The ``os._exit(0)`` scheduled 3 seconds
    after this call is only a safety net for the case where Electron never
    kills this process.

    Returns:
        A dict with ``"status"`` set to ``"shutting down"`` and
        ``"workers_killed"`` set to the value returned by
        ``_kill_worker_processes()``.
    """
    from app.services.parallel_coverage_service import _kill_worker_processes

    workers_killed = _kill_worker_processes()

    # Fallback self-termination. The 3s delay gives Electron time to do its
    # own (preferred) PID-tree kill before this timer fires.
    asyncio.get_running_loop().call_later(3.0, os._exit, 0)

    return {"status": "shutting down", "workers_killed": workers_killed}


def _scan_terrain_tiles(terrain_path):
    """Classify each ``*.hgt`` file directly under *terrain_path* by size.

    Args:
        terrain_path: ``Path`` of the terrain directory; may not exist.

    Returns:
        ``(tiles, errors, total_bytes)`` where *tiles* lists files whose size
        equals ``_SRTM1_SIZE`` or ``_SRTM3_SIZE``, *errors* lists every other
        file, and *total_bytes* sums the sizes that could be read.
    """
    tiles = []
    errors = []
    total_bytes = 0

    if not terrain_path.exists():
        return tiles, errors, total_bytes

    tile_type_by_size = {_SRTM1_SIZE: "SRTM1", _SRTM3_SIZE: "SRTM3"}

    for hgt in sorted(terrain_path.glob("*.hgt")):
        try:
            size = hgt.stat().st_size
        except OSError as e:
            # File vanished or became unreadable between glob() and stat();
            # report it instead of failing the whole diagnostics request.
            errors.append({
                "name": hgt.name,
                "size": 0,
                "error": f"Cannot stat file: {e}",
            })
            continue

        total_bytes += size
        tile_type = tile_type_by_size.get(size)
        if tile_type is not None:
            tiles.append({"name": hgt.name, "type": tile_type, "size": size})
        else:
            errors.append({
                "name": hgt.name,
                "size": size,
                "error": f"Invalid size (expected {_SRTM1_SIZE} or {_SRTM3_SIZE})",
            })

    return tiles, errors, total_bytes


def _scan_osm_cache(osm_dirs):
    """Validate every ``*.json`` file directly under each existing OSM dir.

    A file is valid when it parses as a JSON object that contains a timestamp
    key (``_cached_at`` or ``_ts``) and a payload key (``data`` or ``v``).

    Args:
        osm_dirs: iterable of ``Path`` directories; missing ones are skipped.

    Returns:
        ``(files, errors, total_bytes)`` with one entry per inspected file in
        either *files* (valid) or *errors* (with an ``"error"`` message).
    """
    files = []
    errors = []
    total_bytes = 0

    for osm_dir in osm_dirs:
        if not osm_dir.exists():
            continue
        category = osm_dir.name

        for jf in sorted(osm_dir.glob("*.json")):
            fsize = 0
            try:
                fsize = jf.stat().st_size
                total_bytes += fsize
                # NOTE(review): read with the platform default encoding, as
                # before; confirm which encoding the cache writer uses.
                data = json.loads(jf.read_text())
            except json.JSONDecodeError as e:
                error = f"Invalid JSON: {e}"
            except UnicodeDecodeError as e:
                # Not a JSONDecodeError, so it used to escape as a 500.
                error = f"Cannot decode file: {e}"
            except OSError as e:
                error = f"Cannot read file: {e}"
            else:
                if not isinstance(data, dict):
                    # Scalars used to raise TypeError on the ``in`` checks
                    # below; lists/strings were checked by membership.
                    error = "Top-level JSON value is not an object"
                else:
                    has_timestamp = '_cached_at' in data or '_ts' in data
                    has_data = 'data' in data or 'v' in data
                    if has_timestamp and has_data:
                        files.append({
                            "name": jf.name,
                            "category": category,
                            "size": fsize,
                            "valid": True,
                        })
                        continue
                    error = "Missing expected keys (_cached_at/data or _ts/v)"

            errors.append({
                "name": jf.name,
                "category": category,
                "size": fsize,
                "error": error,
            })

    return files, errors, total_bytes


@router.get("/diagnostics")
async def get_diagnostics():
    """Validate terrain tiles and OSM cache files.

    Checks:
    - Terrain .hgt files: must be exactly SRTM1 or SRTM3 size
    - OSM cache .json files: must be valid JSON with expected structure
    - Cache manager stats (memory + disk)

    The data root is taken from the ``RFCP_DATA_PATH`` environment variable
    (default ``./data``). File scanning runs in the default executor so that
    reading and parsing cache files does not block the event loop.

    Returns:
        A dict with ``data_path``, a ``terrain`` section, an ``osm_cache``
        section and ``cache_manager`` (``None`` when the cache manager
        cannot be imported or queried).
    """
    data_path = Path(os.environ.get('RFCP_DATA_PATH', './data'))
    terrain_path = data_path / 'terrain'
    osm_dirs = [
        data_path / 'osm' / 'buildings',
        data_path / 'osm' / 'streets',
        data_path / 'osm' / 'vegetation',
        data_path / 'osm' / 'water',
    ]

    loop = asyncio.get_running_loop()

    # --- Terrain tiles ---
    terrain_tiles, terrain_errors, total_terrain_bytes = (
        await loop.run_in_executor(None, _scan_terrain_tiles, terrain_path)
    )

    # --- OSM cache ---
    osm_files, osm_errors, total_osm_bytes = (
        await loop.run_in_executor(None, _scan_osm_cache, osm_dirs)
    )

    # --- Cache manager stats ---
    # Best-effort: diagnostics must still be returned without the stats.
    try:
        from app.services.cache import cache_manager
        cache_stats = cache_manager.stats()
    except Exception:
        cache_stats = None

    return {
        "data_path": str(data_path),
        "terrain": {
            "path": str(terrain_path),
            "exists": terrain_path.exists(),
            "tile_count": len(terrain_tiles),
            "error_count": len(terrain_errors),
            "total_mb": round(total_terrain_bytes / (1024 * 1024), 1),
            "tiles": terrain_tiles,
            "errors": terrain_errors,
        },
        "osm_cache": {
            "valid_count": len(osm_files),
            "error_count": len(osm_errors),
            "total_mb": round(total_osm_bytes / (1024 * 1024), 1),
            "files": osm_files,
            "errors": osm_errors,
        },
        "cache_manager": cache_stats,
    }