@mytec: iter2.5 vectorization start
This commit is contained in:
@@ -23,12 +23,12 @@ Usage:
|
||||
|
||||
import os
|
||||
import sys
|
||||
import subprocess
|
||||
import time
|
||||
import threading
|
||||
import multiprocessing as mp
|
||||
from typing import List, Dict, Tuple, Any, Optional, Callable
|
||||
import numpy as np
|
||||
import psutil
|
||||
|
||||
|
||||
# ── Cancellation token ──
|
||||
@@ -49,42 +49,77 @@ class CancellationToken:
|
||||
|
||||
# ── Worker process cleanup ──
|
||||
|
||||
def _clog(msg: str) -> None:
    """Print ``msg`` to stdout with a ``[PARALLEL]`` prefix.

    The line is flushed immediately so it is not held back by stdout
    buffering.
    """
    print(f"[PARALLEL] {msg}", flush=True)
|
||||
|
||||
|
||||
def _kill_worker_processes() -> int:
    """Force-kill every ``rfcp-server`` process except the calling one.

    Processes are located by name (``tasklist`` on Windows, ``pgrep -f``
    elsewhere) instead of by walking this process's child tree. The
    original author's stated rationale was that ``psutil``'s ``children()``
    did not see the workers started through ``ProcessPoolExecutor``.

    This is best-effort cleanup: failures are logged through ``_clog`` and
    never raised to the caller.

    Returns:
        The number of processes killed. On Windows a process is counted
        only when ``taskkill`` exits with status 0 for it; processes removed
        by the Windows fallback path are not counted.
    """
    my_pid = os.getpid()
    if sys.platform == 'win32':
        return _kill_workers_windows(my_pid)
    return _kill_workers_posix(my_pid)


def _kill_workers_windows(my_pid: int) -> int:
    """Kill other ``rfcp-server.exe`` processes via tasklist/taskkill; return the count."""
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import csv

    killed_count = 0
    try:
        # List all rfcp-server.exe processes in CSV format, without a header.
        listing = subprocess.run(
            ['tasklist', '/FI', 'IMAGENAME eq rfcp-server.exe', '/FO', 'CSV', '/NH'],
            capture_output=True, text=True, timeout=5,
        )
        # csv.reader strips the quoting; the first two columns are the image
        # name and the PID. Non-matching lines (e.g. tasklist's "no tasks"
        # message) are skipped by the image-name check.
        for row in csv.reader(listing.stdout.splitlines()):
            if len(row) < 2 or 'rfcp-server.exe' not in row[0].lower():
                continue
            try:
                pid = int(row[1].strip())
            except ValueError:
                continue
            if pid == my_pid:
                continue
            try:
                kill = subprocess.run(
                    ['taskkill', '/F', '/PID', str(pid)],
                    capture_output=True, timeout=5,
                )
            except subprocess.TimeoutExpired:
                continue
            # Only report a kill that taskkill actually confirmed.
            if kill.returncode == 0:
                killed_count += 1
                _clog(f"Killed worker PID {pid}")
    except Exception as e:
        _clog(f"Kill workers error: {e}")
        # Fallback: kill rfcp-server.exe by image name, but filter out our
        # own PID so the cleanup does not terminate the calling process.
        # NOTE(review): /T also ends descendants of every matched process;
        # if this process was itself started by another rfcp-server.exe it
        # would still be taken down with its parent - confirm whether that
        # can happen in deployment.
        try:
            subprocess.run(
                ['taskkill', '/F', '/IM', 'rfcp-server.exe', '/T',
                 '/FI', f'PID ne {my_pid}'],
                capture_output=True, timeout=5,
            )
        except Exception as fallback_error:
            _clog(f"Kill workers fallback error: {fallback_error}")
    return killed_count


def _kill_workers_posix(my_pid: int) -> int:
    """Kill other processes whose command line matches ``rfcp-server``; return the count."""
    # Function-scope import so this block does not depend on the file's
    # top-level import list. SIGKILL is only looked up on this non-Windows path.
    import signal

    killed_count = 0
    try:
        # Unix: pgrep -f matches against the full command line and prints
        # one PID per line.
        listing = subprocess.run(
            ['pgrep', '-f', 'rfcp-server'],
            capture_output=True, text=True, timeout=5,
        )
        for pid_str in listing.stdout.split():
            try:
                pid = int(pid_str)
            except ValueError:
                continue
            if pid == my_pid:
                continue
            try:
                os.kill(pid, signal.SIGKILL)
            except (ProcessLookupError, PermissionError):
                # Already gone, or not ours to kill: skip without counting.
                continue
            killed_count += 1
            _clog(f"Killed worker PID {pid}")
    except Exception as e:
        _clog(f"Kill workers error: {e}")
    return killed_count
|
||||
|
||||
|
||||
# ── Try to import Ray ──
|
||||
@@ -470,7 +505,9 @@ def _calculate_with_process_pool(
|
||||
pool = None
|
||||
|
||||
try:
|
||||
pool = ProcessPoolExecutor(max_workers=num_workers)
|
||||
# Use spawn context for clean worker processes
|
||||
ctx = mp.get_context('spawn')
|
||||
pool = ProcessPoolExecutor(max_workers=num_workers, mp_context=ctx)
|
||||
futures = {
|
||||
pool.submit(
|
||||
_pool_worker_process_chunk,
|
||||
@@ -510,9 +547,12 @@ def _calculate_with_process_pool(
|
||||
# CRITICAL: Always clean up pool and orphaned workers
|
||||
if pool:
|
||||
pool.shutdown(wait=False, cancel_futures=True)
|
||||
# Give pool time to clean up gracefully
|
||||
time.sleep(0.5)
|
||||
# Then force kill any survivors by process name
|
||||
killed = _kill_worker_processes()
|
||||
if killed > 0:
|
||||
log_fn(f"Killed {killed} orphaned worker processes")
|
||||
log_fn(f"Force killed {killed} orphaned workers")
|
||||
|
||||
calc_time = time.time() - t_calc
|
||||
log_fn(f"ProcessPool done: {calc_time:.1f}s, {len(all_results)} results "
|
||||
|
||||
Reference in New Issue
Block a user