From 19a1d03fc881afc4d78e2f45990bf52e79896e1a Mon Sep 17 00:00:00 2001 From: hetaoBackend Date: Sun, 8 Feb 2026 21:09:18 +0800 Subject: [PATCH] fix: fix sub agent timeout --- backend/src/subagents/config.py | 4 ++-- backend/src/subagents/executor.py | 3 ++- backend/src/tools/builtins/task_tool.py | 16 +++++++++++----- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/backend/src/subagents/config.py b/backend/src/subagents/config.py index cf0394c..8554e7d 100644 --- a/backend/src/subagents/config.py +++ b/backend/src/subagents/config.py @@ -15,7 +15,7 @@ class SubagentConfig: disallowed_tools: Optional list of tool names to deny. model: Model to use - 'inherit' uses parent's model. max_turns: Maximum number of agent turns before stopping. - timeout_seconds: Maximum execution time in seconds (default: 300 = 5 minutes). + timeout_seconds: Maximum execution time in seconds (default: 900 = 15 minutes). """ name: str @@ -25,4 +25,4 @@ class SubagentConfig: disallowed_tools: list[str] | None = field(default_factory=lambda: ["task"]) model: str = "inherit" max_turns: int = 50 - timeout_seconds: int = 300 + timeout_seconds: int = 900 diff --git a/backend/src/subagents/executor.py b/backend/src/subagents/executor.py index 65e5d81..b58aa9a 100644 --- a/backend/src/subagents/executor.py +++ b/backend/src/subagents/executor.py @@ -29,6 +29,7 @@ class SubagentStatus(Enum): RUNNING = "running" COMPLETED = "completed" FAILED = "failed" + TIMED_OUT = "timed_out" @dataclass @@ -332,7 +333,7 @@ class SubagentExecutor: f"[trace={self.trace_id}] Subagent {self.config.name} execution timed out after {self.config.timeout_seconds}s" ) with _background_tasks_lock: - _background_tasks[task_id].status = SubagentStatus.FAILED + _background_tasks[task_id].status = SubagentStatus.TIMED_OUT _background_tasks[task_id].error = f"Execution timed out after {self.config.timeout_seconds} seconds" _background_tasks[task_id].completed_at = datetime.now() # Cancel the future (best effort - may not stop the actual execution) diff --git a/backend/src/tools/builtins/task_tool.py b/backend/src/tools/builtins/task_tool.py index 44a86a8..36bf2ae 100644 --- a/backend/src/tools/builtins/task_tool.py +++ b/backend/src/tools/builtins/task_tool.py @@ -131,7 +131,7 @@ def task_tool( logger.info(f"[trace={trace_id}] Task {task_id} status: {result.status.value}") last_status = result.status - # Check if task completed or failed + # Check if task completed, failed, or timed out if result.status == SubagentStatus.COMPLETED: writer({"type": "task_completed", "task_id": task_id, "result": result.result}) logger.info(f"[trace={trace_id}] Task {task_id} completed after {poll_count} polls") @@ -140,14 +140,20 @@ def task_tool( writer({"type": "task_failed", "task_id": task_id, "error": result.error}) logger.error(f"[trace={trace_id}] Task {task_id} failed: {result.error}") return f"Task failed. Error: {result.error}" + elif result.status == SubagentStatus.TIMED_OUT: + writer({"type": "task_timed_out", "task_id": task_id, "error": result.error}) + logger.warning(f"[trace={trace_id}] Task {task_id} timed out: {result.error}") + return f"Task timed out. Error: {result.error}" # Still running, wait before next poll writer({"type": "task_running", "task_id": task_id, "poll_count": poll_count}) time.sleep(5) # Poll every 5 seconds poll_count += 1 - # Optional: Add timeout protection (e.g., max 5 minutes) - if poll_count > 60: # 60 * 5s = 5 minutes - logger.warning(f"[trace={trace_id}] Task {task_id} timed out after {poll_count} polls") + # Polling timeout as a safety net (in case thread pool timeout doesn't work) + # Set to 16 minutes (longer than the default 15-minute thread pool timeout) + # This catches edge cases where the background task gets stuck + if poll_count > 192: # 192 * 5s = 16 minutes + logger.error(f"[trace={trace_id}] Task {task_id} polling timed out after {poll_count} polls (should have been caught by thread pool timeout)") writer({"type": "task_timed_out", "task_id": task_id}) - return f"Task timed out after 5 minutes. Status: {result.status.value}" + return f"Task polling timed out after 16 minutes. This may indicate the background task is stuck. Status: {result.status.value}"