fix: fix sub agent timeout

This commit is contained in:
hetaoBackend
2026-02-08 21:09:18 +08:00
parent 4f9150229c
commit 19a1d03fc8
3 changed files with 15 additions and 8 deletions

View File

@@ -15,7 +15,7 @@ class SubagentConfig:
disallowed_tools: Optional list of tool names to deny.
model: Model to use - 'inherit' uses parent's model.
max_turns: Maximum number of agent turns before stopping.
timeout_seconds: Maximum execution time in seconds (default: 300 = 5 minutes).
timeout_seconds: Maximum execution time in seconds (default: 900 = 15 minutes).
"""
name: str
@@ -25,4 +25,4 @@ class SubagentConfig:
disallowed_tools: list[str] | None = field(default_factory=lambda: ["task"])
model: str = "inherit"
max_turns: int = 50
timeout_seconds: int = 300
timeout_seconds: int = 900

View File

@@ -29,6 +29,7 @@ class SubagentStatus(Enum):
RUNNING = "running"
COMPLETED = "completed"
FAILED = "failed"
TIMED_OUT = "timed_out"
@dataclass
@@ -332,7 +333,7 @@ class SubagentExecutor:
f"[trace={self.trace_id}] Subagent {self.config.name} execution timed out after {self.config.timeout_seconds}s"
)
with _background_tasks_lock:
_background_tasks[task_id].status = SubagentStatus.FAILED
_background_tasks[task_id].status = SubagentStatus.TIMED_OUT
_background_tasks[task_id].error = f"Execution timed out after {self.config.timeout_seconds} seconds"
_background_tasks[task_id].completed_at = datetime.now()
# Cancel the future (best effort - may not stop the actual execution)

View File

@@ -131,7 +131,7 @@ def task_tool(
logger.info(f"[trace={trace_id}] Task {task_id} status: {result.status.value}")
last_status = result.status
# Check if task completed or failed
# Check if task completed, failed, or timed out
if result.status == SubagentStatus.COMPLETED:
writer({"type": "task_completed", "task_id": task_id, "result": result.result})
logger.info(f"[trace={trace_id}] Task {task_id} completed after {poll_count} polls")
@@ -140,14 +140,20 @@ def task_tool(
writer({"type": "task_failed", "task_id": task_id, "error": result.error})
logger.error(f"[trace={trace_id}] Task {task_id} failed: {result.error}")
return f"Task failed. Error: {result.error}"
elif result.status == SubagentStatus.TIMED_OUT:
writer({"type": "task_timed_out", "task_id": task_id, "error": result.error})
logger.warning(f"[trace={trace_id}] Task {task_id} timed out: {result.error}")
return f"Task timed out. Error: {result.error}"
# Still running, wait before next poll
writer({"type": "task_running", "task_id": task_id, "poll_count": poll_count})
time.sleep(5) # Poll every 5 seconds
poll_count += 1
# Optional: Add timeout protection (e.g., max 5 minutes)
if poll_count > 60: # 60 * 5s = 5 minutes
logger.warning(f"[trace={trace_id}] Task {task_id} timed out after {poll_count} polls")
# Polling timeout as a safety net (in case thread pool timeout doesn't work)
# Set to 16 minutes (longer than the default 15-minute thread pool timeout)
# This catches edge cases where the background task gets stuck
if poll_count > 192: # 192 * 5s = 16 minutes
logger.error(f"[trace={trace_id}] Task {task_id} polling timed out after {poll_count} polls (should have been caught by thread pool timeout)")
writer({"type": "task_timed_out", "task_id": task_id})
return f"Task timed out after 5 minutes. Status: {result.status.value}"
return f"Task polling timed out after 16 minutes. This may indicate the background task is stuck. Status: {result.status.value}"