feat(subagents): make subagent timeout configurable via config.yaml (#897)

* feat(subagents): make subagent timeout configurable via config.yaml

- Add SubagentsAppConfig supporting global and per-agent timeout_seconds
- Load subagents config section in AppConfig.from_file()
- Registry now applies config.yaml overrides without mutating builtin defaults
- Polling safety-net in task_tool is now dynamic (execution timeout + 60s buffer)
- Document subagents section in config.example.yaml
- Add make test command and enforce TDD policy in CLAUDE.md
- Add 38 unit tests covering config validation, timeout resolution, registry override behavior, and polling timeout formula

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* feat(subagents): add logging for subagent timeout config and execution

- Log loaded timeout config (global default + per-agent overrides) on startup
- Log debug message in registry when config.yaml overrides a builtin timeout
- Include timeout in executor's async execution start log
- Log effective timeout and polling limit when a task is dispatched
- Fix UnboundLocalError: move max_poll_count assignment before logger.info

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* ci(backend): add lint step and run all unit tests via Makefile

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix lint

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-26 07:14:47 +08:00 · 2026-02-25 08:39:29 +08:00
parent 310d54e443
commit faa422072c
17 changed files with 554 additions and 40 deletions
--- a/backend/src/tools/builtins/task_tool.py
+++ b/backend/src/tools/builtins/task_tool.py
@@ -115,12 +115,15 @@ def task_tool(
    # Start background execution (always async to prevent blocking)
    # Use tool_call_id as task_id for better traceability
    task_id = executor.execute_async(prompt, task_id=tool_call_id)
-    logger.info(f"[trace={trace_id}] Started background task {task_id}, polling for completion...")

    # Poll for task completion in backend (removes need for LLM to poll)
    poll_count = 0
    last_status = None
    last_message_count = 0  # Track how many AI messages we've already sent
+    # Polling timeout: execution timeout + 60s buffer, checked every 5s
+    max_poll_count = (config.timeout_seconds + 60) // 5
+
+    logger.info(f"[trace={trace_id}] Started background task {task_id} (subagent={subagent_type}, timeout={config.timeout_seconds}s, polling_limit={max_poll_count} polls)")

    writer = get_stream_writer()
    # Send Task Started message'
@@ -176,9 +179,10 @@ def task_tool(
        poll_count += 1

        # Polling timeout as a safety net (in case thread pool timeout doesn't work)
-        # Set to 16 minutes (longer than the default 15-minute thread pool timeout)
+        # Set to execution timeout + 60s buffer, in 5s poll intervals
        # This catches edge cases where the background task gets stuck
-        if poll_count > 192:  # 192 * 5s = 16 minutes
+        if poll_count > max_poll_count:
+            timeout_minutes = config.timeout_seconds // 60
            logger.error(f"[trace={trace_id}] Task {task_id} polling timed out after {poll_count} polls (should have been caught by thread pool timeout)")
            writer({"type": "task_timed_out", "task_id": task_id})
-            return f"Task polling timed out after 16 minutes. This may indicate the background task is stuck. Status: {result.status.value}"
+            return f"Task polling timed out after {timeout_minutes} minutes. This may indicate the background task is stuck. Status: {result.status.value}"