feat(subagents): make subagent timeout configurable via config.yaml (#897)

* feat(subagents): make subagent timeout configurable via config.yaml

  - Add SubagentsAppConfig supporting global and per-agent timeout_seconds
  - Load subagents config section in AppConfig.from_file()
  - Registry now applies config.yaml overrides without mutating builtin defaults
  - Polling safety-net in task_tool is now dynamic (execution timeout + 60s buffer)
  - Document subagents section in config.example.yaml
  - Add make test command and enforce TDD policy in CLAUDE.md
  - Add 38 unit tests covering config validation, timeout resolution, registry override behavior, and polling timeout formula

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* feat(subagents): add logging for subagent timeout config and execution

  - Log loaded timeout config (global default + per-agent overrides) on startup
  - Log debug message in registry when config.yaml overrides a builtin timeout
  - Include timeout in executor's async execution start log
  - Log effective timeout and polling limit when a task is dispatched
  - Fix UnboundLocalError: move max_poll_count assignment before logger.info

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* ci(backend): add lint step and run all unit tests via Makefile

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix lint

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-23 22:24:46 +08:00 · 2026-02-25 08:39:29 +08:00
parent 310d54e443
commit faa422072c
17 changed files with 554 additions and 40 deletions
--- a/backend/src/agents/lead_agent/agent.py
+++ b/backend/src/agents/lead_agent/agent.py
@@ -245,17 +245,19 @@ def make_lead_agent(config: RunnableConfig):
    subagent_enabled = config.get("configurable", {}).get("subagent_enabled", False)
    max_concurrent_subagents = config.get("configurable", {}).get("max_concurrent_subagents", 3)
    print(f"thinking_enabled: {thinking_enabled}, model_name: {model_name}, is_plan_mode: {is_plan_mode}, subagent_enabled: {subagent_enabled}, max_concurrent_subagents: {max_concurrent_subagents}")
-    
+
    # Inject run metadata for LangSmith trace tagging
    if "metadata" not in config:
        config["metadata"] = {}
-    config["metadata"].update({
-        "model_name": model_name or "default",
-        "thinking_enabled": thinking_enabled,
-        "is_plan_mode": is_plan_mode,
-        "subagent_enabled": subagent_enabled,
-    })
-    
+    config["metadata"].update(
+        {
+            "model_name": model_name or "default",
+            "thinking_enabled": thinking_enabled,
+            "is_plan_mode": is_plan_mode,
+            "subagent_enabled": subagent_enabled,
+        }
+    )
+
    return create_agent(
        model=create_chat_model(name=model_name, thinking_enabled=thinking_enabled),
        tools=get_available_tools(model_name=model_name, subagent_enabled=subagent_enabled),
--- a/backend/src/community/aio_sandbox/remote_backend.py
+++ b/backend/src/community/aio_sandbox/remote_backend.py
@@ -18,7 +18,6 @@ Architecture:
 from __future__ import annotations

 import logging
-import os

 import requests

--- a/backend/src/config/app_config.py
+++ b/backend/src/config/app_config.py
@@ -11,6 +11,7 @@ from src.config.memory_config import load_memory_config_from_dict
 from src.config.model_config import ModelConfig
 from src.config.sandbox_config import SandboxConfig
 from src.config.skills_config import SkillsConfig
+from src.config.subagents_config import load_subagents_config_from_dict
 from src.config.summarization_config import load_summarization_config_from_dict
 from src.config.title_config import load_title_config_from_dict
 from src.config.tool_config import ToolConfig, ToolGroupConfig
@@ -87,6 +88,10 @@ class AppConfig(BaseModel):
        if "memory" in config_data:
            load_memory_config_from_dict(config_data["memory"])

+        # Load subagents config if present
+        if "subagents" in config_data:
+            load_subagents_config_from_dict(config_data["subagents"])
+
        # Load extensions config separately (it's in a different file)
        extensions_config = ExtensionsConfig.from_file()
        config_data["extensions"] = extensions_config.model_dump()
--- a/backend/src/config/subagents_config.py
+++ b/backend/src/config/subagents_config.py
@@ -0,0 +1,65 @@
+"""Configuration for the subagent system loaded from config.yaml."""
+
+import logging
+
+from pydantic import BaseModel, Field
+
+logger = logging.getLogger(__name__)
+
+
+class SubagentOverrideConfig(BaseModel):
+    """Per-agent configuration overrides."""
+
+    timeout_seconds: int | None = Field(
+        default=None,
+        ge=1,
+        description="Timeout in seconds for this subagent (None = use global default)",
+    )
+
+
+class SubagentsAppConfig(BaseModel):
+    """Configuration for the subagent system."""
+
+    timeout_seconds: int = Field(
+        default=900,
+        ge=1,
+        description="Default timeout in seconds for all subagents (default: 900 = 15 minutes)",
+    )
+    agents: dict[str, SubagentOverrideConfig] = Field(
+        default_factory=dict,
+        description="Per-agent configuration overrides keyed by agent name",
+    )
+
+    def get_timeout_for(self, agent_name: str) -> int:
+        """Get the effective timeout for a specific agent.
+
+        Args:
+            agent_name: The name of the subagent.
+
+        Returns:
+            The timeout in seconds, using per-agent override if set, otherwise global default.
+        """
+        override = self.agents.get(agent_name)
+        if override is not None and override.timeout_seconds is not None:
+            return override.timeout_seconds
+        return self.timeout_seconds
+
+
+_subagents_config: SubagentsAppConfig = SubagentsAppConfig()
+
+
+def get_subagents_app_config() -> SubagentsAppConfig:
+    """Get the current subagents configuration."""
+    return _subagents_config
+
+
+def load_subagents_config_from_dict(config_dict: dict) -> None:
+    """Load subagents configuration from a dictionary."""
+    global _subagents_config
+    _subagents_config = SubagentsAppConfig(**config_dict)
+
+    overrides_summary = {name: f"{override.timeout_seconds}s" for name, override in _subagents_config.agents.items() if override.timeout_seconds is not None}
+    if overrides_summary:
+        logger.info(f"Subagents config loaded: default timeout={_subagents_config.timeout_seconds}s, per-agent overrides={overrides_summary}")
+    else:
+        logger.info(f"Subagents config loaded: default timeout={_subagents_config.timeout_seconds}s, no per-agent overrides")
--- a/backend/src/config/tracing_config.py
+++ b/backend/src/config/tracing_config.py
@@ -1,11 +1,13 @@
 import logging
 import os
-from pydantic import BaseModel, Field
 import threading

+from pydantic import BaseModel, Field
+
 logger = logging.getLogger(__name__)
 _config_lock = threading.Lock()

+
 class TracingConfig(BaseModel):
    """Configuration for LangSmith tracing."""

@@ -41,11 +43,11 @@ def get_tracing_config() -> TracingConfig:
            endpoint=os.environ.get("LANGSMITH_ENDPOINT", "https://api.smith.langchain.com"),
        )
        return _tracing_config
-    
+
+
 def is_tracing_enabled() -> bool:
    """Check if LangSmith tracing is enabled and configured.
    Returns:
        True if tracing is enabled and has an API key.
    """
    return get_tracing_config().is_configured
-
--- a/backend/src/models/factory.py
+++ b/backend/src/models/factory.py
@@ -1,4 +1,5 @@
 import logging
+
 from langchain.chat_models import BaseChatModel

 from src.config import get_app_config, get_tracing_config, is_tracing_enabled
@@ -6,6 +7,7 @@ from src.reflection import resolve_class

 logger = logging.getLogger(__name__)

+
 def create_chat_model(name: str | None = None, thinking_enabled: bool = False, **kwargs) -> BaseChatModel:
    """Create a chat model instance from the config.

@@ -50,9 +52,7 @@ def create_chat_model(name: str | None = None, thinking_enabled: bool = False, *
            )
            existing_callbacks = model_instance.callbacks or []
            model_instance.callbacks = [*existing_callbacks, tracer]
-            logger.debug(
-                f"LangSmith tracing attached to model '{name}' (project='{tracing_config.project}')"
-            )
+            logger.debug(f"LangSmith tracing attached to model '{name}' (project='{tracing_config.project}')")
        except Exception as e:
            logger.warning(f"Failed to attach LangSmith tracing to model '{name}': {e}")
    return model_instance
--- a/backend/src/subagents/executor.py
+++ b/backend/src/subagents/executor.py
@@ -343,7 +343,7 @@ class SubagentExecutor:
            status=SubagentStatus.PENDING,
        )

-        logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} starting async execution, task_id={task_id}")
+        logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} starting async execution, task_id={task_id}, timeout={self.config.timeout_seconds}s")

        with _background_tasks_lock:
            _background_tasks[task_id] = result
--- a/backend/src/subagents/registry.py
+++ b/backend/src/subagents/registry.py
@@ -1,28 +1,46 @@
 """Subagent registry for managing available subagents."""

+import logging
+from dataclasses import replace
+
 from src.subagents.builtins import BUILTIN_SUBAGENTS
 from src.subagents.config import SubagentConfig

+logger = logging.getLogger(__name__)
+

 def get_subagent_config(name: str) -> SubagentConfig | None:
-    """Get a subagent configuration by name.
+    """Get a subagent configuration by name, with config.yaml overrides applied.

    Args:
        name: The name of the subagent.

    Returns:
-        SubagentConfig if found, None otherwise.
+        SubagentConfig if found (with any config.yaml overrides applied), None otherwise.
    """
-    return BUILTIN_SUBAGENTS.get(name)
+    config = BUILTIN_SUBAGENTS.get(name)
+    if config is None:
+        return None
+
+    # Apply timeout override from config.yaml (lazy import to avoid circular deps)
+    from src.config.subagents_config import get_subagents_app_config
+
+    app_config = get_subagents_app_config()
+    effective_timeout = app_config.get_timeout_for(name)
+    if effective_timeout != config.timeout_seconds:
+        logger.debug(f"Subagent '{name}': timeout overridden by config.yaml ({config.timeout_seconds}s -> {effective_timeout}s)")
+        config = replace(config, timeout_seconds=effective_timeout)
+
+    return config


 def list_subagents() -> list[SubagentConfig]:
-    """List all available subagent configurations.
+    """List all available subagent configurations (with config.yaml overrides applied).

    Returns:
        List of all registered SubagentConfig instances.
    """
-    return list(BUILTIN_SUBAGENTS.values())
+    return [get_subagent_config(name) for name in BUILTIN_SUBAGENTS]


 def get_subagent_names() -> list[str]:
--- a/backend/src/tools/builtins/task_tool.py
+++ b/backend/src/tools/builtins/task_tool.py
@@ -115,12 +115,15 @@ def task_tool(
    # Start background execution (always async to prevent blocking)
    # Use tool_call_id as task_id for better traceability
    task_id = executor.execute_async(prompt, task_id=tool_call_id)
-    logger.info(f"[trace={trace_id}] Started background task {task_id}, polling for completion...")

    # Poll for task completion in backend (removes need for LLM to poll)
    poll_count = 0
    last_status = None
    last_message_count = 0  # Track how many AI messages we've already sent
+    # Polling timeout: execution timeout + 60s buffer, checked every 5s
+    max_poll_count = (config.timeout_seconds + 60) // 5
+
+    logger.info(f"[trace={trace_id}] Started background task {task_id} (subagent={subagent_type}, timeout={config.timeout_seconds}s, polling_limit={max_poll_count} polls)")

    writer = get_stream_writer()
    # Send Task Started message'
@@ -176,9 +179,10 @@ def task_tool(
        poll_count += 1

        # Polling timeout as a safety net (in case thread pool timeout doesn't work)
-        # Set to 16 minutes (longer than the default 15-minute thread pool timeout)
+        # Set to execution timeout + 60s buffer, in 5s poll intervals
        # This catches edge cases where the background task gets stuck
-        if poll_count > 192:  # 192 * 5s = 16 minutes
+        if poll_count > max_poll_count:
+            timeout_minutes = config.timeout_seconds // 60
            logger.error(f"[trace={trace_id}] Task {task_id} polling timed out after {poll_count} polls (should have been caught by thread pool timeout)")
            writer({"type": "task_timed_out", "task_id": task_id})
-            return f"Task polling timed out after 16 minutes. This may indicate the background task is stuck. Status: {result.status.value}"
+            return f"Task polling timed out after {timeout_minutes} minutes. This may indicate the background task is stuck. Status: {result.status.value}"