feat(subagents): make subagent timeout configurable via config.yaml (#897)

* feat(subagents): make subagent timeout configurable via config.yaml

- Add SubagentsAppConfig supporting global and per-agent timeout_seconds
- Load subagents config section in AppConfig.from_file()
- Registry now applies config.yaml overrides without mutating builtin defaults
- Polling safety-net in task_tool is now dynamic (execution timeout + 60s buffer)
- Document subagents section in config.example.yaml
- Add make test command and enforce TDD policy in CLAUDE.md
- Add 38 unit tests covering config validation, timeout resolution, registry
  override behavior, and polling timeout formula

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* feat(subagents): add logging for subagent timeout config and execution

- Log loaded timeout config (global default + per-agent overrides) on startup
- Log debug message in registry when config.yaml overrides a builtin timeout
- Include timeout in executor's async execution start log
- Log effective timeout and polling limit when a task is dispatched
- Fix UnboundLocalError: move max_poll_count assignment before logger.info

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* ci(backend): add lint step and run all unit tests via Makefile

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix lint

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
DanielWalnut
2026-02-25 08:39:29 +08:00
committed by GitHub
parent 310d54e443
commit faa422072c
17 changed files with 554 additions and 40 deletions

View File

@@ -245,17 +245,19 @@ def make_lead_agent(config: RunnableConfig):
subagent_enabled = config.get("configurable", {}).get("subagent_enabled", False)
max_concurrent_subagents = config.get("configurable", {}).get("max_concurrent_subagents", 3)
print(f"thinking_enabled: {thinking_enabled}, model_name: {model_name}, is_plan_mode: {is_plan_mode}, subagent_enabled: {subagent_enabled}, max_concurrent_subagents: {max_concurrent_subagents}")
# Inject run metadata for LangSmith trace tagging
if "metadata" not in config:
config["metadata"] = {}
config["metadata"].update({
"model_name": model_name or "default",
"thinking_enabled": thinking_enabled,
"is_plan_mode": is_plan_mode,
"subagent_enabled": subagent_enabled,
})
config["metadata"].update(
{
"model_name": model_name or "default",
"thinking_enabled": thinking_enabled,
"is_plan_mode": is_plan_mode,
"subagent_enabled": subagent_enabled,
}
)
return create_agent(
model=create_chat_model(name=model_name, thinking_enabled=thinking_enabled),
tools=get_available_tools(model_name=model_name, subagent_enabled=subagent_enabled),

View File

@@ -18,7 +18,6 @@ Architecture:
from __future__ import annotations
import logging
import os
import requests

View File

@@ -11,6 +11,7 @@ from src.config.memory_config import load_memory_config_from_dict
from src.config.model_config import ModelConfig
from src.config.sandbox_config import SandboxConfig
from src.config.skills_config import SkillsConfig
from src.config.subagents_config import load_subagents_config_from_dict
from src.config.summarization_config import load_summarization_config_from_dict
from src.config.title_config import load_title_config_from_dict
from src.config.tool_config import ToolConfig, ToolGroupConfig
@@ -87,6 +88,10 @@ class AppConfig(BaseModel):
if "memory" in config_data:
load_memory_config_from_dict(config_data["memory"])
# Load subagents config if present
if "subagents" in config_data:
load_subagents_config_from_dict(config_data["subagents"])
# Load extensions config separately (it's in a different file)
extensions_config = ExtensionsConfig.from_file()
config_data["extensions"] = extensions_config.model_dump()

View File

@@ -0,0 +1,65 @@
"""Configuration for the subagent system loaded from config.yaml."""
import logging
from pydantic import BaseModel, Field
logger = logging.getLogger(__name__)
class SubagentOverrideConfig(BaseModel):
    """Settings that override the global subagent configuration for one agent.

    Every field is optional; leaving a field as ``None`` means this agent
    does not override that setting.
    """

    # Validated to be >= 1 when given. None means "no override for this agent".
    timeout_seconds: int | None = Field(
        description="Timeout in seconds for this subagent (None = use global default)",
        ge=1,
        default=None,
    )
class SubagentsAppConfig(BaseModel):
    """Subagent system settings: a global timeout plus per-agent overrides."""

    # Fallback timeout for any agent without its own override; validated >= 1.
    timeout_seconds: int = Field(
        description="Default timeout in seconds for all subagents (default: 900 = 15 minutes)",
        ge=1,
        default=900,
    )
    # Agent name -> override settings; empty when nothing is overridden.
    agents: dict[str, SubagentOverrideConfig] = Field(
        description="Per-agent configuration overrides keyed by agent name",
        default_factory=dict,
    )

    def get_timeout_for(self, agent_name: str) -> int:
        """Resolve the timeout that applies to ``agent_name``.

        Args:
            agent_name: The name of the subagent.

        Returns:
            The agent's own ``timeout_seconds`` when an entry exists in
            ``agents`` and sets one; otherwise the global ``timeout_seconds``.
        """
        agent_override = self.agents.get(agent_name)
        # No entry, or an entry that leaves the timeout unset: use the global value.
        if agent_override is None or agent_override.timeout_seconds is None:
            return self.timeout_seconds
        return agent_override.timeout_seconds
# Module-level holder for the active subagents settings. It starts out with
# the model defaults and is rebound by load_subagents_config_from_dict().
_subagents_config: SubagentsAppConfig = SubagentsAppConfig()


def get_subagents_app_config() -> SubagentsAppConfig:
    """Return the module-level subagents configuration currently in effect."""
    return _subagents_config
def load_subagents_config_from_dict(config_dict: dict | None) -> None:
    """Load subagents configuration from a dictionary.

    Validates the values into a ``SubagentsAppConfig``, rebinds the
    module-level ``_subagents_config`` to it, and logs the resulting default
    timeout together with any per-agent timeout overrides.

    Args:
        config_dict: Field values for ``SubagentsAppConfig``. ``None`` or an
            empty dict selects the model defaults.

    Raises:
        pydantic.ValidationError: If a value fails model validation
            (for example a timeout below 1).
    """
    global _subagents_config

    # A `subagents:` key with no children parses to None in YAML; unpacking
    # None with ** would raise TypeError, so treat it as "use all defaults".
    _subagents_config = SubagentsAppConfig(**(config_dict or {}))

    default_timeout = _subagents_config.timeout_seconds
    # Only agents that actually set a timeout are worth reporting.
    overrides_summary = {
        name: f"{override.timeout_seconds}s"
        for name, override in _subagents_config.agents.items()
        if override.timeout_seconds is not None
    }
    if overrides_summary:
        logger.info(f"Subagents config loaded: default timeout={default_timeout}s, per-agent overrides={overrides_summary}")
    else:
        logger.info(f"Subagents config loaded: default timeout={default_timeout}s, no per-agent overrides")

View File

@@ -1,11 +1,13 @@
import logging
import os
from pydantic import BaseModel, Field
import threading
from pydantic import BaseModel, Field
logger = logging.getLogger(__name__)
_config_lock = threading.Lock()
class TracingConfig(BaseModel):
"""Configuration for LangSmith tracing."""
@@ -41,11 +43,11 @@ def get_tracing_config() -> TracingConfig:
endpoint=os.environ.get("LANGSMITH_ENDPOINT", "https://api.smith.langchain.com"),
)
return _tracing_config
def is_tracing_enabled() -> bool:
"""Check if LangSmith tracing is enabled and configured.
Returns:
True if tracing is enabled and has an API key.
"""
return get_tracing_config().is_configured

View File

@@ -1,4 +1,5 @@
import logging
from langchain.chat_models import BaseChatModel
from src.config import get_app_config, get_tracing_config, is_tracing_enabled
@@ -6,6 +7,7 @@ from src.reflection import resolve_class
logger = logging.getLogger(__name__)
def create_chat_model(name: str | None = None, thinking_enabled: bool = False, **kwargs) -> BaseChatModel:
"""Create a chat model instance from the config.
@@ -50,9 +52,7 @@ def create_chat_model(name: str | None = None, thinking_enabled: bool = False, *
)
existing_callbacks = model_instance.callbacks or []
model_instance.callbacks = [*existing_callbacks, tracer]
logger.debug(
f"LangSmith tracing attached to model '{name}' (project='{tracing_config.project}')"
)
logger.debug(f"LangSmith tracing attached to model '{name}' (project='{tracing_config.project}')")
except Exception as e:
logger.warning(f"Failed to attach LangSmith tracing to model '{name}': {e}")
return model_instance

View File

@@ -343,7 +343,7 @@ class SubagentExecutor:
status=SubagentStatus.PENDING,
)
logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} starting async execution, task_id={task_id}")
logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} starting async execution, task_id={task_id}, timeout={self.config.timeout_seconds}s")
with _background_tasks_lock:
_background_tasks[task_id] = result

View File

@@ -1,28 +1,46 @@
"""Subagent registry for managing available subagents."""
import logging
from dataclasses import replace
from src.subagents.builtins import BUILTIN_SUBAGENTS
from src.subagents.config import SubagentConfig
logger = logging.getLogger(__name__)
def get_subagent_config(name: str) -> SubagentConfig | None:
"""Get a subagent configuration by name.
"""Get a subagent configuration by name, with config.yaml overrides applied.
Args:
name: The name of the subagent.
Returns:
SubagentConfig if found, None otherwise.
SubagentConfig if found (with any config.yaml overrides applied), None otherwise.
"""
return BUILTIN_SUBAGENTS.get(name)
config = BUILTIN_SUBAGENTS.get(name)
if config is None:
return None
# Apply timeout override from config.yaml (lazy import to avoid circular deps)
from src.config.subagents_config import get_subagents_app_config
app_config = get_subagents_app_config()
effective_timeout = app_config.get_timeout_for(name)
if effective_timeout != config.timeout_seconds:
logger.debug(f"Subagent '{name}': timeout overridden by config.yaml ({config.timeout_seconds}s -> {effective_timeout}s)")
config = replace(config, timeout_seconds=effective_timeout)
return config
def list_subagents() -> list[SubagentConfig]:
"""List all available subagent configurations.
"""List all available subagent configurations (with config.yaml overrides applied).
Returns:
List of all registered SubagentConfig instances.
"""
return list(BUILTIN_SUBAGENTS.values())
return [get_subagent_config(name) for name in BUILTIN_SUBAGENTS]
def get_subagent_names() -> list[str]:

View File

@@ -115,12 +115,15 @@ def task_tool(
# Start background execution (always async to prevent blocking)
# Use tool_call_id as task_id for better traceability
task_id = executor.execute_async(prompt, task_id=tool_call_id)
logger.info(f"[trace={trace_id}] Started background task {task_id}, polling for completion...")
# Poll for task completion in backend (removes need for LLM to poll)
poll_count = 0
last_status = None
last_message_count = 0 # Track how many AI messages we've already sent
# Polling timeout: execution timeout + 60s buffer, checked every 5s
max_poll_count = (config.timeout_seconds + 60) // 5
logger.info(f"[trace={trace_id}] Started background task {task_id} (subagent={subagent_type}, timeout={config.timeout_seconds}s, polling_limit={max_poll_count} polls)")
writer = get_stream_writer()
# Send Task Started message
@@ -176,9 +179,10 @@ def task_tool(
poll_count += 1
# Polling timeout as a safety net (in case thread pool timeout doesn't work)
# Set to 16 minutes (longer than the default 15-minute thread pool timeout)
# Set to execution timeout + 60s buffer, in 5s poll intervals
# This catches edge cases where the background task gets stuck
if poll_count > 192: # 192 * 5s = 16 minutes
if poll_count > max_poll_count:
timeout_minutes = config.timeout_seconds // 60
logger.error(f"[trace={trace_id}] Task {task_id} polling timed out after {poll_count} polls (should have been caught by thread pool timeout)")
writer({"type": "task_timed_out", "task_id": task_id})
return f"Task polling timed out after 16 minutes. This may indicate the background task is stuck. Status: {result.status.value}"
return f"Task polling timed out after {timeout_minutes} minutes. This may indicate the background task is stuck. Status: {result.status.value}"