mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-23 22:24:46 +08:00
feat(subagents): make subagent timeout configurable via config.yaml (#897)
* feat(subagents): make subagent timeout configurable via config.yaml - Add SubagentsAppConfig supporting global and per-agent timeout_seconds - Load subagents config section in AppConfig.from_file() - Registry now applies config.yaml overrides without mutating builtin defaults - Polling safety-net in task_tool is now dynamic (execution timeout + 60s buffer) - Document subagents section in config.example.yaml - Add make test command and enforce TDD policy in CLAUDE.md - Add 38 unit tests covering config validation, timeout resolution, registry override behavior, and polling timeout formula Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * feat(subagents): add logging for subagent timeout config and execution - Log loaded timeout config (global default + per-agent overrides) on startup - Log debug message in registry when config.yaml overrides a builtin timeout - Include timeout in executor's async execution start log - Log effective timeout and polling limit when a task is dispatched - Fix UnboundLocalError: move max_poll_count assignment before logger.info Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * ci(backend): add lint step and run all unit tests via Makefile Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * fix lint --------- Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -245,17 +245,19 @@ def make_lead_agent(config: RunnableConfig):
|
||||
subagent_enabled = config.get("configurable", {}).get("subagent_enabled", False)
|
||||
max_concurrent_subagents = config.get("configurable", {}).get("max_concurrent_subagents", 3)
|
||||
print(f"thinking_enabled: {thinking_enabled}, model_name: {model_name}, is_plan_mode: {is_plan_mode}, subagent_enabled: {subagent_enabled}, max_concurrent_subagents: {max_concurrent_subagents}")
|
||||
|
||||
|
||||
# Inject run metadata for LangSmith trace tagging
|
||||
if "metadata" not in config:
|
||||
config["metadata"] = {}
|
||||
config["metadata"].update({
|
||||
"model_name": model_name or "default",
|
||||
"thinking_enabled": thinking_enabled,
|
||||
"is_plan_mode": is_plan_mode,
|
||||
"subagent_enabled": subagent_enabled,
|
||||
})
|
||||
|
||||
config["metadata"].update(
|
||||
{
|
||||
"model_name": model_name or "default",
|
||||
"thinking_enabled": thinking_enabled,
|
||||
"is_plan_mode": is_plan_mode,
|
||||
"subagent_enabled": subagent_enabled,
|
||||
}
|
||||
)
|
||||
|
||||
return create_agent(
|
||||
model=create_chat_model(name=model_name, thinking_enabled=thinking_enabled),
|
||||
tools=get_available_tools(model_name=model_name, subagent_enabled=subagent_enabled),
|
||||
|
||||
@@ -18,7 +18,6 @@ Architecture:
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@ from src.config.memory_config import load_memory_config_from_dict
|
||||
from src.config.model_config import ModelConfig
|
||||
from src.config.sandbox_config import SandboxConfig
|
||||
from src.config.skills_config import SkillsConfig
|
||||
from src.config.subagents_config import load_subagents_config_from_dict
|
||||
from src.config.summarization_config import load_summarization_config_from_dict
|
||||
from src.config.title_config import load_title_config_from_dict
|
||||
from src.config.tool_config import ToolConfig, ToolGroupConfig
|
||||
@@ -87,6 +88,10 @@ class AppConfig(BaseModel):
|
||||
if "memory" in config_data:
|
||||
load_memory_config_from_dict(config_data["memory"])
|
||||
|
||||
# Load subagents config if present
|
||||
if "subagents" in config_data:
|
||||
load_subagents_config_from_dict(config_data["subagents"])
|
||||
|
||||
# Load extensions config separately (it's in a different file)
|
||||
extensions_config = ExtensionsConfig.from_file()
|
||||
config_data["extensions"] = extensions_config.model_dump()
|
||||
|
||||
65
backend/src/config/subagents_config.py
Normal file
65
backend/src/config/subagents_config.py
Normal file
@@ -0,0 +1,65 @@
|
||||
"""Configuration for the subagent system loaded from config.yaml."""
|
||||
|
||||
import logging
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SubagentOverrideConfig(BaseModel):
|
||||
"""Per-agent configuration overrides."""
|
||||
|
||||
timeout_seconds: int | None = Field(
|
||||
default=None,
|
||||
ge=1,
|
||||
description="Timeout in seconds for this subagent (None = use global default)",
|
||||
)
|
||||
|
||||
|
||||
class SubagentsAppConfig(BaseModel):
|
||||
"""Configuration for the subagent system."""
|
||||
|
||||
timeout_seconds: int = Field(
|
||||
default=900,
|
||||
ge=1,
|
||||
description="Default timeout in seconds for all subagents (default: 900 = 15 minutes)",
|
||||
)
|
||||
agents: dict[str, SubagentOverrideConfig] = Field(
|
||||
default_factory=dict,
|
||||
description="Per-agent configuration overrides keyed by agent name",
|
||||
)
|
||||
|
||||
def get_timeout_for(self, agent_name: str) -> int:
|
||||
"""Get the effective timeout for a specific agent.
|
||||
|
||||
Args:
|
||||
agent_name: The name of the subagent.
|
||||
|
||||
Returns:
|
||||
The timeout in seconds, using per-agent override if set, otherwise global default.
|
||||
"""
|
||||
override = self.agents.get(agent_name)
|
||||
if override is not None and override.timeout_seconds is not None:
|
||||
return override.timeout_seconds
|
||||
return self.timeout_seconds
|
||||
|
||||
|
||||
_subagents_config: SubagentsAppConfig = SubagentsAppConfig()
|
||||
|
||||
|
||||
def get_subagents_app_config() -> SubagentsAppConfig:
|
||||
"""Get the current subagents configuration."""
|
||||
return _subagents_config
|
||||
|
||||
|
||||
def load_subagents_config_from_dict(config_dict: dict) -> None:
|
||||
"""Load subagents configuration from a dictionary."""
|
||||
global _subagents_config
|
||||
_subagents_config = SubagentsAppConfig(**config_dict)
|
||||
|
||||
overrides_summary = {name: f"{override.timeout_seconds}s" for name, override in _subagents_config.agents.items() if override.timeout_seconds is not None}
|
||||
if overrides_summary:
|
||||
logger.info(f"Subagents config loaded: default timeout={_subagents_config.timeout_seconds}s, per-agent overrides={overrides_summary}")
|
||||
else:
|
||||
logger.info(f"Subagents config loaded: default timeout={_subagents_config.timeout_seconds}s, no per-agent overrides")
|
||||
@@ -1,11 +1,13 @@
|
||||
import logging
|
||||
import os
|
||||
from pydantic import BaseModel, Field
|
||||
import threading
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
_config_lock = threading.Lock()
|
||||
|
||||
|
||||
class TracingConfig(BaseModel):
|
||||
"""Configuration for LangSmith tracing."""
|
||||
|
||||
@@ -41,11 +43,11 @@ def get_tracing_config() -> TracingConfig:
|
||||
endpoint=os.environ.get("LANGSMITH_ENDPOINT", "https://api.smith.langchain.com"),
|
||||
)
|
||||
return _tracing_config
|
||||
|
||||
|
||||
|
||||
def is_tracing_enabled() -> bool:
|
||||
"""Check if LangSmith tracing is enabled and configured.
|
||||
Returns:
|
||||
True if tracing is enabled and has an API key.
|
||||
"""
|
||||
return get_tracing_config().is_configured
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import logging
|
||||
|
||||
from langchain.chat_models import BaseChatModel
|
||||
|
||||
from src.config import get_app_config, get_tracing_config, is_tracing_enabled
|
||||
@@ -6,6 +7,7 @@ from src.reflection import resolve_class
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def create_chat_model(name: str | None = None, thinking_enabled: bool = False, **kwargs) -> BaseChatModel:
|
||||
"""Create a chat model instance from the config.
|
||||
|
||||
@@ -50,9 +52,7 @@ def create_chat_model(name: str | None = None, thinking_enabled: bool = False, *
|
||||
)
|
||||
existing_callbacks = model_instance.callbacks or []
|
||||
model_instance.callbacks = [*existing_callbacks, tracer]
|
||||
logger.debug(
|
||||
f"LangSmith tracing attached to model '{name}' (project='{tracing_config.project}')"
|
||||
)
|
||||
logger.debug(f"LangSmith tracing attached to model '{name}' (project='{tracing_config.project}')")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to attach LangSmith tracing to model '{name}': {e}")
|
||||
return model_instance
|
||||
|
||||
@@ -343,7 +343,7 @@ class SubagentExecutor:
|
||||
status=SubagentStatus.PENDING,
|
||||
)
|
||||
|
||||
logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} starting async execution, task_id={task_id}")
|
||||
logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} starting async execution, task_id={task_id}, timeout={self.config.timeout_seconds}s")
|
||||
|
||||
with _background_tasks_lock:
|
||||
_background_tasks[task_id] = result
|
||||
|
||||
@@ -1,28 +1,46 @@
|
||||
"""Subagent registry for managing available subagents."""
|
||||
|
||||
import logging
|
||||
from dataclasses import replace
|
||||
|
||||
from src.subagents.builtins import BUILTIN_SUBAGENTS
|
||||
from src.subagents.config import SubagentConfig
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_subagent_config(name: str) -> SubagentConfig | None:
|
||||
"""Get a subagent configuration by name.
|
||||
"""Get a subagent configuration by name, with config.yaml overrides applied.
|
||||
|
||||
Args:
|
||||
name: The name of the subagent.
|
||||
|
||||
Returns:
|
||||
SubagentConfig if found, None otherwise.
|
||||
SubagentConfig if found (with any config.yaml overrides applied), None otherwise.
|
||||
"""
|
||||
return BUILTIN_SUBAGENTS.get(name)
|
||||
config = BUILTIN_SUBAGENTS.get(name)
|
||||
if config is None:
|
||||
return None
|
||||
|
||||
# Apply timeout override from config.yaml (lazy import to avoid circular deps)
|
||||
from src.config.subagents_config import get_subagents_app_config
|
||||
|
||||
app_config = get_subagents_app_config()
|
||||
effective_timeout = app_config.get_timeout_for(name)
|
||||
if effective_timeout != config.timeout_seconds:
|
||||
logger.debug(f"Subagent '{name}': timeout overridden by config.yaml ({config.timeout_seconds}s -> {effective_timeout}s)")
|
||||
config = replace(config, timeout_seconds=effective_timeout)
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def list_subagents() -> list[SubagentConfig]:
|
||||
"""List all available subagent configurations.
|
||||
"""List all available subagent configurations (with config.yaml overrides applied).
|
||||
|
||||
Returns:
|
||||
List of all registered SubagentConfig instances.
|
||||
"""
|
||||
return list(BUILTIN_SUBAGENTS.values())
|
||||
return [get_subagent_config(name) for name in BUILTIN_SUBAGENTS]
|
||||
|
||||
|
||||
def get_subagent_names() -> list[str]:
|
||||
|
||||
@@ -115,12 +115,15 @@ def task_tool(
|
||||
# Start background execution (always async to prevent blocking)
|
||||
# Use tool_call_id as task_id for better traceability
|
||||
task_id = executor.execute_async(prompt, task_id=tool_call_id)
|
||||
logger.info(f"[trace={trace_id}] Started background task {task_id}, polling for completion...")
|
||||
|
||||
# Poll for task completion in backend (removes need for LLM to poll)
|
||||
poll_count = 0
|
||||
last_status = None
|
||||
last_message_count = 0 # Track how many AI messages we've already sent
|
||||
# Polling timeout: execution timeout + 60s buffer, checked every 5s
|
||||
max_poll_count = (config.timeout_seconds + 60) // 5
|
||||
|
||||
logger.info(f"[trace={trace_id}] Started background task {task_id} (subagent={subagent_type}, timeout={config.timeout_seconds}s, polling_limit={max_poll_count} polls)")
|
||||
|
||||
writer = get_stream_writer()
|
||||
# Send Task Started message'
|
||||
@@ -176,9 +179,10 @@ def task_tool(
|
||||
poll_count += 1
|
||||
|
||||
# Polling timeout as a safety net (in case thread pool timeout doesn't work)
|
||||
# Set to 16 minutes (longer than the default 15-minute thread pool timeout)
|
||||
# Set to execution timeout + 60s buffer, in 5s poll intervals
|
||||
# This catches edge cases where the background task gets stuck
|
||||
if poll_count > 192: # 192 * 5s = 16 minutes
|
||||
if poll_count > max_poll_count:
|
||||
timeout_minutes = config.timeout_seconds // 60
|
||||
logger.error(f"[trace={trace_id}] Task {task_id} polling timed out after {poll_count} polls (should have been caught by thread pool timeout)")
|
||||
writer({"type": "task_timed_out", "task_id": task_id})
|
||||
return f"Task polling timed out after 16 minutes. This may indicate the background task is stuck. Status: {result.status.value}"
|
||||
return f"Task polling timed out after {timeout_minutes} minutes. This may indicate the background task is stuck. Status: {result.status.value}"
|
||||
|
||||
Reference in New Issue
Block a user