feat: Generate a fallback report upon recursion limit hit (#838)

* finish handle_recursion_limit_fallback

* fix

* rename test file

* fix

* doc

---------

Co-authored-by: lxl0413 <lixinling2021@gmail.com>
This commit is contained in:
Xun
2026-01-26 21:10:18 +08:00
committed by GitHub
parent 9a34e32252
commit ee02b9f637
7 changed files with 895 additions and 12 deletions

View File

@@ -63,6 +63,9 @@ class Configuration:
interrupt_before_tools: list[str] = field(
default_factory=list
) # List of tool names to interrupt before execution
enable_recursion_fallback: bool = (
True # Enable graceful fallback when recursion limit is reached
)
@classmethod
def from_runnable_config(

View File

@@ -7,10 +7,11 @@ import os
from functools import partial
from typing import Annotated, Any, Literal
from langchain_core.messages import AIMessage, HumanMessage, ToolMessage
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage
from langchain_core.runnables import RunnableConfig
from langchain_core.tools import tool
from langchain_mcp_adapters.client import MultiServerMCPClient
from langgraph.errors import GraphRecursionError
from langgraph.types import Command, interrupt
from src.agents import create_agent
@@ -19,7 +20,7 @@ from src.config.agents import AGENT_LLM_MAP
from src.config.configuration import Configuration
from src.llms.llm import get_llm_by_type, get_llm_token_limit_by_type
from src.prompts.planner_model import Plan
from src.prompts.template import apply_prompt_template
from src.prompts.template import apply_prompt_template, get_system_prompt_template
from src.tools import (
crawl_tool,
get_retriever_tool,
@@ -929,6 +930,79 @@ def validate_web_search_usage(messages: list, agent_name: str = "agent") -> bool
return web_search_used
async def _handle_recursion_limit_fallback(
    messages: list,
    agent_name: str,
    current_step,
    state: State,
) -> list:
    """Handle GraphRecursionError with graceful fallback using LLM summary.

    When the agent hits the recursion limit, this function generates a final
    output using only the observations already gathered, without calling any
    tools.

    Args:
        messages: Messages accumulated during agent execution before hitting
            the limit.
        agent_name: Name of the agent that hit the limit.
        current_step: The current step being executed; its ``execution_res``
            is updated in place with the fallback summary.
        state: Current workflow state (only ``locale`` is read here).

    Returns:
        list: The accumulated messages plus the fallback summary appended as
            a final AIMessage.

    Raises:
        Exception: If the fallback LLM call fails.
    """
    logger.warning(
        f"Recursion limit reached for {agent_name} agent. "
        f"Attempting graceful fallback with {len(messages)} accumulated messages."
    )
    if not messages:
        return messages

    # Drop any trailing system messages so the fallback prompts appended
    # below are the only system instructions at the end of the conversation.
    cleared_messages = messages.copy()
    while cleared_messages and cleared_messages[-1].type == "system":
        cleared_messages.pop()

    # Minimal state for prompt template rendering.
    locale = state.get("locale", "en-US")
    fallback_state = {"locale": locale}

    # Render the agent's own system prompt plus the recursion_fallback prompt,
    # which instructs the model to summarize without calling any tools.
    system_prompt = get_system_prompt_template(agent_name, fallback_state, None, locale)
    limit_prompt = get_system_prompt_template(
        "recursion_fallback", fallback_state, None, locale
    )
    fallback_messages = cleared_messages + [
        SystemMessage(content=system_prompt),
        SystemMessage(content=limit_prompt),
    ]

    # Get the LLM without tools (strip all tools from binding).
    fallback_llm = get_llm_by_type(AGENT_LLM_MAP[agent_name])

    # Use the async API so the (potentially slow) LLM call does not block the
    # event loop — this coroutine is awaited from async workflow code.
    fallback_response = await fallback_llm.ainvoke(fallback_messages)
    fallback_content = fallback_response.content

    logger.info(
        f"Graceful fallback succeeded for {agent_name} agent. "
        f"Generated summary of {len(fallback_content)} characters."
    )

    # Sanitize response to remove extra tokens and truncate if needed.
    fallback_content = sanitize_tool_response(str(fallback_content))

    # Record the fallback result on the current step.
    current_step.execution_res = fallback_content

    # Return the accumulated messages plus the fallback response.
    result_messages = list(cleared_messages)
    result_messages.append(AIMessage(content=fallback_content, name=agent_name))
    return result_messages
async def _execute_agent_step(
state: State, agent, agent_name: str, config: RunnableConfig = None
) -> Command[Literal["research_team"]]:
@@ -1049,11 +1123,51 @@ async def _execute_agent_step(
f"Context compression for {agent_name}: {len(compressed_state.get('messages', []))} messages, "
f"estimated tokens before: ~{token_count_before}, after: ~{token_count_after}"
)
try:
result = await agent.ainvoke(
input=agent_input, config={"recursion_limit": recursion_limit}
)
# Use stream from the start to capture messages in real-time
# This allows us to retrieve accumulated messages even if recursion limit is hit
accumulated_messages = []
for chunk in agent.stream(
input=agent_input,
config={"recursion_limit": recursion_limit},
stream_mode="values",
):
if isinstance(chunk, dict) and "messages" in chunk:
accumulated_messages = chunk["messages"]
# If we get here, execution completed successfully
result = {"messages": accumulated_messages}
except GraphRecursionError:
# Check if recursion fallback is enabled
configurable = Configuration.from_runnable_config(config) if config else Configuration()
if configurable.enable_recursion_fallback:
try:
# Call fallback with accumulated messages (function returns list of messages)
response_messages = await _handle_recursion_limit_fallback(
messages=accumulated_messages,
agent_name=agent_name,
current_step=current_step,
state=state,
)
# Create result dict so the code can continue normally from line 1178
result = {"messages": response_messages}
except Exception as fallback_error:
# If fallback fails, log and fall through to standard error handling
logger.error(
f"Recursion fallback failed for {agent_name} agent: {fallback_error}. "
"Falling back to standard error handling."
)
raise
else:
# Fallback disabled, let error propagate to standard handler
logger.info(
f"Recursion limit reached but graceful fallback is disabled. "
"Using standard error handling."
)
raise
except Exception as e:
import traceback
@@ -1088,8 +1202,10 @@ async def _execute_agent_step(
goto="research_team",
)
response_messages = result["messages"]
# Process the result
response_content = result["messages"][-1].content
response_content = response_messages[-1].content
# Sanitize response to remove extra tokens and truncate if needed
response_content = sanitize_tool_response(str(response_content))

View File

@@ -0,0 +1,16 @@
---
CURRENT_TIME: {{ CURRENT_TIME }}
locale: {{ locale }}
---
You have reached the maximum number of reasoning steps.
Using ONLY the tool observations already produced,
write the final research report in EXACTLY the same format
as you would normally output at the end of this task.
Do not call any tools.
Do not add new information.
If something is missing, state it explicitly.
Always output in the locale of **{{ locale }}**.

View File

@@ -4,7 +4,6 @@
import dataclasses
import os
from datetime import datetime
from jinja2 import Environment, FileSystemLoader, TemplateNotFound, select_autoescape
from langchain.agents import AgentState
@@ -61,6 +60,28 @@ def apply_prompt_template(
Returns:
List of messages with the system prompt as the first message
"""
try:
system_prompt = get_system_prompt_template(prompt_name, state, configurable, locale)
return [{"role": "system", "content": system_prompt}] + state["messages"]
except Exception as e:
raise ValueError(f"Error applying template {prompt_name} for locale {locale}: {e}")
def get_system_prompt_template(
prompt_name: str, state: AgentState, configurable: Configuration = None, locale: str = "en-US"
) -> str:
"""
Render and return the system prompt template with state and configuration variables.
This function loads a Jinja2-based prompt template (with optional locale-specific
variants), applies variables from the agent state and Configuration object, and
returns the fully rendered system prompt string.
Args:
prompt_name: Name of the prompt template to load (without .md extension).
state: Current agent state containing variables available to the template.
configurable: Optional Configuration object providing additional template variables.
locale: Language locale for template selection (e.g., en-US, zh-CN).
Returns:
The rendered system prompt string after applying all template variables.
"""
# Convert state to dict for template rendering
state_vars = {
"CURRENT_TIME": datetime.now().strftime("%a %b %d %Y %H:%M:%S %z"),
@@ -74,15 +95,15 @@ def apply_prompt_template(
try:
# Normalize locale format
normalized_locale = locale.replace("-", "_") if locale and locale.strip() else "en_US"
# Try locale-specific template first
try:
template = env.get_template(f"{prompt_name}.{normalized_locale}.md")
except TemplateNotFound:
# Fallback to English template
template = env.get_template(f"{prompt_name}.md")
system_prompt = template.render(**state_vars)
return [{"role": "system", "content": system_prompt}] + state["messages"]
return system_prompt
except Exception as e:
raise ValueError(f"Error applying template {prompt_name} for locale {locale}: {e}")
raise ValueError(f"Error loading template {prompt_name} for locale {locale}: {e}")