From b24f4d3f38d45403b7525537de3ba7112c0a8835 Mon Sep 17 00:00:00 2001
From: Willem Jiang <willem.jiang@gmail.com>
Date: Fri, 28 Nov 2025 18:52:42 +0800
Subject: [PATCH] fix: apply context compression to prevent token overflow
 (Issue #721) (#722)

* fix: apply context compression to prevent token overflow (Issue #721)

- Add token_limit configuration to conf.yaml.example for BASIC_MODEL and REASONING_MODEL
- Implement context compression in _execute_agent_step() before agent invocation
- Preserve first 3 messages (system prompt + context) during compression
- Enhance ContextManager logging with better token count reporting
- Prevent 400 "Input tokens exceeded" errors by automatically compressing message history

* feat: add model-based token limit inference for Issue #721

- Add smart default token limits based on common LLM models
- Support model name inference when token_limit not explicitly configured
- Models include: OpenAI (GPT-4o, GPT-4, etc.), Claude, Gemini, Doubao, DeepSeek, etc.
- Conservative defaults prevent token overflow even without explicit configuration
- Priority: explicit config > model inference > safe default (100,000 tokens)
- Ensures Issue #721 protection for all users, not just those with token_limit set
---
 conf.yaml.example            |  2 +
 src/graph/nodes.py           | 18 ++++++++
 src/llms/llm.py              | 88 ++++++++++++++++++++++++++++++++++--
 src/utils/context_manager.py | 10 ++--
 4 files changed, 110 insertions(+), 8 deletions(-)

diff --git a/conf.yaml.example b/conf.yaml.example
index 23939cd..646a303 100644
--- a/conf.yaml.example
+++ b/conf.yaml.example
@@ -12,6 +12,7 @@ BASIC_MODEL:
   api_key: xxxx
   # max_retries: 3 # Maximum number of retries for LLM calls
   # verify_ssl: false  # Uncomment this line to disable SSL certificate verification for self-signed certificates
+  # token_limit: 200000 # Maximum input tokens for context compression (prevents token overflow errors)
 
 # Local model configuration example:
 
@@ -39,6 +40,7 @@ BASIC_MODEL:
 #   model: "doubao-1-5-thinking-pro-m-250428"
 #   api_key: xxxx
 #   max_retries: 3 # Maximum number of retries for LLM calls
+#   token_limit: 150000 # Maximum input tokens for context compression
 
 
 # OTHER SETTINGS:
diff --git a/src/graph/nodes.py b/src/graph/nodes.py
index 9b47b6c..f02a3d3 100644
--- a/src/graph/nodes.py
+++ b/src/graph/nodes.py
@@ -974,6 +974,24 @@ async def _execute_agent_step(
     except Exception as validation_error:
         logger.error(f"Error validating agent input messages: {validation_error}")
     
+    # Apply context compression to prevent token overflow (Issue #721)
+    llm_token_limit = get_llm_token_limit_by_type(AGENT_LLM_MAP[agent_name])
+    if llm_token_limit:
+        token_count_before = sum(
+            len(str(msg.content).split()) for msg in agent_input.get("messages", []) if hasattr(msg, "content")
+        )
+        compressed_state = ContextManager(llm_token_limit, preserve_prefix_message_count=3).compress_messages(
+            {"messages": agent_input["messages"]}
+        )
+        agent_input["messages"] = compressed_state.get("messages", [])
+        token_count_after = sum(
+            len(str(msg.content).split()) for msg in agent_input.get("messages", []) if hasattr(msg, "content")
+        )
+        logger.info(
+            f"Context compression for {agent_name}: {len(compressed_state.get('messages', []))} messages, "
+            f"estimated tokens before: ~{token_count_before}, after: ~{token_count_after}"
+        )
+    
     try:
         result = await agent.ainvoke(
             input=agent_input, config={"recursion_limit": recursion_limit}
diff --git a/src/llms/llm.py b/src/llms/llm.py
index 809bebe..1c7157b 100644
--- a/src/llms/llm.py
+++ b/src/llms/llm.py
@@ -178,23 +178,101 @@ def get_configured_llm_models() -> dict[str, list[str]]:
         return {}
 
 
+def _get_model_token_limit_defaults() -> dict[str, int]:
+    """
+    Return conservative token limits keyed by model-name fragment (Issue #721).
+    Pair order is kept by dict(); a configured token_limit overrides these.
+    """
+    known_limits = (
+        # OpenAI models
+        ("gpt-4o", 120000),
+        ("gpt-4-turbo", 120000),
+        ("gpt-4", 8000),
+        ("gpt-3.5-turbo", 4000),
+        # Anthropic Claude
+        ("claude-3", 180000),
+        ("claude-2", 100000),
+        # Google Gemini
+        ("gemini-2", 180000),
+        ("gemini-1.5-pro", 180000),
+        ("gemini-1.5-flash", 180000),
+        ("gemini-pro", 30000),
+        # Bytedance Doubao
+        ("doubao", 200000),
+        # DeepSeek
+        ("deepseek", 100000),
+        # Ollama/local
+        ("qwen", 30000),
+        ("llama", 4000),
+        # Default fallback for unknown models
+        ("default", 100000),
+    )
+    return dict(known_limits)
+
+
+def _infer_token_limit_from_model(model_name: str) -> int:
+    """
+    Infer a conservative token limit from the model name.
+    Used when token_limit is not explicitly configured (Issue #721).
+
+    Args:
+        model_name: The model name from configuration; may be empty or None.
+
+    Returns:
+        The limit of the first known key (in table order) that occurs as a
+        substring of the lowercased name, else the table's "default" entry.
+    """
+    defaults = _get_model_token_limit_defaults()
+    fallback = defaults["default"]  # single source of truth for the safe default
+    if not model_name:
+        return fallback
+
+    model_name_lower = model_name.lower()
+    # Substring match, first hit wins; the "default" sentinel is not a model key.
+    for key, limit in defaults.items():
+        if key != "default" and key in model_name_lower:
+            return limit
+
+    return fallback
+
+
 def get_llm_token_limit_by_type(llm_type: str) -> int:
     """
     Get the maximum token limit for a given LLM type.
+
+    Priority order:
+    1. Explicitly configured token_limit in conf.yaml
+    2. Inferred from model name based on known model capabilities
+    3. Safe default (100,000 tokens)
+
+    This helps prevent token overflow errors (Issue #721) even when token_limit is not configured.
 
     Args:
-        llm_type (str): The type of LLM.
+        llm_type (str): The type of LLM (e.g., 'basic', 'reasoning', 'vision', 'code').
 
     Returns:
-        int: The maximum token limit for the specified LLM type.
+        int: The maximum token limit for the specified LLM type (conservative estimate).
     """
-
     llm_type_config_keys = _get_llm_type_config_keys()
     config_key = llm_type_config_keys.get(llm_type)
 
     conf = load_yaml_config(_get_config_file_path())
-    llm_max_token = conf.get(config_key, {}).get("token_limit")
-    return llm_max_token
+    # A section that is present but empty in YAML may load as None, and an
+    # unknown llm_type yields config_key=None; treat both as "no config".
+    model_config = conf.get(config_key) or {}
+
+    # First priority: explicitly configured token_limit (None means unset).
+    configured_limit = model_config.get("token_limit")
+    if configured_limit is not None:
+        return configured_limit
+
+    # Second priority: infer from the model name.
+    model_name = model_config.get("model")
+    if model_name:
+        return _infer_token_limit_from_model(model_name)
+
+    # Fallback: no model name configured, so use the table's safe default.
+    return _get_model_token_limit_defaults()["default"]
 
 
 # In the future, we will use reasoning_llm and vl_llm for different purposes
diff --git a/src/utils/context_manager.py b/src/utils/context_manager.py
index 123582e..72267d6 100644
--- a/src/utils/context_manager.py
+++ b/src/utils/context_manager.py
@@ -166,13 +166,17 @@ class ContextManager:
         messages = state["messages"]
 
         if not self.is_over_limit(messages):
+            logger.debug(f"Messages within limit ({self.count_tokens(messages)} <= {self.token_limit} tokens)")
             return state
 
-        # 2. Compress messages
+        # Compress messages
+        original_token_count = self.count_tokens(messages)
         compressed_messages = self._compress_messages(messages)
+        compressed_token_count = self.count_tokens(compressed_messages)
 
-        logger.info(
-            f"Message compression completed: {self.count_tokens(messages)} -> {self.count_tokens(compressed_messages)} tokens"
+        logger.warning(
+            f"Message compression executed (Issue #721): {original_token_count} -> {compressed_token_count} tokens "
+            f"(limit: {self.token_limit}), {len(messages)} -> {len(compressed_messages)} messages"
         )
 
         state["messages"] = compressed_messages