From b24f4d3f38d45403b7525537de3ba7112c0a8835 Mon Sep 17 00:00:00 2001
From: Willem Jiang
Date: Fri, 28 Nov 2025 18:52:42 +0800
Subject: [PATCH] fix: apply context compression to prevent token overflow (Issue #721) (#722)

* fix: apply context compression to prevent token overflow (Issue #721)

- Add token_limit configuration to conf.yaml.example for BASIC_MODEL and REASONING_MODEL
- Implement context compression in _execute_agent_step() before agent invocation
- Preserve first 3 messages (system prompt + context) during compression
- Enhance ContextManager logging with better token count reporting
- Prevent 400 Input tokens exceeded errors by automatically compressing message history

* feat: add model-based token limit inference for Issue #721

- Add smart default token limits based on common LLM models
- Support model name inference when token_limit not explicitly configured
- Models include: OpenAI (GPT-4o, GPT-4, etc.), Claude, Gemini, Doubao, DeepSeek, etc.
- Conservative defaults prevent token overflow even without explicit configuration
- Priority: explicit config > model inference > safe default (100,000 tokens)
- Ensures Issue #721 protection for all users, not just those with token_limit set
---
Reviewer notes (this section is not part of the commit message):

- Line structure and indentation were reconstructed from a whitespace-collapsed
  copy of this patch. If context lines do not match the target tree exactly,
  apply with `git am --ignore-whitespace` / `git apply --ignore-whitespace`.
- get_llm_token_limit_by_type: use `conf.get(config_key) or {}` so a section
  whose value is None (presumably what an empty YAML mapping loads as -- verify
  against load_yaml_config) no longer breaks the `in` test, and coerce an
  explicitly configured token_limit to int to honour the `-> int` annotation.
- _infer_token_limit_from_model: the "default" fallback entry is no longer
  treated as a model-name pattern, longer patterns are tried first so the
  result does not depend on dict ordering, and the empty-name fallback reads
  the table instead of repeating the literal 100000.
- All changes replace added lines one-for-one, so hunk sizes and the diffstat
  are unchanged. The src/llms/llm.py `index` line still names the original
  post-image blob.

 conf.yaml.example            |  2 +
 src/graph/nodes.py           | 18 ++++++++
 src/llms/llm.py              | 88 ++++++++++++++++++++++++++++++++++--
 src/utils/context_manager.py | 10 ++--
 4 files changed, 110 insertions(+), 8 deletions(-)

diff --git a/conf.yaml.example b/conf.yaml.example
index 23939cd..646a303 100644
--- a/conf.yaml.example
+++ b/conf.yaml.example
@@ -12,6 +12,7 @@ BASIC_MODEL:
   api_key: xxxx
   # max_retries: 3 # Maximum number of retries for LLM calls
   # verify_ssl: false # Uncomment this line to disable SSL certificate verification for self-signed certificates
+  # token_limit: 200000 # Maximum input tokens for context compression (prevents token overflow errors)
 
 # Local model configuration example:
 
@@ -39,6 +40,7 @@ BASIC_MODEL:
 #   model: "doubao-1-5-thinking-pro-m-250428"
 #   api_key: xxxx
 #   max_retries: 3 # Maximum number of retries for LLM calls
+#   token_limit: 150000 # Maximum input tokens for context compression
 
 
 # OTHER SETTINGS:
diff --git a/src/graph/nodes.py b/src/graph/nodes.py
index 9b47b6c..f02a3d3 100644
--- a/src/graph/nodes.py
+++ b/src/graph/nodes.py
@@ -974,6 +974,24 @@ async def _execute_agent_step(
     except Exception as validation_error:
         logger.error(f"Error validating agent input messages: {validation_error}")
 
+    # Apply context compression to prevent token overflow (Issue #721)
+    llm_token_limit = get_llm_token_limit_by_type(AGENT_LLM_MAP[agent_name])
+    if llm_token_limit:
+        token_count_before = sum(
+            len(str(msg.content).split()) for msg in agent_input.get("messages", []) if hasattr(msg, "content")
+        )
+        compressed_state = ContextManager(llm_token_limit, preserve_prefix_message_count=3).compress_messages(
+            {"messages": agent_input["messages"]}
+        )
+        agent_input["messages"] = compressed_state.get("messages", [])
+        token_count_after = sum(
+            len(str(msg.content).split()) for msg in agent_input.get("messages", []) if hasattr(msg, "content")
+        )
+        logger.info(
+            f"Context compression for {agent_name}: {len(compressed_state.get('messages', []))} messages, "
+            f"estimated tokens before: ~{token_count_before}, after: ~{token_count_after}"
+        )
+
     try:
         result = await agent.ainvoke(
             input=agent_input, config={"recursion_limit": recursion_limit}
diff --git a/src/llms/llm.py b/src/llms/llm.py
index 809bebe..1c7157b 100644
--- a/src/llms/llm.py
+++ b/src/llms/llm.py
@@ -178,23 +178,101 @@ def get_configured_llm_models() -> dict[str, list[str]]:
         return {}
 
 
+def _get_model_token_limit_defaults() -> dict[str, int]:
+    """
+    Get default token limits for common LLM models.
+    These are conservative limits to prevent token overflow errors (Issue #721).
+    Users can override by setting token_limit in their config.
+    """
+    return {
+        # OpenAI models
+        "gpt-4o": 120000,
+        "gpt-4-turbo": 120000,
+        "gpt-4": 8000,
+        "gpt-3.5-turbo": 4000,
+        # Anthropic Claude
+        "claude-3": 180000,
+        "claude-2": 100000,
+        # Google Gemini
+        "gemini-2": 180000,
+        "gemini-1.5-pro": 180000,
+        "gemini-1.5-flash": 180000,
+        "gemini-pro": 30000,
+        # Bytedance Doubao
+        "doubao": 200000,
+        # DeepSeek
+        "deepseek": 100000,
+        # Ollama/local
+        "qwen": 30000,
+        "llama": 4000,
+        # Default fallback for unknown models
+        "default": 100000,
+    }
+
+
+def _infer_token_limit_from_model(model_name: str) -> int:
+    """
+    Infer a reasonable token limit from the model name.
+    This helps protect against token overflow errors when token_limit is not explicitly configured.
+
+    Args:
+        model_name: The model name from configuration
+
+    Returns:
+        A conservative token limit based on known model capabilities
+    """
+    if not model_name:
+        return _get_model_token_limit_defaults()["default"]
+
+    model_name_lower = model_name.lower()
+    defaults = _get_model_token_limit_defaults()
+
+    # Substring match, longest pattern first; "default" is the fallback entry, not a pattern
+    for key, limit in sorted(defaults.items(), key=lambda kv: -len(kv[0])):
+        if key != "default" and key in model_name_lower:
+            return limit
+
+    # Return safe default if no match found
+    return defaults["default"]
+
+
 def get_llm_token_limit_by_type(llm_type: str) -> int:
     """
     Get the maximum token limit for a given LLM type.
+
+    Priority order:
+    1. Explicitly configured token_limit in conf.yaml
+    2. Inferred from model name based on known model capabilities
+    3. Safe default (100,000 tokens)
+
+    This helps prevent token overflow errors (Issue #721) even when token_limit is not configured.
 
     Args:
-        llm_type (str): The type of LLM.
+        llm_type (str): The type of LLM (e.g., 'basic', 'reasoning', 'vision', 'code').
 
     Returns:
-        int: The maximum token limit for the specified LLM type.
+        int: The maximum token limit for the specified LLM type (conservative estimate).
     """
-
     llm_type_config_keys = _get_llm_type_config_keys()
     config_key = llm_type_config_keys.get(llm_type)
 
     conf = load_yaml_config(_get_config_file_path())
-    llm_max_token = conf.get(config_key, {}).get("token_limit")
-    return llm_max_token
+    model_config = conf.get(config_key) or {}
+
+    # First priority: explicitly configured token_limit
+    if "token_limit" in model_config:
+        configured_limit = model_config["token_limit"]
+        if configured_limit is not None:
+            return int(configured_limit)
+
+    # Second priority: infer from model name
+    model_name = model_config.get("model")
+    if model_name:
+        inferred_limit = _infer_token_limit_from_model(model_name)
+        return inferred_limit
+
+    # Fallback: safe default
+    return _get_model_token_limit_defaults()["default"]
 
 
 # In the future, we will use reasoning_llm and vl_llm for different purposes
diff --git a/src/utils/context_manager.py b/src/utils/context_manager.py
index 123582e..72267d6 100644
--- a/src/utils/context_manager.py
+++ b/src/utils/context_manager.py
@@ -166,13 +166,17 @@ class ContextManager:
         messages = state["messages"]
 
         if not self.is_over_limit(messages):
+            logger.debug(f"Messages within limit ({self.count_tokens(messages)} <= {self.token_limit} tokens)")
             return state
 
-        # 2. Compress messages
+        # Compress messages
+        original_token_count = self.count_tokens(messages)
         compressed_messages = self._compress_messages(messages)
+        compressed_token_count = self.count_tokens(compressed_messages)
 
-        logger.info(
-            f"Message compression completed: {self.count_tokens(messages)} -> {self.count_tokens(compressed_messages)} tokens"
+        logger.warning(
+            f"Message compression executed (Issue #721): {original_token_count} -> {compressed_token_count} tokens "
+            f"(limit: {self.token_limit}), {len(messages)} -> {len(compressed_messages)} messages"
         )
 
         state["messages"] = compressed_messages