mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-26 15:24:48 +08:00
* fix: apply context compression to prevent token overflow (Issue #721)
  - Add token_limit configuration to conf.yaml.example for BASIC_MODEL and REASONING_MODEL
  - Implement context compression in _execute_agent_step() before agent invocation
  - Preserve first 3 messages (system prompt + context) during compression
  - Enhance ContextManager logging with better token count reporting
  - Prevent "400 Input tokens exceeded" errors by automatically compressing message history
* feat: add model-based token limit inference for Issue #721
  - Add smart default token limits based on common LLM models
  - Support model name inference when token_limit not explicitly configured
  - Models include: OpenAI (GPT-4o, GPT-4, etc.), Claude, Gemini, Doubao, DeepSeek, etc.
  - Conservative defaults prevent token overflow even without explicit configuration
  - Priority: explicit config > model inference > safe default (100,000 tokens)
  - Ensures Issue #721 protection for all users, not just those with token_limit set
This commit is contained in:
@@ -12,6 +12,7 @@ BASIC_MODEL:
|
|||||||
api_key: xxxx
|
api_key: xxxx
|
||||||
# max_retries: 3 # Maximum number of retries for LLM calls
|
# max_retries: 3 # Maximum number of retries for LLM calls
|
||||||
# verify_ssl: false # Uncomment this line to disable SSL certificate verification for self-signed certificates
|
# verify_ssl: false # Uncomment this line to disable SSL certificate verification for self-signed certificates
|
||||||
|
# token_limit: 200000 # Maximum input tokens for context compression (prevents token overflow errors)
|
||||||
|
|
||||||
# Local model configuration example:
|
# Local model configuration example:
|
||||||
|
|
||||||
@@ -39,6 +40,7 @@ BASIC_MODEL:
|
|||||||
# model: "doubao-1-5-thinking-pro-m-250428"
|
# model: "doubao-1-5-thinking-pro-m-250428"
|
||||||
# api_key: xxxx
|
# api_key: xxxx
|
||||||
# max_retries: 3 # Maximum number of retries for LLM calls
|
# max_retries: 3 # Maximum number of retries for LLM calls
|
||||||
|
# token_limit: 150000 # Maximum input tokens for context compression
|
||||||
|
|
||||||
|
|
||||||
# OTHER SETTINGS:
|
# OTHER SETTINGS:
|
||||||
|
|||||||
@@ -974,6 +974,24 @@ async def _execute_agent_step(
|
|||||||
except Exception as validation_error:
|
except Exception as validation_error:
|
||||||
logger.error(f"Error validating agent input messages: {validation_error}")
|
logger.error(f"Error validating agent input messages: {validation_error}")
|
||||||
|
|
||||||
|
# Apply context compression to prevent token overflow (Issue #721)
|
||||||
|
llm_token_limit = get_llm_token_limit_by_type(AGENT_LLM_MAP[agent_name])
|
||||||
|
if llm_token_limit:
|
||||||
|
token_count_before = sum(
|
||||||
|
len(str(msg.content).split()) for msg in agent_input.get("messages", []) if hasattr(msg, "content")
|
||||||
|
)
|
||||||
|
compressed_state = ContextManager(llm_token_limit, preserve_prefix_message_count=3).compress_messages(
|
||||||
|
{"messages": agent_input["messages"]}
|
||||||
|
)
|
||||||
|
agent_input["messages"] = compressed_state.get("messages", [])
|
||||||
|
token_count_after = sum(
|
||||||
|
len(str(msg.content).split()) for msg in agent_input.get("messages", []) if hasattr(msg, "content")
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
f"Context compression for {agent_name}: {len(compressed_state.get('messages', []))} messages, "
|
||||||
|
f"estimated tokens before: ~{token_count_before}, after: ~{token_count_after}"
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = await agent.ainvoke(
|
result = await agent.ainvoke(
|
||||||
input=agent_input, config={"recursion_limit": recursion_limit}
|
input=agent_input, config={"recursion_limit": recursion_limit}
|
||||||
|
|||||||
@@ -178,23 +178,101 @@ def get_configured_llm_models() -> dict[str, list[str]]:
|
|||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
|
||||||
|
def _get_model_token_limit_defaults() -> dict[str, int]:
|
||||||
|
"""
|
||||||
|
Get default token limits for common LLM models.
|
||||||
|
These are conservative limits to prevent token overflow errors (Issue #721).
|
||||||
|
Users can override by setting token_limit in their config.
|
||||||
|
"""
|
||||||
|
return {
|
||||||
|
# OpenAI models
|
||||||
|
"gpt-4o": 120000,
|
||||||
|
"gpt-4-turbo": 120000,
|
||||||
|
"gpt-4": 8000,
|
||||||
|
"gpt-3.5-turbo": 4000,
|
||||||
|
# Anthropic Claude
|
||||||
|
"claude-3": 180000,
|
||||||
|
"claude-2": 100000,
|
||||||
|
# Google Gemini
|
||||||
|
"gemini-2": 180000,
|
||||||
|
"gemini-1.5-pro": 180000,
|
||||||
|
"gemini-1.5-flash": 180000,
|
||||||
|
"gemini-pro": 30000,
|
||||||
|
# Bytedance Doubao
|
||||||
|
"doubao": 200000,
|
||||||
|
# DeepSeek
|
||||||
|
"deepseek": 100000,
|
||||||
|
# Ollama/local
|
||||||
|
"qwen": 30000,
|
||||||
|
"llama": 4000,
|
||||||
|
# Default fallback for unknown models
|
||||||
|
"default": 100000,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _infer_token_limit_from_model(model_name: str) -> int:
    """
    Infer a reasonable token limit from the model name.

    This helps protect against token overflow errors when token_limit is not
    explicitly configured. Matching is a case-insensitive substring search of
    the known model-name fragments against ``model_name``. The longest matching
    fragment wins, so the result does not depend on the order in which the
    defaults table happens to be written.

    Args:
        model_name: The model name from configuration

    Returns:
        A conservative token limit based on known model capabilities, or the
        table's ``"default"`` entry when the name is empty or unrecognised.
    """
    defaults = _get_model_token_limit_defaults()
    # Single source of truth for the fallback instead of a second magic number.
    fallback = defaults["default"]

    if not model_name:
        return fallback

    model_name_lower = model_name.lower()

    # "default" is the fallback sentinel, not a model-name fragment, so it must
    # not take part in substring matching.
    known_fragments = [
        (fragment, limit)
        for fragment, limit in defaults.items()
        if fragment != "default"
    ]

    # Longest fragment first: "gpt-4o" must be tried before "gpt-4". sorted()
    # is stable, so equally long fragments keep their table order.
    for fragment, limit in sorted(
        known_fragments, key=lambda item: len(item[0]), reverse=True
    ):
        if fragment in model_name_lower:
            return limit

    # No known fragment matched.
    return fallback
|
||||||
|
|
||||||
|
|
||||||
def get_llm_token_limit_by_type(llm_type: str) -> int:
    """
    Get the maximum token limit for a given LLM type.

    Priority order:
    1. Explicitly configured token_limit in conf.yaml
    2. Inferred from model name based on known model capabilities
    3. Safe default (the "default" entry of the built-in limits table)

    This helps prevent token overflow errors (Issue #721) even when token_limit
    is not configured.

    Args:
        llm_type (str): The type of LLM (e.g., 'basic', 'reasoning', 'vision', 'code').

    Returns:
        int: The maximum token limit for the specified LLM type (conservative
            estimate). An explicitly configured token_limit is returned
            unchanged, whatever its value.
    """
    llm_type_config_keys = _get_llm_type_config_keys()
    config_key = llm_type_config_keys.get(llm_type)

    conf = load_yaml_config(_get_config_file_path())
    model_config = conf.get(config_key, {})
    if not isinstance(model_config, dict):
        # A section that is present but empty (presumably loaded as None) or
        # not a mapping carries no settings; treat it like a missing section
        # instead of failing on the lookups below.
        model_config = {}

    # First priority: explicitly configured token_limit
    configured_limit = model_config.get("token_limit")
    if configured_limit is not None:
        return configured_limit

    # Second priority: infer from model name
    model_name = model_config.get("model")
    if model_name:
        return _infer_token_limit_from_model(model_name)

    # Fallback: safe default
    return _get_model_token_limit_defaults()["default"]
|
||||||
|
|
||||||
|
|
||||||
# In the future, we will use reasoning_llm and vl_llm for different purposes
|
# In the future, we will use reasoning_llm and vl_llm for different purposes
|
||||||
|
|||||||
@@ -166,13 +166,17 @@ class ContextManager:
|
|||||||
messages = state["messages"]
|
messages = state["messages"]
|
||||||
|
|
||||||
if not self.is_over_limit(messages):
|
if not self.is_over_limit(messages):
|
||||||
|
logger.debug(f"Messages within limit ({self.count_tokens(messages)} <= {self.token_limit} tokens)")
|
||||||
return state
|
return state
|
||||||
|
|
||||||
# 2. Compress messages
|
# Compress messages
|
||||||
|
original_token_count = self.count_tokens(messages)
|
||||||
compressed_messages = self._compress_messages(messages)
|
compressed_messages = self._compress_messages(messages)
|
||||||
|
compressed_token_count = self.count_tokens(compressed_messages)
|
||||||
|
|
||||||
logger.info(
|
logger.warning(
|
||||||
f"Message compression completed: {self.count_tokens(messages)} -> {self.count_tokens(compressed_messages)} tokens"
|
f"Message compression executed (Issue #721): {original_token_count} -> {compressed_token_count} tokens "
|
||||||
|
f"(limit: {self.token_limit}), {len(messages)} -> {len(compressed_messages)} messages"
|
||||||
)
|
)
|
||||||
|
|
||||||
state["messages"] = compressed_messages
|
state["messages"] = compressed_messages
|
||||||
|
|||||||
Reference in New Issue
Block a user