mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-16 03:14:45 +08:00
fix: parsed json with extra tokens issue (#656)
Fixes #598

* fix: parsed json with extra tokens issue
* Added unit test for json.ts
* fix the json unit test running issue
* Apply suggestions from code review
  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
* Update the code with code review suggestion

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Willem Jiang <143703838+willem-bd@users.noreply.github.com>
This commit is contained in:
@@ -27,7 +27,7 @@ from src.tools import (
|
||||
)
|
||||
from src.tools.search import LoggedTavilySearch
|
||||
from src.utils.context_manager import ContextManager, validate_message_content
|
||||
from src.utils.json_utils import repair_json_output
|
||||
from src.utils.json_utils import repair_json_output, sanitize_tool_response
|
||||
|
||||
from ..config import SELECTED_SEARCH_ENGINE, SearchEngine
|
||||
from .types import State
|
||||
@@ -834,6 +834,10 @@ async def _execute_agent_step(
|
||||
|
||||
# Process the result
|
||||
response_content = result["messages"][-1].content
|
||||
|
||||
# Sanitize response to remove extra tokens and truncate if needed
|
||||
response_content = sanitize_tool_response(str(response_content))
|
||||
|
||||
logger.debug(f"{agent_name.capitalize()} full response: {response_content}")
|
||||
|
||||
# Update the step with the execution result
|
||||
|
||||
@@ -266,7 +266,7 @@ class ContextManager:
|
||||
pass
|
||||
|
||||
|
||||
def validate_message_content(messages: List[BaseMessage]) -> List[BaseMessage]:
|
||||
def validate_message_content(messages: List[BaseMessage], max_content_length: int = 100000) -> List[BaseMessage]:
|
||||
"""
|
||||
Validate and fix all messages to ensure they have valid content before sending to LLM.
|
||||
|
||||
@@ -274,9 +274,11 @@ def validate_message_content(messages: List[BaseMessage]) -> List[BaseMessage]:
|
||||
1. All messages have a content field
|
||||
2. No message has None or empty string content (except for legitimate empty responses)
|
||||
3. Complex objects (lists, dicts) are converted to JSON strings
|
||||
4. Content is truncated if too long to prevent token overflow
|
||||
|
||||
Args:
|
||||
messages: List of messages to validate
|
||||
max_content_length: Maximum allowed content length per message (default 100000)
|
||||
|
||||
Returns:
|
||||
List of validated messages with fixed content
|
||||
@@ -304,6 +306,11 @@ def validate_message_content(messages: List[BaseMessage]) -> List[BaseMessage]:
|
||||
logger.debug(f"Message {i} ({type(msg).__name__}) has non-string content type {type(msg.content).__name__}, converting to string")
|
||||
msg.content = str(msg.content)
|
||||
|
||||
# Validate content length
|
||||
if isinstance(msg.content, str) and len(msg.content) > max_content_length:
|
||||
logger.warning(f"Message {i} content truncated from {len(msg.content)} to {max_content_length} chars")
|
||||
msg.content = msg.content[:max_content_length].rstrip() + "..."
|
||||
|
||||
validated.append(msg)
|
||||
except Exception as e:
|
||||
logger.error(f"Error validating message {i}: {e}")
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
import json_repair
|
||||
@@ -31,10 +32,84 @@ def sanitize_args(args: Any) -> str:
|
||||
)
|
||||
|
||||
|
||||
def _extract_json_from_content(content: str) -> str:
|
||||
"""
|
||||
Extract valid JSON from content that may have extra tokens.
|
||||
|
||||
Attempts to find the last valid JSON closing bracket and truncate there.
|
||||
Handles both objects {} and arrays [].
|
||||
|
||||
Args:
|
||||
content: String that may contain JSON with extra tokens
|
||||
|
||||
Returns:
|
||||
String with potential JSON extracted or original content
|
||||
"""
|
||||
content = content.strip()
|
||||
|
||||
# Try to find a complete JSON object or array
|
||||
# Look for the last closing brace/bracket that could be valid JSON
|
||||
|
||||
# Track counters and whether we've seen opening brackets
|
||||
brace_count = 0
|
||||
bracket_count = 0
|
||||
seen_opening_brace = False
|
||||
seen_opening_bracket = False
|
||||
in_string = False
|
||||
escape_next = False
|
||||
last_valid_end = -1
|
||||
|
||||
for i, char in enumerate(content):
|
||||
if escape_next:
|
||||
escape_next = False
|
||||
continue
|
||||
|
||||
if char == '\\':
|
||||
escape_next = True
|
||||
continue
|
||||
|
||||
if char == '"' and not escape_next:
|
||||
in_string = not in_string
|
||||
continue
|
||||
|
||||
if in_string:
|
||||
continue
|
||||
|
||||
if char == '{':
|
||||
brace_count += 1
|
||||
seen_opening_brace = True
|
||||
elif char == '}':
|
||||
brace_count -= 1
|
||||
# Only mark as valid end if we started with opening brace and reached balanced state
|
||||
if brace_count == 0 and seen_opening_brace:
|
||||
last_valid_end = i
|
||||
elif char == '[':
|
||||
bracket_count += 1
|
||||
seen_opening_bracket = True
|
||||
elif char == ']':
|
||||
bracket_count -= 1
|
||||
# Only mark as valid end if we started with opening bracket and reached balanced state
|
||||
if bracket_count == 0 and seen_opening_bracket:
|
||||
last_valid_end = i
|
||||
|
||||
if last_valid_end > 0:
|
||||
truncated = content[:last_valid_end + 1]
|
||||
if truncated != content:
|
||||
logger.debug(f"Truncated content from {len(content)} to {len(truncated)} chars")
|
||||
return truncated
|
||||
|
||||
return content
|
||||
|
||||
|
||||
def repair_json_output(content: str) -> str:
|
||||
"""
|
||||
Repair and normalize JSON output.
|
||||
|
||||
Handles:
|
||||
- JSON with extra tokens after closing brackets
|
||||
- Incomplete JSON structures
|
||||
- Malformed JSON from quantized models
|
||||
|
||||
Args:
|
||||
content (str): String content that may contain JSON
|
||||
|
||||
@@ -42,6 +117,12 @@ def repair_json_output(content: str) -> str:
|
||||
str: Repaired JSON string, or original content if not JSON
|
||||
"""
|
||||
content = content.strip()
|
||||
|
||||
if not content:
|
||||
return content
|
||||
|
||||
# First attempt: try to extract valid JSON if there are extra tokens
|
||||
content = _extract_json_from_content(content)
|
||||
|
||||
try:
|
||||
# Try to repair and parse JSON
|
||||
@@ -53,6 +134,49 @@ def repair_json_output(content: str) -> str:
|
||||
return content
|
||||
content = json.dumps(repaired_content, ensure_ascii=False)
|
||||
except Exception as e:
|
||||
logger.warning(f"JSON repair failed: {e}")
|
||||
logger.debug(f"JSON repair failed: {e}")
|
||||
|
||||
return content
|
||||
|
||||
|
||||
def sanitize_tool_response(content: str, max_length: int = 50000) -> str:
    """
    Clean up a tool response before it is passed on.

    Steps, in order:
    1. Empty input is returned untouched.
    2. Leading/trailing whitespace is stripped.
    3. If the text starts with ``{`` or ``[``, it is passed through
       ``_extract_json_from_content`` to drop tokens trailing the JSON.
    4. Text longer than ``max_length`` is cut to ``max_length`` characters,
       right-stripped, and suffixed with ``"..."``.
    5. Control characters (other than tab, newline and carriage return)
       are removed.

    Args:
        content: Tool response content
        max_length: Maximum allowed length before truncation (default 50000 chars)

    Returns:
        Sanitized content string
    """
    if not content:
        return content

    cleaned = content.strip()

    # JSON-looking payloads: trim whatever follows the JSON structure.
    if cleaned.startswith(('{', '[')):
        cleaned = _extract_json_from_content(cleaned)

    # Cap the size to prevent token overflow downstream.
    if len(cleaned) > max_length:
        logger.warning(f"Tool response truncated from {len(cleaned)} to {max_length} chars")
        shortened = cleaned[:max_length].rstrip()
        cleaned = shortened + "..."

    # Garbage seen from some (often quantized) models: raw control characters.
    control_characters = r'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\x9F]'
    return re.sub(control_characters, '', cleaned)
|
||||
|
||||
Reference in New Issue
Block a user