fix: parsed json with extra tokens issue (#656)

Fixes #598 

* fix: parsed json with extra tokens issue

* Added unit test for json.ts

* Fix the issue that prevented the JSON unit tests from running

* Apply suggestions from code review

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update the code per code review suggestions

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Willem Jiang <143703838+willem-bd@users.noreply.github.com>
This commit is contained in:
Willem Jiang
2025-10-26 07:24:25 +08:00
committed by GitHub
parent fd5a9aeae4
commit c7a82b82b4
7 changed files with 779 additions and 7 deletions

View File

@@ -27,7 +27,7 @@ from src.tools import (
)
from src.tools.search import LoggedTavilySearch
from src.utils.context_manager import ContextManager, validate_message_content
from src.utils.json_utils import repair_json_output
from src.utils.json_utils import repair_json_output, sanitize_tool_response
from ..config import SELECTED_SEARCH_ENGINE, SearchEngine
from .types import State
@@ -834,6 +834,10 @@ async def _execute_agent_step(
# Process the result
response_content = result["messages"][-1].content
# Sanitize response to remove extra tokens and truncate if needed
response_content = sanitize_tool_response(str(response_content))
logger.debug(f"{agent_name.capitalize()} full response: {response_content}")
# Update the step with the execution result

View File

@@ -266,7 +266,7 @@ class ContextManager:
pass
def validate_message_content(messages: List[BaseMessage]) -> List[BaseMessage]:
def validate_message_content(messages: List[BaseMessage], max_content_length: int = 100000) -> List[BaseMessage]:
"""
Validate and fix all messages to ensure they have valid content before sending to LLM.
@@ -274,9 +274,11 @@ def validate_message_content(messages: List[BaseMessage]) -> List[BaseMessage]:
1. All messages have a content field
2. No message has None or empty string content (except for legitimate empty responses)
3. Complex objects (lists, dicts) are converted to JSON strings
4. Content is truncated if too long to prevent token overflow
Args:
messages: List of messages to validate
max_content_length: Maximum allowed content length per message (default 100000)
Returns:
List of validated messages with fixed content
@@ -304,6 +306,11 @@ def validate_message_content(messages: List[BaseMessage]) -> List[BaseMessage]:
logger.debug(f"Message {i} ({type(msg).__name__}) has non-string content type {type(msg.content).__name__}, converting to string")
msg.content = str(msg.content)
# Validate content length
if isinstance(msg.content, str) and len(msg.content) > max_content_length:
logger.warning(f"Message {i} content truncated from {len(msg.content)} to {max_content_length} chars")
msg.content = msg.content[:max_content_length].rstrip() + "..."
validated.append(msg)
except Exception as e:
logger.error(f"Error validating message {i}: {e}")

View File

@@ -3,6 +3,7 @@
import json
import logging
import re
from typing import Any
import json_repair
@@ -31,10 +32,84 @@ def sanitize_args(args: Any) -> str:
)
def _extract_json_from_content(content: str) -> str:
"""
Extract valid JSON from content that may have extra tokens.
Attempts to find the last valid JSON closing bracket and truncate there.
Handles both objects {} and arrays [].
Args:
content: String that may contain JSON with extra tokens
Returns:
String with potential JSON extracted or original content
"""
content = content.strip()
# Try to find a complete JSON object or array
# Look for the last closing brace/bracket that could be valid JSON
# Track counters and whether we've seen opening brackets
brace_count = 0
bracket_count = 0
seen_opening_brace = False
seen_opening_bracket = False
in_string = False
escape_next = False
last_valid_end = -1
for i, char in enumerate(content):
if escape_next:
escape_next = False
continue
if char == '\\':
escape_next = True
continue
if char == '"' and not escape_next:
in_string = not in_string
continue
if in_string:
continue
if char == '{':
brace_count += 1
seen_opening_brace = True
elif char == '}':
brace_count -= 1
# Only mark as valid end if we started with opening brace and reached balanced state
if brace_count == 0 and seen_opening_brace:
last_valid_end = i
elif char == '[':
bracket_count += 1
seen_opening_bracket = True
elif char == ']':
bracket_count -= 1
# Only mark as valid end if we started with opening bracket and reached balanced state
if bracket_count == 0 and seen_opening_bracket:
last_valid_end = i
if last_valid_end > 0:
truncated = content[:last_valid_end + 1]
if truncated != content:
logger.debug(f"Truncated content from {len(content)} to {len(truncated)} chars")
return truncated
return content
def repair_json_output(content: str) -> str:
"""
Repair and normalize JSON output.
Handles:
- JSON with extra tokens after closing brackets
- Incomplete JSON structures
- Malformed JSON from quantized models
Args:
content (str): String content that may contain JSON
@@ -42,6 +117,12 @@ def repair_json_output(content: str) -> str:
str: Repaired JSON string, or original content if not JSON
"""
content = content.strip()
if not content:
return content
# First attempt: try to extract valid JSON if there are extra tokens
content = _extract_json_from_content(content)
try:
# Try to repair and parse JSON
@@ -53,6 +134,49 @@ def repair_json_output(content: str) -> str:
return content
content = json.dumps(repaired_content, ensure_ascii=False)
except Exception as e:
logger.warning(f"JSON repair failed: {e}")
logger.debug(f"JSON repair failed: {e}")
return content
def sanitize_tool_response(content: str, max_length: int = 50000) -> str:
    """
    Sanitize tool response to remove extra tokens and invalid content.

    This function:
    - Strips surrounding whitespace
    - Attempts JSON extraction for JSON-like responses to drop trailing tokens
    - Truncates excessively long responses to prevent token overflow
    - Removes control characters commonly emitted by quantized models

    Args:
        content: Tool response content.
        max_length: Maximum allowed length in characters (default 50000).

    Returns:
        Sanitized content string; empty/falsy input is returned unchanged.
    """
    if not content:
        return content

    content = content.strip()

    # JSON-like responses may carry extra tokens after the closing bracket;
    # trim them before the length check so the useful part is kept.
    # (Tuple form of startswith replaces the original chained calls.)
    if content.startswith(("{", "[")):
        content = _extract_json_from_content(content)

    # Truncate if too long to prevent token overflow downstream.
    if len(content) > max_length:
        logger.warning(f"Tool response truncated from {len(content)} to {max_length} chars")
        content = content[:max_length].rstrip() + "..."

    # Strip control characters (keeps \t, \n, \r) — output corruption often
    # seen from quantized models. Single substitution replaces the original
    # loop over a one-element pattern list.
    return re.sub(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\x9F]", "", content)