fix: parsed json with extra tokens issue (#656)

Fixes #598 

* fix: parsed json with extra tokens issue

* Added unit test for json.ts

* Fix the issue that prevented the JSON unit tests from running

* Apply suggestions from code review

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update the code per code review suggestions

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Willem Jiang <143703838+willem-bd@users.noreply.github.com>
This commit is contained in:
Willem Jiang
2025-10-26 07:24:25 +08:00
committed by GitHub
parent fd5a9aeae4
commit c7a82b82b4
7 changed files with 779 additions and 7 deletions

View File

@@ -27,7 +27,7 @@ from src.tools import (
)
from src.tools.search import LoggedTavilySearch
from src.utils.context_manager import ContextManager, validate_message_content
from src.utils.json_utils import repair_json_output
from src.utils.json_utils import repair_json_output, sanitize_tool_response
from ..config import SELECTED_SEARCH_ENGINE, SearchEngine
from .types import State
@@ -834,6 +834,10 @@ async def _execute_agent_step(
# Process the result
response_content = result["messages"][-1].content
# Sanitize response to remove extra tokens and truncate if needed
response_content = sanitize_tool_response(str(response_content))
logger.debug(f"{agent_name.capitalize()} full response: {response_content}")
# Update the step with the execution result

View File

@@ -266,7 +266,7 @@ class ContextManager:
pass
def validate_message_content(messages: List[BaseMessage]) -> List[BaseMessage]:
def validate_message_content(messages: List[BaseMessage], max_content_length: int = 100000) -> List[BaseMessage]:
"""
Validate and fix all messages to ensure they have valid content before sending to LLM.
@@ -274,9 +274,11 @@ def validate_message_content(messages: List[BaseMessage]) -> List[BaseMessage]:
1. All messages have a content field
2. No message has None or empty string content (except for legitimate empty responses)
3. Complex objects (lists, dicts) are converted to JSON strings
4. Content is truncated if too long to prevent token overflow
Args:
messages: List of messages to validate
max_content_length: Maximum allowed content length per message (default 100000)
Returns:
List of validated messages with fixed content
@@ -304,6 +306,11 @@ def validate_message_content(messages: List[BaseMessage]) -> List[BaseMessage]:
logger.debug(f"Message {i} ({type(msg).__name__}) has non-string content type {type(msg.content).__name__}, converting to string")
msg.content = str(msg.content)
# Validate content length
if isinstance(msg.content, str) and len(msg.content) > max_content_length:
logger.warning(f"Message {i} content truncated from {len(msg.content)} to {max_content_length} chars")
msg.content = msg.content[:max_content_length].rstrip() + "..."
validated.append(msg)
except Exception as e:
logger.error(f"Error validating message {i}: {e}")

View File

@@ -3,6 +3,7 @@
import json
import logging
import re
from typing import Any
import json_repair
@@ -31,10 +32,84 @@ def sanitize_args(args: Any) -> str:
)
def _extract_json_from_content(content: str) -> str:
"""
Extract valid JSON from content that may have extra tokens.
Attempts to find the last valid JSON closing bracket and truncate there.
Handles both objects {} and arrays [].
Args:
content: String that may contain JSON with extra tokens
Returns:
String with potential JSON extracted or original content
"""
content = content.strip()
# Try to find a complete JSON object or array
# Look for the last closing brace/bracket that could be valid JSON
# Track counters and whether we've seen opening brackets
brace_count = 0
bracket_count = 0
seen_opening_brace = False
seen_opening_bracket = False
in_string = False
escape_next = False
last_valid_end = -1
for i, char in enumerate(content):
if escape_next:
escape_next = False
continue
if char == '\\':
escape_next = True
continue
if char == '"' and not escape_next:
in_string = not in_string
continue
if in_string:
continue
if char == '{':
brace_count += 1
seen_opening_brace = True
elif char == '}':
brace_count -= 1
# Only mark as valid end if we started with opening brace and reached balanced state
if brace_count == 0 and seen_opening_brace:
last_valid_end = i
elif char == '[':
bracket_count += 1
seen_opening_bracket = True
elif char == ']':
bracket_count -= 1
# Only mark as valid end if we started with opening bracket and reached balanced state
if bracket_count == 0 and seen_opening_bracket:
last_valid_end = i
if last_valid_end > 0:
truncated = content[:last_valid_end + 1]
if truncated != content:
logger.debug(f"Truncated content from {len(content)} to {len(truncated)} chars")
return truncated
return content
def repair_json_output(content: str) -> str:
"""
Repair and normalize JSON output.
Handles:
- JSON with extra tokens after closing brackets
- Incomplete JSON structures
- Malformed JSON from quantized models
Args:
content (str): String content that may contain JSON
@@ -42,6 +117,12 @@ def repair_json_output(content: str) -> str:
str: Repaired JSON string, or original content if not JSON
"""
content = content.strip()
if not content:
return content
# First attempt: try to extract valid JSON if there are extra tokens
content = _extract_json_from_content(content)
try:
# Try to repair and parse JSON
@@ -53,6 +134,49 @@ def repair_json_output(content: str) -> str:
return content
content = json.dumps(repaired_content, ensure_ascii=False)
except Exception as e:
logger.warning(f"JSON repair failed: {e}")
logger.debug(f"JSON repair failed: {e}")
return content
def sanitize_tool_response(content: str, max_length: int = 50000) -> str:
    """
    Sanitize tool response to remove extra tokens and invalid content.

    This function:
    - Strips surrounding whitespace
    - Attempts JSON extraction for JSON-like responses to drop trailing tokens
    - Truncates excessively long responses to prevent token overflow
    - Removes control characters commonly emitted by quantized models

    Args:
        content: Tool response content.
        max_length: Maximum allowed length in characters (default 50000).

    Returns:
        Sanitized content string; empty/falsy input is returned unchanged.
    """
    if not content:
        return content

    content = content.strip()

    # JSON-like responses may carry extra tokens after the closing bracket;
    # trim them before the length check so the useful part is kept.
    # (Tuple form of startswith replaces the original chained calls.)
    if content.startswith(("{", "[")):
        content = _extract_json_from_content(content)

    # Truncate if too long to prevent token overflow downstream.
    if len(content) > max_length:
        logger.warning(f"Tool response truncated from {len(content)} to {max_length} chars")
        content = content[:max_length].rstrip() + "..."

    # Strip control characters (keeps \t, \n, \r) — output corruption often
    # seen from quantized models. Single substitution replaces the original
    # loop over a one-element pattern list.
    return re.sub(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\x9F]", "", content)