mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-03 06:12:14 +08:00
fix: parsed json with extra tokens issue (#656)
Fixes #598 * fix: parsed json with extra tokens issue * Added unit test for json.ts * fix the json unit test running issue * Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update the code with code review suggestion --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Willem Jiang <143703838+willem-bd@users.noreply.github.com>
This commit is contained in:
@@ -27,7 +27,7 @@ from src.tools import (
|
||||
)
|
||||
from src.tools.search import LoggedTavilySearch
|
||||
from src.utils.context_manager import ContextManager, validate_message_content
|
||||
from src.utils.json_utils import repair_json_output
|
||||
from src.utils.json_utils import repair_json_output, sanitize_tool_response
|
||||
|
||||
from ..config import SELECTED_SEARCH_ENGINE, SearchEngine
|
||||
from .types import State
|
||||
@@ -834,6 +834,10 @@ async def _execute_agent_step(
|
||||
|
||||
# Process the result
|
||||
response_content = result["messages"][-1].content
|
||||
|
||||
# Sanitize response to remove extra tokens and truncate if needed
|
||||
response_content = sanitize_tool_response(str(response_content))
|
||||
|
||||
logger.debug(f"{agent_name.capitalize()} full response: {response_content}")
|
||||
|
||||
# Update the step with the execution result
|
||||
|
||||
@@ -266,7 +266,7 @@ class ContextManager:
|
||||
pass
|
||||
|
||||
|
||||
def validate_message_content(messages: List[BaseMessage]) -> List[BaseMessage]:
|
||||
def validate_message_content(messages: List[BaseMessage], max_content_length: int = 100000) -> List[BaseMessage]:
|
||||
"""
|
||||
Validate and fix all messages to ensure they have valid content before sending to LLM.
|
||||
|
||||
@@ -274,9 +274,11 @@ def validate_message_content(messages: List[BaseMessage]) -> List[BaseMessage]:
|
||||
1. All messages have a content field
|
||||
2. No message has None or empty string content (except for legitimate empty responses)
|
||||
3. Complex objects (lists, dicts) are converted to JSON strings
|
||||
4. Content is truncated if too long to prevent token overflow
|
||||
|
||||
Args:
|
||||
messages: List of messages to validate
|
||||
max_content_length: Maximum allowed content length per message (default 100000)
|
||||
|
||||
Returns:
|
||||
List of validated messages with fixed content
|
||||
@@ -304,6 +306,11 @@ def validate_message_content(messages: List[BaseMessage]) -> List[BaseMessage]:
|
||||
logger.debug(f"Message {i} ({type(msg).__name__}) has non-string content type {type(msg.content).__name__}, converting to string")
|
||||
msg.content = str(msg.content)
|
||||
|
||||
# Validate content length
|
||||
if isinstance(msg.content, str) and len(msg.content) > max_content_length:
|
||||
logger.warning(f"Message {i} content truncated from {len(msg.content)} to {max_content_length} chars")
|
||||
msg.content = msg.content[:max_content_length].rstrip() + "..."
|
||||
|
||||
validated.append(msg)
|
||||
except Exception as e:
|
||||
logger.error(f"Error validating message {i}: {e}")
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
import json_repair
|
||||
@@ -31,10 +32,84 @@ def sanitize_args(args: Any) -> str:
|
||||
)
|
||||
|
||||
|
||||
def _extract_json_from_content(content: str) -> str:
|
||||
"""
|
||||
Extract valid JSON from content that may have extra tokens.
|
||||
|
||||
Attempts to find the last valid JSON closing bracket and truncate there.
|
||||
Handles both objects {} and arrays [].
|
||||
|
||||
Args:
|
||||
content: String that may contain JSON with extra tokens
|
||||
|
||||
Returns:
|
||||
String with potential JSON extracted or original content
|
||||
"""
|
||||
content = content.strip()
|
||||
|
||||
# Try to find a complete JSON object or array
|
||||
# Look for the last closing brace/bracket that could be valid JSON
|
||||
|
||||
# Track counters and whether we've seen opening brackets
|
||||
brace_count = 0
|
||||
bracket_count = 0
|
||||
seen_opening_brace = False
|
||||
seen_opening_bracket = False
|
||||
in_string = False
|
||||
escape_next = False
|
||||
last_valid_end = -1
|
||||
|
||||
for i, char in enumerate(content):
|
||||
if escape_next:
|
||||
escape_next = False
|
||||
continue
|
||||
|
||||
if char == '\\':
|
||||
escape_next = True
|
||||
continue
|
||||
|
||||
if char == '"' and not escape_next:
|
||||
in_string = not in_string
|
||||
continue
|
||||
|
||||
if in_string:
|
||||
continue
|
||||
|
||||
if char == '{':
|
||||
brace_count += 1
|
||||
seen_opening_brace = True
|
||||
elif char == '}':
|
||||
brace_count -= 1
|
||||
# Only mark as valid end if we started with opening brace and reached balanced state
|
||||
if brace_count == 0 and seen_opening_brace:
|
||||
last_valid_end = i
|
||||
elif char == '[':
|
||||
bracket_count += 1
|
||||
seen_opening_bracket = True
|
||||
elif char == ']':
|
||||
bracket_count -= 1
|
||||
# Only mark as valid end if we started with opening bracket and reached balanced state
|
||||
if bracket_count == 0 and seen_opening_bracket:
|
||||
last_valid_end = i
|
||||
|
||||
if last_valid_end > 0:
|
||||
truncated = content[:last_valid_end + 1]
|
||||
if truncated != content:
|
||||
logger.debug(f"Truncated content from {len(content)} to {len(truncated)} chars")
|
||||
return truncated
|
||||
|
||||
return content
|
||||
|
||||
|
||||
def repair_json_output(content: str) -> str:
|
||||
"""
|
||||
Repair and normalize JSON output.
|
||||
|
||||
Handles:
|
||||
- JSON with extra tokens after closing brackets
|
||||
- Incomplete JSON structures
|
||||
- Malformed JSON from quantized models
|
||||
|
||||
Args:
|
||||
content (str): String content that may contain JSON
|
||||
|
||||
@@ -42,6 +117,12 @@ def repair_json_output(content: str) -> str:
|
||||
str: Repaired JSON string, or original content if not JSON
|
||||
"""
|
||||
content = content.strip()
|
||||
|
||||
if not content:
|
||||
return content
|
||||
|
||||
# First attempt: try to extract valid JSON if there are extra tokens
|
||||
content = _extract_json_from_content(content)
|
||||
|
||||
try:
|
||||
# Try to repair and parse JSON
|
||||
@@ -53,6 +134,49 @@ def repair_json_output(content: str) -> str:
|
||||
return content
|
||||
content = json.dumps(repaired_content, ensure_ascii=False)
|
||||
except Exception as e:
|
||||
logger.warning(f"JSON repair failed: {e}")
|
||||
logger.debug(f"JSON repair failed: {e}")
|
||||
|
||||
return content
|
||||
|
||||
|
||||
# Control characters (excluding \t, \n, \r) sometimes emitted as output
# corruption by quantized models; compiled once instead of per call.
_CONTROL_CHARS_RE = re.compile(r'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\x9F]')


def sanitize_tool_response(content: str, max_length: int = 50000) -> str:
    """
    Sanitize tool response to remove extra tokens and invalid content.

    This function:
    - Strips leading/trailing whitespace
    - Attempts JSON extraction for JSON-like responses to drop trailing tokens
    - Truncates excessively long responses
    - Removes control characters seen from some quantized models

    Args:
        content: Tool response content
        max_length: Maximum allowed length (default 50000 chars)

    Returns:
        Sanitized content string
    """
    if not content:
        return content

    content = content.strip()

    # For JSON-like responses, drop trailing garbage after the last
    # balanced closing bracket.
    if content.startswith(('{', '[')):
        content = _extract_json_from_content(content)

    # Truncate if too long to prevent token overflow downstream.
    if len(content) > max_length:
        logger.warning(f"Tool response truncated from {len(content)} to {max_length} chars")
        content = content[:max_length].rstrip() + "..."

    # Strip control characters (output corruption from some models).
    return _CONTROL_CHARS_RE.sub('', content)
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
import json
|
||||
|
||||
from src.utils.json_utils import repair_json_output
|
||||
from src.utils.json_utils import repair_json_output, sanitize_tool_response, _extract_json_from_content
|
||||
|
||||
|
||||
class TestRepairJsonOutput:
|
||||
@@ -106,3 +106,119 @@ class TestRepairJsonOutput:
|
||||
# Should attempt to process as JSON since it contains ```json
|
||||
assert isinstance(result, str)
|
||||
assert result == '{"key": "value"}'
|
||||
|
||||
|
||||
class TestExtractJsonFromContent:
    """Unit tests for _extract_json_from_content.

    Covers truncation of trailing garbage after balanced JSON objects and
    arrays, correct handling of braces/quotes inside string literals, and
    the guard against stray closing brackets with no matching opener.
    """

    def test_json_with_extra_tokens_after_closing_brace(self):
        """Test extracting JSON with extra tokens after closing brace"""
        content = '{"key": "value"} extra tokens here'
        result = _extract_json_from_content(content)
        assert result == '{"key": "value"}'

    def test_json_with_extra_tokens_after_closing_bracket(self):
        """Test extracting JSON array with extra tokens"""
        content = '[1, 2, 3] garbage data'
        result = _extract_json_from_content(content)
        assert result == '[1, 2, 3]'

    def test_nested_json_with_extra_tokens(self):
        """Test nested JSON with extra tokens"""
        content = '{"nested": {"inner": [1, 2, 3]}} invalid text'
        result = _extract_json_from_content(content)
        assert result == '{"nested": {"inner": [1, 2, 3]}}'

    def test_json_with_string_containing_braces(self):
        """Test JSON with strings containing braces"""
        # Braces inside a string literal must not affect nesting depth.
        content = '{"text": "this has {braces} in it"} extra'
        result = _extract_json_from_content(content)
        assert result == '{"text": "this has {braces} in it"}'

    def test_json_with_escaped_quotes(self):
        """Test JSON with escaped quotes in strings"""
        # Escaped quotes must not toggle the in-string state.
        content = '{"text": "quote \\"here\\""} junk'
        result = _extract_json_from_content(content)
        assert result == '{"text": "quote \\"here\\""}'

    def test_clean_json_no_extra_tokens(self):
        """Test clean JSON without extra tokens"""
        content = '{"key": "value"}'
        result = _extract_json_from_content(content)
        assert result == '{"key": "value"}'

    def test_empty_object(self):
        """Test empty object"""
        content = '{} extra'
        result = _extract_json_from_content(content)
        assert result == '{}'

    def test_empty_array(self):
        """Test empty array"""
        content = '[] more stuff'
        result = _extract_json_from_content(content)
        assert result == '[]'

    def test_extra_closing_brace_no_opening(self):
        """Test that extra closing brace without opening is not marked as valid end"""
        content = '} garbage data'
        result = _extract_json_from_content(content)
        # Should return original content since no opening brace was seen
        assert result == content

    def test_extra_closing_bracket_no_opening(self):
        """Test that extra closing bracket without opening is not marked as valid end"""
        content = '] garbage data'
        result = _extract_json_from_content(content)
        # Should return original content since no opening bracket was seen
        assert result == content
|
||||
|
||||
class TestSanitizeToolResponse:
    """Unit tests for sanitize_tool_response.

    Covers pass-through of normal text, trailing-token removal for
    JSON-like responses, length truncation (default and custom limits),
    control-character stripping, and whitespace/empty-input handling.
    """

    def test_basic_sanitization(self):
        """Test basic tool response sanitization"""
        content = "normal response"
        result = sanitize_tool_response(content)
        assert result == "normal response"

    def test_json_with_extra_tokens(self):
        """Test sanitizing JSON with extra tokens"""
        content = '{"data": "value"} some garbage'
        result = sanitize_tool_response(content)
        assert result == '{"data": "value"}'

    def test_very_long_response_truncation(self):
        """Test truncation of very long responses"""
        long_content = "a" * 60000  # Exceeds default max of 50000
        result = sanitize_tool_response(long_content)
        assert len(result) <= 50003  # 50000 + "..."
        assert result.endswith("...")

    def test_custom_max_length(self):
        """Test custom maximum length"""
        long_content = "a" * 1000
        result = sanitize_tool_response(long_content, max_length=100)
        assert len(result) <= 103  # 100 + "..."
        assert result.endswith("...")

    def test_control_character_removal(self):
        """Test removal of control characters"""
        content = "text with \x00 null \x01 chars"
        result = sanitize_tool_response(content)
        assert "\x00" not in result
        assert "\x01" not in result

    def test_none_content(self):
        """Test handling of empty-string content (falsy input passthrough)"""
        result = sanitize_tool_response("")
        assert result == ""

    def test_whitespace_handling(self):
        """Test whitespace handling"""
        content = "  text with spaces  "
        result = sanitize_tool_response(content)
        assert result == "text with spaces"

    def test_json_array_with_extra_tokens(self):
        """Test JSON array with extra tokens"""
        content = '[{"id": 1}, {"id": 2}] invalid stuff'
        result = sanitize_tool_response(content)
        assert result == '[{"id": 1}, {"id": 2}]'
|
||||
|
||||
@@ -147,11 +147,18 @@ function WebSearchToolCall({ toolCall }: { toolCall: ToolCallRuntime }) {
|
||||
}, [toolCall.result]);
|
||||
const searchResults = useMemo<SearchResult[]>(() => {
|
||||
let results: SearchResult[] | undefined = undefined;
|
||||
let parseError = false;
|
||||
|
||||
try {
|
||||
results = toolCall.result ? parseJSON(toolCall.result, []) : undefined;
|
||||
} catch {
|
||||
if (toolCall.result) {
|
||||
results = parseJSON(toolCall.result, []);
|
||||
}
|
||||
} catch (error) {
|
||||
parseError = true;
|
||||
console.warn("Failed to parse search results:", error);
|
||||
results = undefined;
|
||||
}
|
||||
|
||||
if (Array.isArray(results)) {
|
||||
results.forEach((result) => {
|
||||
if (result.type === "page") {
|
||||
@@ -159,8 +166,10 @@ function WebSearchToolCall({ toolCall }: { toolCall: ToolCallRuntime }) {
|
||||
}
|
||||
});
|
||||
} else {
|
||||
// If parsing failed, still try to show something useful
|
||||
results = [];
|
||||
}
|
||||
|
||||
return results;
|
||||
}, [toolCall.result]);
|
||||
const pageResults = useMemo(
|
||||
|
||||
@@ -1,11 +1,72 @@
|
||||
import { parse } from "best-effort-json-parser";
|
||||
|
||||
/**
|
||||
* Extract valid JSON from content that may have extra tokens.
|
||||
* Finds the last closing brace/bracket that could be valid JSON.
|
||||
*/
|
||||
function extractValidJSON(content: string): string {
|
||||
let braceCount = 0;
|
||||
let bracketCount = 0;
|
||||
let inString = false;
|
||||
let escapeNext = false;
|
||||
let lastValidEnd = -1;
|
||||
|
||||
for (let i = 0; i < content.length; i++) {
|
||||
const char = content[i];
|
||||
|
||||
if (escapeNext) {
|
||||
escapeNext = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (char === "\\") {
|
||||
escapeNext = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (char === '"') {
|
||||
inString = !inString;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (inString) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (char === "{") {
|
||||
braceCount++;
|
||||
} else if (char === "}") {
|
||||
if (braceCount > 0) {
|
||||
braceCount--;
|
||||
if (braceCount === 0) {
|
||||
lastValidEnd = i;
|
||||
}
|
||||
}
|
||||
} else if (char === "[") {
|
||||
bracketCount++;
|
||||
} else if (char === "]") {
|
||||
if (bracketCount > 0) {
|
||||
bracketCount--;
|
||||
if (bracketCount === 0) {
|
||||
lastValidEnd = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (lastValidEnd > 0) {
|
||||
return content.substring(0, lastValidEnd + 1);
|
||||
}
|
||||
|
||||
return content;
|
||||
}
|
||||
|
||||
export function parseJSON<T>(json: string | null | undefined, fallback: T) {
|
||||
if (!json) {
|
||||
return fallback;
|
||||
}
|
||||
try {
|
||||
const raw = json
|
||||
let raw = json
|
||||
.trim()
|
||||
.replace(/^```json\s*/, "")
|
||||
.replace(/^```js\s*/, "")
|
||||
@@ -13,8 +74,17 @@ export function parseJSON<T>(json: string | null | undefined, fallback: T) {
|
||||
.replace(/^```plaintext\s*/, "")
|
||||
.replace(/^```\s*/, "")
|
||||
.replace(/\s*```$/, "");
|
||||
|
||||
// First attempt: try to extract valid JSON to remove extra tokens
|
||||
if (raw.startsWith("{") || raw.startsWith("[")) {
|
||||
raw = extractValidJSON(raw);
|
||||
}
|
||||
|
||||
// Parse the cleaned content
|
||||
return parse(raw) as T;
|
||||
} catch {
|
||||
// Fallback: try to extract meaningful content from malformed JSON
|
||||
// This is a last-resort attempt to salvage partial data
|
||||
return fallback;
|
||||
}
|
||||
}
|
||||
|
||||
442
web/tests/json.test.ts
Normal file
442
web/tests/json.test.ts
Normal file
@@ -0,0 +1,442 @@
|
||||
import { parseJSON } from "../src/core/utils/json";
|
||||
|
||||
describe("parseJSON - extractValidJSON helper", () => {
|
||||
it("extracts JSON object with extra tokens after closing brace", () => {
|
||||
const input = '{"key": "value"} extra tokens here';
|
||||
const result = parseJSON(input, null);
|
||||
expect(result.key).toBe("value");
|
||||
});
|
||||
|
||||
it("extracts JSON array with extra tokens after closing bracket", () => {
|
||||
const input = '[1, 2, 3] garbage data here';
|
||||
const result = parseJSON(input, []);
|
||||
expect(result).toEqual([1, 2, 3]);
|
||||
});
|
||||
|
||||
it("handles nested JSON with extra tokens", () => {
|
||||
const input = '{"nested": {"inner": [1, 2, 3]}} invalid text';
|
||||
const result = parseJSON(input, null);
|
||||
expect(result).toEqual({
|
||||
nested: {
|
||||
inner: [1, 2, 3],
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it("handles JSON with strings containing braces", () => {
|
||||
const input = '{"text": "this has {braces} in it"} extra';
|
||||
const result = parseJSON(input, null);
|
||||
expect(result.text).toBe("this has {braces} in it");
|
||||
});
|
||||
|
||||
it("handles JSON with escaped quotes in strings", () => {
|
||||
const input = '{"text": "quote \\"here\\""} junk';
|
||||
const result = parseJSON(input, null);
|
||||
expect(result.text).toBe('quote "here"');
|
||||
});
|
||||
|
||||
it("handles clean JSON without extra tokens", () => {
|
||||
const input = '{"key": "value"}';
|
||||
const result = parseJSON(input, null);
|
||||
expect(result.key).toBe("value");
|
||||
});
|
||||
|
||||
it("handles empty object", () => {
|
||||
const input = '{} extra';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result).toEqual({});
|
||||
});
|
||||
|
||||
it("handles empty array", () => {
|
||||
const input = '[] more stuff';
|
||||
const result = parseJSON(input, []);
|
||||
expect(result).toEqual([]);
|
||||
});
|
||||
|
||||
it("handles JSON with null values", () => {
|
||||
const input = '{"value": null} trash';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.value).toBeNull();
|
||||
});
|
||||
|
||||
it("handles JSON with boolean values", () => {
|
||||
const input = '{"active": true, "deleted": false} garbage';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.active).toBe(true);
|
||||
expect(result.deleted).toBe(false);
|
||||
});
|
||||
|
||||
it("handles JSON with numbers", () => {
|
||||
const input = '{"int": 42, "float": 3.14, "negative": -7} data';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.int).toBe(42);
|
||||
expect(result.float).toBe(3.14);
|
||||
expect(result.negative).toBe(-7);
|
||||
});
|
||||
|
||||
it("handles JSON with unicode characters", () => {
|
||||
const input = '{"name": "测试", "emoji": "🎯"} extra';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.name).toBe("测试");
|
||||
expect(result.emoji).toBe("🎯");
|
||||
});
|
||||
|
||||
it("handles multiple levels of nesting", () => {
|
||||
const input = '{"a": {"b": {"c": {"d": "value"}}}} junk';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.a.b.c.d).toBe("value");
|
||||
});
|
||||
|
||||
it("handles arrays of objects", () => {
|
||||
const input = '[{"id": 1, "name": "test1"}, {"id": 2, "name": "test2"}] garbage';
|
||||
const result = parseJSON(input, []);
|
||||
expect(result.length).toBe(2);
|
||||
expect(result[0].id).toBe(1);
|
||||
expect(result[1].name).toBe("test2");
|
||||
});
|
||||
});
|
||||
|
||||
describe("parseJSON - with code block markers", () => {
|
||||
it("strips json code block markers", () => {
|
||||
const input = '```json\n{"key": "value"}\n```';
|
||||
const result = parseJSON(input, null);
|
||||
expect(result.key).toBe("value");
|
||||
});
|
||||
|
||||
it("strips js code block markers", () => {
|
||||
const input = '```js\n{"key": "value"}\n```';
|
||||
const result = parseJSON(input, null);
|
||||
expect(result.key).toBe("value");
|
||||
});
|
||||
|
||||
it("strips ts code block markers", () => {
|
||||
const input = '```ts\n{"key": "value"}\n```';
|
||||
const result = parseJSON(input, null);
|
||||
expect(result.key).toBe("value");
|
||||
});
|
||||
|
||||
it("strips plaintext code block markers", () => {
|
||||
const input = '```plaintext\n{"key": "value"}\n```';
|
||||
const result = parseJSON(input, null);
|
||||
expect(result.key).toBe("value");
|
||||
});
|
||||
|
||||
it("strips generic code block markers", () => {
|
||||
const input = '```\n{"key": "value"}\n```';
|
||||
const result = parseJSON(input, null);
|
||||
expect(result.key).toBe("value");
|
||||
});
|
||||
|
||||
it("handles code block without closing marker", () => {
|
||||
const input = '```json\n{"key": "value"}';
|
||||
const result = parseJSON(input, null);
|
||||
expect(result.key).toBe("value");
|
||||
});
|
||||
|
||||
it("handles code block with extra whitespace", () => {
|
||||
const input = '```json \n{"key": "value"}\n``` ';
|
||||
const result = parseJSON(input, null);
|
||||
expect(result.key).toBe("value");
|
||||
});
|
||||
});
|
||||
|
||||
describe("parseJSON - issue #598 specific cases", () => {
|
||||
it("handles JSON with extra tokens from quantized models", () => {
|
||||
// This is similar to what Qwen3 235B returns
|
||||
const input =
|
||||
'{"text": "Published: 2010-01-07\\nTitle: Photon Counting OTDR", "data": "Published:", "reminding": " 2010-01-07\\nTitle: Photon"} some garbage tokens';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.text).toBeTruthy();
|
||||
expect(result.text).toContain("Published");
|
||||
expect(result.data).toBeTruthy();
|
||||
expect(result.reminding).toBeTruthy();
|
||||
});
|
||||
|
||||
it("handles search results JSON with extra tokens", () => {
|
||||
const input = `[
|
||||
{"type": "page", "title": "Example", "url": "https://example.com", "content": "Example content"},
|
||||
{"type": "page", "title": "Test", "url": "https://test.com", "content": "Test content"}
|
||||
] trailing garbage`;
|
||||
const result = parseJSON(input, []);
|
||||
expect(result.length).toBe(2);
|
||||
expect(result[0].type).toBe("page");
|
||||
expect(result[1].title).toBe("Test");
|
||||
});
|
||||
|
||||
it("handles crawler response with extra tokens", () => {
|
||||
const input = `{
|
||||
"title": "Article Title",
|
||||
"content": "Article content here..."
|
||||
} [incomplete json or garbage`;
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.title).toBe("Article Title");
|
||||
expect(result.content).toContain("Article content");
|
||||
});
|
||||
|
||||
it("handles non-JSON content gracefully", () => {
|
||||
const input = "This is just plain text, not JSON";
|
||||
const fallback = { default: true };
|
||||
const result = parseJSON(input, fallback);
|
||||
// best-effort-json-parser may parse plain text as key-value pairs
|
||||
// Just ensure we get some result (not throwing an error)
|
||||
expect(result).toBeDefined();
|
||||
expect(result).not.toBeNull();
|
||||
});
|
||||
|
||||
it("returns fallback for null input", () => {
|
||||
const fallback = [{ default: true }];
|
||||
const result = parseJSON(null, fallback);
|
||||
expect(result).toEqual(fallback);
|
||||
});
|
||||
|
||||
it("returns fallback for undefined input", () => {
|
||||
const fallback = [];
|
||||
const result = parseJSON(undefined, fallback);
|
||||
expect(result).toEqual(fallback);
|
||||
});
|
||||
|
||||
it("returns fallback for empty string input", () => {
|
||||
const fallback = {};
|
||||
const result = parseJSON("", fallback);
|
||||
expect(result).toEqual(fallback);
|
||||
});
|
||||
});
|
||||
|
||||
describe("parseJSON - edge cases", () => {
|
||||
it("handles JSON with special characters in strings", () => {
|
||||
const input = '{"text": "Special chars: @#$%^&*()"} extra';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.text).toBe("Special chars: @#$%^&*()");
|
||||
});
|
||||
|
||||
it("handles JSON with newlines in strings", () => {
|
||||
const input = '{"text": "Line 1\\nLine 2\\nLine 3"} junk';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.text).toContain("Line");
|
||||
});
|
||||
|
||||
it("handles JSON with tabs in strings", () => {
|
||||
const input = '{"text": "Col1\\tCol2\\tCol3"} trash';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.text).toContain("Col");
|
||||
});
|
||||
|
||||
it("handles deeply nested objects", () => {
|
||||
const input = '{"a":{"b":{"c":{"d":{"e":{"f":"deep"}}}}}}} extra';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.a.b.c.d.e.f).toBe("deep");
|
||||
});
|
||||
|
||||
it("handles large arrays", () => {
|
||||
const largeArray = Array.from({ length: 100 }, (_, i) => ({ id: i }));
|
||||
const input = JSON.stringify(largeArray) + " garbage text";
|
||||
const result = parseJSON(input, []);
|
||||
expect(result.length).toBe(100);
|
||||
expect(result[99].id).toBe(99);
|
||||
});
|
||||
|
||||
it("handles whitespace in JSON", () => {
|
||||
const input = `{
|
||||
"key" : "value" ,
|
||||
"number" : 42
|
||||
} extra`;
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.key).toBe("value");
|
||||
expect(result.number).toBe(42);
|
||||
});
|
||||
|
||||
it("handles JSON with escaped slashes", () => {
|
||||
const input = '{"url": "https:\\/\\/example.com"} junk';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.url).toContain("example.com");
|
||||
});
|
||||
|
||||
it("preserves numeric precision", () => {
|
||||
const input = '{"value": 1.23456789} extra';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.value).toBe(1.23456789);
|
||||
});
|
||||
|
||||
it("handles JSON with very long strings", () => {
|
||||
const longString = "A".repeat(10000);
|
||||
const input = `{"text": "${longString}"} garbage`;
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.text.length).toBe(10000);
|
||||
});
|
||||
});
|
||||
|
||||
// Verifies that the generic parameter of parseJSON<T> flows through to the
// parsed result for both object and array shapes, even when the raw input
// carries trailing garbage after the JSON payload.
describe("parseJSON - type safety", () => {
  it("properly types object results", () => {
    interface TestObject {
      id: number;
      name: string;
      active: boolean;
    }
    const input = '{"id": 1, "name": "test", "active": true} junk';
    const fallback: TestObject = { id: 0, name: "", active: false };
    const result = parseJSON<TestObject>(input, fallback);
    expect(result.id).toBe(1);
    expect(result.name).toBe("test");
    expect(result.active).toBe(true);
  });

  it("properly types array results", () => {
    interface Item {
      id: number;
      label: string;
    }
    const input = '[{"id": 1, "label": "a"}, {"id": 2, "label": "b"}] extra';
    const fallback: Item[] = [];
    const result = parseJSON<Item[]>(input, fallback);
    expect(result[0].id).toBe(1);
    expect(result[1].label).toBe("b");
  });
});
|
||||
|
||||
describe("parseJSON - malformed JSON recovery", () => {
|
||||
it("handles missing closing braces", () => {
|
||||
const input = '{"key": "value"';
|
||||
const result = parseJSON(input, { key: "default" });
|
||||
// Should return something (either fixed JSON or fallback)
|
||||
expect(result).toBeDefined();
|
||||
});
|
||||
|
||||
it("handles extra closing braces", () => {
|
||||
const input = '{"key": "value"}}}';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.key).toBe("value");
|
||||
});
|
||||
|
||||
it("handles mixed quotes", () => {
|
||||
const input = '{"key": "value"} extra';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.key).toBe("value");
|
||||
});
|
||||
|
||||
it("handles unquoted keys (not valid JSON, uses fallback)", () => {
|
||||
const input = "{key: 'value'} extra";
|
||||
const fallback = { key: "default" };
|
||||
const result = parseJSON(input, fallback);
|
||||
// Should return something
|
||||
expect(result).toBeDefined();
|
||||
});
|
||||
});
|
||||
|
||||
describe("parseJSON - real-world scenarios", () => {
|
||||
it("handles Tavily search results format", () => {
|
||||
const input = `[
|
||||
{
|
||||
"type": "page",
|
||||
"title": "Sample Article",
|
||||
"url": "https://example.com/article",
|
||||
"content": "This is sample content..."
|
||||
}
|
||||
] processing complete`;
|
||||
const result = parseJSON(input, []);
|
||||
expect(result[0].type).toBe("page");
|
||||
expect(result[0].title).toBe("Sample Article");
|
||||
});
|
||||
|
||||
it("handles crawler article format", () => {
|
||||
const input = `{
|
||||
"title": "News Article",
|
||||
"content": "Article body text...",
|
||||
"author": "John Doe",
|
||||
"date": "2024-01-01"
|
||||
} [incomplete extra`;
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.title).toBe("News Article");
|
||||
expect(result.content).toBeDefined();
|
||||
});
|
||||
|
||||
it("handles local search tool results", () => {
|
||||
const input = `[
|
||||
{
|
||||
"id": "doc-1",
|
||||
"title": "Document 1",
|
||||
"content": "Document content here"
|
||||
},
|
||||
{
|
||||
"id": "doc-2",
|
||||
"title": "Document 2",
|
||||
"content": "Another document"
|
||||
}
|
||||
] extra garbage`;
|
||||
const result = parseJSON(input, []);
|
||||
expect(result.length).toBe(2);
|
||||
expect(result[0].id).toBe("doc-1");
|
||||
});
|
||||
|
||||
it("handles Python REPL output with JSON", () => {
|
||||
const input = `{"result": 42, "error": null, "stdout": "Output here"} [process ended]`;
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.result).toBe(42);
|
||||
expect(result.error).toBeNull();
|
||||
});
|
||||
|
||||
it("handles MCP tool response format", () => {
|
||||
const input = `{
|
||||
"tool": "web_search",
|
||||
"status": "success",
|
||||
"data": [{"title": "Result", "url": "https://example.com"}]
|
||||
} additional text`;
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.tool).toBe("web_search");
|
||||
expect(result.data[0].title).toBe("Result");
|
||||
});
|
||||
});
|
||||
|
||||
describe("parseJSON - issue #598 regression tests", () => {
|
||||
it("does not lose data when removing extra tokens", () => {
|
||||
const input = `{
|
||||
"research": "Complete research data here with lots of information",
|
||||
"sources": [
|
||||
{"title": "Source 1", "url": "https://source1.com"},
|
||||
{"title": "Source 2", "url": "https://source2.com"}
|
||||
]
|
||||
} garbage tokens that should be removed`;
|
||||
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.research).toBeDefined();
|
||||
expect(result.sources.length).toBe(2);
|
||||
expect(result.sources[0].title).toBe("Source 1");
|
||||
});
|
||||
|
||||
it("handles consecutive tool calls with JSON", () => {
|
||||
const firstResult = '{"step": 1, "data": "first"} extra';
|
||||
const secondResult = '{"step": 2, "data": "second"} junk';
|
||||
|
||||
const result1 = parseJSON(firstResult, {});
|
||||
const result2 = parseJSON(secondResult, {});
|
||||
|
||||
expect(result1.step).toBe(1);
|
||||
expect(result2.step).toBe(2);
|
||||
});
|
||||
|
||||
it("maintains performance with large responses", () => {
|
||||
const largeContent = "A".repeat(50000);
|
||||
const input = `{"content": "${largeContent}", "status": "ok"} extra data`;
|
||||
|
||||
const startTime = Date.now();
|
||||
const result = parseJSON(input, {});
|
||||
const duration = Date.now() - startTime;
|
||||
|
||||
expect(result.content).toBeDefined();
|
||||
expect(result.status).toBe("ok");
|
||||
// Should complete quickly (< 2 seconds for this size)
|
||||
expect(duration).toBeLessThan(2000);
|
||||
});
|
||||
|
||||
it("handles multiple consecutive extra tokens", () => {
|
||||
const input =
|
||||
'{"data": "value"}} } ] unexpected tokens here } { [ ) ] incomplete';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.data).toBe("value");
|
||||
});
|
||||
|
||||
it("handles unicode garbage after JSON", () => {
|
||||
const input = '{"text": "测试"} 乱码数据 🎯 garbage';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.text).toBe("测试");
|
||||
});
|
||||
});
|
||||
Reference in New Issue
Block a user