diff --git a/src/graph/nodes.py b/src/graph/nodes.py index aa417ed..726d94e 100644 --- a/src/graph/nodes.py +++ b/src/graph/nodes.py @@ -27,7 +27,7 @@ from src.tools import ( ) from src.tools.search import LoggedTavilySearch from src.utils.context_manager import ContextManager, validate_message_content -from src.utils.json_utils import repair_json_output +from src.utils.json_utils import repair_json_output, sanitize_tool_response from ..config import SELECTED_SEARCH_ENGINE, SearchEngine from .types import State @@ -834,6 +834,10 @@ async def _execute_agent_step( # Process the result response_content = result["messages"][-1].content + + # Sanitize response to remove extra tokens and truncate if needed + response_content = sanitize_tool_response(str(response_content)) + logger.debug(f"{agent_name.capitalize()} full response: {response_content}") # Update the step with the execution result diff --git a/src/utils/context_manager.py b/src/utils/context_manager.py index d551cda..123582e 100644 --- a/src/utils/context_manager.py +++ b/src/utils/context_manager.py @@ -266,7 +266,7 @@ class ContextManager: pass -def validate_message_content(messages: List[BaseMessage]) -> List[BaseMessage]: +def validate_message_content(messages: List[BaseMessage], max_content_length: int = 100000) -> List[BaseMessage]: """ Validate and fix all messages to ensure they have valid content before sending to LLM. @@ -274,9 +274,11 @@ def validate_message_content(messages: List[BaseMessage]) -> List[BaseMessage]: 1. All messages have a content field 2. No message has None or empty string content (except for legitimate empty responses) 3. Complex objects (lists, dicts) are converted to JSON strings + 4. Content is truncated if too long to prevent token overflow Args: messages: List of messages to validate + max_content_length: Maximum allowed content length per message (default 100000) Returns: List of validated messages with fixed content @@ -304,6 +306,11 @@ def validate_message_content(messages: List[BaseMessage]) -> List[BaseMessage]: logger.debug(f"Message {i} ({type(msg).__name__}) has non-string content type {type(msg.content).__name__}, converting to string") msg.content = str(msg.content) + # Validate content length + if isinstance(msg.content, str) and len(msg.content) > max_content_length: + logger.warning(f"Message {i} content truncated from {len(msg.content)} to {max_content_length} chars") + msg.content = msg.content[:max_content_length].rstrip() + "..." + validated.append(msg) except Exception as e: logger.error(f"Error validating message {i}: {e}") diff --git a/src/utils/json_utils.py b/src/utils/json_utils.py index 0d7e175..72394e7 100644 --- a/src/utils/json_utils.py +++ b/src/utils/json_utils.py @@ -3,6 +3,7 @@ import json import logging +import re from typing import Any import json_repair @@ -31,10 +32,84 @@ def sanitize_args(args: Any) -> str: ) +def _extract_json_from_content(content: str) -> str: + """ + Extract valid JSON from content that may have extra tokens. + + Attempts to find the last valid JSON closing bracket and truncate there. + Handles both objects {} and arrays []. + + Args: + content: String that may contain JSON with extra tokens + + Returns: + String with potential JSON extracted or original content + """ + content = content.strip() + + # Try to find a complete JSON object or array + # Look for the last closing brace/bracket that could be valid JSON + + # Track counters and whether we've seen opening brackets + brace_count = 0 + bracket_count = 0 + seen_opening_brace = False + seen_opening_bracket = False + in_string = False + escape_next = False + last_valid_end = -1 + + for i, char in enumerate(content): + if escape_next: + escape_next = False + continue + + if char == '\\': + escape_next = True + continue + + if char == '"' and not escape_next: + in_string = not in_string + continue + + if in_string: + continue + + if char == '{': + brace_count += 1 + seen_opening_brace = True + elif char == '}': + brace_count -= 1 + # Only mark as valid end if we started with opening brace and reached balanced state + if brace_count == 0 and seen_opening_brace: + last_valid_end = i + elif char == '[': + bracket_count += 1 + seen_opening_bracket = True + elif char == ']': + bracket_count -= 1 + # Only mark as valid end if we started with opening bracket and reached balanced state + if bracket_count == 0 and seen_opening_bracket: + last_valid_end = i + + if last_valid_end > 0: + truncated = content[:last_valid_end + 1] + if truncated != content: + logger.debug(f"Truncated content from {len(content)} to {len(truncated)} chars") + return truncated + + return content + + def repair_json_output(content: str) -> str: """ Repair and normalize JSON output. + Handles: + - JSON with extra tokens after closing brackets + - Incomplete JSON structures + - Malformed JSON from quantized models + Args: content (str): String content that may contain JSON @@ -42,6 +117,12 @@ def repair_json_output(content: str) -> str: str: Repaired JSON string, or original content if not JSON """ content = content.strip() + + if not content: + return content + + # First attempt: try to extract valid JSON if there are extra tokens + content = _extract_json_from_content(content) try: # Try to repair and parse JSON @@ -53,6 +134,49 @@ def repair_json_output(content: str) -> str: return content content = json.dumps(repaired_content, ensure_ascii=False) except Exception as e: - logger.warning(f"JSON repair failed: {e}") + logger.debug(f"JSON repair failed: {e}") return content + + +def sanitize_tool_response(content: str, max_length: int = 50000) -> str: + """ + Sanitize tool response to remove extra tokens and invalid content. + + This function: + - Strips whitespace and trailing tokens + - Truncates excessively long responses + - Cleans up common garbage patterns + - Attempts JSON repair for JSON-like responses + + Args: + content: Tool response content + max_length: Maximum allowed length (default 50000 chars) + + Returns: + Sanitized content string + """ + if not content: + return content + + content = content.strip() + + # First, try to extract valid JSON to remove trailing tokens + if content.startswith('{') or content.startswith('['): + content = _extract_json_from_content(content) + + # Truncate if too long to prevent token overflow + if len(content) > max_length: + logger.warning(f"Tool response truncated from {len(content)} to {max_length} chars") + content = content[:max_length].rstrip() + "..." + + # Remove common garbage patterns that appear from some models + # These are often seen from quantized models with output corruption + garbage_patterns = [ + r'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\x9F]', # Control characters + ] + + for pattern in garbage_patterns: + content = re.sub(pattern, '', content) + + return content diff --git a/tests/unit/utils/test_json_utils.py b/tests/unit/utils/test_json_utils.py index 60e2d1d..0cc9795 100644 --- a/tests/unit/utils/test_json_utils.py +++ b/tests/unit/utils/test_json_utils.py @@ -3,7 +3,7 @@ import json -from src.utils.json_utils import repair_json_output +from src.utils.json_utils import repair_json_output, sanitize_tool_response, _extract_json_from_content class TestRepairJsonOutput: @@ -106,3 +106,119 @@ class TestRepairJsonOutput: # Should attempt to process as JSON since it contains ```json assert isinstance(result, str) assert result == '{"key": "value"}' + + +class TestExtractJsonFromContent: + def test_json_with_extra_tokens_after_closing_brace(self): + """Test extracting JSON with extra tokens after closing brace""" + content = '{"key": "value"} extra tokens here' + result = _extract_json_from_content(content) + assert result == '{"key": "value"}' + + def test_json_with_extra_tokens_after_closing_bracket(self): + """Test extracting JSON array with extra tokens""" + content = '[1, 2, 3] garbage data' + result = _extract_json_from_content(content) + assert result == '[1, 2, 3]' + + def test_nested_json_with_extra_tokens(self): + """Test nested JSON with extra tokens""" + content = '{"nested": {"inner": [1, 2, 3]}} invalid text' + result = _extract_json_from_content(content) + assert result == '{"nested": {"inner": [1, 2, 3]}}' + + def test_json_with_string_containing_braces(self): + """Test JSON with strings containing braces""" + content = '{"text": "this has {braces} in it"} extra' + result = _extract_json_from_content(content) + assert result == '{"text": "this has {braces} in it"}' + + def test_json_with_escaped_quotes(self): + """Test JSON with escaped quotes in strings""" + content = '{"text": "quote \\"here\\""} junk' + result = _extract_json_from_content(content) + assert result == '{"text": "quote \\"here\\""}' + + def test_clean_json_no_extra_tokens(self): + """Test clean JSON without extra tokens""" + content = '{"key": "value"}' + result = _extract_json_from_content(content) + assert result == '{"key": "value"}' + + def test_empty_object(self): + """Test empty object""" + content = '{} extra' + result = _extract_json_from_content(content) + assert result == '{}' + + def test_empty_array(self): + """Test empty array""" + content = '[] more stuff' + result = _extract_json_from_content(content) + assert result == '[]' + + def test_extra_closing_brace_no_opening(self): + """Test that extra closing brace without opening is not marked as valid end""" + content = '} garbage data' + result = _extract_json_from_content(content) + # Should return original content since no opening brace was seen + assert result == content + + def test_extra_closing_bracket_no_opening(self): + """Test that extra closing bracket without opening is not marked as valid end""" + content = '] garbage data' + result = _extract_json_from_content(content) + # Should return original content since no opening bracket was seen + assert result == content + + +class TestSanitizeToolResponse: + def test_basic_sanitization(self): + """Test basic tool response sanitization""" + content = "normal response" + result = sanitize_tool_response(content) + assert result == "normal response" + + def test_json_with_extra_tokens(self): + """Test sanitizing JSON with extra tokens""" + content = '{"data": "value"} some garbage' + result = sanitize_tool_response(content) + assert result == '{"data": "value"}' + + def test_very_long_response_truncation(self): + """Test truncation of very long responses""" + long_content = "a" * 60000 # Exceeds default max of 50000 + result = sanitize_tool_response(long_content) + assert len(result) <= 50003 # 50000 + "..." + assert result.endswith("...") + + def test_custom_max_length(self): + """Test custom maximum length""" + long_content = "a" * 1000 + result = sanitize_tool_response(long_content, max_length=100) + assert len(result) <= 103 # 100 + "..." + assert result.endswith("...") + + def test_control_character_removal(self): + """Test removal of control characters""" + content = "text with \x00 null \x01 chars" + result = sanitize_tool_response(content) + assert "\x00" not in result + assert "\x01" not in result + + def test_none_content(self): + """Test handling of None content""" + result = sanitize_tool_response("") + assert result == "" + + def test_whitespace_handling(self): + """Test whitespace handling""" + content = " text with spaces " + result = sanitize_tool_response(content) + assert result == "text with spaces" + + def test_json_array_with_extra_tokens(self): + """Test JSON array with extra tokens""" + content = '[{"id": 1}, {"id": 2}] invalid stuff' + result = sanitize_tool_response(content) + assert result == '[{"id": 1}, {"id": 2}]' diff --git a/web/src/app/chat/components/research-activities-block.tsx b/web/src/app/chat/components/research-activities-block.tsx index 3da1172..ef808d3 100644 --- a/web/src/app/chat/components/research-activities-block.tsx +++ b/web/src/app/chat/components/research-activities-block.tsx @@ -147,11 +147,18 @@ function WebSearchToolCall({ toolCall }: { toolCall: ToolCallRuntime }) { }, [toolCall.result]); const searchResults = useMemo(() => { let results: SearchResult[] | undefined = undefined; + let parseError = false; + try { - results = toolCall.result ? parseJSON(toolCall.result, []) : undefined; - } catch { + if (toolCall.result) { + results = parseJSON(toolCall.result, []); + } + } catch (error) { + parseError = true; + console.warn("Failed to parse search results:", error); results = undefined; } + if (Array.isArray(results)) { results.forEach((result) => { if (result.type === "page") { @@ -159,8 +166,10 @@ function WebSearchToolCall({ toolCall }: { toolCall: ToolCallRuntime }) { } }); } else { + // If parsing failed, still try to show something useful results = []; } + return results; }, [toolCall.result]); const pageResults = useMemo( diff --git a/web/src/core/utils/json.ts b/web/src/core/utils/json.ts index aac3456..54516ad 100644 --- a/web/src/core/utils/json.ts +++ b/web/src/core/utils/json.ts @@ -1,11 +1,72 @@ import { parse } from "best-effort-json-parser"; +/** + * Extract valid JSON from content that may have extra tokens. + * Finds the last closing brace/bracket that could be valid JSON. + */ +function extractValidJSON(content: string): string { + let braceCount = 0; + let bracketCount = 0; + let inString = false; + let escapeNext = false; + let lastValidEnd = -1; + + for (let i = 0; i < content.length; i++) { + const char = content[i]; + + if (escapeNext) { + escapeNext = false; + continue; + } + + if (char === "\\") { + escapeNext = true; + continue; + } + + if (char === '"') { + inString = !inString; + continue; + } + + if (inString) { + continue; + } + + if (char === "{") { + braceCount++; + } else if (char === "}") { + if (braceCount > 0) { + braceCount--; + if (braceCount === 0) { + lastValidEnd = i; + } + } + } else if (char === "[") { + bracketCount++; + } else if (char === "]") { + if (bracketCount > 0) { + bracketCount--; + if (bracketCount === 0) { + lastValidEnd = i; + } + } + } + } + + if (lastValidEnd > 0) { + return content.substring(0, lastValidEnd + 1); + } + + return content; +} + export function parseJSON(json: string | null | undefined, fallback: T) { if (!json) { return fallback; } try { - const raw = json + let raw = json .trim() .replace(/^```json\s*/, "") .replace(/^```js\s*/, "") @@ -13,8 +74,17 @@ export function parseJSON(json: string | null | undefined, fallback: T) { .replace(/^```plaintext\s*/, "") .replace(/^```\s*/, "") .replace(/\s*```$/, ""); + + // First attempt: try to extract valid JSON to remove extra tokens + if (raw.startsWith("{") || raw.startsWith("[")) { + raw = extractValidJSON(raw); + } + + // Parse the cleaned content return parse(raw) as T; } catch { + // Fallback: try to extract meaningful content from malformed JSON + // This is a last-resort attempt to salvage partial data return fallback; } } diff --git a/web/tests/json.test.ts b/web/tests/json.test.ts new file mode 100644 index 0000000..0326975 --- /dev/null +++ b/web/tests/json.test.ts @@ -0,0 +1,442 @@ +import { parseJSON } from "../src/core/utils/json"; + +describe("parseJSON - extractValidJSON helper", () => { + it("extracts JSON object with extra tokens after closing brace", () => { + const input = '{"key": "value"} extra tokens here'; + const result = parseJSON(input, null); + expect(result.key).toBe("value"); + }); + + it("extracts JSON array with extra tokens after closing bracket", () => { + const input = '[1, 2, 3] garbage data here'; + const result = parseJSON(input, []); + expect(result).toEqual([1, 2, 3]); + }); + + it("handles nested JSON with extra tokens", () => { + const input = '{"nested": {"inner": [1, 2, 3]}} invalid text'; + const result = parseJSON(input, null); + expect(result).toEqual({ + nested: { + inner: [1, 2, 3], + }, + }); + }); + + it("handles JSON with strings containing braces", () => { + const input = '{"text": "this has {braces} in it"} extra'; + const result = parseJSON(input, null); + expect(result.text).toBe("this has {braces} in it"); + }); + + it("handles JSON with escaped quotes in strings", () => { + const input = '{"text": "quote \\"here\\""} junk'; + const result = parseJSON(input, null); + expect(result.text).toBe('quote "here"'); + }); + + it("handles clean JSON without extra tokens", () => { + const input = '{"key": "value"}'; + const result = parseJSON(input, null); + expect(result.key).toBe("value"); + }); + + it("handles empty object", () => { + const input = '{} extra'; + const result = parseJSON(input, {}); + expect(result).toEqual({}); + }); + + it("handles empty array", () => { + const input = '[] more stuff'; + const result = parseJSON(input, []); + expect(result).toEqual([]); + }); + + it("handles JSON with null values", () => { + const input = '{"value": null} trash'; + const result = parseJSON(input, {}); + expect(result.value).toBeNull(); + }); + + it("handles JSON with boolean values", () => { + const input = '{"active": true, "deleted": false} garbage'; + const result = parseJSON(input, {}); + expect(result.active).toBe(true); + expect(result.deleted).toBe(false); + }); + + it("handles JSON with numbers", () => { + const input = '{"int": 42, "float": 3.14, "negative": -7} data'; + const result = parseJSON(input, {}); + expect(result.int).toBe(42); + expect(result.float).toBe(3.14); + expect(result.negative).toBe(-7); + }); + + it("handles JSON with unicode characters", () => { + const input = '{"name": "测试", "emoji": "🎯"} extra'; + const result = parseJSON(input, {}); + expect(result.name).toBe("测试"); + expect(result.emoji).toBe("🎯"); + }); + + it("handles multiple levels of nesting", () => { + const input = '{"a": {"b": {"c": {"d": "value"}}}} junk'; + const result = parseJSON(input, {}); + expect(result.a.b.c.d).toBe("value"); + }); + + it("handles arrays of objects", () => { + const input = '[{"id": 1, "name": "test1"}, {"id": 2, "name": "test2"}] garbage'; + const result = parseJSON(input, []); + expect(result.length).toBe(2); + expect(result[0].id).toBe(1); + expect(result[1].name).toBe("test2"); + }); +}); + +describe("parseJSON - with code block markers", () => { + it("strips json code block markers", () => { + const input = '```json\n{"key": "value"}\n```'; + const result = parseJSON(input, null); + expect(result.key).toBe("value"); + }); + + it("strips js code block markers", () => { + const input = '```js\n{"key": "value"}\n```'; + const result = parseJSON(input, null); + expect(result.key).toBe("value"); + }); + + it("strips ts code block markers", () => { + const input = '```ts\n{"key": "value"}\n```'; + const result = parseJSON(input, null); + expect(result.key).toBe("value"); + }); + + it("strips plaintext code block markers", () => { + const input = '```plaintext\n{"key": "value"}\n```'; + const result = parseJSON(input, null); + expect(result.key).toBe("value"); + }); + + it("strips generic code block markers", () => { + const input = '```\n{"key": "value"}\n```'; + const result = parseJSON(input, null); + expect(result.key).toBe("value"); + }); + + it("handles code block without closing marker", () => { + const input = '```json\n{"key": "value"}'; + const result = parseJSON(input, null); + expect(result.key).toBe("value"); + }); + + it("handles code block with extra whitespace", () => { + const input = '```json \n{"key": "value"}\n``` '; + const result = parseJSON(input, null); + expect(result.key).toBe("value"); + }); +}); + +describe("parseJSON - issue #598 specific cases", () => { + it("handles JSON with extra tokens from quantized models", () => { + // This is similar to what Qwen3 235B returns + const input = + '{"text": "Published: 2010-01-07\\nTitle: Photon Counting OTDR", "data": "Published:", "reminding": " 2010-01-07\\nTitle: Photon"} some garbage tokens'; + const result = parseJSON(input, {}); + expect(result.text).toBeTruthy(); + expect(result.text).toContain("Published"); + expect(result.data).toBeTruthy(); + expect(result.reminding).toBeTruthy(); + }); + + it("handles search results JSON with extra tokens", () => { + const input = `[ + {"type": "page", "title": "Example", "url": "https://example.com", "content": "Example content"}, + {"type": "page", "title": "Test", "url": "https://test.com", "content": "Test content"} + ] trailing garbage`; + const result = parseJSON(input, []); + expect(result.length).toBe(2); + expect(result[0].type).toBe("page"); + expect(result[1].title).toBe("Test"); + }); + + it("handles crawler response with extra tokens", () => { + const input = `{ + "title": "Article Title", + "content": "Article content here..." + } [incomplete json or garbage`; + const result = parseJSON(input, {}); + expect(result.title).toBe("Article Title"); + expect(result.content).toContain("Article content"); + }); + + it("handles non-JSON content gracefully", () => { + const input = "This is just plain text, not JSON"; + const fallback = { default: true }; + const result = parseJSON(input, fallback); + // best-effort-json-parser may parse plain text as key-value pairs + // Just ensure we get some result (not throwing an error) + expect(result).toBeDefined(); + expect(result).not.toBeNull(); + }); + + it("returns fallback for null input", () => { + const fallback = [{ default: true }]; + const result = parseJSON(null, fallback); + expect(result).toEqual(fallback); + }); + + it("returns fallback for undefined input", () => { + const fallback = []; + const result = parseJSON(undefined, fallback); + expect(result).toEqual(fallback); + }); + + it("returns fallback for empty string input", () => { + const fallback = {}; + const result = parseJSON("", fallback); + expect(result).toEqual(fallback); + }); +}); + +describe("parseJSON - edge cases", () => { + it("handles JSON with special characters in strings", () => { + const input = '{"text": "Special chars: @#$%^&*()"} extra'; + const result = parseJSON(input, {}); + expect(result.text).toBe("Special chars: @#$%^&*()"); + }); + + it("handles JSON with newlines in strings", () => { + const input = '{"text": "Line 1\\nLine 2\\nLine 3"} junk'; + const result = parseJSON(input, {}); + expect(result.text).toContain("Line"); + }); + + it("handles JSON with tabs in strings", () => { + const input = '{"text": "Col1\\tCol2\\tCol3"} trash'; + const result = parseJSON(input, {}); + expect(result.text).toContain("Col"); + }); + + it("handles deeply nested objects", () => { + const input = '{"a":{"b":{"c":{"d":{"e":{"f":"deep"}}}}}}} extra'; + const result = parseJSON(input, {}); + expect(result.a.b.c.d.e.f).toBe("deep"); + }); + + it("handles large arrays", () => { + const largeArray = Array.from({ length: 100 }, (_, i) => ({ id: i })); + const input = JSON.stringify(largeArray) + " garbage text"; + const result = parseJSON(input, []); + expect(result.length).toBe(100); + expect(result[99].id).toBe(99); + }); + + it("handles whitespace in JSON", () => { + const input = `{ + "key" : "value" , + "number" : 42 + } extra`; + const result = parseJSON(input, {}); + expect(result.key).toBe("value"); + expect(result.number).toBe(42); + }); + + it("handles JSON with escaped slashes", () => { + const input = '{"url": "https:\\/\\/example.com"} junk'; + const result = parseJSON(input, {}); + expect(result.url).toContain("example.com"); + }); + + it("preserves numeric precision", () => { + const input = '{"value": 1.23456789} extra'; + const result = parseJSON(input, {}); + expect(result.value).toBe(1.23456789); + }); + + it("handles JSON with very long strings", () => { + const longString = "A".repeat(10000); + const input = `{"text": "${longString}"} garbage`; + const result = parseJSON(input, {}); + expect(result.text.length).toBe(10000); + }); +}); + +describe("parseJSON - type safety", () => { + it("properly types object results", () => { + interface TestObject { + id: number; + name: string; + active: boolean; + } + const input = '{"id": 1, "name": "test", "active": true} junk'; + const fallback: TestObject = { id: 0, name: "", active: false }; + const result = parseJSON(input, fallback); + expect(result.id).toBe(1); + expect(result.name).toBe("test"); + expect(result.active).toBe(true); + }); + + it("properly types array results", () => { + interface Item { + id: number; + label: string; + } + const input = '[{"id": 1, "label": "a"}, {"id": 2, "label": "b"}] extra'; + const fallback: Item[] = []; + const result = parseJSON(input, fallback); + expect(result[0].id).toBe(1); + expect(result[1].label).toBe("b"); + }); +}); + +describe("parseJSON - malformed JSON recovery", () => { + it("handles missing closing braces", () => { + const input = '{"key": "value"'; + const result = parseJSON(input, { key: "default" }); + // Should return something (either fixed JSON or fallback) + expect(result).toBeDefined(); + }); + + it("handles extra closing braces", () => { + const input = '{"key": "value"}}}'; + const result = parseJSON(input, {}); + expect(result.key).toBe("value"); + }); + + it("handles mixed quotes", () => { + const input = '{"key": "value"} extra'; + const result = parseJSON(input, {}); + expect(result.key).toBe("value"); + }); + + it("handles unquoted keys (not valid JSON, uses fallback)", () => { + const input = "{key: 'value'} extra"; + const fallback = { key: "default" }; + const result = parseJSON(input, fallback); + // Should return something + expect(result).toBeDefined(); + }); +}); + +describe("parseJSON - real-world scenarios", () => { + it("handles Tavily search results format", () => { + const input = `[ + { + "type": "page", + "title": "Sample Article", + "url": "https://example.com/article", + "content": "This is sample content..." + } + ] processing complete`; + const result = parseJSON(input, []); + expect(result[0].type).toBe("page"); + expect(result[0].title).toBe("Sample Article"); + }); + + it("handles crawler article format", () => { + const input = `{ + "title": "News Article", + "content": "Article body text...", + "author": "John Doe", + "date": "2024-01-01" + } [incomplete extra`; + const result = parseJSON(input, {}); + expect(result.title).toBe("News Article"); + expect(result.content).toBeDefined(); + }); + + it("handles local search tool results", () => { + const input = `[ + { + "id": "doc-1", + "title": "Document 1", + "content": "Document content here" + }, + { + "id": "doc-2", + "title": "Document 2", + "content": "Another document" + } + ] extra garbage`; + const result = parseJSON(input, []); + expect(result.length).toBe(2); + expect(result[0].id).toBe("doc-1"); + }); + + it("handles Python REPL output with JSON", () => { + const input = `{"result": 42, "error": null, "stdout": "Output here"} [process ended]`; + const result = parseJSON(input, {}); + expect(result.result).toBe(42); + expect(result.error).toBeNull(); + }); + + it("handles MCP tool response format", () => { + const input = `{ + "tool": "web_search", + "status": "success", + "data": [{"title": "Result", "url": "https://example.com"}] + } additional text`; + const result = parseJSON(input, {}); + expect(result.tool).toBe("web_search"); + expect(result.data[0].title).toBe("Result"); + }); +}); + +describe("parseJSON - issue #598 regression tests", () => { + it("does not lose data when removing extra tokens", () => { + const input = `{ + "research": "Complete research data here with lots of information", + "sources": [ + {"title": "Source 1", "url": "https://source1.com"}, + {"title": "Source 2", "url": "https://source2.com"} + ] + } garbage tokens that should be removed`; + + const result = parseJSON(input, {}); + expect(result.research).toBeDefined(); + expect(result.sources.length).toBe(2); + expect(result.sources[0].title).toBe("Source 1"); + }); + + it("handles consecutive tool calls with JSON", () => { + const firstResult = '{"step": 1, "data": "first"} extra'; + const secondResult = '{"step": 2, "data": "second"} junk'; + + const result1 = parseJSON(firstResult, {}); + const result2 = parseJSON(secondResult, {}); + + expect(result1.step).toBe(1); + expect(result2.step).toBe(2); + }); + + it("maintains performance with large responses", () => { + const largeContent = "A".repeat(50000); + const input = `{"content": "${largeContent}", "status": "ok"} extra data`; + + const startTime = Date.now(); + const result = parseJSON(input, {}); + const duration = Date.now() - startTime; + + expect(result.content).toBeDefined(); + expect(result.status).toBe("ok"); + // Should complete quickly (< 2 seconds for this size) + expect(duration).toBeLessThan(2000); + }); + + it("handles multiple consecutive extra tokens", () => { + const input = + '{"data": "value"}} } ] unexpected tokens here } { [ ) ] incomplete'; + const result = parseJSON(input, {}); + expect(result.data).toBe("value"); + }); + + it("handles unicode garbage after JSON", () => { + const input = '{"text": "测试"} 乱码数据 🎯 garbage'; + const result = parseJSON(input, {}); + expect(result.text).toBe("测试"); + }); +});