mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-03 06:12:14 +08:00
fix: parsed json with extra tokens issue (#656)
Fixes #598 * fix: parsed json with extra tokens issue * Added unit test for json.ts * fix the json unit test running issue * Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update the code with code review suggestion --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Willem Jiang <143703838+willem-bd@users.noreply.github.com>
This commit is contained in:
@@ -27,7 +27,7 @@ from src.tools import (
|
||||
)
|
||||
from src.tools.search import LoggedTavilySearch
|
||||
from src.utils.context_manager import ContextManager, validate_message_content
|
||||
from src.utils.json_utils import repair_json_output
|
||||
from src.utils.json_utils import repair_json_output, sanitize_tool_response
|
||||
|
||||
from ..config import SELECTED_SEARCH_ENGINE, SearchEngine
|
||||
from .types import State
|
||||
@@ -834,6 +834,10 @@ async def _execute_agent_step(
|
||||
|
||||
# Process the result
|
||||
response_content = result["messages"][-1].content
|
||||
|
||||
# Sanitize response to remove extra tokens and truncate if needed
|
||||
response_content = sanitize_tool_response(str(response_content))
|
||||
|
||||
logger.debug(f"{agent_name.capitalize()} full response: {response_content}")
|
||||
|
||||
# Update the step with the execution result
|
||||
|
||||
@@ -266,7 +266,7 @@ class ContextManager:
|
||||
pass
|
||||
|
||||
|
||||
def validate_message_content(messages: List[BaseMessage]) -> List[BaseMessage]:
|
||||
def validate_message_content(messages: List[BaseMessage], max_content_length: int = 100000) -> List[BaseMessage]:
|
||||
"""
|
||||
Validate and fix all messages to ensure they have valid content before sending to LLM.
|
||||
|
||||
@@ -274,9 +274,11 @@ def validate_message_content(messages: List[BaseMessage]) -> List[BaseMessage]:
|
||||
1. All messages have a content field
|
||||
2. No message has None or empty string content (except for legitimate empty responses)
|
||||
3. Complex objects (lists, dicts) are converted to JSON strings
|
||||
4. Content is truncated if too long to prevent token overflow
|
||||
|
||||
Args:
|
||||
messages: List of messages to validate
|
||||
max_content_length: Maximum allowed content length per message (default 100000)
|
||||
|
||||
Returns:
|
||||
List of validated messages with fixed content
|
||||
@@ -304,6 +306,11 @@ def validate_message_content(messages: List[BaseMessage]) -> List[BaseMessage]:
|
||||
logger.debug(f"Message {i} ({type(msg).__name__}) has non-string content type {type(msg.content).__name__}, converting to string")
|
||||
msg.content = str(msg.content)
|
||||
|
||||
# Validate content length
|
||||
if isinstance(msg.content, str) and len(msg.content) > max_content_length:
|
||||
logger.warning(f"Message {i} content truncated from {len(msg.content)} to {max_content_length} chars")
|
||||
msg.content = msg.content[:max_content_length].rstrip() + "..."
|
||||
|
||||
validated.append(msg)
|
||||
except Exception as e:
|
||||
logger.error(f"Error validating message {i}: {e}")
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
import json_repair
|
||||
@@ -31,10 +32,84 @@ def sanitize_args(args: Any) -> str:
|
||||
)
|
||||
|
||||
|
||||
def _extract_json_from_content(content: str) -> str:
|
||||
"""
|
||||
Extract valid JSON from content that may have extra tokens.
|
||||
|
||||
Attempts to find the last valid JSON closing bracket and truncate there.
|
||||
Handles both objects {} and arrays [].
|
||||
|
||||
Args:
|
||||
content: String that may contain JSON with extra tokens
|
||||
|
||||
Returns:
|
||||
String with potential JSON extracted or original content
|
||||
"""
|
||||
content = content.strip()
|
||||
|
||||
# Try to find a complete JSON object or array
|
||||
# Look for the last closing brace/bracket that could be valid JSON
|
||||
|
||||
# Track counters and whether we've seen opening brackets
|
||||
brace_count = 0
|
||||
bracket_count = 0
|
||||
seen_opening_brace = False
|
||||
seen_opening_bracket = False
|
||||
in_string = False
|
||||
escape_next = False
|
||||
last_valid_end = -1
|
||||
|
||||
for i, char in enumerate(content):
|
||||
if escape_next:
|
||||
escape_next = False
|
||||
continue
|
||||
|
||||
if char == '\\':
|
||||
escape_next = True
|
||||
continue
|
||||
|
||||
if char == '"' and not escape_next:
|
||||
in_string = not in_string
|
||||
continue
|
||||
|
||||
if in_string:
|
||||
continue
|
||||
|
||||
if char == '{':
|
||||
brace_count += 1
|
||||
seen_opening_brace = True
|
||||
elif char == '}':
|
||||
brace_count -= 1
|
||||
# Only mark as valid end if we started with opening brace and reached balanced state
|
||||
if brace_count == 0 and seen_opening_brace:
|
||||
last_valid_end = i
|
||||
elif char == '[':
|
||||
bracket_count += 1
|
||||
seen_opening_bracket = True
|
||||
elif char == ']':
|
||||
bracket_count -= 1
|
||||
# Only mark as valid end if we started with opening bracket and reached balanced state
|
||||
if bracket_count == 0 and seen_opening_bracket:
|
||||
last_valid_end = i
|
||||
|
||||
if last_valid_end > 0:
|
||||
truncated = content[:last_valid_end + 1]
|
||||
if truncated != content:
|
||||
logger.debug(f"Truncated content from {len(content)} to {len(truncated)} chars")
|
||||
return truncated
|
||||
|
||||
return content
|
||||
|
||||
|
||||
def repair_json_output(content: str) -> str:
|
||||
"""
|
||||
Repair and normalize JSON output.
|
||||
|
||||
Handles:
|
||||
- JSON with extra tokens after closing brackets
|
||||
- Incomplete JSON structures
|
||||
- Malformed JSON from quantized models
|
||||
|
||||
Args:
|
||||
content (str): String content that may contain JSON
|
||||
|
||||
@@ -42,6 +117,12 @@ def repair_json_output(content: str) -> str:
|
||||
str: Repaired JSON string, or original content if not JSON
|
||||
"""
|
||||
content = content.strip()
|
||||
|
||||
if not content:
|
||||
return content
|
||||
|
||||
# First attempt: try to extract valid JSON if there are extra tokens
|
||||
content = _extract_json_from_content(content)
|
||||
|
||||
try:
|
||||
# Try to repair and parse JSON
|
||||
@@ -53,6 +134,49 @@ def repair_json_output(content: str) -> str:
|
||||
return content
|
||||
content = json.dumps(repaired_content, ensure_ascii=False)
|
||||
except Exception as e:
|
||||
logger.warning(f"JSON repair failed: {e}")
|
||||
logger.debug(f"JSON repair failed: {e}")
|
||||
|
||||
return content
|
||||
|
||||
|
||||
# Control characters (excluding \t, \n, \r) sometimes emitted as output
# corruption by quantized models; compiled once instead of per call.
_CONTROL_CHARS_RE = re.compile(r'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\x9F]')


def sanitize_tool_response(content: str, max_length: int = 50000) -> str:
    """
    Sanitize tool response to remove extra tokens and invalid content.

    This function:
    - Strips leading/trailing whitespace
    - Attempts JSON extraction for JSON-like responses to drop trailing tokens
    - Truncates excessively long responses
    - Removes control characters seen from some quantized models

    Args:
        content: Tool response content
        max_length: Maximum allowed length (default 50000 chars)

    Returns:
        Sanitized content string
    """
    if not content:
        return content

    content = content.strip()

    # For JSON-like responses, drop trailing garbage after the last
    # balanced closing bracket.
    if content.startswith(('{', '[')):
        content = _extract_json_from_content(content)

    # Truncate if too long to prevent token overflow downstream.
    if len(content) > max_length:
        logger.warning(f"Tool response truncated from {len(content)} to {max_length} chars")
        content = content[:max_length].rstrip() + "..."

    # Strip control characters (output corruption from some models).
    return _CONTROL_CHARS_RE.sub('', content)
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
import json
|
||||
|
||||
from src.utils.json_utils import repair_json_output
|
||||
from src.utils.json_utils import repair_json_output, sanitize_tool_response, _extract_json_from_content
|
||||
|
||||
|
||||
class TestRepairJsonOutput:
|
||||
@@ -106,3 +106,119 @@ class TestRepairJsonOutput:
|
||||
# Should attempt to process as JSON since it contains ```json
|
||||
assert isinstance(result, str)
|
||||
assert result == '{"key": "value"}'
|
||||
|
||||
|
||||
class TestExtractJsonFromContent:
    """Unit tests for _extract_json_from_content.

    Covers truncation of trailing garbage after balanced JSON objects and
    arrays, correct handling of braces/quotes inside string literals, and
    the guard against stray closing brackets with no matching opener.
    """

    def test_json_with_extra_tokens_after_closing_brace(self):
        """Test extracting JSON with extra tokens after closing brace"""
        content = '{"key": "value"} extra tokens here'
        result = _extract_json_from_content(content)
        assert result == '{"key": "value"}'

    def test_json_with_extra_tokens_after_closing_bracket(self):
        """Test extracting JSON array with extra tokens"""
        content = '[1, 2, 3] garbage data'
        result = _extract_json_from_content(content)
        assert result == '[1, 2, 3]'

    def test_nested_json_with_extra_tokens(self):
        """Test nested JSON with extra tokens"""
        content = '{"nested": {"inner": [1, 2, 3]}} invalid text'
        result = _extract_json_from_content(content)
        assert result == '{"nested": {"inner": [1, 2, 3]}}'

    def test_json_with_string_containing_braces(self):
        """Test JSON with strings containing braces"""
        # Braces inside a string literal must not affect nesting depth.
        content = '{"text": "this has {braces} in it"} extra'
        result = _extract_json_from_content(content)
        assert result == '{"text": "this has {braces} in it"}'

    def test_json_with_escaped_quotes(self):
        """Test JSON with escaped quotes in strings"""
        # Escaped quotes must not toggle the in-string state.
        content = '{"text": "quote \\"here\\""} junk'
        result = _extract_json_from_content(content)
        assert result == '{"text": "quote \\"here\\""}'

    def test_clean_json_no_extra_tokens(self):
        """Test clean JSON without extra tokens"""
        content = '{"key": "value"}'
        result = _extract_json_from_content(content)
        assert result == '{"key": "value"}'

    def test_empty_object(self):
        """Test empty object"""
        content = '{} extra'
        result = _extract_json_from_content(content)
        assert result == '{}'

    def test_empty_array(self):
        """Test empty array"""
        content = '[] more stuff'
        result = _extract_json_from_content(content)
        assert result == '[]'

    def test_extra_closing_brace_no_opening(self):
        """Test that extra closing brace without opening is not marked as valid end"""
        content = '} garbage data'
        result = _extract_json_from_content(content)
        # Should return original content since no opening brace was seen
        assert result == content

    def test_extra_closing_bracket_no_opening(self):
        """Test that extra closing bracket without opening is not marked as valid end"""
        content = '] garbage data'
        result = _extract_json_from_content(content)
        # Should return original content since no opening bracket was seen
        assert result == content
|
||||
|
||||
class TestSanitizeToolResponse:
    """Unit tests for sanitize_tool_response.

    Covers pass-through of normal text, trailing-token removal for
    JSON-like responses, length truncation (default and custom limits),
    control-character stripping, and whitespace/empty-input handling.
    """

    def test_basic_sanitization(self):
        """Test basic tool response sanitization"""
        content = "normal response"
        result = sanitize_tool_response(content)
        assert result == "normal response"

    def test_json_with_extra_tokens(self):
        """Test sanitizing JSON with extra tokens"""
        content = '{"data": "value"} some garbage'
        result = sanitize_tool_response(content)
        assert result == '{"data": "value"}'

    def test_very_long_response_truncation(self):
        """Test truncation of very long responses"""
        long_content = "a" * 60000  # Exceeds default max of 50000
        result = sanitize_tool_response(long_content)
        assert len(result) <= 50003  # 50000 + "..."
        assert result.endswith("...")

    def test_custom_max_length(self):
        """Test custom maximum length"""
        long_content = "a" * 1000
        result = sanitize_tool_response(long_content, max_length=100)
        assert len(result) <= 103  # 100 + "..."
        assert result.endswith("...")

    def test_control_character_removal(self):
        """Test removal of control characters"""
        content = "text with \x00 null \x01 chars"
        result = sanitize_tool_response(content)
        assert "\x00" not in result
        assert "\x01" not in result

    def test_none_content(self):
        """Test handling of empty-string content (falsy input passthrough)"""
        result = sanitize_tool_response("")
        assert result == ""

    def test_whitespace_handling(self):
        """Test whitespace handling"""
        content = "  text with spaces  "
        result = sanitize_tool_response(content)
        assert result == "text with spaces"

    def test_json_array_with_extra_tokens(self):
        """Test JSON array with extra tokens"""
        content = '[{"id": 1}, {"id": 2}] invalid stuff'
        result = sanitize_tool_response(content)
        assert result == '[{"id": 1}, {"id": 2}]'
|
||||
|
||||
@@ -147,11 +147,18 @@ function WebSearchToolCall({ toolCall }: { toolCall: ToolCallRuntime }) {
|
||||
}, [toolCall.result]);
|
||||
const searchResults = useMemo<SearchResult[]>(() => {
|
||||
let results: SearchResult[] | undefined = undefined;
|
||||
let parseError = false;
|
||||
|
||||
try {
|
||||
results = toolCall.result ? parseJSON(toolCall.result, []) : undefined;
|
||||
} catch {
|
||||
if (toolCall.result) {
|
||||
results = parseJSON(toolCall.result, []);
|
||||
}
|
||||
} catch (error) {
|
||||
parseError = true;
|
||||
console.warn("Failed to parse search results:", error);
|
||||
results = undefined;
|
||||
}
|
||||
|
||||
if (Array.isArray(results)) {
|
||||
results.forEach((result) => {
|
||||
if (result.type === "page") {
|
||||
@@ -159,8 +166,10 @@ function WebSearchToolCall({ toolCall }: { toolCall: ToolCallRuntime }) {
|
||||
}
|
||||
});
|
||||
} else {
|
||||
// If parsing failed, still try to show something useful
|
||||
results = [];
|
||||
}
|
||||
|
||||
return results;
|
||||
}, [toolCall.result]);
|
||||
const pageResults = useMemo(
|
||||
|
||||
@@ -1,11 +1,72 @@
|
||||
import { parse } from "best-effort-json-parser";
|
||||
|
||||
/**
|
||||
* Extract valid JSON from content that may have extra tokens.
|
||||
* Finds the last closing brace/bracket that could be valid JSON.
|
||||
*/
|
||||
function extractValidJSON(content: string): string {
|
||||
let braceCount = 0;
|
||||
let bracketCount = 0;
|
||||
let inString = false;
|
||||
let escapeNext = false;
|
||||
let lastValidEnd = -1;
|
||||
|
||||
for (let i = 0; i < content.length; i++) {
|
||||
const char = content[i];
|
||||
|
||||
if (escapeNext) {
|
||||
escapeNext = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (char === "\\") {
|
||||
escapeNext = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (char === '"') {
|
||||
inString = !inString;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (inString) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (char === "{") {
|
||||
braceCount++;
|
||||
} else if (char === "}") {
|
||||
if (braceCount > 0) {
|
||||
braceCount--;
|
||||
if (braceCount === 0) {
|
||||
lastValidEnd = i;
|
||||
}
|
||||
}
|
||||
} else if (char === "[") {
|
||||
bracketCount++;
|
||||
} else if (char === "]") {
|
||||
if (bracketCount > 0) {
|
||||
bracketCount--;
|
||||
if (bracketCount === 0) {
|
||||
lastValidEnd = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (lastValidEnd > 0) {
|
||||
return content.substring(0, lastValidEnd + 1);
|
||||
}
|
||||
|
||||
return content;
|
||||
}
|
||||
|
||||
export function parseJSON<T>(json: string | null | undefined, fallback: T) {
|
||||
if (!json) {
|
||||
return fallback;
|
||||
}
|
||||
try {
|
||||
const raw = json
|
||||
let raw = json
|
||||
.trim()
|
||||
.replace(/^```json\s*/, "")
|
||||
.replace(/^```js\s*/, "")
|
||||
@@ -13,8 +74,17 @@ export function parseJSON<T>(json: string | null | undefined, fallback: T) {
|
||||
.replace(/^```plaintext\s*/, "")
|
||||
.replace(/^```\s*/, "")
|
||||
.replace(/\s*```$/, "");
|
||||
|
||||
// First attempt: try to extract valid JSON to remove extra tokens
|
||||
if (raw.startsWith("{") || raw.startsWith("[")) {
|
||||
raw = extractValidJSON(raw);
|
||||
}
|
||||
|
||||
// Parse the cleaned content
|
||||
return parse(raw) as T;
|
||||
} catch {
|
||||
// Fallback: try to extract meaningful content from malformed JSON
|
||||
// This is a last-resort attempt to salvage partial data
|
||||
return fallback;
|
||||
}
|
||||
}
|
||||
|
||||
442
web/tests/json.test.ts
Normal file
442
web/tests/json.test.ts
Normal file
@@ -0,0 +1,442 @@
|
||||
import { parseJSON } from "../src/core/utils/json";
|
||||
|
||||
describe("parseJSON - extractValidJSON helper", () => {
|
||||
it("extracts JSON object with extra tokens after closing brace", () => {
|
||||
const input = '{"key": "value"} extra tokens here';
|
||||
const result = parseJSON(input, null);
|
||||
expect(result.key).toBe("value");
|
||||
});
|
||||
|
||||
it("extracts JSON array with extra tokens after closing bracket", () => {
|
||||
const input = '[1, 2, 3] garbage data here';
|
||||
const result = parseJSON(input, []);
|
||||
expect(result).toEqual([1, 2, 3]);
|
||||
});
|
||||
|
||||
it("handles nested JSON with extra tokens", () => {
|
||||
const input = '{"nested": {"inner": [1, 2, 3]}} invalid text';
|
||||
const result = parseJSON(input, null);
|
||||
expect(result).toEqual({
|
||||
nested: {
|
||||
inner: [1, 2, 3],
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it("handles JSON with strings containing braces", () => {
|
||||
const input = '{"text": "this has {braces} in it"} extra';
|
||||
const result = parseJSON(input, null);
|
||||
expect(result.text).toBe("this has {braces} in it");
|
||||
});
|
||||
|
||||
it("handles JSON with escaped quotes in strings", () => {
|
||||
const input = '{"text": "quote \\"here\\""} junk';
|
||||
const result = parseJSON(input, null);
|
||||
expect(result.text).toBe('quote "here"');
|
||||
});
|
||||
|
||||
it("handles clean JSON without extra tokens", () => {
|
||||
const input = '{"key": "value"}';
|
||||
const result = parseJSON(input, null);
|
||||
expect(result.key).toBe("value");
|
||||
});
|
||||
|
||||
it("handles empty object", () => {
|
||||
const input = '{} extra';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result).toEqual({});
|
||||
});
|
||||
|
||||
it("handles empty array", () => {
|
||||
const input = '[] more stuff';
|
||||
const result = parseJSON(input, []);
|
||||
expect(result).toEqual([]);
|
||||
});
|
||||
|
||||
it("handles JSON with null values", () => {
|
||||
const input = '{"value": null} trash';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.value).toBeNull();
|
||||
});
|
||||
|
||||
it("handles JSON with boolean values", () => {
|
||||
const input = '{"active": true, "deleted": false} garbage';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.active).toBe(true);
|
||||
expect(result.deleted).toBe(false);
|
||||
});
|
||||
|
||||
it("handles JSON with numbers", () => {
|
||||
const input = '{"int": 42, "float": 3.14, "negative": -7} data';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.int).toBe(42);
|
||||
expect(result.float).toBe(3.14);
|
||||
expect(result.negative).toBe(-7);
|
||||
});
|
||||
|
||||
it("handles JSON with unicode characters", () => {
|
||||
const input = '{"name": "测试", "emoji": "🎯"} extra';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.name).toBe("测试");
|
||||
expect(result.emoji).toBe("🎯");
|
||||
});
|
||||
|
||||
it("handles multiple levels of nesting", () => {
|
||||
const input = '{"a": {"b": {"c": {"d": "value"}}}} junk';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.a.b.c.d).toBe("value");
|
||||
});
|
||||
|
||||
it("handles arrays of objects", () => {
|
||||
const input = '[{"id": 1, "name": "test1"}, {"id": 2, "name": "test2"}] garbage';
|
||||
const result = parseJSON(input, []);
|
||||
expect(result.length).toBe(2);
|
||||
expect(result[0].id).toBe(1);
|
||||
expect(result[1].name).toBe("test2");
|
||||
});
|
||||
});
|
||||
|
||||
describe("parseJSON - with code block markers", () => {
|
||||
it("strips json code block markers", () => {
|
||||
const input = '```json\n{"key": "value"}\n```';
|
||||
const result = parseJSON(input, null);
|
||||
expect(result.key).toBe("value");
|
||||
});
|
||||
|
||||
it("strips js code block markers", () => {
|
||||
const input = '```js\n{"key": "value"}\n```';
|
||||
const result = parseJSON(input, null);
|
||||
expect(result.key).toBe("value");
|
||||
});
|
||||
|
||||
it("strips ts code block markers", () => {
|
||||
const input = '```ts\n{"key": "value"}\n```';
|
||||
const result = parseJSON(input, null);
|
||||
expect(result.key).toBe("value");
|
||||
});
|
||||
|
||||
it("strips plaintext code block markers", () => {
|
||||
const input = '```plaintext\n{"key": "value"}\n```';
|
||||
const result = parseJSON(input, null);
|
||||
expect(result.key).toBe("value");
|
||||
});
|
||||
|
||||
it("strips generic code block markers", () => {
|
||||
const input = '```\n{"key": "value"}\n```';
|
||||
const result = parseJSON(input, null);
|
||||
expect(result.key).toBe("value");
|
||||
});
|
||||
|
||||
it("handles code block without closing marker", () => {
|
||||
const input = '```json\n{"key": "value"}';
|
||||
const result = parseJSON(input, null);
|
||||
expect(result.key).toBe("value");
|
||||
});
|
||||
|
||||
it("handles code block with extra whitespace", () => {
|
||||
const input = '```json \n{"key": "value"}\n``` ';
|
||||
const result = parseJSON(input, null);
|
||||
expect(result.key).toBe("value");
|
||||
});
|
||||
});
|
||||
|
||||
describe("parseJSON - issue #598 specific cases", () => {
|
||||
it("handles JSON with extra tokens from quantized models", () => {
|
||||
// This is similar to what Qwen3 235B returns
|
||||
const input =
|
||||
'{"text": "Published: 2010-01-07\\nTitle: Photon Counting OTDR", "data": "Published:", "reminding": " 2010-01-07\\nTitle: Photon"} some garbage tokens';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.text).toBeTruthy();
|
||||
expect(result.text).toContain("Published");
|
||||
expect(result.data).toBeTruthy();
|
||||
expect(result.reminding).toBeTruthy();
|
||||
});
|
||||
|
||||
it("handles search results JSON with extra tokens", () => {
|
||||
const input = `[
|
||||
{"type": "page", "title": "Example", "url": "https://example.com", "content": "Example content"},
|
||||
{"type": "page", "title": "Test", "url": "https://test.com", "content": "Test content"}
|
||||
] trailing garbage`;
|
||||
const result = parseJSON(input, []);
|
||||
expect(result.length).toBe(2);
|
||||
expect(result[0].type).toBe("page");
|
||||
expect(result[1].title).toBe("Test");
|
||||
});
|
||||
|
||||
it("handles crawler response with extra tokens", () => {
|
||||
const input = `{
|
||||
"title": "Article Title",
|
||||
"content": "Article content here..."
|
||||
} [incomplete json or garbage`;
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.title).toBe("Article Title");
|
||||
expect(result.content).toContain("Article content");
|
||||
});
|
||||
|
||||
it("handles non-JSON content gracefully", () => {
|
||||
const input = "This is just plain text, not JSON";
|
||||
const fallback = { default: true };
|
||||
const result = parseJSON(input, fallback);
|
||||
// best-effort-json-parser may parse plain text as key-value pairs
|
||||
// Just ensure we get some result (not throwing an error)
|
||||
expect(result).toBeDefined();
|
||||
expect(result).not.toBeNull();
|
||||
});
|
||||
|
||||
it("returns fallback for null input", () => {
|
||||
const fallback = [{ default: true }];
|
||||
const result = parseJSON(null, fallback);
|
||||
expect(result).toEqual(fallback);
|
||||
});
|
||||
|
||||
it("returns fallback for undefined input", () => {
|
||||
const fallback = [];
|
||||
const result = parseJSON(undefined, fallback);
|
||||
expect(result).toEqual(fallback);
|
||||
});
|
||||
|
||||
it("returns fallback for empty string input", () => {
|
||||
const fallback = {};
|
||||
const result = parseJSON("", fallback);
|
||||
expect(result).toEqual(fallback);
|
||||
});
|
||||
});
|
||||
|
||||
describe("parseJSON - edge cases", () => {
|
||||
it("handles JSON with special characters in strings", () => {
|
||||
const input = '{"text": "Special chars: @#$%^&*()"} extra';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.text).toBe("Special chars: @#$%^&*()");
|
||||
});
|
||||
|
||||
it("handles JSON with newlines in strings", () => {
|
||||
const input = '{"text": "Line 1\\nLine 2\\nLine 3"} junk';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.text).toContain("Line");
|
||||
});
|
||||
|
||||
it("handles JSON with tabs in strings", () => {
|
||||
const input = '{"text": "Col1\\tCol2\\tCol3"} trash';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.text).toContain("Col");
|
||||
});
|
||||
|
||||
it("handles deeply nested objects", () => {
|
||||
const input = '{"a":{"b":{"c":{"d":{"e":{"f":"deep"}}}}}}} extra';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.a.b.c.d.e.f).toBe("deep");
|
||||
});
|
||||
|
||||
it("handles large arrays", () => {
|
||||
const largeArray = Array.from({ length: 100 }, (_, i) => ({ id: i }));
|
||||
const input = JSON.stringify(largeArray) + " garbage text";
|
||||
const result = parseJSON(input, []);
|
||||
expect(result.length).toBe(100);
|
||||
expect(result[99].id).toBe(99);
|
||||
});
|
||||
|
||||
it("handles whitespace in JSON", () => {
|
||||
const input = `{
|
||||
"key" : "value" ,
|
||||
"number" : 42
|
||||
} extra`;
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.key).toBe("value");
|
||||
expect(result.number).toBe(42);
|
||||
});
|
||||
|
||||
it("handles JSON with escaped slashes", () => {
|
||||
const input = '{"url": "https:\\/\\/example.com"} junk';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.url).toContain("example.com");
|
||||
});
|
||||
|
||||
it("preserves numeric precision", () => {
|
||||
const input = '{"value": 1.23456789} extra';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.value).toBe(1.23456789);
|
||||
});
|
||||
|
||||
it("handles JSON with very long strings", () => {
|
||||
const longString = "A".repeat(10000);
|
||||
const input = `{"text": "${longString}"} garbage`;
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.text.length).toBe(10000);
|
||||
});
|
||||
});
|
||||
|
||||
// Verifies that the generic parameter of parseJSON<T> flows through to the
// parsed result for both object and array shapes, even when the raw input
// carries trailing garbage after the JSON payload.
describe("parseJSON - type safety", () => {
  it("properly types object results", () => {
    interface TestObject {
      id: number;
      name: string;
      active: boolean;
    }
    const input = '{"id": 1, "name": "test", "active": true} junk';
    const fallback: TestObject = { id: 0, name: "", active: false };
    const result = parseJSON<TestObject>(input, fallback);
    expect(result.id).toBe(1);
    expect(result.name).toBe("test");
    expect(result.active).toBe(true);
  });

  it("properly types array results", () => {
    interface Item {
      id: number;
      label: string;
    }
    const input = '[{"id": 1, "label": "a"}, {"id": 2, "label": "b"}] extra';
    const fallback: Item[] = [];
    const result = parseJSON<Item[]>(input, fallback);
    expect(result[0].id).toBe(1);
    expect(result[1].label).toBe("b");
  });
});
|
||||
|
||||
describe("parseJSON - malformed JSON recovery", () => {
|
||||
it("handles missing closing braces", () => {
|
||||
const input = '{"key": "value"';
|
||||
const result = parseJSON(input, { key: "default" });
|
||||
// Should return something (either fixed JSON or fallback)
|
||||
expect(result).toBeDefined();
|
||||
});
|
||||
|
||||
it("handles extra closing braces", () => {
|
||||
const input = '{"key": "value"}}}';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.key).toBe("value");
|
||||
});
|
||||
|
||||
it("handles mixed quotes", () => {
|
||||
const input = '{"key": "value"} extra';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.key).toBe("value");
|
||||
});
|
||||
|
||||
it("handles unquoted keys (not valid JSON, uses fallback)", () => {
|
||||
const input = "{key: 'value'} extra";
|
||||
const fallback = { key: "default" };
|
||||
const result = parseJSON(input, fallback);
|
||||
// Should return something
|
||||
expect(result).toBeDefined();
|
||||
});
|
||||
});
|
||||
|
||||
describe("parseJSON - real-world scenarios", () => {
|
||||
it("handles Tavily search results format", () => {
|
||||
const input = `[
|
||||
{
|
||||
"type": "page",
|
||||
"title": "Sample Article",
|
||||
"url": "https://example.com/article",
|
||||
"content": "This is sample content..."
|
||||
}
|
||||
] processing complete`;
|
||||
const result = parseJSON(input, []);
|
||||
expect(result[0].type).toBe("page");
|
||||
expect(result[0].title).toBe("Sample Article");
|
||||
});
|
||||
|
||||
it("handles crawler article format", () => {
|
||||
const input = `{
|
||||
"title": "News Article",
|
||||
"content": "Article body text...",
|
||||
"author": "John Doe",
|
||||
"date": "2024-01-01"
|
||||
} [incomplete extra`;
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.title).toBe("News Article");
|
||||
expect(result.content).toBeDefined();
|
||||
});
|
||||
|
||||
it("handles local search tool results", () => {
|
||||
const input = `[
|
||||
{
|
||||
"id": "doc-1",
|
||||
"title": "Document 1",
|
||||
"content": "Document content here"
|
||||
},
|
||||
{
|
||||
"id": "doc-2",
|
||||
"title": "Document 2",
|
||||
"content": "Another document"
|
||||
}
|
||||
] extra garbage`;
|
||||
const result = parseJSON(input, []);
|
||||
expect(result.length).toBe(2);
|
||||
expect(result[0].id).toBe("doc-1");
|
||||
});
|
||||
|
||||
it("handles Python REPL output with JSON", () => {
|
||||
const input = `{"result": 42, "error": null, "stdout": "Output here"} [process ended]`;
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.result).toBe(42);
|
||||
expect(result.error).toBeNull();
|
||||
});
|
||||
|
||||
it("handles MCP tool response format", () => {
|
||||
const input = `{
|
||||
"tool": "web_search",
|
||||
"status": "success",
|
||||
"data": [{"title": "Result", "url": "https://example.com"}]
|
||||
} additional text`;
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.tool).toBe("web_search");
|
||||
expect(result.data[0].title).toBe("Result");
|
||||
});
|
||||
});
|
||||
|
||||
describe("parseJSON - issue #598 regression tests", () => {
|
||||
it("does not lose data when removing extra tokens", () => {
|
||||
const input = `{
|
||||
"research": "Complete research data here with lots of information",
|
||||
"sources": [
|
||||
{"title": "Source 1", "url": "https://source1.com"},
|
||||
{"title": "Source 2", "url": "https://source2.com"}
|
||||
]
|
||||
} garbage tokens that should be removed`;
|
||||
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.research).toBeDefined();
|
||||
expect(result.sources.length).toBe(2);
|
||||
expect(result.sources[0].title).toBe("Source 1");
|
||||
});
|
||||
|
||||
it("handles consecutive tool calls with JSON", () => {
|
||||
const firstResult = '{"step": 1, "data": "first"} extra';
|
||||
const secondResult = '{"step": 2, "data": "second"} junk';
|
||||
|
||||
const result1 = parseJSON(firstResult, {});
|
||||
const result2 = parseJSON(secondResult, {});
|
||||
|
||||
expect(result1.step).toBe(1);
|
||||
expect(result2.step).toBe(2);
|
||||
});
|
||||
|
||||
it("maintains performance with large responses", () => {
|
||||
const largeContent = "A".repeat(50000);
|
||||
const input = `{"content": "${largeContent}", "status": "ok"} extra data`;
|
||||
|
||||
const startTime = Date.now();
|
||||
const result = parseJSON(input, {});
|
||||
const duration = Date.now() - startTime;
|
||||
|
||||
expect(result.content).toBeDefined();
|
||||
expect(result.status).toBe("ok");
|
||||
// Should complete quickly (< 2 seconds for this size)
|
||||
expect(duration).toBeLessThan(2000);
|
||||
});
|
||||
|
||||
it("handles multiple consecutive extra tokens", () => {
|
||||
const input =
|
||||
'{"data": "value"}} } ] unexpected tokens here } { [ ) ] incomplete';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.data).toBe("value");
|
||||
});
|
||||
|
||||
it("handles unicode garbage after JSON", () => {
|
||||
const input = '{"text": "测试"} 乱码数据 🎯 garbage';
|
||||
const result = parseJSON(input, {});
|
||||
expect(result.text).toBe("测试");
|
||||
});
|
||||
});
|
||||
Reference in New Issue
Block a user