fix: improve JSON repair handling for markdown code blocks (#841)

* fix: improve JSON repair handling for markdown code blocks

* unified import path

* compress_crawl_udf

* fix

* reverse
This commit is contained in:
Xun
2026-01-30 08:47:23 +08:00
committed by GitHub
parent 756421c3ac
commit 3adb4e90cb
4 changed files with 394 additions and 6 deletions

View File

@@ -6,6 +6,7 @@ import json
from src.utils.json_utils import (
_extract_json_from_content,
repair_json_output,
sanitize_args,
sanitize_tool_response,
)
@@ -39,6 +40,49 @@ class TestRepairJsonOutput:
expected = json.dumps({"key": "value"}, ensure_ascii=False)
assert result == expected
def test_json_with_code_block_uppercase_json(self):
"""Test JSON wrapped in ```JSON (uppercase) code block"""
content = '```JSON\n{"key": "value"}\n```'
result = repair_json_output(content)
expected = json.dumps({"key": "value"}, ensure_ascii=False)
assert result == expected
def test_json_with_code_block_uppercase_ts(self):
"""Test JSON wrapped in ```TS (uppercase) code block"""
content = '```TS\n{"key": "value"}\n```'
result = repair_json_output(content)
expected = json.dumps({"key": "value"}, ensure_ascii=False)
assert result == expected
def test_json_with_code_block_mixed_case_json(self):
"""Test JSON wrapped in ```Json (mixed case) code block"""
content = '```Json\n{"key": "value"}\n```'
result = repair_json_output(content)
expected = json.dumps({"key": "value"}, ensure_ascii=False)
assert result == expected
def test_json_with_code_block_uppercase_ts_with_prefix(self):
"""Test JSON wrapped in ```TS code block with prefix text"""
content = 'some prefix ```TS\n{"key": "value"}\n```'
result = repair_json_output(content)
expected = json.dumps({"key": "value"}, ensure_ascii=False)
assert result == expected
def test_json_with_code_block_uppercase_json_with_prefix(self):
"""Test JSON wrapped in ```JSON code block with prefix text - case sensitive fix"""
# This tests the fix for case-insensitive guard when fence is not at start
content = 'prefix ```JSON\n{"key": "value"}\n```'
result = repair_json_output(content)
expected = json.dumps({"key": "value"}, ensure_ascii=False)
assert result == expected
def test_json_with_plain_code_block_uppercase(self):
"""Test JSON wrapped in plain ``` code block (case insensitive)"""
content = '```\n{"key": "value"}\n```'
result = repair_json_output(content)
expected = json.dumps({"key": "value"}, ensure_ascii=False)
assert result == expected
def test_malformed_json_repair(self):
"""Test with malformed JSON that can be repaired"""
content = '{"key": "value", "incomplete":'
@@ -226,3 +270,312 @@ class TestSanitizeToolResponse:
content = '[{"id": 1}, {"id": 2}] invalid stuff'
result = sanitize_tool_response(content)
assert result == '[{"id": 1}, {"id": 2}]'
class TestSanitizeArgs:
def test_sanitize_special_characters(self):
"""Test sanitization of special characters"""
args = '{"key": "value", "array": [1, 2, 3]}'
result = sanitize_args(args)
assert result == '{"key": "value", "array": [1, 2, 3]}'
def test_sanitize_square_brackets(self):
"""Test sanitization of square brackets"""
args = '[1, 2, 3]'
result = sanitize_args(args)
assert result == '[1, 2, 3]'
def test_sanitize_curly_braces(self):
"""Test sanitization of curly braces"""
args = '{key: value}'
result = sanitize_args(args)
assert result == '{key: value}'
def test_sanitize_mixed_brackets(self):
"""Test sanitization of mixed bracket types"""
args = '{[test]}'
result = sanitize_args(args)
assert result == '{[test]}'
def test_sanitize_non_string_input(self):
"""Test sanitization of non-string input returns empty string"""
assert sanitize_args(None) == ""
assert sanitize_args(123) == ""
assert sanitize_args([1, 2, 3]) == ""
assert sanitize_args({"key": "value"}) == ""
def test_sanitize_empty_string(self):
"""Test sanitization of empty string"""
result = sanitize_args("")
assert result == ""
def test_sanitize_plain_text(self):
"""Test sanitization of plain text without special characters"""
args = "plain text without brackets or braces"
result = sanitize_args(args)
assert result == "plain text without brackets or braces"
def test_sanitize_nested_structures(self):
"""Test sanitization of deeply nested structures"""
args = '{"outer": {"inner": [1, [2, 3]]}}'
result = sanitize_args(args)
assert result == '{"outer": {"inner": [1, [2, 3]]}}'
class TestRepairJsonOutputEdgeCases:
def test_code_block_with_leading_spaces(self):
"""Test code block with leading spaces"""
content = ' ```json\n{"key": "value"}\n```'
result = repair_json_output(content)
expected = json.dumps({"key": "value"}, ensure_ascii=False)
assert result == expected
def test_code_block_with_tabs(self):
"""Test code block with tabs"""
content = '\t```json\n{"key": "value"}\n```'
result = repair_json_output(content)
expected = json.dumps({"key": "value"}, ensure_ascii=False)
assert result == expected
def test_code_block_with_multiple_newlines(self):
"""Test code block with multiple newlines after opening fence"""
content = '```json\n\n\n{"key": "value"}\n```'
result = repair_json_output(content)
expected = json.dumps({"key": "value"}, ensure_ascii=False)
assert result == expected
def test_code_block_with_spaces_before_closing(self):
"""Test code block with spaces before closing fence"""
content = '```json\n{"key": "value"}\n ```'
result = repair_json_output(content)
expected = json.dumps({"key": "value"}, ensure_ascii=False)
assert result == expected
def test_json_with_newlines_in_values(self):
"""Test JSON with newlines in string values"""
content = '{"text": "line1\\nline2\\nline3"}'
result = repair_json_output(content)
expected = json.dumps({"text": "line1\nline2\nline3"}, ensure_ascii=False)
assert result == expected
def test_json_with_special_unicode(self):
"""Test JSON with special unicode characters"""
content = '{"emoji": "🔥💯", "chinese": "中文测试", "math": "∑∫"}'
result = repair_json_output(content)
expected = json.dumps({"emoji": "🔥💯", "chinese": "中文测试", "math": "∑∫"}, ensure_ascii=False)
assert result == expected
def test_json_boolean_values(self):
"""Test JSON with boolean values"""
content = '{"active": true, "disabled": false, "nullable": null}'
result = repair_json_output(content)
expected = json.dumps({"active": True, "disabled": False, "nullable": None}, ensure_ascii=False)
assert result == expected
def test_json_numeric_values(self):
"""Test JSON with various numeric values"""
content = '{"int": 42, "float": 3.14159, "negative": -123, "scientific": 1.23e10}'
result = repair_json_output(content)
parsed = json.loads(result)
assert parsed["int"] == 42
assert parsed["float"] == 3.14159
assert parsed["negative"] == -123
def test_plain_code_block_marker(self):
"""Test plain ``` code block without language specifier"""
content = '```\n{"key": "value"}\n```'
result = repair_json_output(content)
expected = json.dumps({"key": "value"}, ensure_ascii=False)
assert result == expected
def test_multiple_json_objects_takes_first_complete(self):
"""Test that multiple JSON objects are properly extracted"""
content = '{"first": "object"} {"second": "object"}'
result = repair_json_output(content)
# json_repair will combine multiple objects into an array
expected = json.dumps([{"first": "object"}, {"second": "object"}], ensure_ascii=False)
assert result == expected
def test_chinese_json_with_code_block(self):
"""Test JSON with Chinese content wrapped in markdown code block"""
content = '''```json
{
"locale": "en-US",
"has_enough_context": true,
"thought": "测试中文内容",
"title": "地月距离小报告",
"steps": []
}
```'''
result = repair_json_output(content)
parsed = json.loads(result)
assert parsed["locale"] == "en-US"
assert parsed["title"] == "地月距离小报告"
assert parsed["thought"] == "测试中文内容"
assert isinstance(parsed["steps"], list)
def test_code_block_uppercase_json_with_leading_spaces(self):
"""Test uppercase JSON code block with leading spaces"""
content = ' ```JSON\n{"key": "value"}\n```'
result = repair_json_output(content)
expected = json.dumps({"key": "value"}, ensure_ascii=False)
assert result == expected
def test_code_block_uppercase_json_with_tabs(self):
"""Test uppercase JSON code block with tabs"""
content = '\t```JSON\n{"key": "value"}\n```'
result = repair_json_output(content)
expected = json.dumps({"key": "value"}, ensure_ascii=False)
assert result == expected
def test_code_block_mixed_case_with_multiple_newlines(self):
"""Test mixed case code block with multiple newlines"""
content = '```JsOn\n\n\n{"key": "value"}\n```'
result = repair_json_output(content)
expected = json.dumps({"key": "value"}, ensure_ascii=False)
assert result == expected
def test_code_block_uppercase_with_spaces_before_closing(self):
"""Test uppercase code block with spaces before closing fence"""
content = '```TYPESCRIPT\n{"key": "value"}\n ```'
result = repair_json_output(content)
expected = json.dumps({"key": "value"}, ensure_ascii=False)
assert result == expected
def test_code_block_case_insensitive_various_languages(self):
"""Test code blocks with various language specifiers in different cases"""
test_cases = [
('```Python\n{"key": "value"}\n```', '{"key": "value"}'),
('```PYTHON\n{"key": "value"}\n```', '{"key": "value"}'),
('```pYtHoN\n{"key": "value"}\n```', '{"key": "value"}'),
('```sql\n{"key": "value"}\n```', '{"key": "value"}'),
('```SQL\n{"key": "value"}\n```', '{"key": "value"}'),
]
for content, expected_json_str in test_cases:
result = repair_json_output(content)
# Verify it's valid JSON
parsed = json.loads(result)
assert parsed["key"] == "value"
class TestExtractJsonFromContentEdgeCases:
def test_deeply_nested_json(self):
"""Test extraction of deeply nested JSON"""
content = '{"l1": {"l2": {"l3": {"l4": {"l5": "deep"}}}}} garbage'
result = _extract_json_from_content(content)
assert result == '{"l1": {"l2": {"l3": {"l4": {"l5": "deep"}}}}}'
def test_json_array_of_arrays(self):
"""Test extraction of nested arrays"""
content = '[[1, 2], [3, 4], [5, 6]] extra'
result = _extract_json_from_content(content)
assert result == '[[1, 2], [3, 4], [5, 6]]'
def test_json_with_backslashes_in_string(self):
"""Test JSON with backslashes in string values"""
content = r'{"path": "C:\\Users\\test\\file.txt"} garbage'
result = _extract_json_from_content(content)
assert result == r'{"path": "C:\\Users\\test\\file.txt"}'
def test_json_with_forward_slashes(self):
"""Test JSON with forward slashes in string values"""
content = '{"url": "https://example.com/path/to/resource"} extra'
result = _extract_json_from_content(content)
assert result == '{"url": "https://example.com/path/to/resource"}'
def test_mixed_object_and_array(self):
"""Test JSON with mixed objects and arrays"""
content = '{"items": [{"id": 1}, {"id": 2}], "count": 2} tail'
result = _extract_json_from_content(content)
assert result == '{"items": [{"id": 1}, {"id": 2}], "count": 2}'
def test_json_with_unicode_escape_sequences(self):
"""Test JSON with unicode escape sequences"""
content = r'{"text": "\u4E2D\u6587"} junk'
result = _extract_json_from_content(content)
assert result == r'{"text": "\u4E2D\u6587"}'
def test_no_json_structure(self):
"""Test content without JSON structure"""
content = 'just plain text without brackets'
result = _extract_json_from_content(content)
assert result == content
def test_unbalanced_braces_in_middle(self):
"""Test content with unbalanced braces doesn't extract invalid JSON"""
content = '{"incomplete": {"nested": } text'
result = _extract_json_from_content(content)
# Should not mark as valid end since braces are unbalanced
assert result == content
def test_json_with_comma_separated_values(self):
"""Test JSON object with multiple comma-separated values"""
content = '{"a": 1, "b": 2, "c": 3, "d": 4, "e": 5} more text'
result = _extract_json_from_content(content)
assert result == '{"a": 1, "b": 2, "c": 3, "d": 4, "e": 5}'
class TestSanitizeToolResponseEdgeCases:
def test_json_object_with_extra_tokens(self):
"""Test sanitizing JSON object with trailing tokens"""
content = '{"status": "success", "data": {"id": 123}} trailing garbage'
result = sanitize_tool_response(content)
assert result == '{"status": "success", "data": {"id": 123}}'
def test_truncation_at_exact_boundary(self):
"""Test truncation behavior at exact max_length boundary"""
content = "x" * 50000
result = sanitize_tool_response(content, max_length=50000)
assert len(result) == 50000
assert not result.endswith("...")
def test_truncation_one_over_boundary(self):
"""Test truncation when content is one char over limit"""
content = "x" * 50001
result = sanitize_tool_response(content, max_length=50000)
assert len(result) <= 50003
assert result.endswith("...")
def test_multiple_control_characters(self):
"""Test removal of multiple types of control characters"""
content = "text\x00with\x01various\x02control\x1Fchars\x7F"
result = sanitize_tool_response(content)
# All control characters should be removed
assert "\x00" not in result
assert "\x01" not in result
assert "\x02" not in result
assert "\x1F" not in result
assert "\x7F" not in result
assert "textwithvariouscontrolchars" == result
def test_newline_and_tab_preservation(self):
"""Test that newlines and tabs are preserved (they are valid)"""
content = "line1\nline2\tindented"
result = sanitize_tool_response(content)
assert "\n" in result
assert "\t" in result
assert result == "line1\nline2\tindented"
def test_non_json_content_unchanged(self):
"""Test that non-JSON content is not modified"""
content = "This is plain text without any JSON structure"
result = sanitize_tool_response(content)
assert result == content
def test_json_array_at_start(self):
"""Test extraction of JSON array at start of content"""
content = '[1, 2, 3, 4, 5] followed by text'
result = sanitize_tool_response(content)
assert result == '[1, 2, 3, 4, 5]'
def test_empty_json_structures_preserved(self):
"""Test that empty JSON structures are preserved"""
content = '{"empty_obj": {}, "empty_arr": []} extra'
result = sanitize_tool_response(content)
assert result == '{"empty_obj": {}, "empty_arr": []}'
def test_whitespace_variations(self):
"""Test handling of various whitespace patterns"""
content = " \n\t content with spaces \t\n "
result = sanitize_tool_response(content)
assert result == "content with spaces"