From 3adb4e90cbf14e8dd0b34ab72fcd02e3b550635f Mon Sep 17 00:00:00 2001 From: Xun Date: Fri, 30 Jan 2026 08:47:23 +0800 Subject: [PATCH] fix: improve JSON repair handling for markdown code blocks (#841) * fix: improve JSON repair handling for markdown code blocks * unified import path * compress_crawl_udf * fix * reverse --- src/graph/nodes.py | 11 +- src/tools/crawl.py | 14 +- src/utils/json_utils.py | 22 ++ tests/unit/utils/test_json_utils.py | 353 ++++++++++++++++++++++++++++ 4 files changed, 394 insertions(+), 6 deletions(-) diff --git a/src/graph/nodes.py b/src/graph/nodes.py index f48a847..54f41d8 100644 --- a/src/graph/nodes.py +++ b/src/graph/nodes.py @@ -332,9 +332,12 @@ def planner_node( logger.debug(f"Current state messages: {state['messages']}") logger.info(f"Planner response: {full_response}") + # Clean the response first to handle markdown code blocks (```json, ```ts, etc.) + cleaned_response = repair_json_output(full_response) + # Validate explicitly that response content is valid JSON before proceeding to parse it - if not full_response.strip().startswith('{') and not full_response.strip().startswith('['): - logger.warning("Planner response does not appear to be valid JSON") + if not cleaned_response.strip().startswith('{') and not cleaned_response.strip().startswith('['): + logger.warning("Planner response does not appear to be valid JSON after cleanup") if plan_iterations > 0: return Command( update=preserve_state_meta_fields(state), @@ -347,7 +350,7 @@ def planner_node( ) try: - curr_plan = json.loads(repair_json_output(full_response)) + curr_plan = json.loads(cleaned_response) # Need to extract the plan from the full_response curr_plan_content = extract_plan_content(curr_plan) # load the current_plan @@ -1428,4 +1431,4 @@ async def analyst_node( config, "analyst", [], # No tools - pure reasoning - ) \ No newline at end of file + ) diff --git a/src/tools/crawl.py b/src/tools/crawl.py index b85f2ac..90eba03 100644 --- a/src/tools/crawl.py +++ b/src/tools/crawl.py @@ -8,8 +8,8 @@ from urllib.parse import urlparse from langchain_core.tools import tool +from src.crawler.article import Article from src.crawler import Crawler - from .decorators import log_io logger = logging.getLogger(__name__) @@ -43,8 +43,18 @@ def crawl_tool( try: crawler = Crawler() article = crawler.crawl(url) - return json.dumps({"url": url, "crawled_content": article.to_markdown()[:1000]}, ensure_ascii=False) + article_content = compress_crawl_content(article) + return json.dumps({"url": url, "crawled_content": article_content}, ensure_ascii=False) except BaseException as e: error_msg = f"Failed to crawl. Error: {repr(e)}" logger.error(error_msg) return error_msg + + +def compress_crawl_content(article: Article) -> str: + """ + Compress user-defined function for article content. + We can customize this function to implement different compression strategies. + Currently, it truncates the markdown content to the first 1000 characters. + """ + return article.to_markdown()[:1000] diff --git a/src/utils/json_utils.py b/src/utils/json_utils.py index 72394e7..211b87a 100644 --- a/src/utils/json_utils.py +++ b/src/utils/json_utils.py @@ -7,6 +7,7 @@ import re from typing import Any import json_repair +import re logger = logging.getLogger(__name__) @@ -121,6 +122,27 @@ def repair_json_output(content: str) -> str: if not content: return content + # Handle markdown code blocks (```json, ```ts, or ```) + # This must be checked first, as content may start with ``` instead of { or [ + if "```" in content: + # Remove opening markdown code block markers (```json, ```ts, or ```), allowing + # optional leading spaces and multiple blank lines after the fence. + content = re.sub( + r'^[ \t]*```(?:json|ts)?[ \t]*\n+', + '', + content, + flags=re.IGNORECASE | re.MULTILINE, + ) + # Remove closing markdown code block markers (```), allowing optional + # leading newlines and trailing spaces. + content = re.sub( + r'\n*```[ \t]*$', + '', + content, + flags=re.MULTILINE, + ) + content = content.strip() + # First attempt: try to extract valid JSON if there are extra tokens content = _extract_json_from_content(content) diff --git a/tests/unit/utils/test_json_utils.py b/tests/unit/utils/test_json_utils.py index e9ead1a..5803ca1 100644 --- a/tests/unit/utils/test_json_utils.py +++ b/tests/unit/utils/test_json_utils.py @@ -6,6 +6,7 @@ import json from src.utils.json_utils import ( _extract_json_from_content, repair_json_output, + sanitize_args, sanitize_tool_response, ) @@ -39,6 +40,49 @@ class TestRepairJsonOutput: expected = json.dumps({"key": "value"}, ensure_ascii=False) assert result == expected + def test_json_with_code_block_uppercase_json(self): + """Test JSON wrapped in ```JSON (uppercase) code block""" + content = '```JSON\n{"key": "value"}\n```' + result = repair_json_output(content) + expected = json.dumps({"key": "value"}, ensure_ascii=False) + assert result == expected + + def test_json_with_code_block_uppercase_ts(self): + """Test JSON wrapped in ```TS (uppercase) code block""" + content = '```TS\n{"key": "value"}\n```' + result = repair_json_output(content) + expected = json.dumps({"key": "value"}, ensure_ascii=False) + assert result == expected + + def test_json_with_code_block_mixed_case_json(self): + """Test JSON wrapped in ```Json (mixed case) code block""" + content = '```Json\n{"key": "value"}\n```' + result = repair_json_output(content) + expected = json.dumps({"key": "value"}, ensure_ascii=False) + assert result == expected + + def test_json_with_code_block_uppercase_ts_with_prefix(self): + """Test JSON wrapped in ```TS code block with prefix text""" + content = 'some prefix ```TS\n{"key": "value"}\n```' + result = repair_json_output(content) + expected = json.dumps({"key": "value"}, ensure_ascii=False) + assert result == expected + + def test_json_with_code_block_uppercase_json_with_prefix(self): + """Test JSON wrapped in ```JSON code block with prefix text - case sensitive fix""" + # This tests the fix for case-insensitive guard when fence is not at start + content = 'prefix ```JSON\n{"key": "value"}\n```' + result = repair_json_output(content) + expected = json.dumps({"key": "value"}, ensure_ascii=False) + assert result == expected + + def test_json_with_plain_code_block_uppercase(self): + """Test JSON wrapped in plain ``` code block (case insensitive)""" + content = '```\n{"key": "value"}\n```' + result = repair_json_output(content) + expected = json.dumps({"key": "value"}, ensure_ascii=False) + assert result == expected + def test_malformed_json_repair(self): """Test with malformed JSON that can be repaired""" content = '{"key": "value", "incomplete":' @@ -226,3 +270,312 @@ class TestSanitizeToolResponse: content = '[{"id": 1}, {"id": 2}] invalid stuff' result = sanitize_tool_response(content) assert result == '[{"id": 1}, {"id": 2}]' + + +class TestSanitizeArgs: + def test_sanitize_special_characters(self): + """Test sanitization of special characters""" + args = '{"key": "value", "array": [1, 2, 3]}' + result = sanitize_args(args) + assert result == '{"key": "value", "array": [1, 2, 3]}' + + def test_sanitize_square_brackets(self): + """Test sanitization of square brackets""" + args = '[1, 2, 3]' + result = sanitize_args(args) + assert result == '[1, 2, 3]' + + def test_sanitize_curly_braces(self): + """Test sanitization of curly braces""" + args = '{key: value}' + result = sanitize_args(args) + assert result == '{key: value}' + + def test_sanitize_mixed_brackets(self): + """Test sanitization of mixed bracket types""" + args = '{[test]}' + result = sanitize_args(args) + assert result == '{[test]}' + + def test_sanitize_non_string_input(self): + """Test sanitization of non-string input returns empty string""" + assert sanitize_args(None) == "" + assert sanitize_args(123) == "" + assert sanitize_args([1, 2, 3]) == "" + assert sanitize_args({"key": "value"}) == "" + + def test_sanitize_empty_string(self): + """Test sanitization of empty string""" + result = sanitize_args("") + assert result == "" + + def test_sanitize_plain_text(self): + """Test sanitization of plain text without special characters""" + args = "plain text without brackets or braces" + result = sanitize_args(args) + assert result == "plain text without brackets or braces" + + def test_sanitize_nested_structures(self): + """Test sanitization of deeply nested structures""" + args = '{"outer": {"inner": [1, [2, 3]]}}' + result = sanitize_args(args) + assert result == '{"outer": {"inner": [1, [2, 3]]}}' + + +class TestRepairJsonOutputEdgeCases: + def test_code_block_with_leading_spaces(self): + """Test code block with leading spaces""" + content = ' ```json\n{"key": "value"}\n```' + result = repair_json_output(content) + expected = json.dumps({"key": "value"}, ensure_ascii=False) + assert result == expected + + def test_code_block_with_tabs(self): + """Test code block with tabs""" + content = '\t```json\n{"key": "value"}\n```' + result = repair_json_output(content) + expected = json.dumps({"key": "value"}, ensure_ascii=False) + assert result == expected + + def test_code_block_with_multiple_newlines(self): + """Test code block with multiple newlines after opening fence""" + content = '```json\n\n\n{"key": "value"}\n```' + result = repair_json_output(content) + expected = json.dumps({"key": "value"}, ensure_ascii=False) + assert result == expected + + def test_code_block_with_spaces_before_closing(self): + """Test code block with spaces before closing fence""" + content = '```json\n{"key": "value"}\n ```' + result = repair_json_output(content) + expected = json.dumps({"key": "value"}, ensure_ascii=False) + assert result == expected + + def test_json_with_newlines_in_values(self): + """Test JSON with newlines in string values""" + content = '{"text": "line1\\nline2\\nline3"}' + result = repair_json_output(content) + expected = json.dumps({"text": "line1\nline2\nline3"}, ensure_ascii=False) + assert result == expected + + def test_json_with_special_unicode(self): + """Test JSON with special unicode characters""" + content = '{"emoji": "🔥💯", "chinese": "中文测试", "math": "∑∫"}' + result = repair_json_output(content) + expected = json.dumps({"emoji": "🔥💯", "chinese": "中文测试", "math": "∑∫"}, ensure_ascii=False) + assert result == expected + + def test_json_boolean_values(self): + """Test JSON with boolean values""" + content = '{"active": true, "disabled": false, "nullable": null}' + result = repair_json_output(content) + expected = json.dumps({"active": True, "disabled": False, "nullable": None}, ensure_ascii=False) + assert result == expected + + def test_json_numeric_values(self): + """Test JSON with various numeric values""" + content = '{"int": 42, "float": 3.14159, "negative": -123, "scientific": 1.23e10}' + result = repair_json_output(content) + parsed = json.loads(result) + assert parsed["int"] == 42 + assert parsed["float"] == 3.14159 + assert parsed["negative"] == -123 + + def test_plain_code_block_marker(self): + """Test plain ``` code block without language specifier""" + content = '```\n{"key": "value"}\n```' + result = repair_json_output(content) + expected = json.dumps({"key": "value"}, ensure_ascii=False) + assert result == expected + + def test_multiple_json_objects_takes_first_complete(self): + """Test that multiple JSON objects are properly extracted""" + content = '{"first": "object"} {"second": "object"}' + result = repair_json_output(content) + # json_repair will combine multiple objects into an array + expected = json.dumps([{"first": "object"}, {"second": "object"}], ensure_ascii=False) + assert result == expected + + def test_chinese_json_with_code_block(self): + """Test JSON with Chinese content wrapped in markdown code block""" + content = '''```json +{ + "locale": "en-US", + "has_enough_context": true, + "thought": "测试中文内容", + "title": "地月距离小报告", + "steps": [] +} +```''' + result = repair_json_output(content) + parsed = json.loads(result) + assert parsed["locale"] == "en-US" + assert parsed["title"] == "地月距离小报告" + assert parsed["thought"] == "测试中文内容" + assert isinstance(parsed["steps"], list) + + def test_code_block_uppercase_json_with_leading_spaces(self): + """Test uppercase JSON code block with leading spaces""" + content = ' ```JSON\n{"key": "value"}\n```' + result = repair_json_output(content) + expected = json.dumps({"key": "value"}, ensure_ascii=False) + assert result == expected + + def test_code_block_uppercase_json_with_tabs(self): + """Test uppercase JSON code block with tabs""" + content = '\t```JSON\n{"key": "value"}\n```' + result = repair_json_output(content) + expected = json.dumps({"key": "value"}, ensure_ascii=False) + assert result == expected + + def test_code_block_mixed_case_with_multiple_newlines(self): + """Test mixed case code block with multiple newlines""" + content = '```JsOn\n\n\n{"key": "value"}\n```' + result = repair_json_output(content) + expected = json.dumps({"key": "value"}, ensure_ascii=False) + assert result == expected + + def test_code_block_uppercase_with_spaces_before_closing(self): + """Test uppercase code block with spaces before closing fence""" + content = '```TYPESCRIPT\n{"key": "value"}\n ```' + result = repair_json_output(content) + expected = json.dumps({"key": "value"}, ensure_ascii=False) + assert result == expected + + def test_code_block_case_insensitive_various_languages(self): + """Test code blocks with various language specifiers in different cases""" + test_cases = [ + ('```Python\n{"key": "value"}\n```', '{"key": "value"}'), + ('```PYTHON\n{"key": "value"}\n```', '{"key": "value"}'), + ('```pYtHoN\n{"key": "value"}\n```', '{"key": "value"}'), + ('```sql\n{"key": "value"}\n```', '{"key": "value"}'), + ('```SQL\n{"key": "value"}\n```', '{"key": "value"}'), + ] + for content, expected_json_str in test_cases: + result = repair_json_output(content) + # Verify it's valid JSON + parsed = json.loads(result) + assert parsed["key"] == "value" + + +class TestExtractJsonFromContentEdgeCases: + def test_deeply_nested_json(self): + """Test extraction of deeply nested JSON""" + content = '{"l1": {"l2": {"l3": {"l4": {"l5": "deep"}}}}} garbage' + result = _extract_json_from_content(content) + assert result == '{"l1": {"l2": {"l3": {"l4": {"l5": "deep"}}}}}' + + def test_json_array_of_arrays(self): + """Test extraction of nested arrays""" + content = '[[1, 2], [3, 4], [5, 6]] extra' + result = _extract_json_from_content(content) + assert result == '[[1, 2], [3, 4], [5, 6]]' + + def test_json_with_backslashes_in_string(self): + """Test JSON with backslashes in string values""" + content = r'{"path": "C:\\Users\\test\\file.txt"} garbage' + result = _extract_json_from_content(content) + assert result == r'{"path": "C:\\Users\\test\\file.txt"}' + + def test_json_with_forward_slashes(self): + """Test JSON with forward slashes in string values""" + content = '{"url": "https://example.com/path/to/resource"} extra' + result = _extract_json_from_content(content) + assert result == '{"url": "https://example.com/path/to/resource"}' + + def test_mixed_object_and_array(self): + """Test JSON with mixed objects and arrays""" + content = '{"items": [{"id": 1}, {"id": 2}], "count": 2} tail' + result = _extract_json_from_content(content) + assert result == '{"items": [{"id": 1}, {"id": 2}], "count": 2}' + + def test_json_with_unicode_escape_sequences(self): + """Test JSON with unicode escape sequences""" + content = r'{"text": "\u4E2D\u6587"} junk' + result = _extract_json_from_content(content) + assert result == r'{"text": "\u4E2D\u6587"}' + + def test_no_json_structure(self): + """Test content without JSON structure""" + content = 'just plain text without brackets' + result = _extract_json_from_content(content) + assert result == content + + def test_unbalanced_braces_in_middle(self): + """Test content with unbalanced braces doesn't extract invalid JSON""" + content = '{"incomplete": {"nested": } text' + result = _extract_json_from_content(content) + # Should not mark as valid end since braces are unbalanced + assert result == content + + def test_json_with_comma_separated_values(self): + """Test JSON object with multiple comma-separated values""" + content = '{"a": 1, "b": 2, "c": 3, "d": 4, "e": 5} more text' + result = _extract_json_from_content(content) + assert result == '{"a": 1, "b": 2, "c": 3, "d": 4, "e": 5}' + + +class TestSanitizeToolResponseEdgeCases: + def test_json_object_with_extra_tokens(self): + """Test sanitizing JSON object with trailing tokens""" + content = '{"status": "success", "data": {"id": 123}} trailing garbage' + result = sanitize_tool_response(content) + assert result == '{"status": "success", "data": {"id": 123}}' + + def test_truncation_at_exact_boundary(self): + """Test truncation behavior at exact max_length boundary""" + content = "x" * 50000 + result = sanitize_tool_response(content, max_length=50000) + assert len(result) == 50000 + assert not result.endswith("...") + + def test_truncation_one_over_boundary(self): + """Test truncation when content is one char over limit""" + content = "x" * 50001 + result = sanitize_tool_response(content, max_length=50000) + assert len(result) <= 50003 + assert result.endswith("...") + + def test_multiple_control_characters(self): + """Test removal of multiple types of control characters""" + content = "text\x00with\x01various\x02control\x1Fchars\x7F" + result = sanitize_tool_response(content) + # All control characters should be removed + assert "\x00" not in result + assert "\x01" not in result + assert "\x02" not in result + assert "\x1F" not in result + assert "\x7F" not in result + assert "textwithvariouscontrolchars" == result + + def test_newline_and_tab_preservation(self): + """Test that newlines and tabs are preserved (they are valid)""" + content = "line1\nline2\tindented" + result = sanitize_tool_response(content) + assert "\n" in result + assert "\t" in result + assert result == "line1\nline2\tindented" + + def test_non_json_content_unchanged(self): + """Test that non-JSON content is not modified""" + content = "This is plain text without any JSON structure" + result = sanitize_tool_response(content) + assert result == content + + def test_json_array_at_start(self): + """Test extraction of JSON array at start of content""" + content = '[1, 2, 3, 4, 5] followed by text' + result = sanitize_tool_response(content) + assert result == '[1, 2, 3, 4, 5]' + + def test_empty_json_structures_preserved(self): + """Test that empty JSON structures are preserved""" + content = '{"empty_obj": {}, "empty_arr": []} extra' + result = sanitize_tool_response(content) + assert result == '{"empty_obj": {}, "empty_arr": []}' + + def test_whitespace_variations(self): + """Test handling of various whitespace patterns""" + content = " \n\t content with spaces \t\n " + result = sanitize_tool_response(content) + assert result == "content with spaces"