fix: improve JSON repair handling for markdown code blocks (#841)

* fix: improve JSON repair handling for markdown code blocks
* unified import path
* compress_crawl_udf
* fix
* reverse
2026-04-03 14:22:13 +08:00 · 2026-01-30 08:47:23 +08:00
parent 756421c3ac
commit 3adb4e90cb
4 changed files with 394 additions and 6 deletions
--- a/src/graph/nodes.py
+++ b/src/graph/nodes.py
@@ -332,9 +332,12 @@ def planner_node(
    logger.debug(f"Current state messages: {state['messages']}")
    logger.info(f"Planner response: {full_response}")

+    # Clean the response first to handle markdown code blocks (```json, ```ts, etc.)
+    cleaned_response = repair_json_output(full_response)
+
    # Validate explicitly that response content is valid JSON before proceeding to parse it
-    if not full_response.strip().startswith('{') and not full_response.strip().startswith('['):
-        logger.warning("Planner response does not appear to be valid JSON")
+    if not cleaned_response.strip().startswith('{') and not cleaned_response.strip().startswith('['):
+        logger.warning("Planner response does not appear to be valid JSON after cleanup")
        if plan_iterations > 0:
            return Command(
                update=preserve_state_meta_fields(state),
@@ -347,7 +350,7 @@ def planner_node(
            )

    try:
-        curr_plan = json.loads(repair_json_output(full_response))
+        curr_plan = json.loads(cleaned_response)
        # Need to extract the plan from the full_response
        curr_plan_content = extract_plan_content(curr_plan)
        # load the current_plan
@@ -1428,4 +1431,4 @@ async def analyst_node(
        config,
        "analyst",
        [],  # No tools - pure reasoning
-    )
+    )
--- a/src/tools/crawl.py
+++ b/src/tools/crawl.py
@@ -8,8 +8,8 @@ from urllib.parse import urlparse

 from langchain_core.tools import tool

+from src.crawler.article import Article
 from src.crawler import Crawler
-
 from .decorators import log_io

 logger = logging.getLogger(__name__)
@@ -43,8 +43,18 @@ def crawl_tool(
    try:
        crawler = Crawler()
        article = crawler.crawl(url)
-        return json.dumps({"url": url, "crawled_content": article.to_markdown()[:1000]}, ensure_ascii=False)
+        article_content = compress_crawl_content(article)
+        return json.dumps({"url": url, "crawled_content": article_content}, ensure_ascii=False)
    except BaseException as e:
        error_msg = f"Failed to crawl. Error: {repr(e)}"
        logger.error(error_msg)
        return error_msg
+
+
+def compress_crawl_content(article: Article) -> str:
+    """
+    Customizable hook that compresses the content of a crawled article.
+    Replace or extend this function to implement a different compression strategy.
+    Currently, it truncates the markdown content to the first 1000 characters.
+    """
+    return article.to_markdown()[:1000]
--- a/src/utils/json_utils.py
+++ b/src/utils/json_utils.py
@@ -7,6 +7,7 @@ import re
 from typing import Any

 import json_repair
+import re

 logger = logging.getLogger(__name__)

@@ -121,6 +122,27 @@ def repair_json_output(content: str) -> str:
    if not content:
        return content

+    # Handle markdown code blocks (```json, ```ts, or ```)
+    # This must be checked first, as content may start with ``` instead of { or [
+    if "```" in content:
+        # Remove opening markdown code block markers (```json, ```ts, or ```), allowing
+        # optional leading spaces and multiple blank lines after the fence.
+        content = re.sub(
+            r'^[ \t]*```(?:json|ts)?[ \t]*\n+',
+            '',
+            content,
+            flags=re.IGNORECASE | re.MULTILINE,
+        )
+        # Remove closing markdown code block markers (```), allowing optional
+        # leading newlines and trailing spaces.
+        content = re.sub(
+            r'\n*```[ \t]*$',
+            '',
+            content,
+            flags=re.MULTILINE,
+        )
+        content = content.strip()
+
    # First attempt: try to extract valid JSON if there are extra tokens
    content = _extract_json_from_content(content)

--- a/tests/unit/utils/test_json_utils.py
+++ b/tests/unit/utils/test_json_utils.py
@@ -6,6 +6,7 @@ import json
 from src.utils.json_utils import (
    _extract_json_from_content,
    repair_json_output,
+    sanitize_args,
    sanitize_tool_response,
 )

@@ -39,6 +40,49 @@ class TestRepairJsonOutput:
        expected = json.dumps({"key": "value"}, ensure_ascii=False)
        assert result == expected

+    def test_json_with_code_block_uppercase_json(self):
+        """Test JSON wrapped in ```JSON (uppercase) code block"""
+        content = '```JSON\n{"key": "value"}\n```'
+        result = repair_json_output(content)
+        expected = json.dumps({"key": "value"}, ensure_ascii=False)
+        assert result == expected
+
+    def test_json_with_code_block_uppercase_ts(self):
+        """Test JSON wrapped in ```TS (uppercase) code block"""
+        content = '```TS\n{"key": "value"}\n```'
+        result = repair_json_output(content)
+        expected = json.dumps({"key": "value"}, ensure_ascii=False)
+        assert result == expected
+
+    def test_json_with_code_block_mixed_case_json(self):
+        """Test JSON wrapped in ```Json (mixed case) code block"""
+        content = '```Json\n{"key": "value"}\n```'
+        result = repair_json_output(content)
+        expected = json.dumps({"key": "value"}, ensure_ascii=False)
+        assert result == expected
+
+    def test_json_with_code_block_uppercase_ts_with_prefix(self):
+        """Test JSON wrapped in ```TS code block with prefix text"""
+        content = 'some prefix ```TS\n{"key": "value"}\n```'
+        result = repair_json_output(content)
+        expected = json.dumps({"key": "value"}, ensure_ascii=False)
+        assert result == expected
+
+    def test_json_with_code_block_uppercase_json_with_prefix(self):
+        """Test JSON wrapped in ```JSON code block with prefix text - case-insensitive fix"""
+        # This tests the fix for case-insensitive guard when fence is not at start
+        content = 'prefix ```JSON\n{"key": "value"}\n```'
+        result = repair_json_output(content)
+        expected = json.dumps({"key": "value"}, ensure_ascii=False)
+        assert result == expected
+
+    def test_json_with_plain_code_block_uppercase(self):
+        """Test JSON wrapped in plain ``` code block (no language specifier)"""
+        content = '```\n{"key": "value"}\n```'
+        result = repair_json_output(content)
+        expected = json.dumps({"key": "value"}, ensure_ascii=False)
+        assert result == expected
+
    def test_malformed_json_repair(self):
        """Test with malformed JSON that can be repaired"""
        content = '{"key": "value", "incomplete":'
@@ -226,3 +270,312 @@ class TestSanitizeToolResponse:
        content = '[{"id": 1}, {"id": 2}] invalid stuff'
        result = sanitize_tool_response(content)
        assert result == '[{"id": 1}, {"id": 2}]'
+
+
+class TestSanitizeArgs:
+    def test_sanitize_special_characters(self):
+        """Test sanitization of special characters"""
+        args = '{"key": "value", "array": [1, 2, 3]}'
+        result = sanitize_args(args)
+        assert result == '&#123;"key": "value", "array": &#91;1, 2, 3&#93;&#125;'
+
+    def test_sanitize_square_brackets(self):
+        """Test sanitization of square brackets"""
+        args = '[1, 2, 3]'
+        result = sanitize_args(args)
+        assert result == '&#91;1, 2, 3&#93;'
+
+    def test_sanitize_curly_braces(self):
+        """Test sanitization of curly braces"""
+        args = '{key: value}'
+        result = sanitize_args(args)
+        assert result == '&#123;key: value&#125;'
+
+    def test_sanitize_mixed_brackets(self):
+        """Test sanitization of mixed bracket types"""
+        args = '{[test]}'
+        result = sanitize_args(args)
+        assert result == '&#123;&#91;test&#93;&#125;'
+
+    def test_sanitize_non_string_input(self):
+        """Test sanitization of non-string input returns empty string"""
+        assert sanitize_args(None) == ""
+        assert sanitize_args(123) == ""
+        assert sanitize_args([1, 2, 3]) == ""
+        assert sanitize_args({"key": "value"}) == ""
+
+    def test_sanitize_empty_string(self):
+        """Test sanitization of empty string"""
+        result = sanitize_args("")
+        assert result == ""
+
+    def test_sanitize_plain_text(self):
+        """Test sanitization of plain text without special characters"""
+        args = "plain text without brackets or braces"
+        result = sanitize_args(args)
+        assert result == "plain text without brackets or braces"
+
+    def test_sanitize_nested_structures(self):
+        """Test sanitization of deeply nested structures"""
+        args = '{"outer": {"inner": [1, [2, 3]]}}'
+        result = sanitize_args(args)
+        assert result == '&#123;"outer": &#123;"inner": &#91;1, &#91;2, 3&#93;&#93;&#125;&#125;'
+
+
+class TestRepairJsonOutputEdgeCases:
+    def test_code_block_with_leading_spaces(self):
+        """Test code block with leading spaces"""
+        content = '   ```json\n{"key": "value"}\n```'
+        result = repair_json_output(content)
+        expected = json.dumps({"key": "value"}, ensure_ascii=False)
+        assert result == expected
+
+    def test_code_block_with_tabs(self):
+        """Test code block with tabs"""
+        content = '\t```json\n{"key": "value"}\n```'
+        result = repair_json_output(content)
+        expected = json.dumps({"key": "value"}, ensure_ascii=False)
+        assert result == expected
+
+    def test_code_block_with_multiple_newlines(self):
+        """Test code block with multiple newlines after opening fence"""
+        content = '```json\n\n\n{"key": "value"}\n```'
+        result = repair_json_output(content)
+        expected = json.dumps({"key": "value"}, ensure_ascii=False)
+        assert result == expected
+
+    def test_code_block_with_spaces_before_closing(self):
+        """Test code block with spaces before closing fence"""
+        content = '```json\n{"key": "value"}\n  ```'
+        result = repair_json_output(content)
+        expected = json.dumps({"key": "value"}, ensure_ascii=False)
+        assert result == expected
+
+    def test_json_with_newlines_in_values(self):
+        """Test JSON with newlines in string values"""
+        content = '{"text": "line1\\nline2\\nline3"}'
+        result = repair_json_output(content)
+        expected = json.dumps({"text": "line1\nline2\nline3"}, ensure_ascii=False)
+        assert result == expected
+
+    def test_json_with_special_unicode(self):
+        """Test JSON with special unicode characters"""
+        content = '{"emoji": "🔥💯", "chinese": "中文测试", "math": "∑∫"}'
+        result = repair_json_output(content)
+        expected = json.dumps({"emoji": "🔥💯", "chinese": "中文测试", "math": "∑∫"}, ensure_ascii=False)
+        assert result == expected
+
+    def test_json_boolean_values(self):
+        """Test JSON with boolean values"""
+        content = '{"active": true, "disabled": false, "nullable": null}'
+        result = repair_json_output(content)
+        expected = json.dumps({"active": True, "disabled": False, "nullable": None}, ensure_ascii=False)
+        assert result == expected
+
+    def test_json_numeric_values(self):
+        """Test JSON with various numeric values"""
+        content = '{"int": 42, "float": 3.14159, "negative": -123, "scientific": 1.23e10}'
+        result = repair_json_output(content)
+        parsed = json.loads(result)
+        assert parsed["int"] == 42
+        assert parsed["float"] == 3.14159
+        assert parsed["negative"] == -123
+
+    def test_plain_code_block_marker(self):
+        """Test plain ``` code block without language specifier"""
+        content = '```\n{"key": "value"}\n```'
+        result = repair_json_output(content)
+        expected = json.dumps({"key": "value"}, ensure_ascii=False)
+        assert result == expected
+
+    def test_multiple_json_objects_takes_first_complete(self):
+        """Test that multiple top-level JSON objects are combined into a JSON array"""
+        content = '{"first": "object"} {"second": "object"}'
+        result = repair_json_output(content)
+        # json_repair will combine multiple objects into an array
+        expected = json.dumps([{"first": "object"}, {"second": "object"}], ensure_ascii=False)
+        assert result == expected
+
+    def test_chinese_json_with_code_block(self):
+        """Test JSON with Chinese content wrapped in markdown code block"""
+        content = '''```json
+{
+  "locale": "en-US",
+  "has_enough_context": true,
+  "thought": "测试中文内容",
+  "title": "地月距离小报告",
+  "steps": []
+}
+```'''
+        result = repair_json_output(content)
+        parsed = json.loads(result)
+        assert parsed["locale"] == "en-US"
+        assert parsed["title"] == "地月距离小报告"
+        assert parsed["thought"] == "测试中文内容"
+        assert isinstance(parsed["steps"], list)
+
+    def test_code_block_uppercase_json_with_leading_spaces(self):
+        """Test uppercase JSON code block with leading spaces"""
+        content = '   ```JSON\n{"key": "value"}\n```'
+        result = repair_json_output(content)
+        expected = json.dumps({"key": "value"}, ensure_ascii=False)
+        assert result == expected
+
+    def test_code_block_uppercase_json_with_tabs(self):
+        """Test uppercase JSON code block with tabs"""
+        content = '\t```JSON\n{"key": "value"}\n```'
+        result = repair_json_output(content)
+        expected = json.dumps({"key": "value"}, ensure_ascii=False)
+        assert result == expected
+
+    def test_code_block_mixed_case_with_multiple_newlines(self):
+        """Test mixed case code block with multiple newlines"""
+        content = '```JsOn\n\n\n{"key": "value"}\n```'
+        result = repair_json_output(content)
+        expected = json.dumps({"key": "value"}, ensure_ascii=False)
+        assert result == expected
+
+    def test_code_block_uppercase_with_spaces_before_closing(self):
+        """Test uppercase code block with spaces before closing fence"""
+        content = '```TYPESCRIPT\n{"key": "value"}\n  ```'
+        result = repair_json_output(content)
+        expected = json.dumps({"key": "value"}, ensure_ascii=False)
+        assert result == expected
+
+    def test_code_block_case_insensitive_various_languages(self):
+        """Test code blocks with various language specifiers in different cases"""
+        test_cases = [
+            ('```Python\n{"key": "value"}\n```', '{"key": "value"}'),
+            ('```PYTHON\n{"key": "value"}\n```', '{"key": "value"}'),
+            ('```pYtHoN\n{"key": "value"}\n```', '{"key": "value"}'),
+            ('```sql\n{"key": "value"}\n```', '{"key": "value"}'),
+            ('```SQL\n{"key": "value"}\n```', '{"key": "value"}'),
+        ]
+        for content, expected_json_str in test_cases:
+            result = repair_json_output(content)
+            # Verify it's valid JSON
+            parsed = json.loads(result)
+            assert parsed["key"] == "value"
+
+
+class TestExtractJsonFromContentEdgeCases:
+    def test_deeply_nested_json(self):
+        """Test extraction of deeply nested JSON"""
+        content = '{"l1": {"l2": {"l3": {"l4": {"l5": "deep"}}}}} garbage'
+        result = _extract_json_from_content(content)
+        assert result == '{"l1": {"l2": {"l3": {"l4": {"l5": "deep"}}}}}'
+
+    def test_json_array_of_arrays(self):
+        """Test extraction of nested arrays"""
+        content = '[[1, 2], [3, 4], [5, 6]] extra'
+        result = _extract_json_from_content(content)
+        assert result == '[[1, 2], [3, 4], [5, 6]]'
+
+    def test_json_with_backslashes_in_string(self):
+        """Test JSON with backslashes in string values"""
+        content = r'{"path": "C:\\Users\\test\\file.txt"} garbage'
+        result = _extract_json_from_content(content)
+        assert result == r'{"path": "C:\\Users\\test\\file.txt"}'
+
+    def test_json_with_forward_slashes(self):
+        """Test JSON with forward slashes in string values"""
+        content = '{"url": "https://example.com/path/to/resource"} extra'
+        result = _extract_json_from_content(content)
+        assert result == '{"url": "https://example.com/path/to/resource"}'
+
+    def test_mixed_object_and_array(self):
+        """Test JSON with mixed objects and arrays"""
+        content = '{"items": [{"id": 1}, {"id": 2}], "count": 2} tail'
+        result = _extract_json_from_content(content)
+        assert result == '{"items": [{"id": 1}, {"id": 2}], "count": 2}'
+
+    def test_json_with_unicode_escape_sequences(self):
+        """Test JSON with unicode escape sequences"""
+        content = r'{"text": "\u4E2D\u6587"} junk'
+        result = _extract_json_from_content(content)
+        assert result == r'{"text": "\u4E2D\u6587"}'
+
+    def test_no_json_structure(self):
+        """Test content without JSON structure"""
+        content = 'just plain text without brackets'
+        result = _extract_json_from_content(content)
+        assert result == content
+
+    def test_unbalanced_braces_in_middle(self):
+        """Test content with unbalanced braces doesn't extract invalid JSON"""
+        content = '{"incomplete": {"nested": } text'
+        result = _extract_json_from_content(content)
+        # Should not mark as valid end since braces are unbalanced
+        assert result == content
+
+    def test_json_with_comma_separated_values(self):
+        """Test JSON object with multiple comma-separated values"""
+        content = '{"a": 1, "b": 2, "c": 3, "d": 4, "e": 5} more text'
+        result = _extract_json_from_content(content)
+        assert result == '{"a": 1, "b": 2, "c": 3, "d": 4, "e": 5}'
+
+
+class TestSanitizeToolResponseEdgeCases:
+    def test_json_object_with_extra_tokens(self):
+        """Test sanitizing JSON object with trailing tokens"""
+        content = '{"status": "success", "data": {"id": 123}} trailing garbage'
+        result = sanitize_tool_response(content)
+        assert result == '{"status": "success", "data": {"id": 123}}'
+
+    def test_truncation_at_exact_boundary(self):
+        """Test truncation behavior at exact max_length boundary"""
+        content = "x" * 50000
+        result = sanitize_tool_response(content, max_length=50000)
+        assert len(result) == 50000
+        assert not result.endswith("...")
+
+    def test_truncation_one_over_boundary(self):
+        """Test truncation when content is one char over limit"""
+        content = "x" * 50001
+        result = sanitize_tool_response(content, max_length=50000)
+        assert len(result) <= 50003
+        assert result.endswith("...")
+
+    def test_multiple_control_characters(self):
+        """Test removal of multiple types of control characters"""
+        content = "text\x00with\x01various\x02control\x1Fchars\x7F"
+        result = sanitize_tool_response(content)
+        # All control characters should be removed
+        assert "\x00" not in result
+        assert "\x01" not in result
+        assert "\x02" not in result
+        assert "\x1F" not in result
+        assert "\x7F" not in result
+        assert "textwithvariouscontrolchars" == result
+
+    def test_newline_and_tab_preservation(self):
+        """Test that newlines and tabs are preserved (they are valid)"""
+        content = "line1\nline2\tindented"
+        result = sanitize_tool_response(content)
+        assert "\n" in result
+        assert "\t" in result
+        assert result == "line1\nline2\tindented"
+
+    def test_non_json_content_unchanged(self):
+        """Test that non-JSON content is not modified"""
+        content = "This is plain text without any JSON structure"
+        result = sanitize_tool_response(content)
+        assert result == content
+
+    def test_json_array_at_start(self):
+        """Test extraction of JSON array at start of content"""
+        content = '[1, 2, 3, 4, 5] followed by text'
+        result = sanitize_tool_response(content)
+        assert result == '[1, 2, 3, 4, 5]'
+
+    def test_empty_json_structures_preserved(self):
+        """Test that empty JSON structures are preserved"""
+        content = '{"empty_obj": {}, "empty_arr": []} extra'
+        result = sanitize_tool_response(content)
+        assert result == '{"empty_obj": {}, "empty_arr": []}'
+
+    def test_whitespace_variations(self):
+        """Test handling of various whitespace patterns"""
+        content = "  \n\t  content with spaces  \t\n  "
+        result = sanitize_tool_response(content)
+        assert result == "content with spaces"