mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-03 14:22:13 +08:00
fix: improve JSON repair handling for markdown code blocks (#841)
* fix: improve JSON repair handling for markdown code blocks * unified import path * compress_crawl_udf * fix * reverse
This commit is contained in:
@@ -332,9 +332,12 @@ def planner_node(
|
||||
logger.debug(f"Current state messages: {state['messages']}")
|
||||
logger.info(f"Planner response: {full_response}")
|
||||
|
||||
# Clean the response first to handle markdown code blocks (```json, ```ts, etc.)
|
||||
cleaned_response = repair_json_output(full_response)
|
||||
|
||||
# Validate explicitly that response content is valid JSON before proceeding to parse it
|
||||
if not full_response.strip().startswith('{') and not full_response.strip().startswith('['):
|
||||
logger.warning("Planner response does not appear to be valid JSON")
|
||||
if not cleaned_response.strip().startswith('{') and not cleaned_response.strip().startswith('['):
|
||||
logger.warning("Planner response does not appear to be valid JSON after cleanup")
|
||||
if plan_iterations > 0:
|
||||
return Command(
|
||||
update=preserve_state_meta_fields(state),
|
||||
@@ -347,7 +350,7 @@ def planner_node(
|
||||
)
|
||||
|
||||
try:
|
||||
curr_plan = json.loads(repair_json_output(full_response))
|
||||
curr_plan = json.loads(cleaned_response)
|
||||
# Need to extract the plan from the full_response
|
||||
curr_plan_content = extract_plan_content(curr_plan)
|
||||
# load the current_plan
|
||||
@@ -1428,4 +1431,4 @@ async def analyst_node(
|
||||
config,
|
||||
"analyst",
|
||||
[], # No tools - pure reasoning
|
||||
)
|
||||
)
|
||||
|
||||
@@ -8,8 +8,8 @@ from urllib.parse import urlparse
|
||||
|
||||
from langchain_core.tools import tool
|
||||
|
||||
from src.crawler.article import Article
|
||||
from src.crawler import Crawler
|
||||
|
||||
from .decorators import log_io
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -43,8 +43,18 @@ def crawl_tool(
|
||||
try:
|
||||
crawler = Crawler()
|
||||
article = crawler.crawl(url)
|
||||
return json.dumps({"url": url, "crawled_content": article.to_markdown()[:1000]}, ensure_ascii=False)
|
||||
article_content = compress_crawl_content(article)
|
||||
return json.dumps({"url": url, "crawled_content": article_content}, ensure_ascii=False)
|
||||
except BaseException as e:
|
||||
error_msg = f"Failed to crawl. Error: {repr(e)}"
|
||||
logger.error(error_msg)
|
||||
return error_msg
|
||||
|
||||
|
||||
def compress_crawl_content(article: Article) -> str:
|
||||
"""
|
||||
Compress user-defined function for article content.
|
||||
We can customize this function to implement different compression strategies.
|
||||
Currently, it truncates the markdown content to the first 1000 characters.
|
||||
"""
|
||||
return article.to_markdown()[:1000]
|
||||
|
||||
@@ -7,6 +7,7 @@ import re
|
||||
from typing import Any
|
||||
|
||||
import json_repair
|
||||
import re
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -121,6 +122,27 @@ def repair_json_output(content: str) -> str:
|
||||
if not content:
|
||||
return content
|
||||
|
||||
# Handle markdown code blocks (```json, ```ts, or ```)
|
||||
# This must be checked first, as content may start with ``` instead of { or [
|
||||
if "```" in content:
|
||||
# Remove opening markdown code block markers (```json, ```ts, or ```), allowing
|
||||
# optional leading spaces and multiple blank lines after the fence.
|
||||
content = re.sub(
|
||||
r'^[ \t]*```(?:json|ts)?[ \t]*\n+',
|
||||
'',
|
||||
content,
|
||||
flags=re.IGNORECASE | re.MULTILINE,
|
||||
)
|
||||
# Remove closing markdown code block markers (```), allowing optional
|
||||
# leading newlines and trailing spaces.
|
||||
content = re.sub(
|
||||
r'\n*```[ \t]*$',
|
||||
'',
|
||||
content,
|
||||
flags=re.MULTILINE,
|
||||
)
|
||||
content = content.strip()
|
||||
|
||||
# First attempt: try to extract valid JSON if there are extra tokens
|
||||
content = _extract_json_from_content(content)
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ import json
|
||||
from src.utils.json_utils import (
|
||||
_extract_json_from_content,
|
||||
repair_json_output,
|
||||
sanitize_args,
|
||||
sanitize_tool_response,
|
||||
)
|
||||
|
||||
@@ -39,6 +40,49 @@ class TestRepairJsonOutput:
|
||||
expected = json.dumps({"key": "value"}, ensure_ascii=False)
|
||||
assert result == expected
|
||||
|
||||
def test_json_with_code_block_uppercase_json(self):
|
||||
"""Test JSON wrapped in ```JSON (uppercase) code block"""
|
||||
content = '```JSON\n{"key": "value"}\n```'
|
||||
result = repair_json_output(content)
|
||||
expected = json.dumps({"key": "value"}, ensure_ascii=False)
|
||||
assert result == expected
|
||||
|
||||
def test_json_with_code_block_uppercase_ts(self):
|
||||
"""Test JSON wrapped in ```TS (uppercase) code block"""
|
||||
content = '```TS\n{"key": "value"}\n```'
|
||||
result = repair_json_output(content)
|
||||
expected = json.dumps({"key": "value"}, ensure_ascii=False)
|
||||
assert result == expected
|
||||
|
||||
def test_json_with_code_block_mixed_case_json(self):
|
||||
"""Test JSON wrapped in ```Json (mixed case) code block"""
|
||||
content = '```Json\n{"key": "value"}\n```'
|
||||
result = repair_json_output(content)
|
||||
expected = json.dumps({"key": "value"}, ensure_ascii=False)
|
||||
assert result == expected
|
||||
|
||||
def test_json_with_code_block_uppercase_ts_with_prefix(self):
|
||||
"""Test JSON wrapped in ```TS code block with prefix text"""
|
||||
content = 'some prefix ```TS\n{"key": "value"}\n```'
|
||||
result = repair_json_output(content)
|
||||
expected = json.dumps({"key": "value"}, ensure_ascii=False)
|
||||
assert result == expected
|
||||
|
||||
def test_json_with_code_block_uppercase_json_with_prefix(self):
|
||||
"""Test JSON wrapped in ```JSON code block with prefix text - case sensitive fix"""
|
||||
# This tests the fix for case-insensitive guard when fence is not at start
|
||||
content = 'prefix ```JSON\n{"key": "value"}\n```'
|
||||
result = repair_json_output(content)
|
||||
expected = json.dumps({"key": "value"}, ensure_ascii=False)
|
||||
assert result == expected
|
||||
|
||||
def test_json_with_plain_code_block_uppercase(self):
|
||||
"""Test JSON wrapped in plain ``` code block (case insensitive)"""
|
||||
content = '```\n{"key": "value"}\n```'
|
||||
result = repair_json_output(content)
|
||||
expected = json.dumps({"key": "value"}, ensure_ascii=False)
|
||||
assert result == expected
|
||||
|
||||
def test_malformed_json_repair(self):
|
||||
"""Test with malformed JSON that can be repaired"""
|
||||
content = '{"key": "value", "incomplete":'
|
||||
@@ -226,3 +270,312 @@ class TestSanitizeToolResponse:
|
||||
content = '[{"id": 1}, {"id": 2}] invalid stuff'
|
||||
result = sanitize_tool_response(content)
|
||||
assert result == '[{"id": 1}, {"id": 2}]'
|
||||
|
||||
|
||||
class TestSanitizeArgs:
|
||||
def test_sanitize_special_characters(self):
|
||||
"""Test sanitization of special characters"""
|
||||
args = '{"key": "value", "array": [1, 2, 3]}'
|
||||
result = sanitize_args(args)
|
||||
assert result == '{"key": "value", "array": [1, 2, 3]}'
|
||||
|
||||
def test_sanitize_square_brackets(self):
|
||||
"""Test sanitization of square brackets"""
|
||||
args = '[1, 2, 3]'
|
||||
result = sanitize_args(args)
|
||||
assert result == '[1, 2, 3]'
|
||||
|
||||
def test_sanitize_curly_braces(self):
|
||||
"""Test sanitization of curly braces"""
|
||||
args = '{key: value}'
|
||||
result = sanitize_args(args)
|
||||
assert result == '{key: value}'
|
||||
|
||||
def test_sanitize_mixed_brackets(self):
|
||||
"""Test sanitization of mixed bracket types"""
|
||||
args = '{[test]}'
|
||||
result = sanitize_args(args)
|
||||
assert result == '{[test]}'
|
||||
|
||||
def test_sanitize_non_string_input(self):
|
||||
"""Test sanitization of non-string input returns empty string"""
|
||||
assert sanitize_args(None) == ""
|
||||
assert sanitize_args(123) == ""
|
||||
assert sanitize_args([1, 2, 3]) == ""
|
||||
assert sanitize_args({"key": "value"}) == ""
|
||||
|
||||
def test_sanitize_empty_string(self):
|
||||
"""Test sanitization of empty string"""
|
||||
result = sanitize_args("")
|
||||
assert result == ""
|
||||
|
||||
def test_sanitize_plain_text(self):
|
||||
"""Test sanitization of plain text without special characters"""
|
||||
args = "plain text without brackets or braces"
|
||||
result = sanitize_args(args)
|
||||
assert result == "plain text without brackets or braces"
|
||||
|
||||
def test_sanitize_nested_structures(self):
|
||||
"""Test sanitization of deeply nested structures"""
|
||||
args = '{"outer": {"inner": [1, [2, 3]]}}'
|
||||
result = sanitize_args(args)
|
||||
assert result == '{"outer": {"inner": [1, [2, 3]]}}'
|
||||
|
||||
|
||||
class TestRepairJsonOutputEdgeCases:
|
||||
def test_code_block_with_leading_spaces(self):
|
||||
"""Test code block with leading spaces"""
|
||||
content = ' ```json\n{"key": "value"}\n```'
|
||||
result = repair_json_output(content)
|
||||
expected = json.dumps({"key": "value"}, ensure_ascii=False)
|
||||
assert result == expected
|
||||
|
||||
def test_code_block_with_tabs(self):
|
||||
"""Test code block with tabs"""
|
||||
content = '\t```json\n{"key": "value"}\n```'
|
||||
result = repair_json_output(content)
|
||||
expected = json.dumps({"key": "value"}, ensure_ascii=False)
|
||||
assert result == expected
|
||||
|
||||
def test_code_block_with_multiple_newlines(self):
|
||||
"""Test code block with multiple newlines after opening fence"""
|
||||
content = '```json\n\n\n{"key": "value"}\n```'
|
||||
result = repair_json_output(content)
|
||||
expected = json.dumps({"key": "value"}, ensure_ascii=False)
|
||||
assert result == expected
|
||||
|
||||
def test_code_block_with_spaces_before_closing(self):
|
||||
"""Test code block with spaces before closing fence"""
|
||||
content = '```json\n{"key": "value"}\n ```'
|
||||
result = repair_json_output(content)
|
||||
expected = json.dumps({"key": "value"}, ensure_ascii=False)
|
||||
assert result == expected
|
||||
|
||||
def test_json_with_newlines_in_values(self):
|
||||
"""Test JSON with newlines in string values"""
|
||||
content = '{"text": "line1\\nline2\\nline3"}'
|
||||
result = repair_json_output(content)
|
||||
expected = json.dumps({"text": "line1\nline2\nline3"}, ensure_ascii=False)
|
||||
assert result == expected
|
||||
|
||||
def test_json_with_special_unicode(self):
|
||||
"""Test JSON with special unicode characters"""
|
||||
content = '{"emoji": "🔥💯", "chinese": "中文测试", "math": "∑∫"}'
|
||||
result = repair_json_output(content)
|
||||
expected = json.dumps({"emoji": "🔥💯", "chinese": "中文测试", "math": "∑∫"}, ensure_ascii=False)
|
||||
assert result == expected
|
||||
|
||||
def test_json_boolean_values(self):
|
||||
"""Test JSON with boolean values"""
|
||||
content = '{"active": true, "disabled": false, "nullable": null}'
|
||||
result = repair_json_output(content)
|
||||
expected = json.dumps({"active": True, "disabled": False, "nullable": None}, ensure_ascii=False)
|
||||
assert result == expected
|
||||
|
||||
def test_json_numeric_values(self):
|
||||
"""Test JSON with various numeric values"""
|
||||
content = '{"int": 42, "float": 3.14159, "negative": -123, "scientific": 1.23e10}'
|
||||
result = repair_json_output(content)
|
||||
parsed = json.loads(result)
|
||||
assert parsed["int"] == 42
|
||||
assert parsed["float"] == 3.14159
|
||||
assert parsed["negative"] == -123
|
||||
|
||||
def test_plain_code_block_marker(self):
|
||||
"""Test plain ``` code block without language specifier"""
|
||||
content = '```\n{"key": "value"}\n```'
|
||||
result = repair_json_output(content)
|
||||
expected = json.dumps({"key": "value"}, ensure_ascii=False)
|
||||
assert result == expected
|
||||
|
||||
def test_multiple_json_objects_takes_first_complete(self):
|
||||
"""Test that multiple JSON objects are properly extracted"""
|
||||
content = '{"first": "object"} {"second": "object"}'
|
||||
result = repair_json_output(content)
|
||||
# json_repair will combine multiple objects into an array
|
||||
expected = json.dumps([{"first": "object"}, {"second": "object"}], ensure_ascii=False)
|
||||
assert result == expected
|
||||
|
||||
def test_chinese_json_with_code_block(self):
|
||||
"""Test JSON with Chinese content wrapped in markdown code block"""
|
||||
content = '''```json
|
||||
{
|
||||
"locale": "en-US",
|
||||
"has_enough_context": true,
|
||||
"thought": "测试中文内容",
|
||||
"title": "地月距离小报告",
|
||||
"steps": []
|
||||
}
|
||||
```'''
|
||||
result = repair_json_output(content)
|
||||
parsed = json.loads(result)
|
||||
assert parsed["locale"] == "en-US"
|
||||
assert parsed["title"] == "地月距离小报告"
|
||||
assert parsed["thought"] == "测试中文内容"
|
||||
assert isinstance(parsed["steps"], list)
|
||||
|
||||
def test_code_block_uppercase_json_with_leading_spaces(self):
|
||||
"""Test uppercase JSON code block with leading spaces"""
|
||||
content = ' ```JSON\n{"key": "value"}\n```'
|
||||
result = repair_json_output(content)
|
||||
expected = json.dumps({"key": "value"}, ensure_ascii=False)
|
||||
assert result == expected
|
||||
|
||||
def test_code_block_uppercase_json_with_tabs(self):
|
||||
"""Test uppercase JSON code block with tabs"""
|
||||
content = '\t```JSON\n{"key": "value"}\n```'
|
||||
result = repair_json_output(content)
|
||||
expected = json.dumps({"key": "value"}, ensure_ascii=False)
|
||||
assert result == expected
|
||||
|
||||
def test_code_block_mixed_case_with_multiple_newlines(self):
|
||||
"""Test mixed case code block with multiple newlines"""
|
||||
content = '```JsOn\n\n\n{"key": "value"}\n```'
|
||||
result = repair_json_output(content)
|
||||
expected = json.dumps({"key": "value"}, ensure_ascii=False)
|
||||
assert result == expected
|
||||
|
||||
def test_code_block_uppercase_with_spaces_before_closing(self):
|
||||
"""Test uppercase code block with spaces before closing fence"""
|
||||
content = '```TYPESCRIPT\n{"key": "value"}\n ```'
|
||||
result = repair_json_output(content)
|
||||
expected = json.dumps({"key": "value"}, ensure_ascii=False)
|
||||
assert result == expected
|
||||
|
||||
def test_code_block_case_insensitive_various_languages(self):
|
||||
"""Test code blocks with various language specifiers in different cases"""
|
||||
test_cases = [
|
||||
('```Python\n{"key": "value"}\n```', '{"key": "value"}'),
|
||||
('```PYTHON\n{"key": "value"}\n```', '{"key": "value"}'),
|
||||
('```pYtHoN\n{"key": "value"}\n```', '{"key": "value"}'),
|
||||
('```sql\n{"key": "value"}\n```', '{"key": "value"}'),
|
||||
('```SQL\n{"key": "value"}\n```', '{"key": "value"}'),
|
||||
]
|
||||
for content, expected_json_str in test_cases:
|
||||
result = repair_json_output(content)
|
||||
# Verify it's valid JSON
|
||||
parsed = json.loads(result)
|
||||
assert parsed["key"] == "value"
|
||||
|
||||
|
||||
class TestExtractJsonFromContentEdgeCases:
|
||||
def test_deeply_nested_json(self):
|
||||
"""Test extraction of deeply nested JSON"""
|
||||
content = '{"l1": {"l2": {"l3": {"l4": {"l5": "deep"}}}}} garbage'
|
||||
result = _extract_json_from_content(content)
|
||||
assert result == '{"l1": {"l2": {"l3": {"l4": {"l5": "deep"}}}}}'
|
||||
|
||||
def test_json_array_of_arrays(self):
|
||||
"""Test extraction of nested arrays"""
|
||||
content = '[[1, 2], [3, 4], [5, 6]] extra'
|
||||
result = _extract_json_from_content(content)
|
||||
assert result == '[[1, 2], [3, 4], [5, 6]]'
|
||||
|
||||
def test_json_with_backslashes_in_string(self):
|
||||
"""Test JSON with backslashes in string values"""
|
||||
content = r'{"path": "C:\\Users\\test\\file.txt"} garbage'
|
||||
result = _extract_json_from_content(content)
|
||||
assert result == r'{"path": "C:\\Users\\test\\file.txt"}'
|
||||
|
||||
def test_json_with_forward_slashes(self):
|
||||
"""Test JSON with forward slashes in string values"""
|
||||
content = '{"url": "https://example.com/path/to/resource"} extra'
|
||||
result = _extract_json_from_content(content)
|
||||
assert result == '{"url": "https://example.com/path/to/resource"}'
|
||||
|
||||
def test_mixed_object_and_array(self):
|
||||
"""Test JSON with mixed objects and arrays"""
|
||||
content = '{"items": [{"id": 1}, {"id": 2}], "count": 2} tail'
|
||||
result = _extract_json_from_content(content)
|
||||
assert result == '{"items": [{"id": 1}, {"id": 2}], "count": 2}'
|
||||
|
||||
def test_json_with_unicode_escape_sequences(self):
|
||||
"""Test JSON with unicode escape sequences"""
|
||||
content = r'{"text": "\u4E2D\u6587"} junk'
|
||||
result = _extract_json_from_content(content)
|
||||
assert result == r'{"text": "\u4E2D\u6587"}'
|
||||
|
||||
def test_no_json_structure(self):
|
||||
"""Test content without JSON structure"""
|
||||
content = 'just plain text without brackets'
|
||||
result = _extract_json_from_content(content)
|
||||
assert result == content
|
||||
|
||||
def test_unbalanced_braces_in_middle(self):
|
||||
"""Test content with unbalanced braces doesn't extract invalid JSON"""
|
||||
content = '{"incomplete": {"nested": } text'
|
||||
result = _extract_json_from_content(content)
|
||||
# Should not mark as valid end since braces are unbalanced
|
||||
assert result == content
|
||||
|
||||
def test_json_with_comma_separated_values(self):
|
||||
"""Test JSON object with multiple comma-separated values"""
|
||||
content = '{"a": 1, "b": 2, "c": 3, "d": 4, "e": 5} more text'
|
||||
result = _extract_json_from_content(content)
|
||||
assert result == '{"a": 1, "b": 2, "c": 3, "d": 4, "e": 5}'
|
||||
|
||||
|
||||
class TestSanitizeToolResponseEdgeCases:
|
||||
def test_json_object_with_extra_tokens(self):
|
||||
"""Test sanitizing JSON object with trailing tokens"""
|
||||
content = '{"status": "success", "data": {"id": 123}} trailing garbage'
|
||||
result = sanitize_tool_response(content)
|
||||
assert result == '{"status": "success", "data": {"id": 123}}'
|
||||
|
||||
def test_truncation_at_exact_boundary(self):
|
||||
"""Test truncation behavior at exact max_length boundary"""
|
||||
content = "x" * 50000
|
||||
result = sanitize_tool_response(content, max_length=50000)
|
||||
assert len(result) == 50000
|
||||
assert not result.endswith("...")
|
||||
|
||||
def test_truncation_one_over_boundary(self):
|
||||
"""Test truncation when content is one char over limit"""
|
||||
content = "x" * 50001
|
||||
result = sanitize_tool_response(content, max_length=50000)
|
||||
assert len(result) <= 50003
|
||||
assert result.endswith("...")
|
||||
|
||||
def test_multiple_control_characters(self):
|
||||
"""Test removal of multiple types of control characters"""
|
||||
content = "text\x00with\x01various\x02control\x1Fchars\x7F"
|
||||
result = sanitize_tool_response(content)
|
||||
# All control characters should be removed
|
||||
assert "\x00" not in result
|
||||
assert "\x01" not in result
|
||||
assert "\x02" not in result
|
||||
assert "\x1F" not in result
|
||||
assert "\x7F" not in result
|
||||
assert "textwithvariouscontrolchars" == result
|
||||
|
||||
def test_newline_and_tab_preservation(self):
|
||||
"""Test that newlines and tabs are preserved (they are valid)"""
|
||||
content = "line1\nline2\tindented"
|
||||
result = sanitize_tool_response(content)
|
||||
assert "\n" in result
|
||||
assert "\t" in result
|
||||
assert result == "line1\nline2\tindented"
|
||||
|
||||
def test_non_json_content_unchanged(self):
|
||||
"""Test that non-JSON content is not modified"""
|
||||
content = "This is plain text without any JSON structure"
|
||||
result = sanitize_tool_response(content)
|
||||
assert result == content
|
||||
|
||||
def test_json_array_at_start(self):
|
||||
"""Test extraction of JSON array at start of content"""
|
||||
content = '[1, 2, 3, 4, 5] followed by text'
|
||||
result = sanitize_tool_response(content)
|
||||
assert result == '[1, 2, 3, 4, 5]'
|
||||
|
||||
def test_empty_json_structures_preserved(self):
|
||||
"""Test that empty JSON structures are preserved"""
|
||||
content = '{"empty_obj": {}, "empty_arr": []} extra'
|
||||
result = sanitize_tool_response(content)
|
||||
assert result == '{"empty_obj": {}, "empty_arr": []}'
|
||||
|
||||
def test_whitespace_variations(self):
|
||||
"""Test handling of various whitespace patterns"""
|
||||
content = " \n\t content with spaces \t\n "
|
||||
result = sanitize_tool_response(content)
|
||||
assert result == "content with spaces"
|
||||
|
||||
Reference in New Issue
Block a user