fix: improve JSON repair handling for markdown code blocks (#841)

* fix: improve JSON repair handling for markdown code blocks

* unified import path

* compress_crawl_udf

* fix

* reverse
This commit is contained in:
Xun
2026-01-30 08:47:23 +08:00
committed by GitHub
parent 756421c3ac
commit 3adb4e90cb
4 changed files with 394 additions and 6 deletions

View File

@@ -332,9 +332,12 @@ def planner_node(
logger.debug(f"Current state messages: {state['messages']}")
logger.info(f"Planner response: {full_response}")
# Clean the response first to handle markdown code blocks (```json, ```ts, etc.)
cleaned_response = repair_json_output(full_response)
# Validate explicitly that response content is valid JSON before proceeding to parse it
if not full_response.strip().startswith('{') and not full_response.strip().startswith('['):
logger.warning("Planner response does not appear to be valid JSON")
if not cleaned_response.strip().startswith('{') and not cleaned_response.strip().startswith('['):
logger.warning("Planner response does not appear to be valid JSON after cleanup")
if plan_iterations > 0:
return Command(
update=preserve_state_meta_fields(state),
@@ -347,7 +350,7 @@ def planner_node(
)
try:
curr_plan = json.loads(repair_json_output(full_response))
curr_plan = json.loads(cleaned_response)
# Need to extract the plan from the full_response
curr_plan_content = extract_plan_content(curr_plan)
# load the current_plan
@@ -1428,4 +1431,4 @@ async def analyst_node(
config,
"analyst",
[], # No tools - pure reasoning
)
)

View File

@@ -8,8 +8,8 @@ from urllib.parse import urlparse
from langchain_core.tools import tool
from src.crawler.article import Article
from src.crawler import Crawler
from .decorators import log_io
logger = logging.getLogger(__name__)
@@ -43,8 +43,18 @@ def crawl_tool(
try:
crawler = Crawler()
article = crawler.crawl(url)
return json.dumps({"url": url, "crawled_content": article.to_markdown()[:1000]}, ensure_ascii=False)
article_content = compress_crawl_content(article)
return json.dumps({"url": url, "crawled_content": article_content}, ensure_ascii=False)
except BaseException as e:
error_msg = f"Failed to crawl. Error: {repr(e)}"
logger.error(error_msg)
return error_msg
def compress_crawl_content(article: Article) -> str:
    """
    User-defined compression hook for crawled article content.

    Swap in a different strategy here (summarization, keyword extraction,
    etc.) if needed; the current behavior simply keeps the first 1000
    characters of the article rendered as markdown.
    """
    markdown = article.to_markdown()
    return markdown[:1000]

View File

@@ -7,6 +7,7 @@ import re
from typing import Any
import json_repair
import re
logger = logging.getLogger(__name__)
@@ -121,6 +122,27 @@ def repair_json_output(content: str) -> str:
if not content:
return content
# Handle markdown code blocks (```json, ```ts, or ```)
# This must be checked first, as content may start with ``` instead of { or [
if "```" in content:
# Remove opening markdown code block markers (```json, ```ts, or ```), allowing
# optional leading spaces and multiple blank lines after the fence.
content = re.sub(
r'^[ \t]*```(?:json|ts)?[ \t]*\n+',
'',
content,
flags=re.IGNORECASE | re.MULTILINE,
)
# Remove closing markdown code block markers (```), allowing optional
# leading newlines and trailing spaces.
content = re.sub(
r'\n*```[ \t]*$',
'',
content,
flags=re.MULTILINE,
)
content = content.strip()
# First attempt: try to extract valid JSON if there are extra tokens
content = _extract_json_from_content(content)