fix: improve JSON repair handling for markdown code blocks (#841)

* fix: improve JSON repair handling for markdown code blocks

* unified import path

* compress_crawl_udf

* fix

* reverse
This commit is contained in:
Xun
2026-01-30 08:47:23 +08:00
committed by GitHub
parent 756421c3ac
commit 3adb4e90cb
4 changed files with 394 additions and 6 deletions

View File

@@ -332,9 +332,12 @@ def planner_node(
logger.debug(f"Current state messages: {state['messages']}")
logger.info(f"Planner response: {full_response}")
# Clean the response first to handle markdown code blocks (```json, ```ts, etc.)
cleaned_response = repair_json_output(full_response)
# Validate explicitly that response content is valid JSON before proceeding to parse it
if not full_response.strip().startswith('{') and not full_response.strip().startswith('['):
logger.warning("Planner response does not appear to be valid JSON")
if not cleaned_response.strip().startswith('{') and not cleaned_response.strip().startswith('['):
logger.warning("Planner response does not appear to be valid JSON after cleanup")
if plan_iterations > 0:
return Command(
update=preserve_state_meta_fields(state),
@@ -347,7 +350,7 @@ def planner_node(
)
try:
curr_plan = json.loads(repair_json_output(full_response))
curr_plan = json.loads(cleaned_response)
# Need to extract the plan from the full_response
curr_plan_content = extract_plan_content(curr_plan)
# load the current_plan
@@ -1428,4 +1431,4 @@ async def analyst_node(
config,
"analyst",
[], # No tools - pure reasoning
)
)

View File

@@ -8,8 +8,8 @@ from urllib.parse import urlparse
from langchain_core.tools import tool
from src.crawler.article import Article
from src.crawler import Crawler
from .decorators import log_io
logger = logging.getLogger(__name__)
@@ -43,8 +43,18 @@ def crawl_tool(
try:
crawler = Crawler()
article = crawler.crawl(url)
return json.dumps({"url": url, "crawled_content": article.to_markdown()[:1000]}, ensure_ascii=False)
article_content = compress_crawl_content(article)
return json.dumps({"url": url, "crawled_content": article_content}, ensure_ascii=False)
except BaseException as e:
error_msg = f"Failed to crawl. Error: {repr(e)}"
logger.error(error_msg)
return error_msg
def compress_crawl_content(article: Article) -> str:
    """
    User-defined compression hook for crawled article content.

    Swap in a different strategy here (summarization, keyword extraction,
    etc.) if needed; the current behavior simply keeps the first 1000
    characters of the article rendered as markdown.
    """
    markdown = article.to_markdown()
    return markdown[:1000]

View File

@@ -7,6 +7,7 @@ import re
from typing import Any
import json_repair
import re
logger = logging.getLogger(__name__)
@@ -121,6 +122,27 @@ def repair_json_output(content: str) -> str:
if not content:
return content
# Handle markdown code blocks (```json, ```ts, or ```)
# This must be checked first, as content may start with ``` instead of { or [
if "```" in content:
# Remove opening markdown code block markers (```json, ```ts, or ```), allowing
# optional leading spaces and multiple blank lines after the fence.
content = re.sub(
r'^[ \t]*```(?:json|ts)?[ \t]*\n+',
'',
content,
flags=re.IGNORECASE | re.MULTILINE,
)
# Remove closing markdown code block markers (```), allowing optional
# leading newlines and trailing spaces.
content = re.sub(
r'\n*```[ \t]*$',
'',
content,
flags=re.MULTILINE,
)
content = content.strip()
# First attempt: try to extract valid JSON if there are extra tokens
content = _extract_json_from_content(content)