fix: strip <think> tags from reporter output to prevent thinking text leakage (#781) (#862)

* fix: strip <think> tags from LLM output to prevent thinking text leakage (#781)

Some models (e.g. DeepSeek-R1, QwQ via ollama) embed reasoning in
content using <think>...</think> tags instead of the separate
reasoning_content field. This causes thinking text to leak into
both streamed messages and the final report.

Fix at two layers:
- server/app.py: strip <think> tags in _create_event_stream_message
  so ALL streamed content is filtered (coordinator, planner, etc.)
- graph/nodes.py: strip <think> tags in reporter_node before storing
  final_report (which is not streamed through the event layer)

A fast-path substring check ("<think>" in content) guards the regex,
so normal content never pays for an unnecessary regex call.

* refactor: add defensive check for think tag stripping and add reporter_node tests (#781)

- Add isinstance and fast-path check in reporter_node before regex, consistent with app.py
- Add TestReporterNodeThinkTagStripping with 5 test cases covering various scenarios

* chore: re-trigger review
This commit is contained in:
大猫子
2026-02-16 09:38:17 +08:00
committed by GitHub
parent 06248fa6f1
commit 423f5c829c
4 changed files with 176 additions and 0 deletions

View File

@@ -4,6 +4,7 @@
import json import json
import logging import logging
import os import os
import re
from functools import partial from functools import partial
from typing import Annotated, Any, Literal from typing import Annotated, Any, Literal
@@ -900,6 +901,12 @@ def reporter_node(state: State, config: RunnableConfig):
logger.debug(f"Current invoke messages: {invoke_messages}") logger.debug(f"Current invoke messages: {invoke_messages}")
response = get_llm_by_type(AGENT_LLM_MAP["reporter"]).invoke(invoke_messages) response = get_llm_by_type(AGENT_LLM_MAP["reporter"]).invoke(invoke_messages)
response_content = response.content response_content = response.content
# Strip <think>...</think> tags that some models (e.g. QwQ, DeepSeek) embed
# directly in content instead of using the reasoning_content field (#781)
if isinstance(response_content, str) and "<think>" in response_content:
response_content = re.sub(
r"<think>[\s\S]*?</think>", "", response_content
).strip()
logger.info(f"reporter response: {response_content}") logger.info(f"reporter response: {response_content}")
return { return {

View File

@@ -6,6 +6,7 @@ import base64
import json import json
import logging import logging
import os import os
import re
from typing import Annotated, Any, List, Optional, cast from typing import Annotated, Any, List, Optional, cast
from uuid import uuid4 from uuid import uuid4
@@ -423,6 +424,11 @@ def _create_event_stream_message(
if not isinstance(content, str): if not isinstance(content, str):
content = json.dumps(content, ensure_ascii=False) content = json.dumps(content, ensure_ascii=False)
# Strip <think>...</think> tags that some models (e.g. DeepSeek-R1, QwQ via ollama)
# embed directly in content instead of using the reasoning_content field (#781)
if isinstance(content, str) and "<think>" in content:
content = re.sub(r"<think>[\s\S]*?</think>", "", content).strip()
event_stream_message = { event_stream_message = {
"thread_id": thread_id, "thread_id": thread_id,
"agent": agent_name, "agent": agent_name,

View File

@@ -2823,3 +2823,115 @@ async def test_execute_agent_step_no_tool_calls_still_works():
# Verify step execution result is set # Verify step execution result is set
assert state["current_plan"].steps[0].execution_res == "Based on my knowledge, here is the answer without needing to search." assert state["current_plan"].steps[0].execution_res == "Based on my knowledge, here is the answer without needing to search."
class TestReporterNodeThinkTagStripping:
    """Tests for stripping <think> tags from reporter_node output (#781).

    Some models (e.g. DeepSeek-R1, QwQ via ollama) embed reasoning in
    content using <think>...</think> tags instead of the separate
    reasoning_content field.
    """

    def _make_mock_state(self):
        """Return the minimal state dict these tests pass to reporter_node."""
        plan = MagicMock()
        plan.title = "Test Plan"
        plan.thought = "Test Thought"
        return {
            "current_plan": plan,
            "observations": [],
            "citations": [],
            "locale": "en-US",
        }

    def _run_reporter_node(self, response_content):
        """Run reporter_node with the LLM mocked to answer ``response_content``.

        Configuration, prompt templating, the LLM factory, the token limit
        lookup, the agent/LLM map and the context manager are all patched,
        so the only thing under test is how reporter_node post-processes
        the content of the LLM response.

        Args:
            response_content: Value exposed as ``.content`` on the mocked
                LLM response (a string, or any other object).

        Returns:
            Whatever reporter_node returns for the mocked response.
        """
        state = self._make_mock_state()
        mock_response = MagicMock()
        mock_response.content = response_content
        mock_configurable = MagicMock()
        with (
            patch(
                "src.graph.nodes.Configuration.from_runnable_config",
                return_value=mock_configurable,
            ),
            patch(
                "src.graph.nodes.apply_prompt_template",
                return_value=[{"role": "user", "content": "test"}],
            ),
            patch("src.graph.nodes.get_llm_by_type") as mock_get_llm,
            patch("src.graph.nodes.get_llm_token_limit_by_type", return_value=4096),
            patch("src.graph.nodes.AGENT_LLM_MAP", {"reporter": "basic"}),
            patch(
                "src.graph.nodes.ContextManager"
            ) as mock_ctx_mgr,
        ):
            mock_ctx_mgr.return_value.compress_messages.return_value = {"messages": []}
            mock_llm = MagicMock()
            mock_llm.invoke.return_value = mock_response
            mock_get_llm.return_value = mock_llm
            result = reporter_node(state, MagicMock())
        return result

    def test_strips_think_tag_at_beginning(self):
        result = self._run_reporter_node(
            "<think>\nLet me analyze...\n</think>\n\n# Report\n\nContent here."
        )
        report = result["final_report"]
        assert "<think>" not in report
        # The closing tag and the reasoning text must go too, not just the
        # opening tag; otherwise thinking text still leaks into the report.
        assert "</think>" not in report
        assert "Let me analyze" not in report
        assert "# Report" in report
        assert "Content here." in report

    def test_strips_multiple_think_blocks(self):
        result = self._run_reporter_node(
            "<think>First thought</think>\nParagraph 1.\n<think>Second thought</think>\nParagraph 2."
        )
        report = result["final_report"]
        assert "<think>" not in report
        assert "</think>" not in report
        # Each block must be removed on its own (non-greedy match): both
        # thoughts disappear while the paragraph between them survives.
        assert "First thought" not in report
        assert "Second thought" not in report
        assert "Paragraph 1." in report
        assert "Paragraph 2." in report

    def test_preserves_content_without_think_tags(self):
        result = self._run_reporter_node("Normal content without think tags.")
        assert result["final_report"] == "Normal content without think tags."

    def test_empty_content_after_stripping(self):
        result = self._run_reporter_node(
            "<think>Only thinking, no real content</think>"
        )
        # A response that is nothing but a think block leaves an empty report.
        assert result["final_report"] == ""

    def test_non_string_content_passes_through(self):
        """Verify non-string content is not broken by the stripping logic."""
        # Simulate non-string content (e.g. list from multimodal model)
        result = self._run_reporter_node(["some", "list"])
        # Non-string content should pass through unchanged
        assert result["final_report"] == ["some", "list"]

View File

@@ -16,6 +16,7 @@ from langgraph.types import Command
from src.config.report_style import ReportStyle from src.config.report_style import ReportStyle
from src.server.app import ( from src.server.app import (
_astream_workflow_generator, _astream_workflow_generator,
_create_event_stream_message,
_create_interrupt_event, _create_interrupt_event,
_make_event, _make_event,
_stream_graph_events, _stream_graph_events,
@@ -1680,3 +1681,53 @@ class TestGlobalConnectionPoolUsage:
"""Helper to create an empty async generator.""" """Helper to create an empty async generator."""
if False: if False:
yield yield
class TestCreateEventStreamMessageThinkTagStripping:
    """Tests for stripping <think> tags from streamed content (#781).

    Some models (e.g. DeepSeek-R1, QwQ via ollama) embed reasoning in
    content using <think>...</think> tags instead of the separate
    reasoning_content field.
    """

    def _make_mock_chunk(self, content):
        """Build an AIMessageChunk carrying ``content`` with a fixed id."""
        chunk = AIMessageChunk(content=content)
        chunk.id = "msg_test"
        chunk.response_metadata = {}
        return chunk

    def test_strips_think_tag_at_beginning(self):
        chunk = self._make_mock_chunk(
            "<think>\nLet me analyze...\n</think>\n\n# Report\n\nContent here."
        )
        result = _create_event_stream_message(chunk, {}, "thread-1", "reporter")
        content = result["content"]
        assert "<think>" not in content
        # The closing tag and the reasoning text must go too, not just the
        # opening tag; otherwise thinking text still leaks to the client.
        assert "</think>" not in content
        assert "Let me analyze" not in content
        assert "# Report" in content
        assert "Content here." in content

    def test_strips_multiple_think_blocks(self):
        chunk = self._make_mock_chunk(
            "<think>First thought</think>\nParagraph 1.\n<think>Second thought</think>\nParagraph 2."
        )
        result = _create_event_stream_message(chunk, {}, "thread-1", "coordinator")
        content = result["content"]
        assert "<think>" not in content
        assert "</think>" not in content
        # Each block must be removed on its own (non-greedy match): both
        # thoughts disappear while the paragraph between them survives.
        assert "First thought" not in content
        assert "Second thought" not in content
        assert "Paragraph 1." in content
        assert "Paragraph 2." in content

    def test_preserves_content_without_think_tags(self):
        chunk = self._make_mock_chunk("Normal content without think tags.")
        result = _create_event_stream_message(chunk, {}, "thread-1", "planner")
        assert result["content"] == "Normal content without think tags."

    def test_empty_content_after_stripping(self):
        chunk = self._make_mock_chunk("<think>Only thinking, no real content</think>")
        result = _create_event_stream_message(chunk, {}, "thread-1", "reporter")
        # A chunk that is nothing but a think block leaves empty content.
        assert result["content"] == ""

    def test_preserves_reasoning_content_field(self):
        chunk = self._make_mock_chunk("Actual content")
        # The dedicated reasoning_content field must be left untouched;
        # only inline <think> tags inside content are stripped.
        chunk.additional_kwargs["reasoning_content"] = "This is reasoning"
        result = _create_event_stream_message(chunk, {}, "thread-1", "planner")
        assert result["content"] == "Actual content"
        assert result["reasoning_content"] == "This is reasoning"