From 423f5c829c464b135cf418e525000feedb80c35b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=A4=A7=E7=8C=AB=E5=AD=90?= <1811866786@qq.com>
Date: Mon, 16 Feb 2026 09:38:17 +0800
Subject: [PATCH] fix: strip <think> tags from reporter output to prevent
 thinking text leakage (#781) (#862)

* fix: strip <think> tags from LLM output to prevent thinking text leakage (#781)

Some models (e.g. DeepSeek-R1, QwQ via ollama) embed reasoning in
content using <think>...</think> tags instead of the separate
reasoning_content field. This causes thinking text to leak into both
streamed messages and the final report.

Fix at two layers:

- server/app.py: strip <think> tags in _create_event_stream_message so
  ALL streamed content is filtered (coordinator, planner, etc.)
- graph/nodes.py: strip <think> tags in reporter_node before storing
  final_report (which is not streamed through the event layer)

The regex uses a fast-path check ("</think>" in content) to avoid
unnecessary regex calls on normal content.

* refactor: add defensive check for think tag stripping and add reporter_node tests (#781)

- Add isinstance and fast-path check in reporter_node before regex,
  consistent with app.py
- Add TestReporterNodeThinkTagStripping with 5 test cases covering
  various scenarios

* chore: re-trigger review
---
 src/graph/nodes.py              |   7 ++
 src/server/app.py               |   6 ++
 tests/integration/test_nodes.py | 112 ++++++++++++++++++++++++++++++++
 tests/unit/server/test_app.py   |  51 +++++++++++++++
 4 files changed, 176 insertions(+)

diff --git a/src/graph/nodes.py b/src/graph/nodes.py
index 51a83c5..9f207c0 100644
--- a/src/graph/nodes.py
+++ b/src/graph/nodes.py
@@ -4,6 +4,7 @@
 import json
 import logging
 import os
+import re
 from functools import partial
 from typing import Annotated, Any, Literal
 
@@ -900,6 +901,12 @@ def reporter_node(state: State, config: RunnableConfig):
     logger.debug(f"Current invoke messages: {invoke_messages}")
     response = get_llm_by_type(AGENT_LLM_MAP["reporter"]).invoke(invoke_messages)
     response_content = response.content
+    # Strip <think>...</think> tags that some models (e.g. QwQ, DeepSeek) embed
+    # directly in content instead of using the reasoning_content field (#781)
+    if isinstance(response_content, str) and "</think>" in response_content:
+        response_content = re.sub(
+            r"<think>[\s\S]*?</think>", "", response_content
+        ).strip()
     logger.info(f"reporter response: {response_content}")
 
     return {
diff --git a/src/server/app.py b/src/server/app.py
index 97f2413..94ff3c0 100644
--- a/src/server/app.py
+++ b/src/server/app.py
@@ -6,6 +6,7 @@
 import base64
 import json
 import logging
 import os
+import re
 from typing import Annotated, Any, List, Optional, cast
 from uuid import uuid4
 
@@ -423,6 +424,11 @@ def _create_event_stream_message(
     if not isinstance(content, str):
         content = json.dumps(content, ensure_ascii=False)
 
+    # Strip <think>...</think> tags that some models (e.g. DeepSeek-R1, QwQ via ollama)
+    # embed directly in content instead of using the reasoning_content field (#781)
+    if isinstance(content, str) and "</think>" in content:
+        content = re.sub(r"<think>[\s\S]*?</think>", "", content).strip()
+
     event_stream_message = {
         "thread_id": thread_id,
         "agent": agent_name,
diff --git a/tests/integration/test_nodes.py b/tests/integration/test_nodes.py
index 8445bfa..ce7b9c6 100644
--- a/tests/integration/test_nodes.py
+++ b/tests/integration/test_nodes.py
@@ -2823,3 +2823,115 @@ async def test_execute_agent_step_no_tool_calls_still_works():
 
     # Verify step execution result is set
     assert state["current_plan"].steps[0].execution_res == "Based on my knowledge, here is the answer without needing to search."
+
+
+class TestReporterNodeThinkTagStripping:
+    """Tests for stripping <think> tags from reporter_node output (#781).
+
+    Some models (e.g. DeepSeek-R1, QwQ via ollama) embed reasoning in
+    content using <think>...</think> tags instead of the separate
+    reasoning_content field.
+    """
+
+    def _make_mock_state(self):
+        plan = MagicMock()
+        plan.title = "Test Plan"
+        plan.thought = "Test Thought"
+        return {
+            "current_plan": plan,
+            "observations": [],
+            "citations": [],
+            "locale": "en-US",
+        }
+
+    def _run_reporter_node(self, response_content):
+        state = self._make_mock_state()
+        mock_response = MagicMock()
+        mock_response.content = response_content
+
+        mock_configurable = MagicMock()
+
+        with (
+            patch(
+                "src.graph.nodes.Configuration.from_runnable_config",
+                return_value=mock_configurable,
+            ),
+            patch(
+                "src.graph.nodes.apply_prompt_template",
+                return_value=[{"role": "user", "content": "test"}],
+            ),
+            patch("src.graph.nodes.get_llm_by_type") as mock_get_llm,
+            patch("src.graph.nodes.get_llm_token_limit_by_type", return_value=4096),
+            patch("src.graph.nodes.AGENT_LLM_MAP", {"reporter": "basic"}),
+            patch(
+                "src.graph.nodes.ContextManager"
+            ) as mock_ctx_mgr,
+        ):
+            mock_ctx_mgr.return_value.compress_messages.return_value = {"messages": []}
+            mock_llm = MagicMock()
+            mock_llm.invoke.return_value = mock_response
+            mock_get_llm.return_value = mock_llm
+
+            result = reporter_node(state, MagicMock())
+        return result
+
+    def test_strips_think_tag_at_beginning(self):
+        result = self._run_reporter_node(
+            "<think>\nLet me analyze...\n</think>\n\n# Report\n\nContent here."
+        )
+        assert "<think>" not in result["final_report"]
+        assert "# Report" in result["final_report"]
+        assert "Content here." in result["final_report"]
+
+    def test_strips_multiple_think_blocks(self):
+        result = self._run_reporter_node(
+            "<think>First thought</think>\nParagraph 1.\n<think>Second thought</think>\nParagraph 2."
+        )
+        assert "<think>" not in result["final_report"]
+        assert "Paragraph 1." in result["final_report"]
+        assert "Paragraph 2." in result["final_report"]
+
+    def test_preserves_content_without_think_tags(self):
+        result = self._run_reporter_node("Normal content without think tags.")
+        assert result["final_report"] == "Normal content without think tags."
+
+    def test_empty_content_after_stripping(self):
+        result = self._run_reporter_node(
+            "<think>Only thinking, no real content</think>"
+        )
+        assert "<think>" not in result["final_report"]
+
+    def test_non_string_content_passes_through(self):
+        """Verify non-string content is not broken by the stripping logic."""
+        state = self._make_mock_state()
+        mock_response = MagicMock()
+        # Simulate non-string content (e.g. list from multimodal model)
+        mock_response.content = ["some", "list"]
+
+        mock_configurable = MagicMock()
+
+        with (
+            patch(
+                "src.graph.nodes.Configuration.from_runnable_config",
+                return_value=mock_configurable,
+            ),
+            patch(
+                "src.graph.nodes.apply_prompt_template",
+                return_value=[{"role": "user", "content": "test"}],
+            ),
+            patch("src.graph.nodes.get_llm_by_type") as mock_get_llm,
+            patch("src.graph.nodes.get_llm_token_limit_by_type", return_value=4096),
+            patch("src.graph.nodes.AGENT_LLM_MAP", {"reporter": "basic"}),
+            patch(
+                "src.graph.nodes.ContextManager"
+            ) as mock_ctx_mgr,
+        ):
+            mock_ctx_mgr.return_value.compress_messages.return_value = {"messages": []}
+            mock_llm = MagicMock()
+            mock_llm.invoke.return_value = mock_response
+            mock_get_llm.return_value = mock_llm
+
+            result = reporter_node(state, MagicMock())
+
+        # Non-string content should pass through unchanged
+        assert result["final_report"] == ["some", "list"]
diff --git a/tests/unit/server/test_app.py b/tests/unit/server/test_app.py
index 5ed71f7..d57d018 100644
--- a/tests/unit/server/test_app.py
+++ b/tests/unit/server/test_app.py
@@ -16,6 +16,7 @@ from langgraph.types import Command
 from src.config.report_style import ReportStyle
 from src.server.app import (
     _astream_workflow_generator,
+    _create_event_stream_message,
     _create_interrupt_event,
     _make_event,
     _stream_graph_events,
@@ -1680,3 +1681,53 @@
         """Helper to create an empty async generator."""
         if False:
             yield
+
+
+class TestCreateEventStreamMessageThinkTagStripping:
+    """Tests for stripping <think> tags from streamed content (#781).
+
+    Some models (e.g. DeepSeek-R1, QwQ via ollama) embed reasoning in
+    content using <think>...</think> tags instead of the separate
+    reasoning_content field.
+    """
+
+    def _make_mock_chunk(self, content):
+        chunk = AIMessageChunk(content=content)
+        chunk.id = "msg_test"
+        chunk.response_metadata = {}
+        return chunk
+
+    def test_strips_think_tag_at_beginning(self):
+        chunk = self._make_mock_chunk(
+            "<think>\nLet me analyze...\n</think>\n\n# Report\n\nContent here."
+        )
+        result = _create_event_stream_message(chunk, {}, "thread-1", "reporter")
+        assert "<think>" not in result["content"]
+        assert "# Report" in result["content"]
+        assert "Content here." in result["content"]
+
+    def test_strips_multiple_think_blocks(self):
+        chunk = self._make_mock_chunk(
+            "<think>First thought</think>\nParagraph 1.\n<think>Second thought</think>\nParagraph 2."
+        )
+        result = _create_event_stream_message(chunk, {}, "thread-1", "coordinator")
+        assert "<think>" not in result["content"]
+        assert "Paragraph 1." in result["content"]
+        assert "Paragraph 2." in result["content"]
+
+    def test_preserves_content_without_think_tags(self):
+        chunk = self._make_mock_chunk("Normal content without think tags.")
+        result = _create_event_stream_message(chunk, {}, "thread-1", "planner")
+        assert result["content"] == "Normal content without think tags."
+
+    def test_empty_content_after_stripping(self):
+        chunk = self._make_mock_chunk("<think>Only thinking, no real content</think>")
+        result = _create_event_stream_message(chunk, {}, "thread-1", "reporter")
+        assert "<think>" not in result["content"]
+
+    def test_preserves_reasoning_content_field(self):
+        chunk = self._make_mock_chunk("Actual content")
+        chunk.additional_kwargs["reasoning_content"] = "This is reasoning"
+        result = _create_event_stream_message(chunk, {}, "thread-1", "planner")
+        assert result["content"] == "Actual content"
+        assert result["reasoning_content"] == "This is reasoning"