From e3e7a83f40ac852e6b5befc93466d2d4b0cf3821 Mon Sep 17 00:00:00 2001
From: Willem Jiang <willem.jiang@gmail.com>
Date: Mon, 2 Feb 2026 20:31:58 +0800
Subject: [PATCH] fix(node): deal with the plan_data content with multimodal
 message (#846)

* fix(node): deal with the plan_data content with multimodal message

* Update the code with review comments
---
 src/graph/nodes.py              |  22 ++++-
 tests/integration/test_nodes.py | 156 ++++++++++++++++++++++++++++++++
 2 files changed, 177 insertions(+), 1 deletion(-)

diff --git a/src/graph/nodes.py b/src/graph/nodes.py
index 54f41d8..f0f8ae4 100644
--- a/src/graph/nodes.py
+++ b/src/graph/nodes.py
@@ -421,6 +421,26 @@ def extract_plan_content(plan_data: str | dict | Any) -> str:
             if isinstance(plan_data["content"], dict):
                 logger.debug("Converting content field dict to JSON string")
                 return json.dumps(plan_data["content"], ensure_ascii=False)
+            if isinstance(plan_data["content"], list):
+                # Handle multimodal message format where content is a list
+                # Extract text content from the list structure
+                logger.debug(f"Extracting plan content from multimodal list format with {len(plan_data['content'])} elements")
+                for item in plan_data["content"]:
+                    if isinstance(item, str) and item.strip():
+                        # Return the first valid text content found
+                        # We only take the first one because plan content should be a single JSON object
+                        # Joining multiple text parts with newlines would produce invalid JSON
+                        return item
+                    elif isinstance(item, dict):
+                        # Handle content block format like {"type": "text", "text": "..."}
+                        if item.get("type") == "text" and "text" in item:
+                            return item["text"]
+                        elif "content" in item and isinstance(item["content"], str):
+                            return item["content"]
+                # No valid text content found - raise ValueError to trigger error handling
+                # Do NOT use json.dumps() here as it would produce a JSON array that causes
+                # Plan.model_validate() to fail with ValidationError (issue #845)
+                raise ValueError(f"No valid text content found in multimodal list: {plan_data['content']}")
             else:
                 logger.warning(f"Unexpected type for 'content' field in plan_data dict: {type(plan_data['content']).__name__}, converting to string")
                 return str(plan_data["content"])
@@ -494,7 +514,7 @@ def human_feedback_node(
         # Validate and fix plan to ensure web search requirements are met
         configurable = Configuration.from_runnable_config(config)
         new_plan = validate_and_fix_plan(new_plan, configurable.enforce_web_search, configurable.enable_web_search)
-    except (json.JSONDecodeError, AttributeError) as e:
+    except (json.JSONDecodeError, AttributeError, ValueError) as e:
         logger.warning(f"Failed to parse plan: {str(e)}. Plan data type: {type(current_plan).__name__}")
         if isinstance(current_plan, dict) and "content" in original_plan:
             logger.warning(f"Plan appears to be an AIMessage object with content field")
diff --git a/tests/integration/test_nodes.py b/tests/integration/test_nodes.py
index 5ff23f5..8445bfa 100644
--- a/tests/integration/test_nodes.py
+++ b/tests/integration/test_nodes.py
@@ -14,6 +14,7 @@ from src.graph.nodes import (
     researcher_node,
     extract_plan_content,
 )
+from src.prompts.planner_model import Plan
 
 
 class TestExtractPlanContent:
@@ -188,6 +189,161 @@ class TestExtractPlanContent:
         assert len(parsed_result["steps"]) == 1
         assert parsed_result["steps"][0]["title"] == "收集埃菲尔铁塔和世界最高建筑的高度数据"
 
+    def test_extract_plan_content_with_multimodal_list_issue_845(self):
+        """Test that extract_plan_content handles multimodal message format (list type) from issue #845."""
+        # This is the structure that causes ValidationError in issue #845
+        # When content is a list like ['', ['XXXXXXXX']] from multimodal LLM models
+        plan_json = '{"locale": "en-US", "has_enough_context": false, "title": "Test Plan", "steps": []}'
+        content_dict_simple_list = {"content": [plan_json]}
+        
+        result = extract_plan_content(content_dict_simple_list)
+        # Should extract the text content from the list
+        assert result == plan_json
+        # Verify it can be parsed as JSON
+        parsed_result = json.loads(result)
+        assert parsed_result["locale"] == "en-US"
+
+    def test_extract_plan_content_with_multimodal_list_mixed_content(self):
+        """Test multimodal list with mixed content (text and references)."""
+        plan_json = '{"locale": "zh-CN", "title": "测试计划", "steps": []}'
+        # Simulate multimodal format: ['text_content', ['reference1', 'reference2']]
+        content_dict_mixed = {"content": [plan_json, ["ref1", "ref2"]]}
+        
+        result = extract_plan_content(content_dict_mixed)
+        # Should extract only the text content, ignoring nested lists
+        assert result == plan_json
+        parsed_result = json.loads(result)
+        assert parsed_result["title"] == "测试计划"
+
+    def test_extract_plan_content_with_multimodal_content_blocks(self):
+        """Test multimodal list with content block format."""
+        plan_json = '{"locale": "en-US", "title": "Block Test", "steps": []}'
+        # Simulate content block format: [{"type": "text", "text": "..."}]
+        content_dict_blocks = {"content": [{"type": "text", "text": plan_json}]}
+        
+        result = extract_plan_content(content_dict_blocks)
+        assert result == plan_json
+        parsed_result = json.loads(result)
+        assert parsed_result["title"] == "Block Test"
+
+    def test_extract_plan_content_with_generic_content_dict_format(self):
+        """Test multimodal list with generic {"content": "..."} dict format.
+        
+        Some LLM providers may use a simpler content block format where the dict
+        has a "content" field directly instead of {"type": "text", "text": "..."}.
+        This test ensures that format is also handled correctly.
+        """
+        plan_json = '{"locale": "en-US", "title": "Generic Content Test", "has_enough_context": true, "steps": []}'
+        # Simulate generic content dict format: [{"content": "..."}]
+        content_dict = {"content": [{"content": plan_json}]}
+        
+        result = extract_plan_content(content_dict)
+        assert result == plan_json
+        parsed_result = json.loads(result)
+        assert parsed_result["title"] == "Generic Content Test"
+
+    def test_extract_plan_content_with_empty_multimodal_list(self):
+        """Test multimodal list with empty or whitespace-only content raises ValueError."""
+        # Simulate the case from issue #845: ['', ['XXXXXXXX']]
+        content_dict_empty = {"content": ["", ["XXXXXXXX"]]}
+        
+        # Should raise ValueError since no valid text content found
+        # This prevents the original bug where json.dumps would create a JSON array
+        # that causes Plan.model_validate() to fail
+        with pytest.raises(ValueError) as exc_info:
+            extract_plan_content(content_dict_empty)
+        assert "No valid text content found in multimodal list" in str(exc_info.value)
+
+    def test_extract_plan_content_multimodal_uses_first_text_only(self):
+        """Test that only the first valid text element is used from multimodal list.
+        
+        When multiple text parts are present, joining them with newlines would produce
+        invalid JSON. Therefore, we only use the first valid text element.
+        """
+        first_json = '{"locale": "en-US", "title": "First Plan", "has_enough_context": true, "steps": []}'
+        second_json = '{"locale": "zh-CN", "title": "Second Plan", "has_enough_context": false, "steps": []}'
+        
+        # Multiple JSON strings in the list - only the first should be used
+        content_dict = {"content": [first_json, second_json]}
+        result = extract_plan_content(content_dict)
+        
+        # Should return only the first JSON, not joined with newlines
+        assert result == first_json
+        assert "\n" not in result  # Ensure no newline joining occurred
+        
+        # Verify the result is valid JSON
+        parsed_result = json.loads(result)
+        assert parsed_result["title"] == "First Plan"
+        assert parsed_result["locale"] == "en-US"
+
+    def test_extract_plan_content_multimodal_full_flow_issue_845(self):
+        """Test complete flow: multimodal content -> extract -> parse -> Plan.model_validate().
+        
+        This is a comprehensive end-to-end test for issue #845 that validates:
+        1. The extracted result can be successfully parsed as JSON
+        2. The parsed result is a dict (not a list)
+        3. The parsed dict can be validated by Plan.model_validate() without raising ValidationError
+        
+        Note: This test uses the real Plan.model_validate to verify the fix, bypassing the
+        autouse fixture that patches Plan.model_validate globally for other tests.
+        """
+        # Import Plan directly and get the real model_validate method
+        from src.prompts.planner_model import Plan as PlanModel
+        # Get the real model_validate method (bypass any patches)
+        real_model_validate = PlanModel.__pydantic_validator__.validate_python
+        
+        # Create a valid plan JSON that matches the Plan model schema
+        valid_plan = {
+            "locale": "en-US",
+            "has_enough_context": True,
+            "thought": "Test thought",
+            "title": "Test Plan Title",
+            "steps": [
+                {
+                    "need_search": True,
+                    "title": "Step 1",
+                    "description": "Step 1 description",
+                    "step_type": "research"
+                }
+            ]
+        }
+        plan_json = json.dumps(valid_plan, ensure_ascii=False)
+        
+        # Test case 1: Multimodal list with valid text content
+        content_dict = {"content": [plan_json]}
+        result = extract_plan_content(content_dict)
+        
+        # Verify result can be parsed as JSON
+        parsed_result = json.loads(result)
+        
+        # Verify parsed result is a dict, not a list - this is the KEY assertion for issue #845
+        # The original bug caused parsed_result to be a list, which fails Plan.model_validate()
+        assert isinstance(parsed_result, dict), f"Expected dict but got {type(parsed_result).__name__}"
+        
+        # Verify it can be validated by the real Plan.model_validate() without raising ValidationError
+        # This is the key assertion - if parsed_result was a list (the original bug),
+        # this would raise: ValidationError: 1 validation error for PlanInput should be a valid dictionary
+        validated_plan = real_model_validate(parsed_result)
+        assert validated_plan.title == "Test Plan Title"
+        assert validated_plan.locale == "en-US"
+        assert len(validated_plan.steps) == 1
+        
+        # Test case 2: Multimodal list with content block format
+        content_dict_blocks = {"content": [{"type": "text", "text": plan_json}]}
+        result_blocks = extract_plan_content(content_dict_blocks)
+        parsed_blocks = json.loads(result_blocks)
+        assert isinstance(parsed_blocks, dict), f"Expected dict but got {type(parsed_blocks).__name__}"
+        validated_blocks = real_model_validate(parsed_blocks)
+        assert validated_blocks.title == "Test Plan Title"
+        
+        # Test case 3: Mixed content - should extract only valid text
+        content_dict_mixed = {"content": [plan_json, ["reference1", "reference2"]]}
+        result_mixed = extract_plan_content(content_dict_mixed)
+        parsed_mixed = json.loads(result_mixed)
+        assert isinstance(parsed_mixed, dict), f"Expected dict but got {type(parsed_mixed).__name__}"
+        validated_mixed = real_model_validate(parsed_mixed)
+        assert validated_mixed.title == "Test Plan Title"
+
 
 # 在这里 mock 掉 get_llm_by_type，避免 ValueError
 with patch("src.llms.llm.get_llm_by_type", return_value=MagicMock()):