fix: ensure web search is performed for research plans to fix #535 (#640)

* fix: ensure web search is performed for research plans to fix #535

          When using certain models (DeepSeek-V3, Qwen3, or local deployments), the
          agent framework failed to trigger web search tools, resulting in hallucinated
          data. This fix implements multiple safeguards:

          1. Add enforce_web_search configuration flag:
             - New config option to mandate web search in research plans
             - Defaults to False for backward compatibility

          2. Add plan validation function validate_and_fix_plan():
             - Validates that plans include at least one research step with web search
             - Enforces web search requirement when enabled
             - Adds default research step if plan has no steps

          3. Enhance coordinator_node fallback logic:
             - When model fails to call tools, fallback to planner instead of __end__
             - Ensures workflow continues even when tool calling fails
             - Logs detailed diagnostic info for debugging

          4. Update prompts for stricter requirements:
             - planner.md: Add MANDATORY web search requirement and clear warnings
             - coordinator.md: Add CRITICAL tool calling requirement
             - Emphasize consequences of missing web search (hallucinated data)

          5. Update tests to reflect new behavior:
             - test_coordinator_node_no_tool_calls: Expect planner instead of __end__
             - test_coordinator_empty_llm_response_corner_case: Same expectation

          Fixes #535 by ensuring:
          - Web search is always performed for research tasks
          - Workflow doesn't terminate on tool calling failures
          - Models with poor tool calling support can still proceed
          - No hallucinated data without real information gathering

* Update src/graph/nodes.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update src/graph/nodes.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* accept the review suggestion of getting configuration

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
Willem Jiang
2025-10-22 08:27:06 +08:00
committed by GitHub
parent 2ff7d9adf8
commit add0a701f4
5 changed files with 95 additions and 8 deletions

View File

@@ -51,6 +51,7 @@ class Configuration:
mcp_settings: dict = None # MCP settings, including dynamic loaded tools mcp_settings: dict = None # MCP settings, including dynamic loaded tools
report_style: str = ReportStyle.ACADEMIC.value # Report style report_style: str = ReportStyle.ACADEMIC.value # Report style
enable_deep_thinking: bool = False # Whether to enable deep thinking enable_deep_thinking: bool = False # Whether to enable deep thinking
enforce_web_search: bool = False # Enforce at least one web search step in every plan
@classmethod @classmethod
def from_runnable_config( def from_runnable_config(

View File

@@ -75,6 +75,54 @@ def needs_clarification(state: dict) -> bool:
) )
def validate_and_fix_plan(plan: dict, enforce_web_search: bool = False) -> dict:
"""
Validate and fix a plan to ensure it meets requirements.
Args:
plan: The plan dict to validate
enforce_web_search: If True, ensure at least one step has need_search=true
Returns:
The validated/fixed plan dict
"""
if not isinstance(plan, dict):
return plan
steps = plan.get("steps", [])
if enforce_web_search:
# Check if any step has need_search=true
has_search_step = any(step.get("need_search", False) for step in steps)
if not has_search_step and steps:
# Ensure first research step has web search enabled
for idx, step in enumerate(steps):
if step.get("step_type") == "research":
step["need_search"] = True
logger.info(f"Enforced web search on research step at index {idx}")
break
else:
# Fallback: If no research step exists, convert the first step to a research step with web search enabled.
# This ensures that at least one step will perform a web search as required.
steps[0]["step_type"] = "research"
steps[0]["need_search"] = True
logger.info("Converted first step to research with web search enforcement")
elif not has_search_step and not steps:
# Add a default research step if no steps exist
logger.warning("Plan has no steps. Adding default research step.")
plan["steps"] = [
{
"need_search": True,
"title": "Initial Research",
"description": "Gather information about the topic",
"step_type": "research",
}
]
return plan
def background_investigation_node(state: State, config: RunnableConfig): def background_investigation_node(state: State, config: RunnableConfig):
logger.info("background investigation node is running.") logger.info("background investigation node is running.")
configurable = Configuration.from_runnable_config(config) configurable = Configuration.from_runnable_config(config)
@@ -182,6 +230,11 @@ def planner_node(
return Command(goto="reporter") return Command(goto="reporter")
else: else:
return Command(goto="__end__") return Command(goto="__end__")
# Validate and fix plan to ensure web search requirements are met
if isinstance(curr_plan, dict):
curr_plan = validate_and_fix_plan(curr_plan, configurable.enforce_web_search)
if isinstance(curr_plan, dict) and curr_plan.get("has_enough_context"): if isinstance(curr_plan, dict) and curr_plan.get("has_enough_context"):
logger.info("Planner response has enough context.") logger.info("Planner response has enough context.")
new_plan = Plan.model_validate(curr_plan) new_plan = Plan.model_validate(curr_plan)
@@ -234,6 +287,9 @@ def human_feedback_node(
plan_iterations += 1 plan_iterations += 1
# parse the plan # parse the plan
new_plan = json.loads(current_plan) new_plan = json.loads(current_plan)
# Validate and fix plan to ensure web search requirements are met
configurable = Configuration.from_runnable_config(config)
new_plan = validate_and_fix_plan(new_plan, configurable.enforce_web_search)
except json.JSONDecodeError: except json.JSONDecodeError:
logger.warning("Planner response is not a valid JSON") logger.warning("Planner response is not a valid JSON")
if plan_iterations > 1: # the plan_iterations is increased before this check if plan_iterations > 1: # the plan_iterations is increased before this check
@@ -508,9 +564,16 @@ def coordinator_node(
logger.error(f"Error processing tool calls: {e}") logger.error(f"Error processing tool calls: {e}")
goto = "planner" goto = "planner"
else: else:
# No tool calls - both modes should goto __end__ # No tool calls detected - fallback to planner instead of ending
logger.warning("LLM didn't call any tools. Staying at __end__.") logger.warning(
goto = "__end__" "LLM didn't call any tools. This may indicate tool calling issues with the model. "
"Falling back to planner to ensure research proceeds."
)
# Log full response for debugging
logger.debug(f"Coordinator response content: {response.content}")
logger.debug(f"Coordinator response object: {response}")
# Fallback to planner to ensure workflow continues
goto = "planner"
# Apply background_investigation routing if enabled (unified logic) # Apply background_investigation routing if enabled (unified logic)
if goto == "planner" and state.get("enable_background_investigation"): if goto == "planner" and state.get("enable_background_investigation"):

View File

@@ -51,6 +51,15 @@ Your primary responsibilities are:
- For all other inputs (category 3 - which includes most questions): - For all other inputs (category 3 - which includes most questions):
- call `handoff_to_planner()` tool to handoff to planner for research without ANY thoughts. - call `handoff_to_planner()` tool to handoff to planner for research without ANY thoughts.
# Tool Calling Requirements
**CRITICAL**: You MUST call one of the available tools for research requests. This is mandatory:
- Do NOT respond to research questions without calling a tool
- For research questions, ALWAYS use either `handoff_to_planner()` or `handoff_after_clarification()`
- Tool calling is required to ensure the workflow proceeds correctly
- Never skip tool calling even if you think you can answer the question directly
- Responding with text alone for research requests will cause the workflow to fail
# Clarification Process (When Enabled) # Clarification Process (When Enabled)
Goal: Get 2+ dimensions before handing off to planner. Goal: Get 2+ dimensions before handing off to planner.

View File

@@ -64,6 +64,8 @@ Different types of steps have different web search requirements:
- Collecting competitor analysis - Collecting competitor analysis
- Researching current events or news - Researching current events or news
- Finding statistical data or reports - Finding statistical data or reports
- **CRITICAL**: Research plans MUST include at least one step with `need_search: true` to gather real information
- Without web search, the report will contain hallucinated/fabricated data
2. **Data Processing Steps** (`need_search: false`): 2. **Data Processing Steps** (`need_search: false`):
- API calls and data extraction - API calls and data extraction
@@ -71,6 +73,15 @@ Different types of steps have different web search requirements:
- Raw data collection from existing sources - Raw data collection from existing sources
- Mathematical calculations and analysis - Mathematical calculations and analysis
- Statistical computations and data processing - Statistical computations and data processing
- **NOTE**: Processing steps alone are insufficient - you must include research steps with web search
## Web Search Requirement
**MANDATORY**: Every research plan MUST include at least one step with `need_search: true`. This is critical because:
- Without web search, models generate hallucinated data
- Research steps must gather real information from external sources
- Pure processing steps cannot generate credible information for the final report
- At least one research step must search the web for factual data
## Exclusions ## Exclusions
@@ -143,6 +154,7 @@ When planning information gathering, consider these key aspects and ensure COMPR
- Create NO MORE THAN {{ max_step_num }} focused and comprehensive steps that cover the most essential aspects - Create NO MORE THAN {{ max_step_num }} focused and comprehensive steps that cover the most essential aspects
- Ensure each step is substantial and covers related information categories - Ensure each step is substantial and covers related information categories
- Prioritize breadth and depth within the {{ max_step_num }}-step constraint - Prioritize breadth and depth within the {{ max_step_num }}-step constraint
- **MANDATORY**: Include at least ONE research step with `need_search: true` to avoid hallucinated data
- For each step, carefully assess if web search is needed: - For each step, carefully assess if web search is needed:
- Research and external data gathering: Set `need_search: true` - Research and external data gathering: Set `need_search: true`
- Internal data processing: Set `need_search: false` - Internal data processing: Set `need_search: false`
@@ -150,6 +162,7 @@ When planning information gathering, consider these key aspects and ensure COMPR
- Prioritize depth and volume of relevant information - limited information is not acceptable. - Prioritize depth and volume of relevant information - limited information is not acceptable.
- Use the same language as the user to generate the plan. - Use the same language as the user to generate the plan.
- Do not include steps for summarizing or consolidating the gathered information. - Do not include steps for summarizing or consolidating the gathered information.
- **CRITICAL**: Verify that your plan includes at least one step with `need_search: true` before finalizing
# Output Format # Output Format

View File

@@ -568,7 +568,7 @@ def test_coordinator_node_no_tool_calls(
patch_handoff_to_planner, patch_handoff_to_planner,
patch_logger, patch_logger,
): ):
# No tool calls, should goto __end__ # No tool calls, should fallback to planner (fix for issue #535)
with ( with (
patch("src.graph.nodes.AGENT_LLM_MAP", {"coordinator": "basic"}), patch("src.graph.nodes.AGENT_LLM_MAP", {"coordinator": "basic"}),
patch("src.graph.nodes.get_llm_by_type") as mock_get_llm, patch("src.graph.nodes.get_llm_by_type") as mock_get_llm,
@@ -579,7 +579,8 @@ def test_coordinator_node_no_tool_calls(
mock_get_llm.return_value = mock_llm mock_get_llm.return_value = mock_llm
result = coordinator_node(mock_state_coordinator, MagicMock()) result = coordinator_node(mock_state_coordinator, MagicMock())
assert result.goto == "__end__" # Should fallback to planner instead of __end__ to ensure workflow continues
assert result.goto == "planner"
assert result.update["locale"] == "en-US" assert result.update["locale"] == "en-US"
assert result.update["resources"] == ["resource1", "resource2"] assert result.update["resources"] == ["resource1", "resource2"]
@@ -1535,7 +1536,7 @@ def test_coordinator_empty_llm_response_corner_case(mock_get_llm):
This tests error handling when LLM fails to return any content or tool calls This tests error handling when LLM fails to return any content or tool calls
in the initial state (clarification_rounds=0). The system should gracefully in the initial state (clarification_rounds=0). The system should gracefully
handle this by going to __end__ instead of crashing. handle this by going to planner instead of crashing (fix for issue #535).
Note: This is NOT a typical clarification workflow test, but rather tests Note: This is NOT a typical clarification workflow test, but rather tests
fault tolerance when LLM misbehaves. fault tolerance when LLM misbehaves.
@@ -1563,6 +1564,6 @@ def test_coordinator_empty_llm_response_corner_case(mock_get_llm):
# Call coordinator_node - should not crash # Call coordinator_node - should not crash
result = coordinator_node(state, config) result = coordinator_node(state, config)
# Should gracefully handle empty response by going to __end__ # Should gracefully handle empty response by going to planner to ensure workflow continues
assert result.goto == "__end__" assert result.goto == "planner"
assert result.update["locale"] == "en-US" assert result.update["locale"] == "en-US"