From 314ea4178132e65519eb0036e8367ea7256a719f Mon Sep 17 00:00:00 2001
From: hetao <hetao7@pku.edu.cn>
Date: Thu, 29 Jan 2026 14:57:26 +0800
Subject: [PATCH] feat: optimize vision tools and image handling

- Add model-aware vision tool loading based on supports_vision flag
- Move view_image_tool from config to builtin tools for dynamic inclusion
- Add timeout to image search to prevent hanging requests
- Optimize image search results format using thumbnails
- Add image validation for reference images in generation
- Improve error handling with detailed messages

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 backend/src/agents/lead_agent/agent.py        |  2 +-
 backend/src/community/image_search/tools.py   |  8 +---
 backend/src/tools/tools.py                    | 19 +++++++--
 config.example.yaml                           |  7 +---
 .../image-generation/scripts/generate.py      | 42 +++++++++++++++++--
 5 files changed, 59 insertions(+), 19 deletions(-)

diff --git a/backend/src/agents/lead_agent/agent.py b/backend/src/agents/lead_agent/agent.py
index 836cba1..19cb441 100644
--- a/backend/src/agents/lead_agent/agent.py
+++ b/backend/src/agents/lead_agent/agent.py
@@ -230,7 +230,7 @@ def make_lead_agent(config: RunnableConfig):
     print(f"thinking_enabled: {thinking_enabled}, model_name: {model_name}, is_plan_mode: {is_plan_mode}")
     return create_agent(
         model=create_chat_model(name=model_name, thinking_enabled=thinking_enabled),
-        tools=get_available_tools(),
+        tools=get_available_tools(model_name=model_name),
         middleware=_build_middlewares(config),
         system_prompt=apply_prompt_template(),
         state_schema=ThreadState,
diff --git a/backend/src/community/image_search/tools.py b/backend/src/community/image_search/tools.py
index f493bcc..89ccf34 100644
--- a/backend/src/community/image_search/tools.py
+++ b/backend/src/community/image_search/tools.py
@@ -46,7 +46,7 @@ def _search_images(
         logger.error("ddgs library not installed. Run: pip install ddgs")
         return []
 
-    ddgs = DDGS()
+    ddgs = DDGS(timeout=30)
 
     try:
         kwargs = {
@@ -119,12 +119,8 @@ def image_search_tool(
     normalized_results = [
         {
             "title": r.get("title", ""),
-            "image_url": r.get("image", ""),
+            "image_url": r.get("thumbnail", ""),
             "thumbnail_url": r.get("thumbnail", ""),
-            "source_url": r.get("url", ""),
-            "source": r.get("source", ""),
-            "width": r.get("width"),
-            "height": r.get("height"),
         }
         for r in results
     ]
diff --git a/backend/src/tools/tools.py b/backend/src/tools/tools.py
index e4915a5..38cbf32 100644
--- a/backend/src/tools/tools.py
+++ b/backend/src/tools/tools.py
@@ -4,7 +4,7 @@ from langchain.tools import BaseTool
 
 from src.config import get_app_config
 from src.reflection import resolve_variable
-from src.tools.builtins import ask_clarification_tool, present_file_tool
+from src.tools.builtins import ask_clarification_tool, present_file_tool, view_image_tool
 
 logger = logging.getLogger(__name__)
 
@@ -14,7 +14,7 @@ BUILTIN_TOOLS = [
 ]
 
 
-def get_available_tools(groups: list[str] | None = None, include_mcp: bool = True) -> list[BaseTool]:
+def get_available_tools(groups: list[str] | None = None, include_mcp: bool = True, model_name: str | None = None) -> list[BaseTool]:
     """Get all available tools from config.
 
     Note: MCP tools should be initialized at application startup using
@@ -23,6 +23,7 @@ def get_available_tools(groups: list[str] | None = None, include_mcp: bool = Tru
     Args:
         groups: Optional list of tool groups to filter by.
         include_mcp: Whether to include tools from MCP servers (default: True).
+        model_name: Optional model name to determine if vision tools should be included.
 
     Returns:
         List of available tools.
@@ -51,4 +52,16 @@ def get_available_tools(groups: list[str] | None = None, include_mcp: bool = Tru
         except Exception as e:
             logger.error(f"Failed to get cached MCP tools: {e}")
 
-    return loaded_tools + BUILTIN_TOOLS + mcp_tools
+    # Conditionally add view_image_tool only if the model supports vision
+    builtin_tools = BUILTIN_TOOLS.copy()
+
+    # If no model_name specified, use the first model (default)
+    if model_name is None and config.models:
+        model_name = config.models[0].name
+
+    model_config = config.get_model_config(model_name) if model_name else None
+    if model_config is not None and model_config.supports_vision:
+        builtin_tools.append(view_image_tool)
+        logger.info(f"Including view_image_tool for model '{model_name}' (supports_vision=True)")
+
+    return loaded_tools + builtin_tools + mcp_tools
diff --git a/config.example.yaml b/config.example.yaml
index 132d74e..af5cf67 100644
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -69,7 +69,7 @@ models:
   #   api_key: $MOONSHOT_API_KEY
   #   max_tokens: 32768
   #   supports_thinking: true
-  #   supports_vision: false  # Check your specific model's capabilities
+  #   supports_vision: true  # Check your specific model's capabilities
   #   when_thinking_enabled:
   #     extra_body:
   #       thinking:
@@ -112,11 +112,6 @@ tools:
     use: src.community.image_search.tools:image_search_tool
     max_results: 5
 
-  # View image tool (display local images to user)
-  - name: view_image
-    group: file:read
-    use: src.tools.builtins:view_image_tool
-
   # File operations tools
   - name: ls
     group: file:read
diff --git a/skills/public/image-generation/scripts/generate.py b/skills/public/image-generation/scripts/generate.py
index 9bc7399..9665faf 100644
--- a/skills/public/image-generation/scripts/generate.py
+++ b/skills/public/image-generation/scripts/generate.py
@@ -2,6 +2,29 @@ import base64
 import os
 
 import requests
+from PIL import Image
+
+
+def validate_image(image_path: str) -> bool:
+    """
+    Validate if an image file can be opened and is not corrupted.
+    
+    Args:
+        image_path: Path to the image file
+        
+    Returns:
+        True if the image is valid and can be opened, False otherwise
+    """
+    try:
+        with Image.open(image_path) as img:
+            img.verify()  # Verify that it's a valid image
+        # Re-open to check if it can be fully loaded (verify() may not catch all issues)
+        with Image.open(image_path) as img:
+            img.load()  # Force load the image data
+        return True
+    except Exception as e:
+        print(f"Warning: Image '{image_path}' is invalid or corrupted: {e}")
+        return False
 
 
 def generate_image(
@@ -14,7 +37,19 @@ def generate_image(
         prompt = f.read()
     parts = []
     i = 0
-    for reference_image in reference_images:
+    
+    # Filter out invalid reference images
+    valid_reference_images = []
+    for ref_img in reference_images:
+        if validate_image(ref_img):
+            valid_reference_images.append(ref_img)
+        else:
+            print(f"Skipping invalid reference image: {ref_img}")
+    
+    if len(valid_reference_images) < len(reference_images):
+        print(f"Note: {len(reference_images) - len(valid_reference_images)} reference image(s) were skipped due to validation failure.")
+    
+    for reference_image in valid_reference_images:
         i += 1
         with open(reference_image, "rb") as f:
             image_b64 = base64.b64encode(f.read()).decode("utf-8")
@@ -41,6 +76,7 @@ def generate_image(
             "contents": [{"parts": [*parts, {"text": prompt}]}],
         },
     )
+    response.raise_for_status()
     json = response.json()
     parts: list[dict] = json["candidates"][0]["content"]["parts"]
     image_parts = [part for part in parts if part.get("inlineData", False)]
@@ -92,5 +128,5 @@ if __name__ == "__main__":
                 args.aspect_ratio,
             )
         )
-    except Exception:
-        print("Error while generating image.")
+    except Exception as e:
+        print(f"Error while generating image: {e}")