support infoquest (#960)

Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
2026-04-03 06:12:14 +08:00 · 2026-03-06 15:32:13 +08:00
parent 3e4a24f48b
commit 28e1257e1e
6 changed files with 590 additions and 0 deletions
--- a/.env.example
+++ b/.env.example
@@ -4,6 +4,8 @@ TAVILY_API_KEY=your-tavily-api-key
 # Jina API Key
 JINA_API_KEY=your-jina-api-key
 # InfoQuest API Key
 INFOQUEST_API_KEY=your-infoquest-api-key
 # CORS Origins (comma-separated) - e.g., http://localhost:3000,http://localhost:3001
 # CORS_ORIGINS=http://localhost:3000
--- a/README.md
+++ b/README.md
@@ -16,6 +16,16 @@ Learn more and see **real demos** on our official website.
 **[deerflow.tech](https://deerflow.tech/)**
 ## InfoQuest
 DeerFlow now integrates [InfoQuest](https://docs.byteplus.com/en/docs/InfoQuest/What_is_Info_Quest), an intelligent search and crawling toolset developed independently by BytePlus, which can be tried online for free.
 <a href="https://docs.byteplus.com/en/docs/InfoQuest/What_is_Info_Quest" target="_blank">
  <img 
    src="https://sf16-sg.tiktokcdn.com/obj/eden-sg/hubseh7bsbps/20251208-160108.png"   alt="InfoQuest_banner" 
  />
 </a>
 ---
 ## Table of Contents
@@ -94,6 +104,7 @@ Learn more and see **real demos** on our official website.
   TAVILY_API_KEY=your-tavily-api-key
   OPENAI_API_KEY=your-openai-api-key
   # Add other provider keys as needed
   INFOQUEST_API_KEY=your-infoquest-api-key
   ```
 - Option B: Export environment variables in your shell
--- a/backend/src/community/infoquest/infoquest_client.py
+++ b/backend/src/community/infoquest/infoquest_client.py
@@ -0,0 +1,312 @@
 """Client utilities for calling the InfoQuest Search and Fetch APIs.
 To set this up, follow the instructions at:
 https://docs.byteplus.com/en/docs/InfoQuest/What_is_Info_Quest
 """
 import json
 import logging
 import os
 from typing import Any
 import requests
 logger = logging.getLogger(__name__)
 class InfoQuestClient:
    """Client for interacting with the InfoQuest web search and fetch API."""
    def __init__(self, fetch_time: int = -1, fetch_timeout: int = -1, fetch_navigation_timeout: int = -1, search_time_range: int = -1):
        logger.info("\n============================================\n🚀 BytePlus InfoQuest Client Initialization 🚀\n============================================")
        self.fetch_time = fetch_time
        self.fetch_timeout = fetch_timeout
        self.fetch_navigation_timeout = fetch_navigation_timeout
        self.search_time_range = search_time_range
        self.api_key_set = bool(os.getenv("INFOQUEST_API_KEY"))
        if logger.isEnabledFor(logging.DEBUG):
            config_details = (
                f"\n📋 Configuration Details:\n"
                f"├── Fetch time: {fetch_time} {'(Default: No fetch time)' if fetch_time == -1 else '(Custom)'}\n"
                f"├── Fetch Timeout: {fetch_timeout} {'(Default: No fetch timeout)' if fetch_timeout == -1 else '(Custom)'}\n"
                f"├── Navigation Timeout: {fetch_navigation_timeout} {'(Default: No Navigation Timeout)' if fetch_navigation_timeout == -1 else '(Custom)'}\n"
                f"├── Search Time Range: {search_time_range} {'(Default: No Search Time Range)' if search_time_range == -1 else '(Custom)'}\n"
                f"└── API Key: {'✅ Configured' if self.api_key_set else '❌ Not set'}"
            )
            logger.debug(config_details)
            logger.debug("\n" + "*" * 70 + "\n")
    def fetch(self, url: str, return_format: str = "html") -> str:
        """Fetch a page through the InfoQuest reader API and return its content.

        Args:
            url: Address of the page to crawl.
            return_format: Requested output format. ``"html"`` (any letter
                case) is sent as ``"HTML"``; other values are sent unchanged.

        Returns:
            For a JSON object response, its ``reader_result`` field, else its
            ``content`` field. For any other response (non-JSON text, a JSON
            object without those fields, or JSON that is not an object) the
            raw response body. A non-200 status, an empty body, or an exception
            during the HTTP call or response handling is reported as a string
            starting with ``"Error: "`` instead of being raised.
        """
        if logger.isEnabledFor(logging.DEBUG):
            url_truncated = url[:50] + "..." if len(url) > 50 else url
            logger.debug(
                f"InfoQuest - Fetch API request initiated | "
                f"operation=crawl url | "
                f"url_truncated={url_truncated} | "
                f"has_timeout_filter={self.fetch_timeout > 0} | timeout_filter={self.fetch_timeout} | "
                f"has_fetch_time_filter={self.fetch_time > 0} | fetch_time_filter={self.fetch_time} | "
                f"has_navigation_timeout_filter={self.fetch_navigation_timeout > 0} | navi_timeout_filter={self.fetch_navigation_timeout} | "
                f"request_type=sync"
            )
        # Prepare headers
        headers = self._prepare_headers()
        # Prepare request data
        data = self._prepare_crawl_request_data(url, return_format)
        logger.debug("Sending crawl request to InfoQuest API")
        try:
            response = requests.post("https://reader.infoquest.bytepluses.com", headers=headers, json=data)
            # Check if status code is not 200
            if response.status_code != 200:
                error_message = f"fetch API returned status {response.status_code}: {response.text}"
                logger.debug("InfoQuest Crawler fetch API return status %d: %s for URL: %s", response.status_code, response.text, url)
                return f"Error: {error_message}"
            body = response.text
            # Check for empty response
            if not body or not body.strip():
                error_message = "no result found"
                logger.debug("InfoQuest Crawler returned empty response for URL: %s", url)
                return f"Error: {error_message}"
            # Try to parse response as JSON and extract reader_result
            try:
                response_data = json.loads(body)
            except json.JSONDecodeError:
                # If response is not JSON, return the original text
                logger.debug("Response is not in JSON format, returning as-is")
                return body
            # json.loads can also yield a list, string, number, bool or None.
            # Field lookups are only meaningful on an object; on other types
            # the `in` test would either raise or do a substring/element match.
            if isinstance(response_data, dict):
                if "reader_result" in response_data:
                    logger.debug("Successfully extracted reader_result from JSON response")
                    return response_data["reader_result"]
                if "content" in response_data:
                    # Fallback to content field if reader_result is not available
                    logger.debug("reader_result missing in JSON response, falling back to content field: %s",
                                 response_data["content"])
                    return response_data["content"]
                # If neither field exists, fall through and return the original response
                logger.warning("Neither reader_result nor content field found in JSON response")
            else:
                logger.warning("JSON response is not an object (got %s), returning the raw response", type(response_data).__name__)
            # Log a partial response for debugging
            if logger.isEnabledFor(logging.DEBUG):
                response_sample = body[:200] + ("..." if len(body) > 200 else "")
                logger.debug("Successfully received response, content length: %d bytes, first 200 chars: %s", len(body), response_sample)
            return body
        except Exception as e:
            # Best-effort contract: callers detect failures via the "Error: " prefix.
            error_message = f"fetch API failed: {str(e)}"
            logger.error(error_message)
            return f"Error: {error_message}"
    @staticmethod
    def _prepare_headers() -> dict[str, str]:
        """Prepare request headers."""
        headers = {
            "Content-Type": "application/json",
        }
        # Add API key if available
        if os.getenv("INFOQUEST_API_KEY"):
            headers["Authorization"] = f"Bearer {os.getenv('INFOQUEST_API_KEY')}"
            logger.debug("API key added to request headers")
        else:
            logger.warning("InfoQuest API key is not set. Provide your own key for authentication.")
        return headers
    def _prepare_crawl_request_data(self, url: str, return_format: str) -> dict[str, Any]:
        """Prepare request data with formatted parameters."""
        # Normalize return_format
        if return_format and return_format.lower() == "html":
            normalized_format = "HTML"
        else:
            normalized_format = return_format
        data = {"url": url, "format": normalized_format}
        # Add timeout parameters if set to positive values
        timeout_params = {}
        if self.fetch_time > 0:
            timeout_params["fetch_time"] = self.fetch_time
        if self.fetch_timeout > 0:
            timeout_params["timeout"] = self.fetch_timeout
        if self.fetch_navigation_timeout > 0:
            timeout_params["navi_timeout"] = self.fetch_navigation_timeout
        # Log applied timeout parameters
        if timeout_params:
            logger.debug("Applying timeout parameters: %s", timeout_params)
            data.update(timeout_params)
        return data
    def web_search_raw_results(
        self,
        query: str,
        site: str,
        output_format: str = "JSON",
    ) -> dict:
        """Call the InfoQuest search endpoint synchronously and return the decoded JSON body.

        Args:
            query: Search query text.
            site: Site filter; sent as ``site`` unless it is the empty string.
            output_format: Value sent as the ``format`` field.

        Returns:
            The response body decoded with ``response.json()``.

        Raises:
            Whatever ``requests.post``, ``response.raise_for_status()`` or
            ``response.json()`` raise; nothing is caught here.
        """
        headers = self._prepare_headers()

        payload = {"format": output_format, "query": query}
        if self.search_time_range > 0:
            payload["time_range"] = self.search_time_range
        if site != "":
            payload["site"] = site

        response = requests.post("https://search.infoquest.bytepluses.com", headers=headers, json=payload)
        response.raise_for_status()
        decoded = response.json()

        # Log a truncated sample of the response for debugging
        if logger.isEnabledFor(logging.DEBUG):
            serialized = json.dumps(decoded)
            response_sample = serialized[:200] + ("..." if len(serialized) > 200 else "")
            logger.debug(f"Search API request completed successfully | service=InfoQuest | status=success | response_sample={response_sample}")
        return decoded
    @staticmethod
    def clean_results(raw_results: list[dict[str, dict[str, dict[str, Any]]]]) -> list[dict]:
        """Clean results from InfoQuest Web-Search API."""
        logger.debug("Processing web-search results")
        seen_urls = set()
        clean_results = []
        counts = {"pages": 0, "news": 0}
        for content_list in raw_results:
            content = content_list["content"]
            results = content["results"]
            if results.get("organic"):
                organic_results = results["organic"]
                for result in organic_results:
                    clean_result = {
                        "type": "page",
                    }
                    if "title" in result:
                        clean_result["title"] = result["title"]
                    if "desc" in result:
                        clean_result["desc"] = result["desc"]
                        clean_result["snippet"] = result["desc"]
                    if "url" in result:
                        clean_result["url"] = result["url"]
                        url = clean_result["url"]
                        if isinstance(url, str) and url and url not in seen_urls:
                            seen_urls.add(url)
                            clean_results.append(clean_result)
                            counts["pages"] += 1
            if results.get("top_stories"):
                news = results["top_stories"]
                for obj in news["items"]:
                    clean_result = {
                        "type": "news",
                    }
                    if "time_frame" in obj:
                        clean_result["time_frame"] = obj["time_frame"]
                    if "source" in obj:
                        clean_result["source"] = obj["source"]
                    title = obj.get("title")
                    url = obj.get("url")
                    if title:
                        clean_result["title"] = title
                    if url:
                        clean_result["url"] = url
                    if title and isinstance(url, str) and url and url not in seen_urls:
                        seen_urls.add(url)
                        clean_results.append(clean_result)
                        counts["news"] += 1
        logger.debug(f"Results processing completed | total_results={len(clean_results)} | pages={counts['pages']} | news_items={counts['news']} | unique_urls={len(seen_urls)}")
        return clean_results
    def web_search(
        self,
        query: str,
        site: str = "",
        output_format: str = "JSON",
    ) -> str:
        if logger.isEnabledFor(logging.DEBUG):
            query_truncated = query[:50] + "..." if len(query) > 50 else query
            logger.debug(
                f"InfoQuest - Search API request initiated | "
                f"operation=search webs | "
                f"query_truncated={query_truncated} | "
                f"has_time_filter={self.search_time_range > 0} | time_filter={self.search_time_range} | "
                f"has_site_filter={bool(site)} | site={site} | "
                f"request_type=sync"
            )
        try:
            logger.debug("InfoQuest Web-Search - Executing search with parameters")
            raw_results = self.web_search_raw_results(
                query,
                site,
                output_format,
            )
            if "search_result" in raw_results:
                logger.debug("InfoQuest Web-Search - Successfully extracted search_result from JSON response")
                results = raw_results["search_result"]
                logger.debug("InfoQuest Web-Search - Processing raw search results")
                cleaned_results = self.clean_results(results["results"])
                result_json = json.dumps(cleaned_results, indent=2, ensure_ascii=False)
                logger.debug(f"InfoQuest Web-Search - Search tool execution completed | mode=synchronous | results_count={len(cleaned_results)}")
                return result_json
            elif "content" in raw_results:
                # Fallback to content field if search_result is not available
                error_message = "web search API return wrong format"
                logger.error("web search API return wrong format, no search_result nor content field found in JSON response, content: %s", raw_results["content"])
                return f"Error: {error_message}"
            else:
                # If neither field exists, return the original response
                logger.warning("InfoQuest Web-Search - Neither search_result nor content field found in JSON response")
                return json.dumps(raw_results, indent=2, ensure_ascii=False)
        except Exception as e:
            error_message = f"InfoQuest Web-Search - Search tool execution failed | mode=synchronous | error={str(e)}"
            logger.error(error_message)
            return f"Error: {error_message}"
    @staticmethod
    def clean_results_with_image_search(raw_results: list[dict[str, dict[str, dict[str, Any]]]]) -> list[dict]:
        """Clean results from InfoQuest Web-Search API."""
        logger.debug("Processing web-search results")
        seen_urls = set()
        clean_results = []
        counts = {"images": 0}
        for content_list in raw_results:
            content = content_list["content"]
            results = content["results"]
            if results.get("images_results"):
                images_results = results["images_results"]
                for result in images_results:
                    clean_result = {}
                    if "image_url" in result:
                        clean_result["image_url"] = result["image_url"]
                        url = clean_result["image_url"]
                        if isinstance(url, str) and url and url not in seen_urls:
                            seen_urls.add(url)
                            clean_results.append(clean_result)
                            counts["images"] += 1
                    if "thumbnail_url" in result:
                        clean_result["thumbnail_url"] = result["thumbnail_url"]
                    if "url" in result:
                        clean_result["url"] = result["url"]
        logger.debug(f"Results processing completed | total_results={len(clean_results)} | images={counts['images']} | unique_urls={len(seen_urls)}")
        return clean_results
--- a/backend/src/community/infoquest/tools.py
+++ b/backend/src/community/infoquest/tools.py
@@ -0,0 +1,63 @@
 from langchain.tools import tool
 from src.config import get_app_config
 from src.utils.readability import ReadabilityExtractor
 from .infoquest_client import InfoQuestClient
 # Module-level extractor instance, created once at import time and used by web_fetch_tool.
 readability_extractor = ReadabilityExtractor()
 def _get_infoquest_client() -> InfoQuestClient:
    """Build an InfoQuestClient from the ``web_search`` and ``web_fetch`` tool configs.

    Options are read from each tool config's ``model_extra`` mapping:
    ``search_time_range`` from ``web_search``; ``fetch_time``, ``timeout`` and
    ``navigation_timeout`` from ``web_fetch``. An option falls back to ``-1``
    when the tool config is missing, has no extras, lacks the key, or holds a
    null value, so the client always receives a comparable number.

    Returns:
        A new InfoQuestClient configured with the resolved options.
    """

    def _option(tool_config: Any, key: str) -> Any:
        # model_extra may be None, and a key present with a null value must not
        # reach the client, whose request builders compare these values with 0.
        extras = getattr(tool_config, "model_extra", None) or {}
        value = extras.get(key)
        return -1 if value is None else value

    # Lookup order (web_search, then web_fetch) is kept as in the original code.
    search_config = get_app_config().get_tool_config("web_search")
    fetch_config = get_app_config().get_tool_config("web_fetch")
    return InfoQuestClient(
        search_time_range=_option(search_config, "search_time_range"),
        fetch_timeout=_option(fetch_config, "timeout"),
        fetch_navigation_timeout=_option(fetch_config, "navigation_timeout"),
        fetch_time=_option(fetch_config, "fetch_time"),
    )
@tool("web_search", parse_docstring=True)
 def web_search_tool(query: str) -> str:
    """Search the web.
    Args:
        query: The query to search for.
    """
    # The docstring above is parsed by the @tool decorator (parse_docstring=True),
    # so it is kept verbatim; a fresh client is built on every call.
    return _get_infoquest_client().web_search(query)
@tool("web_fetch", parse_docstring=True)
 def web_fetch_tool(url: str) -> str:
    """Fetch the contents of a web page at a given URL.
    Only fetch EXACT URLs that have been provided directly by the user or have been returned in results from the web_search and web_fetch tools.
    This tool can NOT access content that requires authentication, such as private Google Docs or pages behind login walls.
    Do NOT add www. to URLs that do NOT have them.
    URLs must include the schema: https://example.com is a valid URL while example.com is an invalid URL.
    Args:
        url: The URL to fetch the contents of.
    """
    # The docstring above is parsed by the @tool decorator (parse_docstring=True),
    # so it is kept verbatim.
    fetched = _get_infoquest_client().fetch(url)
    # The client reports failures as "Error: ..." strings; pass those through untouched.
    if fetched.startswith("Error: "):
        return fetched
    markdown = readability_extractor.extract_article(fetched).to_markdown()
    # Only the first 4096 characters of the rendered markdown are returned.
    return markdown[:4096]
--- a/backend/tests/test_infoquest_client.py
+++ b/backend/tests/test_infoquest_client.py
@@ -0,0 +1,184 @@
 """Tests for InfoQuest client and tools."""
 import json
 from unittest.mock import MagicMock, patch
 from src.community.infoquest import tools
 from src.community.infoquest.infoquest_client import InfoQuestClient
 class TestInfoQuestClient:
    """Unit tests for InfoQuestClient and the InfoQuest tool wrappers.

    HTTP calls (``requests.post``) and configuration lookups are patched, so
    no test touches the network.
    """

    # Canned search API response with a single organic hit, shared by the search tests.
    SEARCH_PAYLOAD = {"search_result": {"results": [{"content": {"results": {"organic": [{"title": "Test Result", "desc": "Test description", "url": "https://example.com"}]}}}], "images_results": []}}

    def test_infoquest_client_initialization(self):
        """Test InfoQuestClient initialization with different parameters."""
        # Default parameters
        default_client = InfoQuestClient()
        assert default_client.fetch_time == -1
        assert default_client.fetch_timeout == -1
        assert default_client.fetch_navigation_timeout == -1
        assert default_client.search_time_range == -1
        # Custom parameters
        custom_client = InfoQuestClient(fetch_time=10, fetch_timeout=30, fetch_navigation_timeout=60, search_time_range=24)
        assert custom_client.fetch_time == 10
        assert custom_client.fetch_timeout == 30
        assert custom_client.fetch_navigation_timeout == 60
        assert custom_client.search_time_range == 24
    @patch("src.community.infoquest.infoquest_client.requests.post")
    def test_fetch_success(self, mock_post):
        """Test successful fetch operation."""
        page_html = "<html><body>Test content</body></html>"
        mock_post.return_value = MagicMock(status_code=200, text=json.dumps({"reader_result": page_html}))
        fetched = InfoQuestClient().fetch("https://example.com")
        assert fetched == page_html
        mock_post.assert_called_once()
        positional, keyword = mock_post.call_args
        assert positional[0] == "https://reader.infoquest.bytepluses.com"
        sent_payload = keyword["json"]
        assert sent_payload["url"] == "https://example.com"
        assert sent_payload["format"] == "HTML"
    @patch("src.community.infoquest.infoquest_client.requests.post")
    def test_fetch_non_200_status(self, mock_post):
        """Test fetch operation with non-200 status code."""
        mock_post.return_value = MagicMock(status_code=404, text="Not Found")
        outcome = InfoQuestClient().fetch("https://example.com")
        assert outcome == "Error: fetch API returned status 404: Not Found"
    @patch("src.community.infoquest.infoquest_client.requests.post")
    def test_fetch_empty_response(self, mock_post):
        """Test fetch operation with empty response."""
        mock_post.return_value = MagicMock(status_code=200, text="")
        outcome = InfoQuestClient().fetch("https://example.com")
        assert outcome == "Error: no result found"
    @patch("src.community.infoquest.infoquest_client.requests.post")
    def test_web_search_raw_results_success(self, mock_post):
        """Test successful web_search_raw_results operation."""
        fake_response = MagicMock(status_code=200)
        fake_response.json.return_value = self.SEARCH_PAYLOAD
        mock_post.return_value = fake_response
        raw = InfoQuestClient().web_search_raw_results("test query", "")
        assert "search_result" in raw
        mock_post.assert_called_once()
        positional, keyword = mock_post.call_args
        assert positional[0] == "https://search.infoquest.bytepluses.com"
        assert keyword["json"]["query"] == "test query"
    @patch("src.community.infoquest.infoquest_client.requests.post")
    def test_web_search_success(self, mock_post):
        """Test successful web_search operation."""
        fake_response = MagicMock(status_code=200)
        fake_response.json.return_value = self.SEARCH_PAYLOAD
        mock_post.return_value = fake_response
        serialized = InfoQuestClient().web_search("test query")
        # The result must be a valid JSON string holding the single expected hit
        hits = json.loads(serialized)
        assert len(hits) == 1
        first_hit = hits[0]
        assert first_hit["title"] == "Test Result"
        assert first_hit["url"] == "https://example.com"
    def test_clean_results(self):
        """Test clean_results method with sample raw results."""
        organic_hits = [{"title": "Test Page", "desc": "Page description", "url": "https://example.com/page1"}]
        news_items = [{"title": "Test News", "source": "Test Source", "time_frame": "2 hours ago", "url": "https://example.com/news1"}]
        raw_results = [{"content": {"results": {"organic": organic_hits, "top_stories": {"items": news_items}}}}]
        cleaned = InfoQuestClient.clean_results(raw_results)
        assert len(cleaned) == 2
        page, news = cleaned
        assert page["type"] == "page"
        assert page["title"] == "Test Page"
        assert news["type"] == "news"
        assert news["title"] == "Test News"
    def test_clean_results_with_image_search(self):
        """Test clean_results_with_image_search method with sample raw results."""
        image_hit = {"image_url": "https://example.com/image1.jpg", "thumbnail_url": "https://example.com/thumb1.jpg", "url": "https://example.com/page1"}
        raw_results = [{"content": {"results": {"images_results": [image_hit]}}}]
        cleaned = InfoQuestClient.clean_results_with_image_search(raw_results)
        assert len(cleaned) == 1
        only_image = cleaned[0]
        assert only_image["image_url"] == "https://example.com/image1.jpg"
        assert only_image["thumbnail_url"] == "https://example.com/thumb1.jpg"
        assert only_image["url"] == "https://example.com/page1"
    @patch("src.community.infoquest.tools._get_infoquest_client")
    def test_web_search_tool(self, mock_get_client):
        """Test web_search_tool function."""
        empty_json = json.dumps([])
        fake_client = MagicMock()
        fake_client.web_search.return_value = empty_json
        mock_get_client.return_value = fake_client
        output = tools.web_search_tool.run("test query")
        assert output == empty_json
        mock_get_client.assert_called_once()
        fake_client.web_search.assert_called_once_with("test query")
    @patch("src.community.infoquest.tools._get_infoquest_client")
    def test_web_fetch_tool(self, mock_get_client):
        """Test web_fetch_tool function."""
        fake_client = MagicMock()
        fake_client.fetch.return_value = "<html><body>Test content</body></html>"
        mock_get_client.return_value = fake_client
        output = tools.web_fetch_tool.run("https://example.com")
        assert output == "# Untitled\n\nTest content"
        mock_get_client.assert_called_once()
        fake_client.fetch.assert_called_once_with("https://example.com")
    @patch("src.community.infoquest.tools.get_app_config")
    def test_get_infoquest_client(self, mock_get_app_config):
        """Test _get_infoquest_client function with config."""
        # get_tool_config is consumed in call order: web_search first, then web_fetch
        search_tool_config = MagicMock(model_extra={"search_time_range": 24})
        fetch_tool_config = MagicMock(model_extra={"fetch_time": 10, "timeout": 30, "navigation_timeout": 60})
        app_config = MagicMock()
        app_config.get_tool_config.side_effect = [search_tool_config, fetch_tool_config]
        mock_get_app_config.return_value = app_config
        built = tools._get_infoquest_client()
        assert built.search_time_range == 24
        assert built.fetch_time == 10
        assert built.fetch_timeout == 30
        assert built.fetch_navigation_timeout == 60
    @patch("src.community.infoquest.infoquest_client.requests.post")
    def test_web_search_api_error(self, mock_post):
        """Test web_search operation with API error."""
        mock_post.side_effect = Exception("Connection error")
        outcome = InfoQuestClient().web_search("test query")
        assert "Error" in outcome
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -127,12 +127,30 @@ tools:
    max_results: 5
    # api_key: $TAVILY_API_KEY  # Set if needed
  # Web search tool (requires InfoQuest API key)
  # - name: web_search
  #   group: web
  #   use: src.community.infoquest.tools:web_search_tool
  #   # Limits search results to content within the specified time range. Set to -1 to disable time filtering.
  #   search_time_range: 10
  # Web fetch tool (uses Jina AI reader)
  - name: web_fetch
    group: web
    use: src.community.jina_ai.tools:web_fetch_tool
    timeout: 10
  # Web fetch tool (uses InfoQuest AI reader)
  # - name: web_fetch
  #   group: web
  #   use: src.community.infoquest.tools:web_fetch_tool
  #   # Overall timeout for the entire crawling process (in seconds). Set to a positive value to enable, or -1 to disable.
  #   timeout: 10
  #   # Waiting time after the page has loaded (in seconds). Set to a positive value to enable, or -1 to disable.
  #   fetch_time: 10
  #   # Timeout for navigating to the page (in seconds). Set to a positive value to enable, or -1 to disable.
  #   navigation_timeout: 30
  # Image search tool (uses DuckDuckGo)
  # Use this to find reference images before image generation
  - name: image_search