From 28e1257e1eb8b8d1c5b438666d84578f5da26860 Mon Sep 17 00:00:00 2001 From: infoquest-byteplus Date: Fri, 6 Mar 2026 15:32:13 +0800 Subject: [PATCH] support infoquest (#960) Co-authored-by: Willem Jiang --- .env.example | 2 + README.md | 11 + .../community/infoquest/infoquest_client.py | 312 ++++++++++++++++++ backend/src/community/infoquest/tools.py | 63 ++++ backend/tests/test_infoquest_client.py | 184 +++++++++++ config.example.yaml | 18 + 6 files changed, 590 insertions(+) create mode 100644 backend/src/community/infoquest/infoquest_client.py create mode 100644 backend/src/community/infoquest/tools.py create mode 100644 backend/tests/test_infoquest_client.py diff --git a/.env.example b/.env.example index 8107d30..0ae96d3 100644 --- a/.env.example +++ b/.env.example @@ -4,6 +4,8 @@ TAVILY_API_KEY=your-tavily-api-key # Jina API Key JINA_API_KEY=your-jina-api-key +# InfoQuest API Key +INFOQUEST_API_KEY=your-infoquest-api-key # CORS Origins (comma-separated) - e.g., http://localhost:3000,http://localhost:3001 # CORS_ORIGINS=http://localhost:3000 diff --git a/README.md b/README.md index e785e27..2cf42d5 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,16 @@ Learn more and see **real demos** on our official website. **[deerflow.tech](https://deerflow.tech/)** +## InfoQuest + +DeerFlow has newly integrated the intelligent search and crawling toolset independently developed by BytePlus--[InfoQuest (supports free online experience)](https://docs.byteplus.com/en/docs/InfoQuest/What_is_Info_Quest) + + + InfoQuest_banner + + --- ## Table of Contents @@ -94,6 +104,7 @@ Learn more and see **real demos** on our official website. 
logger = logging.getLogger(__name__)


class InfoQuestClient:
    """Client for interacting with the InfoQuest web search and fetch API.

    All four options use ``-1`` to mean "not set"; only strictly positive
    values are added to the request payload.

    Args:
        fetch_time: Sent to the fetch API as ``fetch_time`` when positive.
        fetch_timeout: Sent to the fetch API as ``timeout`` when positive.
        fetch_navigation_timeout: Sent to the fetch API as ``navi_timeout``
            when positive.
        search_time_range: Sent to the search API as ``time_range`` when
            positive.
    """

    FETCH_URL = "https://reader.infoquest.bytepluses.com"
    SEARCH_URL = "https://search.infoquest.bytepluses.com"

    # Client-side HTTP timeout in seconds. Without one, ``requests`` waits
    # forever on a stalled connection and the calling tool never returns.
    DEFAULT_HTTP_TIMEOUT = 60
    # Extra seconds granted on top of a configured server-side crawl timeout so
    # the server gets the chance to answer before the client gives up.
    _HTTP_TIMEOUT_MARGIN = 5

    def __init__(self, fetch_time: int = -1, fetch_timeout: int = -1, fetch_navigation_timeout: int = -1, search_time_range: int = -1):
        logger.info("\n============================================\n🚀 BytePlus InfoQuest Client Initialization 🚀\n============================================")

        # A config entry left empty arrives as None; store it as "not set" so
        # the ``> 0`` checks used throughout the class cannot raise TypeError.
        self.fetch_time = self._unset_if_none(fetch_time)
        self.fetch_timeout = self._unset_if_none(fetch_timeout)
        self.fetch_navigation_timeout = self._unset_if_none(fetch_navigation_timeout)
        self.search_time_range = self._unset_if_none(search_time_range)
        self.api_key_set = bool(os.getenv("INFOQUEST_API_KEY"))

        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(self._describe_config())
            logger.debug("\n" + "*" * 70 + "\n")

    @staticmethod
    def _unset_if_none(value: Any) -> Any:
        """Map ``None`` to the ``-1`` "not set" sentinel; pass anything else through."""
        return -1 if value is None else value

    def _describe_config(self) -> str:
        """Build the multi-line configuration summary written to the debug log."""

        def origin(value: Any, default_note: str) -> str:
            return f"(Default: {default_note})" if value == -1 else "(Custom)"

        return (
            "\n📋 Configuration Details:\n"
            f"├── Fetch time: {self.fetch_time} {origin(self.fetch_time, 'No fetch time')}\n"
            f"├── Fetch Timeout: {self.fetch_timeout} {origin(self.fetch_timeout, 'No fetch timeout')}\n"
            f"├── Navigation Timeout: {self.fetch_navigation_timeout} {origin(self.fetch_navigation_timeout, 'No Navigation Timeout')}\n"
            f"├── Search Time Range: {self.search_time_range} {origin(self.search_time_range, 'No Search Time Range')}\n"
            f"└── API Key: {'✅ Configured' if self.api_key_set else '❌ Not set'}"
        )

    def _http_timeout(self, server_timeout: int = -1) -> int:
        """Return the client-side HTTP timeout, in seconds, for one request.

        Args:
            server_timeout: Server-side time budget configured for the call,
                or a non-positive value when none is configured.

        Returns:
            ``DEFAULT_HTTP_TIMEOUT``, raised to ``server_timeout`` plus a small
            margin when the configured server budget is larger.
        """
        if server_timeout > 0:
            return max(self.DEFAULT_HTTP_TIMEOUT, server_timeout + self._HTTP_TIMEOUT_MARGIN)
        return self.DEFAULT_HTTP_TIMEOUT

    def fetch(self, url: str, return_format: str = "html") -> str:
        """Fetch a page through the InfoQuest reader API.

        Args:
            url: Address of the page to crawl.
            return_format: Requested output format; ``"html"`` in any casing
                is sent as ``"HTML"``, other values are sent unchanged.

        Returns:
            The ``reader_result`` (or, failing that, ``content``) string of the
            JSON response, or the raw response body when neither is usable.
            Failures never raise: they are returned as a string starting with
            ``"Error: "``.
        """
        if logger.isEnabledFor(logging.DEBUG):
            url_truncated = url[:50] + "..." if len(url) > 50 else url
            logger.debug(
                "InfoQuest - Fetch API request initiated | operation=crawl url | url_truncated=%s | "
                "has_timeout_filter=%s | timeout_filter=%s | "
                "has_fetch_time_filter=%s | fetch_time_filter=%s | "
                "has_navigation_timeout_filter=%s | navi_timeout_filter=%s | request_type=sync",
                url_truncated,
                self.fetch_timeout > 0,
                self.fetch_timeout,
                self.fetch_time > 0,
                self.fetch_time,
                self.fetch_navigation_timeout > 0,
                self.fetch_navigation_timeout,
            )

        headers = self._prepare_headers()
        data = self._prepare_crawl_request_data(url, return_format)

        logger.debug("Sending crawl request to InfoQuest API")
        try:
            response = requests.post(
                self.FETCH_URL,
                headers=headers,
                json=data,
                timeout=self._http_timeout(self.fetch_timeout),
            )

            if response.status_code != 200:
                error_message = f"fetch API returned status {response.status_code}: {response.text}"
                logger.debug("InfoQuest Crawler fetch API return status %d: %s for URL: %s", response.status_code, response.text, url)
                return f"Error: {error_message}"

            body = response.text
            if not body or not body.strip():
                logger.debug("InfoQuest Crawler returned empty response for URL: %s", url)
                return "Error: no result found"

            return self._extract_reader_result(body)
        except Exception as e:
            # Deliberate catch-all: callers rely on an "Error: ..." string
            # instead of an exception.
            error_message = f"fetch API failed: {str(e)}"
            logger.error(error_message)
            return f"Error: {error_message}"

    @staticmethod
    def _extract_reader_result(text: str) -> str:
        """Pull the page content out of a reader API response body.

        Args:
            text: Raw, non-empty response body.

        Returns:
            The string stored under ``reader_result`` or, failing that,
            ``content`` when *text* is a JSON object; otherwise *text* itself.
        """
        try:
            payload = json.loads(text)
        except json.JSONDecodeError:
            logger.debug("Response is not in JSON format, returning as-is")
            return text

        # Only a JSON object can carry the fields. A JSON string or number
        # would make the ``in`` test a substring check or a TypeError.
        if isinstance(payload, dict):
            for field in ("reader_result", "content"):
                value = payload.get(field)
                if isinstance(value, str):
                    logger.debug("Extracted %s from JSON response (%d chars)", field, len(value))
                    return value
            logger.warning("Neither reader_result nor content field found in JSON response")

        if logger.isEnabledFor(logging.DEBUG):
            sample = text[:200] + ("..." if len(text) > 200 else "")
            logger.debug("Returning raw response, content length: %d chars, first 200 chars: %s", len(text), sample)
        return text

    @staticmethod
    def _prepare_headers() -> dict[str, str]:
        """Prepare request headers, adding a bearer token when INFOQUEST_API_KEY is set."""
        headers = {
            "Content-Type": "application/json",
        }

        api_key = os.getenv("INFOQUEST_API_KEY")
        if api_key:
            headers["Authorization"] = f"Bearer {api_key}"
            logger.debug("API key added to request headers")
        else:
            logger.warning("InfoQuest API key is not set. Provide your own key for authentication.")

        return headers

    def _prepare_crawl_request_data(self, url: str, return_format: str) -> dict[str, Any]:
        """Prepare the JSON payload for a fetch request.

        Args:
            url: Address of the page to crawl.
            return_format: Requested output format.

        Returns:
            A dict with ``url`` and ``format``, plus ``fetch_time``,
            ``timeout`` and ``navi_timeout`` for each option that is positive.
        """
        if return_format and return_format.lower() == "html":
            normalized_format = "HTML"
        else:
            normalized_format = return_format

        data: dict[str, Any] = {"url": url, "format": normalized_format}

        candidates = {
            "fetch_time": self.fetch_time,
            "timeout": self.fetch_timeout,
            "navi_timeout": self.fetch_navigation_timeout,
        }
        timeout_params = {name: value for name, value in candidates.items() if value > 0}
        if timeout_params:
            logger.debug("Applying timeout parameters: %s", timeout_params)
            data.update(timeout_params)

        return data

    def web_search_raw_results(
        self,
        query: str,
        site: str = "",
        output_format: str = "JSON",
    ) -> dict:
        """Get results from the InfoQuest Web-Search API synchronously.

        Args:
            query: Search query text.
            site: Optional site filter; omitted from the request when empty.
            output_format: Value sent as ``format``.

        Returns:
            The decoded JSON response.

        Raises:
            requests.RequestException: On connection failure, timeout, or a
                non-success HTTP status (via ``raise_for_status``).
        """
        headers = self._prepare_headers()

        params: dict[str, Any] = {"format": output_format, "query": query}
        if self.search_time_range > 0:
            params["time_range"] = self.search_time_range
        if site:
            params["site"] = site

        response = requests.post(self.SEARCH_URL, headers=headers, json=params, timeout=self._http_timeout())
        response.raise_for_status()

        response_json = response.json()
        if logger.isEnabledFor(logging.DEBUG):
            dumped = json.dumps(response_json)
            response_sample = dumped[:200] + ("..." if len(dumped) > 200 else "")
            logger.debug("Search API request completed successfully | service=InfoQuest | status=success | response_sample=%s", response_sample)

        return response_json

    @staticmethod
    def _result_sections(raw_results: Any):
        """Yield the ``content.results`` dict of every well-formed raw entry.

        Entries that lack the expected nesting are skipped, so one malformed
        entry cannot discard the results of all the others.
        """
        for entry in raw_results or []:
            content = entry.get("content") if isinstance(entry, dict) else None
            results = content.get("results") if isinstance(content, dict) else None
            if isinstance(results, dict):
                yield results
            else:
                logger.debug("Skipping search entry without a content.results object")

    @staticmethod
    def clean_results(raw_results: list[dict[str, dict[str, dict[str, Any]]]]) -> list[dict]:
        """Clean results from InfoQuest Web-Search API.

        Args:
            raw_results: Entries shaped like
                ``{"content": {"results": {"organic": [...], "top_stories": {"items": [...]}}}}``.

        Returns:
            One dict per unique URL. ``organic`` entries become ``type="page"``
            and need a non-empty string ``url``; ``top_stories`` items become
            ``type="news"`` and additionally need a title.
        """
        logger.debug("Processing web-search results")

        seen_urls: set[str] = set()
        cleaned: list[dict] = []
        counts = {"pages": 0, "news": 0}

        def is_new(url: Any) -> bool:
            return isinstance(url, str) and bool(url) and url not in seen_urls

        for results in InfoQuestClient._result_sections(raw_results):
            for item in results.get("organic") or []:
                if not isinstance(item, dict) or not is_new(item.get("url")):
                    continue
                page: dict[str, Any] = {"type": "page"}
                if "title" in item:
                    page["title"] = item["title"]
                if "desc" in item:
                    page["desc"] = item["desc"]
                    page["snippet"] = item["desc"]
                page["url"] = item["url"]
                seen_urls.add(item["url"])
                cleaned.append(page)
                counts["pages"] += 1

            top_stories = results.get("top_stories")
            # "items" may be absent; treat that as "no news" instead of raising.
            news_items = top_stories.get("items") if isinstance(top_stories, dict) else None
            for item in news_items or []:
                if not isinstance(item, dict):
                    continue
                title = item.get("title")
                url = item.get("url")
                if not title or not is_new(url):
                    continue
                news: dict[str, Any] = {"type": "news"}
                if "time_frame" in item:
                    news["time_frame"] = item["time_frame"]
                if "source" in item:
                    news["source"] = item["source"]
                news["title"] = title
                news["url"] = url
                seen_urls.add(url)
                cleaned.append(news)
                counts["news"] += 1

        logger.debug(
            "Results processing completed | total_results=%d | pages=%d | news_items=%d | unique_urls=%d",
            len(cleaned),
            counts["pages"],
            counts["news"],
            len(seen_urls),
        )
        return cleaned

    def web_search(
        self,
        query: str,
        site: str = "",
        output_format: str = "JSON",
    ) -> str:
        """Run a web search and return the cleaned results as a JSON string.

        Args:
            query: Search query text.
            site: Optional site filter.
            output_format: Value sent as ``format``.

        Returns:
            A JSON array of cleaned results when the response has
            ``search_result``; the raw response as JSON when it has neither
            ``search_result`` nor ``content``; otherwise a string starting
            with ``"Error: "``. Failures never raise.
        """
        if logger.isEnabledFor(logging.DEBUG):
            query_truncated = query[:50] + "..." if len(query) > 50 else query
            logger.debug(
                "InfoQuest - Search API request initiated | operation=search webs | query_truncated=%s | "
                "has_time_filter=%s | time_filter=%s | has_site_filter=%s | site=%s | request_type=sync",
                query_truncated,
                self.search_time_range > 0,
                self.search_time_range,
                bool(site),
                site,
            )

        try:
            logger.debug("InfoQuest Web-Search - Executing search with parameters")
            raw_results = self.web_search_raw_results(query, site, output_format)

            if "search_result" in raw_results:
                logger.debug("InfoQuest Web-Search - Successfully extracted search_result from JSON response")
                results = raw_results["search_result"]

                logger.debug("InfoQuest Web-Search - Processing raw search results")
                cleaned_results = self.clean_results(results["results"])
                result_json = json.dumps(cleaned_results, indent=2, ensure_ascii=False)

                logger.debug("InfoQuest Web-Search - Search tool execution completed | mode=synchronous | results_count=%d", len(cleaned_results))
                return result_json

            if "content" in raw_results:
                # The response carries "content" but not the expected
                # "search_result" wrapper, so it cannot be cleaned.
                error_message = "web search API return wrong format"
                logger.error("web search API return wrong format, search_result field missing in JSON response, content: %s", raw_results["content"])
                return f"Error: {error_message}"

            logger.warning("InfoQuest Web-Search - Neither search_result nor content field found in JSON response")
            return json.dumps(raw_results, indent=2, ensure_ascii=False)

        except Exception as e:
            # Deliberate catch-all: callers rely on an "Error: ..." string
            # instead of an exception.
            error_message = f"InfoQuest Web-Search - Search tool execution failed | mode=synchronous | error={str(e)}"
            logger.error(error_message)
            return f"Error: {error_message}"

    @staticmethod
    def clean_results_with_image_search(raw_results: list[dict[str, dict[str, dict[str, Any]]]]) -> list[dict]:
        """Clean image results from InfoQuest Web-Search API.

        Args:
            raw_results: Entries shaped like
                ``{"content": {"results": {"images_results": [...]}}}``.

        Returns:
            One dict per unique, non-empty string ``image_url``, carrying
            ``image_url`` plus ``thumbnail_url`` and ``url`` when present.
        """
        logger.debug("Processing web-search results")

        seen_urls: set[str] = set()
        cleaned: list[dict] = []

        for results in InfoQuestClient._result_sections(raw_results):
            for item in results.get("images_results") or []:
                if not isinstance(item, dict):
                    continue
                image_url = item.get("image_url")
                if not (isinstance(image_url, str) and image_url) or image_url in seen_urls:
                    continue
                # Build the entry completely before publishing it to the list.
                image = {"image_url": image_url}
                if "thumbnail_url" in item:
                    image["thumbnail_url"] = item["thumbnail_url"]
                if "url" in item:
                    image["url"] = item["url"]
                seen_urls.add(image_url)
                cleaned.append(image)

        logger.debug(
            "Results processing completed | total_results=%d | images=%d | unique_urls=%d",
            len(cleaned),
            len(cleaned),
            len(seen_urls),
        )
        return cleaned
# Upper bound on the number of characters of markdown returned by web_fetch.
_MAX_FETCH_CHARS = 4096


def _tool_extras(tool_name: str) -> dict:
    """Return the extra options configured for *tool_name*.

    Returns an empty dict when the tool is not configured or when its config
    object exposes no extras (``model_extra`` missing or ``None``), so callers
    can use plain ``dict`` lookups without guarding.
    """
    tool_config = get_app_config().get_tool_config(tool_name)
    if tool_config is None:
        return {}
    return getattr(tool_config, "model_extra", None) or {}


def _option(extras: dict, key: str) -> int:
    """Return ``extras[key]``, or ``-1`` ("not set") when it is absent or null."""
    value = extras.get(key)
    return -1 if value is None else value


def _get_infoquest_client() -> InfoQuestClient:
    """Build an InfoQuestClient from the ``web_search`` and ``web_fetch`` tool configs.

    Reads ``search_time_range`` from the ``web_search`` config and
    ``fetch_time``, ``timeout`` and ``navigation_timeout`` from the
    ``web_fetch`` config. Anything missing is passed as ``-1``.
    """
    search_extras = _tool_extras("web_search")
    fetch_extras = _tool_extras("web_fetch")

    return InfoQuestClient(
        search_time_range=_option(search_extras, "search_time_range"),
        fetch_timeout=_option(fetch_extras, "timeout"),
        fetch_navigation_timeout=_option(fetch_extras, "navigation_timeout"),
        fetch_time=_option(fetch_extras, "fetch_time"),
    )


@tool("web_search", parse_docstring=True)
def web_search_tool(query: str) -> str:
    """Search the web.

    Args:
        query: The query to search for.
    """

    client = _get_infoquest_client()
    return client.web_search(query)


@tool("web_fetch", parse_docstring=True)
def web_fetch_tool(url: str) -> str:
    """Fetch the contents of a web page at a given URL.
    Only fetch EXACT URLs that have been provided directly by the user or have been returned in results from the web_search and web_fetch tools.
    This tool can NOT access content that requires authentication, such as private Google Docs or pages behind login walls.
    Do NOT add www. to URLs that do NOT have them.
    URLs must include the schema: https://example.com is a valid URL while example.com is an invalid URL.

    Args:
        url: The URL to fetch the contents of.
    """
    client = _get_infoquest_client()
    result = client.fetch(url)
    # The client may hand back a non-string when the API stores something else
    # under its result field; report that instead of failing on .startswith.
    if not isinstance(result, str):
        return "Error: fetch API returned non-text content"
    if result.startswith("Error: "):
        return result
    article = readability_extractor.extract_article(result)
    return article.to_markdown()[:_MAX_FETCH_CHARS]
class TestInfoQuestClient:
    """Unit tests for InfoQuestClient and the InfoQuest tool wrappers.

    ``requests.post`` and the client factory are always mocked, so no test
    touches the network.
    """

    @staticmethod
    def _response(status_code=200, text="", json_data=None):
        """Build a mocked HTTP response with the given status, text and JSON body."""
        response = MagicMock()
        response.status_code = status_code
        response.text = text
        if json_data is not None:
            response.json.return_value = json_data
        return response

    @staticmethod
    def _search_payload():
        """Return a fresh search API payload holding one organic result."""
        return {"search_result": {"results": [{"content": {"results": {"organic": [{"title": "Test Result", "desc": "Test description", "url": "https://example.com"}]}}}], "images_results": []}}

    def test_infoquest_client_initialization(self):
        """Test InfoQuestClient initialization with different parameters."""
        # Test with default parameters
        client = InfoQuestClient()
        assert client.fetch_time == -1
        assert client.fetch_timeout == -1
        assert client.fetch_navigation_timeout == -1
        assert client.search_time_range == -1

        # Test with custom parameters
        client = InfoQuestClient(fetch_time=10, fetch_timeout=30, fetch_navigation_timeout=60, search_time_range=24)
        assert client.fetch_time == 10
        assert client.fetch_timeout == 30
        assert client.fetch_navigation_timeout == 60
        assert client.search_time_range == 24

    @patch("src.community.infoquest.infoquest_client.requests.post")
    def test_fetch_success(self, mock_post):
        """Test successful fetch operation."""
        mock_post.return_value = self._response(text=json.dumps({"reader_result": "Test content"}))

        result = InfoQuestClient().fetch("https://example.com")

        assert result == "Test content"
        mock_post.assert_called_once()
        args, kwargs = mock_post.call_args
        assert args[0] == "https://reader.infoquest.bytepluses.com"
        assert kwargs["json"]["url"] == "https://example.com"
        assert kwargs["json"]["format"] == "HTML"

    @patch("src.community.infoquest.infoquest_client.requests.post")
    def test_fetch_non_200_status(self, mock_post):
        """Test fetch operation with non-200 status code."""
        mock_post.return_value = self._response(status_code=404, text="Not Found")

        result = InfoQuestClient().fetch("https://example.com")

        assert result == "Error: fetch API returned status 404: Not Found"

    @patch("src.community.infoquest.infoquest_client.requests.post")
    def test_fetch_empty_response(self, mock_post):
        """Test fetch operation with empty response."""
        mock_post.return_value = self._response(text="")

        result = InfoQuestClient().fetch("https://example.com")

        assert result == "Error: no result found"

    @patch("src.community.infoquest.infoquest_client.requests.post")
    def test_web_search_raw_results_success(self, mock_post):
        """Test successful web_search_raw_results operation."""
        mock_post.return_value = self._response(json_data=self._search_payload())

        result = InfoQuestClient().web_search_raw_results("test query", "")

        assert "search_result" in result
        mock_post.assert_called_once()
        args, kwargs = mock_post.call_args
        assert args[0] == "https://search.infoquest.bytepluses.com"
        assert kwargs["json"]["query"] == "test query"

    @patch("src.community.infoquest.infoquest_client.requests.post")
    def test_web_search_success(self, mock_post):
        """Test successful web_search operation."""
        mock_post.return_value = self._response(json_data=self._search_payload())

        result = InfoQuestClient().web_search("test query")

        # Check if result is a valid JSON string with expected content
        result_data = json.loads(result)
        assert len(result_data) == 1
        assert result_data[0]["title"] == "Test Result"
        assert result_data[0]["url"] == "https://example.com"

    def test_clean_results(self):
        """Test clean_results method with sample raw results."""
        raw_results = [
            {
                "content": {
                    "results": {
                        "organic": [{"title": "Test Page", "desc": "Page description", "url": "https://example.com/page1"}],
                        "top_stories": {"items": [{"title": "Test News", "source": "Test Source", "time_frame": "2 hours ago", "url": "https://example.com/news1"}]},
                    }
                }
            }
        ]

        cleaned = InfoQuestClient.clean_results(raw_results)

        assert len(cleaned) == 2
        assert cleaned[0]["type"] == "page"
        assert cleaned[0]["title"] == "Test Page"
        assert cleaned[1]["type"] == "news"
        assert cleaned[1]["title"] == "Test News"

    def test_clean_results_with_image_search(self):
        """Test clean_results_with_image_search method with sample raw results."""
        raw_results = [{"content": {"results": {"images_results": [{"image_url": "https://example.com/image1.jpg", "thumbnail_url": "https://example.com/thumb1.jpg", "url": "https://example.com/page1"}]}}}]

        cleaned = InfoQuestClient.clean_results_with_image_search(raw_results)

        assert len(cleaned) == 1
        assert cleaned[0]["image_url"] == "https://example.com/image1.jpg"
        assert cleaned[0]["thumbnail_url"] == "https://example.com/thumb1.jpg"
        assert cleaned[0]["url"] == "https://example.com/page1"

    @patch("src.community.infoquest.tools._get_infoquest_client")
    def test_web_search_tool(self, mock_get_client):
        """Test web_search_tool function."""
        mock_client = MagicMock()
        mock_client.web_search.return_value = json.dumps([])
        mock_get_client.return_value = mock_client

        result = tools.web_search_tool.run("test query")

        assert result == json.dumps([])
        mock_get_client.assert_called_once()
        mock_client.web_search.assert_called_once_with("test query")

    @patch("src.community.infoquest.tools._get_infoquest_client")
    def test_web_fetch_tool(self, mock_get_client):
        """Test web_fetch_tool function."""
        mock_client = MagicMock()
        mock_client.fetch.return_value = "Test content"
        mock_get_client.return_value = mock_client

        result = tools.web_fetch_tool.run("https://example.com")

        assert result == "# Untitled\n\nTest content"
        mock_get_client.assert_called_once()
        mock_client.fetch.assert_called_once_with("https://example.com")

    @patch("src.community.infoquest.tools.get_app_config")
    def test_get_infoquest_client(self, mock_get_app_config):
        """Test _get_infoquest_client function with config."""
        # Key the fake configs by tool name so the test does not depend on the
        # order in which the factory happens to look the tools up.
        tool_configs = {
            "web_search": MagicMock(model_extra={"search_time_range": 24}),
            "web_fetch": MagicMock(model_extra={"fetch_time": 10, "timeout": 30, "navigation_timeout": 60}),
        }
        mock_config = MagicMock()
        mock_config.get_tool_config.side_effect = tool_configs.get
        mock_get_app_config.return_value = mock_config

        client = tools._get_infoquest_client()

        assert client.search_time_range == 24
        assert client.fetch_time == 10
        assert client.fetch_timeout == 30
        assert client.fetch_navigation_timeout == 60

    @patch("src.community.infoquest.infoquest_client.requests.post")
    def test_web_search_api_error(self, mock_post):
        """Test web_search operation with API error."""
        mock_post.side_effect = Exception("Connection error")

        result = InfoQuestClient().web_search("test query")

        # The failure must surface as the documented error string, including
        # the underlying cause, rather than as an exception.
        assert result.startswith("Error: ")
        assert "Connection error" in result
Web search tool (requires InfoQuest API key) + # - name: web_search + # group: web + # use: src.community.infoquest.tools:web_search_tool + # # Used to limit the scope of search results, only returns content within the specified time range. Set to -1 to disable time filtering + # search_time_range: 10 + # Web fetch tool (uses Jina AI reader) - name: web_fetch group: web use: src.community.jina_ai.tools:web_fetch_tool timeout: 10 + # Web fetch tool (uses InfoQuest AI reader) + # - name: web_fetch + # group: web + # use: src.community.infoquest.tools:web_fetch_tool + # # Overall timeout for the entire crawling process (in seconds). Set to positive value to enable, -1 to disable + # timeout: 10 + # # Waiting time after page loading (in seconds). Set to positive value to enable, -1 to disable + # fetch_time: 10 + # # Timeout for navigating to the page (in seconds). Set to positive value to enable, -1 to disable + # navigation_timeout: 30 + # Image search tool (uses DuckDuckGo) # Use this to find reference images before image generation - name: image_search