support infoquest (#960)

Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
This commit is contained in:
infoquest-byteplus
2026-03-06 15:32:13 +08:00
committed by GitHub
parent 3e4a24f48b
commit 28e1257e1e
6 changed files with 590 additions and 0 deletions

View File

@@ -4,6 +4,8 @@ TAVILY_API_KEY=your-tavily-api-key
# Jina API Key # Jina API Key
JINA_API_KEY=your-jina-api-key JINA_API_KEY=your-jina-api-key
# InfoQuest API Key
INFOQUEST_API_KEY=your-infoquest-api-key
# CORS Origins (comma-separated) - e.g., http://localhost:3000,http://localhost:3001 # CORS Origins (comma-separated) - e.g., http://localhost:3000,http://localhost:3001
# CORS_ORIGINS=http://localhost:3000 # CORS_ORIGINS=http://localhost:3000

View File

@@ -16,6 +16,16 @@ Learn more and see **real demos** on our official website.
**[deerflow.tech](https://deerflow.tech/)** **[deerflow.tech](https://deerflow.tech/)**
## InfoQuest
DeerFlow now integrates [InfoQuest](https://docs.byteplus.com/en/docs/InfoQuest/What_is_Info_Quest), an intelligent search and crawling toolset developed independently by BytePlus. A free online trial is available.
<a href="https://docs.byteplus.com/en/docs/InfoQuest/What_is_Info_Quest" target="_blank">
<img
src="https://sf16-sg.tiktokcdn.com/obj/eden-sg/hubseh7bsbps/20251208-160108.png" alt="InfoQuest_banner"
/>
</a>
--- ---
## Table of Contents ## Table of Contents
@@ -94,6 +104,7 @@ Learn more and see **real demos** on our official website.
TAVILY_API_KEY=your-tavily-api-key TAVILY_API_KEY=your-tavily-api-key
OPENAI_API_KEY=your-openai-api-key OPENAI_API_KEY=your-openai-api-key
# Add other provider keys as needed # Add other provider keys as needed
INFOQUEST_API_KEY=your-infoquest-api-key
``` ```
- Option B: Export environment variables in your shell - Option B: Export environment variables in your shell

View File

@@ -0,0 +1,312 @@
"""Util that calls InfoQuest Search And Fetch API.
In order to set this up, follow instructions at:
https://docs.byteplus.com/en/docs/InfoQuest/What_is_Info_Quest
"""
import json
import logging
import os
from typing import Any
import requests
logger = logging.getLogger(__name__)


class InfoQuestClient:
    """Client for interacting with the InfoQuest web search and fetch API.

    The API key is read from the ``INFOQUEST_API_KEY`` environment variable
    each time request headers are built. For the numeric options, any
    non-positive value (``-1`` by default) means "not set" and the option is
    left out of the request payload.

    Args:
        fetch_time: Sent as ``fetch_time`` with crawl requests when positive.
        fetch_timeout: Sent as ``timeout`` with crawl requests when positive.
        fetch_navigation_timeout: Sent as ``navi_timeout`` with crawl requests
            when positive.
        search_time_range: Sent as ``time_range`` with search requests when
            positive.
        request_timeout: Client-side HTTP timeout in seconds applied to every
            request so a stalled connection cannot block the caller forever.
    """

    FETCH_ENDPOINT = "https://reader.infoquest.bytepluses.com"
    SEARCH_ENDPOINT = "https://search.infoquest.bytepluses.com"
    DEFAULT_REQUEST_TIMEOUT = 60.0
    # Extra seconds granted on top of a configured crawl timeout so the client
    # does not give up at the same instant the service is about to answer.
    _TIMEOUT_GRACE_SECONDS = 5.0

    def __init__(
        self,
        fetch_time: int = -1,
        fetch_timeout: int = -1,
        fetch_navigation_timeout: int = -1,
        search_time_range: int = -1,
        request_timeout: float = DEFAULT_REQUEST_TIMEOUT,
    ):
        logger.info("\n============================================\n🚀 BytePlus InfoQuest Client Initialization 🚀\n============================================")

        self.fetch_time = fetch_time
        self.fetch_timeout = fetch_timeout
        self.fetch_navigation_timeout = fetch_navigation_timeout
        self.search_time_range = search_time_range
        self.request_timeout = request_timeout
        self.api_key_set = bool(os.getenv("INFOQUEST_API_KEY"))

        if logger.isEnabledFor(logging.DEBUG):
            config_details = (
                f"\n📋 Configuration Details:\n"
                f"├── Fetch time: {fetch_time} {'(Default: No fetch time)' if fetch_time == -1 else '(Custom)'}\n"
                f"├── Fetch Timeout: {fetch_timeout} {'(Default: No fetch timeout)' if fetch_timeout == -1 else '(Custom)'}\n"
                f"├── Navigation Timeout: {fetch_navigation_timeout} {'(Default: No Navigation Timeout)' if fetch_navigation_timeout == -1 else '(Custom)'}\n"
                f"├── Search Time Range: {search_time_range} {'(Default: No Search Time Range)' if search_time_range == -1 else '(Custom)'}\n"
                f"└── API Key: {'✅ Configured' if self.api_key_set else '❌ Not set'}"
            )
            logger.debug(config_details)
            logger.debug("\n" + "*" * 70 + "\n")

    def _http_timeout(self, server_budget: int = -1) -> float:
        """Return the client-side HTTP timeout (seconds) for one request.

        Args:
            server_budget: Timeout the service was asked to honour for this
                request; non-positive means none was requested.

        Returns:
            ``request_timeout``, raised if necessary so that it is never
            shorter than ``server_budget`` plus a small grace period.
        """
        if server_budget > 0:
            return max(self.request_timeout, server_budget + self._TIMEOUT_GRACE_SECONDS)
        return self.request_timeout

    def fetch(self, url: str, return_format: str = "html") -> str:
        """Crawl ``url`` through the InfoQuest reader API.

        Args:
            url: Page to crawl.
            return_format: Requested output format; ``"html"`` in any casing
                is sent as ``"HTML"``, other values are sent unchanged.

        Returns:
            The ``reader_result`` field of the JSON response, else its
            ``content`` field, else the raw response text. Any failure
            (non-200 status, empty body, transport error) is reported as a
            string starting with ``"Error: "`` instead of raising.
        """
        if logger.isEnabledFor(logging.DEBUG):
            url_truncated = url[:50] + "..." if len(url) > 50 else url
            logger.debug(
                f"InfoQuest - Fetch API request initiated | "
                f"operation=crawl url | "
                f"url_truncated={url_truncated} | "
                f"has_timeout_filter={self.fetch_timeout > 0} | timeout_filter={self.fetch_timeout} | "
                f"has_fetch_time_filter={self.fetch_time > 0} | fetch_time_filter={self.fetch_time} | "
                f"has_navigation_timeout_filter={self.fetch_navigation_timeout > 0} | navi_timeout_filter={self.fetch_navigation_timeout} | "
                f"request_type=sync"
            )

        headers = self._prepare_headers()
        data = self._prepare_crawl_request_data(url, return_format)

        logger.debug("Sending crawl request to InfoQuest API")
        try:
            response = requests.post(
                self.FETCH_ENDPOINT,
                headers=headers,
                json=data,
                timeout=self._http_timeout(self.fetch_timeout),
            )

            if response.status_code != 200:
                error_message = f"fetch API returned status {response.status_code}: {response.text}"
                logger.debug("InfoQuest Crawler fetch API return status %d: %s for URL: %s", response.status_code, response.text, url)
                return f"Error: {error_message}"

            if not response.text or not response.text.strip():
                error_message = "no result found"
                logger.debug("InfoQuest Crawler returned empty response for URL: %s", url)
                return f"Error: {error_message}"

            try:
                response_data = json.loads(response.text)
            except json.JSONDecodeError:
                # Not JSON: hand the body back untouched.
                logger.debug("Response is not in JSON format, returning as-is")
                return response.text

            # Only a JSON object can carry the expected fields; a bare JSON
            # string/number/array falls through to the raw-text return below.
            if isinstance(response_data, dict):
                if "reader_result" in response_data:
                    logger.debug("Successfully extracted reader_result from JSON response")
                    return response_data["reader_result"]
                if "content" in response_data:
                    logger.debug("reader_result missing in JSON response, falling back to content field: %s", response_data["content"])
                    return response_data["content"]
            logger.warning("Neither reader_result nor content field found in JSON response")

            if logger.isEnabledFor(logging.DEBUG):
                response_sample = response.text[:200] + ("..." if len(response.text) > 200 else "")
                logger.debug("Successfully received response, content length: %d bytes, first 200 chars: %s", len(response.text), response_sample)
            return response.text
        except Exception as e:
            # Tool boundary: report the failure as text rather than raising.
            error_message = f"fetch API failed: {str(e)}"
            logger.error(error_message)
            return f"Error: {error_message}"

    @staticmethod
    def _prepare_headers() -> dict[str, str]:
        """Build request headers, adding a bearer token when the key is set."""
        headers = {
            "Content-Type": "application/json",
        }
        api_key = os.getenv("INFOQUEST_API_KEY")
        if api_key:
            headers["Authorization"] = f"Bearer {api_key}"
            logger.debug("API key added to request headers")
        else:
            logger.warning("InfoQuest API key is not set. Provide your own key for authentication.")
        return headers

    def _prepare_crawl_request_data(self, url: str, return_format: str) -> dict[str, Any]:
        """Build the JSON payload for a crawl request.

        Only options configured with a positive value are included.
        """
        if return_format and return_format.lower() == "html":
            normalized_format = "HTML"
        else:
            normalized_format = return_format

        data: dict[str, Any] = {"url": url, "format": normalized_format}

        timeout_params = {}
        if self.fetch_time > 0:
            timeout_params["fetch_time"] = self.fetch_time
        if self.fetch_timeout > 0:
            timeout_params["timeout"] = self.fetch_timeout
        if self.fetch_navigation_timeout > 0:
            timeout_params["navi_timeout"] = self.fetch_navigation_timeout

        if timeout_params:
            logger.debug("Applying timeout parameters: %s", timeout_params)
            data.update(timeout_params)
        return data

    def web_search_raw_results(
        self,
        query: str,
        site: str,
        output_format: str = "JSON",
    ) -> dict:
        """Get results from the InfoQuest Web-Search API synchronously.

        Args:
            query: Search query.
            site: Optional site filter; an empty value sends no filter.
            output_format: Value of the ``format`` request field.

        Returns:
            The decoded JSON response.

        Raises:
            requests.HTTPError: If the service answers with an error status.
            requests.RequestException: On transport failures or timeout.
        """
        headers = self._prepare_headers()

        params: dict[str, Any] = {"format": output_format, "query": query}
        if self.search_time_range > 0:
            params["time_range"] = self.search_time_range
        if site:
            params["site"] = site

        response = requests.post(
            self.SEARCH_ENDPOINT,
            headers=headers,
            json=params,
            timeout=self._http_timeout(),
        )
        response.raise_for_status()

        response_json = response.json()
        if logger.isEnabledFor(logging.DEBUG):
            serialized = json.dumps(response_json)  # serialise once for the sample
            response_sample = serialized[:200] + ("..." if len(serialized) > 200 else "")
            logger.debug(f"Search API request completed successfully | service=InfoQuest | status=success | response_sample={response_sample}")
        return response_json

    @staticmethod
    def _iter_result_groups(raw_results):
        """Yield the ``content.results`` mapping of every well-formed entry.

        Entries lacking that structure are skipped so that one malformed item
        does not discard the remaining results.
        """
        for entry in raw_results or []:
            content = entry.get("content") if isinstance(entry, dict) else None
            results = content.get("results") if isinstance(content, dict) else None
            if isinstance(results, dict):
                yield results
            else:
                logger.debug("Skipping malformed search result entry")

    @staticmethod
    def clean_results(raw_results: list[dict[str, dict[str, dict[str, Any]]]]) -> list[dict]:
        """Flatten raw web-search entries into de-duplicated page/news items.

        Organic hits become ``{"type": "page", ...}`` items and need a URL;
        top stories become ``{"type": "news", ...}`` items and need both a
        title and a URL. An item whose URL was already seen is dropped.
        """
        logger.debug("Processing web-search results")

        seen_urls = set()
        clean_results = []
        counts = {"pages": 0, "news": 0}

        for results in InfoQuestClient._iter_result_groups(raw_results):
            for result in results.get("organic") or []:
                if not isinstance(result, dict):
                    continue
                url = result.get("url")
                if not (isinstance(url, str) and url) or url in seen_urls:
                    continue
                clean_result = {
                    "type": "page",
                }
                if "title" in result:
                    clean_result["title"] = result["title"]
                if "desc" in result:
                    clean_result["desc"] = result["desc"]
                    clean_result["snippet"] = result["desc"]
                clean_result["url"] = url
                seen_urls.add(url)
                clean_results.append(clean_result)
                counts["pages"] += 1

            top_stories = results.get("top_stories")
            news_items = top_stories.get("items") if isinstance(top_stories, dict) else None
            for obj in news_items or []:
                if not isinstance(obj, dict):
                    continue
                title = obj.get("title")
                url = obj.get("url")
                if not title or not (isinstance(url, str) and url) or url in seen_urls:
                    continue
                clean_result = {
                    "type": "news",
                }
                if "time_frame" in obj:
                    clean_result["time_frame"] = obj["time_frame"]
                if "source" in obj:
                    clean_result["source"] = obj["source"]
                clean_result["title"] = title
                clean_result["url"] = url
                seen_urls.add(url)
                clean_results.append(clean_result)
                counts["news"] += 1

        logger.debug(f"Results processing completed | total_results={len(clean_results)} | pages={counts['pages']} | news_items={counts['news']} | unique_urls={len(seen_urls)}")
        return clean_results

    def web_search(
        self,
        query: str,
        site: str = "",
        output_format: str = "JSON",
    ) -> str:
        """Run a web search and return the cleaned results as a JSON string.

        Args:
            query: Search query.
            site: Optional site filter.
            output_format: Value of the ``format`` request field.

        Returns:
            A JSON array of cleaned results on success. If the response has no
            ``search_result`` field: an ``"Error: ..."`` string when a
            ``content`` field is present, otherwise the raw response dumped as
            JSON. Any exception is reported as an ``"Error: ..."`` string.
        """
        if logger.isEnabledFor(logging.DEBUG):
            query_truncated = query[:50] + "..." if len(query) > 50 else query
            logger.debug(
                f"InfoQuest - Search API request initiated | "
                f"operation=search webs | "
                f"query_truncated={query_truncated} | "
                f"has_time_filter={self.search_time_range > 0} | time_filter={self.search_time_range} | "
                f"has_site_filter={bool(site)} | site={site} | "
                f"request_type=sync"
            )

        try:
            logger.debug("InfoQuest Web-Search - Executing search with parameters")
            raw_results = self.web_search_raw_results(
                query,
                site,
                output_format,
            )
            if "search_result" in raw_results:
                logger.debug("InfoQuest Web-Search - Successfully extracted search_result from JSON response")
                results = raw_results["search_result"]

                logger.debug("InfoQuest Web-Search - Processing raw search results")
                cleaned_results = self.clean_results(results["results"])

                result_json = json.dumps(cleaned_results, indent=2, ensure_ascii=False)
                logger.debug(f"InfoQuest Web-Search - Search tool execution completed | mode=synchronous | results_count={len(cleaned_results)}")
                return result_json
            elif "content" in raw_results:
                # Only a content field came back: treat it as a format error
                # and log what was received.
                error_message = "web search API return wrong format"
                logger.error("web search API return wrong format, search_result field missing in JSON response, content: %s", raw_results["content"])
                return f"Error: {error_message}"
            else:
                # Neither field exists: return the original response.
                logger.warning("InfoQuest Web-Search - Neither search_result nor content field found in JSON response")
                return json.dumps(raw_results, indent=2, ensure_ascii=False)
        except Exception as e:
            # Tool boundary: report the failure as text rather than raising.
            error_message = f"InfoQuest Web-Search - Search tool execution failed | mode=synchronous | error={str(e)}"
            logger.error(error_message)
            return f"Error: {error_message}"

    @staticmethod
    def clean_results_with_image_search(raw_results: list[dict[str, dict[str, dict[str, Any]]]]) -> list[dict]:
        """Flatten raw image-search entries into de-duplicated image items.

        Each item requires a non-empty string ``image_url`` that has not been
        seen before; ``thumbnail_url`` and ``url`` are copied when present.
        """
        logger.debug("Processing web-search results")

        seen_urls = set()
        clean_results = []
        counts = {"images": 0}

        for results in InfoQuestClient._iter_result_groups(raw_results):
            for result in results.get("images_results") or []:
                if not isinstance(result, dict):
                    continue
                image_url = result.get("image_url")
                if not (isinstance(image_url, str) and image_url) or image_url in seen_urls:
                    continue
                # Build the item completely before publishing it to the list.
                clean_result = {"image_url": image_url}
                if "thumbnail_url" in result:
                    clean_result["thumbnail_url"] = result["thumbnail_url"]
                if "url" in result:
                    clean_result["url"] = result["url"]
                seen_urls.add(image_url)
                clean_results.append(clean_result)
                counts["images"] += 1

        logger.debug(f"Results processing completed | total_results={len(clean_results)} | images={counts['images']} | unique_urls={len(seen_urls)}")
        return clean_results

View File

@@ -0,0 +1,63 @@
from langchain.tools import tool
from src.config import get_app_config
from src.utils.readability import ReadabilityExtractor
from .infoquest_client import InfoQuestClient
# Module-level extractor instance; web_fetch_tool passes fetched page content
# to its extract_article() before converting the article to markdown.
readability_extractor = ReadabilityExtractor()
def _get_infoquest_client() -> InfoQuestClient:
    """Build an InfoQuestClient from the app's tool configuration.

    ``search_time_range`` is read from the extra fields of the ``web_search``
    tool config; ``fetch_time``, ``timeout`` and ``navigation_timeout`` are
    read from those of ``web_fetch``. A missing tool config, a config without
    extra fields, a missing key, or an explicit null all fall back to ``-1``.

    Returns:
        A client configured with the resolved options.
    """
    app_config = get_app_config()

    def _option(tool_config, key: str):
        # model_extra may be None (no extra fields recorded), and an option
        # written without a value in the config file comes through as None.
        extras = getattr(tool_config, "model_extra", None) or {}
        value = extras.get(key)
        return -1 if value is None else value

    # Look up web_search before web_fetch, matching the original call order.
    search_config = app_config.get_tool_config("web_search")
    fetch_config = app_config.get_tool_config("web_fetch")

    return InfoQuestClient(
        search_time_range=_option(search_config, "search_time_range"),
        fetch_timeout=_option(fetch_config, "timeout"),
        fetch_navigation_timeout=_option(fetch_config, "navigation_timeout"),
        fetch_time=_option(fetch_config, "fetch_time"),
    )
@tool("web_search", parse_docstring=True)
def web_search_tool(query: str) -> str:
    """Search the web.

    Args:
        query: The query to search for.
    """
    # The docstring above is parsed by @tool; a fresh client is built from the
    # current app configuration on every call.
    return _get_infoquest_client().web_search(query)
@tool("web_fetch", parse_docstring=True)
def web_fetch_tool(url: str) -> str:
    """Fetch the contents of a web page at a given URL.
    Only fetch EXACT URLs that have been provided directly by the user or have been returned in results from the web_search and web_fetch tools.
    This tool can NOT access content that requires authentication, such as private Google Docs or pages behind login walls.
    Do NOT add www. to URLs that do NOT have them.
    URLs must include the schema: https://example.com is a valid URL while example.com is an invalid URL.

    Args:
        url: The URL to fetch the contents of.
    """
    # The docstring above is parsed by @tool, so it is kept verbatim.
    fetched = _get_infoquest_client().fetch(url)
    # The client reports failures as "Error: ..." strings; pass them through
    # unchanged instead of running them through the article extractor.
    if not fetched.startswith("Error: "):
        markdown = readability_extractor.extract_article(fetched).to_markdown()
        return markdown[:4096]
    return fetched

View File

@@ -0,0 +1,184 @@
"""Tests for InfoQuest client and tools."""
import json
from unittest.mock import MagicMock, patch
from src.community.infoquest import tools
from src.community.infoquest.infoquest_client import InfoQuestClient
class TestInfoQuestClient:
    """Unit tests for InfoQuestClient and the tools built on top of it."""

    @staticmethod
    def _response(status_code=200, text=None, payload=None):
        """Return a mocked HTTP response with only the given parts configured."""
        fake = MagicMock()
        fake.status_code = status_code
        if text is not None:
            fake.text = text
        if payload is not None:
            fake.json.return_value = payload
        return fake

    @staticmethod
    def _search_payload():
        """Return a minimal search response holding a single organic hit."""
        organic_hit = {"title": "Test Result", "desc": "Test description", "url": "https://example.com"}
        return {
            "search_result": {
                "results": [{"content": {"results": {"organic": [organic_hit]}}}],
                "images_results": [],
            }
        }

    def test_infoquest_client_initialization(self):
        """Defaults are -1; explicitly passed values are stored as given."""
        default_client = InfoQuestClient()
        assert default_client.fetch_time == -1
        assert default_client.fetch_timeout == -1
        assert default_client.fetch_navigation_timeout == -1
        assert default_client.search_time_range == -1

        custom_client = InfoQuestClient(fetch_time=10, fetch_timeout=30, fetch_navigation_timeout=60, search_time_range=24)
        assert custom_client.fetch_time == 10
        assert custom_client.fetch_timeout == 30
        assert custom_client.fetch_navigation_timeout == 60
        assert custom_client.search_time_range == 24

    @patch("src.community.infoquest.infoquest_client.requests.post")
    def test_fetch_success(self, mock_post):
        """A 200 response with reader_result yields that field's value."""
        page = "<html><body>Test content</body></html>"
        mock_post.return_value = self._response(text=json.dumps({"reader_result": page}))

        fetched = InfoQuestClient().fetch("https://example.com")

        assert fetched == page
        mock_post.assert_called_once()
        positional, keyword = mock_post.call_args
        assert positional[0] == "https://reader.infoquest.bytepluses.com"
        sent = keyword["json"]
        assert sent["url"] == "https://example.com"
        assert sent["format"] == "HTML"

    @patch("src.community.infoquest.infoquest_client.requests.post")
    def test_fetch_non_200_status(self, mock_post):
        """A non-200 status is reported as an error string."""
        mock_post.return_value = self._response(status_code=404, text="Not Found")

        fetched = InfoQuestClient().fetch("https://example.com")

        assert fetched == "Error: fetch API returned status 404: Not Found"

    @patch("src.community.infoquest.infoquest_client.requests.post")
    def test_fetch_empty_response(self, mock_post):
        """An empty body is reported as 'no result found'."""
        mock_post.return_value = self._response(text="")

        fetched = InfoQuestClient().fetch("https://example.com")

        assert fetched == "Error: no result found"

    @patch("src.community.infoquest.infoquest_client.requests.post")
    def test_web_search_raw_results_success(self, mock_post):
        """The raw search call posts the query and returns the decoded JSON."""
        mock_post.return_value = self._response(payload=self._search_payload())

        raw = InfoQuestClient().web_search_raw_results("test query", "")

        assert "search_result" in raw
        mock_post.assert_called_once()
        positional, keyword = mock_post.call_args
        assert positional[0] == "https://search.infoquest.bytepluses.com"
        assert keyword["json"]["query"] == "test query"

    @patch("src.community.infoquest.infoquest_client.requests.post")
    def test_web_search_success(self, mock_post):
        """web_search returns a JSON string containing the cleaned hit."""
        mock_post.return_value = self._response(payload=self._search_payload())

        output = InfoQuestClient().web_search("test query")

        # The result must be valid JSON carrying the expected content.
        hits = json.loads(output)
        assert len(hits) == 1
        first = hits[0]
        assert first["title"] == "Test Result"
        assert first["url"] == "https://example.com"

    def test_clean_results(self):
        """Organic hits and top stories are both kept, in that order."""
        organic = [{"title": "Test Page", "desc": "Page description", "url": "https://example.com/page1"}]
        stories = {"items": [{"title": "Test News", "source": "Test Source", "time_frame": "2 hours ago", "url": "https://example.com/news1"}]}
        raw_results = [{"content": {"results": {"organic": organic, "top_stories": stories}}}]

        cleaned = InfoQuestClient.clean_results(raw_results)

        assert len(cleaned) == 2
        page, news = cleaned
        assert page["type"] == "page"
        assert page["title"] == "Test Page"
        assert news["type"] == "news"
        assert news["title"] == "Test News"

    def test_clean_results_with_image_search(self):
        """Image results keep their image, thumbnail and page URLs."""
        image = {"image_url": "https://example.com/image1.jpg", "thumbnail_url": "https://example.com/thumb1.jpg", "url": "https://example.com/page1"}
        raw_results = [{"content": {"results": {"images_results": [image]}}}]

        cleaned = InfoQuestClient.clean_results_with_image_search(raw_results)

        assert len(cleaned) == 1
        only = cleaned[0]
        assert only["image_url"] == "https://example.com/image1.jpg"
        assert only["thumbnail_url"] == "https://example.com/thumb1.jpg"
        assert only["url"] == "https://example.com/page1"

    @patch("src.community.infoquest.tools._get_infoquest_client")
    def test_web_search_tool(self, mock_get_client):
        """The search tool delegates to the client's web_search."""
        fake_client = MagicMock()
        fake_client.web_search.return_value = json.dumps([])
        mock_get_client.return_value = fake_client

        output = tools.web_search_tool.run("test query")

        assert output == json.dumps([])
        mock_get_client.assert_called_once()
        fake_client.web_search.assert_called_once_with("test query")

    @patch("src.community.infoquest.tools._get_infoquest_client")
    def test_web_fetch_tool(self, mock_get_client):
        """The fetch tool converts the fetched HTML to markdown."""
        fake_client = MagicMock()
        fake_client.fetch.return_value = "<html><body>Test content</body></html>"
        mock_get_client.return_value = fake_client

        output = tools.web_fetch_tool.run("https://example.com")

        assert output == "# Untitled\n\nTest content"
        mock_get_client.assert_called_once()
        fake_client.fetch.assert_called_once_with("https://example.com")

    @patch("src.community.infoquest.tools.get_app_config")
    def test_get_infoquest_client(self, mock_get_app_config):
        """Client options are taken from the tool configs' extra fields."""
        search_tool_config = MagicMock(model_extra={"search_time_range": 24})
        fetch_tool_config = MagicMock(model_extra={"fetch_time": 10, "timeout": 30, "navigation_timeout": 60})
        app_config = MagicMock()
        # Successive get_tool_config calls return the search config, then the fetch config.
        app_config.get_tool_config.side_effect = [search_tool_config, fetch_tool_config]
        mock_get_app_config.return_value = app_config

        built = tools._get_infoquest_client()

        assert built.search_time_range == 24
        assert built.fetch_time == 10
        assert built.fetch_timeout == 30
        assert built.fetch_navigation_timeout == 60

    @patch("src.community.infoquest.infoquest_client.requests.post")
    def test_web_search_api_error(self, mock_post):
        """A transport failure surfaces as an error string, not an exception."""
        mock_post.side_effect = Exception("Connection error")

        output = InfoQuestClient().web_search("test query")

        assert "Error" in output

View File

@@ -127,12 +127,30 @@ tools:
max_results: 5 max_results: 5
# api_key: $TAVILY_API_KEY # Set if needed # api_key: $TAVILY_API_KEY # Set if needed
# Web search tool (requires InfoQuest API key)
# - name: web_search
# group: web
# use: src.community.infoquest.tools:web_search_tool
# # Limits search results to content within the specified time range. Set to -1 to disable time filtering.
# search_time_range: 10
# Web fetch tool (uses Jina AI reader) # Web fetch tool (uses Jina AI reader)
- name: web_fetch - name: web_fetch
group: web group: web
use: src.community.jina_ai.tools:web_fetch_tool use: src.community.jina_ai.tools:web_fetch_tool
timeout: 10 timeout: 10
# Web fetch tool (uses InfoQuest AI reader)
# - name: web_fetch
# group: web
# use: src.community.infoquest.tools:web_fetch_tool
# # Overall timeout for the entire crawling process (in seconds). Set to positive value to enable, -1 to disable
# timeout: 10
# # Waiting time after page loading (in seconds). Set to positive value to enable, -1 to disable
# fetch_time: 10
# # Timeout for navigating to the page (in seconds). Set to positive value to enable, -1 to disable
# navigation_timeout: 30
# Image search tool (uses DuckDuckGo) # Image search tool (uses DuckDuckGo)
# Use this to find reference images before image generation # Use this to find reference images before image generation
- name: image_search - name: image_search