Mirror of https://gitee.com/wanwujie/deer-flow, synced 2026-04-15 03:04:44 +08:00
feat: support infoquest (#708)
* support infoquest
* support html checker
* support html checker
* change line break format
* change line break format
* change line break format
* change line break format
* change line break format
* change line break format
* change line break format
* change line break format
* Fix several critical issues in the codebase
  - Resolve crawler panic by improving error handling
  - Fix plan validation to prevent invalid configurations
  - Correct InfoQuest crawler JSON conversion logic
* add test for infoquest
* add test for infoquest
* Add InfoQuest introduction to the README
* add test for infoquest
* fix readme for infoquest
* fix readme for infoquest
* resolve the conflict
* resolve the conflict
* resolve the conflict
* Fix formatting of INFOQUEST in SearchEngine enum
* Apply suggestions from code review
  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: Willem Jiang <143703838+willem-bd@users.noreply.github.com>
Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Committed by GitHub
parent e179fb1632
commit 7ec9e45702
@@ -11,6 +11,7 @@ load_dotenv()
class SearchEngine(enum.Enum):
    TAVILY = "tavily"
    INFOQUEST = "infoquest"
    DUCKDUCKGO = "duckduckgo"
    BRAVE_SEARCH = "brave_search"
    ARXIV = "arxiv"
@@ -18,10 +19,14 @@ class SearchEngine(enum.Enum):
    WIKIPEDIA = "wikipedia"


class CrawlerEngine(enum.Enum):
    JINA = "jina"
    INFOQUEST = "infoquest"


# Tool configuration
SELECTED_SEARCH_ENGINE = os.getenv("SEARCH_API", SearchEngine.TAVILY.value)


class RAGProvider(enum.Enum):
    DIFY = "dify"
    RAGFLOW = "ragflow"
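To illustrate how the new enum value is selected, here is a small usage sketch; the SEARCH_API variable, the Tavily default, and the load_dotenv() call are taken from the hunk above, the rest is illustrative only:

    import os

    # Illustrative only: choose InfoQuest before src.config.tools is imported
    # (in practice SEARCH_API is usually set in .env, which load_dotenv() reads).
    os.environ["SEARCH_API"] = "infoquest"

    from src.config.tools import SELECTED_SEARCH_ENGINE, SearchEngine

    assert SELECTED_SEARCH_ENGINE == SearchEngine.INFOQUEST.value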
@@ -4,9 +4,12 @@
import re
import logging

from .article import Article
from .jina_client import JinaClient
from .readability_extractor import ReadabilityExtractor
from src.config.tools import CrawlerEngine
from src.config import load_yaml_config
from src.crawler.article import Article
from src.crawler.infoquest_client import InfoQuestClient
from src.crawler.jina_client import JinaClient
from src.crawler.readability_extractor import ReadabilityExtractor

logger = logging.getLogger(__name__)

@@ -14,11 +17,11 @@ logger = logging.getLogger(__name__)
def safe_truncate(text: str, max_length: int = 500) -> str:
    """
    Safely truncate text to a maximum length without breaking multi-byte characters.

    Args:
        text: The text to truncate
        max_length: Maximum number of characters to keep

    Returns:
        Truncated text that is safe to use without encoding issues
    """
@@ -49,7 +52,7 @@ def safe_truncate(text: str, max_length: int = 500) -> str:
def is_html_content(content: str) -> bool:
    """
    Check if the provided content is HTML.

    Uses a more robust detection method that checks for common HTML patterns
    including DOCTYPE declarations, HTML tags, and other HTML markers.
    """
@@ -138,17 +141,21 @@ class Crawler:
        # them into text and image blocks for one single and unified
        # LLM message.
        #
        # Jina is not the best crawler on readability, however it's
        # much easier and free to use.
        # The system supports multiple crawler engines:
        # - Jina: an accessible solution, though with some limitations in readability extraction.
        # - InfoQuest: a BytePlus product offering advanced capabilities with configurable
        #   parameters such as fetch_time, timeout, and navi_timeout.
        #
        # Instead of using Jina's own markdown converter, we'll use
        # our own solution to get better readability results.
        try:
            jina_client = JinaClient()
            html = jina_client.crawl(url, return_format="html")
        except Exception as e:
            logger.error(f"Failed to fetch URL {url} from Jina: {repr(e)}")
            raise

        # Get crawler configuration
        config = load_yaml_config("conf.yaml")
        crawler_config = config.get("CRAWLER_ENGINE", {})

        # Get the selected crawler tool based on configuration
        crawler_client = self._select_crawler_tool(crawler_config)
        html = self._crawl_with_tool(crawler_client, url)

        # Check if we got valid HTML content
        if not html or not html.strip():
@@ -186,3 +193,44 @@ class Crawler:

        article.url = url
        return article

    def _select_crawler_tool(self, crawler_config: dict):
        # Only check the engine from the configuration file
        engine = crawler_config.get("engine", CrawlerEngine.JINA.value)

        if engine == CrawlerEngine.JINA.value:
            logger.info("Selecting Jina crawler engine")
            return JinaClient()
        elif engine == CrawlerEngine.INFOQUEST.value:
            logger.info("Selecting InfoQuest crawler engine")
            # Read timeout parameters directly from the crawler_config root level.
            # These parameters are only effective when the engine is set to "infoquest".
            fetch_time = crawler_config.get("fetch_time", -1)
            timeout = crawler_config.get("timeout", -1)
            navi_timeout = crawler_config.get("navi_timeout", -1)

            # Log the configuration being used
            if fetch_time > 0 or timeout > 0 or navi_timeout > 0:
                logger.debug(
                    f"Initializing InfoQuestClient with parameters: "
                    f"fetch_time={fetch_time}, "
                    f"timeout={timeout}, "
                    f"navi_timeout={navi_timeout}"
                )

            # Initialize InfoQuestClient with the parameters from configuration
            return InfoQuestClient(
                fetch_time=fetch_time,
                timeout=timeout,
                navi_timeout=navi_timeout
            )
        else:
            raise ValueError(f"Unsupported crawler engine: {engine}")

    def _crawl_with_tool(self, crawler_client, url: str) -> str:
        logger.info(f"Crawling URL: {url} using {crawler_client.__class__.__name__}")
        try:
            return crawler_client.crawl(url, return_format="html")
        except Exception as e:
            logger.error(f"Failed to fetch URL {url} using {crawler_client.__class__.__name__}: {repr(e)}")
            raise
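The crawler engine and its timeouts are read from the CRAWLER_ENGINE block of conf.yaml. Below is a minimal sketch of the parsed configuration that _select_crawler_tool() expects; the key names come from the code above, the concrete values are made up for illustration:

    # A sketch of the CRAWLER_ENGINE block that _select_crawler_tool() receives
    # after load_yaml_config("conf.yaml") has parsed it. Key names come from the
    # code above; the values below are illustrative only.
    crawler_config = {
        "engine": "infoquest",  # "jina" is the default when the key is absent
        "fetch_time": 30,       # only forwarded to InfoQuestClient when > 0
        "timeout": 60,
        "navi_timeout": 10,
    }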
153  src/crawler/infoquest_client.py  Normal file
@@ -0,0 +1,153 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

"""Util that calls InfoQuest Crawler API.

In order to set this up, follow instructions at:
https://docs.byteplus.com/en/docs/InfoQuest/What_is_Info_Quest
"""

import json
import logging
import os
from typing import Dict, Any

import requests

logger = logging.getLogger(__name__)


class InfoQuestClient:
    """Client for interacting with the InfoQuest web crawling API."""

    def __init__(self, fetch_time: int = -1, timeout: int = -1, navi_timeout: int = -1):
        logger.info(
            "\n============================================\n"
            "🚀 BytePlus InfoQuest Crawler Initialization 🚀\n"
            "============================================"
        )

        self.fetch_time = fetch_time
        self.timeout = timeout
        self.navi_timeout = navi_timeout
        self.api_key_set = bool(os.getenv("INFOQUEST_API_KEY"))

        config_details = (
            f"\n📋 Configuration Details:\n"
            f"├── Fetch Timeout: {fetch_time} {'(Default: No timeout)' if fetch_time == -1 else '(Custom)'}\n"
            f"├── Timeout: {timeout} {'(Default: No timeout)' if timeout == -1 else '(Custom)'}\n"
            f"├── Navigation Timeout: {navi_timeout} {'(Default: No timeout)' if navi_timeout == -1 else '(Custom)'}\n"
            f"└── API Key: {'✅ Configured' if self.api_key_set else '❌ Not set'}"
        )

        logger.info(config_details)
        logger.info("\n" + "*" * 70 + "\n")

    def crawl(self, url: str, return_format: str = "html") -> str:
        logger.debug("Preparing request for URL: %s", url)

        # Prepare headers
        headers = self._prepare_headers()

        # Prepare request data
        data = self._prepare_request_data(url, return_format)

        # Log request details
        logger.debug(
            "InfoQuest Crawler request prepared: endpoint=https://reader.infoquest.bytepluses.com, "
            "format=%s, has_api_key=%s",
            data.get("format"), self.api_key_set
        )

        logger.debug("Sending crawl request to InfoQuest API")
        try:
            response = requests.post(
                "https://reader.infoquest.bytepluses.com",
                headers=headers,
                json=data
            )

            # Check if status code is not 200
            if response.status_code != 200:
                error_message = f"InfoQuest API returned status {response.status_code}: {response.text}"
                logger.error(error_message)
                return f"Error: {error_message}"

            # Check for empty response
            if not response.text or not response.text.strip():
                error_message = "InfoQuest Crawler API returned empty response"
                logger.error("BytePlus InfoQuest Crawler returned empty response for URL: %s", url)
                return f"Error: {error_message}"

            # Try to parse response as JSON and extract reader_result
            try:
                response_data = json.loads(response.text)
                # Extract reader_result if it exists
                if "reader_result" in response_data:
                    logger.debug("Successfully extracted reader_result from JSON response")
                    return response_data["reader_result"]
                elif "content" in response_data:
                    # Fallback to content field if reader_result is not available
                    logger.debug("Using content field as fallback")
                    return response_data["content"]
                else:
                    # If neither field exists, return the original response
                    logger.warning("Neither reader_result nor content field found in JSON response")
            except json.JSONDecodeError:
                # If response is not JSON, return the original text
                logger.debug("Response is not in JSON format, returning as-is")

            # Print partial response for debugging
            if logger.isEnabledFor(logging.DEBUG):
                response_sample = response.text[:200] + ("..." if len(response.text) > 200 else "")
                logger.debug(
                    "Successfully received response, content length: %d bytes, first 200 chars: %s",
                    len(response.text), response_sample
                )
            return response.text
        except Exception as e:
            error_message = f"Request to InfoQuest API failed: {str(e)}"
            logger.error(error_message)
            return f"Error: {error_message}"

    def _prepare_headers(self) -> Dict[str, str]:
        """Prepare request headers."""
        headers = {
            "Content-Type": "application/json",
        }

        # Add API key if available
        if os.getenv("INFOQUEST_API_KEY"):
            headers["Authorization"] = f"Bearer {os.getenv('INFOQUEST_API_KEY')}"
            logger.debug("API key added to request headers")
        else:
            logger.warning(
                "InfoQuest API key is not set. Provide your own key for authentication."
            )

        return headers

    def _prepare_request_data(self, url: str, return_format: str) -> Dict[str, Any]:
        """Prepare request data with formatted parameters."""
        # Normalize return_format
        if return_format and return_format.lower() == "html":
            normalized_format = "HTML"
        else:
            normalized_format = return_format

        data = {"url": url, "format": normalized_format}

        # Add timeout parameters if set to positive values
        timeout_params = {}
        if self.fetch_time > 0:
            timeout_params["fetch_time"] = self.fetch_time
        if self.timeout > 0:
            timeout_params["timeout"] = self.timeout
        if self.navi_timeout > 0:
            timeout_params["navi_timeout"] = self.navi_timeout

        # Log applied timeout parameters
        if timeout_params:
            logger.debug("Applying timeout parameters: %s", timeout_params)
            data.update(timeout_params)

        return data
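A short, hedged usage sketch of the client added above; the constructor parameters, the INFOQUEST_API_KEY variable, and the "Error: " return convention come from this file, while the key value, the timeouts, and the URL are placeholders:

    import os

    from src.crawler.infoquest_client import InfoQuestClient

    os.environ.setdefault("INFOQUEST_API_KEY", "your-api-key")  # placeholder key

    client = InfoQuestClient(fetch_time=30, timeout=60, navi_timeout=10)
    html = client.crawl("https://example.com", return_format="html")

    # On failure the client returns a string starting with "Error: " instead of raising.
    if html.startswith("Error:"):
        print(html)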
@@ -22,12 +22,21 @@ class JinaClient:
                "Jina API key is not set. Provide your own key to access a higher rate limit. See https://jina.ai/reader for more information."
            )
        data = {"url": url}
        response = requests.post("https://r.jina.ai/", headers=headers, json=data)

        if response.status_code != 200:
            raise ValueError(f"Jina API returned status {response.status_code}: {response.text}")

        if not response.text or not response.text.strip():
            raise ValueError("Jina API returned empty response")

        return response.text
        try:
            response = requests.post("https://r.jina.ai/", headers=headers, json=data)

            if response.status_code != 200:
                error_message = f"Jina API returned status {response.status_code}: {response.text}"
                logger.error(error_message)
                return f"Error: {error_message}"

            if not response.text or not response.text.strip():
                error_message = "Jina API returned empty response"
                logger.error(error_message)
                return f"Error: {error_message}"

            return response.text
        except Exception as e:
            error_message = f"Request to Jina API failed: {str(e)}"
            logger.error(error_message)
            return f"Error: {error_message}"
4  src/tools/infoquest_search/__init__.py  Normal file
@@ -0,0 +1,4 @@
from .infoquest_search_api import InfoQuestAPIWrapper
from .infoquest_search_results import InfoQuestSearchResults

__all__ = ["InfoQuestAPIWrapper", "InfoQuestSearchResults"]
232  src/tools/infoquest_search/infoquest_search_api.py  Normal file
@@ -0,0 +1,232 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

"""Util that calls InfoQuest Search API.

In order to set this up, follow instructions at:
https://docs.byteplus.com/en/docs/InfoQuest/What_is_Info_Quest
"""

import json
from typing import Any, Dict, List

import aiohttp
import requests
from langchain_core.utils import get_from_dict_or_env
from pydantic import BaseModel, ConfigDict, SecretStr, model_validator
from src.config import load_yaml_config
import logging

logger = logging.getLogger(__name__)

INFOQUEST_API_URL = "https://search.infoquest.bytepluses.com"


def get_search_config():
    config = load_yaml_config("conf.yaml")
    search_config = config.get("SEARCH_ENGINE", {})
    return search_config


class InfoQuestAPIWrapper(BaseModel):
    """Wrapper for InfoQuest Search API."""

    infoquest_api_key: SecretStr
    model_config = ConfigDict(
        extra="forbid",
    )

    @model_validator(mode="before")
    @classmethod
    def validate_environment(cls, values: Dict) -> Any:
        """Validate that api key and endpoint exist in environment."""
        logger.info("Initializing BytePlus InfoQuest Product - Search API client")

        infoquest_api_key = get_from_dict_or_env(
            values, "infoquest_api_key", "INFOQUEST_API_KEY"
        )
        values["infoquest_api_key"] = infoquest_api_key

        logger.info("BytePlus InfoQuest Product - Environment validation successful")
        return values

    def raw_results(
        self,
        query: str,
        time_range: int,
        site: str,
        output_format: str = "JSON",
    ) -> Dict:
        """Get results from the InfoQuest Search API synchronously."""
        if logger.isEnabledFor(logging.DEBUG):
            query_truncated = query[:50] + "..." if len(query) > 50 else query
            logger.debug(
                f"InfoQuest - Search API request initiated | "
                f"operation=search | "
                f"query_truncated={query_truncated} | "
                f"has_time_filter={time_range > 0} | "
                f"has_site_filter={bool(site)} | "
                f"request_type=sync"
            )

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.infoquest_api_key.get_secret_value()}",
        }

        params = {
            "format": output_format,
            "query": query
        }
        if time_range > 0:
            params["time_range"] = time_range
            logger.debug(f"InfoQuest - Applying time range filter: time_range_days={time_range}")

        if site != "":
            params["site"] = site
            logger.debug(f"InfoQuest - Applying site filter: site={site}")

        response = requests.post(
            f"{INFOQUEST_API_URL}",
            headers=headers,
            json=params
        )
        response.raise_for_status()

        # Print partial response for debugging
        response_json = response.json()
        if logger.isEnabledFor(logging.DEBUG):
            response_sample = json.dumps(response_json)[:200] + ("..." if len(json.dumps(response_json)) > 200 else "")
            logger.debug(
                f"Search API request completed successfully | "
                f"service=InfoQuest | "
                f"status=success | "
                f"response_sample={response_sample}"
            )

        return response_json["search_result"]

    async def raw_results_async(
        self,
        query: str,
        time_range: int,
        site: str,
        output_format: str = "JSON",
    ) -> Dict:
        """Get results from the InfoQuest Search API asynchronously."""

        if logger.isEnabledFor(logging.DEBUG):
            query_truncated = query[:50] + "..." if len(query) > 50 else query
            logger.debug(
                f"BytePlus InfoQuest - Search API async request initiated | "
                f"operation=search | "
                f"query_truncated={query_truncated} | "
                f"has_time_filter={time_range > 0} | "
                f"has_site_filter={bool(site)} | "
                f"request_type=async"
            )

        # Function to perform the API call
        async def fetch() -> str:
            headers = {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {self.infoquest_api_key.get_secret_value()}",
            }
            params = {
                "format": output_format,
                "query": query,
            }
            if time_range > 0:
                params["time_range"] = time_range
                logger.debug(f"Applying time range filter in async request: {time_range} days")
            if site != "":
                params["site"] = site
                logger.debug(f"Applying site filter in async request: {site}")

            async with aiohttp.ClientSession(trust_env=True) as session:
                async with session.post(f"{INFOQUEST_API_URL}", headers=headers, json=params) as res:
                    if res.status == 200:
                        data = await res.text()
                        return data
                    else:
                        raise Exception(f"Error {res.status}: {res.reason}")

        results_json_str = await fetch()

        # Print partial response for debugging
        if logger.isEnabledFor(logging.DEBUG):
            response_sample = results_json_str[:200] + ("..." if len(results_json_str) > 200 else "")
            logger.debug(
                f"Async search API request completed successfully | "
                f"service=InfoQuest | "
                f"status=success | "
                f"response_sample={response_sample}"
            )
        return json.loads(results_json_str)["search_result"]

    def clean_results_with_images(
        self, raw_results: List[Dict[str, Dict[str, Dict[str, Any]]]]
    ) -> List[Dict]:
        """Clean results from InfoQuest Search API."""
        logger.debug("Processing search results")

        seen_urls = set()
        clean_results = []
        counts = {"pages": 0, "news": 0, "images": 0}

        for content_list in raw_results:
            content = content_list["content"]
            results = content["results"]

            if results.get("organic"):
                organic_results = results["organic"]
                for result in organic_results:
                    clean_result = {
                        "type": "page",
                        "title": result["title"],
                        "url": result["url"],
                        "desc": result["desc"],
                    }
                    url = clean_result["url"]
                    if isinstance(url, str) and url and url not in seen_urls:
                        seen_urls.add(url)
                        clean_results.append(clean_result)
                        counts["pages"] += 1

            if results.get("top_stories"):
                news = results["top_stories"]
                for obj in news["items"]:
                    clean_result = {
                        "type": "news",
                        "time_frame": obj["time_frame"],
                        "title": obj["title"],
                        "url": obj["url"],
                        "source": obj["source"],
                    }
                    url = clean_result["url"]
                    if isinstance(url, str) and url and url not in seen_urls:
                        seen_urls.add(url)
                        clean_results.append(clean_result)
                        counts["news"] += 1

            if results.get("images"):
                images = results["images"]
                for image in images["items"]:
                    clean_result = {
                        "type": "image_url",
                        "image_url": image["url"],
                        "image_description": image["alt"],
                    }
                    url = clean_result["image_url"]
                    if isinstance(url, str) and url and url not in seen_urls:
                        seen_urls.add(url)
                        clean_results.append(clean_result)
                        counts["images"] += 1

        logger.debug(
            f"Results processing completed | "
            f"total_results={len(clean_results)} | "
            f"pages={counts['pages']} | "
            f"news_items={counts['news']} | "
            f"images={counts['images']} | "
            f"unique_urls={len(seen_urls)}"
        )

        return clean_results
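For reference, a hedged sketch of how the wrapper is driven (the same call pattern the tool below uses); the argument order and the "results" key follow the code above, while the query string is only an example:

    from src.tools.infoquest_search.infoquest_search_api import InfoQuestAPIWrapper

    # The validator pulls INFOQUEST_API_KEY from the environment when no key is passed.
    wrapper = InfoQuestAPIWrapper()

    raw = wrapper.raw_results("who won the last french open", time_range=-1, site="")
    cleaned = wrapper.clean_results_with_images(raw["results"])
    for item in cleaned:
        print(item["type"], item.get("url") or item.get("image_url"))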
236  src/tools/infoquest_search/infoquest_search_results.py  Normal file
@@ -0,0 +1,236 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

"""Tool for the InfoQuest search API."""

import json
import logging
from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union
from langchain_core.callbacks import (
    AsyncCallbackManagerForToolRun,
    CallbackManagerForToolRun,
)
from langchain_core.tools import BaseTool
from pydantic import BaseModel, Field

from src.tools.infoquest_search.infoquest_search_api import InfoQuestAPIWrapper

logger = logging.getLogger(__name__)


class InfoQuestInput(BaseModel):
    """Input for the InfoQuest tool."""

    query: str = Field(description="search query to look up")


class InfoQuestSearchResults(BaseTool):
    """Tool that queries the InfoQuest Search API and returns processed results with images.

    Setup:
        Install required packages and set environment variable ``INFOQUEST_API_KEY``.

        .. code-block:: bash

            pip install -U langchain-community aiohttp
            export INFOQUEST_API_KEY="your-api-key"

    Instantiate:
        .. code-block:: python

            from src.tools.infoquest_search import InfoQuestSearchResults

            tool = InfoQuestSearchResults(
                output_format="json",
                time_range=10,
                site="nytimes.com"
            )

    Invoke directly with args:
        .. code-block:: python

            tool.invoke({
                'query': 'who won the last french open'
            })

        .. code-block:: json

            [
                {
                    "type": "page",
                    "title": "Djokovic Claims French Open Title...",
                    "url": "https://www.nytimes.com/...",
                    "desc": "Novak Djokovic won the 2024 French Open by defeating Casper Ruud..."
                },
                {
                    "type": "news",
                    "time_frame": "2 days ago",
                    "title": "French Open Finals Recap",
                    "url": "https://www.nytimes.com/...",
                    "source": "New York Times"
                },
                {
                    "type": "image_url",
                    "image_url": {"url": "https://www.nytimes.com/.../djokovic.jpg"},
                    "image_description": "Novak Djokovic celebrating his French Open victory"
                }
            ]

    Invoke with tool call:
        .. code-block:: python

            tool.invoke({
                "args": {
                    'query': 'who won the last french open',
                },
                "type": "tool_call",
                "id": "foo",
                "name": "infoquest"
            })

        .. code-block:: python

            ToolMessage(
                content='[
                    {"type": "page", "title": "Djokovic Claims...", "url": "https://www.nytimes.com/...", "desc": "Novak Djokovic won..."},
                    {"type": "news", "time_frame": "2 days ago", "title": "French Open Finals...", "url": "https://www.nytimes.com/...", "source": "New York Times"},
                    {"type": "image_url", "image_url": {"url": "https://www.nytimes.com/.../djokovic.jpg"}, "image_description": "Novak Djokovic celebrating..."}
                ]',
                tool_call_id='1',
                name='infoquest_search_results_json',
            )

    """  # noqa: E501

    name: str = "infoquest_search_results_json"
    description: str = (
        "A search engine optimized for comprehensive, accurate, and trusted results. "
        "Useful for when you need to answer questions about current events. "
        "Input should be a search query."
    )
    args_schema: Type[BaseModel] = InfoQuestInput
    """The tool response format."""

    time_range: int = -1
    """Time range for filtering search results, in days.

    If set to a positive integer (e.g., 30), only results from the last N days will be included.
    Default is -1, which means no time range filter is applied.
    """

    site: str = ""
    """Specific domain to restrict search results to (e.g., "nytimes.com").

    If provided, only results from the specified domain will be returned.
    Default is an empty string, which means no domain restriction is applied.
    """

    api_wrapper: InfoQuestAPIWrapper = Field(default_factory=InfoQuestAPIWrapper)  # type: ignore[arg-type]
    response_format: Literal["content_and_artifact"] = "content_and_artifact"

    def __init__(self, **kwargs: Any) -> None:
        # Create api_wrapper with infoquest_api_key if provided
        if "infoquest_api_key" in kwargs:
            kwargs["api_wrapper"] = InfoQuestAPIWrapper(
                infoquest_api_key=kwargs["infoquest_api_key"]
            )
            logger.debug("API wrapper initialized with provided key")

        super().__init__(**kwargs)

        logger.info(
            "\n============================================\n"
            "🚀 BytePlus InfoQuest Search Initialization 🚀\n"
            "============================================"
        )

        # Prepare initialization details
        time_range_status = f"{self.time_range} days" if hasattr(self, 'time_range') and self.time_range > 0 else "Disabled"
        site_filter = f"'{self.site}'" if hasattr(self, 'site') and self.site else "Disabled"

        initialization_details = (
            f"\n🔧 Tool Information:\n"
            f"├── Tool Name: {self.name}\n"
            f"├── Time Range Filter: {time_range_status}\n"
            f"└── Site Filter: {site_filter}\n"
            f"📊 Configuration Summary:\n"
            f"├── Response Format: {self.response_format}\n"
        )

        logger.info(initialization_details)
        logger.info("\n" + "*" * 70 + "\n")

    def _run(
        self,
        query: str,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> Tuple[Union[List[Dict[str, str]], str], Dict]:
        """Use the tool."""
        try:
            logger.debug(f"Executing search with parameters: time_range={self.time_range}, site={self.site}")
            raw_results = self.api_wrapper.raw_results(
                query,
                self.time_range,
                self.site
            )
            logger.debug("Processing raw search results")
            cleaned_results = self.api_wrapper.clean_results_with_images(raw_results["results"])

            result_json = json.dumps(cleaned_results, ensure_ascii=False)

            logger.info(
                f"Search tool execution completed | "
                f"mode=synchronous | "
                f"results_count={len(cleaned_results)}"
            )
            return result_json, raw_results
        except Exception as e:
            logger.error(
                f"Search tool execution failed | "
                f"mode=synchronous | "
                f"error={str(e)}"
            )
            error_result = json.dumps({"error": repr(e)}, ensure_ascii=False)
            return error_result, {}

    async def _arun(
        self,
        query: str,
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
    ) -> Tuple[Union[List[Dict[str, str]], str], Dict]:
        """Use the tool asynchronously."""
        if logger.isEnabledFor(logging.DEBUG):
            query_truncated = query[:50] + "..." if len(query) > 50 else query
            logger.debug(
                f"Search tool execution started | "
                f"mode=asynchronous | "
                f"query={query_truncated}"
            )
        try:
            logger.debug(f"Executing async search with parameters: time_range={self.time_range}, site={self.site}")

            raw_results = await self.api_wrapper.raw_results_async(
                query,
                self.time_range,
                self.site
            )

            logger.debug("Processing raw async search results")
            cleaned_results = self.api_wrapper.clean_results_with_images(raw_results["results"])

            result_json = json.dumps(cleaned_results, ensure_ascii=False)

            logger.debug(
                f"Search tool execution completed | "
                f"mode=asynchronous | "
                f"results_count={len(cleaned_results)}"
            )

            return result_json, raw_results
        except Exception as e:
            logger.error(
                f"Search tool execution failed | "
                f"mode=asynchronous | "
                f"error={str(e)}"
            )
            error_result = json.dumps({"error": repr(e)}, ensure_ascii=False)
            return error_result, {}
@@ -21,6 +21,7 @@ from langchain_community.utilities import (

from src.config import SELECTED_SEARCH_ENGINE, SearchEngine, load_yaml_config
from src.tools.decorators import create_logged_tool
from src.tools.infoquest_search.infoquest_search_results import InfoQuestSearchResults
from src.tools.tavily_search.tavily_search_results_with_images import (
    TavilySearchWithImages,
)
@@ -29,6 +30,7 @@ logger = logging.getLogger(__name__)

# Create logged versions of the search tools
LoggedTavilySearch = create_logged_tool(TavilySearchWithImages)
LoggedInfoQuestSearch = create_logged_tool(InfoQuestSearchResults)
LoggedDuckDuckGoSearch = create_logged_tool(DuckDuckGoSearchResults)
LoggedBraveSearch = create_logged_tool(BraveSearch)
LoggedArxivSearch = create_logged_tool(ArxivQueryRun)
@@ -76,6 +78,17 @@ def get_web_search_tool(max_search_results: int):
            include_domains=include_domains,
            exclude_domains=exclude_domains,
        )
    elif SELECTED_SEARCH_ENGINE == SearchEngine.INFOQUEST.value:
        time_range = search_config.get("time_range", -1)
        site = search_config.get("site", "")
        logger.info(
            f"InfoQuest search configuration loaded: time_range={time_range}, site={site}"
        )
        return LoggedInfoQuestSearch(
            name="web_search",
            time_range=time_range,
            site=site,
        )
    elif SELECTED_SEARCH_ENGINE == SearchEngine.DUCKDUCKGO.value:
        return LoggedDuckDuckGoSearch(
            name="web_search",
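Finally, a hedged end-to-end sketch of the InfoQuest path through get_web_search_tool; the time_range and site keys come from the hunk above, while the module path, the conf.yaml layout, and the query are assumptions for illustration:

    from src.tools.search import get_web_search_tool  # module path assumed

    # Assuming conf.yaml contains a block along these lines (illustrative values):
    #   SEARCH_ENGINE:
    #     time_range: 30      # only results from the last 30 days
    #     site: nytimes.com   # restrict results to a single domain
    # and SEARCH_API=infoquest, the factory returns the logged InfoQuest tool:
    web_search = get_web_search_tool(max_search_results=3)
    results_json = web_search.invoke({"query": "who won the last french open"})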