feat: support infoquest (#708)

* support infoquest

* support html checker

* support html checker

* change line break format

* change line break format

* change line break format

* change line break format

* change line break format

* change line break format

* change line break format

* change line break format

* Fix several critical issues in the codebase
- Resolve crawler panic by improving error handling
- Fix plan validation to prevent invalid configurations
- Correct InfoQuest crawler JSON conversion logic

* add test for infoquest

* add test for infoquest

* Add InfoQuest introduction to the README

* add test for infoquest

* fix readme for infoquest

* fix readme for infoquest

* resolve the conflict

* resolve the conflict

* resolve the conflict

* Fix formatting of INFOQUEST in SearchEngine enum

* Apply suggestions from code review

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: Willem Jiang <143703838+willem-bd@users.noreply.github.com>
Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
infoquest-byteplus
2025-12-02 08:16:35 +08:00
committed by GitHub
parent e179fb1632
commit 7ec9e45702
22 changed files with 2103 additions and 94 deletions

View File

@@ -11,6 +11,7 @@ load_dotenv()
class SearchEngine(enum.Enum):
TAVILY = "tavily"
INFOQUEST = "infoquest"
DUCKDUCKGO = "duckduckgo"
BRAVE_SEARCH = "brave_search"
ARXIV = "arxiv"
@@ -18,10 +19,14 @@ class SearchEngine(enum.Enum):
WIKIPEDIA = "wikipedia"
class CrawlerEngine(enum.Enum):
    """Supported crawler backends for fetching page HTML."""

    JINA = "jina"  # Jina Reader service
    INFOQUEST = "infoquest"  # BytePlus InfoQuest reader
# Tool configuration
SELECTED_SEARCH_ENGINE = os.getenv("SEARCH_API", SearchEngine.TAVILY.value)
class RAGProvider(enum.Enum):
    """Supported RAG (retrieval) providers."""

    DIFY = "dify"
    RAGFLOW = "ragflow"

View File

@@ -4,9 +4,12 @@
import re
import logging
from .article import Article
from .jina_client import JinaClient
from .readability_extractor import ReadabilityExtractor
from src.config.tools import CrawlerEngine
from src.config import load_yaml_config
from src.crawler.article import Article
from src.crawler.infoquest_client import InfoQuestClient
from src.crawler.jina_client import JinaClient
from src.crawler.readability_extractor import ReadabilityExtractor
logger = logging.getLogger(__name__)
@@ -14,11 +17,11 @@ logger = logging.getLogger(__name__)
def safe_truncate(text: str, max_length: int = 500) -> str:
"""
Safely truncate text to a maximum length without breaking multi-byte characters.
Args:
text: The text to truncate
max_length: Maximum number of characters to keep
Returns:
Truncated text that is safe to use without encoding issues
"""
@@ -49,7 +52,7 @@ def safe_truncate(text: str, max_length: int = 500) -> str:
def is_html_content(content: str) -> bool:
"""
Check if the provided content is HTML.
Uses a more robust detection method that checks for common HTML patterns
including DOCTYPE declarations, HTML tags, and other HTML markers.
"""
@@ -138,17 +141,21 @@ class Crawler:
# them into text and image blocks for one single and unified
# LLM message.
#
# Jina is not the best crawler on readability, however it's
# much easier and free to use.
# The system supports multiple crawler engines:
# - Jina: An accessible solution, though with some limitations in readability extraction
# - InfoQuest: A BytePlus product offering advanced capabilities with configurable parameters
# like fetch_time, timeout, and navi_timeout.
#
# Instead of using Jina's own markdown converter, we'll use
# our own solution to get better readability results.
try:
jina_client = JinaClient()
html = jina_client.crawl(url, return_format="html")
except Exception as e:
logger.error(f"Failed to fetch URL {url} from Jina: {repr(e)}")
raise
# Get crawler configuration
config = load_yaml_config("conf.yaml")
crawler_config = config.get("CRAWLER_ENGINE", {})
# Get the selected crawler tool based on configuration
crawler_client = self._select_crawler_tool(crawler_config)
html = self._crawl_with_tool(crawler_client, url)
# Check if we got valid HTML content
if not html or not html.strip():
@@ -186,3 +193,44 @@ class Crawler:
article.url = url
return article
def _select_crawler_tool(self, crawler_config: dict):
    """Pick and construct a crawler client based on configuration.

    Args:
        crawler_config: The ``CRAWLER_ENGINE`` section of ``conf.yaml``.
            Only the ``engine`` key selects the client; the timeout keys
            are read only when the engine is InfoQuest.

    Returns:
        A ``JinaClient`` or ``InfoQuestClient`` instance.

    Raises:
        ValueError: If the configured engine is not supported.
    """
    # Only check engine from configuration file; Jina is the default.
    engine = crawler_config.get("engine", CrawlerEngine.JINA.value)
    if engine == CrawlerEngine.JINA.value:
        # Fix: dropped the pointless f-prefix on constant log strings.
        logger.info("Selecting Jina crawler engine")
        return JinaClient()
    elif engine == CrawlerEngine.INFOQUEST.value:
        logger.info("Selecting InfoQuest crawler engine")
        # Timeout parameters live at the root of the CRAWLER_ENGINE section.
        # They are only effective when engine is set to "infoquest";
        # -1 means "not configured".
        fetch_time = crawler_config.get("fetch_time", -1)
        timeout = crawler_config.get("timeout", -1)
        navi_timeout = crawler_config.get("navi_timeout", -1)
        # Log the configuration being used (lazy %-args so the message is
        # only formatted when DEBUG is enabled).
        if fetch_time > 0 or timeout > 0 or navi_timeout > 0:
            logger.debug(
                "Initializing InfoQuestCrawler with parameters: "
                "fetch_time=%s, timeout=%s, navi_timeout=%s",
                fetch_time, timeout, navi_timeout,
            )
        # Initialize InfoQuestClient with the parameters from configuration.
        return InfoQuestClient(
            fetch_time=fetch_time,
            timeout=timeout,
            navi_timeout=navi_timeout,
        )
    else:
        raise ValueError(f"Unsupported crawler engine: {engine}")
def _crawl_with_tool(self, crawler_client, url: str) -> str:
    """Fetch *url* as HTML via *crawler_client*, re-raising any failure.

    Args:
        crawler_client: Any object exposing ``crawl(url, return_format=...)``
            (``JinaClient`` or ``InfoQuestClient``).
        url: Address of the page to fetch.

    Returns:
        The raw HTML string returned by the client.

    Raises:
        Exception: Whatever the underlying client raises; logged, then
            propagated so the caller decides how to recover.
    """
    # Lazy %-args instead of eager f-strings so formatting only happens
    # when the record is actually emitted.
    logger.info("Crawling URL: %s using %s", url, crawler_client.__class__.__name__)
    try:
        return crawler_client.crawl(url, return_format="html")
    except Exception as e:
        logger.error(
            "Failed to fetch URL %s using %s: %r",
            url, crawler_client.__class__.__name__, e,
        )
        raise

View File

@@ -0,0 +1,153 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
"""Util that calls InfoQuest Crawler API.
In order to set this up, follow instructions at:
https://docs.byteplus.com/en/docs/InfoQuest/What_is_Info_Quest
"""
import json
import logging
import os
from typing import Dict, Any
import requests
logger = logging.getLogger(__name__)
class InfoQuestClient:
"""Client for interacting with the InfoQuest web crawling API."""
def __init__(self, fetch_time: int = -1, timeout: int = -1, navi_timeout: int = -1):
logger.info(
"\n============================================\n"
"🚀 BytePlus InfoQuest Crawler Initialization 🚀\n"
"============================================"
)
self.fetch_time = fetch_time
self.timeout = timeout
self.navi_timeout = navi_timeout
self.api_key_set = bool(os.getenv("INFOQUEST_API_KEY"))
config_details = (
f"\n📋 Configuration Details:\n"
f"├── Fetch Timeout: {fetch_time} {'(Default: No timeout)' if fetch_time == -1 else '(Custom)'}\n"
f"├── Timeout: {timeout} {'(Default: No timeout)' if timeout == -1 else '(Custom)'}\n"
f"├── Navigation Timeout: {navi_timeout} {'(Default: No timeout)' if navi_timeout == -1 else '(Custom)'}\n"
f"└── API Key: {'✅ Configured' if self.api_key_set else '❌ Not set'}"
)
logger.info(config_details)
logger.info("\n" + "*" * 70 + "\n")
def crawl(self, url: str, return_format: str = "html") -> str:
logger.debug("Preparing request for URL: %s", url)
# Prepare headers
headers = self._prepare_headers()
# Prepare request data
data = self._prepare_request_data(url, return_format)
# Log request details
logger.debug(
"InfoQuest Crawler request prepared: endpoint=https://reader.infoquest.bytepluses.com, "
"format=%s, has_api_key=%s",
data.get("format"), self.api_key_set
)
logger.debug("Sending crawl request to InfoQuest API")
try:
response = requests.post(
"https://reader.infoquest.bytepluses.com",
headers=headers,
json=data
)
# Check if status code is not 200
if response.status_code != 200:
error_message = f"InfoQuest API returned status {response.status_code}: {response.text}"
logger.error(error_message)
return f"Error: {error_message}"
# Check for empty response
if not response.text or not response.text.strip():
error_message = "InfoQuest Crawler API returned empty response"
logger.error("BytePlus InfoQuest Crawler returned empty response for URL: %s", url)
return f"Error: {error_message}"
# Try to parse response as JSON and extract reader_result
try:
response_data = json.loads(response.text)
# Extract reader_result if it exists
if "reader_result" in response_data:
logger.debug("Successfully extracted reader_result from JSON response")
return response_data["reader_result"]
elif "content" in response_data:
# Fallback to content field if reader_result is not available
logger.debug("Using content field as fallback")
return response_data["content"]
else:
# If neither field exists, return the original response
logger.warning("Neither reader_result nor content field found in JSON response")
except json.JSONDecodeError:
# If response is not JSON, return the original text
logger.debug("Response is not in JSON format, returning as-is")
# Print partial response for debugging
if logger.isEnabledFor(logging.DEBUG):
response_sample = response.text[:200] + ("..." if len(response.text) > 200 else "")
logger.debug(
"Successfully received response, content length: %d bytes, first 200 chars: %s",
len(response.text), response_sample
)
return response.text
except Exception as e:
error_message = f"Request to InfoQuest API failed: {str(e)}"
logger.error(error_message)
return f"Error: {error_message}"
def _prepare_headers(self) -> Dict[str, str]:
"""Prepare request headers."""
headers = {
"Content-Type": "application/json",
}
# Add API key if available
if os.getenv("INFOQUEST_API_KEY"):
headers["Authorization"] = f"Bearer {os.getenv('INFOQUEST_API_KEY')}"
logger.debug("API key added to request headers")
else:
logger.warning(
"InfoQuest API key is not set. Provide your own key for authentication."
)
return headers
def _prepare_request_data(self, url: str, return_format: str) -> Dict[str, Any]:
"""Prepare request data with formatted parameters."""
# Normalize return_format
if return_format and return_format.lower() == "html":
normalized_format = "HTML"
else:
normalized_format = return_format
data = {"url": url, "format": normalized_format}
# Add timeout parameters if set to positive values
timeout_params = {}
if self.fetch_time > 0:
timeout_params["fetch_time"] = self.fetch_time
if self.timeout > 0:
timeout_params["timeout"] = self.timeout
if self.navi_timeout > 0:
timeout_params["navi_timeout"] = self.navi_timeout
# Log applied timeout parameters
if timeout_params:
logger.debug("Applying timeout parameters: %s", timeout_params)
data.update(timeout_params)
return data

View File

@@ -22,12 +22,21 @@ class JinaClient:
"Jina API key is not set. Provide your own key to access a higher rate limit. See https://jina.ai/reader for more information."
)
data = {"url": url}
response = requests.post("https://r.jina.ai/", headers=headers, json=data)
if response.status_code != 200:
raise ValueError(f"Jina API returned status {response.status_code}: {response.text}")
if not response.text or not response.text.strip():
raise ValueError("Jina API returned empty response")
return response.text
try:
response = requests.post("https://r.jina.ai/", headers=headers, json=data)
if response.status_code != 200:
error_message = f"Jina API returned status {response.status_code}: {response.text}"
logger.error(error_message)
return f"Error: {error_message}"
if not response.text or not response.text.strip():
error_message = "Jina API returned empty response"
logger.error(error_message)
return f"Error: {error_message}"
return response.text
except Exception as e:
error_message = f"Request to Jina API failed: {str(e)}"
logger.error(error_message)
return f"Error: {error_message}"

View File

@@ -0,0 +1,4 @@
from .infoquest_search_api import InfoQuestAPIWrapper
from .infoquest_search_results import InfoQuestSearchResults
__all__ = ["InfoQuestAPIWrapper", "InfoQuestSearchResults"]

View File

@@ -0,0 +1,232 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
"""Util that calls InfoQuest Search API.
In order to set this up, follow instructions at:
https://docs.byteplus.com/en/docs/InfoQuest/What_is_Info_Quest
"""
import json
from typing import Any, Dict, List
import aiohttp
import requests
from langchain_core.utils import get_from_dict_or_env
from pydantic import BaseModel, ConfigDict, SecretStr, model_validator
from src.config import load_yaml_config
import logging
logger = logging.getLogger(__name__)
INFOQUEST_API_URL = "https://search.infoquest.bytepluses.com"
def get_search_config():
    """Return the ``SEARCH_ENGINE`` section of ``conf.yaml``.

    Returns:
        The section as a dict, or an empty dict when the key is absent.
    """
    conf = load_yaml_config("conf.yaml")
    return conf.get("SEARCH_ENGINE", {})
class InfoQuestAPIWrapper(BaseModel):
    """Wrapper for the InfoQuest Search API.

    Authenticates with ``infoquest_api_key`` (or the ``INFOQUEST_API_KEY``
    environment variable), exposes synchronous and asynchronous search calls,
    and post-processes raw responses into a flat, de-duplicated result list.
    """

    # Secret Bearer token for the search endpoint.
    infoquest_api_key: SecretStr

    model_config = ConfigDict(
        extra="forbid",
    )

    @model_validator(mode="before")
    @classmethod
    def validate_environment(cls, values: Dict) -> Any:
        """Validate that api key and endpoint exists in environment."""
        logger.info("Initializing BytePlus InfoQuest Product - Search API client")
        infoquest_api_key = get_from_dict_or_env(
            values, "infoquest_api_key", "INFOQUEST_API_KEY"
        )
        values["infoquest_api_key"] = infoquest_api_key
        logger.info("BytePlus InfoQuest Product - Environment validation successful")
        return values

    def _build_headers(self) -> Dict[str, str]:
        """Build the JSON content-type plus Bearer-auth request headers."""
        return {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.infoquest_api_key.get_secret_value()}",
        }

    def _build_params(
        self, query: str, time_range: int, site: str, output_format: str
    ) -> Dict[str, Any]:
        """Build the request body; optional filters are added only when set.

        Shared by the sync and async paths so both always send identical
        payloads (previously the two paths duplicated this logic).
        """
        params: Dict[str, Any] = {
            "format": output_format,
            "query": query,
        }
        if time_range > 0:
            params["time_range"] = time_range
            logger.debug("InfoQuest - Applying time range filter: time_range_days=%s", time_range)
        if site != "":
            params["site"] = site
            logger.debug("InfoQuest - Applying site filter: site=%s", site)
        return params

    def raw_results(
        self,
        query: str,
        time_range: int,
        site: str,
        output_format: str = "JSON",
    ) -> Dict:
        """Get results from the InfoQuest Search API synchronously.

        Args:
            query: Search query text.
            time_range: Keep only results from the last N days when positive.
            site: Restrict results to this domain when non-empty.
            output_format: Response format requested from the API.

        Returns:
            The ``search_result`` object from the API response.

        Raises:
            requests.HTTPError: On a non-2xx HTTP status.
            KeyError: If the response lacks a ``search_result`` field.
        """
        if logger.isEnabledFor(logging.DEBUG):
            query_truncated = query[:50] + "..." if len(query) > 50 else query
            logger.debug(
                "InfoQuest - Search API request initiated | operation=search | "
                "query_truncated=%s | has_time_filter=%s | has_site_filter=%s | "
                "request_type=sync",
                query_truncated, time_range > 0, bool(site),
            )
        response = requests.post(
            INFOQUEST_API_URL,
            headers=self._build_headers(),
            json=self._build_params(query, time_range, site, output_format),
            # Fix: bound the request; requests has no default timeout, so a
            # stalled server would otherwise hang the caller forever.
            timeout=60,
        )
        response.raise_for_status()
        response_json = response.json()
        if logger.isEnabledFor(logging.DEBUG):
            # Serialize once (was serialized twice) just to log a short sample.
            serialized = json.dumps(response_json)
            response_sample = serialized[:200] + ("..." if len(serialized) > 200 else "")
            logger.debug(
                "Search API request completed successfully | service=InfoQuest | "
                "status=success | response_sample=%s",
                response_sample,
            )
        return response_json["search_result"]

    async def raw_results_async(
        self,
        query: str,
        time_range: int,
        site: str,
        output_format: str = "JSON",
    ) -> Dict:
        """Get results from the InfoQuest Search API asynchronously.

        Same contract as :meth:`raw_results`, but performed over aiohttp.

        Raises:
            Exception: With the HTTP status and reason on a non-200 response.
        """
        if logger.isEnabledFor(logging.DEBUG):
            query_truncated = query[:50] + "..." if len(query) > 50 else query
            logger.debug(
                "BytePlus InfoQuest - Search API async request initiated | "
                "operation=search | query_truncated=%s | has_time_filter=%s | "
                "has_site_filter=%s | request_type=async",
                query_truncated, time_range > 0, bool(site),
            )

        # Inner coroutine performing the actual API call.
        async def fetch() -> str:
            headers = self._build_headers()
            params = self._build_params(query, time_range, site, output_format)
            # trust_env=True honors proxy settings from the environment.
            async with aiohttp.ClientSession(trust_env=True) as session:
                async with session.post(INFOQUEST_API_URL, headers=headers, json=params) as res:
                    if res.status == 200:
                        data = await res.text()
                        return data
                    else:
                        raise Exception(f"Error {res.status}: {res.reason}")

        results_json_str = await fetch()
        if logger.isEnabledFor(logging.DEBUG):
            response_sample = results_json_str[:200] + ("..." if len(results_json_str) > 200 else "")
            logger.debug(
                "Async search API request completed successfully | "
                "service=InfoQuest | status=success | response_sample=%s",
                response_sample,
            )
        return json.loads(results_json_str)["search_result"]

    def clean_results_with_images(
        self, raw_results: List[Dict[str, Dict[str, Dict[str, Any]]]]
    ) -> List[Dict]:
        """Flatten raw search results into typed entries, de-duplicated by URL.

        Produces ``page``, ``news`` and ``image_url`` entries from the
        ``organic``, ``top_stories`` and ``images`` sections respectively.
        """
        logger.debug("Processing search results")
        seen_urls = set()
        clean_results = []
        counts = {"pages": 0, "news": 0, "images": 0}
        for content_list in raw_results:
            content = content_list["content"]
            results = content["results"]
            if results.get("organic"):
                organic_results = results["organic"]
                for result in organic_results:
                    clean_result = {
                        "type": "page",
                        "title": result["title"],
                        "url": result["url"],
                        "desc": result["desc"],
                    }
                    url = clean_result["url"]
                    # Keep only the first occurrence of each URL.
                    if isinstance(url, str) and url and url not in seen_urls:
                        seen_urls.add(url)
                        clean_results.append(clean_result)
                        counts["pages"] += 1
            if results.get("top_stories"):
                news = results["top_stories"]
                for obj in news["items"]:
                    clean_result = {
                        "type": "news",
                        "time_frame": obj["time_frame"],
                        "title": obj["title"],
                        "url": obj["url"],
                        "source": obj["source"],
                    }
                    url = clean_result["url"]
                    if isinstance(url, str) and url and url not in seen_urls:
                        seen_urls.add(url)
                        clean_results.append(clean_result)
                        counts["news"] += 1
            if results.get("images"):
                images = results["images"]
                for image in images["items"]:
                    clean_result = {
                        "type": "image_url",
                        "image_url": image["url"],
                        "image_description": image["alt"],
                    }
                    url = clean_result["image_url"]
                    if isinstance(url, str) and url and url not in seen_urls:
                        seen_urls.add(url)
                        clean_results.append(clean_result)
                        counts["images"] += 1
        logger.debug(
            "Results processing completed | total_results=%d | pages=%d | "
            "news_items=%d | images=%d | unique_urls=%d",
            len(clean_results), counts["pages"], counts["news"],
            counts["images"], len(seen_urls),
        )
        return clean_results

View File

@@ -0,0 +1,236 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
"""Tool for the InfoQuest search API."""
import json
import logging
from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union
from langchain_core.callbacks import (
AsyncCallbackManagerForToolRun,
CallbackManagerForToolRun,
)
from langchain_core.tools import BaseTool
from pydantic import BaseModel, Field
from src.tools.infoquest_search.infoquest_search_api import InfoQuestAPIWrapper
logger = logging.getLogger(__name__)
class InfoQuestInput(BaseModel):
    """Input for the InfoQuest tool."""

    # Free-text query forwarded to the InfoQuest search endpoint.
    query: str = Field(description="search query to look up")
class InfoQuestSearchResults(BaseTool):
    """Tool that queries the InfoQuest Search API and returns processed results with images.

    Setup:
        Install required packages and set environment variable ``INFOQUEST_API_KEY``.

        .. code-block:: bash

            pip install -U langchain-community aiohttp
            export INFOQUEST_API_KEY="your-api-key"

    Instantiate:

        .. code-block:: python

            from your_module import InfoQuestSearch

            tool = InfoQuestSearchResults(
                output_format="json",
                time_range=10,
                site="nytimes.com"
            )

    Invoke directly with args:

        .. code-block:: python

            tool.invoke({'query': 'who won the last french open'})

        .. code-block:: json

            [
                {
                    "type": "page",
                    "title": "Djokovic Claims French Open Title...",
                    "url": "https://www.nytimes.com/...",
                    "desc": "Novak Djokovic won the 2024 French Open by defeating Casper Ruud..."
                },
                {
                    "type": "news",
                    "time_frame": "2 days ago",
                    "title": "French Open Finals Recap",
                    "url": "https://www.nytimes.com/...",
                    "source": "New York Times"
                },
                {
                    "type": "image_url",
                    "image_url": {"url": "https://www.nytimes.com/.../djokovic.jpg"},
                    "image_description": "Novak Djokovic celebrating his French Open victory"
                }
            ]

    Invoke with tool call:

        .. code-block:: python

            tool.invoke({
                "args": {'query': 'who won the last french open'},
                "type": "tool_call",
                "id": "foo",
                "name": "infoquest"
            })

        .. code-block:: python

            ToolMessage(
                content='[{"type": "page", ...}, {"type": "news", ...}, {"type": "image_url", ...}]',
                tool_call_id='1',
                name='infoquest_search_results_json',
            )
    """  # noqa: E501

    name: str = "infoquest_search_results_json"
    description: str = (
        "A search engine optimized for comprehensive, accurate, and trusted results. "
        "Useful for when you need to answer questions about current events. "
        "Input should be a search query."
    )
    args_schema: Type[BaseModel] = InfoQuestInput
    # Time range for filtering search results, in days. A positive value
    # (e.g. 30) keeps only results from the last N days; -1 (default)
    # disables the filter.
    time_range: int = -1
    # Specific domain to restrict search results to (e.g. "nytimes.com");
    # an empty string (default) disables the restriction.
    site: str = ""
    api_wrapper: InfoQuestAPIWrapper = Field(default_factory=InfoQuestAPIWrapper)  # type: ignore[arg-type]
    # Tool response format: return both the rendered content and the raw
    # API response as an artifact.
    response_format: Literal["content_and_artifact"] = "content_and_artifact"

    def __init__(self, **kwargs: Any) -> None:
        """Initialize the tool, building the API wrapper from an explicit key if given."""
        # An explicitly supplied key takes precedence over the environment.
        if "infoquest_api_key" in kwargs:
            kwargs["api_wrapper"] = InfoQuestAPIWrapper(
                infoquest_api_key=kwargs["infoquest_api_key"]
            )
            logger.debug("API wrapper initialized with provided key")
        super().__init__(**kwargs)
        logger.info(
            "\n============================================\n"
            "🚀 BytePlus InfoQuest Search Initialization 🚀\n"
            "============================================"
        )
        # Fix: the fields are declared with defaults, so the previous
        # hasattr(self, ...) guards were always true and have been dropped.
        time_range_status = f"{self.time_range} days" if self.time_range > 0 else "Disabled"
        site_filter = f"'{self.site}'" if self.site else "Disabled"
        initialization_details = (
            f"\n🔧 Tool Information:\n"
            f"├── Tool Name: {self.name}\n"
            f"├── Time Range Filter: {time_range_status}\n"
            f"└── Site Filter: {site_filter}\n"
            f"📊 Configuration Summary:\n"
            f"├── Response Format: {self.response_format}\n"
        )
        logger.info(initialization_details)
        logger.info("\n" + "*" * 70 + "\n")

    def _run(
        self,
        query: str,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> Tuple[Union[List[Dict[str, str]], str], Dict]:
        """Use the tool.

        Returns a (json_content, raw_artifact) pair; on failure the content
        is a JSON error object and the artifact is an empty dict.
        """
        # Fix: mirror _arun's start-of-execution debug log for consistency.
        if logger.isEnabledFor(logging.DEBUG):
            query_truncated = query[:50] + "..." if len(query) > 50 else query
            logger.debug(
                "Search tool execution started | mode=synchronous | query=%s",
                query_truncated,
            )
        try:
            logger.debug(
                "Executing search with parameters: time_range=%s, site=%s",
                self.time_range, self.site,
            )
            raw_results = self.api_wrapper.raw_results(
                query,
                self.time_range,
                self.site
            )
            logger.debug("Processing raw search results")
            cleaned_results = self.api_wrapper.clean_results_with_images(raw_results["results"])
            result_json = json.dumps(cleaned_results, ensure_ascii=False)
            logger.info(
                "Search tool execution completed | mode=synchronous | results_count=%d",
                len(cleaned_results),
            )
            return result_json, raw_results
        except Exception as e:
            logger.error(
                "Search tool execution failed | mode=synchronous | error=%s",
                str(e),
            )
            # Surface the failure to the model as structured content instead
            # of raising, so the agent can react to it.
            error_result = json.dumps({"error": repr(e)}, ensure_ascii=False)
            return error_result, {}

    async def _arun(
        self,
        query: str,
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
    ) -> Tuple[Union[List[Dict[str, str]], str], Dict]:
        """Use the tool asynchronously.

        Same contract as :meth:`_run`, performed via the async API wrapper.
        """
        if logger.isEnabledFor(logging.DEBUG):
            query_truncated = query[:50] + "..." if len(query) > 50 else query
            logger.debug(
                "Search tool execution started | mode=asynchronous | query=%s",
                query_truncated,
            )
        try:
            logger.debug(
                "Executing async search with parameters: time_range=%s, site=%s",
                self.time_range, self.site,
            )
            raw_results = await self.api_wrapper.raw_results_async(
                query,
                self.time_range,
                self.site
            )
            logger.debug("Processing raw async search results")
            cleaned_results = self.api_wrapper.clean_results_with_images(raw_results["results"])
            result_json = json.dumps(cleaned_results, ensure_ascii=False)
            logger.debug(
                "Search tool execution completed | mode=asynchronous | results_count=%d",
                len(cleaned_results),
            )
            return result_json, raw_results
        except Exception as e:
            logger.error(
                "Search tool execution failed | mode=asynchronous | error=%s",
                str(e),
            )
            error_result = json.dumps({"error": repr(e)}, ensure_ascii=False)
            return error_result, {}

View File

@@ -21,6 +21,7 @@ from langchain_community.utilities import (
from src.config import SELECTED_SEARCH_ENGINE, SearchEngine, load_yaml_config
from src.tools.decorators import create_logged_tool
from src.tools.infoquest_search.infoquest_search_results import InfoQuestSearchResults
from src.tools.tavily_search.tavily_search_results_with_images import (
TavilySearchWithImages,
)
@@ -29,6 +30,7 @@ logger = logging.getLogger(__name__)
# Create logged versions of the search tools.
# create_logged_tool wraps each tool class — presumably adding uniform call
# logging (see src.tools.decorators); verify there for the exact behavior.
LoggedTavilySearch = create_logged_tool(TavilySearchWithImages)
LoggedInfoQuestSearch = create_logged_tool(InfoQuestSearchResults)
LoggedDuckDuckGoSearch = create_logged_tool(DuckDuckGoSearchResults)
LoggedBraveSearch = create_logged_tool(BraveSearch)
LoggedArxivSearch = create_logged_tool(ArxivQueryRun)
@@ -76,6 +78,17 @@ def get_web_search_tool(max_search_results: int):
include_domains=include_domains,
exclude_domains=exclude_domains,
)
elif SELECTED_SEARCH_ENGINE == SearchEngine.INFOQUEST.value:
time_range = search_config.get("time_range", -1)
site = search_config.get("site", "")
logger.info(
f"InfoQuest search configuration loaded: time_range={time_range}, site={site}"
)
return LoggedInfoQuestSearch(
name="web_search",
time_range=time_range,
site=site,
)
elif SELECTED_SEARCH_ENGINE == SearchEngine.DUCKDUCKGO.value:
return LoggedDuckDuckGoSearch(
name="web_search",