mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-12 01:54:45 +08:00
Add web_search_tool and web_fetch_tool implementations using the official firecrawl-py SDK as an alternative to Tavily/Jina AI integrations. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
68 lines
2.2 KiB
Python
68 lines
2.2 KiB
Python
import json
|
|
|
|
from firecrawl import FirecrawlApp
|
|
from langchain.tools import tool
|
|
|
|
from src.config import get_app_config
|
|
|
|
|
|
def _get_firecrawl_client() -> FirecrawlApp:
    """Build a Firecrawl client configured from the "web_search" tool config.

    Returns:
        A ``FirecrawlApp``. When no ``api_key`` is present in the tool config,
        ``None`` is passed through and the SDK resolves the key itself
        (presumably from its environment — confirm against the firecrawl-py docs).
    """
    config = get_app_config().get_tool_config("web_search")
    api_key = None
    if config is not None:
        # Pydantic sets model_extra to None (not {}) when the model has no
        # extra fields, so guard before .get() to avoid an AttributeError.
        api_key = (config.model_extra or {}).get("api_key")
    return FirecrawlApp(api_key=api_key)  # type: ignore[arg-type]
|
|
|
|
|
|
@tool("web_search", parse_docstring=True)
def web_search_tool(query: str) -> str:
    """Search the web.

    Args:
        query: The query to search for.
    """
    # NOTE: the docstring above is runtime behavior — parse_docstring=True
    # turns it into the tool description/arg schema shown to the model.
    config = get_app_config().get_tool_config("web_search")
    max_results = 5
    if config is not None:
        # Pydantic sets model_extra to None (not {}) when no extra fields
        # exist, so guard before .get() to avoid an AttributeError.
        max_results = (config.model_extra or {}).get("max_results", max_results)

    client = _get_firecrawl_client()
    result = client.search(query, limit=max_results)

    # result.web contains list of SearchResultWeb objects
    web_results = result.web or []
    normalized_results = [
        {
            # getattr with a default plus `or ""` coerces both missing and
            # None-valued attributes to empty strings for stable JSON output.
            "title": getattr(item, "title", "") or "",
            "url": getattr(item, "url", "") or "",
            "snippet": getattr(item, "description", "") or "",
        }
        for item in web_results
    ]
    return json.dumps(normalized_results, indent=2, ensure_ascii=False)
|
|
|
|
|
|
@tool("web_fetch", parse_docstring=True)
def web_fetch_tool(url: str) -> str:
    """Fetch the contents of a web page at a given URL.
    Only fetch EXACT URLs that have been provided directly by the user or have been returned in results from the web_search and web_fetch tools.
    This tool can NOT access content that requires authentication, such as private Google Docs or pages behind login walls.
    Do NOT add www. to URLs that do NOT have them.
    URLs must include the schema: https://example.com is a valid URL while example.com is an invalid URL.

    Args:
        url: The URL to fetch the contents of.
    """
    # Scrape the page as markdown through the shared Firecrawl client.
    scrape_result = _get_firecrawl_client().scrape(url, formats=["markdown"])

    content = scrape_result.markdown or ""
    if not content:
        return "Error: No content found"

    # Fall back to a placeholder title when metadata or its title is absent.
    meta = scrape_result.metadata
    page_title = meta.title if meta and meta.title else "Untitled"

    return f"# {page_title}\n\n{content}"
|