mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-14 10:44:46 +08:00
feat: add firecrawl community package with web_search and web_fetch tools
Add web_search_tool and web_fetch_tool implementations using the official firecrawl-py SDK as an alternative to Tavily/Jina AI integrations. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
67
backend/src/community/firecrawl/tools.py
Normal file
67
backend/src/community/firecrawl/tools.py
Normal file
@@ -0,0 +1,67 @@
|
||||
import json
|
||||
|
||||
from firecrawl import FirecrawlApp
|
||||
from langchain.tools import tool
|
||||
|
||||
from src.config import get_app_config
|
||||
|
||||
|
||||
def _get_firecrawl_client() -> FirecrawlApp:
    """Build a Firecrawl client from the application's tool configuration.

    The API key is taken from the ``api_key`` extra field of the
    ``web_search`` tool config. If no ``web_search`` tool config exists, or
    the field is absent, ``None`` is passed as the key.

    Returns:
        A ``FirecrawlApp`` constructed with the configured key (or ``None``).
    """
    tool_config = get_app_config().get_tool_config("web_search")
    key = tool_config.model_extra.get("api_key") if tool_config is not None else None
    return FirecrawlApp(api_key=key)  # type: ignore[arg-type]
|
||||
|
||||
|
||||
@tool("web_search", parse_docstring=True)
def web_search_tool(query: str) -> str:
    """Search the web.

    Args:
        query: The query to search for.
    """
    # NOTE: the decorator is invoked with parse_docstring=True, so the
    # docstring above is kept word-for-word; implementation notes live in
    # comments instead.

    # Result cap: 5 unless the "web_search" tool config carries "max_results".
    tool_config = get_app_config().get_tool_config("web_search")
    limit = 5 if tool_config is None else tool_config.model_extra.get("max_results", 5)

    response = _get_firecrawl_client().search(query, limit=limit)

    # Map each output key to the attribute read from a search hit. Missing or
    # falsy attributes collapse to an empty string.
    field_map = (("title", "title"), ("url", "url"), ("snippet", "description"))
    hits = [
        {key: getattr(entry, attr, "") or "" for key, attr in field_map}
        for entry in response.web or []  # a falsy `web` is treated as no results
    ]

    # Returned as a pretty-printed JSON array; non-ASCII text is not escaped.
    return json.dumps(hits, indent=2, ensure_ascii=False)
|
||||
|
||||
|
||||
@tool("web_fetch", parse_docstring=True)
def web_fetch_tool(url: str) -> str:
    """Fetch the contents of a web page at a given URL.
    Only fetch EXACT URLs that have been provided directly by the user or have been returned in results from the web_search and web_fetch tools.
    This tool can NOT access content that requires authentication, such as private Google Docs or pages behind login walls.
    Do NOT add www. to URLs that do NOT have them.
    URLs must include the schema: https://example.com is a valid URL while example.com is an invalid URL.

    Args:
        url: The URL to fetch the contents of.
    """
    # NOTE: the decorator is invoked with parse_docstring=True, so the
    # docstring above is kept word-for-word; implementation notes live in
    # comments instead.

    # Ask Firecrawl for the page rendered as markdown only.
    page = _get_firecrawl_client().scrape(url, formats=["markdown"])

    body = page.markdown or ""
    meta = page.metadata

    # Use the page title when metadata provides one, otherwise a placeholder.
    heading = "Untitled"
    if meta and meta.title:
        heading = meta.title

    # An empty scrape is reported as an error string rather than an exception.
    if not body:
        return "Error: No content found"

    return f"# {heading}\n\n{body}"
|
||||
Reference in New Issue
Block a user