feat: add firecrawl community package with web_search and web_fetch tools

Add web_search_tool and web_fetch_tool implementations using the official
firecrawl-py SDK as an alternative to Tavily/Jina AI integrations.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
hetao
2026-01-26 19:58:08 +08:00
parent 0cc7cc08e9
commit ce7f7258ba
4 changed files with 571 additions and 0 deletions

View File

@@ -0,0 +1,67 @@
import json
from firecrawl import FirecrawlApp
from langchain.tools import tool
from src.config import get_app_config
def _get_firecrawl_client() -> FirecrawlApp:
    """Build a Firecrawl SDK client from the application's tool configuration.

    The API key is read from the ``api_key`` extra field of the ``web_search``
    tool config. When that tool config is absent, or it has no ``api_key``
    entry, ``None`` is handed to ``FirecrawlApp``.

    Returns:
        A new ``FirecrawlApp`` instance.
    """
    tool_config = get_app_config().get_tool_config("web_search")
    # NOTE(review): passing None presumably lets the SDK fall back to its own
    # key discovery (e.g. an environment variable) -- confirm against the SDK.
    api_key = tool_config.model_extra.get("api_key") if tool_config is not None else None
    return FirecrawlApp(api_key=api_key)  # type: ignore[arg-type]
# The docstring of this function is parsed by LangChain (parse_docstring=True),
# so its wording is part of the tool definition; implementation notes are kept
# in comments rather than in the docstring.
#
# Returns a JSON array (as a string) of {"title", "url", "snippet"} objects on
# success, or a string starting with "Error: " when the Firecrawl call fails.
@tool("web_search", parse_docstring=True)
def web_search_tool(query: str) -> str:
    """Search the web.

    Args:
        query: The query to search for.
    """
    # Default result limit; overridable via `max_results` in the web_search
    # tool config.
    max_results = 5
    config = get_app_config().get_tool_config("web_search")
    if config is not None:
        max_results = config.model_extra.get("max_results", max_results)

    # This is the tool boundary: report SDK/network/credential failures as an
    # error string (same convention as the "Error: ..." results returned by
    # web_fetch) instead of letting the exception escape the tool call.
    try:
        client = _get_firecrawl_client()
        result = client.search(query, limit=max_results)
    except Exception as exc:
        return f"Error: {exc}"

    # result.web contains list of SearchResultWeb objects; it may be empty/None.
    web_results = result.web or []
    normalized_results = [
        {
            # Missing or None attributes collapse to an empty string.
            "title": getattr(item, "title", "") or "",
            "url": getattr(item, "url", "") or "",
            "snippet": getattr(item, "description", "") or "",
        }
        for item in web_results
    ]
    return json.dumps(normalized_results, indent=2, ensure_ascii=False)
# The docstring of this function is parsed by LangChain (parse_docstring=True),
# so its wording is part of the tool definition; implementation notes are kept
# in comments rather than in the docstring.
#
# Returns the page as markdown prefixed with a "# <title>" heading, or a string
# starting with "Error: " when the scrape fails or yields no content.
@tool("web_fetch", parse_docstring=True)
def web_fetch_tool(url: str) -> str:
    """Fetch the contents of a web page at a given URL.
    Only fetch EXACT URLs that have been provided directly by the user or have been returned in results from the web_search and web_fetch tools.
    This tool can NOT access content that requires authentication, such as private Google Docs or pages behind login walls.
    Do NOT add www. to URLs that do NOT have them.
    URLs must include the schema: https://example.com is a valid URL while example.com is an invalid URL.

    Args:
        url: The URL to fetch the contents of.
    """
    # This is the tool boundary: report SDK/network/credential failures in the
    # same "Error: ..." string form already used for the empty-content case,
    # instead of letting the exception escape the tool call.
    try:
        client = _get_firecrawl_client()
        result = client.scrape(url, formats=["markdown"])
    except Exception as exc:
        return f"Error: {exc}"

    markdown_content = result.markdown or ""
    if not markdown_content:
        return "Error: No content found"

    # Fall back to a placeholder heading when the page exposes no title.
    metadata = result.metadata
    title = metadata.title if metadata and metadata.title else "Untitled"
    return f"# {title}\n\n{markdown_content}"