feat: integrated with Tavily and Jina AI

2026-04-18 12:04:45 +08:00 · 2026-01-14 07:17:22 +08:00
parent 83bd7e4309
commit 4b5f529903
4 changed files with 190 additions and 0 deletions
--- a/backend/src/community/jina_ai/jina_client.py
+++ b/backend/src/community/jina_ai/jina_client.py
@@ -0,0 +1,43 @@
+import logging
+import os
+
+import requests
+
+# Module-level logger, named after this module via __name__.
+logger = logging.getLogger(__name__)
+
+
+class JinaClient:
+
+    def crawl(self, url: str, return_format: str = "html", timeout: int = 10) -> str:
+        headers = {
+            "Content-Type": "application/json",
+            "X-Return-Format": return_format,
+            "X-Timeout": str(timeout),
+        }
+        if os.getenv("JINA_API_KEY"):
+            headers["Authorization"] = f"Bearer {os.getenv('JINA_API_KEY')}"
+        else:
+            logger.warning(
+                "Jina API key is not set. Provide your own key to access a higher rate limit. See https://jina.ai/reader for more information."
+            )
+        data = {"url": url}
+        try:
+            response = requests.post("https://r.jina.ai/", headers=headers, json=data)
+
+            if response.status_code != 200:
+                error_message = (
+                    f"Jina API returned status {response.status_code}: {response.text}"
+                )
+                logger.error(error_message)
+                return f"Error: {error_message}"
+
+            if not response.text or not response.text.strip():
+                error_message = "Jina API returned empty response"
+                logger.error(error_message)
+                return f"Error: {error_message}"
+
+            return response.text
+        except Exception as e:
+            error_message = f"Request to Jina API failed: {str(e)}"
+            logger.error(error_message)
+            return f"Error: {error_message}"
--- a/backend/src/community/jina_ai/tools.py
+++ b/backend/src/community/jina_ai/tools.py
@@ -0,0 +1,28 @@
+from langchain.tools import tool
+
+from src.community.jina_ai.jina_client import JinaClient
+from src.config import get_app_config
+from src.utils.readability import ReadabilityExtractor
+
+# Single extractor instance created at import time and reused by web_fetch_tool.
+readability_extractor = ReadabilityExtractor()
+
+
+@tool("web_fetch", parse_docstring=True)
+def web_fetch_tool(url: str) -> str:
+    """Fetch the contents of a web page at a given URL.
+    Only fetch EXACT URLs that have been provided directly by the user or have been returned in results from the web_search and web_fetch tools.
+    This tool can NOT access content that requires authentication, such as private Google Docs or pages behind login walls.
+    Do NOT add www. to URLs that do NOT have them.
+    URLs must include the schema: https://example.com is a valid URL while example.com is an invalid URL.
+
+    Args:
+        url: The URL to fetch the contents of.
+    """
+    jina_client = JinaClient()
+    timeout = 10
+    config = get_app_config().get_tool_config("web_fetch")
+    if config is not None and "timeout" in config.model_extra:
+        timeout = config.model_extra.get("timeout")
+    html_content = jina_client.crawl(url, return_format="html", timeout=timeout)
+    article = readability_extractor.extract_article(html_content)
+    return article.to_markdown()
--- a/backend/src/community/tavily/tools.py
+++ b/backend/src/community/tavily/tools.py
@@ -0,0 +1,53 @@
+import json
+
+from langchain.tools import tool
+from tavily import TavilyClient
+
+from src.config import get_app_config
+
+# Shared client, constructed with no arguments at import time and used by both
+# tools below. NOTE(review): presumably picks up its API key from the
+# environment -- confirm against the tavily client documentation.
+tavily_client = TavilyClient()
+
+
+@tool("web_search", parse_docstring=True)
+def web_search_tool(query: str) -> str:
+    """Search the web.
+
+    Args:
+        query: The query to search for.
+    """
+    config = get_app_config().get_tool_config("web_search")
+    max_results = 5
+    if config is not None and "max_results" in config.model_extra:
+        max_results = config.model_extra.get("max_results")
+    res = tavily_client.search(query, max_results=max_results)
+    normalized_results = [
+        {
+            "title": result["title"],
+            "url": result["url"],
+            "snippet": result["content"],
+        }
+        for result in res["results"]
+    ]
+    json_results = json.dumps(normalized_results, indent=2, ensure_ascii=False)
+    return json_results
+
+
+@tool("web_fetch", parse_docstring=True)
+def web_fetch_tool(url: str) -> str:
+    """Fetch the contents of a web page at a given URL.
+    Only fetch EXACT URLs that have been provided directly by the user or have been returned in results from the web_search and web_fetch tools.
+    This tool can NOT access content that requires authentication, such as private Google Docs or pages behind login walls.
+    Do NOT add www. to URLs that do NOT have them.
+    URLs must include the schema: https://example.com is a valid URL while example.com is an invalid URL.
+
+    Args:
+        url: The URL to fetch the contents of.
+    """
+    res = tavily_client.extract([url])
+    if "failed_results" in res and len(res["failed_results"]) > 0:
+        return f"Error: {res["failed_results"][0]["error"]}"
+    elif "results" in res and len(res["results"]) > 0:
+        result = res["results"][0]
+        return f"# {result['title']}\n\n{result['raw_content']}"
+    else:
+        return "Error: No results found"