feat: integrated with Tavily and Jina AI

2026-04-03 06:12:14 +08:00 · 2026-01-14 07:17:22 +08:00
parent 83bd7e4309
commit 4b5f529903
4 changed files with 190 additions and 0 deletions
--- a/backend/src/community/jina_ai/jina_client.py
+++ b/backend/src/community/jina_ai/jina_client.py
@@ -0,0 +1,43 @@
+import logging
+import os
+
+import requests
+
+# Module-level logger named after this module; JinaClient uses it to report request problems.
+logger = logging.getLogger(__name__)
+
+
+class JinaClient:
+    """Minimal client for the Jina AI reader endpoint at https://r.jina.ai/."""
+
+    # Extra seconds allowed on top of the reader-side ``X-Timeout`` before the
+    # local HTTP call is abandoned, so the reader gets a chance to answer first.
+    _CLIENT_TIMEOUT_GRACE = 5
+
+    def crawl(self, url: str, return_format: str = "html", timeout: int = 10) -> str:
+        """Fetch ``url`` through the Jina reader and return the response body.
+
+        Request failures are not raised: they are logged and reported to the
+        caller as a string that starts with "Error: ".
+
+        Args:
+            url: Address of the page to fetch.
+            return_format: Value sent in the ``X-Return-Format`` header.
+            timeout: Value (seconds) sent in the ``X-Timeout`` header. It also
+                bounds the local HTTP call, plus a small grace period.
+
+        Returns:
+            The response text when the reader answers with HTTP 200 and a
+            non-blank body, otherwise an "Error: ..." message.
+        """
+        headers = {
+            "Content-Type": "application/json",
+            "X-Return-Format": return_format,
+            "X-Timeout": str(timeout),
+        }
+        # Read the environment once so the check and the header use the same value.
+        api_key = os.getenv("JINA_API_KEY")
+        if api_key:
+            headers["Authorization"] = f"Bearer {api_key}"
+        else:
+            logger.warning(
+                "Jina API key is not set. Provide your own key to access a higher rate limit. See https://jina.ai/reader for more information."
+            )
+        data = {"url": url}
+        try:
+            # Without a client-side timeout ``requests`` waits forever on a
+            # stalled connection; ``X-Timeout`` only limits the reader itself.
+            response = requests.post(
+                "https://r.jina.ai/",
+                headers=headers,
+                json=data,
+                timeout=timeout + JinaClient._CLIENT_TIMEOUT_GRACE,
+            )
+
+            if response.status_code != 200:
+                error_message = (
+                    f"Jina API returned status {response.status_code}: {response.text}"
+                )
+                logger.error(error_message)
+                return f"Error: {error_message}"
+
+            if not response.text.strip():
+                error_message = "Jina API returned empty response"
+                logger.error(error_message)
+                return f"Error: {error_message}"
+
+            return response.text
+        except Exception as e:
+            # Deliberately broad: this is the tool boundary, and callers expect
+            # an error string rather than an exception.
+            error_message = f"Request to Jina API failed: {str(e)}"
+            logger.error(error_message)
+            return f"Error: {error_message}"
--- a/backend/src/community/jina_ai/tools.py
+++ b/backend/src/community/jina_ai/tools.py
@@ -0,0 +1,28 @@
+from langchain.tools import tool
+
+from src.community.jina_ai.jina_client import JinaClient
+from src.config import get_app_config
+from src.utils.readability import ReadabilityExtractor
+
+# Shared extractor instance, created once at import time and reused by every web_fetch call.
+readability_extractor = ReadabilityExtractor()
+
+
+@tool("web_fetch", parse_docstring=True)
+def web_fetch_tool(url: str) -> str:
+    """Fetch the contents of a web page at a given URL.
+    Only fetch EXACT URLs that have been provided directly by the user or have been returned in results from the web_search and web_fetch tools.
+    This tool can NOT access content that requires authentication, such as private Google Docs or pages behind login walls.
+    Do NOT add www. to URLs that do NOT have them.
+    URLs must include the schema: https://example.com is a valid URL while example.com is an invalid URL.
+
+    Args:
+        url: The URL to fetch the contents of.
+    """
+    # NOTE: the docstring above is consumed by @tool (parse_docstring=True) and
+    # presumably becomes the tool description, so it is kept verbatim.
+
+    # Default reader timeout in seconds; the "web_fetch" tool config may override it.
+    timeout = 10
+    config = get_app_config().get_tool_config("web_fetch")
+    if config is not None and "timeout" in config.model_extra:
+        timeout = config.model_extra["timeout"]
+
+    html_content = JinaClient().crawl(url, return_format="html", timeout=timeout)
+
+    # JinaClient.crawl reports failures as an "Error: ..." string instead of
+    # raising. Return it unchanged: running it through the readability
+    # extractor as if it were HTML would bury or drop the actual error.
+    if html_content.startswith("Error:"):
+        return html_content
+
+    article = readability_extractor.extract_article(html_content)
+    return article.to_markdown()
--- a/backend/src/community/tavily/tools.py
+++ b/backend/src/community/tavily/tools.py
@@ -0,0 +1,53 @@
+import json
+
+from langchain.tools import tool
+from tavily import TavilyClient
+
+from src.config import get_app_config
+
+# Shared Tavily client used by both tools in this module. It is built at import time with no
+# arguments — presumably the API key comes from the environment; confirm against the Tavily SDK.
+tavily_client = TavilyClient()
+
+
+@tool("web_search", parse_docstring=True)
+def web_search_tool(query: str) -> str:
+    """Search the web.
+
+    Args:
+        query: The query to search for.
+    """
+    # Result limit: 5 unless the "web_search" tool config carries "max_results".
+    settings = get_app_config().get_tool_config("web_search")
+    limit = 5
+    if settings is not None and "max_results" in settings.model_extra:
+        limit = settings.model_extra.get("max_results")
+
+    response = tavily_client.search(query, max_results=limit)
+
+    # Reduce each hit to the three fields returned to the caller.
+    hits = []
+    for item in response["results"]:
+        hits.append(
+            {
+                "title": item["title"],
+                "url": item["url"],
+                "snippet": item["content"],
+            }
+        )
+
+    return json.dumps(hits, indent=2, ensure_ascii=False)
+
+
+@tool("web_fetch", parse_docstring=True)
+def web_fetch_tool(url: str) -> str:
+    """Fetch the contents of a web page at a given URL.
+    Only fetch EXACT URLs that have been provided directly by the user or have been returned in results from the web_search and web_fetch tools.
+    This tool can NOT access content that requires authentication, such as private Google Docs or pages behind login walls.
+    Do NOT add www. to URLs that do NOT have them.
+    URLs must include the schema: https://example.com is a valid URL while example.com is an invalid URL.
+
+    Args:
+        url: The URL to fetch the contents of.
+    """
+    # NOTE: the docstring above is consumed by @tool (parse_docstring=True) and
+    # presumably becomes the tool description, so it is kept verbatim.
+    res = tavily_client.extract([url])
+
+    # A reported failure wins over any results: surface the first error.
+    failed_results = res.get("failed_results")
+    if failed_results:
+        # Single quotes inside the f-string: reusing the enclosing double
+        # quotes is a SyntaxError on interpreters older than Python 3.12.
+        return f"Error: {failed_results[0]['error']}"
+
+    results = res.get("results")
+    if results:
+        result = results[0]
+        return f"# {result['title']}\n\n{result['raw_content']}"
+
+    return "Error: No results found"
--- a/backend/src/utils/readability.py
+++ b/backend/src/utils/readability.py
@@ -0,0 +1,66 @@
+import re
+from urllib.parse import urljoin
+
+from markdownify import markdownify as md
+from readabilipy import simple_json_from_html_string
+
+
+class Article:
+    """An extracted article: a title plus its HTML body, renderable as Markdown or message parts."""
+
+    # Base URL against which image links are resolved in ``to_message``.
+    url: str
+
+    def __init__(self, title: str, html_content: str, url: str = ""):
+        """Store the article fields.
+
+        Args:
+            title: Article title, used as the top-level Markdown heading.
+            html_content: Article body as HTML.
+            url: Base URL for resolving relative image links. Defaults to an
+                empty string, which leaves image links unchanged.
+        """
+        self.title = title
+        self.html_content = html_content
+        # Always assign ``url``: it used to be annotated only, so ``to_message``
+        # raised AttributeError unless a caller set the attribute afterwards.
+        self.url = url
+
+    def to_markdown(self, including_title: bool = True) -> str:
+        """Render the article as Markdown.
+
+        Args:
+            including_title: When true, start the output with a "# <title>" heading.
+
+        Returns:
+            The Markdown text, or a "*No content available*" placeholder when
+            the HTML body is missing or blank.
+        """
+        markdown = ""
+        if including_title:
+            markdown += f"# {self.title}\n\n"
+
+        if self.html_content is None or not str(self.html_content).strip():
+            markdown += "*No content available*\n"
+        else:
+            markdown += md(self.html_content)
+
+        return markdown
+
+    def to_message(self) -> list[dict]:
+        """Split the Markdown rendering into alternating text and image parts.
+
+        Returns:
+            A list of ``{"type": "text", ...}`` and ``{"type": "image_url", ...}``
+            dicts in document order; a single "No content available" text part
+            when nothing remains.
+        """
+        image_pattern = r"!\[.*?\]\((.*?)\)"
+
+        content: list[dict] = []
+        markdown = self.to_markdown()
+
+        if not markdown or not markdown.strip():
+            return [{"type": "text", "text": "No content available"}]
+
+        # With one capture group, re.split alternates plain text (even
+        # indexes) with the captured image targets (odd indexes).
+        parts = re.split(image_pattern, markdown)
+
+        for i, part in enumerate(parts):
+            if i % 2 == 1:
+                image_url = urljoin(self.url, part.strip())
+                content.append({"type": "image_url", "image_url": {"url": image_url}})
+            else:
+                text_part = part.strip()
+                if text_part:
+                    content.append({"type": "text", "text": text_part})
+
+        # If after processing all parts, content is still empty, provide a fallback message.
+        if not content:
+            content = [{"type": "text", "text": "No content available"}]
+
+        return content
+
+
+class ReadabilityExtractor:
+    """Builds ``Article`` objects from raw HTML using readabilipy."""
+
+    def extract_article(self, html: str) -> Article:
+        """Parse ``html`` and wrap the extracted title and content in an ``Article``.
+
+        A missing or blank content or title is replaced with placeholder text.
+        """
+        parsed = simple_json_from_html_string(html, use_readability=True)
+
+        body = parsed.get("content")
+        if not (body and str(body).strip()):
+            body = "No content could be extracted from this page"
+
+        heading = parsed.get("title")
+        if not (heading and str(heading).strip()):
+            heading = "Untitled"
+
+        return Article(title=heading, html_content=body)