Mirror of https://gitee.com/wanwujie/deer-flow (synced 2026-05-02 18:20:46 +08:00).
Commit: feat: integrated with Tavily and Jina AI.
New file: backend/src/community/jina_ai/jina_client.py (43 lines) @@ -0,0 +1,43 @@
|
|||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class JinaClient:
    """Thin client for the Jina AI Reader API (https://r.jina.ai/)."""

    def crawl(self, url: str, return_format: str = "html", timeout: int = 10) -> str:
        """Fetch a web page through the Jina Reader and return its content.

        Args:
            url: The URL of the page to crawl.
            return_format: Format requested from Jina via the
                ``X-Return-Format`` header (e.g. ``"html"``, ``"markdown"``).
            timeout: Server-side crawl budget in seconds, forwarded as the
                ``X-Timeout`` header; also used to bound the HTTP request.

        Returns:
            The page content on success, or a string starting with
            ``"Error: "`` describing the failure. Callers must check for
            that prefix — this method never raises.
        """
        headers = {
            "Content-Type": "application/json",
            "X-Return-Format": return_format,
            "X-Timeout": str(timeout),
        }
        # Read the key once; an unset key still works but is rate-limited.
        api_key = os.getenv("JINA_API_KEY")
        if api_key:
            headers["Authorization"] = f"Bearer {api_key}"
        else:
            logger.warning(
                "Jina API key is not set. Provide your own key to access a higher rate limit. See https://jina.ai/reader for more information."
            )
        data = {"url": url}
        try:
            # BUG FIX: the original call had no request-level timeout and
            # could hang forever. `X-Timeout` only limits Jina's server-side
            # crawl, not our socket wait; add headroom so the server-side
            # limit normally fires first.
            response = requests.post(
                "https://r.jina.ai/", headers=headers, json=data, timeout=timeout + 5
            )

            if response.status_code != 200:
                error_message = (
                    f"Jina API returned status {response.status_code}: {response.text}"
                )
                logger.error(error_message)
                return f"Error: {error_message}"

            if not response.text or not response.text.strip():
                error_message = "Jina API returned empty response"
                logger.error(error_message)
                return f"Error: {error_message}"

            return response.text
        except Exception as e:
            error_message = f"Request to Jina API failed: {str(e)}"
            logger.error(error_message)
            return f"Error: {error_message}"
||||||
New file: backend/src/community/jina_ai/tools.py (28 lines) @@ -0,0 +1,28 @@
|
|||||||
|
from langchain.tools import tool
|
||||||
|
|
||||||
|
from src.community.jina_ai.jina_client import JinaClient
|
||||||
|
from src.config import get_app_config
|
||||||
|
from src.utils.readability import ReadabilityExtractor
|
||||||
|
|
||||||
|
readability_extractor = ReadabilityExtractor()
|
||||||
|
|
||||||
|
|
||||||
|
@tool("web_fetch", parse_docstring=True)
def web_fetch_tool(url: str) -> str:
    """Fetch the contents of a web page at a given URL.

    Only fetch EXACT URLs that have been provided directly by the user or have been returned in results from the web_search and web_fetch tools.
    This tool can NOT access content that requires authentication, such as private Google Docs or pages behind login walls.
    Do NOT add www. to URLs that do NOT have them.
    URLs must include the schema: https://example.com is a valid URL while example.com is an invalid URL.

    Args:
        url: The URL to fetch the contents of.
    """
    jina_client = JinaClient()
    # Default request timeout in seconds; may be overridden by tool config.
    timeout = 10
    config = get_app_config().get_tool_config("web_fetch")
    # BUG FIX: pydantic's `model_extra` is None when the model has no extra
    # fields, so `"timeout" in config.model_extra` could raise TypeError.
    # Guard it before membership testing.
    if config is not None and config.model_extra and "timeout" in config.model_extra:
        timeout = config.model_extra.get("timeout")
    # Fetch raw HTML via Jina, then reduce it to the readable article text.
    html_content = jina_client.crawl(url, return_format="html", timeout=timeout)
    article = readability_extractor.extract_article(html_content)
    return article.to_markdown()
||||||
New file: backend/src/community/tavily/tools.py (53 lines) @@ -0,0 +1,53 @@
|
|||||||
|
import json
|
||||||
|
|
||||||
|
from langchain.tools import tool
|
||||||
|
from tavily import TavilyClient
|
||||||
|
|
||||||
|
from src.config import get_app_config
|
||||||
|
|
||||||
|
tavily_client = TavilyClient()
|
||||||
|
|
||||||
|
|
||||||
|
@tool("web_search", parse_docstring=True)
def web_search_tool(query: str) -> str:
    """Search the web.

    Args:
        query: The query to search for.
    """
    config = get_app_config().get_tool_config("web_search")
    # Default number of results; may be overridden by tool config.
    max_results = 5
    # BUG FIX: pydantic's `model_extra` is None when the model has no extra
    # fields, so `"max_results" in config.model_extra` could raise TypeError.
    # Guard it before membership testing.
    if config is not None and config.model_extra and "max_results" in config.model_extra:
        max_results = config.model_extra.get("max_results")
    res = tavily_client.search(query, max_results=max_results)
    # Normalize Tavily's result schema to the compact shape the agent expects.
    normalized_results = [
        {
            "title": result["title"],
            "url": result["url"],
            "snippet": result["content"],
        }
        for result in res["results"]
    ]
    json_results = json.dumps(normalized_results, indent=2, ensure_ascii=False)
    return json_results
||||||
|
|
||||||
|
|
||||||
|
@tool("web_fetch", parse_docstring=True)
def web_fetch_tool(url: str) -> str:
    """Fetch the contents of a web page at a given URL.

    Only fetch EXACT URLs that have been provided directly by the user or have been returned in results from the web_search and web_fetch tools.
    This tool can NOT access content that requires authentication, such as private Google Docs or pages behind login walls.
    Do NOT add www. to URLs that do NOT have them.
    URLs must include the schema: https://example.com is a valid URL while example.com is an invalid URL.

    Args:
        url: The URL to fetch the contents of.
    """
    res = tavily_client.extract([url])
    if "failed_results" in res and len(res["failed_results"]) > 0:
        # BUG FIX: the original reused double quotes inside a double-quoted
        # f-string, which is a SyntaxError on Python < 3.12 (PEP 701).
        return f"Error: {res['failed_results'][0]['error']}"
    elif "results" in res and len(res["results"]) > 0:
        result = res["results"][0]
        return f"# {result['title']}\n\n{result['raw_content']}"
    else:
        return "Error: No results found"
||||||
New file: backend/src/utils/readability.py (66 lines) @@ -0,0 +1,66 @@
|
|||||||
|
import re
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
from markdownify import markdownify as md
|
||||||
|
from readabilipy import simple_json_from_html_string
|
||||||
|
|
||||||
|
|
||||||
|
class Article:
    """A readable article extracted from a web page, convertible to markdown
    or to LLM multimodal message content parts."""

    # Base URL of the source page; used to resolve relative image links.
    url: str

    def __init__(self, title: str, html_content: str, url: str = ""):
        """Initialize the article.

        Args:
            title: The article title.
            html_content: The article body as HTML.
            url: Base URL used to resolve relative image links in
                ``to_message``. Optional and backward-compatible.
        """
        self.title = title
        self.html_content = html_content
        # BUG FIX: `url` was only a class-level annotation and never
        # assigned, so `to_message` crashed with AttributeError whenever the
        # markdown contained an image link.
        self.url = url

    def to_markdown(self, including_title: bool = True) -> str:
        """Render the article as markdown, with an optional H1 title."""
        markdown = ""
        if including_title:
            markdown += f"# {self.title}\n\n"

        # Fall back to a placeholder rather than emitting an empty body.
        if self.html_content is None or not str(self.html_content).strip():
            markdown += "*No content available*\n"
        else:
            markdown += md(self.html_content)

        return markdown

    def to_message(self) -> list[dict]:
        """Split the markdown into LLM content parts, turning markdown image
        links into ``image_url`` parts and the surrounding text into ``text``
        parts."""
        # Capture group 1 is the image URL inside ![alt](url).
        image_pattern = r"!\[.*?\]\((.*?)\)"

        content: list[dict[str, str]] = []
        markdown = self.to_markdown()

        if not markdown or not markdown.strip():
            return [{"type": "text", "text": "No content available"}]

        # re.split with one capture group alternates text and captured URLs:
        # even indices are text, odd indices are image URLs.
        parts = re.split(image_pattern, markdown)

        for i, part in enumerate(parts):
            if i % 2 == 1:
                # Resolve relative image links against the article's URL.
                image_url = urljoin(self.url, part.strip())
                content.append({"type": "image_url", "image_url": {"url": image_url}})
            else:
                text_part = part.strip()
                if text_part:
                    content.append({"type": "text", "text": text_part})

        # If after processing all parts, content is still empty, provide a fallback message.
        if not content:
            content = [{"type": "text", "text": "No content available"}]

        return content
||||||
|
|
||||||
|
|
||||||
|
class ReadabilityExtractor:
    """Extracts the readable portion of an HTML document using readabilipy."""

    def extract_article(self, html: str) -> Article:
        """Run readability extraction on *html* and wrap the result in an
        ``Article``, substituting fallbacks for a missing title or body."""
        parsed = simple_json_from_html_string(html, use_readability=True)

        def _or_fallback(value, fallback: str) -> str:
            # Treat None / empty / whitespace-only values as missing.
            return value if value and str(value).strip() else fallback

        return Article(
            title=_or_fallback(parsed.get("title"), "Untitled"),
            html_content=_or_fallback(
                parsed.get("content"),
                "No content could be extracted from this page",
            ),
        )
|
||||||
Reference in New Issue
Block a user