From 4b5f5299037ffee397fcad67fc4bd27671cebe1d Mon Sep 17 00:00:00 2001
From: Henry Li
Date: Wed, 14 Jan 2026 07:17:22 +0800
Subject: [PATCH] feat: integrated with Tavily and Jina AI

---
Review note: the file contents below were revised during review. Hunk sizes
and the diffstat were recomputed, but the `index` blob ids are those of the
original commit and have not been regenerated.

 backend/src/community/jina_ai/jina_client.py | 72 +++++++++++++++
 backend/src/community/jina_ai/tools.py       | 34 +++++++
 backend/src/community/tavily/tools.py        | 63 +++++++++++++
 backend/src/utils/readability.py             | 95 ++++++++++++++++++++
 4 files changed, 264 insertions(+)
 create mode 100644 backend/src/community/jina_ai/jina_client.py
 create mode 100644 backend/src/community/jina_ai/tools.py
 create mode 100644 backend/src/community/tavily/tools.py
 create mode 100644 backend/src/utils/readability.py

diff --git a/backend/src/community/jina_ai/jina_client.py b/backend/src/community/jina_ai/jina_client.py
new file mode 100644
index 0000000..2f0d07b
--- /dev/null
+++ b/backend/src/community/jina_ai/jina_client.py
@@ -0,0 +1,72 @@
+import logging
+import os
+
+import requests
+
+logger = logging.getLogger(__name__)
+
+
+class JinaClient:
+    """Thin client for the Jina AI Reader endpoint at https://r.jina.ai/."""
+
+    # Extra seconds granted to the HTTP request on top of the value sent in
+    # X-Timeout, so a response arriving right at that limit is not cut off.
+    _REQUEST_GRACE_SECONDS = 5
+
+    def crawl(self, url: str, return_format: str = "html", timeout: int = 10) -> str:
+        """Fetch a page through Jina Reader and return the response body.
+
+        Failures are not raised: each one is logged and returned in-band as a
+        string starting with "Error: ", so callers must check for that
+        prefix before treating the result as page content.
+
+        Args:
+            url: The page to fetch.
+            return_format: Value sent in the X-Return-Format header.
+            timeout: Seconds; sent in the X-Timeout header and, plus a small
+                grace period, used as the timeout of the HTTP request itself.
+
+        Returns:
+            The response text on success, otherwise an "Error: ..." message.
+        """
+        headers = {
+            "Content-Type": "application/json",
+            "X-Return-Format": return_format,
+            "X-Timeout": str(timeout),
+        }
+        api_key = os.getenv("JINA_API_KEY")
+        if api_key:
+            headers["Authorization"] = f"Bearer {api_key}"
+        else:
+            logger.warning(
+                "Jina API key is not set. Provide your own key to access a higher rate limit. See https://jina.ai/reader for more information."
+            )
+        data = {"url": url}
+        try:
+            # requests applies no timeout by default; without one a stalled
+            # connection would block the calling tool indefinitely.
+            response = requests.post(
+                "https://r.jina.ai/",
+                headers=headers,
+                json=data,
+                timeout=float(timeout) + self._REQUEST_GRACE_SECONDS,
+            )
+
+            if response.status_code != 200:
+                error_message = (
+                    f"Jina API returned status {response.status_code}: {response.text}"
+                )
+                logger.error(error_message)
+                return f"Error: {error_message}"
+
+            if not response.text or not response.text.strip():
+                error_message = "Jina API returned empty response"
+                logger.error(error_message)
+                return f"Error: {error_message}"
+
+            return response.text
+        except Exception as e:
+            # Deliberately broad: the contract is to report failures in-band.
+            error_message = f"Request to Jina API failed: {str(e)}"
+            logger.exception(error_message)
+            return f"Error: {error_message}"
diff --git a/backend/src/community/jina_ai/tools.py b/backend/src/community/jina_ai/tools.py
new file mode 100644
index 0000000..c87b011
--- /dev/null
+++ b/backend/src/community/jina_ai/tools.py
@@ -0,0 +1,34 @@
+from langchain.tools import tool
+
+from src.community.jina_ai.jina_client import JinaClient
+from src.config import get_app_config
+from src.utils.readability import ReadabilityExtractor
+
+readability_extractor = ReadabilityExtractor()
+
+
+@tool("web_fetch", parse_docstring=True)
+def web_fetch_tool(url: str) -> str:
+    """Fetch the contents of a web page at a given URL.
+    Only fetch EXACT URLs that have been provided directly by the user or have been returned in results from the web_search and web_fetch tools.
+    This tool can NOT access content that requires authentication, such as private Google Docs or pages behind login walls.
+    Do NOT add www. to URLs that do NOT have them.
+    URLs must include the schema: https://example.com is a valid URL while example.com is an invalid URL.
+
+    Args:
+        url: The URL to fetch the contents of.
+    """
+    jina_client = JinaClient()
+    # Default budget in seconds; a "timeout" entry in the web_fetch tool config
+    # overrides it.
+    timeout = 10
+    config = get_app_config().get_tool_config("web_fetch")
+    if config is not None and "timeout" in config.model_extra:
+        timeout = config.model_extra.get("timeout")
+    html_content = jina_client.crawl(url, return_format="html", timeout=timeout)
+    # crawl() reports failures in-band as "Error: ..." strings. Return them
+    # as-is instead of running an error message through the HTML extractor.
+    if html_content.startswith("Error:"):
+        return html_content
+    article = readability_extractor.extract_article(html_content, url=url)
+    return article.to_markdown()
diff --git a/backend/src/community/tavily/tools.py b/backend/src/community/tavily/tools.py
new file mode 100644
index 0000000..0654cd0
--- /dev/null
+++ b/backend/src/community/tavily/tools.py
@@ -0,0 +1,63 @@
+import json
+
+from langchain.tools import tool
+from tavily import TavilyClient
+
+from src.config import get_app_config
+
+# NOTE(review): built at import time without an explicit key, so the client
+# presumably reads its API key from the environment -- confirm.
+tavily_client = TavilyClient()
+
+
+@tool("web_search", parse_docstring=True)
+def web_search_tool(query: str) -> str:
+    """Search the web.
+
+    Args:
+        query: The query to search for.
+    """
+    config = get_app_config().get_tool_config("web_search")
+    # Default result count; a "max_results" entry in the web_search tool
+    # config overrides it.
+    max_results = 5
+    if config is not None and "max_results" in config.model_extra:
+        max_results = config.model_extra.get("max_results")
+    res = tavily_client.search(query, max_results=max_results)
+    # Reduce each hit to title/url/snippet ("content" is renamed "snippet").
+    normalized_results = [
+        {
+            "title": result["title"],
+            "url": result["url"],
+            "snippet": result["content"],
+        }
+        for result in res["results"]
+    ]
+    return json.dumps(normalized_results, indent=2, ensure_ascii=False)
+
+
+@tool("web_fetch", parse_docstring=True)
+def web_fetch_tool(url: str) -> str:
+    """Fetch the contents of a web page at a given URL.
+    Only fetch EXACT URLs that have been provided directly by the user or have been returned in results from the web_search and web_fetch tools.
+    This tool can NOT access content that requires authentication, such as private Google Docs or pages behind login walls.
+    Do NOT add www. to URLs that do NOT have them.
+    URLs must include the schema: https://example.com is a valid URL while example.com is an invalid URL.
+
+    Args:
+        url: The URL to fetch the contents of.
+    """
+    res = tavily_client.extract([url])
+    failed_results = res.get("failed_results")
+    if failed_results:
+        # Single quotes inside the f-string: reusing the outer double quotes
+        # is a syntax error on Python versions before 3.12.
+        return f"Error: {failed_results[0]['error']}"
+    results = res.get("results")
+    if results:
+        result = results[0]
+        # NOTE(review): unsure whether extract results always carry a
+        # "title"; fall back to the requested URL rather than raise KeyError.
+        title = result.get("title") or url
+        return f"# {title}\n\n{result['raw_content']}"
+    return "Error: No results found"
diff --git a/backend/src/utils/readability.py b/backend/src/utils/readability.py
new file mode 100644
index 0000000..8915098
--- /dev/null
+++ b/backend/src/utils/readability.py
@@ -0,0 +1,95 @@
+import re
+from urllib.parse import urljoin
+
+from markdownify import markdownify as md
+from readabilipy import simple_json_from_html_string
+
+
+class Article:
+    """A page title plus its main content as HTML, renderable as Markdown."""
+
+    url: str
+
+    def __init__(self, title: str, html_content: str, url: str = ""):
+        """Store the article parts.
+
+        Args:
+            title: Heading emitted by to_markdown().
+            html_content: Article body as HTML.
+            url: Base URL that to_message() resolves image links against.
+                Optional; with the default, image links are left unchanged.
+        """
+        self.title = title
+        self.html_content = html_content
+        # Must be assigned here: the class-level annotation alone creates no
+        # attribute, and to_message() reads self.url for every image.
+        self.url = url
+
+    def to_markdown(self, including_title: bool = True) -> str:
+        """Render the article as Markdown, optionally under a "# title" line."""
+        markdown = ""
+        if including_title:
+            markdown += f"# {self.title}\n\n"
+
+        if self.html_content is None or not str(self.html_content).strip():
+            markdown += "*No content available*\n"
+        else:
+            markdown += md(self.html_content)
+
+        return markdown
+
+    def to_message(self) -> list[dict]:
+        """Split the Markdown into ordered text and image_url message parts."""
+        image_pattern = r"!\[.*?\]\((.*?)\)"
+
+        content: list[dict] = []
+        markdown = self.to_markdown()
+
+        if not markdown or not markdown.strip():
+            return [{"type": "text", "text": "No content available"}]
+
+        # The pattern has one capture group, so re.split() alternates between
+        # plain text (even indexes) and captured image URLs (odd indexes).
+        parts = re.split(image_pattern, markdown)
+
+        for i, part in enumerate(parts):
+            if i % 2 == 1:
+                image_url = urljoin(self.url, part.strip())
+                content.append({"type": "image_url", "image_url": {"url": image_url}})
+            else:
+                text_part = part.strip()
+                if text_part:
+                    content.append({"type": "text", "text": text_part})
+
+        # If after processing all parts, content is still empty, provide a fallback message.
+        if not content:
+            content = [{"type": "text", "text": "No content available"}]
+
+        return content
+
+
+class ReadabilityExtractor:
+    """Extracts the main article from raw HTML using readabilipy."""
+
+    def extract_article(self, html: str, url: str = "") -> Article:
+        """Build an Article from raw HTML.
+
+        Args:
+            html: Raw HTML of the page.
+            url: Optional page URL, stored on the returned Article.
+
+        Returns:
+            An Article; a missing or blank title becomes "Untitled" and a
+            missing or blank body becomes a placeholder sentence.
+        """
+        article = simple_json_from_html_string(html, use_readability=True)
+
+        html_content = article.get("content")
+        if not html_content or not str(html_content).strip():
+            html_content = "No content could be extracted from this page"
+
+        title = article.get("title")
+        if not title or not str(title).strip():
+            title = "Untitled"
+
+        return Article(title=title, html_content=html_content, url=url)