feat: lite deep researcher implementation

2026-04-18 12:04:45 +08:00 · 2025-04-07 16:25:55 +08:00
commit 03798ded08
58 changed files with 4242 additions and 0 deletions
--- a/src/crawler/init.py
+++ b/src/crawler/init.py
@@ -0,0 +1,7 @@
+# Public interface of the crawler package.
+
+from .article import Article
+from .crawler import Crawler
+
+
+__all__ = ["Article", "Crawler"]
--- a/src/crawler/article.py
+++ b/src/crawler/article.py
@@ -0,0 +1,69 @@
+import re
+from urllib.parse import urljoin
+
+from markdownify import markdownify as md
+
+
+class Article:
+    """A readable article extracted from a web page.
+
+    Attributes:
+        title: Article title; may be empty or ``None`` when none was found.
+        html_content: Article body as an HTML fragment.
+        url: Address the article was fetched from. It is used as the base
+            for resolving relative image links and defaults to ``""`` until
+            the caller assigns it.
+    """
+
+    url: str = ""
+
+    def __init__(self, title: str, html_content: str):
+        self.title = title
+        self.html_content = html_content
+
+    def to_markdown(self, including_title: bool = True) -> str:
+        """Render the article as markdown.
+
+        Args:
+            including_title: Prepend the title as a level-1 heading. The
+                heading is skipped when the title is empty or ``None``.
+
+        Returns:
+            The markdown text; an empty body is used when ``html_content``
+            is empty or ``None``.
+        """
+        markdown = ""
+        if including_title and self.title:
+            markdown += f"# {self.title}\n\n"
+        # Guard against a missing body so the converter never receives None.
+        markdown += md(self.html_content or "")
+        return markdown
+
+    def to_message(self) -> list[dict]:
+        """Split the markdown into text and image blocks for an LLM message.
+
+        Returns:
+            ``{"type": "text", ...}`` and ``{"type": "image_url", ...}``
+            dicts in document order. Relative image links are resolved
+            against ``self.url``; blank text segments and empty image links
+            are dropped.
+        """
+        image_pattern = r"!\[.*?\]\((.*?)\)"
+
+        content: list[dict] = []
+        # The pattern has one capture group, so re.split alternates text
+        # (even indices) with the captured image URLs (odd indices).
+        parts = re.split(image_pattern, self.to_markdown())
+
+        for i, part in enumerate(parts):
+            segment = part.strip()
+            if not segment:
+                # Empty blocks carry no information; skip them.
+                continue
+            if i % 2 == 1:
+                image_url = urljoin(self.url, segment)
+                content.append({"type": "image_url", "image_url": {"url": image_url}})
+            else:
+                content.append({"type": "text", "text": segment})
+
+        return content
--- a/src/crawler/crawler.py
+++ b/src/crawler/crawler.py
@@ -0,0 +1,35 @@
+import sys
+
+from .article import Article
+from .jina_client import JinaClient
+from .readability_extractor import ReadabilityExtractor
+
+
+class Crawler:
+    """Fetch a web page and turn it into a readable ``Article``."""
+
+    def crawl(self, url: str) -> Article:
+        """Download ``url`` via Jina and extract its main article.
+
+        The page is requested as raw HTML rather than Jina's own
+        markdown so that readability extraction and markdown
+        conversion are done by this project, which yields cleaner
+        text and image blocks for a single unified LLM message.
+        Jina is not the best crawler for readability, but it is easy
+        and free to use.
+        """
+        raw_html = JinaClient().crawl(url, return_format="html")
+        extracted = ReadabilityExtractor().extract_article(raw_html)
+        # Remember the source URL so relative image links can be resolved.
+        extracted.url = url
+        return extracted
+
+
+if __name__ == "__main__":
+    # Crawl the URL passed as the sole CLI argument, falling back to a
+    # sample page, and print the extracted article as markdown.
+    sample_url = "https://fintel.io/zh-hant/s/br/nvdc34"
+    url = sys.argv[1] if len(sys.argv) == 2 else sample_url
+    crawler = Crawler()
+    article = crawler.crawl(url)
+    print(article.to_markdown())
--- a/src/crawler/jina_client.py
+++ b/src/crawler/jina_client.py
@@ -0,0 +1,51 @@
+import logging
+import os
+
+import requests
+
+logger = logging.getLogger(__name__)
+
+
+class JinaClient:
+    """Minimal client for the Jina Reader endpoint at https://r.jina.ai/."""
+
+    def crawl(
+        self, url: str, return_format: str = "html", timeout: float = 30.0
+    ) -> str:
+        """Fetch ``url`` through Jina Reader and return the response body.
+
+        Args:
+            url: Address of the page to crawl.
+            return_format: Value sent in the ``X-Return-Format`` header.
+            timeout: Seconds to wait for the server before giving up, so a
+                stalled connection cannot block the caller forever.
+
+        Returns:
+            The response body as text, whatever the HTTP status code.
+
+        Raises:
+            requests.RequestException: If the request fails or times out.
+        """
+        headers = {
+            "Content-Type": "application/json",
+            "X-Return-Format": return_format,
+        }
+        # Read the key once so the check and the header use the same value.
+        api_key = os.getenv("JINA_API_KEY")
+        if api_key:
+            headers["Authorization"] = f"Bearer {api_key}"
+        else:
+            logger.warning(
+                "Jina API key is not set. Provide your own key to access a higher rate limit. See https://jina.ai/reader for more information."
+            )
+        data = {"url": url}
+        response = requests.post(
+            "https://r.jina.ai/", headers=headers, json=data, timeout=timeout
+        )
+        if not response.ok:
+            # Surface HTTP errors in the log; the body is still returned so
+            # existing callers keep receiving a string.
+            logger.warning(
+                "Jina returned HTTP %s for %s", response.status_code, url
+            )
+        return response.text
--- a/src/crawler/readability_extractor.py
+++ b/src/crawler/readability_extractor.py
@@ -0,0 +1,26 @@
+from readabilipy import simple_json_from_html_string
+
+from .article import Article
+
+
+class ReadabilityExtractor:
+    """Extract the main article from raw HTML using readabilipy."""
+
+    def extract_article(self, html: str) -> Article:
+        """Run readability extraction on ``html`` and wrap the result.
+
+        Args:
+            html: Raw HTML of the page.
+
+        Returns:
+            An ``Article`` whose title and HTML content are always strings;
+            a field that is missing or ``None`` in the extraction result
+            becomes ``""``.
+        """
+        article = simple_json_from_html_string(html, use_readability=True)
+        # The result may lack a field or hold None for it; normalise so
+        # Article never receives None where it expects a string.
+        return Article(
+            title=article.get("title") or "",
+            html_content=article.get("content") or "",
+        )