feat: lite deep researcher implementation

2026-04-26 23:34:47 +08:00 · 2025-04-07 16:25:55 +08:00
commit 03798ded08
58 changed files with 4242 additions and 0 deletions
--- a/src/crawler/crawler.py
+++ b/src/crawler/crawler.py
@@ -0,0 +1,35 @@
+import sys
+
+from .article import Article
+from .jina_client import JinaClient
+from .readability_extractor import ReadabilityExtractor
+
+
+class Crawler:
+    """Fetch a web page and reduce it to a clean, readable article."""
+
+    def crawl(self, url: str) -> Article:
+        """Download ``url`` as HTML and return the extracted ``Article``.
+
+        The page is requested from Jina in HTML form (not Jina's own
+        markdown), then run through our readability extractor, which
+        gives better results for feeding content to an LLM. Jina is
+        not the best crawler for readability, but it is easy to use
+        and free.
+
+        The returned article has its ``url`` attribute set to ``url``.
+        """
+        raw_html = JinaClient().crawl(url, return_format="html")
+        extracted = ReadabilityExtractor().extract_article(raw_html)
+        extracted.url = url
+        return extracted
+
+
+if __name__ == "__main__":
+    # Manual smoke test: crawl the URL given as the sole command-line
+    # argument, or a sample page otherwise, and print it as markdown.
+    default_url = "https://fintel.io/zh-hant/s/br/nvdc34"
+    has_single_arg = len(sys.argv) == 2
+    target = sys.argv[1] if has_single_arg else default_url
+    result = Crawler().crawl(target)
+    print(result.to_markdown())