feat: lite deep researcher implementation

This commit is contained in:
He Tao
2025-04-07 16:25:55 +08:00
commit 03798ded08
58 changed files with 4242 additions and 0 deletions

7
src/crawler/__init__.py Normal file
View File

@@ -0,0 +1,7 @@
from .article import Article
from .crawler import Crawler
# Names exported by `from <package> import *`.
__all__ = ["Article", "Crawler"]

34
src/crawler/article.py Normal file
View File

@@ -0,0 +1,34 @@
import re
from urllib.parse import urljoin
from markdownify import markdownify as md
class Article:
    """An extracted web article: a title plus an HTML body.

    The article can be rendered as markdown (`to_markdown`) or as a list of
    text / image content blocks suitable for a single multimodal LLM message
    (`to_message`).
    """

    # URL the article came from; relative image links are resolved against
    # it. Defaults to "" so `to_message` does not raise AttributeError when
    # no URL has been assigned yet (links are then left as written).
    url: str = ""

    # Matches a markdown image `![alt](target)` and captures `target`.
    _IMAGE_PATTERN = re.compile(r"!\[.*?\]\((.*?)\)")

    def __init__(self, title: str, html_content: str):
        self.title = title
        self.html_content = html_content

    def to_markdown(self, including_title: bool = True) -> str:
        """Render the article as markdown.

        Args:
            including_title: When true, prefix the body with a level-1
                heading holding the title.

        Returns:
            The markdown text. A missing/empty title produces no heading and
            a missing/empty body is skipped instead of being handed to the
            HTML-to-markdown converter.
        """
        sections = []
        if including_title and self.title:
            sections.append(f"# {self.title}\n\n")
        if self.html_content:
            sections.append(md(self.html_content))
        return "".join(sections)

    def to_message(self) -> list[dict]:
        """Split the markdown into text and image blocks for an LLM message.

        Returns:
            A list of `{"type": "text", "text": ...}` and
            `{"type": "image_url", "image_url": {"url": ...}}` dicts in
            document order. Image URLs are joined with `self.url`. Text
            segments that are empty after stripping are omitted, so the
            list may be empty.
        """
        content: list[dict] = []
        # Splitting on a pattern with one capture group interleaves the
        # results: even indexes are the text between images, odd indexes
        # are the captured image targets.
        segments = self._IMAGE_PATTERN.split(self.to_markdown())
        for index, segment in enumerate(segments):
            segment = segment.strip()
            if index % 2 == 1:
                image_url = urljoin(self.url, segment)
                content.append({"type": "image_url", "image_url": {"url": image_url}})
            elif segment:
                content.append({"type": "text", "text": segment})
        return content

35
src/crawler/crawler.py Normal file
View File

@@ -0,0 +1,35 @@
import sys
from .article import Article
from .jina_client import JinaClient
from .readability_extractor import ReadabilityExtractor
class Crawler:
    """Fetches a web page and turns it into an `Article`."""

    def crawl(self, url: str) -> Article:
        """Download `url` as HTML, extract the article, and tag it with its URL.

        Args:
            url: Address of the page to crawl.

        Returns:
            The extracted article, with its `url` attribute set to `url`.
        """
        # Goal: give LLMs clean input. The page is reduced to its article,
        # which can later be converted to markdown and split into text and
        # image blocks for one unified LLM message.
        #
        # Jina is not the strongest crawler for readability, but it is
        # simple and free to use. Only its raw HTML is requested; the
        # markdown conversion is done by our own pipeline rather than
        # Jina's converter, for better readability results.
        raw_html = JinaClient().crawl(url, return_format="html")
        extracted = ReadabilityExtractor().extract_article(raw_html)
        extracted.url = url
        return extracted
if __name__ == "__main__":
    # Manual smoke test: crawl the URL passed as the only command-line
    # argument, or a sample page when no (or more than one) argument is given.
    url = sys.argv[1] if len(sys.argv) == 2 else "https://fintel.io/zh-hant/s/br/nvdc34"
    print(Crawler().crawl(url).to_markdown())

23
src/crawler/jina_client.py Normal file
View File

@@ -0,0 +1,23 @@
import logging
import os
import requests
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class JinaClient:
    """Minimal client for the Jina reader endpoint at https://r.jina.ai/."""

    def crawl(
        self, url: str, return_format: str = "html", *, timeout: float = 60
    ) -> str:
        """Fetch `url` through the Jina reader and return the response body.

        Args:
            url: Address of the page to fetch.
            return_format: Value sent in the `X-Return-Format` header.
            timeout: Seconds to wait for the service before giving up, so a
                stalled connection cannot block the caller forever.

        Returns:
            The response body as text. The body is returned even for
            non-success status codes; those are only logged.

        Raises:
            requests.exceptions.RequestException: On connection failures or
                when the timeout expires.
        """
        headers = {
            "Content-Type": "application/json",
            "X-Return-Format": return_format,
        }
        # Read the key once so the check and the header use the same value.
        api_key = os.getenv("JINA_API_KEY")
        if api_key:
            headers["Authorization"] = f"Bearer {api_key}"
        else:
            logger.warning(
                "Jina API key is not set. Provide your own key to access a higher rate limit. See https://jina.ai/reader for more information."
            )
        response = requests.post(
            "https://r.jina.ai/",
            headers=headers,
            json={"url": url},
            timeout=timeout,
        )
        if not response.ok:
            # Keep returning the body for backward compatibility, but make
            # the failure visible instead of silently passing it on.
            logger.warning(
                "Jina reader returned HTTP %s for %s", response.status_code, url
            )
        return response.text

12
src/crawler/readability_extractor.py Normal file
View File

@@ -0,0 +1,12 @@
from readabilipy import simple_json_from_html_string
from .article import Article
class ReadabilityExtractor:
    """Builds an `Article` from raw HTML using readabilipy."""

    def extract_article(self, html: str) -> Article:
        """Extract the title and main content of an HTML page.

        Args:
            html: Raw HTML of the page.

        Returns:
            An `Article` built from the "title" and "content" entries of
            the readabilipy result.
        """
        extracted = simple_json_from_html_string(html, use_readability=True)
        # `.get` yields None when an entry is missing (and presumably when
        # readabilipy could not extract it); `Article` declares both fields
        # as `str`, so fall back to empty strings rather than passing None.
        return Article(
            title=extracted.get("title") or "",
            html_content=extracted.get("content") or "",
        )