mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-26 23:34:47 +08:00
feat: lite deep researcher implementation
This commit is contained in:
35
src/crawler/crawler.py
Normal file
35
src/crawler/crawler.py
Normal file
@@ -0,0 +1,35 @@
|
||||
import sys
|
||||
|
||||
from .article import Article
|
||||
from .jina_client import JinaClient
|
||||
from .readability_extractor import ReadabilityExtractor
|
||||
|
||||
|
||||
class Crawler:
    """Fetch a web page and turn it into an ``Article``."""

    def crawl(self, url: str) -> Article:
        """Download *url* as HTML and extract a readable article from it.

        Why this pipeline: LLMs digest content better when it is a clean
        article rather than raw page markup, so the page is fetched as
        HTML and run through our own readability extraction.

        Jina is not the strongest crawler for readability, but it is
        free and simple to use. Its built-in markdown conversion is
        deliberately bypassed (``return_format="html"``) in favour of our
        own extractor, which yields more readable results.

        Args:
            url: Address of the page to crawl.

        Returns:
            The extracted article, with its ``url`` attribute set to
            *url*.
        """
        # Ask Jina for raw HTML rather than its own markdown rendering.
        raw_html = JinaClient().crawl(url, return_format="html")

        # Run our own readability pass over the fetched markup.
        result = ReadabilityExtractor().extract_article(raw_html)

        # Record where the article came from before handing it back.
        result.url = url
        return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: crawl the URL passed as the sole command-line
    # argument (any other argument count falls back to a sample page)
    # and print the resulting article as markdown.
    target = (
        sys.argv[1]
        if len(sys.argv) == 2
        else "https://fintel.io/zh-hant/s/br/nvdc34"
    )
    print(Crawler().crawl(target).to_markdown())
|
||||
Reference in New Issue
Block a user