mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-10 17:24:45 +08:00
39 lines
1.2 KiB
Python
39 lines
1.2 KiB
Python
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
|
# SPDX-License-Identifier: MIT
|
|
|
|
import sys
|
|
|
|
from .article import Article
|
|
from .jina_client import JinaClient
|
|
from .readability_extractor import ReadabilityExtractor
|
|
|
|
|
|
class Crawler:
    """Fetch a web page and turn it into an ``Article``."""

    def crawl(self, url: str) -> Article:
        """Crawl ``url`` and return the article extracted from its HTML.

        The page is requested from ``JinaClient`` in ``"html"`` format, the
        result is handed to ``ReadabilityExtractor.extract_article``, and the
        extracted article gets its ``url`` attribute set to ``url`` before it
        is returned.
        """
        # Why this pipeline: LLMs digest content better when a clean article
        # is pulled out of the HTML, converted to markdown, and split into
        # text and image blocks that form one unified LLM message.
        #
        # Jina is not the strongest crawler for readability, but it is much
        # easier to use and free.
        #
        # Jina's own markdown converter is deliberately bypassed: only the
        # HTML is requested, and our own extractor is applied to get better
        # readability results.
        raw_html = JinaClient().crawl(url, return_format="html")
        extracted = ReadabilityExtractor().extract_article(raw_html)
        extracted.url = url
        return extracted
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: crawl the URL given as the sole command-line
    # argument, or a built-in sample page when the argument count differs,
    # then print the article as markdown.
    sample_url = "https://fintel.io/zh-hant/s/br/nvdc34"
    target_url = sys.argv[1] if len(sys.argv) == 2 else sample_url
    print(Crawler().crawl(target_url).to_markdown())
|