Files
deer-flow/src/crawler/crawler.py
2025-04-17 11:34:42 +08:00

39 lines
1.2 KiB
Python

# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
import sys
from .article import Article
from .jina_client import JinaClient
from .readability_extractor import ReadabilityExtractor
class Crawler:
def crawl(self, url: str) -> Article:
# To help LLMs better understand content, we extract clean
# articles from HTML, convert them to markdown, and split
# them into text and image blocks for one single and unified
# LLM message.
#
# Jina is not the best crawler on readability, however it's
# much easier and free to use.
#
# Instead of using Jina's own markdown converter, we'll use
# our own solution to get better readability results.
jina_client = JinaClient()
html = jina_client.crawl(url, return_format="html")
extractor = ReadabilityExtractor()
article = extractor.extract_article(html)
article.url = url
return article
if __name__ == "__main__":
if len(sys.argv) == 2:
url = sys.argv[1]
else:
url = "https://fintel.io/zh-hant/s/br/nvdc34"
crawler = Crawler()
article = crawler.crawl(url)
print(article.to_markdown())