# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

import sys

from .article import Article
from .jina_client import JinaClient
from .readability_extractor import ReadabilityExtractor


class Crawler:
    """Fetch a web page and turn it into an ``Article``."""

    def crawl(self, url: str) -> Article:
        """Crawl *url* and return the extracted article.

        The page is requested from ``JinaClient`` in HTML form, handed to
        ``ReadabilityExtractor.extract_article``, and the resulting article
        is tagged with the source *url* before being returned.

        Args:
            url: Address of the page to crawl.

        Returns:
            The article produced by the extractor, with its ``url``
            attribute set to *url*.
        """
        # Why this pipeline: to help LLMs better understand content, clean
        # articles are extracted from HTML, converted to markdown, and split
        # into text and image blocks for one single, unified LLM message.
        #
        # Jina is not the best crawler for readability, but it is much
        # easier to use and free.
        #
        # HTML (rather than Jina's own markdown conversion) is requested so
        # that our own extraction gives better readability results.
        raw_html = JinaClient().crawl(url, return_format="html")

        extracted = ReadabilityExtractor().extract_article(raw_html)
        extracted.url = url
        return extracted


if __name__ == "__main__":
    # Use the single command-line argument as the URL when exactly one is
    # given; otherwise fall back to the built-in sample page.
    url = (
        sys.argv[1]
        if len(sys.argv) == 2
        else "https://fintel.io/zh-hant/s/br/nvdc34"
    )
    crawler = Crawler()
    article = crawler.crawl(url)
    print(article.to_markdown())