# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

import pytest

import src.crawler as crawler_module
from src.crawler import Crawler


def test_crawler_sets_article_url(monkeypatch):
    """Check that ``crawl`` returns an article whose ``url`` is the crawled URL.

    ``JinaClient`` and ``ReadabilityExtractor`` are replaced with in-memory
    stubs, so the test only observes what the crawler does with the article
    object handed back by the extractor.
    """

    class StubArticle:
        def __init__(self):
            # Starts unset; the assertion below expects the crawler to fill it.
            self.url = None

        def to_markdown(self):
            return "# Dummy"

    class StubJinaClient:
        def crawl(self, url, return_format=None):
            return "dummy"

    class StubReadabilityExtractor:
        def extract_article(self, html):
            return StubArticle()

    monkeypatch.setattr("src.crawler.crawler.JinaClient", StubJinaClient)
    monkeypatch.setattr(
        "src.crawler.crawler.ReadabilityExtractor", StubReadabilityExtractor
    )

    target_url = "http://example.com"
    result = crawler_module.Crawler().crawl(target_url)

    assert result.url == target_url
    assert result.to_markdown() == "# Dummy"


def test_crawler_calls_dependencies(monkeypatch):
    """Check that ``crawl`` drives both collaborators with the expected arguments.

    The stubs record what they receive: the Jina client must be called with
    the requested URL and the ``"html"`` return format, and the extractor must
    be given exactly what the Jina client returned.
    """
    recorded = {}

    class StubArticle:
        url = None

        def to_markdown(self):
            return "# Dummy"

    class RecordingJinaClient:
        def crawl(self, url, return_format=None):
            recorded["jina"] = (url, return_format)
            return "dummy"

    class RecordingReadabilityExtractor:
        def extract_article(self, html):
            recorded["extractor"] = html
            return StubArticle()

    monkeypatch.setattr("src.crawler.crawler.JinaClient", RecordingJinaClient)
    monkeypatch.setattr(
        "src.crawler.crawler.ReadabilityExtractor", RecordingReadabilityExtractor
    )

    target_url = "http://example.com"
    crawler_module.Crawler().crawl(target_url)

    # The Jina client must have been invoked with the URL and HTML format.
    assert "jina" in recorded
    seen_url, seen_format = recorded["jina"]
    assert seen_url == target_url
    assert seen_format == "html"

    # The extractor must have received the payload the Jina stub returned.
    assert "extractor" in recorded
    assert recorded["extractor"] == "dummy"