test: add unit tests of crawler (#292)

* test: add unit tests of crawler * test: polish the code of crawler unit tests
2026-04-27 15:54:48 +08:00 · 2025-06-07 21:51:05 +08:00
parent 0e22c373af
commit c6ed423021
4 changed files with 149 additions and 14 deletions
--- a/src/crawler/__init__.py
+++ b/src/crawler/__init__.py
@@ -3,8 +3,7 @@
 from .article import Article
 from .crawler import Crawler
 from .jina_client import JinaClient
 from .readability_extractor import ReadabilityExtractor
-__all__ = [
+__all__ = ["Article", "Crawler", "JinaClient", "ReadabilityExtractor"]
    "Article",
    "Crawler",
 ]
--- a/src/crawler/crawler.py
+++ b/src/crawler/crawler.py
@@ -26,13 +26,3 @@ class Crawler:
        article = extractor.extract_article(html)
        article.url = url
        return article
 if __name__ == "__main__":
    # Use the single command-line argument as the URL when exactly one is
    # given; otherwise fall back to the built-in sample page.
    url = (
        sys.argv[1]
        if len(sys.argv) == 2
        else "https://fintel.io/zh-hant/s/br/nvdc34"
    )
    # Crawl the page and print the extracted article as Markdown.
    crawler = Crawler()
    article = crawler.crawl(url)
    print(article.to_markdown())
--- a/tests/unit/crawler/test_article.py
+++ b/tests/unit/crawler/test_article.py
@@ -0,0 +1,74 @@
 # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
 # SPDX-License-Identifier: MIT
 import pytest
 from src.crawler.article import Article
 class DummyMarkdownify:
    """Pass-through stand-in that tests can patch in for markdownify."""

    @staticmethod
    def markdownify(html):
        """Return ``html`` unchanged, performing no conversion."""
        return html
 def test_to_markdown_includes_title(monkeypatch):
    """Markdown output begins with the title heading when the title is requested."""
    markdown = Article("Test Title", "<p>Hello <b>world</b>!</p>").to_markdown(
        including_title=True
    )
    # The heading must come first and the body text must still be present.
    assert markdown.startswith("# Test Title")
    assert "Hello" in markdown
 def test_to_markdown_excludes_title():
    """Markdown output does not begin with the title heading when it is disabled."""
    markdown = Article("Test Title", "<p>Hello <b>world</b>!</p>").to_markdown(
        including_title=False
    )
    # No leading heading, but the body text must still be present.
    assert not markdown.startswith("# Test Title")
    assert "Hello" in markdown
 def test_to_message_with_text_only():
    """An image-free article yields a list of typed items with at least one text item."""
    doc = Article("Test Title", "<p>Hello world!</p>")
    doc.url = "https://example.com/"
    message = doc.to_message()
    assert isinstance(message, list)
    # At least one entry is text, and every entry carries a "type" key.
    assert any(entry["type"] == "text" for entry in message)
    assert all("type" in entry for entry in message)
 def test_to_message_with_image(monkeypatch):
    """An article with one relative image yields text plus an absolute image URL."""
    doc = Article("Title", '<p>Intro</p><img src="img/pic.png"/>')
    doc.url = "https://host.com/path/"
    message = doc.to_message()
    # Both kinds of entry must be present in the message.
    kinds = [entry["type"] for entry in message]
    assert "image_url" in kinds
    assert "text" in kinds
    # The relative src is expected to be resolved against the article URL.
    pictures = [entry for entry in message if entry["type"] == "image_url"]
    assert pictures
    first_picture = pictures[0]
    assert first_picture["image_url"]["url"] == "https://host.com/path/img/pic.png"
 def test_to_message_multiple_images():
    """Two images and the text around them all show up in the message."""
    doc = Article(
        "Title", '<p>Start</p><img src="a.png"/><p>Mid</p><img src="b.jpg"/>End'
    )
    doc.url = "http://x/"
    message = doc.to_message()
    # Every image entry should carry its src joined onto the article URL.
    found_urls = [
        entry["image_url"]["url"] for entry in message if entry["type"] == "image_url"
    ]
    assert "http://x/a.png" in found_urls
    assert "http://x/b.jpg" in found_urls
    # The text before and between the images must survive in text entries.
    text_parts = [entry for entry in message if entry["type"] == "text"]
    assert any("Start" in part["text"] for part in text_parts)
    assert any("Mid" in part["text"] for part in text_parts)
 def test_to_message_handles_empty_html():
    """An article with empty HTML still yields a list whose first item is text."""
    doc = Article("Empty", "")
    doc.url = "http://test/"
    message = doc.to_message()
    assert isinstance(message, list)
    leading_entry = message[0]
    assert leading_entry["type"] == "text"
--- a/tests/unit/crawler/test_crawler_class.py
+++ b/tests/unit/crawler/test_crawler_class.py
@@ -0,0 +1,72 @@
 # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
 # SPDX-License-Identifier: MIT
 import pytest
 import src.crawler as crawler_module
 from src.crawler import Crawler
 def test_crawler_sets_article_url(monkeypatch):
    """Verify that Crawler.crawl stores the requested URL on the returned article."""

    class DummyArticle:
        """Article stand-in exposing only what this test inspects."""

        def __init__(self):
            self.url = None

        def to_markdown(self):
            return "# Dummy"

    class DummyJinaClient:
        """Client stand-in that returns fixed HTML without any network access."""

        def crawl(self, url, return_format=None):
            return "<html>dummy</html>"

    class DummyReadabilityExtractor:
        """Extractor stand-in that always produces a fresh DummyArticle."""

        def extract_article(self, html):
            return DummyArticle()

    # Replace the collaborators looked up inside src.crawler.crawler.
    stand_ins = {
        "src.crawler.crawler.JinaClient": DummyJinaClient,
        "src.crawler.crawler.ReadabilityExtractor": DummyReadabilityExtractor,
    }
    for target, replacement in stand_ins.items():
        monkeypatch.setattr(target, replacement)

    requested_url = "http://example.com"
    fetched = crawler_module.Crawler().crawl(requested_url)
    assert fetched.url == requested_url
    assert fetched.to_markdown() == "# Dummy"
 def test_crawler_calls_dependencies(monkeypatch):
    """Verify that Crawler.crawl invokes the client and the extractor as expected."""
    # Filled in by the stand-ins below so the arguments can be checked later.
    recorded = {}

    class DummyArticle:
        """Article stand-in with a writable ``url`` attribute."""

        url = None

        def to_markdown(self):
            return "# Dummy"

    class DummyJinaClient:
        """Client stand-in that records its arguments and returns fixed HTML."""

        def crawl(self, url, return_format=None):
            recorded["jina"] = (url, return_format)
            return "<html>dummy</html>"

    class DummyReadabilityExtractor:
        """Extractor stand-in that records the HTML it was handed."""

        def extract_article(self, html):
            recorded["extractor"] = html
            return DummyArticle()

    # Replace the collaborators looked up inside src.crawler.crawler.
    stand_ins = {
        "src.crawler.crawler.JinaClient": DummyJinaClient,
        "src.crawler.crawler.ReadabilityExtractor": DummyReadabilityExtractor,
    }
    for target, replacement in stand_ins.items():
        monkeypatch.setattr(target, replacement)

    requested_url = "http://example.com"
    crawler_module.Crawler().crawl(requested_url)

    # The client must have been called with the URL and the "html" format.
    assert "jina" in recorded
    assert recorded["jina"][0] == requested_url
    assert recorded["jina"][1] == "html"
    # The extractor must have received exactly what the client returned.
    assert "extractor" in recorded
    assert recorded["extractor"] == "<html>dummy</html>"