test: add unit tests of crawler (#292)

* test: add unit tests of crawler * test: polish the code of crawler unit tests
2026-04-09 16:54:46 +08:00 · 2025-06-07 21:51:05 +08:00
parent 0e22c373af
commit c6ed423021
4 changed files with 149 additions and 14 deletions
--- a/src/crawler/init.py
+++ b/src/crawler/init.py
@@ -3,8 +3,7 @@

 from .article import Article
 from .crawler import Crawler
+from .jina_client import JinaClient
+from .readability_extractor import ReadabilityExtractor

-__all__ = [
-    "Article",
-    "Crawler",
-]
+__all__ = ["Article", "Crawler", "JinaClient", "ReadabilityExtractor"]
--- a/src/crawler/crawler.py
+++ b/src/crawler/crawler.py
@@ -26,13 +26,3 @@ class Crawler:
        article = extractor.extract_article(html)
        article.url = url
        return article
-
-
-if __name__ == "__main__":
-    if len(sys.argv) == 2:
-        url = sys.argv[1]
-    else:
-        url = "https://fintel.io/zh-hant/s/br/nvdc34"
-    crawler = Crawler()
-    article = crawler.crawl(url)
-    print(article.to_markdown())
--- a/tests/unit/crawler/test_article.py
+++ b/tests/unit/crawler/test_article.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
+# SPDX-License-Identifier: MIT
+import pytest
+from src.crawler.article import Article
+
+
+class DummyMarkdownify:
+    """Identity markdownify stand-in for patching; unused by the tests in this file."""
+
+    @staticmethod
+    def markdownify(html):
+        return html  # identity: the HTML is handed back unconverted
+
+
+def test_to_markdown_includes_title():
+    """With including_title=True the markdown must open with the H1 title line."""
+    article = Article("Test Title", "<p>Hello <b>world</b>!</p>")
+    result = article.to_markdown(including_title=True)
+    assert result.startswith("# Test Title") and "Hello" in result
+
+
+def test_to_markdown_excludes_title():
+    """With including_title=False the output must not start with the H1 title."""
+    doc = Article("Test Title", "<p>Hello <b>world</b>!</p>")
+    markdown = doc.to_markdown(including_title=False)
+    assert "Hello" in markdown and not markdown.startswith("# Test Title")
+
+
+def test_to_message_with_text_only():
+    """Image-free HTML must yield a list of typed items including a text item."""
+    doc = Article("Test Title", "<p>Hello world!</p>")
+    doc.url = "https://example.com/"
+    parts = doc.to_message()
+    assert isinstance(parts, list) and all("type" in part for part in parts)
+    assert "text" in [part["type"] for part in parts]
+
+
+def test_to_message_with_image():
+    """A relative <img> src must become an image_url joined onto article.url."""
+    html = '<p>Intro</p><img src="img/pic.png"/>'
+    article = Article("Title", html)
+    article.url = "https://host.com/path/"
+    result = article.to_message()
+    # The message must carry both a text item and an image_url item.
+    types = [item["type"] for item in result]
+    assert "image_url" in types
+    assert "text" in types
+    # The relative src is expected to be resolved against the article URL.
+    image_items = [item for item in result if item["type"] == "image_url"]
+    assert image_items
+    assert image_items[0]["image_url"]["url"] == "https://host.com/path/img/pic.png"
+
+
+def test_to_message_multiple_images():
+    """Both <img> sources must appear as image_url items, with text kept as text."""
+    html = '<p>Start</p><img src="a.png"/><p>Mid</p><img src="b.jpg"/>End'
+    doc = Article("Title", html)
+    doc.url = "http://x/"
+    parts = doc.to_message()
+    # Every image source is expected joined onto the article URL.
+    urls = {part["image_url"]["url"] for part in parts if part["type"] == "image_url"}
+    assert {"http://x/a.png", "http://x/b.jpg"} <= urls
+    # The paragraphs around the images must survive in the text items.
+    text_parts = [part for part in parts if part["type"] == "text"]
+    for word in ("Start", "Mid"):
+        assert any(word in part["text"] for part in text_parts)
+
+
+def test_to_message_handles_empty_html():
+    """Empty HTML must still produce a list whose first item is a text item."""
+    doc = Article("Empty", "")
+    doc.url = "http://test/"
+    parts = doc.to_message()
+    assert isinstance(parts, list) and parts[0]["type"] == "text"
--- a/tests/unit/crawler/test_crawler_class.py
+++ b/tests/unit/crawler/test_crawler_class.py
@@ -0,0 +1,72 @@
+# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
+# SPDX-License-Identifier: MIT
+
+import pytest
+import src.crawler as crawler_module
+from src.crawler import Crawler
+
+
+def test_crawler_sets_article_url(monkeypatch):
+    """Check that crawl() stamps the requested URL onto the returned article."""
+
+    class FakeArticle:
+        def __init__(self):
+            self.url = None
+
+        def to_markdown(self):
+            return "# Dummy"
+
+    class FakeJinaClient:
+        def crawl(self, url, return_format=None):
+            return "<html>dummy</html>"
+
+    class FakeExtractor:
+        def extract_article(self, html):
+            return FakeArticle()
+
+    # Replace the collaborator names in src.crawler.crawler with the fakes.
+    monkeypatch.setattr("src.crawler.crawler.JinaClient", FakeJinaClient)
+    monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", FakeExtractor)
+
+    target_url = "http://example.com"
+    crawler = crawler_module.Crawler()
+    crawled = crawler.crawl(target_url)
+    # The fake article starts with url=None, so a match proves crawl() set it.
+    assert crawled.url == target_url
+    assert crawled.to_markdown() == "# Dummy"
+
+
+def test_crawler_calls_dependencies(monkeypatch):
+    """Check that crawl() asks JinaClient for HTML and feeds it to the extractor."""
+    recorded = {}
+
+    class FakeArticle:
+        url = None
+
+        def to_markdown(self):
+            return "# Dummy"
+
+    class FakeJinaClient:
+        def crawl(self, url, return_format=None):
+            # Capture both arguments so the requested format can be checked.
+            recorded["jina"] = (url, return_format)
+            return "<html>dummy</html>"
+
+    class FakeExtractor:
+        def extract_article(self, html):
+            recorded["extractor"] = html
+            return FakeArticle()
+
+    # Replace the collaborator names in src.crawler.crawler with the fakes.
+    monkeypatch.setattr("src.crawler.crawler.JinaClient", FakeJinaClient)
+    monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", FakeExtractor)
+
+    target_url = "http://example.com"
+    crawler = crawler_module.Crawler()
+    crawler.crawl(target_url)
+
+    assert "jina" in recorded and "extractor" in recorded
+    # JinaClient must receive the URL and be asked for the "html" format.
+    assert recorded["jina"] == (target_url, "html")
+    # The extractor must be handed exactly what JinaClient returned.
+    assert recorded["extractor"] == "<html>dummy</html>"