From c6ed423021cf3e27b1c5025f65bdd9becb7d9df6 Mon Sep 17 00:00:00 2001
From: Willem Jiang <willem.jiang@gmail.com>
Date: Sat, 7 Jun 2025 21:51:05 +0800
Subject: [PATCH] test: add unit tests of crawler (#292)

* test: add unit tests of crawler

* test: polish the code of crawler unit tests
---
 src/crawler/__init__.py                  |  7 +--
 src/crawler/crawler.py                   | 10 ----
 tests/unit/crawler/test_article.py       | 74 ++++++++++++++++++++++++
 tests/unit/crawler/test_crawler_class.py | 72 +++++++++++++++++++++++
 4 files changed, 149 insertions(+), 14 deletions(-)
 create mode 100644 tests/unit/crawler/test_article.py
 create mode 100644 tests/unit/crawler/test_crawler_class.py
diff --git a/src/crawler/__init__.py b/src/crawler/__init__.py
index 4f6a6e7..0747da2 100644
--- a/src/crawler/__init__.py
+++ b/src/crawler/__init__.py
@@ -3,8 +3,7 @@
 
 from .article import Article
 from .crawler import Crawler
+from .jina_client import JinaClient
+from .readability_extractor import ReadabilityExtractor
 
-__all__ = [
-    "Article",
-    "Crawler",
-]
+__all__ = ["Article", "Crawler", "JinaClient", "ReadabilityExtractor"]
diff --git a/src/crawler/crawler.py b/src/crawler/crawler.py
index fe7ddfc..9f3632d 100644
--- a/src/crawler/crawler.py
+++ b/src/crawler/crawler.py
@@ -26,13 +26,3 @@ class Crawler:
         article = extractor.extract_article(html)
         article.url = url
         return article
-
-
-if __name__ == "__main__":
-    if len(sys.argv) == 2:
-        url = sys.argv[1]
-    else:
-        url = "https://fintel.io/zh-hant/s/br/nvdc34"
-    crawler = Crawler()
-    article = crawler.crawl(url)
-    print(article.to_markdown())
diff --git a/tests/unit/crawler/test_article.py b/tests/unit/crawler/test_article.py
new file mode 100644
index 0000000..b2aa918
--- /dev/null
+++ b/tests/unit/crawler/test_article.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
+# SPDX-License-Identifier: MIT
+import pytest
+from src.crawler.article import Article
+
+
+class DummyMarkdownify:  # NOTE(review): not referenced by any test in this file
+    """A dummy markdownify replacement for patching if needed."""

+    @staticmethod
+    def markdownify(html):
+        return html  # identity: the HTML is handed back without any conversion
+
+
+def test_to_markdown_includes_title():
+    article = Article("Test Title", "<p>Hello <b>world</b>!</p>")
+    result = article.to_markdown(including_title=True)
+    assert result.startswith("# Test Title")  # title rendered as a leading H1
+    assert "Hello" in result  # body text survives the conversion
+
+
+def test_to_markdown_excludes_title():
+    page = Article("Test Title", "<p>Hello <b>world</b>!</p>")
+    markdown = page.to_markdown(including_title=False)
+    assert "Hello" in markdown  # body text is still converted
+    assert not markdown.startswith("# Test Title")  # no H1 title line
+
+
+def test_to_message_with_text_only():
+    article = Article("Test Title", "<p>Hello world!</p>")
+    article.url = "https://example.com/"
+    result = article.to_message()
+    assert isinstance(result, list)
+    assert all("type" in item for item in result)  # validate shape before indexing
+    assert any(item["type"] == "text" for item in result)
+
+
+def test_to_message_with_image():
+    html = '<p>Intro</p><img src="img/pic.png"/>'
+    article = Article("Title", html)
+    article.url = "https://host.com/path/"
+    # The relative <img> src is expected to come back resolved against article.url.
+    result = article.to_message()
+    # Both a text part and an image_url part must be present.
+    types = [item["type"] for item in result]
+    assert "image_url" in types
+    assert "text" in types
+    # The image URL must be the article URL joined with the relative src.
+    image_items = [item for item in result if item["type"] == "image_url"]
+    assert image_items
+    assert image_items[0]["image_url"]["url"] == "https://host.com/path/img/pic.png"
+
+
+def test_to_message_multiple_images():
+    """Both images become absolute image_url parts; surrounding text is kept."""
+    html = '<p>Start</p><img src="a.png"/><p>Mid</p><img src="b.jpg"/>End'
+    article = Article("Title", html)
+    article.url = "http://x/"
+    parts = article.to_message()
+    urls = [p["image_url"]["url"] for p in parts if p["type"] == "image_url"]
+    for expected in ("http://x/a.png", "http://x/b.jpg"):
+        assert expected in urls
+    text_parts = [p for p in parts if p["type"] == "text"]
+    # "End" follows the last image but is not checked here.
+    for needle in ("Start", "Mid"):
+        assert any(needle in p["text"] for p in text_parts)
+
+
+def test_to_message_handles_empty_html():
+    blank = Article("Empty", "")
+    blank.url = "http://test/"
+    parts = blank.to_message()
+    assert isinstance(parts, list)
+    assert parts[0]["type"] == "text"  # first part is text even with no HTML
diff --git a/tests/unit/crawler/test_crawler_class.py b/tests/unit/crawler/test_crawler_class.py
new file mode 100644
index 0000000..eba2148
--- /dev/null
+++ b/tests/unit/crawler/test_crawler_class.py
@@ -0,0 +1,72 @@
+# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
+# SPDX-License-Identifier: MIT
+
+import pytest
+import src.crawler as crawler_module
+from src.crawler import Crawler
+
+
+def test_crawler_sets_article_url(monkeypatch):
+    """Crawler.crawl must stamp the requested URL onto the returned article."""

+    class StubArticle:
+        def __init__(self):
+            self.url = None

+        def to_markdown(self):
+            return "# Dummy"

+    class StubJinaClient:
+        def crawl(self, url, return_format=None):
+            return "<html>dummy</html>"

+    class StubExtractor:
+        def extract_article(self, html):
+            return StubArticle()

+    # Swap the stubs in for the names looked up in src.crawler.crawler.
+    monkeypatch.setattr("src.crawler.crawler.JinaClient", StubJinaClient)
+    monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", StubExtractor)

+    target = "http://example.com"
+    crawled = crawler_module.Crawler().crawl(target)
+    # The stub article starts with url=None, so a match proves crawl() set it.
+    assert crawled.url == target
+    # to_markdown() still answers from the stub, i.e. the stub article came back.
+    assert crawled.to_markdown() == "# Dummy"
+
+
+def test_crawler_calls_dependencies(monkeypatch):
+    """Crawler must fetch via JinaClient.crawl and parse via extract_article."""
+    # Each spy stores the arguments it received under its own key.
+    recorded = {}

+    class StubArticle:
+        url = None

+        def to_markdown(self):
+            return "# Dummy"

+    class SpyJinaClient:
+        def crawl(self, url, return_format=None):
+            recorded["jina"] = (url, return_format)
+            return "<html>dummy</html>"

+    class SpyExtractor:
+        def extract_article(self, html):
+            recorded["extractor"] = html
+            return StubArticle()

+    monkeypatch.setattr("src.crawler.crawler.JinaClient", SpyJinaClient)
+    monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", SpyExtractor)

+    target = "http://example.com"
+    crawler = crawler_module.Crawler()
+    crawler.crawl(target)

+    # The client is called with the target URL and return_format "html".
+    assert "jina" in recorded
+    assert recorded["jina"] == (target, "html")
+    # The extractor receives exactly what the client returned.
+    assert "extractor" in recorded
+    assert recorded["extractor"] == "<html>dummy</html>"