From c6ed423021cf3e27b1c5025f65bdd9becb7d9df6 Mon Sep 17 00:00:00 2001 From: Willem Jiang Date: Sat, 7 Jun 2025 21:51:05 +0800 Subject: [PATCH] test: add unit tests of crawler (#292) * test: add unit tests of crawler * test: polish the code of crawler unit tests --- src/crawler/__init__.py | 7 +-- src/crawler/crawler.py | 10 ---- tests/unit/crawler/test_article.py | 74 ++++++++++++++++++++++++ tests/unit/crawler/test_crawler_class.py | 72 +++++++++++++++++++++++ 4 files changed, 149 insertions(+), 14 deletions(-) create mode 100644 tests/unit/crawler/test_article.py create mode 100644 tests/unit/crawler/test_crawler_class.py diff --git a/src/crawler/__init__.py b/src/crawler/__init__.py index 4f6a6e7..0747da2 100644 --- a/src/crawler/__init__.py +++ b/src/crawler/__init__.py @@ -3,8 +3,7 @@ from .article import Article from .crawler import Crawler +from .jina_client import JinaClient +from .readability_extractor import ReadabilityExtractor -__all__ = [ - "Article", - "Crawler", -] +__all__ = ["Article", "Crawler", "JinaClient", "ReadabilityExtractor"] diff --git a/src/crawler/crawler.py b/src/crawler/crawler.py index fe7ddfc..9f3632d 100644 --- a/src/crawler/crawler.py +++ b/src/crawler/crawler.py @@ -26,13 +26,3 @@ class Crawler: article = extractor.extract_article(html) article.url = url return article - - -if __name__ == "__main__": - if len(sys.argv) == 2: - url = sys.argv[1] - else: - url = "https://fintel.io/zh-hant/s/br/nvdc34" - crawler = Crawler() - article = crawler.crawl(url) - print(article.to_markdown()) diff --git a/tests/unit/crawler/test_article.py b/tests/unit/crawler/test_article.py new file mode 100644 index 0000000..b2aa918 --- /dev/null +++ b/tests/unit/crawler/test_article.py @@ -0,0 +1,74 @@ +# Copyright (c) 2025 Bytedance Ltd. 
and/or its affiliates +# SPDX-License-Identifier: MIT +import pytest +from src.crawler.article import Article + + +class DummyMarkdownify: + """A dummy markdownify replacement for patching if needed.""" + + @staticmethod + def markdownify(html): + return html + + +def test_to_markdown_includes_title(monkeypatch): + article = Article("Test Title", "

<p>Hello world!</p>

") + result = article.to_markdown(including_title=True) + assert result.startswith("# Test Title") + assert "Hello" in result + + +def test_to_markdown_excludes_title(): + article = Article("Test Title", "

<p>Hello world!</p>

") + result = article.to_markdown(including_title=False) + assert not result.startswith("# Test Title") + assert "Hello" in result + + +def test_to_message_with_text_only(): + article = Article("Test Title", "

<p>Hello world!</p>

") + article.url = "https://example.com/" + result = article.to_message() + assert isinstance(result, list) + assert any(item["type"] == "text" for item in result) + assert all("type" in item for item in result) + + +def test_to_message_with_image(monkeypatch): + html = '

<p>Intro</p><img src="img/pic.png"/>

' + article = Article("Title", html) + article.url = "https://host.com/path/" + # The markdownify library will convert to markdown image syntax + result = article.to_message() + # Should have both text and image_url types + types = [item["type"] for item in result] + assert "image_url" in types + assert "text" in types + # Check that the image_url is correctly joined + image_items = [item for item in result if item["type"] == "image_url"] + assert image_items + assert image_items[0]["image_url"]["url"] == "https://host.com/path/img/pic.png" + + +def test_to_message_multiple_images(): + html = '

<p>Start</p><img src="a.png"/>

<p>Mid</p><img src="b.jpg"/>

End' + article = Article("Title", html) + article.url = "http://x/" + result = article.to_message() + image_urls = [ + item["image_url"]["url"] for item in result if item["type"] == "image_url" + ] + assert "http://x/a.png" in image_urls + assert "http://x/b.jpg" in image_urls + text_items = [item for item in result if item["type"] == "text"] + assert any("Start" in item["text"] for item in text_items) + assert any("Mid" in item["text"] for item in text_items) + + +def test_to_message_handles_empty_html(): + article = Article("Empty", "") + article.url = "http://test/" + result = article.to_message() + assert isinstance(result, list) + assert result[0]["type"] == "text" diff --git a/tests/unit/crawler/test_crawler_class.py b/tests/unit/crawler/test_crawler_class.py new file mode 100644 index 0000000..eba2148 --- /dev/null +++ b/tests/unit/crawler/test_crawler_class.py @@ -0,0 +1,72 @@ +# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates +# SPDX-License-Identifier: MIT + +import pytest +import src.crawler as crawler_module +from src.crawler import Crawler + + +def test_crawler_sets_article_url(monkeypatch): + """Test that the crawler sets the article.url field correctly.""" + + class DummyArticle: + def __init__(self): + self.url = None + + def to_markdown(self): + return "# Dummy" + + class DummyJinaClient: + def crawl(self, url, return_format=None): + return "dummy" + + class DummyReadabilityExtractor: + def extract_article(self, html): + return DummyArticle() + + monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient) + monkeypatch.setattr( + "src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor + ) + + crawler = crawler_module.Crawler() + url = "http://example.com" + article = crawler.crawl(url) + assert article.url == url + assert article.to_markdown() == "# Dummy" + + +def test_crawler_calls_dependencies(monkeypatch): + """Test that Crawler calls JinaClient.crawl and ReadabilityExtractor.extract_article.""" + calls = {} + + 
class DummyJinaClient: + def crawl(self, url, return_format=None): + calls["jina"] = (url, return_format) + return "dummy" + + class DummyReadabilityExtractor: + def extract_article(self, html): + calls["extractor"] = html + + class DummyArticle: + url = None + + def to_markdown(self): + return "# Dummy" + + return DummyArticle() + + monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient) + monkeypatch.setattr( + "src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor + ) + + crawler = crawler_module.Crawler() + url = "http://example.com" + crawler.crawl(url) + assert "jina" in calls + assert calls["jina"][0] == url + assert calls["jina"][1] == "html" + assert "extractor" in calls + assert calls["extractor"] == "dummy"