test: add unit tests of crawler (#292)

* test: add unit tests of crawler

* test: polish the code of crawler unit tests
This commit is contained in:
Willem Jiang
2025-06-07 21:51:05 +08:00
committed by GitHub
parent 0e22c373af
commit c6ed423021
4 changed files with 149 additions and 14 deletions

View File

@@ -3,8 +3,7 @@
from .article import Article
from .crawler import Crawler
from .jina_client import JinaClient
from .readability_extractor import ReadabilityExtractor
# Earlier, shorter export list; the assignment further down rebinds __all__,
# so this value does not survive module import.
__all__ = [
"Article",
"Crawler",
]
# Effective export list: every name imported above is exposed through
# `from <package> import *`.
__all__ = ["Article", "Crawler", "JinaClient", "ReadabilityExtractor"]

View File

@@ -26,13 +26,3 @@ class Crawler:
article = extractor.extract_article(html)
article.url = url
return article
if __name__ == "__main__":
    # Crawl the URL given as the sole command-line argument; with any other
    # argument count fall back to the built-in sample page.
    target_url = (
        sys.argv[1]
        if len(sys.argv) == 2
        else "https://fintel.io/zh-hant/s/br/nvdc34"
    )
    # Print the crawled article rendered as markdown.
    print(Crawler().crawl(target_url).to_markdown())

View File

@@ -0,0 +1,74 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
import pytest
from src.crawler.article import Article
class DummyMarkdownify:
    """A dummy markdownify replacement for patching if needed.

    Offers a ``markdownify`` callable that performs no conversion at all.
    """

    @staticmethod
    def markdownify(html: str) -> str:
        """Return ``html`` unchanged.

        Args:
            html: The markup that would normally be converted.

        Returns:
            The very same value that was passed in.
        """
        return html
def test_to_markdown_includes_title(monkeypatch):
    """With ``including_title=True`` the markdown opens with the title heading."""
    html_body = "<p>Hello <b>world</b>!</p>"
    markdown = Article("Test Title", html_body).to_markdown(including_title=True)

    assert markdown.startswith("# Test Title")
    assert "Hello" in markdown
def test_to_markdown_excludes_title():
    """With ``including_title=False`` the output does not open with the title heading."""
    html_body = "<p>Hello <b>world</b>!</p>"
    markdown = Article("Test Title", html_body).to_markdown(including_title=False)

    opens_with_heading = markdown.startswith("# Test Title")
    assert not opens_with_heading
    assert "Hello" in markdown
def test_to_message_with_text_only():
    """An image-free article yields a list of typed parts with at least one text part."""
    article = Article("Test Title", "<p>Hello world!</p>")
    article.url = "https://example.com/"

    parts = article.to_message()

    assert isinstance(parts, list)
    # At least one part is a text part.
    assert "text" in (part["type"] for part in parts)
    # No part is missing its "type" key.
    assert not any("type" not in part for part in parts)
def test_to_message_with_image(monkeypatch):
    """A relative ``<img>`` source becomes an ``image_url`` part joined onto ``article.url``."""
    article = Article("Title", '<p>Intro</p><img src="img/pic.png"/>')
    article.url = "https://host.com/path/"

    parts = article.to_message()

    # Both kinds of part must be present in the message.
    kinds = [part["type"] for part in parts]
    assert "image_url" in kinds
    assert "text" in kinds

    # The first image part must carry the URL joined with the article URL.
    images = [part for part in parts if part["type"] == "image_url"]
    assert images
    first_image = images[0]
    assert first_image["image_url"]["url"] == "https://host.com/path/img/pic.png"
def test_to_message_multiple_images():
    """Each ``<img>`` yields its own ``image_url`` part and the text around them is kept."""
    article = Article(
        "Title", '<p>Start</p><img src="a.png"/><p>Mid</p><img src="b.jpg"/>End'
    )
    article.url = "http://x/"

    parts = article.to_message()

    # Gather the URL of every image part, in message order.
    image_urls = []
    for part in parts:
        if part["type"] == "image_url":
            image_urls.append(part["image_url"]["url"])
    for expected_url in ("http://x/a.png", "http://x/b.jpg"):
        assert expected_url in image_urls

    # The text before and between the images must appear in some text part.
    text_parts = [part for part in parts if part["type"] == "text"]
    for needle in ("Start", "Mid"):
        assert any(needle in part["text"] for part in text_parts)
def test_to_message_handles_empty_html():
    """An article with an empty HTML body still yields a list led by a text part."""
    article = Article("Empty", "")
    article.url = "http://test/"

    result = article.to_message()

    assert isinstance(result, list)
    # Guard the index below: an empty list should fail as a readable
    # assertion instead of erroring out with an IndexError.
    assert result, "to_message() returned an empty list for empty HTML"
    assert result[0]["type"] == "text"

View File

@@ -0,0 +1,72 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
import pytest
import src.crawler as crawler_module
from src.crawler import Crawler
def test_crawler_sets_article_url(monkeypatch):
    """Check that ``Crawler.crawl`` stores the requested URL on the returned article."""

    class StubArticle:
        def __init__(self):
            self.url = None

        def to_markdown(self):
            return "# Dummy"

    class StubJinaClient:
        def crawl(self, url, return_format=None):
            return "<html>dummy</html>"

    class StubExtractor:
        def extract_article(self, html):
            return StubArticle()

    # Swap both collaborators that the crawler module looks up by name.
    replacements = (
        ("src.crawler.crawler.JinaClient", StubJinaClient),
        ("src.crawler.crawler.ReadabilityExtractor", StubExtractor),
    )
    for target, stub in replacements:
        monkeypatch.setattr(target, stub)

    requested_url = "http://example.com"
    article = crawler_module.Crawler().crawl(requested_url)

    assert article.url == requested_url
    assert article.to_markdown() == "# Dummy"
def test_crawler_calls_dependencies(monkeypatch):
    """Check that ``Crawler.crawl`` drives both collaborators.

    ``JinaClient.crawl`` must receive the URL with ``return_format`` equal to
    ``"html"``, and ``ReadabilityExtractor.extract_article`` must receive the
    HTML that the client returned.
    """
    recorded = {}

    class StubArticle:
        url = None

        def to_markdown(self):
            return "# Dummy"

    class StubJinaClient:
        def crawl(self, url, return_format=None):
            # Remember how the crawler invoked the client.
            recorded["jina"] = (url, return_format)
            return "<html>dummy</html>"

    class StubExtractor:
        def extract_article(self, html):
            # Remember the HTML handed over by the crawler.
            recorded["extractor"] = html
            return StubArticle()

    replacements = (
        ("src.crawler.crawler.JinaClient", StubJinaClient),
        ("src.crawler.crawler.ReadabilityExtractor", StubExtractor),
    )
    for target, stub in replacements:
        monkeypatch.setattr(target, stub)

    requested_url = "http://example.com"
    crawler_module.Crawler().crawl(requested_url)

    assert "jina" in recorded
    seen_url, seen_format = recorded["jina"]
    assert seen_url == requested_url
    assert seen_format == "html"
    assert "extractor" in recorded
    assert recorded["extractor"] == "<html>dummy</html>"