mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-09 16:54:46 +08:00
test: add unit tests of crawler (#292)
* test: add unit tests of crawler * test: polish the code of crawler unit tests
This commit is contained in:
@@ -3,8 +3,7 @@
|
||||
|
||||
# Public surface of the crawler package.
from .article import Article
from .crawler import Crawler
from .jina_client import JinaClient
from .readability_extractor import ReadabilityExtractor

# Explicit public API: all four crawler building blocks are exported.
__all__ = ["Article", "Crawler", "JinaClient", "ReadabilityExtractor"]
@@ -26,13 +26,3 @@ class Crawler:
|
||||
article = extractor.extract_article(html)
|
||||
article.url = url
|
||||
return article
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke-test entry point: crawl the URL given as the single CLI
    # argument, or fall back to a known demo page, and print it as markdown.
    if len(sys.argv) == 2:
        target_url = sys.argv[1]
    else:
        target_url = "https://fintel.io/zh-hant/s/br/nvdc34"
    fetched = Crawler().crawl(target_url)
    print(fetched.to_markdown())
tests/unit/crawler/test_article.py — new file, 74 lines
@@ -0,0 +1,74 @@
|
||||
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
# SPDX-License-Identifier: MIT
|
||||
import pytest
|
||||
from src.crawler.article import Article
|
||||
|
||||
|
||||
class DummyMarkdownify:
    """Identity stand-in for the markdownify library.

    Available for tests that want to patch out real HTML-to-markdown
    conversion; it simply echoes its input back.
    """

    @staticmethod
    def markdownify(html):
        # No conversion: return the HTML unchanged.
        return html
def test_to_markdown_includes_title():
    """to_markdown(including_title=True) prefixes the output with an H1 title.

    Note: the original declared a ``monkeypatch`` fixture it never used;
    the unused fixture has been dropped.
    """
    article = Article("Test Title", "<p>Hello <b>world</b>!</p>")
    result = article.to_markdown(including_title=True)
    # Title becomes a markdown H1 at the very start.
    assert result.startswith("# Test Title")
    # Body content survives the conversion.
    assert "Hello" in result
def test_to_markdown_excludes_title():
    """With including_title=False the markdown must not begin with an H1 title."""
    body_html = "<p>Hello <b>world</b>!</p>"
    markdown = Article("Test Title", body_html).to_markdown(including_title=False)
    # No leading H1 heading when the title is excluded.
    assert not markdown.startswith("# Test Title")
    # The body text is still present.
    assert "Hello" in markdown
def test_to_message_with_text_only():
    """An image-free article yields a list of well-formed text parts."""
    article = Article("Test Title", "<p>Hello world!</p>")
    article.url = "https://example.com/"
    message = article.to_message()
    assert isinstance(message, list)
    # At least one part is a text segment.
    assert any(part["type"] == "text" for part in message)
    # Every part carries a "type" discriminator.
    assert all("type" in part for part in message)
def test_to_message_with_image():
    """Relative <img> sources are resolved against the article's URL.

    Note: the original declared a ``monkeypatch`` fixture it never used;
    the unused fixture has been dropped.
    """
    html = '<p>Intro</p><img src="img/pic.png"/>'
    article = Article("Title", html)
    article.url = "https://host.com/path/"
    # The markdownify library will convert <img> to markdown image syntax,
    # which to_message() splits into text / image_url parts.
    result = article.to_message()
    # Should have both text and image_url types.
    types = [item["type"] for item in result]
    assert "image_url" in types
    assert "text" in types
    # Check that the relative image path is correctly joined onto the page URL.
    image_items = [item for item in result if item["type"] == "image_url"]
    assert image_items
    assert image_items[0]["image_url"]["url"] == "https://host.com/path/img/pic.png"
def test_to_message_multiple_images():
    """Each <img> becomes its own image_url part; surrounding text is kept."""
    source = '<p>Start</p><img src="a.png"/><p>Mid</p><img src="b.jpg"/>End'
    article = Article("Title", source)
    article.url = "http://x/"
    parts = article.to_message()

    # Both images are resolved against the article URL.
    resolved = [
        part["image_url"]["url"] for part in parts if part["type"] == "image_url"
    ]
    assert "http://x/a.png" in resolved
    assert "http://x/b.jpg" in resolved

    # The text between the images is preserved in text parts.
    text_parts = [part for part in parts if part["type"] == "text"]
    assert any("Start" in part["text"] for part in text_parts)
    assert any("Mid" in part["text"] for part in text_parts)
def test_to_message_handles_empty_html():
    """Empty HTML still produces a list whose first part is text."""
    empty_article = Article("Empty", "")
    empty_article.url = "http://test/"
    message = empty_article.to_message()
    assert isinstance(message, list)
    # Even with no content there is a leading text segment.
    assert message[0]["type"] == "text"
tests/unit/crawler/test_crawler_class.py — new file, 72 lines
@@ -0,0 +1,72 @@
|
||||
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
import pytest
|
||||
import src.crawler as crawler_module
|
||||
from src.crawler import Crawler
|
||||
|
||||
|
||||
def test_crawler_sets_article_url(monkeypatch):
    """Crawler.crawl() must stamp the requested URL onto the returned article."""

    class FakeArticle:
        def __init__(self):
            self.url = None

        def to_markdown(self):
            return "# Dummy"

    class FakeJinaClient:
        def crawl(self, url, return_format=None):
            return "<html>dummy</html>"

    class FakeReadabilityExtractor:
        def extract_article(self, html):
            return FakeArticle()

    # Replace the real network client and extractor with in-memory fakes.
    monkeypatch.setattr("src.crawler.crawler.JinaClient", FakeJinaClient)
    monkeypatch.setattr(
        "src.crawler.crawler.ReadabilityExtractor", FakeReadabilityExtractor
    )

    target = "http://example.com"
    article = crawler_module.Crawler().crawl(target)
    assert article.url == target
    assert article.to_markdown() == "# Dummy"
def test_crawler_calls_dependencies(monkeypatch):
    """Test that Crawler calls JinaClient.crawl and ReadabilityExtractor.extract_article."""
    recorded = {}

    class FakeJinaClient:
        def crawl(self, url, return_format=None):
            # Record how the fetcher was invoked.
            recorded["jina"] = (url, return_format)
            return "<html>dummy</html>"

    class FakeReadabilityExtractor:
        def extract_article(self, html):
            # Record the raw HTML handed to the extractor.
            recorded["extractor"] = html

            class FakeArticle:
                url = None

                def to_markdown(self):
                    return "# Dummy"

            return FakeArticle()

    monkeypatch.setattr("src.crawler.crawler.JinaClient", FakeJinaClient)
    monkeypatch.setattr(
        "src.crawler.crawler.ReadabilityExtractor", FakeReadabilityExtractor
    )

    crawler_module.Crawler().crawl("http://example.com")

    # The fetcher saw the URL and was asked for HTML output.
    assert "jina" in recorded
    assert recorded["jina"] == ("http://example.com", "html")
    # The fetched HTML was passed on to the extractor.
    assert "extractor" in recorded
    assert recorded["extractor"] == "<html>dummy</html>"
Reference in New Issue
Block a user