mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-27 15:54:48 +08:00
test: add unit tests of crawler (#292)
* test: add unit tests of crawler * test: polish the code of crawler unit tests
This commit is contained in:
@@ -3,8 +3,7 @@
|
|||||||
|
|
||||||
from .article import Article
|
from .article import Article
|
||||||
from .crawler import Crawler
|
from .crawler import Crawler
|
||||||
|
from .jina_client import JinaClient
|
||||||
|
from .readability_extractor import ReadabilityExtractor
|
||||||
|
|
||||||
__all__ = [
|
__all__ = ["Article", "Crawler", "JinaClient", "ReadabilityExtractor"]
|
||||||
"Article",
|
|
||||||
"Crawler",
|
|
||||||
]
|
|
||||||
|
|||||||
@@ -26,13 +26,3 @@ class Crawler:
|
|||||||
article = extractor.extract_article(html)
|
article = extractor.extract_article(html)
|
||||||
article.url = url
|
article.url = url
|
||||||
return article
|
return article
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
if len(sys.argv) == 2:
|
|
||||||
url = sys.argv[1]
|
|
||||||
else:
|
|
||||||
url = "https://fintel.io/zh-hant/s/br/nvdc34"
|
|
||||||
crawler = Crawler()
|
|
||||||
article = crawler.crawl(url)
|
|
||||||
print(article.to_markdown())
|
|
||||||
|
|||||||
74
tests/unit/crawler/test_article.py
Normal file
74
tests/unit/crawler/test_article.py
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||||
|
# SPDX-License-Identifier: MIT
|
||||||
|
import pytest
|
||||||
|
from src.crawler.article import Article
|
||||||
|
|
||||||
|
|
||||||
|
class DummyMarkdownify:
    """Pass-through stand-in for markdownify, available for patching in tests."""

    @staticmethod
    def markdownify(html):
        """Return ``html`` exactly as received, performing no conversion."""
        return html
|
||||||
|
|
||||||
|
|
||||||
|
def test_to_markdown_includes_title(monkeypatch):
    """With ``including_title=True`` the markdown opens with the ``# Test Title`` heading."""
    markdown = Article("Test Title", "<p>Hello <b>world</b>!</p>").to_markdown(
        including_title=True
    )

    # The heading must come first, and the body text must still be present.
    assert markdown.startswith("# Test Title")
    assert "Hello" in markdown
|
||||||
|
|
||||||
|
|
||||||
|
def test_to_markdown_excludes_title():
    """With ``including_title=False`` the markdown must not open with the title heading."""
    markdown = Article("Test Title", "<p>Hello <b>world</b>!</p>").to_markdown(
        including_title=False
    )

    # No leading heading, but the body text is still rendered.
    assert not markdown.startswith("# Test Title")
    assert "Hello" in markdown
|
||||||
|
|
||||||
|
|
||||||
|
def test_to_message_with_text_only():
    """An image-free article yields a list of typed parts that includes a text part."""
    article = Article("Test Title", "<p>Hello world!</p>")
    article.url = "https://example.com/"

    message = article.to_message()

    assert isinstance(message, list)
    # At least one part is textual ...
    assert "text" in (part["type"] for part in message)
    # ... and every part declares its type.
    assert all("type" in part for part in message)
|
||||||
|
|
||||||
|
|
||||||
|
def test_to_message_with_image(monkeypatch):
    """A relative ``<img>`` source is reported as an absolute URL based on ``article.url``."""
    article = Article("Title", '<p>Intro</p><img src="img/pic.png"/>')
    article.url = "https://host.com/path/"

    # NOTE(review): per the original author's note, markdownify is expected to
    # turn <img> into markdown image syntax before the message is built.
    message = article.to_message()

    # Both kinds of parts must be present in the message.
    kinds = [part["type"] for part in message]
    assert "image_url" in kinds
    assert "text" in kinds

    # The relative src must have been joined onto the article URL.
    images = [part for part in message if part["type"] == "image_url"]
    assert images
    assert images[0]["image_url"]["url"] == "https://host.com/path/img/pic.png"
|
||||||
|
|
||||||
|
|
||||||
|
def test_to_message_multiple_images():
    """Every image gets an absolute URL and the text around the images is kept."""
    article = Article(
        "Title", '<p>Start</p><img src="a.png"/><p>Mid</p><img src="b.jpg"/>End'
    )
    article.url = "http://x/"

    message = article.to_message()

    # Both images must appear, resolved against the article URL.
    found_urls = [
        part["image_url"]["url"] for part in message if part["type"] == "image_url"
    ]
    assert "http://x/a.png" in found_urls
    assert "http://x/b.jpg" in found_urls

    # The text before and between the images must survive in text parts.
    text_parts = [part for part in message if part["type"] == "text"]
    assert any("Start" in part["text"] for part in text_parts)
    assert any("Mid" in part["text"] for part in text_parts)
|
||||||
|
|
||||||
|
|
||||||
|
def test_to_message_handles_empty_html():
    """An article with empty HTML still produces a list whose first part is text."""
    article = Article("Empty", "")
    article.url = "http://test/"

    message = article.to_message()

    assert isinstance(message, list)
    first_part = message[0]
    assert first_part["type"] == "text"
|
||||||
72
tests/unit/crawler/test_crawler_class.py
Normal file
72
tests/unit/crawler/test_crawler_class.py
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||||
|
# SPDX-License-Identifier: MIT
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import src.crawler as crawler_module
|
||||||
|
from src.crawler import Crawler
|
||||||
|
|
||||||
|
|
||||||
|
def test_crawler_sets_article_url(monkeypatch):
    """The article returned by ``Crawler.crawl`` carries the crawled URL in ``url``."""

    class StubArticle:
        def __init__(self):
            self.url = None

        def to_markdown(self):
            return "# Dummy"

    class StubJinaClient:
        def crawl(self, url, return_format=None):
            return "<html>dummy</html>"

    class StubReadabilityExtractor:
        def extract_article(self, html):
            return StubArticle()

    # Swap the crawler's collaborators for the stubs above, so that neither
    # the real client nor the real extractor is used.
    replacements = (
        ("src.crawler.crawler.JinaClient", StubJinaClient),
        ("src.crawler.crawler.ReadabilityExtractor", StubReadabilityExtractor),
    )
    for target, stub in replacements:
        monkeypatch.setattr(target, stub)

    url = "http://example.com"
    article = crawler_module.Crawler().crawl(url)

    assert article.url == url
    assert article.to_markdown() == "# Dummy"
|
||||||
|
|
||||||
|
|
||||||
|
def test_crawler_calls_dependencies(monkeypatch):
    """``Crawler.crawl`` fetches via ``JinaClient.crawl`` and hands the HTML to the extractor."""
    calls = {}

    class StubArticle:
        url = None

        def to_markdown(self):
            return "# Dummy"

    class StubJinaClient:
        def crawl(self, url, return_format=None):
            # Record the arguments the crawler passed in.
            calls["jina"] = (url, return_format)
            return "<html>dummy</html>"

    class StubReadabilityExtractor:
        def extract_article(self, html):
            # Record the HTML the crawler forwarded.
            calls["extractor"] = html
            return StubArticle()

    # Swap the crawler's collaborators for the recording stubs above.
    replacements = (
        ("src.crawler.crawler.JinaClient", StubJinaClient),
        ("src.crawler.crawler.ReadabilityExtractor", StubReadabilityExtractor),
    )
    for target, stub in replacements:
        monkeypatch.setattr(target, stub)

    url = "http://example.com"
    crawler_module.Crawler().crawl(url)

    # The client must have been asked for this URL in "html" format.
    assert "jina" in calls
    requested_url, requested_format = calls["jina"][0], calls["jina"][1]
    assert requested_url == url
    assert requested_format == "html"

    # The extractor must have received exactly what the client returned.
    assert "extractor" in calls
    assert calls["extractor"] == "<html>dummy</html>"
|
||||||
Reference in New Issue
Block a user