mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-21 13:24:44 +08:00
test: add unit tests of crawler (#292)
* test: add unit tests of crawler * test: polish the code of crawler unit tests
This commit is contained in:
72
tests/unit/crawler/test_crawler_class.py
Normal file
72
tests/unit/crawler/test_crawler_class.py
Normal file
@@ -0,0 +1,72 @@
|
||||
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
import pytest
|
||||
import src.crawler as crawler_module
|
||||
from src.crawler import Crawler
|
||||
|
||||
|
||||
def test_crawler_sets_article_url(monkeypatch):
    """Check that Crawler.crawl returns an article whose url equals the crawled URL."""

    class FakeArticle:
        """Article stand-in that starts out with no URL."""

        def __init__(self):
            self.url = None

        def to_markdown(self):
            return "# Dummy"

    class FakeJinaClient:
        """Client stand-in that always returns the same fixed HTML."""

        def crawl(self, url, return_format=None):
            return "<html>dummy</html>"

    class FakeReadabilityExtractor:
        """Extractor stand-in that hands back a fresh FakeArticle."""

        def extract_article(self, html):
            return FakeArticle()

    # Replace the names looked up in src.crawler.crawler with the stand-ins.
    monkeypatch.setattr("src.crawler.crawler.JinaClient", FakeJinaClient)
    monkeypatch.setattr(
        "src.crawler.crawler.ReadabilityExtractor", FakeReadabilityExtractor
    )

    target_url = "http://example.com"
    result = crawler_module.Crawler().crawl(target_url)

    # The stand-in article starts with url=None, so a match here means the
    # crawl call assigned it.
    assert result.url == target_url
    assert result.to_markdown() == "# Dummy"
|
||||
|
||||
|
||||
def test_crawler_calls_dependencies(monkeypatch):
    """Check that Crawler.crawl invokes JinaClient.crawl and ReadabilityExtractor.extract_article."""
    # Filled in by the stand-ins below so the assertions can inspect
    # which collaborator was called and with what arguments.
    recorded = {}

    class RecordingJinaClient:
        """Client stand-in that records its arguments and returns fixed HTML."""

        def crawl(self, url, return_format=None):
            recorded["jina"] = (url, return_format)
            return "<html>dummy</html>"

    class RecordingReadabilityExtractor:
        """Extractor stand-in that records the HTML it was given."""

        def extract_article(self, html):
            recorded["extractor"] = html

            class StubArticle:
                url = None

                def to_markdown(self):
                    return "# Dummy"

            return StubArticle()

    # Replace the names looked up in src.crawler.crawler with the stand-ins.
    monkeypatch.setattr("src.crawler.crawler.JinaClient", RecordingJinaClient)
    monkeypatch.setattr(
        "src.crawler.crawler.ReadabilityExtractor", RecordingReadabilityExtractor
    )

    target_url = "http://example.com"
    crawler_module.Crawler().crawl(target_url)

    # The client must have been called with the URL and the "html" format.
    assert "jina" in recorded
    seen_url, seen_format = recorded["jina"]
    assert seen_url == target_url
    assert seen_format == "html"

    # The extractor must have received exactly what the client returned.
    assert "extractor" in recorded
    assert recorded["extractor"] == "<html>dummy</html>"
|
||||
Reference in New Issue
Block a user