mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-03 06:12:14 +08:00
* fix: the crawling error when encountering PDF URLs * Added the unit test for the new feature of crawl tool * fix: address the code review problems * fix: address the code review problems
301 lines
10 KiB
Python
301 lines
10 KiB
Python
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
|
# SPDX-License-Identifier: MIT
|
|
|
|
import src.crawler as crawler_module
|
|
from src.crawler.crawler import safe_truncate
|
|
|
|
|
|
def test_crawler_sets_article_url(monkeypatch):
|
|
"""Test that the crawler sets the article.url field correctly."""
|
|
|
|
class DummyArticle:
|
|
def __init__(self):
|
|
self.url = None
|
|
|
|
def to_markdown(self):
|
|
return "# Dummy"
|
|
|
|
class DummyJinaClient:
|
|
def crawl(self, url, return_format=None):
|
|
return "<html>dummy</html>"
|
|
|
|
class DummyReadabilityExtractor:
|
|
def extract_article(self, html):
|
|
return DummyArticle()
|
|
|
|
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
|
|
monkeypatch.setattr(
|
|
"src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
|
|
)
|
|
|
|
crawler = crawler_module.Crawler()
|
|
url = "http://example.com"
|
|
article = crawler.crawl(url)
|
|
assert article.url == url
|
|
assert article.to_markdown() == "# Dummy"
|
|
|
|
|
|
def test_crawler_calls_dependencies(monkeypatch):
|
|
"""Test that Crawler calls JinaClient.crawl and ReadabilityExtractor.extract_article."""
|
|
calls = {}
|
|
|
|
class DummyJinaClient:
|
|
def crawl(self, url, return_format=None):
|
|
calls["jina"] = (url, return_format)
|
|
return "<html>dummy</html>"
|
|
|
|
class DummyReadabilityExtractor:
|
|
def extract_article(self, html):
|
|
calls["extractor"] = html
|
|
|
|
class DummyArticle:
|
|
url = None
|
|
|
|
def to_markdown(self):
|
|
return "# Dummy"
|
|
|
|
return DummyArticle()
|
|
|
|
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
|
|
monkeypatch.setattr(
|
|
"src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
|
|
)
|
|
|
|
crawler = crawler_module.Crawler()
|
|
url = "http://example.com"
|
|
crawler.crawl(url)
|
|
assert "jina" in calls
|
|
assert calls["jina"][0] == url
|
|
assert calls["jina"][1] == "html"
|
|
assert "extractor" in calls
|
|
assert calls["extractor"] == "<html>dummy</html>"
|
|
|
|
|
|
def test_crawler_handles_empty_content(monkeypatch):
|
|
"""Test that the crawler handles empty content gracefully."""
|
|
|
|
class DummyArticle:
|
|
def __init__(self, title, html_content):
|
|
self.title = title
|
|
self.html_content = html_content
|
|
self.url = None
|
|
|
|
def to_markdown(self):
|
|
return f"# {self.title}"
|
|
|
|
class DummyJinaClient:
|
|
def crawl(self, url, return_format=None):
|
|
return "" # Empty content
|
|
|
|
class DummyReadabilityExtractor:
|
|
def extract_article(self, html):
|
|
# This should not be called for empty content
|
|
assert False, "ReadabilityExtractor should not be called for empty content"
|
|
|
|
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
|
|
monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
|
|
|
|
crawler = crawler_module.Crawler()
|
|
url = "http://example.com"
|
|
article = crawler.crawl(url)
|
|
|
|
assert article.url == url
|
|
assert article.title == "Empty Content"
|
|
assert "No content could be extracted" in article.html_content
|
|
|
|
|
|
def test_crawler_handles_non_html_content(monkeypatch):
|
|
"""Test that the crawler handles non-HTML content gracefully."""
|
|
|
|
class DummyArticle:
|
|
def __init__(self, title, html_content):
|
|
self.title = title
|
|
self.html_content = html_content
|
|
self.url = None
|
|
|
|
def to_markdown(self):
|
|
return f"# {self.title}"
|
|
|
|
class DummyJinaClient:
|
|
def crawl(self, url, return_format=None):
|
|
return "This is plain text content, not HTML"
|
|
|
|
class DummyReadabilityExtractor:
|
|
def extract_article(self, html):
|
|
# This should not be called for non-HTML content
|
|
assert False, "ReadabilityExtractor should not be called for non-HTML content"
|
|
|
|
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
|
|
monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
|
|
|
|
crawler = crawler_module.Crawler()
|
|
url = "http://example.com"
|
|
article = crawler.crawl(url)
|
|
|
|
assert article.url == url
|
|
assert article.title == "Non-HTML Content"
|
|
assert "cannot be parsed as HTML" in article.html_content
|
|
assert "plain text content" in article.html_content # Should include a snippet of the original content
|
|
|
|
|
|
def test_crawler_handles_extraction_failure(monkeypatch):
|
|
"""Test that the crawler handles readability extraction failure gracefully."""
|
|
|
|
class DummyArticle:
|
|
def __init__(self, title, html_content):
|
|
self.title = title
|
|
self.html_content = html_content
|
|
self.url = None
|
|
|
|
def to_markdown(self):
|
|
return f"# {self.title}"
|
|
|
|
class DummyJinaClient:
|
|
def crawl(self, url, return_format=None):
|
|
return "<html><body>Valid HTML but extraction will fail</body></html>"
|
|
|
|
class DummyReadabilityExtractor:
|
|
def extract_article(self, html):
|
|
raise Exception("Extraction failed")
|
|
|
|
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
|
|
monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
|
|
|
|
crawler = crawler_module.Crawler()
|
|
url = "http://example.com"
|
|
article = crawler.crawl(url)
|
|
|
|
assert article.url == url
|
|
assert article.title == "Content Extraction Failed"
|
|
assert "Content extraction failed" in article.html_content
|
|
assert "Valid HTML but extraction will fail" in article.html_content # Should include a snippet of the HTML
|
|
|
|
|
|
def test_crawler_with_json_like_content(monkeypatch):
|
|
"""Test that the crawler handles JSON-like content gracefully."""
|
|
|
|
class DummyArticle:
|
|
def __init__(self, title, html_content):
|
|
self.title = title
|
|
self.html_content = html_content
|
|
self.url = None
|
|
|
|
def to_markdown(self):
|
|
return f"# {self.title}"
|
|
|
|
class DummyJinaClient:
|
|
def crawl(self, url, return_format=None):
|
|
return '{"title": "Some JSON", "content": "This is JSON content"}'
|
|
|
|
class DummyReadabilityExtractor:
|
|
def extract_article(self, html):
|
|
# This should not be called for JSON content
|
|
assert False, "ReadabilityExtractor should not be called for JSON content"
|
|
|
|
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
|
|
monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
|
|
|
|
crawler = crawler_module.Crawler()
|
|
url = "http://example.com/api/data"
|
|
article = crawler.crawl(url)
|
|
|
|
assert article.url == url
|
|
assert article.title == "Non-HTML Content"
|
|
assert "cannot be parsed as HTML" in article.html_content
|
|
assert '{"title": "Some JSON"' in article.html_content # Should include a snippet of the JSON
|
|
|
|
|
|
def test_crawler_with_various_html_formats(monkeypatch):
|
|
"""Test that the crawler correctly identifies various HTML formats."""
|
|
|
|
class DummyArticle:
|
|
def __init__(self, title, html_content):
|
|
self.title = title
|
|
self.html_content = html_content
|
|
self.url = None
|
|
|
|
def to_markdown(self):
|
|
return f"# {self.title}"
|
|
|
|
# Test case 1: HTML with DOCTYPE
|
|
class DummyJinaClient1:
|
|
def crawl(self, url, return_format=None):
|
|
return "<!DOCTYPE html><html><body><p>Test content</p></body></html>"
|
|
|
|
# Test case 2: HTML with leading whitespace
|
|
class DummyJinaClient2:
|
|
def crawl(self, url, return_format=None):
|
|
return "\n\n <html><body><p>Test content</p></body></html>"
|
|
|
|
# Test case 3: HTML with comments
|
|
class DummyJinaClient3:
|
|
def crawl(self, url, return_format=None):
|
|
return "<!-- HTML comment --><html><body><p>Test content</p></body></html>"
|
|
|
|
# Test case 4: HTML with self-closing tags
|
|
class DummyJinaClient4:
|
|
def crawl(self, url, return_format=None):
|
|
return '<img src="test.jpg" alt="test" /><p>Test content</p>'
|
|
|
|
class DummyReadabilityExtractor:
|
|
def extract_article(self, html):
|
|
return DummyArticle("Extracted Article", "<p>Extracted content</p>")
|
|
|
|
# Test each HTML format
|
|
test_cases = [
|
|
(DummyJinaClient1, "HTML with DOCTYPE"),
|
|
(DummyJinaClient2, "HTML with leading whitespace"),
|
|
(DummyJinaClient3, "HTML with comments"),
|
|
(DummyJinaClient4, "HTML with self-closing tags"),
|
|
]
|
|
|
|
for JinaClientClass, description in test_cases:
|
|
monkeypatch.setattr("src.crawler.crawler.JinaClient", JinaClientClass)
|
|
monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
|
|
|
|
crawler = crawler_module.Crawler()
|
|
url = "http://example.com"
|
|
article = crawler.crawl(url)
|
|
|
|
assert article.url == url
|
|
assert article.title == "Extracted Article"
|
|
assert "Extracted content" in article.html_content
|
|
|
|
|
|
def test_safe_truncate_function():
|
|
"""Test the safe_truncate function handles various character sets correctly."""
|
|
|
|
# Test None input
|
|
assert safe_truncate(None) is None
|
|
|
|
# Test empty string
|
|
assert safe_truncate("") == ""
|
|
|
|
# Test string shorter than limit
|
|
assert safe_truncate("Short text") == "Short text"
|
|
|
|
# Test ASCII truncation
|
|
result = safe_truncate("This is a longer text that needs truncation", 20)
|
|
assert len(result) <= 20
|
|
assert "..." in result
|
|
|
|
# Test Unicode/emoji characters
|
|
text_with_emoji = "Hello! 🌍 Welcome to the world 🚀"
|
|
result = safe_truncate(text_with_emoji, 20)
|
|
assert len(result) <= 20
|
|
assert "..." in result
|
|
# Verify it's valid UTF-8
|
|
assert result.encode('utf-8').decode('utf-8') == result
|
|
|
|
# Test very small limit
|
|
assert safe_truncate("Long text", 1) == "."
|
|
assert safe_truncate("Long text", 2) == ".."
|
|
assert safe_truncate("Long text", 3) == "..."
|
|
|
|
# Test with Chinese characters
|
|
chinese_text = "这是一个中文测试文本"
|
|
result = safe_truncate(chinese_text, 10)
|
|
assert len(result) <= 10
|
|
# Verify it's valid UTF-8
|
|
assert result.encode('utf-8').decode('utf-8') == result
|