# Files
# deer-flow/tests/unit/crawler/test_readability_extractor.py
#
# 105 lines
# 3.5 KiB
# Python
# Raw Normal View History

# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
from unittest.mock import patch
# NOTE(review): the text below appears to be a commit message captured from a
# blame/history view rather than part of this test module.
#
# security: add log injection attack prevention with input sanitization (#667)
#
# * security: add log injection attack prevention with input sanitization
#
# - Created src/utils/log_sanitizer.py to sanitize user-controlled input
#   before logging
# - Prevents log injection attacks using newlines, tabs, carriage returns, etc.
# - Escapes dangerous characters: \n, \r, \t, \0, \x1b
# - Provides specialized functions for different input types:
#   - sanitize_log_input: general purpose sanitization
#   - sanitize_thread_id: for user-provided thread IDs
#   - sanitize_user_content: for user messages (more aggressive truncation)
#   - sanitize_agent_name: for agent identifiers
#   - sanitize_tool_name: for tool names
#   - sanitize_feedback: for user interrupt feedback
#   - create_safe_log_message: template-based safe message creation
# - Updated src/server/app.py to sanitize all user input in logging:
#   - Thread IDs from request parameter
#   - Message content from user
#   - Agent names and node information
#   - Tool names and feedback
# - Updated src/agents/tool_interceptor.py to sanitize:
#   - Tool names during execution
#   - User feedback during interrupt handling
#   - Tool input data
# - Added 29 comprehensive unit tests covering:
#   - Classic newline injection attacks
#   - Carriage return injection
#   - Tab and null character injection
#   - HTML/ANSI escape sequence injection
#   - Combined multi-character attacks
#   - Truncation and length limits
#
# Fixes potential log forgery vulnerability where malicious users could inject
# fake log entries via unsanitized input containing control characters.
#
# 2025-10-27 20:57:23 +08:00
from src.crawler.readability_extractor import ReadabilityExtractor
class TestReadabilityExtractor:
    """Unit tests for ``ReadabilityExtractor.extract_article``.

    Every test patches ``simple_json_from_html_string`` at the name the
    extractor module looks it up under, so the parsed ``title``/``content``
    pair seen by the extractor is fully controlled by the test. The cases
    pin the values expected on the returned article when the parsed title or
    content is present, ``None``, empty, or whitespace-only.
    """

    # Dotted path of the parser function replaced by a mock in every test.
    _PARSER = "src.crawler.readability_extractor.simple_json_from_html_string"
    # Document handed to the extractor in every test.
    _HTML = "<html>test</html>"
    # Values the tests expect in place of a missing title / missing content.
    _FALLBACK_TITLE = "Untitled"
    _FALLBACK_CONTENT = "<p>No content could be extracted from this page</p>"

    def _extract(self, mock_simple_json, parsed):
        """Run ``extract_article`` with the mocked parser returning *parsed*.

        Args:
            mock_simple_json: The mock injected by ``@patch`` for the parser.
            parsed: Dict the mocked parser should return (``title`` and
                ``content`` keys).

        Returns:
            The article object produced by ``ReadabilityExtractor``.
        """
        mock_simple_json.return_value = parsed
        extractor = ReadabilityExtractor()
        return extractor.extract_article(self._HTML)

    @patch(_PARSER)
    def test_extract_article_with_valid_content(self, mock_simple_json):
        """Title and content from the parser are passed through unchanged."""
        article = self._extract(
            mock_simple_json,
            {
                "title": "Test Article",
                "content": "<p>Article content</p>",
            },
        )

        assert article.title == "Test Article"
        assert article.html_content == "<p>Article content</p>"

    @patch(_PARSER)
    def test_extract_article_with_none_content(self, mock_simple_json):
        """A ``None`` content is replaced by the fallback content markup."""
        article = self._extract(
            mock_simple_json,
            {
                "title": "Test Article",
                "content": None,
            },
        )

        assert article.title == "Test Article"
        assert article.html_content == self._FALLBACK_CONTENT

    @patch(_PARSER)
    def test_extract_article_with_empty_content(self, mock_simple_json):
        """An empty-string content is replaced by the fallback content markup."""
        article = self._extract(
            mock_simple_json,
            {
                "title": "Test Article",
                "content": "",
            },
        )

        assert article.title == "Test Article"
        assert article.html_content == self._FALLBACK_CONTENT

    @patch(_PARSER)
    def test_extract_article_with_whitespace_only_content(self, mock_simple_json):
        """Whitespace-only content is replaced by the fallback content markup."""
        article = self._extract(
            mock_simple_json,
            {
                "title": "Test Article",
                "content": " \n \t ",
            },
        )

        assert article.title == "Test Article"
        assert article.html_content == self._FALLBACK_CONTENT

    @patch(_PARSER)
    def test_extract_article_with_none_title(self, mock_simple_json):
        """A ``None`` title is replaced by the fallback title."""
        article = self._extract(
            mock_simple_json,
            {
                "title": None,
                "content": "<p>Article content</p>",
            },
        )

        assert article.title == self._FALLBACK_TITLE
        assert article.html_content == "<p>Article content</p>"

    @patch(_PARSER)
    def test_extract_article_with_empty_title(self, mock_simple_json):
        """An empty-string title is replaced by the fallback title."""
        article = self._extract(
            mock_simple_json,
            {
                "title": "",
                "content": "<p>Article content</p>",
            },
        )

        assert article.title == self._FALLBACK_TITLE
        assert article.html_content == "<p>Article content</p>"