mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-04 06:32:13 +08:00
* security: add log injection attack prevention with input sanitization - Created src/utils/log_sanitizer.py to sanitize user-controlled input before logging - Prevents log injection attacks using newlines, tabs, carriage returns, etc. - Escapes dangerous characters: \n, \r, \t, \0, \x1b - Provides specialized functions for different input types: - sanitize_log_input: general purpose sanitization - sanitize_thread_id: for user-provided thread IDs - sanitize_user_content: for user messages (more aggressive truncation) - sanitize_agent_name: for agent identifiers - sanitize_tool_name: for tool names - sanitize_feedback: for user interrupt feedback - create_safe_log_message: template-based safe message creation - Updated src/server/app.py to sanitize all user input in logging: - Thread IDs from request parameter - Message content from user - Agent names and node information - Tool names and feedback - Updated src/agents/tool_interceptor.py to sanitize: - Tool names during execution - User feedback during interrupt handling - Tool input data - Added 29 comprehensive unit tests covering: - Classic newline injection attacks - Carriage return injection - Tab and null character injection - HTML/ANSI escape sequence injection - Combined multi-character attacks - Truncation and length limits Fixes potential log forgery vulnerability where malicious users could inject fake log entries via unsanitized input containing control characters.
105 lines
3.5 KiB
Python
105 lines
3.5 KiB
Python
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
|
# SPDX-License-Identifier: MIT
|
|
|
|
from unittest.mock import patch
|
|
|
|
from src.crawler.readability_extractor import ReadabilityExtractor
|
|
|
|
|
|
class TestReadabilityExtractor:
    """Tests for ``ReadabilityExtractor.extract_article``.

    Each test replaces ``simple_json_from_html_string`` (as referenced from
    the extractor module) with a mock returning a ``title``/``content`` dict,
    then checks the ``title`` and ``html_content`` of the resulting article.
    """

    # Dotted path handed to ``patch`` by every test in this class.
    _PARSER = "src.crawler.readability_extractor.simple_json_from_html_string"

    @staticmethod
    def _extract(parser_mock, title, content):
        """Stub the patched parser and run a fresh extractor on a fixed page.

        Args:
            parser_mock: The mock injected by ``patch``.
            title: Value placed under the ``"title"`` key of the stub result.
            content: Value placed under the ``"content"`` key of the stub result.

        Returns:
            Whatever ``extract_article`` returns for ``"<html>test</html>"``.
        """
        parser_mock.return_value = {"title": title, "content": content}
        return ReadabilityExtractor().extract_article("<html>test</html>")

    @patch(_PARSER)
    def test_extract_article_with_valid_content(self, mock_simple_json):
        """A non-empty title and content are expected back unchanged."""
        result = self._extract(
            mock_simple_json, "Test Article", "<p>Article content</p>"
        )

        assert result.title == "Test Article"
        assert result.html_content == "<p>Article content</p>"

    @patch(_PARSER)
    def test_extract_article_with_none_content(self, mock_simple_json):
        """``None`` content is expected to yield the placeholder paragraph."""
        result = self._extract(mock_simple_json, "Test Article", None)

        assert result.title == "Test Article"
        assert (
            result.html_content
            == "<p>No content could be extracted from this page</p>"
        )

    @patch(_PARSER)
    def test_extract_article_with_empty_content(self, mock_simple_json):
        """An empty content string is expected to yield the placeholder paragraph."""
        result = self._extract(mock_simple_json, "Test Article", "")

        assert result.title == "Test Article"
        assert (
            result.html_content
            == "<p>No content could be extracted from this page</p>"
        )

    @patch(_PARSER)
    def test_extract_article_with_whitespace_only_content(self, mock_simple_json):
        """Whitespace-only content is expected to yield the placeholder paragraph."""
        result = self._extract(mock_simple_json, "Test Article", " \n \t ")

        assert result.title == "Test Article"
        assert (
            result.html_content
            == "<p>No content could be extracted from this page</p>"
        )

    @patch(_PARSER)
    def test_extract_article_with_none_title(self, mock_simple_json):
        """A ``None`` title is expected to be reported as ``"Untitled"``."""
        result = self._extract(mock_simple_json, None, "<p>Article content</p>")

        assert result.title == "Untitled"
        assert result.html_content == "<p>Article content</p>"

    @patch(_PARSER)
    def test_extract_article_with_empty_title(self, mock_simple_json):
        """An empty title string is expected to be reported as ``"Untitled"``."""
        result = self._extract(mock_simple_json, "", "<p>Article content</p>")

        assert result.title == "Untitled"
        assert result.html_content == "<p>Article content</p>"
|