Files
deer-flow/tests/unit/crawler/test_readability_extractor.py
Willem Jiang b4c09aa4b1 security: add log injection attack prevention with input sanitization (#667)
* security: add log injection attack prevention with input sanitization

- Created src/utils/log_sanitizer.py to sanitize user-controlled input before logging
- Prevents log injection attacks using newlines, tabs, carriage returns, etc.
- Escapes dangerous characters: \n, \r, \t, \0, \x1b
- Provides specialized functions for different input types:
  - sanitize_log_input: general purpose sanitization
  - sanitize_thread_id: for user-provided thread IDs
  - sanitize_user_content: for user messages (more aggressive truncation)
  - sanitize_agent_name: for agent identifiers
  - sanitize_tool_name: for tool names
  - sanitize_feedback: for user interrupt feedback
  - create_safe_log_message: template-based safe message creation

- Updated src/server/app.py to sanitize all user input in logging:
  - Thread IDs from request parameter
  - Message content from user
  - Agent names and node information
  - Tool names and feedback

- Updated src/agents/tool_interceptor.py to sanitize:
  - Tool names during execution
  - User feedback during interrupt handling
  - Tool input data

- Added 29 comprehensive unit tests covering:
  - Classic newline injection attacks
  - Carriage return injection
  - Tab and null character injection
  - HTML/ANSI escape sequence injection
  - Combined multi-character attacks
  - Truncation and length limits

Fixes potential log forgery vulnerability where malicious users could inject
fake log entries via unsanitized input containing control characters.
2025-10-27 20:57:23 +08:00

105 lines
3.5 KiB
Python

# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
from unittest.mock import patch
from src.crawler.readability_extractor import ReadabilityExtractor
class TestReadabilityExtractor:
    """Tests for ``ReadabilityExtractor.extract_article``.

    Every test patches ``simple_json_from_html_string`` in
    ``src.crawler.readability_extractor`` and fixes its return value, so the
    assertions only cover how the extractor maps that parsed payload onto the
    returned article's ``title`` and ``html_content`` attributes.
    """

    # Input handed to ``extract_article`` in every test; the parser is mocked,
    # so the markup itself is never inspected by these tests.
    _SAMPLE_HTML = "<html>test</html>"

    # Values the tests expect when the parsed payload lacks usable data.
    _FALLBACK_CONTENT = "<p>No content could be extracted from this page</p>"
    _FALLBACK_TITLE = "Untitled"

    def _extract(self, mock_simple_json, title, content):
        """Run ``extract_article`` against a mocked parser result.

        Args:
            mock_simple_json: The mock injected by ``@patch``.
            title: Value to report under the ``"title"`` key.
            content: Value to report under the ``"content"`` key.

        Returns:
            Whatever ``ReadabilityExtractor().extract_article`` returns.
        """
        mock_simple_json.return_value = {
            "title": title,
            "content": content,
        }
        return ReadabilityExtractor().extract_article(self._SAMPLE_HTML)

    @patch("src.crawler.readability_extractor.simple_json_from_html_string")
    def test_extract_article_with_valid_content(self, mock_simple_json):
        """A payload with both title and content is expected back unchanged."""
        result = self._extract(
            mock_simple_json,
            title="Test Article",
            content="<p>Article content</p>",
        )

        assert result.title == "Test Article"
        assert result.html_content == "<p>Article content</p>"

    @patch("src.crawler.readability_extractor.simple_json_from_html_string")
    def test_extract_article_with_none_content(self, mock_simple_json):
        """``None`` content is expected to yield the fallback content."""
        result = self._extract(
            mock_simple_json,
            title="Test Article",
            content=None,
        )

        assert result.title == "Test Article"
        assert result.html_content == self._FALLBACK_CONTENT

    @patch("src.crawler.readability_extractor.simple_json_from_html_string")
    def test_extract_article_with_empty_content(self, mock_simple_json):
        """An empty content string is expected to yield the fallback content."""
        result = self._extract(
            mock_simple_json,
            title="Test Article",
            content="",
        )

        assert result.title == "Test Article"
        assert result.html_content == self._FALLBACK_CONTENT

    @patch("src.crawler.readability_extractor.simple_json_from_html_string")
    def test_extract_article_with_whitespace_only_content(self, mock_simple_json):
        """Whitespace-only content is expected to yield the fallback content."""
        result = self._extract(
            mock_simple_json,
            title="Test Article",
            content=" \n \t ",
        )

        assert result.title == "Test Article"
        assert result.html_content == self._FALLBACK_CONTENT

    @patch("src.crawler.readability_extractor.simple_json_from_html_string")
    def test_extract_article_with_none_title(self, mock_simple_json):
        """A ``None`` title is expected to yield the fallback title."""
        result = self._extract(
            mock_simple_json,
            title=None,
            content="<p>Article content</p>",
        )

        assert result.title == self._FALLBACK_TITLE
        assert result.html_content == "<p>Article content</p>"

    @patch("src.crawler.readability_extractor.simple_json_from_html_string")
    def test_extract_article_with_empty_title(self, mock_simple_json):
        """An empty title string is expected to yield the fallback title."""
        result = self._extract(
            mock_simple_json,
            title="",
            content="<p>Article content</p>",
        )

        assert result.title == self._FALLBACK_TITLE
        assert result.html_content == "<p>Article content</p>"