Files
deer-flow/tests/unit/crawler/test_jina_client.py

126 lines
3.5 KiB
Python
Raw Normal View History

# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
security: add log injection attack prevention with input sanitization (#667) * security: add log injection attack prevention with input sanitization - Created src/utils/log_sanitizer.py to sanitize user-controlled input before logging - Prevents log injection attacks using newlines, tabs, carriage returns, etc. - Escapes dangerous characters: \n, \r, \t, \0, \x1b - Provides specialized functions for different input types: - sanitize_log_input: general purpose sanitization - sanitize_thread_id: for user-provided thread IDs - sanitize_user_content: for user messages (more aggressive truncation) - sanitize_agent_name: for agent identifiers - sanitize_tool_name: for tool names - sanitize_feedback: for user interrupt feedback - create_safe_log_message: template-based safe message creation - Updated src/server/app.py to sanitize all user input in logging: - Thread IDs from request parameter - Message content from user - Agent names and node information - Tool names and feedback - Updated src/agents/tool_interceptor.py to sanitize: - Tool names during execution - User feedback during interrupt handling - Tool input data - Added 29 comprehensive unit tests covering: - Classic newline injection attacks - Carriage return injection - Tab and null character injection - HTML/ANSI escape sequence injection - Combined multi-character attacks - Truncation and length limits Fixes potential log forgery vulnerability where malicious users could inject fake log entries via unsanitized input containing control characters.
2025-10-27 20:57:23 +08:00
from unittest.mock import Mock, patch
import pytest
security: add log injection attack prevention with input sanitization (#667) * security: add log injection attack prevention with input sanitization - Created src/utils/log_sanitizer.py to sanitize user-controlled input before logging - Prevents log injection attacks using newlines, tabs, carriage returns, etc. - Escapes dangerous characters: \n, \r, \t, \0, \x1b - Provides specialized functions for different input types: - sanitize_log_input: general purpose sanitization - sanitize_thread_id: for user-provided thread IDs - sanitize_user_content: for user messages (more aggressive truncation) - sanitize_agent_name: for agent identifiers - sanitize_tool_name: for tool names - sanitize_feedback: for user interrupt feedback - create_safe_log_message: template-based safe message creation - Updated src/server/app.py to sanitize all user input in logging: - Thread IDs from request parameter - Message content from user - Agent names and node information - Tool names and feedback - Updated src/agents/tool_interceptor.py to sanitize: - Tool names during execution - User feedback during interrupt handling - Tool input data - Added 29 comprehensive unit tests covering: - Classic newline injection attacks - Carriage return injection - Tab and null character injection - HTML/ANSI escape sequence injection - Combined multi-character attacks - Truncation and length limits Fixes potential log forgery vulnerability where malicious users could inject fake log entries via unsanitized input containing control characters.
2025-10-27 20:57:23 +08:00
from src.crawler.jina_client import JinaClient
class TestJinaClient:
    """Unit tests for ``JinaClient.crawl`` with ``requests.post`` mocked out.

    Every test patches ``src.crawler.jina_client.requests.post`` so no real
    HTTP request is issued; the mock either returns a stubbed response
    object or raises.
    """

    @patch("src.crawler.jina_client.requests.post")
    def test_crawl_success(self, mock_post):
        """A 200 reply with a non-empty body is returned unchanged."""
        # Arrange: the stubbed POST yields a successful HTML page.
        mock_post.return_value = Mock(
            status_code=200,
            text="<html><body>Test</body></html>",
        )

        # Act
        page = JinaClient().crawl("https://example.com")

        # Assert: body is passed through and exactly one POST was made.
        assert page == "<html><body>Test</body></html>"
        mock_post.assert_called_once()

    @patch("src.crawler.jina_client.requests.post")
    def test_crawl_http_error(self, mock_post):
        """A 500 reply yields an ``Error:`` string mentioning the status."""
        # Arrange
        mock_post.return_value = Mock(
            status_code=500,
            text="Internal Server Error",
        )

        # Act
        outcome = JinaClient().crawl("https://example.com")

        # Assert
        assert outcome.startswith("Error:")
        assert "status 500" in outcome

    @patch("src.crawler.jina_client.requests.post")
    def test_crawl_empty_response(self, mock_post):
        """A 200 reply with an empty body is reported as an error."""
        # Arrange
        mock_post.return_value = Mock(status_code=200, text="")

        # Act
        outcome = JinaClient().crawl("https://example.com")

        # Assert
        assert outcome.startswith("Error:")
        assert "empty response" in outcome

    @patch("src.crawler.jina_client.requests.post")
    def test_crawl_whitespace_only_response(self, mock_post):
        """A 200 reply whose body is only whitespace counts as empty."""
        # Arrange: spaces, a newline and a tab, but no visible content.
        mock_post.return_value = Mock(status_code=200, text=" \n \t ")

        # Act
        outcome = JinaClient().crawl("https://example.com")

        # Assert
        assert outcome.startswith("Error:")
        assert "empty response" in outcome

    @patch("src.crawler.jina_client.requests.post")
    def test_crawl_not_found(self, mock_post):
        """A 404 reply yields an ``Error:`` string mentioning the status."""
        # Arrange
        mock_post.return_value = Mock(status_code=404, text="Not Found")

        # Act
        outcome = JinaClient().crawl("https://example.com")

        # Assert
        assert outcome.startswith("Error:")
        assert "status 404" in outcome

    @patch.dict("os.environ", {}, clear=True)
    @patch("src.crawler.jina_client.requests.post")
    def test_crawl_without_api_key_logs_warning(self, mock_post):
        """Crawling still succeeds when every environment variable is cleared.

        NOTE(review): despite the test name, nothing here asserts that a
        warning was logged; only the returned body is checked.
        """
        # Arrange
        mock_post.return_value = Mock(
            status_code=200,
            text="<html>Test</html>",
        )

        # Act
        page = JinaClient().crawl("https://example.com")

        # Assert
        assert page == "<html>Test</html>"

    @patch("src.crawler.jina_client.requests.post")
    def test_crawl_exception_handling(self, mock_post):
        """An exception raised by the POST is returned as an ``Error:`` string."""
        # Arrange: make the HTTP call itself blow up.
        mock_post.side_effect = Exception("Network error")

        # Act
        outcome = JinaClient().crawl("https://example.com")

        # Assert: the failure is reported, including the original message.
        assert outcome.startswith("Error:")
        assert "Network error" in outcome