Files
deer-flow/tests/unit/crawler/test_jina_client.py
Willem Jiang b4c09aa4b1 security: add log injection attack prevention with input sanitization (#667)
* security: add log injection attack prevention with input sanitization

- Created src/utils/log_sanitizer.py to sanitize user-controlled input before logging
- Prevents log injection attacks using newlines, tabs, carriage returns, etc.
- Escapes dangerous characters: \n, \r, \t, \0, \x1b
- Provides specialized functions for different input types:
  - sanitize_log_input: general purpose sanitization
  - sanitize_thread_id: for user-provided thread IDs
  - sanitize_user_content: for user messages (more aggressive truncation)
  - sanitize_agent_name: for agent identifiers
  - sanitize_tool_name: for tool names
  - sanitize_feedback: for user interrupt feedback
  - create_safe_log_message: template-based safe message creation

- Updated src/server/app.py to sanitize all user input in logging:
  - Thread IDs from request parameter
  - Message content from user
  - Agent names and node information
  - Tool names and feedback

- Updated src/agents/tool_interceptor.py to sanitize:
  - Tool names during execution
  - User feedback during interrupt handling
  - Tool input data

- Added 29 comprehensive unit tests covering:
  - Classic newline injection attacks
  - Carriage return injection
  - Tab and null character injection
  - HTML/ANSI escape sequence injection
  - Combined multi-character attacks
  - Truncation and length limits

Fixes potential log forgery vulnerability where malicious users could inject
fake log entries via unsanitized input containing control characters.
2025-10-27 20:57:23 +08:00

109 lines
3.1 KiB
Python

# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
from unittest.mock import Mock, patch
import pytest
from src.crawler.jina_client import JinaClient
class TestJinaClient:
    """Unit tests for ``JinaClient.crawl``.

    Each test swaps ``src.crawler.jina_client.requests.post`` for a mock and
    hands it a canned response before calling ``crawl``.
    """

    @staticmethod
    def _stub_reply(mock_post, status_code, text):
        """Make the patched ``requests.post`` return a canned response.

        Keyword arguments passed to ``Mock`` are set as attributes on it, so
        the returned object exposes ``status_code`` and ``text``.
        """
        mock_post.return_value = Mock(status_code=status_code, text=text)

    @patch("src.crawler.jina_client.requests.post")
    def test_crawl_success(self, mock_post):
        """A 200 reply with a body is returned unchanged after one POST."""
        expected_html = "<html><body>Test</body></html>"
        self._stub_reply(mock_post, 200, expected_html)
        client = JinaClient()

        fetched = client.crawl("https://example.com")

        assert fetched == expected_html
        mock_post.assert_called_once()

    @patch("src.crawler.jina_client.requests.post")
    def test_crawl_http_error(self, mock_post):
        """A 500 reply makes ``crawl`` raise ValueError mentioning the status."""
        self._stub_reply(mock_post, 500, "Internal Server Error")
        client = JinaClient()

        with pytest.raises(ValueError) as caught:
            client.crawl("https://example.com")

        # Checked after the block: statements placed after the raising call
        # inside ``with`` would never execute.
        assert "status 500" in str(caught.value)

    @patch("src.crawler.jina_client.requests.post")
    def test_crawl_empty_response(self, mock_post):
        """A 200 reply with an empty body makes ``crawl`` raise ValueError."""
        self._stub_reply(mock_post, 200, "")
        client = JinaClient()

        with pytest.raises(ValueError) as caught:
            client.crawl("https://example.com")

        assert "empty response" in str(caught.value)

    @patch("src.crawler.jina_client.requests.post")
    def test_crawl_whitespace_only_response(self, mock_post):
        """A 200 reply holding only whitespace is reported as an empty response."""
        self._stub_reply(mock_post, 200, " \n \t ")
        client = JinaClient()

        with pytest.raises(ValueError) as caught:
            client.crawl("https://example.com")

        assert "empty response" in str(caught.value)

    @patch("src.crawler.jina_client.requests.post")
    def test_crawl_not_found(self, mock_post):
        """A 404 reply makes ``crawl`` raise ValueError mentioning the status."""
        self._stub_reply(mock_post, 404, "Not Found")
        client = JinaClient()

        with pytest.raises(ValueError) as caught:
            client.crawl("https://example.com")

        assert "status 404" in str(caught.value)

    @patch.dict("os.environ", {}, clear=True)
    @patch("src.crawler.jina_client.requests.post")
    def test_crawl_without_api_key_logs_warning(self, mock_post):
        """With ``os.environ`` emptied, ``crawl`` still returns the body.

        The environment is cleared for the duration of the test, presumably
        so that no API key can be picked up from it.
        """
        # NOTE(review): despite the test name, nothing below asserts that a
        # warning is logged; only the returned body is checked.
        expected_html = "<html>Test</html>"
        self._stub_reply(mock_post, 200, expected_html)
        client = JinaClient()

        fetched = client.crawl("https://example.com")

        assert fetched == expected_html