mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-02 22:02:13 +08:00
fix(subagents): cleanup background tasks after completion to prevent memory leak (#1030)
* fix(subagents): cleanup background tasks after completion to prevent memory leak Added cleanup_background_task() function to remove completed subagent results from the global _background_tasks dict. Found a small issue: completed tasks were never removed, causing memory to grow indefinitely with each subagent execution. Alternative approaches considered: - Future + SubagentHandle pattern: Not chosen due to requiring refactoring Chose the simple cleanup approach for minimal code changes while effectively resolving the memory leak. Changes: - Add cleanup_background_task() in executor.py - Call cleanup in all task_tool return paths (completed, failed, timed out) * fix(subagents): prevent race condition in background task cleanup Address Copilot review feedback on memory leak fix: - Add terminal state check in cleanup_background_task() to only remove tasks that are COMPLETED/FAILED/TIMED_OUT or have completed_at set - Remove cleanup call from polling safety-timeout branch in task_tool since the task may still be running - Add comprehensive tests for cleanup behavior including: - Verification that cleanup is called on terminal states - Verification that cleanup is NOT called on polling timeout - Tests for terminal state check logic in executor This prevents KeyError when the background executor tries to update a task that was prematurely removed from _background_tasks. --------- Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
This commit is contained in:
@@ -452,3 +452,40 @@ def list_background_tasks() -> list[SubagentResult]:
|
||||
"""
|
||||
with _background_tasks_lock:
|
||||
return list(_background_tasks.values())
|
||||
|
||||
|
||||
def cleanup_background_task(task_id: str) -> None:
|
||||
"""Remove a completed task from background tasks.
|
||||
|
||||
Should be called by task_tool after it finishes polling and returns the result.
|
||||
This prevents memory leaks from accumulated completed tasks.
|
||||
|
||||
Only removes tasks that are in a terminal state (COMPLETED/FAILED/TIMED_OUT)
|
||||
to avoid race conditions with the background executor still updating the task entry.
|
||||
|
||||
Args:
|
||||
task_id: The task ID to remove.
|
||||
"""
|
||||
with _background_tasks_lock:
|
||||
result = _background_tasks.get(task_id)
|
||||
if result is None:
|
||||
# Nothing to clean up; may have been removed already.
|
||||
logger.debug("Requested cleanup for unknown background task %s", task_id)
|
||||
return
|
||||
|
||||
# Only clean up tasks that are in a terminal state to avoid races with
|
||||
# the background executor still updating the task entry.
|
||||
is_terminal_status = result.status in {
|
||||
SubagentStatus.COMPLETED,
|
||||
SubagentStatus.FAILED,
|
||||
SubagentStatus.TIMED_OUT,
|
||||
}
|
||||
if is_terminal_status or result.completed_at is not None:
|
||||
del _background_tasks[task_id]
|
||||
logger.debug("Cleaned up background task: %s", task_id)
|
||||
else:
|
||||
logger.debug(
|
||||
"Skipping cleanup for non-terminal background task %s (status=%s)",
|
||||
task_id,
|
||||
result.status.value if hasattr(result.status, "value") else result.status,
|
||||
)
|
||||
|
||||
@@ -13,7 +13,7 @@ from langgraph.typing import ContextT
|
||||
from src.agents.lead_agent.prompt import get_skills_prompt_section
|
||||
from src.agents.thread_state import ThreadState
|
||||
from src.subagents import SubagentExecutor, get_subagent_config
|
||||
from src.subagents.executor import SubagentStatus, get_background_task_result
|
||||
from src.subagents.executor import SubagentStatus, cleanup_background_task, get_background_task_result
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -135,6 +135,7 @@ def task_tool(
|
||||
if result is None:
|
||||
logger.error(f"[trace={trace_id}] Task {task_id} not found in background tasks")
|
||||
writer({"type": "task_failed", "task_id": task_id, "error": "Task disappeared from background tasks"})
|
||||
cleanup_background_task(task_id)
|
||||
return f"Error: Task {task_id} disappeared from background tasks"
|
||||
|
||||
# Log status changes for debugging
|
||||
@@ -164,14 +165,17 @@ def task_tool(
|
||||
if result.status == SubagentStatus.COMPLETED:
|
||||
writer({"type": "task_completed", "task_id": task_id, "result": result.result})
|
||||
logger.info(f"[trace={trace_id}] Task {task_id} completed after {poll_count} polls")
|
||||
cleanup_background_task(task_id)
|
||||
return f"Task Succeeded. Result: {result.result}"
|
||||
elif result.status == SubagentStatus.FAILED:
|
||||
writer({"type": "task_failed", "task_id": task_id, "error": result.error})
|
||||
logger.error(f"[trace={trace_id}] Task {task_id} failed: {result.error}")
|
||||
cleanup_background_task(task_id)
|
||||
return f"Task failed. Error: {result.error}"
|
||||
elif result.status == SubagentStatus.TIMED_OUT:
|
||||
writer({"type": "task_timed_out", "task_id": task_id, "error": result.error})
|
||||
logger.warning(f"[trace={trace_id}] Task {task_id} timed out: {result.error}")
|
||||
cleanup_background_task(task_id)
|
||||
return f"Task timed out. Error: {result.error}"
|
||||
|
||||
# Still running, wait before next poll
|
||||
@@ -181,6 +185,9 @@ def task_tool(
|
||||
# Polling timeout as a safety net (in case thread pool timeout doesn't work)
|
||||
# Set to execution timeout + 60s buffer, in 5s poll intervals
|
||||
# This catches edge cases where the background task gets stuck
|
||||
# Note: We don't call cleanup_background_task here because the task may
|
||||
# still be running in the background. The cleanup will happen when the
|
||||
# executor completes and sets a terminal status.
|
||||
if poll_count > max_poll_count:
|
||||
timeout_minutes = config.timeout_seconds // 60
|
||||
logger.error(f"[trace={trace_id}] Task {task_id} polling timed out after {poll_count} polls (should have been caught by thread pool timeout)")
|
||||
|
||||
@@ -625,3 +625,151 @@ class TestThreadSafety:
|
||||
for result in results:
|
||||
assert result.status == SubagentStatus.COMPLETED
|
||||
assert "Result" in result.result
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Cleanup Background Task Tests
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestCleanupBackgroundTask:
|
||||
"""Test cleanup_background_task function for race condition prevention."""
|
||||
|
||||
@pytest.fixture
|
||||
def executor_module(self, _setup_executor_classes):
|
||||
"""Import the executor module with real classes."""
|
||||
# Re-import to get the real module with cleanup_background_task
|
||||
import importlib
|
||||
|
||||
from src.subagents import executor
|
||||
|
||||
return importlib.reload(executor)
|
||||
|
||||
def test_cleanup_removes_terminal_completed_task(self, executor_module, classes):
|
||||
"""Test that cleanup removes a COMPLETED task."""
|
||||
SubagentResult = classes["SubagentResult"]
|
||||
SubagentStatus = classes["SubagentStatus"]
|
||||
|
||||
# Add a completed task
|
||||
task_id = "test-completed-task"
|
||||
result = SubagentResult(
|
||||
task_id=task_id,
|
||||
trace_id="test-trace",
|
||||
status=SubagentStatus.COMPLETED,
|
||||
result="done",
|
||||
completed_at=datetime.now(),
|
||||
)
|
||||
executor_module._background_tasks[task_id] = result
|
||||
|
||||
# Cleanup should remove it
|
||||
executor_module.cleanup_background_task(task_id)
|
||||
|
||||
assert task_id not in executor_module._background_tasks
|
||||
|
||||
def test_cleanup_removes_terminal_failed_task(self, executor_module, classes):
|
||||
"""Test that cleanup removes a FAILED task."""
|
||||
SubagentResult = classes["SubagentResult"]
|
||||
SubagentStatus = classes["SubagentStatus"]
|
||||
|
||||
task_id = "test-failed-task"
|
||||
result = SubagentResult(
|
||||
task_id=task_id,
|
||||
trace_id="test-trace",
|
||||
status=SubagentStatus.FAILED,
|
||||
error="error",
|
||||
completed_at=datetime.now(),
|
||||
)
|
||||
executor_module._background_tasks[task_id] = result
|
||||
|
||||
executor_module.cleanup_background_task(task_id)
|
||||
|
||||
assert task_id not in executor_module._background_tasks
|
||||
|
||||
def test_cleanup_removes_terminal_timed_out_task(self, executor_module, classes):
|
||||
"""Test that cleanup removes a TIMED_OUT task."""
|
||||
SubagentResult = classes["SubagentResult"]
|
||||
SubagentStatus = classes["SubagentStatus"]
|
||||
|
||||
task_id = "test-timedout-task"
|
||||
result = SubagentResult(
|
||||
task_id=task_id,
|
||||
trace_id="test-trace",
|
||||
status=SubagentStatus.TIMED_OUT,
|
||||
error="timeout",
|
||||
completed_at=datetime.now(),
|
||||
)
|
||||
executor_module._background_tasks[task_id] = result
|
||||
|
||||
executor_module.cleanup_background_task(task_id)
|
||||
|
||||
assert task_id not in executor_module._background_tasks
|
||||
|
||||
def test_cleanup_skips_running_task(self, executor_module, classes):
|
||||
"""Test that cleanup does NOT remove a RUNNING task.
|
||||
|
||||
This prevents race conditions where task_tool calls cleanup
|
||||
while the background executor is still updating the task.
|
||||
"""
|
||||
SubagentResult = classes["SubagentResult"]
|
||||
SubagentStatus = classes["SubagentStatus"]
|
||||
|
||||
task_id = "test-running-task"
|
||||
result = SubagentResult(
|
||||
task_id=task_id,
|
||||
trace_id="test-trace",
|
||||
status=SubagentStatus.RUNNING,
|
||||
started_at=datetime.now(),
|
||||
)
|
||||
executor_module._background_tasks[task_id] = result
|
||||
|
||||
executor_module.cleanup_background_task(task_id)
|
||||
|
||||
# Should still be present because it's RUNNING
|
||||
assert task_id in executor_module._background_tasks
|
||||
|
||||
def test_cleanup_skips_pending_task(self, executor_module, classes):
|
||||
"""Test that cleanup does NOT remove a PENDING task."""
|
||||
SubagentResult = classes["SubagentResult"]
|
||||
SubagentStatus = classes["SubagentStatus"]
|
||||
|
||||
task_id = "test-pending-task"
|
||||
result = SubagentResult(
|
||||
task_id=task_id,
|
||||
trace_id="test-trace",
|
||||
status=SubagentStatus.PENDING,
|
||||
)
|
||||
executor_module._background_tasks[task_id] = result
|
||||
|
||||
executor_module.cleanup_background_task(task_id)
|
||||
|
||||
assert task_id in executor_module._background_tasks
|
||||
|
||||
def test_cleanup_handles_unknown_task_gracefully(self, executor_module):
|
||||
"""Test that cleanup doesn't raise for unknown task IDs."""
|
||||
# Should not raise
|
||||
executor_module.cleanup_background_task("nonexistent-task")
|
||||
|
||||
def test_cleanup_removes_task_with_completed_at_even_if_running(
|
||||
self, executor_module, classes
|
||||
):
|
||||
"""Test that cleanup removes task if completed_at is set, even if status is RUNNING.
|
||||
|
||||
This is a safety net: if completed_at is set, the task is considered done
|
||||
regardless of status.
|
||||
"""
|
||||
SubagentResult = classes["SubagentResult"]
|
||||
SubagentStatus = classes["SubagentStatus"]
|
||||
|
||||
task_id = "test-completed-at-task"
|
||||
result = SubagentResult(
|
||||
task_id=task_id,
|
||||
trace_id="test-trace",
|
||||
status=SubagentStatus.RUNNING, # Status not terminal
|
||||
completed_at=datetime.now(), # But completed_at is set
|
||||
)
|
||||
executor_module._background_tasks[task_id] = result
|
||||
|
||||
executor_module.cleanup_background_task(task_id)
|
||||
|
||||
# Should be removed because completed_at is set
|
||||
assert task_id not in executor_module._background_tasks
|
||||
|
||||
@@ -239,3 +239,171 @@ def test_task_tool_polling_safety_timeout(monkeypatch):
|
||||
assert output.startswith("Task polling timed out after 0 minutes")
|
||||
assert events[0]["type"] == "task_started"
|
||||
assert events[-1]["type"] == "task_timed_out"
|
||||
|
||||
|
||||
def test_cleanup_called_on_completed(monkeypatch):
|
||||
"""Verify cleanup_background_task is called when task completes."""
|
||||
config = _make_subagent_config()
|
||||
events = []
|
||||
cleanup_calls = []
|
||||
|
||||
monkeypatch.setattr(task_tool_module, "SubagentStatus", FakeSubagentStatus)
|
||||
monkeypatch.setattr(
|
||||
task_tool_module,
|
||||
"SubagentExecutor",
|
||||
type("DummyExecutor", (), {"__init__": lambda self, **kwargs: None, "execute_async": lambda self, prompt, task_id=None: task_id}),
|
||||
)
|
||||
monkeypatch.setattr(task_tool_module, "get_subagent_config", lambda _: config)
|
||||
monkeypatch.setattr(task_tool_module, "get_skills_prompt_section", lambda: "")
|
||||
monkeypatch.setattr(
|
||||
task_tool_module,
|
||||
"get_background_task_result",
|
||||
lambda _: _make_result(FakeSubagentStatus.COMPLETED, result="done"),
|
||||
)
|
||||
monkeypatch.setattr(task_tool_module, "get_stream_writer", lambda: events.append)
|
||||
monkeypatch.setattr(task_tool_module.time, "sleep", lambda _: None)
|
||||
monkeypatch.setattr("src.tools.get_available_tools", lambda **kwargs: [])
|
||||
monkeypatch.setattr(
|
||||
task_tool_module,
|
||||
"cleanup_background_task",
|
||||
lambda task_id: cleanup_calls.append(task_id),
|
||||
)
|
||||
|
||||
output = task_tool_module.task_tool.func(
|
||||
runtime=_make_runtime(),
|
||||
description="执行任务",
|
||||
prompt="complete task",
|
||||
subagent_type="general-purpose",
|
||||
tool_call_id="tc-cleanup-completed",
|
||||
)
|
||||
|
||||
assert output == "Task Succeeded. Result: done"
|
||||
assert cleanup_calls == ["tc-cleanup-completed"]
|
||||
|
||||
|
||||
def test_cleanup_called_on_failed(monkeypatch):
|
||||
"""Verify cleanup_background_task is called when task fails."""
|
||||
config = _make_subagent_config()
|
||||
events = []
|
||||
cleanup_calls = []
|
||||
|
||||
monkeypatch.setattr(task_tool_module, "SubagentStatus", FakeSubagentStatus)
|
||||
monkeypatch.setattr(
|
||||
task_tool_module,
|
||||
"SubagentExecutor",
|
||||
type("DummyExecutor", (), {"__init__": lambda self, **kwargs: None, "execute_async": lambda self, prompt, task_id=None: task_id}),
|
||||
)
|
||||
monkeypatch.setattr(task_tool_module, "get_subagent_config", lambda _: config)
|
||||
monkeypatch.setattr(task_tool_module, "get_skills_prompt_section", lambda: "")
|
||||
monkeypatch.setattr(
|
||||
task_tool_module,
|
||||
"get_background_task_result",
|
||||
lambda _: _make_result(FakeSubagentStatus.FAILED, error="error"),
|
||||
)
|
||||
monkeypatch.setattr(task_tool_module, "get_stream_writer", lambda: events.append)
|
||||
monkeypatch.setattr(task_tool_module.time, "sleep", lambda _: None)
|
||||
monkeypatch.setattr("src.tools.get_available_tools", lambda **kwargs: [])
|
||||
monkeypatch.setattr(
|
||||
task_tool_module,
|
||||
"cleanup_background_task",
|
||||
lambda task_id: cleanup_calls.append(task_id),
|
||||
)
|
||||
|
||||
output = task_tool_module.task_tool.func(
|
||||
runtime=_make_runtime(),
|
||||
description="执行任务",
|
||||
prompt="fail task",
|
||||
subagent_type="general-purpose",
|
||||
tool_call_id="tc-cleanup-failed",
|
||||
)
|
||||
|
||||
assert output == "Task failed. Error: error"
|
||||
assert cleanup_calls == ["tc-cleanup-failed"]
|
||||
|
||||
|
||||
def test_cleanup_called_on_timed_out(monkeypatch):
|
||||
"""Verify cleanup_background_task is called when task times out."""
|
||||
config = _make_subagent_config()
|
||||
events = []
|
||||
cleanup_calls = []
|
||||
|
||||
monkeypatch.setattr(task_tool_module, "SubagentStatus", FakeSubagentStatus)
|
||||
monkeypatch.setattr(
|
||||
task_tool_module,
|
||||
"SubagentExecutor",
|
||||
type("DummyExecutor", (), {"__init__": lambda self, **kwargs: None, "execute_async": lambda self, prompt, task_id=None: task_id}),
|
||||
)
|
||||
monkeypatch.setattr(task_tool_module, "get_subagent_config", lambda _: config)
|
||||
monkeypatch.setattr(task_tool_module, "get_skills_prompt_section", lambda: "")
|
||||
monkeypatch.setattr(
|
||||
task_tool_module,
|
||||
"get_background_task_result",
|
||||
lambda _: _make_result(FakeSubagentStatus.TIMED_OUT, error="timeout"),
|
||||
)
|
||||
monkeypatch.setattr(task_tool_module, "get_stream_writer", lambda: events.append)
|
||||
monkeypatch.setattr(task_tool_module.time, "sleep", lambda _: None)
|
||||
monkeypatch.setattr("src.tools.get_available_tools", lambda **kwargs: [])
|
||||
monkeypatch.setattr(
|
||||
task_tool_module,
|
||||
"cleanup_background_task",
|
||||
lambda task_id: cleanup_calls.append(task_id),
|
||||
)
|
||||
|
||||
output = task_tool_module.task_tool.func(
|
||||
runtime=_make_runtime(),
|
||||
description="执行任务",
|
||||
prompt="timeout task",
|
||||
subagent_type="general-purpose",
|
||||
tool_call_id="tc-cleanup-timedout",
|
||||
)
|
||||
|
||||
assert output == "Task timed out. Error: timeout"
|
||||
assert cleanup_calls == ["tc-cleanup-timedout"]
|
||||
|
||||
|
||||
def test_cleanup_not_called_on_polling_safety_timeout(monkeypatch):
|
||||
"""Verify cleanup_background_task is NOT called on polling safety timeout.
|
||||
|
||||
This prevents race conditions where the background task is still running
|
||||
but the polling loop gives up. The cleanup should happen later when the
|
||||
executor completes and sets a terminal status.
|
||||
"""
|
||||
config = _make_subagent_config()
|
||||
# Keep max_poll_count small for test speed: (1 + 60) // 5 = 12
|
||||
config.timeout_seconds = 1
|
||||
events = []
|
||||
cleanup_calls = []
|
||||
|
||||
monkeypatch.setattr(task_tool_module, "SubagentStatus", FakeSubagentStatus)
|
||||
monkeypatch.setattr(
|
||||
task_tool_module,
|
||||
"SubagentExecutor",
|
||||
type("DummyExecutor", (), {"__init__": lambda self, **kwargs: None, "execute_async": lambda self, prompt, task_id=None: task_id}),
|
||||
)
|
||||
monkeypatch.setattr(task_tool_module, "get_subagent_config", lambda _: config)
|
||||
monkeypatch.setattr(task_tool_module, "get_skills_prompt_section", lambda: "")
|
||||
monkeypatch.setattr(
|
||||
task_tool_module,
|
||||
"get_background_task_result",
|
||||
lambda _: _make_result(FakeSubagentStatus.RUNNING, ai_messages=[]),
|
||||
)
|
||||
monkeypatch.setattr(task_tool_module, "get_stream_writer", lambda: events.append)
|
||||
monkeypatch.setattr(task_tool_module.time, "sleep", lambda _: None)
|
||||
monkeypatch.setattr("src.tools.get_available_tools", lambda **kwargs: [])
|
||||
monkeypatch.setattr(
|
||||
task_tool_module,
|
||||
"cleanup_background_task",
|
||||
lambda task_id: cleanup_calls.append(task_id),
|
||||
)
|
||||
|
||||
output = task_tool_module.task_tool.func(
|
||||
runtime=_make_runtime(),
|
||||
description="执行任务",
|
||||
prompt="never finish",
|
||||
subagent_type="general-purpose",
|
||||
tool_call_id="tc-no-cleanup-safety-timeout",
|
||||
)
|
||||
|
||||
assert output.startswith("Task polling timed out after 0 minutes")
|
||||
# cleanup should NOT be called because the task is still RUNNING
|
||||
assert cleanup_calls == []
|
||||
|
||||
Reference in New Issue
Block a user