fix(subagents): cleanup background tasks after completion to prevent memory leak (#1030)

* fix(subagents): cleanup background tasks after completion to prevent memory leak

Add cleanup_background_task() to remove completed subagent results from the
global _background_tasks dict. Previously, completed tasks were never removed,
so memory grew without bound with each subagent execution.

Alternative approaches considered:
- Future + SubagentHandle pattern: not chosen because it would require refactoring

Chose the simple cleanup approach for minimal code changes while effectively
resolving the memory leak.

Changes:
- Add cleanup_background_task() in executor.py
- Call cleanup in all task_tool return paths (completed, failed, timed out)

* fix(subagents): prevent race condition in background task cleanup

Address Copilot review feedback on memory leak fix:

- Add terminal state check in cleanup_background_task() to only remove
  tasks that are COMPLETED/FAILED/TIMED_OUT or have completed_at set
- Remove cleanup call from polling safety-timeout branch in task_tool
  since the task may still be running
- Add comprehensive tests for cleanup behavior including:
  - Verification that cleanup is called on terminal states
  - Verification that cleanup is NOT called on polling timeout
  - Tests for terminal state check logic in executor

This prevents KeyError when the background executor tries to update
a task that was prematurely removed from _background_tasks.

---------

Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
This commit is contained in:
momorebi
2026-03-10 07:41:48 +08:00
committed by GitHub
parent f6508e0677
commit 0409f8cefd
4 changed files with 361 additions and 1 deletions

View File

@@ -625,3 +625,151 @@ class TestThreadSafety:
for result in results:
assert result.status == SubagentStatus.COMPLETED
assert "Result" in result.result
# -----------------------------------------------------------------------------
# Cleanup Background Task Tests
# -----------------------------------------------------------------------------
class TestCleanupBackgroundTask:
    """Check which ``_background_tasks`` entries ``cleanup_background_task`` removes.

    Each test stores a ``SubagentResult`` in the executor module's
    ``_background_tasks`` dict, calls ``cleanup_background_task`` with its id,
    and asserts whether the entry is still present afterwards.
    """

    @pytest.fixture
    def executor_module(self, _setup_executor_classes):
        """Return ``src.subagents.executor`` after reloading it."""
        # Reload so the module object exposes the real cleanup_background_task.
        import importlib

        from src.subagents import executor

        reloaded = importlib.reload(executor)
        return reloaded

    def test_cleanup_removes_terminal_completed_task(self, executor_module, classes):
        """A COMPLETED entry is gone from the registry after cleanup."""
        result_cls = classes["SubagentResult"]
        status_cls = classes["SubagentStatus"]

        tid = "test-completed-task"
        fields = {
            "task_id": tid,
            "trace_id": "test-trace",
            "status": status_cls.COMPLETED,
            "result": "done",
            "completed_at": datetime.now(),
        }
        # Register the finished task under its id.
        executor_module._background_tasks[tid] = result_cls(**fields)

        executor_module.cleanup_background_task(tid)

        remaining = executor_module._background_tasks
        assert tid not in remaining

    def test_cleanup_removes_terminal_failed_task(self, executor_module, classes):
        """A FAILED entry is gone from the registry after cleanup."""
        result_cls = classes["SubagentResult"]
        status_cls = classes["SubagentStatus"]

        tid = "test-failed-task"
        fields = {
            "task_id": tid,
            "trace_id": "test-trace",
            "status": status_cls.FAILED,
            "error": "error",
            "completed_at": datetime.now(),
        }
        executor_module._background_tasks[tid] = result_cls(**fields)

        executor_module.cleanup_background_task(tid)

        remaining = executor_module._background_tasks
        assert tid not in remaining

    def test_cleanup_removes_terminal_timed_out_task(self, executor_module, classes):
        """A TIMED_OUT entry is gone from the registry after cleanup."""
        result_cls = classes["SubagentResult"]
        status_cls = classes["SubagentStatus"]

        tid = "test-timedout-task"
        fields = {
            "task_id": tid,
            "trace_id": "test-trace",
            "status": status_cls.TIMED_OUT,
            "error": "timeout",
            "completed_at": datetime.now(),
        }
        executor_module._background_tasks[tid] = result_cls(**fields)

        executor_module.cleanup_background_task(tid)

        remaining = executor_module._background_tasks
        assert tid not in remaining

    def test_cleanup_skips_running_task(self, executor_module, classes):
        """A RUNNING entry (no ``completed_at``) survives cleanup.

        Guards against the race where task_tool triggers cleanup while the
        background executor is still updating the task.
        """
        result_cls = classes["SubagentResult"]
        status_cls = classes["SubagentStatus"]

        tid = "test-running-task"
        fields = {
            "task_id": tid,
            "trace_id": "test-trace",
            "status": status_cls.RUNNING,
            "started_at": datetime.now(),
        }
        executor_module._background_tasks[tid] = result_cls(**fields)

        executor_module.cleanup_background_task(tid)

        # Still RUNNING, so the entry must not have been dropped.
        remaining = executor_module._background_tasks
        assert tid in remaining

    def test_cleanup_skips_pending_task(self, executor_module, classes):
        """A PENDING entry survives cleanup."""
        result_cls = classes["SubagentResult"]
        status_cls = classes["SubagentStatus"]

        tid = "test-pending-task"
        fields = {
            "task_id": tid,
            "trace_id": "test-trace",
            "status": status_cls.PENDING,
        }
        executor_module._background_tasks[tid] = result_cls(**fields)

        executor_module.cleanup_background_task(tid)

        remaining = executor_module._background_tasks
        assert tid in remaining

    def test_cleanup_handles_unknown_task_gracefully(self, executor_module):
        """Cleaning up an id that was never registered must not raise."""
        # The test passes as long as this call returns without an exception.
        executor_module.cleanup_background_task("nonexistent-task")

    def test_cleanup_removes_task_with_completed_at_even_if_running(
        self, executor_module, classes
    ):
        """An entry with ``completed_at`` set is removed even when status is RUNNING.

        Safety net: a populated ``completed_at`` marks the task as done
        regardless of what the status field says.
        """
        result_cls = classes["SubagentResult"]
        status_cls = classes["SubagentStatus"]

        tid = "test-completed-at-task"
        fields = {
            "task_id": tid,
            "trace_id": "test-trace",
            "status": status_cls.RUNNING,  # non-terminal status on purpose
            "completed_at": datetime.now(),  # yet a completion time exists
        }
        executor_module._background_tasks[tid] = result_cls(**fields)

        executor_module.cleanup_background_task(tid)

        # completed_at alone is enough for the entry to be dropped.
        remaining = executor_module._background_tasks
        assert tid not in remaining

View File

@@ -239,3 +239,171 @@ def test_task_tool_polling_safety_timeout(monkeypatch):
assert output.startswith("Task polling timed out after 0 minutes")
assert events[0]["type"] == "task_started"
assert events[-1]["type"] == "task_timed_out"
def test_cleanup_called_on_completed(monkeypatch):
    """task_tool invokes cleanup_background_task once when the task is COMPLETED."""

    class DummyExecutor:
        """Executor stand-in: accepts any kwargs and echoes the task id back."""

        def __init__(self, **kwargs):
            pass

        def execute_async(self, prompt, task_id=None):
            return task_id

    def fake_get_result(_):
        # A fresh COMPLETED result is produced on every poll.
        return _make_result(FakeSubagentStatus.COMPLETED, result="done")

    def record_cleanup(task_id):
        return cleanup_calls.append(task_id)

    config = _make_subagent_config()
    events = []
    cleanup_calls = []

    monkeypatch.setattr(task_tool_module, "SubagentStatus", FakeSubagentStatus)
    monkeypatch.setattr(task_tool_module, "SubagentExecutor", DummyExecutor)
    monkeypatch.setattr(task_tool_module, "get_subagent_config", lambda _: config)
    monkeypatch.setattr(task_tool_module, "get_skills_prompt_section", lambda: "")
    monkeypatch.setattr(task_tool_module, "get_background_task_result", fake_get_result)
    monkeypatch.setattr(task_tool_module, "get_stream_writer", lambda: events.append)
    monkeypatch.setattr(task_tool_module.time, "sleep", lambda _: None)
    monkeypatch.setattr("src.tools.get_available_tools", lambda **kwargs: [])
    monkeypatch.setattr(task_tool_module, "cleanup_background_task", record_cleanup)

    call_kwargs = {
        "runtime": _make_runtime(),
        "description": "执行任务",
        "prompt": "complete task",
        "subagent_type": "general-purpose",
        "tool_call_id": "tc-cleanup-completed",
    }
    output = task_tool_module.task_tool.func(**call_kwargs)

    assert output == "Task Succeeded. Result: done"
    # Cleanup is keyed by the tool call id.
    assert cleanup_calls == ["tc-cleanup-completed"]
def test_cleanup_called_on_failed(monkeypatch):
    """task_tool invokes cleanup_background_task once when the task is FAILED."""

    class DummyExecutor:
        """Executor stand-in: accepts any kwargs and echoes the task id back."""

        def __init__(self, **kwargs):
            pass

        def execute_async(self, prompt, task_id=None):
            return task_id

    def fake_get_result(_):
        # A fresh FAILED result is produced on every poll.
        return _make_result(FakeSubagentStatus.FAILED, error="error")

    def record_cleanup(task_id):
        return cleanup_calls.append(task_id)

    config = _make_subagent_config()
    events = []
    cleanup_calls = []

    monkeypatch.setattr(task_tool_module, "SubagentStatus", FakeSubagentStatus)
    monkeypatch.setattr(task_tool_module, "SubagentExecutor", DummyExecutor)
    monkeypatch.setattr(task_tool_module, "get_subagent_config", lambda _: config)
    monkeypatch.setattr(task_tool_module, "get_skills_prompt_section", lambda: "")
    monkeypatch.setattr(task_tool_module, "get_background_task_result", fake_get_result)
    monkeypatch.setattr(task_tool_module, "get_stream_writer", lambda: events.append)
    monkeypatch.setattr(task_tool_module.time, "sleep", lambda _: None)
    monkeypatch.setattr("src.tools.get_available_tools", lambda **kwargs: [])
    monkeypatch.setattr(task_tool_module, "cleanup_background_task", record_cleanup)

    call_kwargs = {
        "runtime": _make_runtime(),
        "description": "执行任务",
        "prompt": "fail task",
        "subagent_type": "general-purpose",
        "tool_call_id": "tc-cleanup-failed",
    }
    output = task_tool_module.task_tool.func(**call_kwargs)

    assert output == "Task failed. Error: error"
    # Cleanup is keyed by the tool call id.
    assert cleanup_calls == ["tc-cleanup-failed"]
def test_cleanup_called_on_timed_out(monkeypatch):
    """task_tool invokes cleanup_background_task once when the task is TIMED_OUT."""

    class DummyExecutor:
        """Executor stand-in: accepts any kwargs and echoes the task id back."""

        def __init__(self, **kwargs):
            pass

        def execute_async(self, prompt, task_id=None):
            return task_id

    def fake_get_result(_):
        # A fresh TIMED_OUT result is produced on every poll.
        return _make_result(FakeSubagentStatus.TIMED_OUT, error="timeout")

    def record_cleanup(task_id):
        return cleanup_calls.append(task_id)

    config = _make_subagent_config()
    events = []
    cleanup_calls = []

    monkeypatch.setattr(task_tool_module, "SubagentStatus", FakeSubagentStatus)
    monkeypatch.setattr(task_tool_module, "SubagentExecutor", DummyExecutor)
    monkeypatch.setattr(task_tool_module, "get_subagent_config", lambda _: config)
    monkeypatch.setattr(task_tool_module, "get_skills_prompt_section", lambda: "")
    monkeypatch.setattr(task_tool_module, "get_background_task_result", fake_get_result)
    monkeypatch.setattr(task_tool_module, "get_stream_writer", lambda: events.append)
    monkeypatch.setattr(task_tool_module.time, "sleep", lambda _: None)
    monkeypatch.setattr("src.tools.get_available_tools", lambda **kwargs: [])
    monkeypatch.setattr(task_tool_module, "cleanup_background_task", record_cleanup)

    call_kwargs = {
        "runtime": _make_runtime(),
        "description": "执行任务",
        "prompt": "timeout task",
        "subagent_type": "general-purpose",
        "tool_call_id": "tc-cleanup-timedout",
    }
    output = task_tool_module.task_tool.func(**call_kwargs)

    assert output == "Task timed out. Error: timeout"
    # Cleanup is keyed by the tool call id.
    assert cleanup_calls == ["tc-cleanup-timedout"]
def test_cleanup_not_called_on_polling_safety_timeout(monkeypatch):
    """task_tool must NOT invoke cleanup when polling hits the safety timeout.

    The background task may still be running when the polling loop gives up,
    so removing its entry here could race with the executor. Cleanup is
    expected to happen later, once the executor sets a terminal status.
    """

    class DummyExecutor:
        """Executor stand-in: accepts any kwargs and echoes the task id back."""

        def __init__(self, **kwargs):
            pass

        def execute_async(self, prompt, task_id=None):
            return task_id

    def fake_get_result(_):
        # Every poll reports the task as still RUNNING.
        return _make_result(FakeSubagentStatus.RUNNING, ai_messages=[])

    def record_cleanup(task_id):
        return cleanup_calls.append(task_id)

    config = _make_subagent_config()
    # Small timeout keeps max_poll_count low for test speed: (1 + 60) // 5 = 12
    config.timeout_seconds = 1
    events = []
    cleanup_calls = []

    monkeypatch.setattr(task_tool_module, "SubagentStatus", FakeSubagentStatus)
    monkeypatch.setattr(task_tool_module, "SubagentExecutor", DummyExecutor)
    monkeypatch.setattr(task_tool_module, "get_subagent_config", lambda _: config)
    monkeypatch.setattr(task_tool_module, "get_skills_prompt_section", lambda: "")
    monkeypatch.setattr(task_tool_module, "get_background_task_result", fake_get_result)
    monkeypatch.setattr(task_tool_module, "get_stream_writer", lambda: events.append)
    monkeypatch.setattr(task_tool_module.time, "sleep", lambda _: None)
    monkeypatch.setattr("src.tools.get_available_tools", lambda **kwargs: [])
    monkeypatch.setattr(task_tool_module, "cleanup_background_task", record_cleanup)

    call_kwargs = {
        "runtime": _make_runtime(),
        "description": "执行任务",
        "prompt": "never finish",
        "subagent_type": "general-purpose",
        "tool_call_id": "tc-no-cleanup-safety-timeout",
    }
    output = task_tool_module.task_tool.func(**call_kwargs)

    assert output.startswith("Task polling timed out after 0 minutes")
    # The task never left RUNNING, so no cleanup call may have been recorded.
    assert cleanup_calls == []