fix(subagents): cleanup background tasks after completion to prevent memory leak (#1030)

* fix(subagents): cleanup background tasks after completion to prevent memory leak

Added cleanup_background_task() function to remove completed subagent results
from the global _background_tasks dict. Found a small issue: completed tasks
were never removed, causing memory to grow indefinitely with each subagent
execution.

Alternative approaches considered:
- Future + SubagentHandle pattern: Not chosen due to requiring refactoring

Chose the simple cleanup approach for minimal code changes while effectively
resolving the memory leak.

Changes:
- Add cleanup_background_task() in executor.py
- Call cleanup in all task_tool return paths (completed, failed, timed out)

* fix(subagents): prevent race condition in background task cleanup

Address Copilot review feedback on memory leak fix:

- Add terminal state check in cleanup_background_task() to only remove
  tasks that are COMPLETED/FAILED/TIMED_OUT or have completed_at set
- Remove cleanup call from polling safety-timeout branch in task_tool
  since the task may still be running
- Add comprehensive tests for cleanup behavior including:
  - Verification that cleanup is called on terminal states
  - Verification that cleanup is NOT called on polling timeout
  - Tests for terminal state check logic in executor

This prevents KeyError when the background executor tries to update
a task that was prematurely removed from _background_tasks.

---------

Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
This commit is contained in:
momorebi
2026-03-10 07:41:48 +08:00
committed by GitHub
parent f6508e0677
commit 0409f8cefd
4 changed files with 361 additions and 1 deletions

View File

@@ -452,3 +452,40 @@ def list_background_tasks() -> list[SubagentResult]:
"""
with _background_tasks_lock:
return list(_background_tasks.values())
def cleanup_background_task(task_id: str) -> None:
"""Remove a completed task from background tasks.
Should be called by task_tool after it finishes polling and returns the result.
This prevents memory leaks from accumulated completed tasks.
Only removes tasks that are in a terminal state (COMPLETED/FAILED/TIMED_OUT)
to avoid race conditions with the background executor still updating the task entry.
Args:
task_id: The task ID to remove.
"""
with _background_tasks_lock:
result = _background_tasks.get(task_id)
if result is None:
# Nothing to clean up; may have been removed already.
logger.debug("Requested cleanup for unknown background task %s", task_id)
return
# Only clean up tasks that are in a terminal state to avoid races with
# the background executor still updating the task entry.
is_terminal_status = result.status in {
SubagentStatus.COMPLETED,
SubagentStatus.FAILED,
SubagentStatus.TIMED_OUT,
}
if is_terminal_status or result.completed_at is not None:
del _background_tasks[task_id]
logger.debug("Cleaned up background task: %s", task_id)
else:
logger.debug(
"Skipping cleanup for non-terminal background task %s (status=%s)",
task_id,
result.status.value if hasattr(result.status, "value") else result.status,
)

View File

@@ -13,7 +13,7 @@ from langgraph.typing import ContextT
from src.agents.lead_agent.prompt import get_skills_prompt_section
from src.agents.thread_state import ThreadState
from src.subagents import SubagentExecutor, get_subagent_config
from src.subagents.executor import SubagentStatus, get_background_task_result
from src.subagents.executor import SubagentStatus, cleanup_background_task, get_background_task_result
logger = logging.getLogger(__name__)
@@ -135,6 +135,7 @@ def task_tool(
if result is None:
logger.error(f"[trace={trace_id}] Task {task_id} not found in background tasks")
writer({"type": "task_failed", "task_id": task_id, "error": "Task disappeared from background tasks"})
cleanup_background_task(task_id)
return f"Error: Task {task_id} disappeared from background tasks"
# Log status changes for debugging
@@ -164,14 +165,17 @@ def task_tool(
if result.status == SubagentStatus.COMPLETED:
writer({"type": "task_completed", "task_id": task_id, "result": result.result})
logger.info(f"[trace={trace_id}] Task {task_id} completed after {poll_count} polls")
cleanup_background_task(task_id)
return f"Task Succeeded. Result: {result.result}"
elif result.status == SubagentStatus.FAILED:
writer({"type": "task_failed", "task_id": task_id, "error": result.error})
logger.error(f"[trace={trace_id}] Task {task_id} failed: {result.error}")
cleanup_background_task(task_id)
return f"Task failed. Error: {result.error}"
elif result.status == SubagentStatus.TIMED_OUT:
writer({"type": "task_timed_out", "task_id": task_id, "error": result.error})
logger.warning(f"[trace={trace_id}] Task {task_id} timed out: {result.error}")
cleanup_background_task(task_id)
return f"Task timed out. Error: {result.error}"
# Still running, wait before next poll
@@ -181,6 +185,9 @@ def task_tool(
# Polling timeout as a safety net (in case thread pool timeout doesn't work)
# Set to execution timeout + 60s buffer, in 5s poll intervals
# This catches edge cases where the background task gets stuck
# Note: We don't call cleanup_background_task here because the task may
# still be running in the background. The cleanup will happen when the
# executor completes and sets a terminal status.
if poll_count > max_poll_count:
timeout_minutes = config.timeout_seconds // 60
logger.error(f"[trace={trace_id}] Task {task_id} polling timed out after {poll_count} polls (should have been caught by thread pool timeout)")