mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-03 06:12:14 +08:00
chore(docker): Refactor sandbox state management and improve Docker integration (#1068)
* Refactor sandbox state management and improve Docker integration
  - Removed FileSandboxStateStore and SandboxStateStore classes for a cleaner architecture.
  - Enhanced LocalContainerBackend to handle port allocation retries and introduced environment variable support for sandbox host configuration.
  - Updated Paths class to include host_base_dir for Docker volume mounts and ensured proper permissions for sandbox directories.
  - Modified ExtensionsConfig to improve error handling when loading configuration files and adjusted environment variable resolution.
  - Updated sandbox configuration to include a replicas option for managing concurrent sandbox containers.
  - Improved logging and context management in SandboxMiddleware for better sandbox lifecycle handling.
  - Enhanced network port allocation logic to bind to 0.0.0.0 for compatibility with Docker.
  - Updated Docker Compose files to ensure proper volume management and environment variable configuration.
  - Created scripts to ensure necessary configuration files are present before starting services.
  - Cleaned up unused MCP server configurations in extensions_config.example.json.
* Address Copilot review suggestions from PR #1068 (#9)

---------

Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -7,9 +7,11 @@ RUN apt-get update && apt-get install -y \
|
|||||||
build-essential \
|
build-essential \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
# Install uv
|
# Install Docker CLI (for DooD: allows starting sandbox containers via host Docker socket)
|
||||||
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
|
COPY --from=docker:cli /usr/local/bin/docker /usr/local/bin/docker
|
||||||
ENV PATH="/root/.local/bin:$PATH"
|
|
||||||
|
# Install uv from a pinned versioned image (avoids curl|sh from untrusted remote)
|
||||||
|
COPY --from=ghcr.io/astral-sh/uv:0.7.20 /uv /uvx /usr/local/bin/
|
||||||
|
|
||||||
# Set working directory
|
# Set working directory
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|||||||
@@ -1,19 +1,15 @@
|
|||||||
from .aio_sandbox import AioSandbox
|
from .aio_sandbox import AioSandbox
|
||||||
from .aio_sandbox_provider import AioSandboxProvider
|
from .aio_sandbox_provider import AioSandboxProvider
|
||||||
from .backend import SandboxBackend
|
from .backend import SandboxBackend
|
||||||
from .file_state_store import FileSandboxStateStore
|
|
||||||
from .local_backend import LocalContainerBackend
|
from .local_backend import LocalContainerBackend
|
||||||
from .remote_backend import RemoteSandboxBackend
|
from .remote_backend import RemoteSandboxBackend
|
||||||
from .sandbox_info import SandboxInfo
|
from .sandbox_info import SandboxInfo
|
||||||
from .state_store import SandboxStateStore
|
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"AioSandbox",
|
"AioSandbox",
|
||||||
"AioSandboxProvider",
|
"AioSandboxProvider",
|
||||||
"FileSandboxStateStore",
|
|
||||||
"LocalContainerBackend",
|
"LocalContainerBackend",
|
||||||
"RemoteSandboxBackend",
|
"RemoteSandboxBackend",
|
||||||
"SandboxBackend",
|
"SandboxBackend",
|
||||||
"SandboxInfo",
|
"SandboxInfo",
|
||||||
"SandboxStateStore",
|
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -1,18 +1,17 @@
|
|||||||
"""AIO Sandbox Provider — orchestrates sandbox lifecycle with pluggable backends.
|
"""AIO Sandbox Provider — orchestrates sandbox lifecycle with pluggable backends.
|
||||||
|
|
||||||
This provider composes two abstractions:
|
This provider composes:
|
||||||
- SandboxBackend: how sandboxes are provisioned (local container vs remote/K8s)
|
- SandboxBackend: how sandboxes are provisioned (local container vs remote/K8s)
|
||||||
- SandboxStateStore: how thread→sandbox mappings are persisted (file vs Redis)
|
|
||||||
|
|
||||||
The provider itself handles:
|
The provider itself handles:
|
||||||
- In-process caching for fast repeated access
|
- In-process caching for fast repeated access
|
||||||
- Thread-safe locking (in-process + cross-process via state store)
|
|
||||||
- Idle timeout management
|
- Idle timeout management
|
||||||
- Graceful shutdown with signal handling
|
- Graceful shutdown with signal handling
|
||||||
- Mount computation (thread-specific, skills)
|
- Mount computation (thread-specific, skills)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import atexit
|
import atexit
|
||||||
|
import fcntl
|
||||||
import hashlib
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
@@ -22,17 +21,15 @@ import time
|
|||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
from src.config import get_app_config
|
from src.config import get_app_config
|
||||||
from src.config.paths import VIRTUAL_PATH_PREFIX, get_paths
|
from src.config.paths import VIRTUAL_PATH_PREFIX, Paths, get_paths
|
||||||
from src.sandbox.sandbox import Sandbox
|
from src.sandbox.sandbox import Sandbox
|
||||||
from src.sandbox.sandbox_provider import SandboxProvider
|
from src.sandbox.sandbox_provider import SandboxProvider
|
||||||
|
|
||||||
from .aio_sandbox import AioSandbox
|
from .aio_sandbox import AioSandbox
|
||||||
from .backend import SandboxBackend, wait_for_sandbox_ready
|
from .backend import SandboxBackend, wait_for_sandbox_ready
|
||||||
from .file_state_store import FileSandboxStateStore
|
|
||||||
from .local_backend import LocalContainerBackend
|
from .local_backend import LocalContainerBackend
|
||||||
from .remote_backend import RemoteSandboxBackend
|
from .remote_backend import RemoteSandboxBackend
|
||||||
from .sandbox_info import SandboxInfo
|
from .sandbox_info import SandboxInfo
|
||||||
from .state_store import SandboxStateStore
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -41,6 +38,7 @@ DEFAULT_IMAGE = "enterprise-public-cn-beijing.cr.volces.com/vefaas-public/all-in
|
|||||||
DEFAULT_PORT = 8080
|
DEFAULT_PORT = 8080
|
||||||
DEFAULT_CONTAINER_PREFIX = "deer-flow-sandbox"
|
DEFAULT_CONTAINER_PREFIX = "deer-flow-sandbox"
|
||||||
DEFAULT_IDLE_TIMEOUT = 600 # 10 minutes in seconds
|
DEFAULT_IDLE_TIMEOUT = 600 # 10 minutes in seconds
|
||||||
|
DEFAULT_REPLICAS = 3 # Maximum concurrent sandbox containers
|
||||||
IDLE_CHECK_INTERVAL = 60 # Check every 60 seconds
|
IDLE_CHECK_INTERVAL = 60 # Check every 60 seconds
|
||||||
|
|
||||||
|
|
||||||
@@ -48,20 +46,17 @@ class AioSandboxProvider(SandboxProvider):
|
|||||||
"""Sandbox provider that manages containers running the AIO sandbox.
|
"""Sandbox provider that manages containers running the AIO sandbox.
|
||||||
|
|
||||||
Architecture:
|
Architecture:
|
||||||
This provider composes a SandboxBackend (how to provision) and a
|
This provider composes a SandboxBackend (how to provision), enabling:
|
||||||
SandboxStateStore (how to persist state), enabling:
|
|
||||||
- Local Docker/Apple Container mode (auto-start containers)
|
- Local Docker/Apple Container mode (auto-start containers)
|
||||||
- Remote/K8s mode (connect to pre-existing sandbox URL)
|
- Remote/K8s mode (connect to pre-existing sandbox URL)
|
||||||
- Cross-process consistency via file-based or Redis state stores
|
|
||||||
|
|
||||||
Configuration options in config.yaml under sandbox:
|
Configuration options in config.yaml under sandbox:
|
||||||
use: src.community.aio_sandbox:AioSandboxProvider
|
use: src.community.aio_sandbox:AioSandboxProvider
|
||||||
image: <container image>
|
image: <container image>
|
||||||
port: 8080 # Base port for local containers
|
port: 8080 # Base port for local containers
|
||||||
base_url: http://... # If set, uses remote backend (K8s/external)
|
|
||||||
auto_start: true # Whether to auto-start local containers
|
|
||||||
container_prefix: deer-flow-sandbox
|
container_prefix: deer-flow-sandbox
|
||||||
idle_timeout: 600 # Idle timeout in seconds (0 to disable)
|
idle_timeout: 600 # Idle timeout in seconds (0 to disable)
|
||||||
|
replicas: 3 # Max concurrent sandbox containers (LRU eviction when exceeded)
|
||||||
mounts: # Volume mounts for local containers
|
mounts: # Volume mounts for local containers
|
||||||
- host_path: /path/on/host
|
- host_path: /path/on/host
|
||||||
container_path: /path/in/container
|
container_path: /path/in/container
|
||||||
@@ -78,13 +73,17 @@ class AioSandboxProvider(SandboxProvider):
|
|||||||
self._thread_sandboxes: dict[str, str] = {} # thread_id -> sandbox_id
|
self._thread_sandboxes: dict[str, str] = {} # thread_id -> sandbox_id
|
||||||
self._thread_locks: dict[str, threading.Lock] = {} # thread_id -> in-process lock
|
self._thread_locks: dict[str, threading.Lock] = {} # thread_id -> in-process lock
|
||||||
self._last_activity: dict[str, float] = {} # sandbox_id -> last activity timestamp
|
self._last_activity: dict[str, float] = {} # sandbox_id -> last activity timestamp
|
||||||
|
# Warm pool: released sandboxes whose containers are still running.
|
||||||
|
# Maps sandbox_id -> (SandboxInfo, release_timestamp).
|
||||||
|
# Containers here can be reclaimed quickly (no cold-start) or destroyed
|
||||||
|
# when replicas capacity is exhausted.
|
||||||
|
self._warm_pool: dict[str, tuple[SandboxInfo, float]] = {}
|
||||||
self._shutdown_called = False
|
self._shutdown_called = False
|
||||||
self._idle_checker_stop = threading.Event()
|
self._idle_checker_stop = threading.Event()
|
||||||
self._idle_checker_thread: threading.Thread | None = None
|
self._idle_checker_thread: threading.Thread | None = None
|
||||||
|
|
||||||
self._config = self._load_config()
|
self._config = self._load_config()
|
||||||
self._backend: SandboxBackend = self._create_backend()
|
self._backend: SandboxBackend = self._create_backend()
|
||||||
self._state_store: SandboxStateStore = self._create_state_store()
|
|
||||||
|
|
||||||
# Register shutdown handler
|
# Register shutdown handler
|
||||||
atexit.register(self.shutdown)
|
atexit.register(self.shutdown)
|
||||||
@@ -102,16 +101,14 @@ class AioSandboxProvider(SandboxProvider):
|
|||||||
Selection logic (checked in order):
|
Selection logic (checked in order):
|
||||||
1. ``provisioner_url`` set → RemoteSandboxBackend (provisioner mode)
|
1. ``provisioner_url`` set → RemoteSandboxBackend (provisioner mode)
|
||||||
Provisioner dynamically creates Pods + Services in k3s.
|
Provisioner dynamically creates Pods + Services in k3s.
|
||||||
2. ``auto_start`` → LocalContainerBackend (Docker / Apple Container)
|
2. Default → LocalContainerBackend (local mode)
|
||||||
|
Local provider manages container lifecycle directly (start/stop).
|
||||||
"""
|
"""
|
||||||
provisioner_url = self._config.get("provisioner_url")
|
provisioner_url = self._config.get("provisioner_url")
|
||||||
if provisioner_url:
|
if provisioner_url:
|
||||||
logger.info(f"Using remote sandbox backend with provisioner at {provisioner_url}")
|
logger.info(f"Using remote sandbox backend with provisioner at {provisioner_url}")
|
||||||
return RemoteSandboxBackend(provisioner_url=provisioner_url)
|
return RemoteSandboxBackend(provisioner_url=provisioner_url)
|
||||||
|
|
||||||
if not self._config.get("auto_start", True):
|
|
||||||
raise RuntimeError("auto_start is disabled and no base_url is configured")
|
|
||||||
|
|
||||||
logger.info("Using local container sandbox backend")
|
logger.info("Using local container sandbox backend")
|
||||||
return LocalContainerBackend(
|
return LocalContainerBackend(
|
||||||
image=self._config["image"],
|
image=self._config["image"],
|
||||||
@@ -121,21 +118,6 @@ class AioSandboxProvider(SandboxProvider):
|
|||||||
environment=self._config["environment"],
|
environment=self._config["environment"],
|
||||||
)
|
)
|
||||||
|
|
||||||
def _create_state_store(self) -> SandboxStateStore:
|
|
||||||
"""Create the state store for cross-process sandbox mapping persistence.
|
|
||||||
|
|
||||||
Currently uses file-based store. For distributed multi-host deployments,
|
|
||||||
a Redis-based store can be plugged in here.
|
|
||||||
"""
|
|
||||||
# TODO: Support RedisSandboxStateStore for distributed deployments.
|
|
||||||
# Configuration would be:
|
|
||||||
# sandbox:
|
|
||||||
# state_store: redis
|
|
||||||
# redis_url: redis://localhost:6379/0
|
|
||||||
# This would enable cross-host sandbox discovery (e.g., multiple K8s pods
|
|
||||||
# without shared PVC, or multi-node Docker Swarm).
|
|
||||||
return FileSandboxStateStore(base_dir=str(get_paths().base_dir))
|
|
||||||
|
|
||||||
# ── Configuration ────────────────────────────────────────────────────
|
# ── Configuration ────────────────────────────────────────────────────
|
||||||
|
|
||||||
def _load_config(self) -> dict:
|
def _load_config(self) -> dict:
|
||||||
@@ -143,13 +125,15 @@ class AioSandboxProvider(SandboxProvider):
|
|||||||
config = get_app_config()
|
config = get_app_config()
|
||||||
sandbox_config = config.sandbox
|
sandbox_config = config.sandbox
|
||||||
|
|
||||||
|
idle_timeout = getattr(sandbox_config, "idle_timeout", None)
|
||||||
|
replicas = getattr(sandbox_config, "replicas", None)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"image": sandbox_config.image or DEFAULT_IMAGE,
|
"image": sandbox_config.image or DEFAULT_IMAGE,
|
||||||
"port": sandbox_config.port or DEFAULT_PORT,
|
"port": sandbox_config.port or DEFAULT_PORT,
|
||||||
"base_url": sandbox_config.base_url,
|
|
||||||
"auto_start": sandbox_config.auto_start if sandbox_config.auto_start is not None else True,
|
|
||||||
"container_prefix": sandbox_config.container_prefix or DEFAULT_CONTAINER_PREFIX,
|
"container_prefix": sandbox_config.container_prefix or DEFAULT_CONTAINER_PREFIX,
|
||||||
"idle_timeout": getattr(sandbox_config, "idle_timeout", None) or DEFAULT_IDLE_TIMEOUT,
|
"idle_timeout": idle_timeout if idle_timeout is not None else DEFAULT_IDLE_TIMEOUT,
|
||||||
|
"replicas": replicas if replicas is not None else DEFAULT_REPLICAS,
|
||||||
"mounts": sandbox_config.mounts or [],
|
"mounts": sandbox_config.mounts or [],
|
||||||
"environment": self._resolve_env_vars(sandbox_config.environment or {}),
|
"environment": self._resolve_env_vars(sandbox_config.environment or {}),
|
||||||
# provisioner URL for dynamic pod management (e.g. http://provisioner:8002)
|
# provisioner URL for dynamic pod management (e.g. http://provisioner:8002)
|
||||||
@@ -201,28 +185,38 @@ class AioSandboxProvider(SandboxProvider):
|
|||||||
"""Get volume mounts for a thread's data directories.
|
"""Get volume mounts for a thread's data directories.
|
||||||
|
|
||||||
Creates directories if they don't exist (lazy initialization).
|
Creates directories if they don't exist (lazy initialization).
|
||||||
|
Mount sources use host_base_dir so that when running inside Docker with a
|
||||||
|
mounted Docker socket (DooD), the host Docker daemon can resolve the paths.
|
||||||
"""
|
"""
|
||||||
paths = get_paths()
|
paths = get_paths()
|
||||||
paths.ensure_thread_dirs(thread_id)
|
paths.ensure_thread_dirs(thread_id)
|
||||||
|
|
||||||
mounts = [
|
# host_paths resolves to the host-side base dir when DEER_FLOW_HOST_BASE_DIR
|
||||||
(str(paths.sandbox_work_dir(thread_id)), f"{VIRTUAL_PATH_PREFIX}/workspace", False),
|
# is set, otherwise falls back to the container's own base dir (native mode).
|
||||||
(str(paths.sandbox_uploads_dir(thread_id)), f"{VIRTUAL_PATH_PREFIX}/uploads", False),
|
host_paths = Paths(base_dir=paths.host_base_dir)
|
||||||
(str(paths.sandbox_outputs_dir(thread_id)), f"{VIRTUAL_PATH_PREFIX}/outputs", False),
|
|
||||||
]
|
|
||||||
|
|
||||||
return mounts
|
return [
|
||||||
|
(str(host_paths.sandbox_work_dir(thread_id)), f"{VIRTUAL_PATH_PREFIX}/workspace", False),
|
||||||
|
(str(host_paths.sandbox_uploads_dir(thread_id)), f"{VIRTUAL_PATH_PREFIX}/uploads", False),
|
||||||
|
(str(host_paths.sandbox_outputs_dir(thread_id)), f"{VIRTUAL_PATH_PREFIX}/outputs", False),
|
||||||
|
]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _get_skills_mount() -> tuple[str, str, bool] | None:
|
def _get_skills_mount() -> tuple[str, str, bool] | None:
|
||||||
"""Get the skills directory mount configuration."""
|
"""Get the skills directory mount configuration.
|
||||||
|
|
||||||
|
Mount source uses DEER_FLOW_HOST_SKILLS_PATH when running inside Docker (DooD)
|
||||||
|
so the host Docker daemon can resolve the path.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
config = get_app_config()
|
config = get_app_config()
|
||||||
skills_path = config.skills.get_skills_path()
|
skills_path = config.skills.get_skills_path()
|
||||||
container_path = config.skills.container_path
|
container_path = config.skills.container_path
|
||||||
|
|
||||||
if skills_path.exists():
|
if skills_path.exists():
|
||||||
return (str(skills_path), container_path, True) # Read-only for security
|
# When running inside Docker with DooD, use host-side skills path.
|
||||||
|
host_skills = os.environ.get("DEER_FLOW_HOST_SKILLS_PATH") or str(skills_path)
|
||||||
|
return (host_skills, container_path, True) # Read-only for security
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Could not setup skills mount: {e}")
|
logger.warning(f"Could not setup skills mount: {e}")
|
||||||
return None
|
return None
|
||||||
@@ -249,21 +243,53 @@ class AioSandboxProvider(SandboxProvider):
|
|||||||
|
|
||||||
def _cleanup_idle_sandboxes(self, idle_timeout: float) -> None:
|
def _cleanup_idle_sandboxes(self, idle_timeout: float) -> None:
|
||||||
current_time = time.time()
|
current_time = time.time()
|
||||||
sandboxes_to_release = []
|
active_to_destroy = []
|
||||||
|
warm_to_destroy: list[tuple[str, SandboxInfo]] = []
|
||||||
|
|
||||||
with self._lock:
|
with self._lock:
|
||||||
|
# Active sandboxes: tracked via _last_activity
|
||||||
for sandbox_id, last_activity in self._last_activity.items():
|
for sandbox_id, last_activity in self._last_activity.items():
|
||||||
idle_duration = current_time - last_activity
|
idle_duration = current_time - last_activity
|
||||||
if idle_duration > idle_timeout:
|
if idle_duration > idle_timeout:
|
||||||
sandboxes_to_release.append(sandbox_id)
|
active_to_destroy.append(sandbox_id)
|
||||||
logger.info(f"Sandbox {sandbox_id} idle for {idle_duration:.1f}s, marking for release")
|
logger.info(f"Sandbox {sandbox_id} idle for {idle_duration:.1f}s, marking for destroy")
|
||||||
|
|
||||||
for sandbox_id in sandboxes_to_release:
|
# Warm pool: tracked via release_timestamp stored in _warm_pool
|
||||||
|
for sandbox_id, (info, release_ts) in list(self._warm_pool.items()):
|
||||||
|
warm_duration = current_time - release_ts
|
||||||
|
if warm_duration > idle_timeout:
|
||||||
|
warm_to_destroy.append((sandbox_id, info))
|
||||||
|
del self._warm_pool[sandbox_id]
|
||||||
|
logger.info(f"Warm-pool sandbox {sandbox_id} idle for {warm_duration:.1f}s, marking for destroy")
|
||||||
|
|
||||||
|
# Destroy active sandboxes (re-verify still idle before acting)
|
||||||
|
for sandbox_id in active_to_destroy:
|
||||||
try:
|
try:
|
||||||
logger.info(f"Releasing idle sandbox {sandbox_id}")
|
# Re-verify the sandbox is still idle under the lock before destroying.
|
||||||
self.release(sandbox_id)
|
# Between the snapshot above and here, the sandbox may have been
|
||||||
|
# re-acquired (last_activity updated) or already released/destroyed.
|
||||||
|
with self._lock:
|
||||||
|
last_activity = self._last_activity.get(sandbox_id)
|
||||||
|
if last_activity is None:
|
||||||
|
# Already released or destroyed by another path — skip.
|
||||||
|
logger.info(f"Sandbox {sandbox_id} already gone before idle destroy, skipping")
|
||||||
|
continue
|
||||||
|
if (time.time() - last_activity) < idle_timeout:
|
||||||
|
# Re-acquired (activity updated) since the snapshot — skip.
|
||||||
|
logger.info(f"Sandbox {sandbox_id} was re-acquired before idle destroy, skipping")
|
||||||
|
continue
|
||||||
|
logger.info(f"Destroying idle sandbox {sandbox_id}")
|
||||||
|
self.destroy(sandbox_id)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to release idle sandbox {sandbox_id}: {e}")
|
logger.error(f"Failed to destroy idle sandbox {sandbox_id}: {e}")
|
||||||
|
|
||||||
|
# Destroy warm-pool sandboxes (already removed from _warm_pool under lock above)
|
||||||
|
for sandbox_id, info in warm_to_destroy:
|
||||||
|
try:
|
||||||
|
self._backend.destroy(info)
|
||||||
|
logger.info(f"Destroyed idle warm-pool sandbox {sandbox_id}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to destroy idle warm-pool sandbox {sandbox_id}: {e}")
|
||||||
|
|
||||||
# ── Signal handling ──────────────────────────────────────────────────
|
# ── Signal handling ──────────────────────────────────────────────────
|
||||||
|
|
||||||
@@ -321,11 +347,12 @@ class AioSandboxProvider(SandboxProvider):
|
|||||||
return self._acquire_internal(thread_id)
|
return self._acquire_internal(thread_id)
|
||||||
|
|
||||||
def _acquire_internal(self, thread_id: str | None) -> str:
|
def _acquire_internal(self, thread_id: str | None) -> str:
|
||||||
"""Internal sandbox acquisition with three-layer consistency.
|
"""Internal sandbox acquisition with two-layer consistency.
|
||||||
|
|
||||||
Layer 1: In-process cache (fastest, covers same-process repeated access)
|
Layer 1: In-process cache (fastest, covers same-process repeated access)
|
||||||
Layer 2: Cross-process state store + file lock (covers multi-process)
|
Layer 2: Backend discovery (covers containers started by other processes;
|
||||||
Layer 3: Backend discovery (covers containers started by other processes)
|
sandbox_id is deterministic from thread_id so no shared state file
|
||||||
|
is needed — any process can derive the same container name)
|
||||||
"""
|
"""
|
||||||
# ── Layer 1: In-process cache (fast path) ──
|
# ── Layer 1: In-process cache (fast path) ──
|
||||||
if thread_id:
|
if thread_id:
|
||||||
@@ -342,56 +369,96 @@ class AioSandboxProvider(SandboxProvider):
|
|||||||
# Deterministic ID for thread-specific, random for anonymous
|
# Deterministic ID for thread-specific, random for anonymous
|
||||||
sandbox_id = self._deterministic_sandbox_id(thread_id) if thread_id else str(uuid.uuid4())[:8]
|
sandbox_id = self._deterministic_sandbox_id(thread_id) if thread_id else str(uuid.uuid4())[:8]
|
||||||
|
|
||||||
# ── Layer 2 & 3: Cross-process recovery + creation ──
|
# ── Layer 1.5: Warm pool (container still running, no cold-start) ──
|
||||||
if thread_id:
|
if thread_id:
|
||||||
with self._state_store.lock(thread_id):
|
with self._lock:
|
||||||
# Try to recover from persisted state or discover existing container
|
if sandbox_id in self._warm_pool:
|
||||||
recovered_id = self._try_recover(thread_id)
|
info, _ = self._warm_pool.pop(sandbox_id)
|
||||||
if recovered_id is not None:
|
sandbox = AioSandbox(id=sandbox_id, base_url=info.sandbox_url)
|
||||||
return recovered_id
|
self._sandboxes[sandbox_id] = sandbox
|
||||||
# Nothing to recover — create new sandbox (still under cross-process lock)
|
self._sandbox_infos[sandbox_id] = info
|
||||||
return self._create_sandbox(thread_id, sandbox_id)
|
self._last_activity[sandbox_id] = time.time()
|
||||||
else:
|
self._thread_sandboxes[thread_id] = sandbox_id
|
||||||
|
logger.info(f"Reclaimed warm-pool sandbox {sandbox_id} for thread {thread_id} at {info.sandbox_url}")
|
||||||
|
return sandbox_id
|
||||||
|
|
||||||
|
# ── Layer 2: Backend discovery + create (protected by cross-process lock) ──
|
||||||
|
# Use a file lock so that two processes racing to create the same sandbox
|
||||||
|
# for the same thread_id serialize here: the second process will discover
|
||||||
|
# the container started by the first instead of hitting a name-conflict.
|
||||||
|
if thread_id:
|
||||||
|
return self._discover_or_create_with_lock(thread_id, sandbox_id)
|
||||||
|
|
||||||
return self._create_sandbox(thread_id, sandbox_id)
|
return self._create_sandbox(thread_id, sandbox_id)
|
||||||
|
|
||||||
def _try_recover(self, thread_id: str) -> str | None:
|
def _discover_or_create_with_lock(self, thread_id: str, sandbox_id: str) -> str:
|
||||||
"""Try to recover a sandbox from persisted state or backend discovery.
|
"""Discover an existing sandbox or create a new one under a cross-process file lock.
|
||||||
|
|
||||||
Called under cross-process lock for the given thread_id.
|
The file lock serializes concurrent sandbox creation for the same thread_id
|
||||||
|
across multiple processes, preventing container-name conflicts.
|
||||||
Args:
|
|
||||||
thread_id: The thread ID.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
The sandbox_id if recovery succeeded, None otherwise.
|
|
||||||
"""
|
"""
|
||||||
info = self._state_store.load(thread_id)
|
paths = get_paths()
|
||||||
if info is None:
|
paths.ensure_thread_dirs(thread_id)
|
||||||
return None
|
lock_path = paths.thread_dir(thread_id) / f"{sandbox_id}.lock"
|
||||||
|
|
||||||
# Re-discover: verifies sandbox is alive and gets current connection info
|
with open(lock_path, "a") as lock_file:
|
||||||
# (handles cases like port changes after container restart)
|
try:
|
||||||
discovered = self._backend.discover(info.sandbox_id)
|
fcntl.flock(lock_file, fcntl.LOCK_EX)
|
||||||
if discovered is None:
|
# Re-check in-process caches under the file lock in case another
|
||||||
logger.info(f"Persisted sandbox {info.sandbox_id} for thread {thread_id} could not be recovered")
|
# thread in this process won the race while we were waiting.
|
||||||
self._state_store.remove(thread_id)
|
with self._lock:
|
||||||
return None
|
if thread_id in self._thread_sandboxes:
|
||||||
|
existing_id = self._thread_sandboxes[thread_id]
|
||||||
|
if existing_id in self._sandboxes:
|
||||||
|
logger.info(f"Reusing in-process sandbox {existing_id} for thread {thread_id} (post-lock check)")
|
||||||
|
self._last_activity[existing_id] = time.time()
|
||||||
|
return existing_id
|
||||||
|
if sandbox_id in self._warm_pool:
|
||||||
|
info, _ = self._warm_pool.pop(sandbox_id)
|
||||||
|
sandbox = AioSandbox(id=sandbox_id, base_url=info.sandbox_url)
|
||||||
|
self._sandboxes[sandbox_id] = sandbox
|
||||||
|
self._sandbox_infos[sandbox_id] = info
|
||||||
|
self._last_activity[sandbox_id] = time.time()
|
||||||
|
self._thread_sandboxes[thread_id] = sandbox_id
|
||||||
|
logger.info(f"Reclaimed warm-pool sandbox {sandbox_id} for thread {thread_id} (post-lock check)")
|
||||||
|
return sandbox_id
|
||||||
|
|
||||||
# Adopt into this process's memory
|
# Backend discovery: another process may have created the container.
|
||||||
|
discovered = self._backend.discover(sandbox_id)
|
||||||
|
if discovered is not None:
|
||||||
sandbox = AioSandbox(id=discovered.sandbox_id, base_url=discovered.sandbox_url)
|
sandbox = AioSandbox(id=discovered.sandbox_id, base_url=discovered.sandbox_url)
|
||||||
with self._lock:
|
with self._lock:
|
||||||
self._sandboxes[discovered.sandbox_id] = sandbox
|
self._sandboxes[discovered.sandbox_id] = sandbox
|
||||||
self._sandbox_infos[discovered.sandbox_id] = discovered
|
self._sandbox_infos[discovered.sandbox_id] = discovered
|
||||||
self._last_activity[discovered.sandbox_id] = time.time()
|
self._last_activity[discovered.sandbox_id] = time.time()
|
||||||
self._thread_sandboxes[thread_id] = discovered.sandbox_id
|
self._thread_sandboxes[thread_id] = discovered.sandbox_id
|
||||||
|
logger.info(f"Discovered existing sandbox {discovered.sandbox_id} for thread {thread_id} at {discovered.sandbox_url}")
|
||||||
# Update state if connection info changed
|
|
||||||
if discovered.sandbox_url != info.sandbox_url:
|
|
||||||
self._state_store.save(thread_id, discovered)
|
|
||||||
|
|
||||||
logger.info(f"Recovered sandbox {discovered.sandbox_id} for thread {thread_id} at {discovered.sandbox_url}")
|
|
||||||
return discovered.sandbox_id
|
return discovered.sandbox_id
|
||||||
|
|
||||||
|
return self._create_sandbox(thread_id, sandbox_id)
|
||||||
|
finally:
|
||||||
|
fcntl.flock(lock_file, fcntl.LOCK_UN)
|
||||||
|
|
||||||
|
def _evict_oldest_warm(self) -> str | None:
|
||||||
|
"""Destroy the oldest container in the warm pool to free capacity.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The evicted sandbox_id, or None if warm pool is empty.
|
||||||
|
"""
|
||||||
|
with self._lock:
|
||||||
|
if not self._warm_pool:
|
||||||
|
return None
|
||||||
|
oldest_id = min(self._warm_pool, key=lambda sid: self._warm_pool[sid][1])
|
||||||
|
info, _ = self._warm_pool.pop(oldest_id)
|
||||||
|
|
||||||
|
try:
|
||||||
|
self._backend.destroy(info)
|
||||||
|
logger.info(f"Destroyed warm-pool sandbox {oldest_id}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to destroy warm-pool sandbox {oldest_id}: {e}")
|
||||||
|
return None
|
||||||
|
return oldest_id
|
||||||
|
|
||||||
def _create_sandbox(self, thread_id: str | None, sandbox_id: str) -> str:
|
def _create_sandbox(self, thread_id: str | None, sandbox_id: str) -> str:
|
||||||
"""Create a new sandbox via the backend.
|
"""Create a new sandbox via the backend.
|
||||||
|
|
||||||
@@ -407,6 +474,21 @@ class AioSandboxProvider(SandboxProvider):
|
|||||||
"""
|
"""
|
||||||
extra_mounts = self._get_extra_mounts(thread_id)
|
extra_mounts = self._get_extra_mounts(thread_id)
|
||||||
|
|
||||||
|
# Enforce replicas: active and warm-pool containers both count toward the limit, but only warm-pool containers may be evicted to free a slot.
|
||||||
|
# Active sandboxes are in use by live threads and must not be forcibly stopped.
|
||||||
|
replicas = self._config.get("replicas", DEFAULT_REPLICAS)
|
||||||
|
with self._lock:
|
||||||
|
total = len(self._sandboxes) + len(self._warm_pool)
|
||||||
|
if total >= replicas:
|
||||||
|
evicted = self._evict_oldest_warm()
|
||||||
|
if evicted:
|
||||||
|
logger.info(f"Evicted warm-pool sandbox {evicted} to stay within replicas={replicas}")
|
||||||
|
else:
|
||||||
|
# All slots are occupied by active sandboxes — proceed anyway and log.
|
||||||
|
# The replicas limit is a soft cap; we never forcibly stop a container
|
||||||
|
# that is actively serving a thread.
|
||||||
|
logger.warning(f"All {replicas} replica slots are in active use; creating sandbox {sandbox_id} beyond the soft limit")
|
||||||
|
|
||||||
info = self._backend.create(thread_id, sandbox_id, extra_mounts=extra_mounts or None)
|
info = self._backend.create(thread_id, sandbox_id, extra_mounts=extra_mounts or None)
|
||||||
|
|
||||||
# Wait for sandbox to be ready
|
# Wait for sandbox to be ready
|
||||||
@@ -422,10 +504,6 @@ class AioSandboxProvider(SandboxProvider):
|
|||||||
if thread_id:
|
if thread_id:
|
||||||
self._thread_sandboxes[thread_id] = sandbox_id
|
self._thread_sandboxes[thread_id] = sandbox_id
|
||||||
|
|
||||||
# Persist for cross-process discovery
|
|
||||||
if thread_id:
|
|
||||||
self._state_store.save(thread_id, info)
|
|
||||||
|
|
||||||
logger.info(f"Created sandbox {sandbox_id} for thread {thread_id} at {info.sandbox_url}")
|
logger.info(f"Created sandbox {sandbox_id} for thread {thread_id} at {info.sandbox_url}")
|
||||||
return sandbox_id
|
return sandbox_id
|
||||||
|
|
||||||
@@ -445,7 +523,11 @@ class AioSandboxProvider(SandboxProvider):
|
|||||||
return sandbox
|
return sandbox
|
||||||
|
|
||||||
def release(self, sandbox_id: str) -> None:
|
def release(self, sandbox_id: str) -> None:
|
||||||
"""Release a sandbox: clean up in-memory state, persisted state, and backend resources.
|
"""Release a sandbox from active use into the warm pool.
|
||||||
|
|
||||||
|
The container is kept running so it can be reclaimed quickly by the same
|
||||||
|
thread on its next turn without a cold-start. The container will only be
|
||||||
|
stopped when the replicas limit forces eviction, when it stays in the warm pool longer than idle_timeout, or during shutdown.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
sandbox_id: The ID of the sandbox to release.
|
sandbox_id: The ID of the sandbox to release.
|
||||||
@@ -460,15 +542,40 @@ class AioSandboxProvider(SandboxProvider):
|
|||||||
for tid in thread_ids_to_remove:
|
for tid in thread_ids_to_remove:
|
||||||
del self._thread_sandboxes[tid]
|
del self._thread_sandboxes[tid]
|
||||||
self._last_activity.pop(sandbox_id, None)
|
self._last_activity.pop(sandbox_id, None)
|
||||||
|
# Park in warm pool — container keeps running
|
||||||
|
if info and sandbox_id not in self._warm_pool:
|
||||||
|
self._warm_pool[sandbox_id] = (info, time.time())
|
||||||
|
|
||||||
# Clean up persisted state (outside lock, involves file I/O)
|
logger.info(f"Released sandbox {sandbox_id} to warm pool (container still running)")
|
||||||
|
|
||||||
|
def destroy(self, sandbox_id: str) -> None:
|
||||||
|
"""Destroy a sandbox: stop the container and free all resources.
|
||||||
|
|
||||||
|
Unlike release(), this actually stops the container. Use this for
|
||||||
|
explicit cleanup, capacity-driven eviction, or shutdown.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
sandbox_id: The ID of the sandbox to destroy.
|
||||||
|
"""
|
||||||
|
info = None
|
||||||
|
thread_ids_to_remove: list[str] = []
|
||||||
|
|
||||||
|
with self._lock:
|
||||||
|
self._sandboxes.pop(sandbox_id, None)
|
||||||
|
info = self._sandbox_infos.pop(sandbox_id, None)
|
||||||
|
thread_ids_to_remove = [tid for tid, sid in self._thread_sandboxes.items() if sid == sandbox_id]
|
||||||
for tid in thread_ids_to_remove:
|
for tid in thread_ids_to_remove:
|
||||||
self._state_store.remove(tid)
|
del self._thread_sandboxes[tid]
|
||||||
|
self._last_activity.pop(sandbox_id, None)
|
||||||
|
# Also pull from warm pool if it was parked there
|
||||||
|
if info is None and sandbox_id in self._warm_pool:
|
||||||
|
info, _ = self._warm_pool.pop(sandbox_id)
|
||||||
|
else:
|
||||||
|
self._warm_pool.pop(sandbox_id, None)
|
||||||
|
|
||||||
# Destroy backend resources (stop container, release port, etc.)
|
|
||||||
if info:
|
if info:
|
||||||
self._backend.destroy(info)
|
self._backend.destroy(info)
|
||||||
logger.info(f"Released sandbox {sandbox_id}")
|
logger.info(f"Destroyed sandbox {sandbox_id}")
|
||||||
|
|
||||||
def shutdown(self) -> None:
|
def shutdown(self) -> None:
|
||||||
"""Shutdown all sandboxes. Thread-safe and idempotent."""
|
"""Shutdown all sandboxes. Thread-safe and idempotent."""
|
||||||
@@ -477,6 +584,8 @@ class AioSandboxProvider(SandboxProvider):
|
|||||||
return
|
return
|
||||||
self._shutdown_called = True
|
self._shutdown_called = True
|
||||||
sandbox_ids = list(self._sandboxes.keys())
|
sandbox_ids = list(self._sandboxes.keys())
|
||||||
|
warm_items = list(self._warm_pool.items())
|
||||||
|
self._warm_pool.clear()
|
||||||
|
|
||||||
# Stop idle checker
|
# Stop idle checker
|
||||||
self._idle_checker_stop.set()
|
self._idle_checker_stop.set()
|
||||||
@@ -484,10 +593,17 @@ class AioSandboxProvider(SandboxProvider):
|
|||||||
self._idle_checker_thread.join(timeout=5)
|
self._idle_checker_thread.join(timeout=5)
|
||||||
logger.info("Stopped idle checker thread")
|
logger.info("Stopped idle checker thread")
|
||||||
|
|
||||||
logger.info(f"Shutting down {len(sandbox_ids)} sandbox(es)")
|
logger.info(f"Shutting down {len(sandbox_ids)} active + {len(warm_items)} warm-pool sandbox(es)")
|
||||||
|
|
||||||
for sandbox_id in sandbox_ids:
|
for sandbox_id in sandbox_ids:
|
||||||
try:
|
try:
|
||||||
self.release(sandbox_id)
|
self.destroy(sandbox_id)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to release sandbox {sandbox_id} during shutdown: {e}")
|
logger.error(f"Failed to destroy sandbox {sandbox_id} during shutdown: {e}")
|
||||||
|
|
||||||
|
for sandbox_id, (info, _) in warm_items:
|
||||||
|
try:
|
||||||
|
self._backend.destroy(info)
|
||||||
|
logger.info(f"Destroyed warm-pool sandbox {sandbox_id} during shutdown")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to destroy warm-pool sandbox {sandbox_id} during shutdown: {e}")
|
||||||
|
|||||||
@@ -1,102 +0,0 @@
|
|||||||
"""File-based sandbox state store.
|
|
||||||
|
|
||||||
Uses JSON files for persistence and fcntl file locking for cross-process
|
|
||||||
mutual exclusion. Works across processes on the same machine or across
|
|
||||||
K8s pods with a shared PVC mount.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import fcntl
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
from collections.abc import Generator
|
|
||||||
from contextlib import contextmanager
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from src.config.paths import Paths
|
|
||||||
|
|
||||||
from .sandbox_info import SandboxInfo
|
|
||||||
from .state_store import SandboxStateStore
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
SANDBOX_STATE_FILE = "sandbox.json"
|
|
||||||
SANDBOX_LOCK_FILE = "sandbox.lock"
|
|
||||||
|
|
||||||
|
|
||||||
class FileSandboxStateStore(SandboxStateStore):
|
|
||||||
"""File-based state store using JSON files and fcntl file locking.
|
|
||||||
|
|
||||||
State is stored at: {base_dir}/threads/{thread_id}/sandbox.json
|
|
||||||
Lock files at: {base_dir}/threads/{thread_id}/sandbox.lock
|
|
||||||
|
|
||||||
This works across processes on the same machine sharing a filesystem.
|
|
||||||
For K8s multi-pod scenarios, requires a shared PVC mount at base_dir.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, base_dir: str):
|
|
||||||
"""Initialize the file-based state store.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
base_dir: Root directory for state files (typically Paths.base_dir).
|
|
||||||
"""
|
|
||||||
self._paths = Paths(base_dir)
|
|
||||||
|
|
||||||
def _thread_dir(self, thread_id: str) -> Path:
|
|
||||||
"""Get the directory for a thread's state files."""
|
|
||||||
return self._paths.thread_dir(thread_id)
|
|
||||||
|
|
||||||
def save(self, thread_id: str, info: SandboxInfo) -> None:
|
|
||||||
thread_dir = self._thread_dir(thread_id)
|
|
||||||
os.makedirs(thread_dir, exist_ok=True)
|
|
||||||
state_file = thread_dir / SANDBOX_STATE_FILE
|
|
||||||
try:
|
|
||||||
state_file.write_text(json.dumps(info.to_dict()))
|
|
||||||
logger.info(f"Saved sandbox state for thread {thread_id}: {info.sandbox_id}")
|
|
||||||
except OSError as e:
|
|
||||||
logger.warning(f"Failed to save sandbox state for thread {thread_id}: {e}")
|
|
||||||
|
|
||||||
def load(self, thread_id: str) -> SandboxInfo | None:
|
|
||||||
state_file = self._thread_dir(thread_id) / SANDBOX_STATE_FILE
|
|
||||||
if not state_file.exists():
|
|
||||||
return None
|
|
||||||
try:
|
|
||||||
data = json.loads(state_file.read_text())
|
|
||||||
return SandboxInfo.from_dict(data)
|
|
||||||
except (OSError, json.JSONDecodeError, KeyError) as e:
|
|
||||||
logger.warning(f"Failed to load sandbox state for thread {thread_id}: {e}")
|
|
||||||
return None
|
|
||||||
|
|
||||||
def remove(self, thread_id: str) -> None:
|
|
||||||
state_file = self._thread_dir(thread_id) / SANDBOX_STATE_FILE
|
|
||||||
try:
|
|
||||||
if state_file.exists():
|
|
||||||
state_file.unlink()
|
|
||||||
logger.info(f"Removed sandbox state for thread {thread_id}")
|
|
||||||
except OSError as e:
|
|
||||||
logger.warning(f"Failed to remove sandbox state for thread {thread_id}: {e}")
|
|
||||||
|
|
||||||
@contextmanager
|
|
||||||
def lock(self, thread_id: str) -> Generator[None, None, None]:
|
|
||||||
"""Acquire a cross-process file lock using fcntl.flock.
|
|
||||||
|
|
||||||
The lock is held for the duration of the context manager.
|
|
||||||
Only one process can hold the lock at a time for a given thread_id.
|
|
||||||
|
|
||||||
Note: fcntl.flock is available on macOS and Linux.
|
|
||||||
"""
|
|
||||||
thread_dir = self._thread_dir(thread_id)
|
|
||||||
os.makedirs(thread_dir, exist_ok=True)
|
|
||||||
lock_path = thread_dir / SANDBOX_LOCK_FILE
|
|
||||||
lock_file = open(lock_path, "w")
|
|
||||||
try:
|
|
||||||
fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX)
|
|
||||||
yield
|
|
||||||
finally:
|
|
||||||
try:
|
|
||||||
fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
|
|
||||||
lock_file.close()
|
|
||||||
except OSError:
|
|
||||||
pass
|
|
||||||
@@ -7,6 +7,7 @@ Handles container lifecycle, port allocation, and cross-process container discov
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
from src.utils.network import get_free_port, release_port
|
from src.utils.network import get_free_port, release_port
|
||||||
@@ -104,16 +105,47 @@ class LocalContainerBackend(SandboxBackend):
|
|||||||
RuntimeError: If the container fails to start.
|
RuntimeError: If the container fails to start.
|
||||||
"""
|
"""
|
||||||
container_name = f"{self._container_prefix}-{sandbox_id}"
|
container_name = f"{self._container_prefix}-{sandbox_id}"
|
||||||
port = get_free_port(start_port=self._base_port)
|
|
||||||
|
# Retry loop: if Docker rejects the port (e.g. a stale container still
|
||||||
|
# holds the binding after a process restart), skip that port and try the
|
||||||
|
# next one. The socket-bind check in get_free_port mirrors Docker's
|
||||||
|
# 0.0.0.0 bind, but Docker's port-release can be slightly asynchronous,
|
||||||
|
# so a reactive fallback here ensures we always make progress.
|
||||||
|
_next_start = self._base_port
|
||||||
|
container_id: str | None = None
|
||||||
|
port: int = 0
|
||||||
|
for _attempt in range(10):
|
||||||
|
port = get_free_port(start_port=_next_start)
|
||||||
try:
|
try:
|
||||||
container_id = self._start_container(container_name, port, extra_mounts)
|
container_id = self._start_container(container_name, port, extra_mounts)
|
||||||
except Exception:
|
break
|
||||||
|
except RuntimeError as exc:
|
||||||
release_port(port)
|
release_port(port)
|
||||||
|
err = str(exc)
|
||||||
|
err_lower = err.lower()
|
||||||
|
# Port already bound: skip this port and retry with the next one.
|
||||||
|
if "port is already allocated" in err or "address already in use" in err_lower:
|
||||||
|
logger.warning(f"Port {port} rejected by Docker (already allocated), retrying with next port")
|
||||||
|
_next_start = port + 1
|
||||||
|
continue
|
||||||
|
# Container-name conflict: another process may have already started
|
||||||
|
# the deterministic sandbox container for this sandbox_id. Try to
|
||||||
|
# discover and adopt the existing container instead of failing.
|
||||||
|
if "is already in use by container" in err_lower or "conflict. the container name" in err_lower:
|
||||||
|
logger.warning(f"Container name {container_name} already in use, attempting to discover existing sandbox instance")
|
||||||
|
existing = self.discover(sandbox_id)
|
||||||
|
if existing is not None:
|
||||||
|
return existing
|
||||||
raise
|
raise
|
||||||
|
else:
|
||||||
|
raise RuntimeError("Could not start sandbox container: all candidate ports are already allocated by Docker")
|
||||||
|
|
||||||
|
# When running inside Docker (DooD), sandbox containers are reachable via
|
||||||
|
# host.docker.internal rather than localhost (they run on the host daemon).
|
||||||
|
sandbox_host = os.environ.get("DEER_FLOW_SANDBOX_HOST", "localhost")
|
||||||
return SandboxInfo(
|
return SandboxInfo(
|
||||||
sandbox_id=sandbox_id,
|
sandbox_id=sandbox_id,
|
||||||
sandbox_url=f"http://localhost:{port}",
|
sandbox_url=f"http://{sandbox_host}:{port}",
|
||||||
container_name=container_name,
|
container_name=container_name,
|
||||||
container_id=container_id,
|
container_id=container_id,
|
||||||
)
|
)
|
||||||
@@ -159,7 +191,8 @@ class LocalContainerBackend(SandboxBackend):
|
|||||||
if port is None:
|
if port is None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
sandbox_url = f"http://localhost:{port}"
|
sandbox_host = os.environ.get("DEER_FLOW_SANDBOX_HOST", "localhost")
|
||||||
|
sandbox_url = f"http://{sandbox_host}:{port}"
|
||||||
if not wait_for_sandbox_ready(sandbox_url, timeout=5):
|
if not wait_for_sandbox_ready(sandbox_url, timeout=5):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|||||||
@@ -1,70 +0,0 @@
|
|||||||
"""Abstract base class for sandbox state persistence.
|
|
||||||
|
|
||||||
The state store handles cross-process persistence of thread_id → sandbox mappings,
|
|
||||||
enabling different processes (gateway, langgraph, multiple workers) to find the same
|
|
||||||
sandbox for a given thread.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from abc import ABC, abstractmethod
|
|
||||||
from collections.abc import Generator
|
|
||||||
from contextlib import contextmanager
|
|
||||||
|
|
||||||
from .sandbox_info import SandboxInfo
|
|
||||||
|
|
||||||
|
|
||||||
class SandboxStateStore(ABC):
|
|
||||||
"""Abstract base for persisting thread_id → sandbox mappings across processes.
|
|
||||||
|
|
||||||
Implementations:
|
|
||||||
- FileSandboxStateStore: JSON files + fcntl file locking (single-host)
|
|
||||||
- TODO: RedisSandboxStateStore: Redis-based for distributed multi-host deployments
|
|
||||||
"""
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def save(self, thread_id: str, info: SandboxInfo) -> None:
|
|
||||||
"""Save sandbox state for a thread.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
thread_id: The thread ID.
|
|
||||||
info: Sandbox metadata to persist.
|
|
||||||
"""
|
|
||||||
...
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def load(self, thread_id: str) -> SandboxInfo | None:
|
|
||||||
"""Load sandbox state for a thread.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
thread_id: The thread ID.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
SandboxInfo if found, None otherwise.
|
|
||||||
"""
|
|
||||||
...
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def remove(self, thread_id: str) -> None:
|
|
||||||
"""Remove sandbox state for a thread.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
thread_id: The thread ID.
|
|
||||||
"""
|
|
||||||
...
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
@contextmanager
|
|
||||||
def lock(self, thread_id: str) -> Generator[None, None, None]:
|
|
||||||
"""Acquire a cross-process lock for a thread's sandbox operations.
|
|
||||||
|
|
||||||
Ensures only one process can create/modify a sandbox for a given
|
|
||||||
thread_id at a time, preventing duplicate sandbox creation.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
thread_id: The thread ID to lock.
|
|
||||||
|
|
||||||
Yields:
|
|
||||||
None — use as a context manager.
|
|
||||||
"""
|
|
||||||
...
|
|
||||||
@@ -133,11 +133,15 @@ class ExtensionsConfig(BaseModel):
|
|||||||
# Return empty config if extensions config file is not found
|
# Return empty config if extensions config file is not found
|
||||||
return cls(mcp_servers={}, skills={})
|
return cls(mcp_servers={}, skills={})
|
||||||
|
|
||||||
|
try:
|
||||||
with open(resolved_path, encoding="utf-8") as f:
|
with open(resolved_path, encoding="utf-8") as f:
|
||||||
config_data = json.load(f)
|
config_data = json.load(f)
|
||||||
|
|
||||||
cls.resolve_env_variables(config_data)
|
cls.resolve_env_variables(config_data)
|
||||||
return cls.model_validate(config_data)
|
return cls.model_validate(config_data)
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
raise ValueError(f"Extensions config file at {resolved_path} is not valid JSON: {e}") from e
|
||||||
|
except Exception as e:
|
||||||
|
raise RuntimeError(f"Failed to load extensions config from {resolved_path}: {e}") from e
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def resolve_env_variables(cls, config: dict[str, Any]) -> dict[str, Any]:
|
def resolve_env_variables(cls, config: dict[str, Any]) -> dict[str, Any]:
|
||||||
@@ -156,7 +160,11 @@ class ExtensionsConfig(BaseModel):
|
|||||||
if value.startswith("$"):
|
if value.startswith("$"):
|
||||||
env_value = os.getenv(value[1:])
|
env_value = os.getenv(value[1:])
|
||||||
if env_value is None:
|
if env_value is None:
|
||||||
raise ValueError(f"Environment variable {value[1:]} not found for config value {value}")
|
# Unresolved placeholder — store empty string so downstream
|
||||||
|
# consumers (e.g. MCP servers) don't receive the literal "$VAR"
|
||||||
|
# token as an actual environment value.
|
||||||
|
config[key] = ""
|
||||||
|
else:
|
||||||
config[key] = env_value
|
config[key] = env_value
|
||||||
else:
|
else:
|
||||||
config[key] = value
|
config[key] = value
|
||||||
|
|||||||
@@ -38,6 +38,21 @@ class Paths:
|
|||||||
def __init__(self, base_dir: str | Path | None = None) -> None:
|
def __init__(self, base_dir: str | Path | None = None) -> None:
|
||||||
self._base_dir = Path(base_dir).resolve() if base_dir is not None else None
|
self._base_dir = Path(base_dir).resolve() if base_dir is not None else None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def host_base_dir(self) -> Path:
|
||||||
|
"""Host-visible base dir for Docker volume mount sources.
|
||||||
|
|
||||||
|
When running inside Docker with a mounted Docker socket (DooD), the Docker
|
||||||
|
daemon runs on the host and resolves mount paths against the host filesystem.
|
||||||
|
Set DEER_FLOW_HOST_BASE_DIR to the host-side path that corresponds to this
|
||||||
|
container's base_dir so that sandbox container volume mounts work correctly.
|
||||||
|
|
||||||
|
Falls back to base_dir when the env var is not set (native/local execution).
|
||||||
|
"""
|
||||||
|
if env := os.getenv("DEER_FLOW_HOST_BASE_DIR"):
|
||||||
|
return Path(env)
|
||||||
|
return self.base_dir
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def base_dir(self) -> Path:
|
def base_dir(self) -> Path:
|
||||||
"""Root directory for all application data."""
|
"""Root directory for all application data."""
|
||||||
@@ -124,10 +139,21 @@ class Paths:
|
|||||||
return self.thread_dir(thread_id) / "user-data"
|
return self.thread_dir(thread_id) / "user-data"
|
||||||
|
|
||||||
def ensure_thread_dirs(self, thread_id: str) -> None:
|
def ensure_thread_dirs(self, thread_id: str) -> None:
|
||||||
"""Create all standard sandbox directories for a thread."""
|
"""Create all standard sandbox directories for a thread.
|
||||||
self.sandbox_work_dir(thread_id).mkdir(parents=True, exist_ok=True)
|
|
||||||
self.sandbox_uploads_dir(thread_id).mkdir(parents=True, exist_ok=True)
|
Directories are created with mode 0o777 so that sandbox containers
|
||||||
self.sandbox_outputs_dir(thread_id).mkdir(parents=True, exist_ok=True)
|
(which may run as a different UID than the host backend process) can
|
||||||
|
write to the volume-mounted paths without "Permission denied" errors.
|
||||||
|
The explicit chmod() call is necessary because Path.mkdir(mode=...) is
|
||||||
|
subject to the process umask and may not yield the intended permissions.
|
||||||
|
"""
|
||||||
|
for d in [
|
||||||
|
self.sandbox_work_dir(thread_id),
|
||||||
|
self.sandbox_uploads_dir(thread_id),
|
||||||
|
self.sandbox_outputs_dir(thread_id),
|
||||||
|
]:
|
||||||
|
d.mkdir(parents=True, exist_ok=True)
|
||||||
|
d.chmod(0o777)
|
||||||
|
|
||||||
def resolve_virtual_path(self, thread_id: str, virtual_path: str) -> Path:
|
def resolve_virtual_path(self, thread_id: str, virtual_path: str) -> Path:
|
||||||
"""Resolve a sandbox virtual path to the actual host filesystem path.
|
"""Resolve a sandbox virtual path to the actual host filesystem path.
|
||||||
|
|||||||
@@ -18,8 +18,7 @@ class SandboxConfig(BaseModel):
|
|||||||
AioSandboxProvider specific options:
|
AioSandboxProvider specific options:
|
||||||
image: Docker image to use (default: enterprise-public-cn-beijing.cr.volces.com/vefaas-public/all-in-one-sandbox:latest)
|
image: Docker image to use (default: enterprise-public-cn-beijing.cr.volces.com/vefaas-public/all-in-one-sandbox:latest)
|
||||||
port: Base port for sandbox containers (default: 8080)
|
port: Base port for sandbox containers (default: 8080)
|
||||||
base_url: If set, uses existing sandbox instead of starting new container
|
replicas: Maximum number of concurrent sandbox containers (default: 3). When the limit is reached the least-recently-used sandbox is evicted to make room.
|
||||||
auto_start: Whether to automatically start Docker container (default: true)
|
|
||||||
container_prefix: Prefix for container names (default: deer-flow-sandbox)
|
container_prefix: Prefix for container names (default: deer-flow-sandbox)
|
||||||
idle_timeout: Idle timeout in seconds before sandbox is released (default: 600 = 10 minutes). Set to 0 to disable.
|
idle_timeout: Idle timeout in seconds before sandbox is released (default: 600 = 10 minutes). Set to 0 to disable.
|
||||||
mounts: List of volume mounts to share directories with the container
|
mounts: List of volume mounts to share directories with the container
|
||||||
@@ -38,13 +37,9 @@ class SandboxConfig(BaseModel):
|
|||||||
default=None,
|
default=None,
|
||||||
description="Base port for sandbox containers",
|
description="Base port for sandbox containers",
|
||||||
)
|
)
|
||||||
base_url: str | None = Field(
|
replicas: int | None = Field(
|
||||||
default=None,
|
default=None,
|
||||||
description="If set, uses existing sandbox at this URL instead of starting new container",
|
description="Maximum number of concurrent sandbox containers (default: 3). When the limit is reached the least-recently-used sandbox is evicted to make room.",
|
||||||
)
|
|
||||||
auto_start: bool | None = Field(
|
|
||||||
default=None,
|
|
||||||
description="Whether to automatically start Docker container",
|
|
||||||
)
|
)
|
||||||
container_prefix: str | None = Field(
|
container_prefix: str | None = Field(
|
||||||
default=None,
|
default=None,
|
||||||
|
|||||||
@@ -237,12 +237,12 @@ async def get_skill(skill_name: str) -> SkillResponse:
|
|||||||
"/skills/{skill_name}",
|
"/skills/{skill_name}",
|
||||||
response_model=SkillResponse,
|
response_model=SkillResponse,
|
||||||
summary="Update Skill",
|
summary="Update Skill",
|
||||||
description="Update a skill's enabled status by modifying the skills_state_config.json file.",
|
description="Update a skill's enabled status by modifying the extensions_config.json file.",
|
||||||
)
|
)
|
||||||
async def update_skill(skill_name: str, request: SkillUpdateRequest) -> SkillResponse:
|
async def update_skill(skill_name: str, request: SkillUpdateRequest) -> SkillResponse:
|
||||||
"""Update a skill's enabled status.
|
"""Update a skill's enabled status.
|
||||||
|
|
||||||
This will modify the skills_state_config.json file to update the enabled state.
|
This will modify the extensions_config.json file to update the enabled state.
|
||||||
The SKILL.md file itself is not modified.
|
The SKILL.md file itself is not modified.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import logging
|
||||||
from typing import NotRequired, override
|
from typing import NotRequired, override
|
||||||
|
|
||||||
from langchain.agents import AgentState
|
from langchain.agents import AgentState
|
||||||
@@ -7,6 +8,8 @@ from langgraph.runtime import Runtime
|
|||||||
from src.agents.thread_state import SandboxState, ThreadDataState
|
from src.agents.thread_state import SandboxState, ThreadDataState
|
||||||
from src.sandbox import get_sandbox_provider
|
from src.sandbox import get_sandbox_provider
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class SandboxMiddlewareState(AgentState):
|
class SandboxMiddlewareState(AgentState):
|
||||||
"""Compatible with the `ThreadState` schema."""
|
"""Compatible with the `ThreadState` schema."""
|
||||||
@@ -42,7 +45,7 @@ class SandboxMiddleware(AgentMiddleware[SandboxMiddlewareState]):
|
|||||||
def _acquire_sandbox(self, thread_id: str) -> str:
|
def _acquire_sandbox(self, thread_id: str) -> str:
|
||||||
provider = get_sandbox_provider()
|
provider = get_sandbox_provider()
|
||||||
sandbox_id = provider.acquire(thread_id)
|
sandbox_id = provider.acquire(thread_id)
|
||||||
print(f"Acquiring sandbox {sandbox_id}")
|
logger.info(f"Acquiring sandbox {sandbox_id}")
|
||||||
return sandbox_id
|
return sandbox_id
|
||||||
|
|
||||||
@override
|
@override
|
||||||
@@ -54,7 +57,25 @@ class SandboxMiddleware(AgentMiddleware[SandboxMiddlewareState]):
|
|||||||
# Eager initialization (original behavior)
|
# Eager initialization (original behavior)
|
||||||
if "sandbox" not in state or state["sandbox"] is None:
|
if "sandbox" not in state or state["sandbox"] is None:
|
||||||
thread_id = runtime.context["thread_id"]
|
thread_id = runtime.context["thread_id"]
|
||||||
print(f"Thread ID: {thread_id}")
|
|
||||||
sandbox_id = self._acquire_sandbox(thread_id)
|
sandbox_id = self._acquire_sandbox(thread_id)
|
||||||
|
logger.info(f"Assigned sandbox {sandbox_id} to thread {thread_id}")
|
||||||
return {"sandbox": {"sandbox_id": sandbox_id}}
|
return {"sandbox": {"sandbox_id": sandbox_id}}
|
||||||
return super().before_agent(state, runtime)
|
return super().before_agent(state, runtime)
|
||||||
|
|
||||||
|
@override
|
||||||
|
def after_agent(self, state: SandboxMiddlewareState, runtime: Runtime) -> dict | None:
|
||||||
|
sandbox = state.get("sandbox")
|
||||||
|
if sandbox is not None:
|
||||||
|
sandbox_id = sandbox["sandbox_id"]
|
||||||
|
logger.info(f"Releasing sandbox {sandbox_id}")
|
||||||
|
get_sandbox_provider().release(sandbox_id)
|
||||||
|
return None
|
||||||
|
|
||||||
|
if runtime.context.get("sandbox_id") is not None:
|
||||||
|
sandbox_id = runtime.context.get("sandbox_id")
|
||||||
|
logger.info(f"Releasing sandbox {sandbox_id} from context")
|
||||||
|
get_sandbox_provider().release(sandbox_id)
|
||||||
|
return None
|
||||||
|
|
||||||
|
# No sandbox to release
|
||||||
|
return super().after_agent(state, runtime)
|
||||||
|
|||||||
@@ -135,6 +135,8 @@ def sandbox_from_runtime(runtime: ToolRuntime[ContextT, ThreadState] | None = No
|
|||||||
sandbox = get_sandbox_provider().get(sandbox_id)
|
sandbox = get_sandbox_provider().get(sandbox_id)
|
||||||
if sandbox is None:
|
if sandbox is None:
|
||||||
raise SandboxNotFoundError(f"Sandbox with ID '{sandbox_id}' not found", sandbox_id=sandbox_id)
|
raise SandboxNotFoundError(f"Sandbox with ID '{sandbox_id}' not found", sandbox_id=sandbox_id)
|
||||||
|
|
||||||
|
runtime.context["sandbox_id"] = sandbox_id # Ensure sandbox_id is in context for downstream use
|
||||||
return sandbox
|
return sandbox
|
||||||
|
|
||||||
|
|
||||||
@@ -169,6 +171,7 @@ def ensure_sandbox_initialized(runtime: ToolRuntime[ContextT, ThreadState] | Non
|
|||||||
if sandbox_id is not None:
|
if sandbox_id is not None:
|
||||||
sandbox = get_sandbox_provider().get(sandbox_id)
|
sandbox = get_sandbox_provider().get(sandbox_id)
|
||||||
if sandbox is not None:
|
if sandbox is not None:
|
||||||
|
runtime.context["sandbox_id"] = sandbox_id # Ensure sandbox_id is in context for releasing in after_agent
|
||||||
return sandbox
|
return sandbox
|
||||||
# Sandbox was released, fall through to acquire new one
|
# Sandbox was released, fall through to acquire new one
|
||||||
|
|
||||||
@@ -188,6 +191,7 @@ def ensure_sandbox_initialized(runtime: ToolRuntime[ContextT, ThreadState] | Non
|
|||||||
if sandbox is None:
|
if sandbox is None:
|
||||||
raise SandboxNotFoundError("Sandbox not found after acquisition", sandbox_id=sandbox_id)
|
raise SandboxNotFoundError("Sandbox not found after acquisition", sandbox_id=sandbox_id)
|
||||||
|
|
||||||
|
runtime.context["sandbox_id"] = sandbox_id # Ensure sandbox_id is in context for releasing in after_agent
|
||||||
return sandbox
|
return sandbox
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -44,9 +44,13 @@ class PortAllocator:
|
|||||||
if port in self._reserved_ports:
|
if port in self._reserved_ports:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
# Bind to 0.0.0.0 (wildcard) rather than localhost so that the check
|
||||||
|
# mirrors exactly what Docker does. Docker binds to 0.0.0.0:PORT;
|
||||||
|
# checking only 127.0.0.1 can falsely report a port as available even
|
||||||
|
# when Docker already occupies it on the wildcard address.
|
||||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
||||||
try:
|
try:
|
||||||
s.bind(("localhost", port))
|
s.bind(("0.0.0.0", port))
|
||||||
return True
|
return True
|
||||||
except OSError:
|
except OSError:
|
||||||
return False
|
return False
|
||||||
|
|||||||
@@ -201,9 +201,6 @@ sandbox:
|
|||||||
# sandbox:
|
# sandbox:
|
||||||
# use: src.community.aio_sandbox:AioSandboxProvider
|
# use: src.community.aio_sandbox:AioSandboxProvider
|
||||||
#
|
#
|
||||||
# # Optional: Use existing sandbox at this URL (no container will be started)
|
|
||||||
# # base_url: http://localhost:8080
|
|
||||||
#
|
|
||||||
# # Optional: Container image to use (works with both Docker and Apple Container)
|
# # Optional: Container image to use (works with both Docker and Apple Container)
|
||||||
# # Default: enterprise-public-cn-beijing.cr.volces.com/vefaas-public/all-in-one-sandbox:latest
|
# # Default: enterprise-public-cn-beijing.cr.volces.com/vefaas-public/all-in-one-sandbox:latest
|
||||||
# # Recommended: enterprise-public-cn-beijing.cr.volces.com/vefaas-public/all-in-one-sandbox:latest (works on both x86_64 and arm64)
|
# # Recommended: enterprise-public-cn-beijing.cr.volces.com/vefaas-public/all-in-one-sandbox:latest (works on both x86_64 and arm64)
|
||||||
@@ -211,9 +208,11 @@ sandbox:
|
|||||||
#
|
#
|
||||||
# # Optional: Base port for sandbox containers (default: 8080)
|
# # Optional: Base port for sandbox containers (default: 8080)
|
||||||
# # port: 8080
|
# # port: 8080
|
||||||
#
|
|
||||||
# # Optional: Whether to automatically start Docker container (default: true)
|
# # Optional: Maximum number of concurrent sandbox containers (default: 3)
|
||||||
# # auto_start: true
|
# # When the limit is reached the least-recently-used sandbox is evicted to
|
||||||
|
# # make room for new ones. Use a positive integer here; omit this field to use the default.
|
||||||
|
# # replicas: 3
|
||||||
#
|
#
|
||||||
# # Optional: Prefix for container names (default: deer-flow-sandbox)
|
# # Optional: Prefix for container names (default: deer-flow-sandbox)
|
||||||
# # container_prefix: deer-flow-sandbox
|
# # container_prefix: deer-flow-sandbox
|
||||||
@@ -348,7 +347,6 @@ memory:
|
|||||||
injection_enabled: true # Whether to inject memory into system prompt
|
injection_enabled: true # Whether to inject memory into system prompt
|
||||||
max_injection_tokens: 2000 # Maximum tokens for memory injection
|
max_injection_tokens: 2000 # Maximum tokens for memory injection
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
# Checkpointer Configuration
|
# Checkpointer Configuration
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
@@ -373,9 +371,9 @@ memory:
|
|||||||
# type: memory
|
# type: memory
|
||||||
#
|
#
|
||||||
# SQLite (file-based, single-process):
|
# SQLite (file-based, single-process):
|
||||||
# checkpointer:
|
checkpointer:
|
||||||
# type: sqlite
|
type: sqlite
|
||||||
# connection_string: checkpoints.db
|
connection_string: checkpoints.db
|
||||||
#
|
#
|
||||||
# PostgreSQL (multi-process, production):
|
# PostgreSQL (multi-process, production):
|
||||||
# checkpointer:
|
# checkpointer:
|
||||||
|
|||||||
@@ -111,17 +111,24 @@ services:
|
|||||||
container_name: deer-flow-gateway
|
container_name: deer-flow-gateway
|
||||||
command: sh -c "cd backend && uv run uvicorn src.gateway.app:app --host 0.0.0.0 --port 8001 --reload --reload-include='*.yaml .env' > /app/logs/gateway.log 2>&1"
|
command: sh -c "cd backend && uv run uvicorn src.gateway.app:app --host 0.0.0.0 --port 8001 --reload --reload-include='*.yaml .env' > /app/logs/gateway.log 2>&1"
|
||||||
volumes:
|
volumes:
|
||||||
- ../backend/src:/app/backend/src
|
- ../backend/:/app/backend/
|
||||||
- ../backend/.env:/app/backend/.env
|
# Preserve the .venv built during Docker image build — mounting the full backend/
|
||||||
|
# directory above would otherwise shadow it with the (empty) host directory.
|
||||||
|
- gateway-venv:/app/backend/.venv
|
||||||
- ../config.yaml:/app/config.yaml
|
- ../config.yaml:/app/config.yaml
|
||||||
|
- ../extensions_config.json:/app/extensions_config.json
|
||||||
- ../skills:/app/skills
|
- ../skills:/app/skills
|
||||||
- ../logs:/app/logs
|
- ../logs:/app/logs
|
||||||
- ../backend/.deer-flow:/app/backend/.deer-flow
|
|
||||||
# Mount uv cache for faster dependency installation
|
# Mount uv cache for faster dependency installation
|
||||||
- ~/.cache/uv:/root/.cache/uv
|
- ~/.cache/uv:/root/.cache/uv
|
||||||
|
# DooD: mount the host Docker socket so this service can start sandbox containers via the host Docker daemon.
|
||||||
|
- /var/run/docker.sock:/var/run/docker.sock
|
||||||
working_dir: /app
|
working_dir: /app
|
||||||
environment:
|
environment:
|
||||||
- CI=true
|
- CI=true
|
||||||
|
- DEER_FLOW_HOST_BASE_DIR=${DEER_FLOW_ROOT}/backend/.deer-flow
|
||||||
|
- DEER_FLOW_HOST_SKILLS_PATH=${DEER_FLOW_ROOT}/skills
|
||||||
|
- DEER_FLOW_SANDBOX_HOST=host.docker.internal
|
||||||
env_file:
|
env_file:
|
||||||
- ../.env
|
- ../.env
|
||||||
extra_hosts:
|
extra_hosts:
|
||||||
@@ -140,24 +147,38 @@ services:
|
|||||||
container_name: deer-flow-langgraph
|
container_name: deer-flow-langgraph
|
||||||
command: sh -c "cd backend && uv run langgraph dev --no-browser --allow-blocking --host 0.0.0.0 --port 2024 > /app/logs/langgraph.log 2>&1"
|
command: sh -c "cd backend && uv run langgraph dev --no-browser --allow-blocking --host 0.0.0.0 --port 2024 > /app/logs/langgraph.log 2>&1"
|
||||||
volumes:
|
volumes:
|
||||||
- ../backend/src:/app/backend/src
|
- ../backend/:/app/backend/
|
||||||
- ../backend/.env:/app/backend/.env
|
# Preserve the .venv built during Docker image build — mounting the full backend/
|
||||||
|
# directory above would otherwise shadow it with the (empty) host directory.
|
||||||
|
- langgraph-venv:/app/backend/.venv
|
||||||
- ../config.yaml:/app/config.yaml
|
- ../config.yaml:/app/config.yaml
|
||||||
|
- ../extensions_config.json:/app/extensions_config.json
|
||||||
- ../skills:/app/skills
|
- ../skills:/app/skills
|
||||||
- ../logs:/app/logs
|
- ../logs:/app/logs
|
||||||
- ../backend/.deer-flow:/app/backend/.deer-flow
|
|
||||||
# Mount uv cache for faster dependency installation
|
# Mount uv cache for faster dependency installation
|
||||||
- ~/.cache/uv:/root/.cache/uv
|
- ~/.cache/uv:/root/.cache/uv
|
||||||
|
# DooD: same as gateway — AioSandboxProvider runs inside LangGraph process.
|
||||||
|
- /var/run/docker.sock:/var/run/docker.sock
|
||||||
working_dir: /app
|
working_dir: /app
|
||||||
environment:
|
environment:
|
||||||
- CI=true
|
- CI=true
|
||||||
|
- DEER_FLOW_HOST_BASE_DIR=${DEER_FLOW_ROOT}/backend/.deer-flow
|
||||||
|
- DEER_FLOW_HOST_SKILLS_PATH=${DEER_FLOW_ROOT}/skills
|
||||||
|
- DEER_FLOW_SANDBOX_HOST=host.docker.internal
|
||||||
env_file:
|
env_file:
|
||||||
- ../.env
|
- ../.env
|
||||||
|
extra_hosts:
|
||||||
|
# For Linux: map host.docker.internal to host gateway
|
||||||
|
- "host.docker.internal:host-gateway"
|
||||||
networks:
|
networks:
|
||||||
- deer-flow-dev
|
- deer-flow-dev
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|
||||||
volumes: {}
|
volumes:
|
||||||
|
# Persist .venv across container restarts so dependencies installed during
|
||||||
|
# image build are not shadowed by the host backend/ directory mount.
|
||||||
|
gateway-venv:
|
||||||
|
langgraph-venv:
|
||||||
|
|
||||||
networks:
|
networks:
|
||||||
deer-flow-dev:
|
deer-flow-dev:
|
||||||
|
|||||||
@@ -1,18 +1,25 @@
|
|||||||
{
|
{
|
||||||
"mcpServers": {
|
"mcpServers": {
|
||||||
"filesystem": {
|
"filesystem": {
|
||||||
"enabled": true,
|
"enabled": false,
|
||||||
"type": "stdio",
|
"type": "stdio",
|
||||||
"command": "npx",
|
"command": "npx",
|
||||||
"args": ["-y", "@modelcontextprotocol/server-filesystem", "/path/to/allowed/files"],
|
"args": [
|
||||||
|
"-y",
|
||||||
|
"@modelcontextprotocol/server-filesystem",
|
||||||
|
"/path/to/allowed/files"
|
||||||
|
],
|
||||||
"env": {},
|
"env": {},
|
||||||
"description": "Provides filesystem access within allowed directories"
|
"description": "Provides filesystem access within allowed directories"
|
||||||
},
|
},
|
||||||
"github": {
|
"github": {
|
||||||
"enabled": true,
|
"enabled": false,
|
||||||
"type": "stdio",
|
"type": "stdio",
|
||||||
"command": "npx",
|
"command": "npx",
|
||||||
"args": ["-y", "@modelcontextprotocol/server-github"],
|
"args": [
|
||||||
|
"-y",
|
||||||
|
"@modelcontextprotocol/server-github"
|
||||||
|
],
|
||||||
"env": {
|
"env": {
|
||||||
"GITHUB_TOKEN": "$GITHUB_TOKEN"
|
"GITHUB_TOKEN": "$GITHUB_TOKEN"
|
||||||
},
|
},
|
||||||
@@ -22,50 +29,14 @@
|
|||||||
"enabled": false,
|
"enabled": false,
|
||||||
"type": "stdio",
|
"type": "stdio",
|
||||||
"command": "npx",
|
"command": "npx",
|
||||||
"args": ["-y", "@modelcontextprotocol/server-postgres", "postgresql://localhost/mydb"],
|
"args": [
|
||||||
|
"-y",
|
||||||
|
"@modelcontextprotocol/server-postgres",
|
||||||
|
"postgresql://localhost/mydb"
|
||||||
|
],
|
||||||
"env": {},
|
"env": {},
|
||||||
"description": "PostgreSQL database access"
|
"description": "PostgreSQL database access"
|
||||||
},
|
|
||||||
"my-sse-server": {
|
|
||||||
"type": "sse",
|
|
||||||
"url": "https://api.example.com/mcp",
|
|
||||||
"headers": {
|
|
||||||
"Authorization": "Bearer $API_TOKEN",
|
|
||||||
"X-Custom-Header": "value"
|
|
||||||
},
|
|
||||||
"oauth": {
|
|
||||||
"enabled": true,
|
|
||||||
"token_url": "https://auth.example.com/oauth/token",
|
|
||||||
"grant_type": "client_credentials",
|
|
||||||
"client_id": "$MCP_OAUTH_CLIENT_ID",
|
|
||||||
"client_secret": "$MCP_OAUTH_CLIENT_SECRET",
|
|
||||||
"scope": "mcp.read mcp.write",
|
|
||||||
"audience": "https://api.example.com",
|
|
||||||
"refresh_skew_seconds": 60
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"my-http-server": {
|
"skills": {}
|
||||||
"type": "http",
|
|
||||||
"url": "https://api.example.com/mcp",
|
|
||||||
"headers": {
|
|
||||||
"Authorization": "Bearer $API_TOKEN",
|
|
||||||
"X-Custom-Header": "value"
|
|
||||||
},
|
|
||||||
"oauth": {
|
|
||||||
"enabled": true,
|
|
||||||
"token_url": "https://auth.example.com/oauth/token",
|
|
||||||
"grant_type": "client_credentials",
|
|
||||||
"client_id": "$MCP_OAUTH_CLIENT_ID",
|
|
||||||
"client_secret": "$MCP_OAUTH_CLIENT_SECRET"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"skills": {
|
|
||||||
"pdf-processing": {
|
|
||||||
"enabled": true
|
|
||||||
},
|
|
||||||
"frontend-design": {
|
|
||||||
"enabled": true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
@@ -125,6 +125,39 @@ start() {
|
|||||||
echo ""
|
echo ""
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Ensure config.yaml exists before starting.
|
||||||
|
if [ ! -f "$PROJECT_ROOT/config.yaml" ]; then
|
||||||
|
if [ -f "$PROJECT_ROOT/config.example.yaml" ]; then
|
||||||
|
cp "$PROJECT_ROOT/config.example.yaml" "$PROJECT_ROOT/config.yaml"
|
||||||
|
echo ""
|
||||||
|
echo -e "${YELLOW}============================================================${NC}"
|
||||||
|
echo -e "${YELLOW} config.yaml has been created from config.example.yaml.${NC}"
|
||||||
|
echo -e "${YELLOW} Please edit config.yaml to set your API keys and model ${NC}"
|
||||||
|
echo -e "${YELLOW} configuration before starting DeerFlow. ${NC}"
|
||||||
|
echo -e "${YELLOW}============================================================${NC}"
|
||||||
|
echo ""
|
||||||
|
echo -e "${YELLOW} Edit the file: $PROJECT_ROOT/config.yaml${NC}"
|
||||||
|
echo -e "${YELLOW} Then run: make docker-start${NC}"
|
||||||
|
echo ""
|
||||||
|
exit 0
|
||||||
|
else
|
||||||
|
echo -e "${YELLOW}✗ config.yaml not found and no config.example.yaml to copy from.${NC}"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Ensure extensions_config.json exists as a file before mounting.
|
||||||
|
# Docker creates a directory when bind-mounting a non-existent host path.
|
||||||
|
if [ ! -f "$PROJECT_ROOT/extensions_config.json" ]; then
|
||||||
|
if [ -f "$PROJECT_ROOT/extensions_config.example.json" ]; then
|
||||||
|
cp "$PROJECT_ROOT/extensions_config.example.json" "$PROJECT_ROOT/extensions_config.json"
|
||||||
|
echo -e "${BLUE}Created extensions_config.json from example${NC}"
|
||||||
|
else
|
||||||
|
echo "{}" > "$PROJECT_ROOT/extensions_config.json"
|
||||||
|
echo -e "${BLUE}Created empty extensions_config.json${NC}"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
echo "Building and starting containers..."
|
echo "Building and starting containers..."
|
||||||
cd "$DOCKER_DIR" && $COMPOSE_CMD up --build -d --remove-orphans $services
|
cd "$DOCKER_DIR" && $COMPOSE_CMD up --build -d --remove-orphans $services
|
||||||
echo ""
|
echo ""
|
||||||
@@ -177,8 +210,15 @@ logs() {
|
|||||||
|
|
||||||
# Stop Docker development environment
|
# Stop Docker development environment
|
||||||
stop() {
|
stop() {
|
||||||
|
# DEER_FLOW_ROOT is referenced in docker-compose-dev.yaml; set it before
|
||||||
|
# running compose down to suppress "variable is not set" warnings.
|
||||||
|
if [ -z "$DEER_FLOW_ROOT" ]; then
|
||||||
|
export DEER_FLOW_ROOT="$PROJECT_ROOT"
|
||||||
|
fi
|
||||||
echo "Stopping Docker development services..."
|
echo "Stopping Docker development services..."
|
||||||
cd "$DOCKER_DIR" && $COMPOSE_CMD down
|
cd "$DOCKER_DIR" && $COMPOSE_CMD down
|
||||||
|
echo "Cleaning up sandbox containers..."
|
||||||
|
"$SCRIPT_DIR/cleanup-containers.sh" deer-flow-sandbox 2>/dev/null || true
|
||||||
echo -e "${GREEN}✓ Docker services stopped${NC}"
|
echo -e "${GREEN}✓ Docker services stopped${NC}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ pkill -f "next dev" 2>/dev/null || true
|
|||||||
nginx -c "$REPO_ROOT/docker/nginx/nginx.local.conf" -p "$REPO_ROOT" -s quit 2>/dev/null || true
|
nginx -c "$REPO_ROOT/docker/nginx/nginx.local.conf" -p "$REPO_ROOT" -s quit 2>/dev/null || true
|
||||||
sleep 1
|
sleep 1
|
||||||
pkill -9 nginx 2>/dev/null || true
|
pkill -9 nginx 2>/dev/null || true
|
||||||
|
killall -9 nginx 2>/dev/null || true
|
||||||
./scripts/cleanup-containers.sh deer-flow-sandbox 2>/dev/null || true
|
./scripts/cleanup-containers.sh deer-flow-sandbox 2>/dev/null || true
|
||||||
sleep 1
|
sleep 1
|
||||||
|
|
||||||
@@ -60,9 +61,15 @@ cleanup() {
|
|||||||
pkill -f "langgraph dev" 2>/dev/null || true
|
pkill -f "langgraph dev" 2>/dev/null || true
|
||||||
pkill -f "uvicorn src.gateway.app:app" 2>/dev/null || true
|
pkill -f "uvicorn src.gateway.app:app" 2>/dev/null || true
|
||||||
pkill -f "next dev" 2>/dev/null || true
|
pkill -f "next dev" 2>/dev/null || true
|
||||||
nginx -c "$REPO_ROOT/docker/nginx/nginx.local.conf" -p "$REPO_ROOT" -s quit 2>/dev/null || true
|
# Kill nginx using the captured PID first (most reliable),
|
||||||
|
# then fall back to pkill/killall for any stray nginx workers.
|
||||||
|
if [ -n "${NGINX_PID:-}" ] && kill -0 "$NGINX_PID" 2>/dev/null; then
|
||||||
|
kill -TERM "$NGINX_PID" 2>/dev/null || true
|
||||||
sleep 1
|
sleep 1
|
||||||
|
kill -9 "$NGINX_PID" 2>/dev/null || true
|
||||||
|
fi
|
||||||
pkill -9 nginx 2>/dev/null || true
|
pkill -9 nginx 2>/dev/null || true
|
||||||
|
killall -9 nginx 2>/dev/null || true
|
||||||
echo "Cleaning up sandbox containers..."
|
echo "Cleaning up sandbox containers..."
|
||||||
./scripts/cleanup-containers.sh deer-flow-sandbox 2>/dev/null || true
|
./scripts/cleanup-containers.sh deer-flow-sandbox 2>/dev/null || true
|
||||||
echo "✓ All services stopped"
|
echo "✓ All services stopped"
|
||||||
@@ -106,6 +113,7 @@ echo "✓ Frontend started on localhost:3000"
|
|||||||
|
|
||||||
echo "Starting Nginx reverse proxy..."
|
echo "Starting Nginx reverse proxy..."
|
||||||
nginx -g 'daemon off;' -c "$REPO_ROOT/docker/nginx/nginx.local.conf" -p "$REPO_ROOT" > logs/nginx.log 2>&1 &
|
nginx -g 'daemon off;' -c "$REPO_ROOT/docker/nginx/nginx.local.conf" -p "$REPO_ROOT" > logs/nginx.log 2>&1 &
|
||||||
|
NGINX_PID=$!
|
||||||
./scripts/wait-for-port.sh 2026 10 "Nginx" || {
|
./scripts/wait-for-port.sh 2026 10 "Nginx" || {
|
||||||
echo " See logs/nginx.log for details"
|
echo " See logs/nginx.log for details"
|
||||||
tail -10 logs/nginx.log
|
tail -10 logs/nginx.log
|
||||||
|
|||||||
Reference in New Issue
Block a user