chore(docker): Refactor sandbox state management and improve Docker integration (#1068)

* Refactor sandbox state management and improve Docker integration

- Removed FileSandboxStateStore and SandboxStateStore classes for a cleaner architecture.
- Enhanced LocalContainerBackend to handle port allocation retries and introduced environment variable support for sandbox host configuration.
- Updated Paths class to include host_base_dir for Docker volume mounts and ensured proper permissions for sandbox directories.
- Modified ExtensionsConfig to improve error handling when loading configuration files and adjusted environment variable resolution.
- Updated sandbox configuration to include a replicas option for managing concurrent sandbox containers.
- Improved logging and context management in SandboxMiddleware for better sandbox lifecycle handling.
- Enhanced network port allocation logic to bind to 0.0.0.0 for compatibility with Docker.
- Updated Docker Compose files to ensure proper volume management and environment variable configuration.
- Created scripts to ensure necessary configuration files are present before starting services.
- Cleaned up unused MCP server configurations in extensions_config.example.json.

* Address Copilot review suggestions from PR #1068 (#9)

---------

Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com>
This commit is contained in:
JeffJiang
2026-03-11 10:03:01 +08:00
committed by GitHub
parent 6ae7f0c0ee
commit f836d8e17c
18 changed files with 455 additions and 384 deletions

View File

@@ -7,9 +7,11 @@ RUN apt-get update && apt-get install -y \
build-essential \
&& rm -rf /var/lib/apt/lists/*
# Install uv
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
ENV PATH="/root/.local/bin:$PATH"
# Install Docker CLI (for DooD: allows starting sandbox containers via host Docker socket)
COPY --from=docker:cli /usr/local/bin/docker /usr/local/bin/docker
# Install uv from a pinned versioned image (avoids curl|sh from untrusted remote)
COPY --from=ghcr.io/astral-sh/uv:0.7.20 /uv /uvx /usr/local/bin/
# Set working directory
WORKDIR /app

View File

@@ -1,19 +1,15 @@
from .aio_sandbox import AioSandbox
from .aio_sandbox_provider import AioSandboxProvider
from .backend import SandboxBackend
from .file_state_store import FileSandboxStateStore
from .local_backend import LocalContainerBackend
from .remote_backend import RemoteSandboxBackend
from .sandbox_info import SandboxInfo
from .state_store import SandboxStateStore
__all__ = [
"AioSandbox",
"AioSandboxProvider",
"FileSandboxStateStore",
"LocalContainerBackend",
"RemoteSandboxBackend",
"SandboxBackend",
"SandboxInfo",
"SandboxStateStore",
]

View File

@@ -1,18 +1,17 @@
"""AIO Sandbox Provider — orchestrates sandbox lifecycle with pluggable backends.
This provider composes two abstractions:
This provider composes:
- SandboxBackend: how sandboxes are provisioned (local container vs remote/K8s)
- SandboxStateStore: how thread→sandbox mappings are persisted (file vs Redis)
The provider itself handles:
- In-process caching for fast repeated access
- Thread-safe locking (in-process + cross-process via state store)
- Idle timeout management
- Graceful shutdown with signal handling
- Mount computation (thread-specific, skills)
"""
import atexit
import fcntl
import hashlib
import logging
import os
@@ -22,17 +21,15 @@ import time
import uuid
from src.config import get_app_config
from src.config.paths import VIRTUAL_PATH_PREFIX, get_paths
from src.config.paths import VIRTUAL_PATH_PREFIX, Paths, get_paths
from src.sandbox.sandbox import Sandbox
from src.sandbox.sandbox_provider import SandboxProvider
from .aio_sandbox import AioSandbox
from .backend import SandboxBackend, wait_for_sandbox_ready
from .file_state_store import FileSandboxStateStore
from .local_backend import LocalContainerBackend
from .remote_backend import RemoteSandboxBackend
from .sandbox_info import SandboxInfo
from .state_store import SandboxStateStore
logger = logging.getLogger(__name__)
@@ -41,6 +38,7 @@ DEFAULT_IMAGE = "enterprise-public-cn-beijing.cr.volces.com/vefaas-public/all-in
DEFAULT_PORT = 8080
DEFAULT_CONTAINER_PREFIX = "deer-flow-sandbox"
DEFAULT_IDLE_TIMEOUT = 600 # 10 minutes in seconds
DEFAULT_REPLICAS = 3 # Maximum concurrent sandbox containers
IDLE_CHECK_INTERVAL = 60 # Check every 60 seconds
@@ -48,20 +46,17 @@ class AioSandboxProvider(SandboxProvider):
"""Sandbox provider that manages containers running the AIO sandbox.
Architecture:
This provider composes a SandboxBackend (how to provision) and a
SandboxStateStore (how to persist state), enabling:
This provider composes a SandboxBackend (how to provision), enabling:
- Local Docker/Apple Container mode (auto-start containers)
- Remote/K8s mode (connect to pre-existing sandbox URL)
- Cross-process consistency via file-based or Redis state stores
Configuration options in config.yaml under sandbox:
use: src.community.aio_sandbox:AioSandboxProvider
image: <container image>
port: 8080 # Base port for local containers
base_url: http://... # If set, uses remote backend (K8s/external)
auto_start: true # Whether to auto-start local containers
container_prefix: deer-flow-sandbox
idle_timeout: 600 # Idle timeout in seconds (0 to disable)
replicas: 3 # Max concurrent sandbox containers (LRU eviction when exceeded)
mounts: # Volume mounts for local containers
- host_path: /path/on/host
container_path: /path/in/container
@@ -78,13 +73,17 @@ class AioSandboxProvider(SandboxProvider):
self._thread_sandboxes: dict[str, str] = {} # thread_id -> sandbox_id
self._thread_locks: dict[str, threading.Lock] = {} # thread_id -> in-process lock
self._last_activity: dict[str, float] = {} # sandbox_id -> last activity timestamp
# Warm pool: released sandboxes whose containers are still running.
# Maps sandbox_id -> (SandboxInfo, release_timestamp).
# Containers here can be reclaimed quickly (no cold-start) or destroyed
# when replicas capacity is exhausted.
self._warm_pool: dict[str, tuple[SandboxInfo, float]] = {}
self._shutdown_called = False
self._idle_checker_stop = threading.Event()
self._idle_checker_thread: threading.Thread | None = None
self._config = self._load_config()
self._backend: SandboxBackend = self._create_backend()
self._state_store: SandboxStateStore = self._create_state_store()
# Register shutdown handler
atexit.register(self.shutdown)
@@ -102,16 +101,14 @@ class AioSandboxProvider(SandboxProvider):
Selection logic (checked in order):
1. ``provisioner_url`` set → RemoteSandboxBackend (provisioner mode)
Provisioner dynamically creates Pods + Services in k3s.
2. ``auto_start`` → LocalContainerBackend (Docker / Apple Container)
2. Default → LocalContainerBackend (local mode)
Local provider manages container lifecycle directly (start/stop).
"""
provisioner_url = self._config.get("provisioner_url")
if provisioner_url:
logger.info(f"Using remote sandbox backend with provisioner at {provisioner_url}")
return RemoteSandboxBackend(provisioner_url=provisioner_url)
if not self._config.get("auto_start", True):
raise RuntimeError("auto_start is disabled and no base_url is configured")
logger.info("Using local container sandbox backend")
return LocalContainerBackend(
image=self._config["image"],
@@ -121,21 +118,6 @@ class AioSandboxProvider(SandboxProvider):
environment=self._config["environment"],
)
def _create_state_store(self) -> SandboxStateStore:
"""Create the state store for cross-process sandbox mapping persistence.
Currently uses file-based store. For distributed multi-host deployments,
a Redis-based store can be plugged in here.
"""
# TODO: Support RedisSandboxStateStore for distributed deployments.
# Configuration would be:
# sandbox:
# state_store: redis
# redis_url: redis://localhost:6379/0
# This would enable cross-host sandbox discovery (e.g., multiple K8s pods
# without shared PVC, or multi-node Docker Swarm).
return FileSandboxStateStore(base_dir=str(get_paths().base_dir))
# ── Configuration ────────────────────────────────────────────────────
def _load_config(self) -> dict:
@@ -143,13 +125,15 @@ class AioSandboxProvider(SandboxProvider):
config = get_app_config()
sandbox_config = config.sandbox
idle_timeout = getattr(sandbox_config, "idle_timeout", None)
replicas = getattr(sandbox_config, "replicas", None)
return {
"image": sandbox_config.image or DEFAULT_IMAGE,
"port": sandbox_config.port or DEFAULT_PORT,
"base_url": sandbox_config.base_url,
"auto_start": sandbox_config.auto_start if sandbox_config.auto_start is not None else True,
"container_prefix": sandbox_config.container_prefix or DEFAULT_CONTAINER_PREFIX,
"idle_timeout": getattr(sandbox_config, "idle_timeout", None) or DEFAULT_IDLE_TIMEOUT,
"idle_timeout": idle_timeout if idle_timeout is not None else DEFAULT_IDLE_TIMEOUT,
"replicas": replicas if replicas is not None else DEFAULT_REPLICAS,
"mounts": sandbox_config.mounts or [],
"environment": self._resolve_env_vars(sandbox_config.environment or {}),
# provisioner URL for dynamic pod management (e.g. http://provisioner:8002)
@@ -201,28 +185,38 @@ class AioSandboxProvider(SandboxProvider):
"""Get volume mounts for a thread's data directories.
Creates directories if they don't exist (lazy initialization).
Mount sources use host_base_dir so that when running inside Docker with a
mounted Docker socket (DooD), the host Docker daemon can resolve the paths.
"""
paths = get_paths()
paths.ensure_thread_dirs(thread_id)
mounts = [
(str(paths.sandbox_work_dir(thread_id)), f"{VIRTUAL_PATH_PREFIX}/workspace", False),
(str(paths.sandbox_uploads_dir(thread_id)), f"{VIRTUAL_PATH_PREFIX}/uploads", False),
(str(paths.sandbox_outputs_dir(thread_id)), f"{VIRTUAL_PATH_PREFIX}/outputs", False),
]
# host_paths resolves to the host-side base dir when DEER_FLOW_HOST_BASE_DIR
# is set, otherwise falls back to the container's own base dir (native mode).
host_paths = Paths(base_dir=paths.host_base_dir)
return mounts
return [
(str(host_paths.sandbox_work_dir(thread_id)), f"{VIRTUAL_PATH_PREFIX}/workspace", False),
(str(host_paths.sandbox_uploads_dir(thread_id)), f"{VIRTUAL_PATH_PREFIX}/uploads", False),
(str(host_paths.sandbox_outputs_dir(thread_id)), f"{VIRTUAL_PATH_PREFIX}/outputs", False),
]
@staticmethod
def _get_skills_mount() -> tuple[str, str, bool] | None:
"""Get the skills directory mount configuration."""
"""Get the skills directory mount configuration.
Mount source uses DEER_FLOW_HOST_SKILLS_PATH when running inside Docker (DooD)
so the host Docker daemon can resolve the path.
"""
try:
config = get_app_config()
skills_path = config.skills.get_skills_path()
container_path = config.skills.container_path
if skills_path.exists():
return (str(skills_path), container_path, True) # Read-only for security
# When running inside Docker with DooD, use host-side skills path.
host_skills = os.environ.get("DEER_FLOW_HOST_SKILLS_PATH") or str(skills_path)
return (host_skills, container_path, True) # Read-only for security
except Exception as e:
logger.warning(f"Could not setup skills mount: {e}")
return None
@@ -249,21 +243,53 @@ class AioSandboxProvider(SandboxProvider):
def _cleanup_idle_sandboxes(self, idle_timeout: float) -> None:
current_time = time.time()
sandboxes_to_release = []
active_to_destroy = []
warm_to_destroy: list[tuple[str, SandboxInfo]] = []
with self._lock:
# Active sandboxes: tracked via _last_activity
for sandbox_id, last_activity in self._last_activity.items():
idle_duration = current_time - last_activity
if idle_duration > idle_timeout:
sandboxes_to_release.append(sandbox_id)
logger.info(f"Sandbox {sandbox_id} idle for {idle_duration:.1f}s, marking for release")
active_to_destroy.append(sandbox_id)
logger.info(f"Sandbox {sandbox_id} idle for {idle_duration:.1f}s, marking for destroy")
for sandbox_id in sandboxes_to_release:
# Warm pool: tracked via release_timestamp stored in _warm_pool
for sandbox_id, (info, release_ts) in list(self._warm_pool.items()):
warm_duration = current_time - release_ts
if warm_duration > idle_timeout:
warm_to_destroy.append((sandbox_id, info))
del self._warm_pool[sandbox_id]
logger.info(f"Warm-pool sandbox {sandbox_id} idle for {warm_duration:.1f}s, marking for destroy")
# Destroy active sandboxes (re-verify still idle before acting)
for sandbox_id in active_to_destroy:
try:
logger.info(f"Releasing idle sandbox {sandbox_id}")
self.release(sandbox_id)
# Re-verify the sandbox is still idle under the lock before destroying.
# Between the snapshot above and here, the sandbox may have been
# re-acquired (last_activity updated) or already released/destroyed.
with self._lock:
last_activity = self._last_activity.get(sandbox_id)
if last_activity is None:
# Already released or destroyed by another path — skip.
logger.info(f"Sandbox {sandbox_id} already gone before idle destroy, skipping")
continue
if (time.time() - last_activity) < idle_timeout:
# Re-acquired (activity updated) since the snapshot — skip.
logger.info(f"Sandbox {sandbox_id} was re-acquired before idle destroy, skipping")
continue
logger.info(f"Destroying idle sandbox {sandbox_id}")
self.destroy(sandbox_id)
except Exception as e:
logger.error(f"Failed to release idle sandbox {sandbox_id}: {e}")
logger.error(f"Failed to destroy idle sandbox {sandbox_id}: {e}")
# Destroy warm-pool sandboxes (already removed from _warm_pool under lock above)
for sandbox_id, info in warm_to_destroy:
try:
self._backend.destroy(info)
logger.info(f"Destroyed idle warm-pool sandbox {sandbox_id}")
except Exception as e:
logger.error(f"Failed to destroy idle warm-pool sandbox {sandbox_id}: {e}")
# ── Signal handling ──────────────────────────────────────────────────
@@ -321,11 +347,12 @@ class AioSandboxProvider(SandboxProvider):
return self._acquire_internal(thread_id)
def _acquire_internal(self, thread_id: str | None) -> str:
"""Internal sandbox acquisition with three-layer consistency.
"""Internal sandbox acquisition with two-layer consistency.
Layer 1: In-process cache (fastest, covers same-process repeated access)
Layer 2: Cross-process state store + file lock (covers multi-process)
Layer 3: Backend discovery (covers containers started by other processes)
Layer 2: Backend discovery (covers containers started by other processes;
sandbox_id is deterministic from thread_id so no shared state file
is needed — any process can derive the same container name)
"""
# ── Layer 1: In-process cache (fast path) ──
if thread_id:
@@ -342,55 +369,95 @@ class AioSandboxProvider(SandboxProvider):
# Deterministic ID for thread-specific, random for anonymous
sandbox_id = self._deterministic_sandbox_id(thread_id) if thread_id else str(uuid.uuid4())[:8]
# ── Layer 2 & 3: Cross-process recovery + creation ──
# ── Layer 1.5: Warm pool (container still running, no cold-start) ──
if thread_id:
with self._state_store.lock(thread_id):
# Try to recover from persisted state or discover existing container
recovered_id = self._try_recover(thread_id)
if recovered_id is not None:
return recovered_id
# Nothing to recover — create new sandbox (still under cross-process lock)
with self._lock:
if sandbox_id in self._warm_pool:
info, _ = self._warm_pool.pop(sandbox_id)
sandbox = AioSandbox(id=sandbox_id, base_url=info.sandbox_url)
self._sandboxes[sandbox_id] = sandbox
self._sandbox_infos[sandbox_id] = info
self._last_activity[sandbox_id] = time.time()
self._thread_sandboxes[thread_id] = sandbox_id
logger.info(f"Reclaimed warm-pool sandbox {sandbox_id} for thread {thread_id} at {info.sandbox_url}")
return sandbox_id
# ── Layer 2: Backend discovery + create (protected by cross-process lock) ──
# Use a file lock so that two processes racing to create the same sandbox
# for the same thread_id serialize here: the second process will discover
# the container started by the first instead of hitting a name-conflict.
if thread_id:
return self._discover_or_create_with_lock(thread_id, sandbox_id)
return self._create_sandbox(thread_id, sandbox_id)
def _discover_or_create_with_lock(self, thread_id: str, sandbox_id: str) -> str:
"""Discover an existing sandbox or create a new one under a cross-process file lock.
The file lock serializes concurrent sandbox creation for the same thread_id
across multiple processes, preventing container-name conflicts.
"""
paths = get_paths()
paths.ensure_thread_dirs(thread_id)
lock_path = paths.thread_dir(thread_id) / f"{sandbox_id}.lock"
with open(lock_path, "a") as lock_file:
try:
fcntl.flock(lock_file, fcntl.LOCK_EX)
# Re-check in-process caches under the file lock in case another
# thread in this process won the race while we were waiting.
with self._lock:
if thread_id in self._thread_sandboxes:
existing_id = self._thread_sandboxes[thread_id]
if existing_id in self._sandboxes:
logger.info(f"Reusing in-process sandbox {existing_id} for thread {thread_id} (post-lock check)")
self._last_activity[existing_id] = time.time()
return existing_id
if sandbox_id in self._warm_pool:
info, _ = self._warm_pool.pop(sandbox_id)
sandbox = AioSandbox(id=sandbox_id, base_url=info.sandbox_url)
self._sandboxes[sandbox_id] = sandbox
self._sandbox_infos[sandbox_id] = info
self._last_activity[sandbox_id] = time.time()
self._thread_sandboxes[thread_id] = sandbox_id
logger.info(f"Reclaimed warm-pool sandbox {sandbox_id} for thread {thread_id} (post-lock check)")
return sandbox_id
# Backend discovery: another process may have created the container.
discovered = self._backend.discover(sandbox_id)
if discovered is not None:
sandbox = AioSandbox(id=discovered.sandbox_id, base_url=discovered.sandbox_url)
with self._lock:
self._sandboxes[discovered.sandbox_id] = sandbox
self._sandbox_infos[discovered.sandbox_id] = discovered
self._last_activity[discovered.sandbox_id] = time.time()
self._thread_sandboxes[thread_id] = discovered.sandbox_id
logger.info(f"Discovered existing sandbox {discovered.sandbox_id} for thread {thread_id} at {discovered.sandbox_url}")
return discovered.sandbox_id
return self._create_sandbox(thread_id, sandbox_id)
else:
return self._create_sandbox(thread_id, sandbox_id)
finally:
fcntl.flock(lock_file, fcntl.LOCK_UN)
def _try_recover(self, thread_id: str) -> str | None:
"""Try to recover a sandbox from persisted state or backend discovery.
Called under cross-process lock for the given thread_id.
Args:
thread_id: The thread ID.
def _evict_oldest_warm(self) -> str | None:
"""Destroy the oldest container in the warm pool to free capacity.
Returns:
The sandbox_id if recovery succeeded, None otherwise.
The evicted sandbox_id, or None if warm pool is empty.
"""
info = self._state_store.load(thread_id)
if info is None:
return None
# Re-discover: verifies sandbox is alive and gets current connection info
# (handles cases like port changes after container restart)
discovered = self._backend.discover(info.sandbox_id)
if discovered is None:
logger.info(f"Persisted sandbox {info.sandbox_id} for thread {thread_id} could not be recovered")
self._state_store.remove(thread_id)
return None
# Adopt into this process's memory
sandbox = AioSandbox(id=discovered.sandbox_id, base_url=discovered.sandbox_url)
with self._lock:
self._sandboxes[discovered.sandbox_id] = sandbox
self._sandbox_infos[discovered.sandbox_id] = discovered
self._last_activity[discovered.sandbox_id] = time.time()
self._thread_sandboxes[thread_id] = discovered.sandbox_id
if not self._warm_pool:
return None
oldest_id = min(self._warm_pool, key=lambda sid: self._warm_pool[sid][1])
info, _ = self._warm_pool.pop(oldest_id)
# Update state if connection info changed
if discovered.sandbox_url != info.sandbox_url:
self._state_store.save(thread_id, discovered)
logger.info(f"Recovered sandbox {discovered.sandbox_id} for thread {thread_id} at {discovered.sandbox_url}")
return discovered.sandbox_id
try:
self._backend.destroy(info)
logger.info(f"Destroyed warm-pool sandbox {oldest_id}")
except Exception as e:
logger.error(f"Failed to destroy warm-pool sandbox {oldest_id}: {e}")
return None
return oldest_id
def _create_sandbox(self, thread_id: str | None, sandbox_id: str) -> str:
"""Create a new sandbox via the backend.
@@ -407,6 +474,21 @@ class AioSandboxProvider(SandboxProvider):
"""
extra_mounts = self._get_extra_mounts(thread_id)
# Enforce replicas: only warm-pool containers count toward eviction budget.
# Active sandboxes are in use by live threads and must not be forcibly stopped.
replicas = self._config.get("replicas", DEFAULT_REPLICAS)
with self._lock:
total = len(self._sandboxes) + len(self._warm_pool)
if total >= replicas:
evicted = self._evict_oldest_warm()
if evicted:
logger.info(f"Evicted warm-pool sandbox {evicted} to stay within replicas={replicas}")
else:
# All slots are occupied by active sandboxes — proceed anyway and log.
# The replicas limit is a soft cap; we never forcibly stop a container
# that is actively serving a thread.
logger.warning(f"All {replicas} replica slots are in active use; creating sandbox {sandbox_id} beyond the soft limit")
info = self._backend.create(thread_id, sandbox_id, extra_mounts=extra_mounts or None)
# Wait for sandbox to be ready
@@ -422,10 +504,6 @@ class AioSandboxProvider(SandboxProvider):
if thread_id:
self._thread_sandboxes[thread_id] = sandbox_id
# Persist for cross-process discovery
if thread_id:
self._state_store.save(thread_id, info)
logger.info(f"Created sandbox {sandbox_id} for thread {thread_id} at {info.sandbox_url}")
return sandbox_id
@@ -445,7 +523,11 @@ class AioSandboxProvider(SandboxProvider):
return sandbox
def release(self, sandbox_id: str) -> None:
"""Release a sandbox: clean up in-memory state, persisted state, and backend resources.
"""Release a sandbox from active use into the warm pool.
The container is kept running so it can be reclaimed quickly by the same
thread on its next turn without a cold-start. The container will only be
stopped when the replicas limit forces eviction or during shutdown.
Args:
sandbox_id: The ID of the sandbox to release.
@@ -460,15 +542,40 @@ class AioSandboxProvider(SandboxProvider):
for tid in thread_ids_to_remove:
del self._thread_sandboxes[tid]
self._last_activity.pop(sandbox_id, None)
# Park in warm pool — container keeps running
if info and sandbox_id not in self._warm_pool:
self._warm_pool[sandbox_id] = (info, time.time())
# Clean up persisted state (outside lock, involves file I/O)
for tid in thread_ids_to_remove:
self._state_store.remove(tid)
logger.info(f"Released sandbox {sandbox_id} to warm pool (container still running)")
def destroy(self, sandbox_id: str) -> None:
"""Destroy a sandbox: stop the container and free all resources.
Unlike release(), this actually stops the container. Use this for
explicit cleanup, capacity-driven eviction, or shutdown.
Args:
sandbox_id: The ID of the sandbox to destroy.
"""
info = None
thread_ids_to_remove: list[str] = []
with self._lock:
self._sandboxes.pop(sandbox_id, None)
info = self._sandbox_infos.pop(sandbox_id, None)
thread_ids_to_remove = [tid for tid, sid in self._thread_sandboxes.items() if sid == sandbox_id]
for tid in thread_ids_to_remove:
del self._thread_sandboxes[tid]
self._last_activity.pop(sandbox_id, None)
# Also pull from warm pool if it was parked there
if info is None and sandbox_id in self._warm_pool:
info, _ = self._warm_pool.pop(sandbox_id)
else:
self._warm_pool.pop(sandbox_id, None)
# Destroy backend resources (stop container, release port, etc.)
if info:
self._backend.destroy(info)
logger.info(f"Released sandbox {sandbox_id}")
logger.info(f"Destroyed sandbox {sandbox_id}")
def shutdown(self) -> None:
"""Shutdown all sandboxes. Thread-safe and idempotent."""
@@ -477,6 +584,8 @@ class AioSandboxProvider(SandboxProvider):
return
self._shutdown_called = True
sandbox_ids = list(self._sandboxes.keys())
warm_items = list(self._warm_pool.items())
self._warm_pool.clear()
# Stop idle checker
self._idle_checker_stop.set()
@@ -484,10 +593,17 @@ class AioSandboxProvider(SandboxProvider):
self._idle_checker_thread.join(timeout=5)
logger.info("Stopped idle checker thread")
logger.info(f"Shutting down {len(sandbox_ids)} sandbox(es)")
logger.info(f"Shutting down {len(sandbox_ids)} active + {len(warm_items)} warm-pool sandbox(es)")
for sandbox_id in sandbox_ids:
try:
self.release(sandbox_id)
self.destroy(sandbox_id)
except Exception as e:
logger.error(f"Failed to release sandbox {sandbox_id} during shutdown: {e}")
logger.error(f"Failed to destroy sandbox {sandbox_id} during shutdown: {e}")
for sandbox_id, (info, _) in warm_items:
try:
self._backend.destroy(info)
logger.info(f"Destroyed warm-pool sandbox {sandbox_id} during shutdown")
except Exception as e:
logger.error(f"Failed to destroy warm-pool sandbox {sandbox_id} during shutdown: {e}")

View File

@@ -1,102 +0,0 @@
"""File-based sandbox state store.
Uses JSON files for persistence and fcntl file locking for cross-process
mutual exclusion. Works across processes on the same machine or across
K8s pods with a shared PVC mount.
"""
from __future__ import annotations
import fcntl
import json
import logging
import os
from collections.abc import Generator
from contextlib import contextmanager
from pathlib import Path
from src.config.paths import Paths
from .sandbox_info import SandboxInfo
from .state_store import SandboxStateStore
logger = logging.getLogger(__name__)
SANDBOX_STATE_FILE = "sandbox.json"
SANDBOX_LOCK_FILE = "sandbox.lock"
class FileSandboxStateStore(SandboxStateStore):
    """Persist thread→sandbox mappings as JSON files guarded by fcntl locks.

    On-disk layout:
        {base_dir}/threads/{thread_id}/sandbox.json  — serialized SandboxInfo
        {base_dir}/threads/{thread_id}/sandbox.lock  — cross-process lock file

    Works for any set of processes sharing one filesystem: multiple workers on
    the same machine, or K8s pods with a shared PVC mounted at base_dir.
    """

    def __init__(self, base_dir: str):
        """Create a store rooted at *base_dir* (typically Paths.base_dir)."""
        self._paths = Paths(base_dir)

    def _thread_dir(self, thread_id: str) -> Path:
        """Resolve the per-thread directory holding the state and lock files."""
        return self._paths.thread_dir(thread_id)

    def save(self, thread_id: str, info: SandboxInfo) -> None:
        """Write *info* to the thread's sandbox.json (best-effort; logs on failure)."""
        target_dir = self._thread_dir(thread_id)
        os.makedirs(target_dir, exist_ok=True)
        target = target_dir / SANDBOX_STATE_FILE
        try:
            target.write_text(json.dumps(info.to_dict()))
            logger.info(f"Saved sandbox state for thread {thread_id}: {info.sandbox_id}")
        except OSError as e:
            # Best-effort persistence: a failed save degrades cross-process
            # discovery but must not break the caller's sandbox operation.
            logger.warning(f"Failed to save sandbox state for thread {thread_id}: {e}")

    def load(self, thread_id: str) -> SandboxInfo | None:
        """Read the thread's sandbox.json; return None if missing or unparsable."""
        source = self._thread_dir(thread_id) / SANDBOX_STATE_FILE
        if not source.exists():
            return None
        try:
            return SandboxInfo.from_dict(json.loads(source.read_text()))
        except (OSError, json.JSONDecodeError, KeyError) as e:
            # Corrupt or partially-written state is treated as absent.
            logger.warning(f"Failed to load sandbox state for thread {thread_id}: {e}")
            return None

    def remove(self, thread_id: str) -> None:
        """Delete the thread's sandbox.json if present (best-effort)."""
        target = self._thread_dir(thread_id) / SANDBOX_STATE_FILE
        try:
            if target.exists():
                target.unlink()
                logger.info(f"Removed sandbox state for thread {thread_id}")
        except OSError as e:
            logger.warning(f"Failed to remove sandbox state for thread {thread_id}: {e}")

    @contextmanager
    def lock(self, thread_id: str) -> Generator[None, None, None]:
        """Hold an exclusive cross-process lock for *thread_id*.

        Uses fcntl.flock on a dedicated lock file, held for the duration of
        the context. Only one process at a time can hold the lock for a given
        thread_id. fcntl.flock is available on macOS and Linux.
        """
        directory = self._thread_dir(thread_id)
        os.makedirs(directory, exist_ok=True)
        handle = open(directory / SANDBOX_LOCK_FILE, "w")
        try:
            fcntl.flock(handle.fileno(), fcntl.LOCK_EX)
            yield
        finally:
            # Unlock/close failures are swallowed: the OS releases flock locks
            # when the file descriptor closes, so there is nothing to recover.
            try:
                fcntl.flock(handle.fileno(), fcntl.LOCK_UN)
                handle.close()
            except OSError:
                pass

View File

@@ -7,6 +7,7 @@ Handles container lifecycle, port allocation, and cross-process container discov
from __future__ import annotations
import logging
import os
import subprocess
from src.utils.network import get_free_port, release_port
@@ -104,16 +105,47 @@ class LocalContainerBackend(SandboxBackend):
RuntimeError: If the container fails to start.
"""
container_name = f"{self._container_prefix}-{sandbox_id}"
port = get_free_port(start_port=self._base_port)
try:
container_id = self._start_container(container_name, port, extra_mounts)
except Exception:
release_port(port)
raise
# Retry loop: if Docker rejects the port (e.g. a stale container still
# holds the binding after a process restart), skip that port and try the
# next one. The socket-bind check in get_free_port mirrors Docker's
# 0.0.0.0 bind, but Docker's port-release can be slightly asynchronous,
# so a reactive fallback here ensures we always make progress.
_next_start = self._base_port
container_id: str | None = None
port: int = 0
for _attempt in range(10):
port = get_free_port(start_port=_next_start)
try:
container_id = self._start_container(container_name, port, extra_mounts)
break
except RuntimeError as exc:
release_port(port)
err = str(exc)
err_lower = err.lower()
# Port already bound: skip this port and retry with the next one.
if "port is already allocated" in err or "address already in use" in err_lower:
logger.warning(f"Port {port} rejected by Docker (already allocated), retrying with next port")
_next_start = port + 1
continue
# Container-name conflict: another process may have already started
# the deterministic sandbox container for this sandbox_id. Try to
# discover and adopt the existing container instead of failing.
if "is already in use by container" in err_lower or "conflict. the container name" in err_lower:
logger.warning(f"Container name {container_name} already in use, attempting to discover existing sandbox instance")
existing = self.discover(sandbox_id)
if existing is not None:
return existing
raise
else:
raise RuntimeError("Could not start sandbox container: all candidate ports are already allocated by Docker")
# When running inside Docker (DooD), sandbox containers are reachable via
# host.docker.internal rather than localhost (they run on the host daemon).
sandbox_host = os.environ.get("DEER_FLOW_SANDBOX_HOST", "localhost")
return SandboxInfo(
sandbox_id=sandbox_id,
sandbox_url=f"http://localhost:{port}",
sandbox_url=f"http://{sandbox_host}:{port}",
container_name=container_name,
container_id=container_id,
)
@@ -159,7 +191,8 @@ class LocalContainerBackend(SandboxBackend):
if port is None:
return None
sandbox_url = f"http://localhost:{port}"
sandbox_host = os.environ.get("DEER_FLOW_SANDBOX_HOST", "localhost")
sandbox_url = f"http://{sandbox_host}:{port}"
if not wait_for_sandbox_ready(sandbox_url, timeout=5):
return None

View File

@@ -1,70 +0,0 @@
"""Abstract base class for sandbox state persistence.
The state store handles cross-process persistence of thread_id → sandbox mappings,
enabling different processes (gateway, langgraph, multiple workers) to find the same
sandbox for a given thread.
"""
from __future__ import annotations
from abc import ABC, abstractmethod
from collections.abc import Generator
from contextlib import contextmanager
from .sandbox_info import SandboxInfo
class SandboxStateStore(ABC):
    """Interface for persisting thread_id → sandbox mappings across processes.

    Lets different processes (gateway, langgraph, multiple workers) agree on
    which sandbox serves a given thread.

    Implementations:
    - FileSandboxStateStore: JSON files + fcntl file locking (single-host)
    - TODO: RedisSandboxStateStore: Redis-based for distributed multi-host deployments
    """

    @abstractmethod
    def save(self, thread_id: str, info: SandboxInfo) -> None:
        """Persist *info* as the sandbox mapping for *thread_id*.

        Args:
            thread_id: The thread ID.
            info: Sandbox metadata to persist.
        """
        ...

    @abstractmethod
    def load(self, thread_id: str) -> SandboxInfo | None:
        """Return the stored SandboxInfo for *thread_id*, or None if absent.

        Args:
            thread_id: The thread ID.
        """
        ...

    @abstractmethod
    def remove(self, thread_id: str) -> None:
        """Delete any stored sandbox mapping for *thread_id*.

        Args:
            thread_id: The thread ID.
        """
        ...

    @abstractmethod
    @contextmanager
    def lock(self, thread_id: str) -> Generator[None, None, None]:
        """Hold a cross-process lock for *thread_id* while the context is open.

        Ensures only one process can create/modify the sandbox for a given
        thread_id at a time, preventing duplicate sandbox creation.

        Args:
            thread_id: The thread ID to lock.

        Yields:
            None — use as a context manager.
        """
        ...

View File

@@ -133,11 +133,15 @@ class ExtensionsConfig(BaseModel):
# Return empty config if extensions config file is not found
return cls(mcp_servers={}, skills={})
with open(resolved_path, encoding="utf-8") as f:
config_data = json.load(f)
cls.resolve_env_variables(config_data)
return cls.model_validate(config_data)
try:
with open(resolved_path, encoding="utf-8") as f:
config_data = json.load(f)
cls.resolve_env_variables(config_data)
return cls.model_validate(config_data)
except json.JSONDecodeError as e:
raise ValueError(f"Extensions config file at {resolved_path} is not valid JSON: {e}") from e
except Exception as e:
raise RuntimeError(f"Failed to load extensions config from {resolved_path}: {e}") from e
@classmethod
def resolve_env_variables(cls, config: dict[str, Any]) -> dict[str, Any]:
@@ -156,8 +160,12 @@ class ExtensionsConfig(BaseModel):
if value.startswith("$"):
env_value = os.getenv(value[1:])
if env_value is None:
raise ValueError(f"Environment variable {value[1:]} not found for config value {value}")
config[key] = env_value
# Unresolved placeholder — store empty string so downstream
# consumers (e.g. MCP servers) don't receive the literal "$VAR"
# token as an actual environment value.
config[key] = ""
else:
config[key] = env_value
else:
config[key] = value
elif isinstance(value, dict):

View File

@@ -38,6 +38,21 @@ class Paths:
def __init__(self, base_dir: str | Path | None = None) -> None:
self._base_dir = Path(base_dir).resolve() if base_dir is not None else None
@property
def host_base_dir(self) -> Path:
"""Host-visible base dir for Docker volume mount sources.
When running inside Docker with a mounted Docker socket (DooD), the Docker
daemon runs on the host and resolves mount paths against the host filesystem.
Set DEER_FLOW_HOST_BASE_DIR to the host-side path that corresponds to this
container's base_dir so that sandbox container volume mounts work correctly.
Falls back to base_dir when the env var is not set (native/local execution).
"""
if env := os.getenv("DEER_FLOW_HOST_BASE_DIR"):
return Path(env)
return self.base_dir
@property
def base_dir(self) -> Path:
"""Root directory for all application data."""
@@ -124,10 +139,21 @@ class Paths:
return self.thread_dir(thread_id) / "user-data"
def ensure_thread_dirs(self, thread_id: str) -> None:
"""Create all standard sandbox directories for a thread."""
self.sandbox_work_dir(thread_id).mkdir(parents=True, exist_ok=True)
self.sandbox_uploads_dir(thread_id).mkdir(parents=True, exist_ok=True)
self.sandbox_outputs_dir(thread_id).mkdir(parents=True, exist_ok=True)
"""Create all standard sandbox directories for a thread.
Directories are created with mode 0o777 so that sandbox containers
(which may run as a different UID than the host backend process) can
write to the volume-mounted paths without "Permission denied" errors.
The explicit chmod() call is necessary because Path.mkdir(mode=...) is
subject to the process umask and may not yield the intended permissions.
"""
for d in [
self.sandbox_work_dir(thread_id),
self.sandbox_uploads_dir(thread_id),
self.sandbox_outputs_dir(thread_id),
]:
d.mkdir(parents=True, exist_ok=True)
d.chmod(0o777)
def resolve_virtual_path(self, thread_id: str, virtual_path: str) -> Path:
"""Resolve a sandbox virtual path to the actual host filesystem path.

View File

@@ -18,8 +18,7 @@ class SandboxConfig(BaseModel):
AioSandboxProvider specific options:
image: Docker image to use (default: enterprise-public-cn-beijing.cr.volces.com/vefaas-public/all-in-one-sandbox:latest)
port: Base port for sandbox containers (default: 8080)
base_url: If set, uses existing sandbox instead of starting new container
auto_start: Whether to automatically start Docker container (default: true)
replicas: Maximum number of concurrent sandbox containers (default: 3). When the limit is reached the least-recently-used sandbox is evicted to make room.
container_prefix: Prefix for container names (default: deer-flow-sandbox)
idle_timeout: Idle timeout in seconds before sandbox is released (default: 600 = 10 minutes). Set to 0 to disable.
mounts: List of volume mounts to share directories with the container
@@ -38,13 +37,9 @@ class SandboxConfig(BaseModel):
default=None,
description="Base port for sandbox containers",
)
base_url: str | None = Field(
replicas: int | None = Field(
default=None,
description="If set, uses existing sandbox at this URL instead of starting new container",
)
auto_start: bool | None = Field(
default=None,
description="Whether to automatically start Docker container",
description="Maximum number of concurrent sandbox containers (default: 3). When the limit is reached the least-recently-used sandbox is evicted to make room.",
)
container_prefix: str | None = Field(
default=None,

View File

@@ -237,12 +237,12 @@ async def get_skill(skill_name: str) -> SkillResponse:
"/skills/{skill_name}",
response_model=SkillResponse,
summary="Update Skill",
description="Update a skill's enabled status by modifying the skills_state_config.json file.",
description="Update a skill's enabled status by modifying the extensions_config.json file.",
)
async def update_skill(skill_name: str, request: SkillUpdateRequest) -> SkillResponse:
"""Update a skill's enabled status.
This will modify the skills_state_config.json file to update the enabled state.
This will modify the extensions_config.json file to update the enabled state.
The SKILL.md file itself is not modified.
Args:

View File

@@ -1,3 +1,4 @@
import logging
from typing import NotRequired, override
from langchain.agents import AgentState
@@ -7,6 +8,8 @@ from langgraph.runtime import Runtime
from src.agents.thread_state import SandboxState, ThreadDataState
from src.sandbox import get_sandbox_provider
logger = logging.getLogger(__name__)
class SandboxMiddlewareState(AgentState):
"""Compatible with the `ThreadState` schema."""
@@ -42,7 +45,7 @@ class SandboxMiddleware(AgentMiddleware[SandboxMiddlewareState]):
def _acquire_sandbox(self, thread_id: str) -> str:
provider = get_sandbox_provider()
sandbox_id = provider.acquire(thread_id)
print(f"Acquiring sandbox {sandbox_id}")
logger.info(f"Acquiring sandbox {sandbox_id}")
return sandbox_id
@override
@@ -54,7 +57,25 @@ class SandboxMiddleware(AgentMiddleware[SandboxMiddlewareState]):
# Eager initialization (original behavior)
if "sandbox" not in state or state["sandbox"] is None:
thread_id = runtime.context["thread_id"]
print(f"Thread ID: {thread_id}")
sandbox_id = self._acquire_sandbox(thread_id)
logger.info(f"Assigned sandbox {sandbox_id} to thread {thread_id}")
return {"sandbox": {"sandbox_id": sandbox_id}}
return super().before_agent(state, runtime)
@override
def after_agent(self, state: SandboxMiddlewareState, runtime: Runtime) -> dict | None:
sandbox = state.get("sandbox")
if sandbox is not None:
sandbox_id = sandbox["sandbox_id"]
logger.info(f"Releasing sandbox {sandbox_id}")
get_sandbox_provider().release(sandbox_id)
return None
if runtime.context.get("sandbox_id") is not None:
sandbox_id = runtime.context.get("sandbox_id")
logger.info(f"Releasing sandbox {sandbox_id} from context")
get_sandbox_provider().release(sandbox_id)
return None
# No sandbox to release
return super().after_agent(state, runtime)

View File

@@ -135,6 +135,8 @@ def sandbox_from_runtime(runtime: ToolRuntime[ContextT, ThreadState] | None = No
sandbox = get_sandbox_provider().get(sandbox_id)
if sandbox is None:
raise SandboxNotFoundError(f"Sandbox with ID '{sandbox_id}' not found", sandbox_id=sandbox_id)
runtime.context["sandbox_id"] = sandbox_id # Ensure sandbox_id is in context for downstream use
return sandbox
@@ -169,6 +171,7 @@ def ensure_sandbox_initialized(runtime: ToolRuntime[ContextT, ThreadState] | Non
if sandbox_id is not None:
sandbox = get_sandbox_provider().get(sandbox_id)
if sandbox is not None:
runtime.context["sandbox_id"] = sandbox_id # Ensure sandbox_id is in context for releasing in after_agent
return sandbox
# Sandbox was released, fall through to acquire new one
@@ -188,6 +191,7 @@ def ensure_sandbox_initialized(runtime: ToolRuntime[ContextT, ThreadState] | Non
if sandbox is None:
raise SandboxNotFoundError("Sandbox not found after acquisition", sandbox_id=sandbox_id)
runtime.context["sandbox_id"] = sandbox_id # Ensure sandbox_id is in context for releasing in after_agent
return sandbox

View File

@@ -44,9 +44,13 @@ class PortAllocator:
if port in self._reserved_ports:
return False
# Bind to 0.0.0.0 (wildcard) rather than localhost so that the check
# mirrors exactly what Docker does. Docker binds to 0.0.0.0:PORT;
# checking only 127.0.0.1 can falsely report a port as available even
# when Docker already occupies it on the wildcard address.
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
try:
s.bind(("localhost", port))
s.bind(("0.0.0.0", port))
return True
except OSError:
return False

View File

@@ -201,9 +201,6 @@ sandbox:
# sandbox:
# use: src.community.aio_sandbox:AioSandboxProvider
#
# # Optional: Use existing sandbox at this URL (no container will be started)
# # base_url: http://localhost:8080
#
# # Optional: Container image to use (works with both Docker and Apple Container)
# # Default: enterprise-public-cn-beijing.cr.volces.com/vefaas-public/all-in-one-sandbox:latest
# # Recommended: enterprise-public-cn-beijing.cr.volces.com/vefaas-public/all-in-one-sandbox:latest (works on both x86_64 and arm64)
@@ -211,9 +208,11 @@ sandbox:
#
# # Optional: Base port for sandbox containers (default: 8080)
# # port: 8080
#
# # Optional: Whether to automatically start Docker container (default: true)
# # auto_start: true
# # Optional: Maximum number of concurrent sandbox containers (default: 3)
# # When the limit is reached the least-recently-used sandbox is evicted to
# # make room for new ones. Use a positive integer here; omit this field to use the default.
# # replicas: 3
#
# # Optional: Prefix for container names (default: deer-flow-sandbox)
# # container_prefix: deer-flow-sandbox
@@ -348,7 +347,6 @@ memory:
injection_enabled: true # Whether to inject memory into system prompt
max_injection_tokens: 2000 # Maximum tokens for memory injection
# ============================================================================
# Checkpointer Configuration
# ============================================================================
@@ -373,9 +371,9 @@ memory:
# type: memory
#
# SQLite (file-based, single-process):
# checkpointer:
# type: sqlite
# connection_string: checkpoints.db
checkpointer:
type: sqlite
connection_string: checkpoints.db
#
# PostgreSQL (multi-process, production):
# checkpointer:

View File

@@ -111,17 +111,24 @@ services:
container_name: deer-flow-gateway
command: sh -c "cd backend && uv run uvicorn src.gateway.app:app --host 0.0.0.0 --port 8001 --reload --reload-include='*.yaml .env' > /app/logs/gateway.log 2>&1"
volumes:
- ../backend/src:/app/backend/src
- ../backend/.env:/app/backend/.env
- ../backend/:/app/backend/
# Preserve the .venv built during Docker image build — mounting the full backend/
# directory above would otherwise shadow it with the (empty) host directory.
- gateway-venv:/app/backend/.venv
- ../config.yaml:/app/config.yaml
- ../extensions_config.json:/app/extensions_config.json
- ../skills:/app/skills
- ../logs:/app/logs
- ../backend/.deer-flow:/app/backend/.deer-flow
# Mount uv cache for faster dependency installation
- ~/.cache/uv:/root/.cache/uv
# DooD: same as gateway — AioSandboxProvider runs inside LangGraph process.
- /var/run/docker.sock:/var/run/docker.sock
working_dir: /app
environment:
- CI=true
- DEER_FLOW_HOST_BASE_DIR=${DEER_FLOW_ROOT}/backend/.deer-flow
- DEER_FLOW_HOST_SKILLS_PATH=${DEER_FLOW_ROOT}/skills
- DEER_FLOW_SANDBOX_HOST=host.docker.internal
env_file:
- ../.env
extra_hosts:
@@ -140,24 +147,38 @@ services:
container_name: deer-flow-langgraph
command: sh -c "cd backend && uv run langgraph dev --no-browser --allow-blocking --host 0.0.0.0 --port 2024 > /app/logs/langgraph.log 2>&1"
volumes:
- ../backend/src:/app/backend/src
- ../backend/.env:/app/backend/.env
- ../backend/:/app/backend/
# Preserve the .venv built during Docker image build — mounting the full backend/
# directory above would otherwise shadow it with the (empty) host directory.
- langgraph-venv:/app/backend/.venv
- ../config.yaml:/app/config.yaml
- ../extensions_config.json:/app/extensions_config.json
- ../skills:/app/skills
- ../logs:/app/logs
- ../backend/.deer-flow:/app/backend/.deer-flow
# Mount uv cache for faster dependency installation
- ~/.cache/uv:/root/.cache/uv
# DooD: same as gateway — AioSandboxProvider runs inside LangGraph process.
- /var/run/docker.sock:/var/run/docker.sock
working_dir: /app
environment:
- CI=true
- DEER_FLOW_HOST_BASE_DIR=${DEER_FLOW_ROOT}/backend/.deer-flow
- DEER_FLOW_HOST_SKILLS_PATH=${DEER_FLOW_ROOT}/skills
- DEER_FLOW_SANDBOX_HOST=host.docker.internal
env_file:
- ../.env
extra_hosts:
# For Linux: map host.docker.internal to host gateway
- "host.docker.internal:host-gateway"
networks:
- deer-flow-dev
restart: unless-stopped
volumes: {}
volumes:
# Persist .venv across container restarts so dependencies installed during
# image build are not shadowed by the host backend/ directory mount.
gateway-venv:
langgraph-venv:
networks:
deer-flow-dev:

View File

@@ -1,18 +1,25 @@
{
"mcpServers": {
"filesystem": {
"enabled": true,
"enabled": false,
"type": "stdio",
"command": "npx",
"args": ["-y", "@modelcontextprotocol/server-filesystem", "/path/to/allowed/files"],
"args": [
"-y",
"@modelcontextprotocol/server-filesystem",
"/path/to/allowed/files"
],
"env": {},
"description": "Provides filesystem access within allowed directories"
},
"github": {
"enabled": true,
"enabled": false,
"type": "stdio",
"command": "npx",
"args": ["-y", "@modelcontextprotocol/server-github"],
"args": [
"-y",
"@modelcontextprotocol/server-github"
],
"env": {
"GITHUB_TOKEN": "$GITHUB_TOKEN"
},
@@ -22,50 +29,14 @@
"enabled": false,
"type": "stdio",
"command": "npx",
"args": ["-y", "@modelcontextprotocol/server-postgres", "postgresql://localhost/mydb"],
"args": [
"-y",
"@modelcontextprotocol/server-postgres",
"postgresql://localhost/mydb"
],
"env": {},
"description": "PostgreSQL database access"
},
"my-sse-server": {
"type": "sse",
"url": "https://api.example.com/mcp",
"headers": {
"Authorization": "Bearer $API_TOKEN",
"X-Custom-Header": "value"
},
"oauth": {
"enabled": true,
"token_url": "https://auth.example.com/oauth/token",
"grant_type": "client_credentials",
"client_id": "$MCP_OAUTH_CLIENT_ID",
"client_secret": "$MCP_OAUTH_CLIENT_SECRET",
"scope": "mcp.read mcp.write",
"audience": "https://api.example.com",
"refresh_skew_seconds": 60
}
},
"my-http-server": {
"type": "http",
"url": "https://api.example.com/mcp",
"headers": {
"Authorization": "Bearer $API_TOKEN",
"X-Custom-Header": "value"
},
"oauth": {
"enabled": true,
"token_url": "https://auth.example.com/oauth/token",
"grant_type": "client_credentials",
"client_id": "$MCP_OAUTH_CLIENT_ID",
"client_secret": "$MCP_OAUTH_CLIENT_SECRET"
}
}
},
"skills": {
"pdf-processing": {
"enabled": true
},
"frontend-design": {
"enabled": true
}
}
}
},
"skills": {}
}

View File

@@ -125,6 +125,39 @@ start() {
echo ""
fi
# Ensure config.yaml exists before starting.
if [ ! -f "$PROJECT_ROOT/config.yaml" ]; then
if [ -f "$PROJECT_ROOT/config.example.yaml" ]; then
cp "$PROJECT_ROOT/config.example.yaml" "$PROJECT_ROOT/config.yaml"
echo ""
echo -e "${YELLOW}============================================================${NC}"
echo -e "${YELLOW} config.yaml has been created from config.example.yaml.${NC}"
echo -e "${YELLOW} Please edit config.yaml to set your API keys and model ${NC}"
echo -e "${YELLOW} configuration before starting DeerFlow. ${NC}"
echo -e "${YELLOW}============================================================${NC}"
echo ""
echo -e "${YELLOW} Edit the file: $PROJECT_ROOT/config.yaml${NC}"
echo -e "${YELLOW} Then run: make docker-start${NC}"
echo ""
exit 0
else
echo -e "${YELLOW}✗ config.yaml not found and no config.example.yaml to copy from.${NC}"
exit 1
fi
fi
# Ensure extensions_config.json exists as a file before mounting.
# Docker creates a directory when bind-mounting a non-existent host path.
if [ ! -f "$PROJECT_ROOT/extensions_config.json" ]; then
if [ -f "$PROJECT_ROOT/extensions_config.example.json" ]; then
cp "$PROJECT_ROOT/extensions_config.example.json" "$PROJECT_ROOT/extensions_config.json"
echo -e "${BLUE}Created extensions_config.json from example${NC}"
else
echo "{}" > "$PROJECT_ROOT/extensions_config.json"
echo -e "${BLUE}Created empty extensions_config.json${NC}"
fi
fi
echo "Building and starting containers..."
cd "$DOCKER_DIR" && $COMPOSE_CMD up --build -d --remove-orphans $services
echo ""
@@ -177,8 +210,15 @@ logs() {
# Stop Docker development environment
stop() {
# DEER_FLOW_ROOT is referenced in docker-compose-dev.yaml; set it before
# running compose down to suppress "variable is not set" warnings.
if [ -z "$DEER_FLOW_ROOT" ]; then
export DEER_FLOW_ROOT="$PROJECT_ROOT"
fi
echo "Stopping Docker development services..."
cd "$DOCKER_DIR" && $COMPOSE_CMD down
echo "Cleaning up sandbox containers..."
"$SCRIPT_DIR/cleanup-containers.sh" deer-flow-sandbox 2>/dev/null || true
echo -e "${GREEN}✓ Docker services stopped${NC}"
}

View File

@@ -18,6 +18,7 @@ pkill -f "next dev" 2>/dev/null || true
nginx -c "$REPO_ROOT/docker/nginx/nginx.local.conf" -p "$REPO_ROOT" -s quit 2>/dev/null || true
sleep 1
pkill -9 nginx 2>/dev/null || true
killall -9 nginx 2>/dev/null || true
./scripts/cleanup-containers.sh deer-flow-sandbox 2>/dev/null || true
sleep 1
@@ -60,9 +61,15 @@ cleanup() {
pkill -f "langgraph dev" 2>/dev/null || true
pkill -f "uvicorn src.gateway.app:app" 2>/dev/null || true
pkill -f "next dev" 2>/dev/null || true
nginx -c "$REPO_ROOT/docker/nginx/nginx.local.conf" -p "$REPO_ROOT" -s quit 2>/dev/null || true
sleep 1
# Kill nginx using the captured PID first (most reliable),
# then fall back to pkill/killall for any stray nginx workers.
if [ -n "${NGINX_PID:-}" ] && kill -0 "$NGINX_PID" 2>/dev/null; then
kill -TERM "$NGINX_PID" 2>/dev/null || true
sleep 1
kill -9 "$NGINX_PID" 2>/dev/null || true
fi
pkill -9 nginx 2>/dev/null || true
killall -9 nginx 2>/dev/null || true
echo "Cleaning up sandbox containers..."
./scripts/cleanup-containers.sh deer-flow-sandbox 2>/dev/null || true
echo "✓ All services stopped"
@@ -106,6 +113,7 @@ echo "✓ Frontend started on localhost:3000"
echo "Starting Nginx reverse proxy..."
nginx -g 'daemon off;' -c "$REPO_ROOT/docker/nginx/nginx.local.conf" -p "$REPO_ROOT" > logs/nginx.log 2>&1 &
NGINX_PID=$!
./scripts/wait-for-port.sh 2026 10 "Nginx" || {
echo " See logs/nginx.log for details"
tail -10 logs/nginx.log