mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-03 06:12:14 +08:00
Adds Kubernetes sandbox provisioner support (#35)
* Adds Kubernetes sandbox provisioner support * Improves Docker dev setup by standardizing host paths Replaces hardcoded host paths with a configurable root directory, making the development environment more portable and easier to use across different machines. Automatically sets the root path if not already defined, reducing manual setup steps.
This commit is contained in:
@@ -6,11 +6,56 @@
|
||||
# - frontend: Frontend Next.js dev server (port 3000)
|
||||
# - gateway: Backend Gateway API (port 8001)
|
||||
# - langgraph: LangGraph server (port 2024)
|
||||
# - provisioner: Sandbox provisioner (creates Pods in host Kubernetes)
|
||||
#
|
||||
# Prerequisites:
|
||||
# - Host machine must have a running Kubernetes cluster (Docker Desktop K8s,
|
||||
# minikube, kind, etc.) with kubectl configured (~/.kube/config).
|
||||
#
|
||||
# Access: http://localhost:2026
|
||||
|
||||
services:
|
||||
# Nginx Reverse Proxy
|
||||
# ── Sandbox Provisioner ────────────────────────────────────────────────
|
||||
# Manages per-sandbox Pod + Service lifecycle in the host Kubernetes
|
||||
# cluster via the K8s API.
|
||||
# Backend accesses sandboxes directly via host.docker.internal:{NodePort}.
|
||||
provisioner:
  build:
    context: ./provisioner
    dockerfile: Dockerfile
  container_name: deer-flow-provisioner
  volumes:
    # Read-only host kubeconfig so the provisioner can drive the host cluster.
    - ~/.kube/config:/root/.kube/config:ro
  environment:
    - K8S_NAMESPACE=deer-flow
    - SANDBOX_IMAGE=enterprise-public-cn-beijing.cr.volces.com/vefaas-public/all-in-one-sandbox:latest
    # Host paths for K8s HostPath volumes (must be absolute paths accessible by K8s node)
    # On Docker Desktop/OrbStack, use your actual host paths like /Users/username/...
    # Set this in your shell before running docker-compose:
    #   export DEER_FLOW_ROOT=/absolute/path/to/deer-flow
    # ":?" makes compose fail fast with a clear message when DEER_FLOW_ROOT is
    # unset/empty, instead of silently creating broken hostPath mounts like "/skills".
    - SKILLS_HOST_PATH=${DEER_FLOW_ROOT:?export DEER_FLOW_ROOT=/absolute/path/to/deer-flow}/skills
    - THREADS_HOST_PATH=${DEER_FLOW_ROOT:?export DEER_FLOW_ROOT=/absolute/path/to/deer-flow}/backend/.deer-flow/threads
    - KUBECONFIG_PATH=/root/.kube/config
    - NODE_HOST=host.docker.internal
    # Override K8S API server URL since kubeconfig uses 127.0.0.1
    # which is unreachable from inside the container
    - K8S_API_SERVER=https://host.docker.internal:26443
  env_file:
    - ../.env
  extra_hosts:
    - "host.docker.internal:host-gateway"
  networks:
    - deer-flow-dev
  restart: unless-stopped
  healthcheck:
    # curl is installed in the provisioner image (see provisioner Dockerfile).
    test: ["CMD", "curl", "-f", "http://localhost:8002/health"]
    interval: 10s
    timeout: 5s
    retries: 6
    start_period: 15s
|
||||
|
||||
# ── Reverse Proxy ──────────────────────────────────────────────────────
|
||||
# Routes API traffic to gateway, langgraph, and provisioner services.
|
||||
nginx:
|
||||
image: nginx:alpine
|
||||
container_name: deer-flow-nginx
|
||||
@@ -22,6 +67,7 @@ services:
|
||||
- frontend
|
||||
- gateway
|
||||
- langgraph
|
||||
- provisioner
|
||||
networks:
|
||||
- deer-flow-dev
|
||||
restart: unless-stopped
|
||||
@@ -58,6 +104,8 @@ services:
|
||||
build:
|
||||
context: ../
|
||||
dockerfile: backend/Dockerfile
|
||||
cache_from:
|
||||
- type=local,src=/tmp/docker-cache-gateway
|
||||
container_name: deer-flow-gateway
|
||||
command: sh -c "cd backend && uv run uvicorn src.gateway.app:app --host 0.0.0.0 --port 8001 --reload --reload-include='*.yaml .env' > /app/logs/gateway.log 2>&1"
|
||||
volumes:
|
||||
@@ -66,11 +114,14 @@ services:
|
||||
- ../config.yaml:/app/config.yaml
|
||||
- ../skills:/app/skills
|
||||
- ../logs:/app/logs
|
||||
- ../backend/.deer-flow:/app/backend/.deer-flow
|
||||
# Mount uv cache for faster dependency installation
|
||||
- ~/.cache/uv:/root/.cache/uv
|
||||
working_dir: /app
|
||||
environment:
|
||||
- CI=true
|
||||
env_file:
|
||||
- ../backend/.env
|
||||
- ../.env
|
||||
extra_hosts:
|
||||
# For Linux: map host.docker.internal to host gateway
|
||||
- "host.docker.internal:host-gateway"
|
||||
@@ -83,6 +134,8 @@ services:
|
||||
build:
|
||||
context: ../
|
||||
dockerfile: backend/Dockerfile
|
||||
cache_from:
|
||||
- type=local,src=/tmp/docker-cache-langgraph
|
||||
container_name: deer-flow-langgraph
|
||||
command: sh -c "cd backend && uv run langgraph dev --no-browser --allow-blocking --host 0.0.0.0 --port 2024 > /app/logs/langgraph.log 2>&1"
|
||||
volumes:
|
||||
@@ -91,15 +144,23 @@ services:
|
||||
- ../config.yaml:/app/config.yaml
|
||||
- ../skills:/app/skills
|
||||
- ../logs:/app/logs
|
||||
- ../backend/.deer-flow:/app/backend/.deer-flow
|
||||
# Mount uv cache for faster dependency installation
|
||||
- ~/.cache/uv:/root/.cache/uv
|
||||
working_dir: /app
|
||||
environment:
|
||||
- CI=true
|
||||
env_file:
|
||||
- ../backend/.env
|
||||
- ../.env
|
||||
networks:
|
||||
- deer-flow-dev
|
||||
restart: unless-stopped
|
||||
|
||||
# No named volumes; all persistence uses bind mounts declared per service.
volumes: {}

# Dedicated bridge network with a fixed subnet so containers receive
# predictable addresses across restarts.
networks:
  deer-flow-dev:
    driver: bridge
    ipam:
      config:
        - subnet: 192.168.200.0/24
|
||||
|
||||
@@ -1,427 +0,0 @@
|
||||
# Kubernetes Sandbox Setup
|
||||
|
||||
This guide explains how to deploy and configure the DeerFlow sandbox execution environment on Kubernetes.
|
||||
|
||||
## Overview
|
||||
|
||||
The Kubernetes sandbox deployment allows you to run DeerFlow's code execution sandbox in a Kubernetes cluster, providing:
|
||||
|
||||
- **Isolated Execution**: Sandbox runs in dedicated Kubernetes pods
|
||||
- **Scalability**: Easy horizontal scaling with replica configuration
|
||||
- **Cluster Integration**: Seamless integration with existing Kubernetes infrastructure
|
||||
- **Persistent Skills**: Skills directory mounted from host or PersistentVolume
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Before you begin, ensure you have:
|
||||
|
||||
1. **Kubernetes Cluster**: One of the following:
|
||||
- Docker Desktop with Kubernetes enabled
|
||||
- OrbStack with Kubernetes enabled
|
||||
- Minikube
|
||||
- Any production Kubernetes cluster
|
||||
|
||||
2. **kubectl**: Kubernetes command-line tool
|
||||
```bash
|
||||
# macOS
|
||||
brew install kubectl
|
||||
|
||||
# Linux
|
||||
# See: https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/
|
||||
```
|
||||
|
||||
3. **Docker**: For pulling the sandbox image (optional, but recommended)
|
||||
```bash
|
||||
# Verify installation
|
||||
docker version
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
### 1. Enable Kubernetes
|
||||
|
||||
**Docker Desktop:**
|
||||
```
|
||||
Settings → Kubernetes → Enable Kubernetes → Apply & Restart
|
||||
```
|
||||
|
||||
**OrbStack:**
|
||||
```
|
||||
Settings → Enable Kubernetes
|
||||
```
|
||||
|
||||
**Minikube:**
|
||||
```bash
|
||||
minikube start
|
||||
```
|
||||
|
||||
### 2. Run Setup Script
|
||||
|
||||
The easiest way to get started:
|
||||
|
||||
```bash
|
||||
cd docker/k8s
|
||||
./setup.sh
|
||||
```
|
||||
|
||||
This will:
|
||||
- ✅ Check kubectl installation and cluster connectivity
|
||||
- ✅ Pull the sandbox Docker image (optional, can be skipped)
|
||||
- ✅ Create the `deer-flow` namespace
|
||||
- ✅ Deploy the sandbox service and deployment
|
||||
- ✅ Verify the deployment is running
|
||||
|
||||
### 3. Configure Backend
|
||||
|
||||
Add the following to `backend/config.yaml`:
|
||||
|
||||
```yaml
|
||||
sandbox:
|
||||
use: src.community.aio_sandbox:AioSandboxProvider
|
||||
base_url: http://deer-flow-sandbox.deer-flow.svc.cluster.local:8080
|
||||
```
|
||||
|
||||
### 4. Verify Deployment
|
||||
|
||||
Check that the sandbox pod is running:
|
||||
|
||||
```bash
|
||||
kubectl get pods -n deer-flow
|
||||
```
|
||||
|
||||
You should see:
|
||||
```
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
deer-flow-sandbox-xxxxxxxxxx-xxxxx 1/1 Running 0 1m
|
||||
```
|
||||
|
||||
## Advanced Configuration
|
||||
|
||||
### Custom Skills Path
|
||||
|
||||
By default, the setup script uses `PROJECT_ROOT/skills`. You can specify a custom path:
|
||||
|
||||
**Using command-line argument:**
|
||||
```bash
|
||||
./setup.sh --skills-path /custom/path/to/skills
|
||||
```
|
||||
|
||||
**Using environment variable:**
|
||||
```bash
|
||||
SKILLS_PATH=/custom/path/to/skills ./setup.sh
|
||||
```
|
||||
|
||||
### Custom Sandbox Image
|
||||
|
||||
To use a different sandbox image:
|
||||
|
||||
**Using command-line argument:**
|
||||
```bash
|
||||
./setup.sh --image your-registry/sandbox:tag
|
||||
```
|
||||
|
||||
**Using environment variable:**
|
||||
```bash
|
||||
SANDBOX_IMAGE=your-registry/sandbox:tag ./setup.sh
|
||||
```
|
||||
|
||||
### Skip Image Pull
|
||||
|
||||
If you already have the image locally or want to pull it manually later:
|
||||
|
||||
```bash
|
||||
./setup.sh --skip-pull
|
||||
```
|
||||
|
||||
### Combined Options
|
||||
|
||||
```bash
|
||||
./setup.sh --skip-pull --skills-path /custom/skills --image custom/sandbox:latest
|
||||
```
|
||||
|
||||
## Manual Deployment
|
||||
|
||||
If you prefer manual deployment or need more control:
|
||||
|
||||
### 1. Create Namespace
|
||||
|
||||
```bash
|
||||
kubectl apply -f namespace.yaml
|
||||
```
|
||||
|
||||
### 2. Create Service
|
||||
|
||||
```bash
|
||||
kubectl apply -f sandbox-service.yaml
|
||||
```
|
||||
|
||||
### 3. Deploy Sandbox
|
||||
|
||||
First, update the skills path in `sandbox-deployment.yaml`:
|
||||
|
||||
```bash
|
||||
# Replace __SKILLS_PATH__ with your actual path
|
||||
sed 's|__SKILLS_PATH__|/Users/feng/Projects/deer-flow/skills|g' \
|
||||
sandbox-deployment.yaml | kubectl apply -f -
|
||||
```
|
||||
|
||||
Or manually edit `sandbox-deployment.yaml` and replace `__SKILLS_PATH__` with your skills directory path.
|
||||
|
||||
### 4. Verify Deployment
|
||||
|
||||
```bash
|
||||
# Check all resources
|
||||
kubectl get all -n deer-flow
|
||||
|
||||
# Check pod status
|
||||
kubectl get pods -n deer-flow
|
||||
|
||||
# Check pod logs
|
||||
kubectl logs -n deer-flow -l app=deer-flow-sandbox
|
||||
|
||||
# Describe pod for detailed info
|
||||
kubectl describe pod -n deer-flow -l app=deer-flow-sandbox
|
||||
```
|
||||
|
||||
## Configuration Options
|
||||
|
||||
### Resource Limits
|
||||
|
||||
Edit `sandbox-deployment.yaml` to adjust resource limits:
|
||||
|
||||
```yaml
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m # Minimum CPU
|
||||
memory: 256Mi # Minimum memory
|
||||
limits:
|
||||
cpu: 1000m # Maximum CPU (1 core)
|
||||
memory: 1Gi # Maximum memory
|
||||
```
|
||||
|
||||
### Scaling
|
||||
|
||||
Adjust the number of replicas:
|
||||
|
||||
```yaml
|
||||
spec:
|
||||
replicas: 3 # Run 3 sandbox pods
|
||||
```
|
||||
|
||||
Or scale dynamically:
|
||||
|
||||
```bash
|
||||
kubectl scale deployment deer-flow-sandbox -n deer-flow --replicas=3
|
||||
```
|
||||
|
||||
### Health Checks
|
||||
|
||||
The deployment includes readiness and liveness probes:
|
||||
|
||||
- **Readiness Probe**: Checks if the pod is ready to serve traffic
|
||||
- **Liveness Probe**: Restarts the pod if it becomes unhealthy
|
||||
|
||||
Configure in `sandbox-deployment.yaml`:
|
||||
|
||||
```yaml
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /v1/sandbox
|
||||
port: 8080
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 3
|
||||
failureThreshold: 3
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Pod Not Starting
|
||||
|
||||
Check pod status and events:
|
||||
|
||||
```bash
|
||||
kubectl describe pod -n deer-flow -l app=deer-flow-sandbox
|
||||
```
|
||||
|
||||
Common issues:
|
||||
- **ImagePullBackOff**: Docker image cannot be pulled
|
||||
- Solution: Pre-pull image with `docker pull <image>`
|
||||
- **Skills path not found**: HostPath doesn't exist
|
||||
- Solution: Verify the skills path exists on the host
|
||||
- **Resource constraints**: Not enough CPU/memory
|
||||
- Solution: Adjust resource requests/limits
|
||||
|
||||
### Service Not Accessible
|
||||
|
||||
Verify the service is running:
|
||||
|
||||
```bash
|
||||
kubectl get service -n deer-flow
|
||||
kubectl describe service deer-flow-sandbox -n deer-flow
|
||||
```
|
||||
|
||||
Test connectivity from another pod:
|
||||
|
||||
```bash
|
||||
kubectl run test-pod -n deer-flow --rm -it --image=curlimages/curl -- \
|
||||
curl http://deer-flow-sandbox.deer-flow.svc.cluster.local:8080/v1/sandbox
|
||||
```
|
||||
|
||||
### Check Logs
|
||||
|
||||
View sandbox logs:
|
||||
|
||||
```bash
|
||||
# Follow logs in real-time
|
||||
kubectl logs -n deer-flow -l app=deer-flow-sandbox -f
|
||||
|
||||
# View logs from previous container (if crashed)
|
||||
kubectl logs -n deer-flow -l app=deer-flow-sandbox --previous
|
||||
```
|
||||
|
||||
### Health Check Failures
|
||||
|
||||
If pods show as not ready:
|
||||
|
||||
```bash
|
||||
# Check readiness probe
|
||||
kubectl get events -n deer-flow --sort-by='.lastTimestamp'
|
||||
|
||||
# Exec into pod to debug
|
||||
kubectl exec -it -n deer-flow <pod-name> -- /bin/sh
|
||||
```
|
||||
|
||||
## Cleanup
|
||||
|
||||
### Remove All Resources
|
||||
|
||||
Using the setup script:
|
||||
|
||||
```bash
|
||||
./setup.sh --cleanup
|
||||
```
|
||||
|
||||
Or manually:
|
||||
|
||||
```bash
|
||||
kubectl delete -f sandbox-deployment.yaml
|
||||
kubectl delete -f sandbox-service.yaml
|
||||
kubectl delete namespace deer-flow
|
||||
```
|
||||
|
||||
### Remove Specific Resources
|
||||
|
||||
```bash
|
||||
# Delete only the deployment (keeps namespace and service)
|
||||
kubectl delete deployment deer-flow-sandbox -n deer-flow
|
||||
|
||||
# Delete pods (they will be recreated by deployment)
|
||||
kubectl delete pods -n deer-flow -l app=deer-flow-sandbox
|
||||
```
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────┐
|
||||
│ DeerFlow Backend │
|
||||
│ (config.yaml: base_url configured) │
|
||||
└────────────────┬────────────────────────────┘
|
||||
│ HTTP requests
|
||||
↓
|
||||
┌─────────────────────────────────────────────┐
|
||||
│ Kubernetes Service (ClusterIP) │
|
||||
│ deer-flow-sandbox.deer-flow.svc:8080 │
|
||||
└────────────────┬────────────────────────────┘
|
||||
│ Load balancing
|
||||
↓
|
||||
┌─────────────────────────────────────────────┐
|
||||
│ Sandbox Pods (replicas) │
|
||||
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ │
|
||||
│ │ Pod 1 │ │ Pod 2 │ │ Pod 3 │ │
|
||||
│ │ Port 8080│ │ Port 8080│ │ Port 8080│ │
|
||||
│ └──────────┘ └──────────┘ └──────────┘ │
|
||||
└────────────────┬────────────────────────────┘
|
||||
│ Volume mount
|
||||
↓
|
||||
┌─────────────────────────────────────────────┐
|
||||
│ Host Skills Directory │
|
||||
│ /path/to/deer-flow/skills │
|
||||
└─────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Setup Script Reference
|
||||
|
||||
### Command-Line Options
|
||||
|
||||
```bash
|
||||
./setup.sh [options]
|
||||
|
||||
Options:
|
||||
-h, --help Show help message
|
||||
-c, --cleanup Remove all Kubernetes resources
|
||||
-p, --skip-pull Skip pulling sandbox image
|
||||
--image <image> Use custom sandbox image
|
||||
--skills-path <path> Custom skills directory path
|
||||
|
||||
Environment Variables:
|
||||
SANDBOX_IMAGE Custom sandbox image
|
||||
SKILLS_PATH Custom skills path
|
||||
|
||||
Examples:
|
||||
./setup.sh # Use default settings
|
||||
./setup.sh --skills-path /custom/path # Use custom skills path
|
||||
./setup.sh --skip-pull --image custom:tag # Custom image, skip pull
|
||||
SKILLS_PATH=/custom/path ./setup.sh # Use env variable
|
||||
```
|
||||
|
||||
## Production Considerations
|
||||
|
||||
### Security
|
||||
|
||||
1. **Network Policies**: Restrict pod-to-pod communication
|
||||
2. **RBAC**: Configure appropriate service account permissions
|
||||
3. **Pod Security**: Enable pod security standards
|
||||
4. **Image Security**: Scan images for vulnerabilities
|
||||
|
||||
### High Availability
|
||||
|
||||
1. **Multiple Replicas**: Run at least 3 replicas
|
||||
2. **Pod Disruption Budget**: Prevent all pods from being evicted
|
||||
3. **Node Affinity**: Distribute pods across nodes
|
||||
4. **Resource Quotas**: Set namespace resource limits
|
||||
|
||||
### Monitoring
|
||||
|
||||
1. **Prometheus**: Scrape metrics from pods
|
||||
2. **Logging**: Centralized log aggregation
|
||||
3. **Alerting**: Set up alerts for pod failures
|
||||
4. **Tracing**: Distributed tracing for requests
|
||||
|
||||
### Storage
|
||||
|
||||
For production, consider using PersistentVolume instead of hostPath:
|
||||
|
||||
1. **Create PersistentVolume**: Define storage backend
|
||||
2. **Create PersistentVolumeClaim**: Request storage
|
||||
3. **Update Deployment**: Use PVC instead of hostPath
|
||||
|
||||
See `skills-pv-pvc.yaml.bak` for reference implementation.
|
||||
|
||||
## Next Steps
|
||||
|
||||
After successful deployment:
|
||||
|
||||
1. **Start Backend**: `make dev` or `make docker-start`
|
||||
2. **Test Sandbox**: Create a conversation and execute code
|
||||
3. **Monitor**: Watch pod logs and resource usage
|
||||
4. **Scale**: Adjust replicas based on workload
|
||||
|
||||
## Support
|
||||
|
||||
For issues and questions:
|
||||
|
||||
- Check troubleshooting section above
|
||||
- Review pod logs: `kubectl logs -n deer-flow -l app=deer-flow-sandbox`
|
||||
- See main project documentation: [../../README.md](../../README.md)
|
||||
- Report issues on GitHub
|
||||
@@ -1,7 +0,0 @@
|
||||
# Namespace that holds every DeerFlow sandbox resource (Deployment,
# Service, and the Pods the provisioner creates).
apiVersion: v1
kind: Namespace
metadata:
  name: deer-flow
  labels:
    app.kubernetes.io/name: deer-flow
    app.kubernetes.io/component: sandbox
|
||||
@@ -1,65 +0,0 @@
|
||||
# Deployment for the DeerFlow code-execution sandbox.
# The skills directory is mounted read-only from the host; setup.sh
# substitutes the __SKILLS_PATH__ placeholder before applying.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: deer-flow-sandbox
  namespace: deer-flow
  labels:
    app.kubernetes.io/name: deer-flow
    app.kubernetes.io/component: sandbox
spec:
  replicas: 1
  selector:
    matchLabels:
      app: deer-flow-sandbox
  template:
    metadata:
      labels:
        app: deer-flow-sandbox
        app.kubernetes.io/name: deer-flow
        app.kubernetes.io/component: sandbox
    spec:
      containers:
        - name: sandbox
          image: enterprise-public-cn-beijing.cr.volces.com/vefaas-public/all-in-one-sandbox:latest
          ports:
            - name: http
              containerPort: 8080
              protocol: TCP
          # Both probes hit the sandbox's own health endpoint.
          readinessProbe:
            httpGet:
              path: /v1/sandbox
              port: 8080
            initialDelaySeconds: 5
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 3
          livenessProbe:
            httpGet:
              path: /v1/sandbox
              port: 8080
            initialDelaySeconds: 10
            periodSeconds: 10
            timeoutSeconds: 3
            failureThreshold: 3
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: 1000m
              memory: 1Gi
          volumeMounts:
            - name: skills
              mountPath: /mnt/skills
              readOnly: true
          securityContext:
            privileged: false
            # NOTE(review): allowPrivilegeEscalation is left true while
            # privileged is false — if the sandbox image does not require
            # setuid/escalation, this should likely be false; confirm
            # against the image's requirements.
            allowPrivilegeEscalation: true
      volumes:
        - name: skills
          hostPath:
            # Path to skills directory on the host machine
            # This will be replaced by setup.sh with the actual path
            path: __SKILLS_PATH__
            type: Directory
      restartPolicy: Always
|
||||
@@ -1,21 +0,0 @@
|
||||
# Headless Service fronting the sandbox Pods; DNS resolves directly to
# Pod IPs rather than a virtual ClusterIP.
apiVersion: v1
kind: Service
metadata:
  name: deer-flow-sandbox
  namespace: deer-flow
  labels:
    app.kubernetes.io/name: deer-flow
    app.kubernetes.io/component: sandbox
spec:
  type: ClusterIP
  clusterIP: None # Headless service for direct Pod DNS access
  ports:
    - name: http
      port: 8080
      targetPort: 8080
      protocol: TCP
  selector:
    app: deer-flow-sandbox
  # Enable DNS-based service discovery
  # Pods will be accessible at: {pod-name}.deer-flow-sandbox.deer-flow.svc.cluster.local:8080
  # NOTE(review): the {pod-name}.<service> DNS form also requires the Pod
  # spec to set hostname/subdomain (or a StatefulSet) — confirm this holds
  # for the Pods selected here.
  publishNotReadyAddresses: false
|
||||
@@ -1,245 +0,0 @@
|
||||
#!/bin/bash

# Kubernetes Sandbox Initialization Script for Deer-Flow
# This script sets up the Kubernetes environment for the sandbox provider

# Abort on the first failing command.
set -e

# Directory containing this script, and the repository root two levels up
# (docker/k8s -> project root).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"

# Default sandbox image
DEFAULT_SANDBOX_IMAGE="enterprise-public-cn-beijing.cr.volces.com/vefaas-public/all-in-one-sandbox:latest"

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Banner printed on every invocation.
echo -e "${BLUE}╔════════════════════════════════════════════╗${NC}"
echo -e "${BLUE}║ Deer-Flow Kubernetes Sandbox Setup ║${NC}"
echo -e "${BLUE}╚════════════════════════════════════════════╝${NC}"
echo
|
||||
|
||||
# Function to print status messages
|
||||
# Status-message helpers: each prints a colored severity tag followed by
# the caller's message. echo -e also interprets escape sequences in $1.
info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}

warn() {
    echo -e "${YELLOW}[WARN]${NC} $1"
}

error() {
    echo -e "${RED}[ERROR]${NC} $1"
}
|
||||
|
||||
# Check if kubectl is installed
|
||||
# Succeed when the kubectl binary is on PATH; otherwise print install
# hints and abort the whole script.
check_kubectl() {
    info "Checking kubectl installation..."
    if command -v kubectl &> /dev/null; then
        success "kubectl is installed"
        return 0
    fi
    error "kubectl is not installed. Please install kubectl first."
    echo "  - macOS: brew install kubectl"
    echo "  - Linux: https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/"
    exit 1
}
|
||||
|
||||
# Check if Kubernetes cluster is accessible
|
||||
# Succeed when a Kubernetes API server answers; otherwise print
# per-distribution enablement hints and abort.
check_cluster() {
    info "Checking Kubernetes cluster connection..."
    if kubectl cluster-info &> /dev/null; then
        success "Connected to Kubernetes cluster"
        return 0
    fi
    error "Cannot connect to Kubernetes cluster."
    echo "Please ensure:"
    echo "  - Docker Desktop: Settings → Kubernetes → Enable Kubernetes"
    echo "  - Or OrbStack: Enable Kubernetes in settings"
    echo "  - Or Minikube: minikube start"
    exit 1
}
|
||||
|
||||
# Apply Kubernetes resources
|
||||
# Create or refresh all sandbox Kubernetes resources: namespace, service,
# and deployment. The deployment manifest carries a __SKILLS_PATH__
# placeholder that is substituted with the resolved host path.
apply_resources() {
    info "Applying Kubernetes resources..."

    # Determine skills path: env override, else <project>/skills.
    SKILLS_PATH="${SKILLS_PATH:-${PROJECT_ROOT}/skills}"
    info "Using skills path: ${SKILLS_PATH}"

    # Validate skills path exists; create it so the hostPath mount
    # (type: Directory) does not fail at Pod start.
    if [[ ! -d "${SKILLS_PATH}" ]]; then
        warn "Skills path does not exist: ${SKILLS_PATH}"
        warn "Creating directory..."
        mkdir -p "${SKILLS_PATH}"
    fi

    echo "  → Creating namespace..."
    kubectl apply -f "${SCRIPT_DIR}/namespace.yaml"

    echo "  → Creating sandbox service..."
    kubectl apply -f "${SCRIPT_DIR}/sandbox-service.yaml"

    echo "  → Creating sandbox deployment with skills path: ${SKILLS_PATH}"
    # Replace the __SKILLS_PATH__ placeholder with the actual path.
    # NOTE: the previous macOS/Linux branches ran the exact same sed
    # command in both arms, so the $OSTYPE check was dead code; a single
    # invocation is equivalent.
    sed "s|__SKILLS_PATH__|${SKILLS_PATH}|g" "${SCRIPT_DIR}/sandbox-deployment.yaml" | kubectl apply -f -

    success "All Kubernetes resources applied"
}
|
||||
|
||||
# Verify deployment
|
||||
# Verify deployment
# Lists the namespace, service, deployment, and pods so the user can see
# what was created. Under `set -e`, any failing kubectl call aborts the
# script before the final success message.
verify_deployment() {
    info "Verifying deployment..."

    echo "  → Checking namespace..."
    kubectl get namespace deer-flow

    echo "  → Checking service..."
    kubectl get service -n deer-flow

    echo "  → Checking deployment..."
    kubectl get deployment -n deer-flow

    echo "  → Checking pods..."
    kubectl get pods -n deer-flow

    success "Deployment verified"
}
|
||||
|
||||
# Pull sandbox image
|
||||
# Ensure the sandbox image is available locally, pulling it when missing.
# A failed pull is deliberately non-fatal: Kubernetes can still pull the
# image itself when the Pod starts (just more slowly).
pull_image() {
    info "Checking sandbox image..."

    IMAGE="${SANDBOX_IMAGE:-$DEFAULT_SANDBOX_IMAGE}"

    # Nothing to do when the image is already present in the local daemon.
    if docker image inspect "$IMAGE" &> /dev/null; then
        success "Image already exists locally: $IMAGE"
        return 0
    fi

    info "Pulling sandbox image (this may take a few minutes on first run)..."
    echo "  → Image: $IMAGE"
    echo

    if ! docker pull "$IMAGE"; then
        warn "Failed to pull image. Pod startup may be slow on first run."
        echo "  You can manually pull the image later with:"
        echo "    docker pull $IMAGE"
        return 0
    fi
    success "Image pulled successfully"
}
|
||||
|
||||
# Print next steps
|
||||
# Print next steps
# Prints the completion banner, the config.yaml snippet that points the
# backend at the in-cluster sandbox Service, and the commands to start
# the app. Output-only; no side effects.
print_next_steps() {
    echo
    echo -e "${BLUE}╔════════════════════════════════════════════╗${NC}"
    echo -e "${BLUE}║ Setup Complete! ║${NC}"
    echo -e "${BLUE}╚════════════════════════════════════════════╝${NC}"
    echo
    echo -e "${YELLOW}To enable Kubernetes sandbox, add the following to backend/config.yaml:${NC}"
    echo
    echo -e "${GREEN}sandbox:${NC}"
    echo -e "${GREEN}  use: src.community.aio_sandbox:AioSandboxProvider${NC}"
    echo -e "${GREEN}  base_url: http://deer-flow-sandbox.deer-flow.svc.cluster.local:8080${NC}"
    echo
    echo
    echo -e "${GREEN}Next steps:${NC}"
    echo "  make dev          # Start backend and frontend in development mode"
    echo "  make docker-start # Start backend and frontend in Docker containers"
    echo
}
|
||||
|
||||
# Cleanup function
|
||||
# Delete every sandbox resource and exit the script when invoked with
# -c/--cleanup; a silent no-op for any other argument.
cleanup() {
    case "$1" in
        -c|--cleanup)
            info "Cleaning up Kubernetes resources..."
            kubectl delete -f "${SCRIPT_DIR}/sandbox-deployment.yaml" --ignore-not-found=true
            kubectl delete -f "${SCRIPT_DIR}/sandbox-service.yaml" --ignore-not-found=true
            kubectl delete -f "${SCRIPT_DIR}/namespace.yaml" --ignore-not-found=true
            success "Cleanup complete"
            exit 0
            ;;
    esac
}
|
||||
|
||||
# Show help
|
||||
# Show help
# Prints usage, options, environment variables, and examples, then exits
# the script with status 0.
show_help() {
    echo "Usage: $0 [options]"
    echo
    echo "Options:"
    echo "  -h, --help              Show this help message"
    echo "  -c, --cleanup           Remove all Kubernetes resources"
    echo "  -p, --skip-pull         Skip pulling sandbox image"
    echo "  --image <image>         Use custom sandbox image"
    echo "  --skills-path <path>    Custom skills directory path"
    echo
    echo "Environment variables:"
    echo "  SANDBOX_IMAGE           Custom sandbox image (default: $DEFAULT_SANDBOX_IMAGE)"
    echo "  SKILLS_PATH             Custom skills path (default: PROJECT_ROOT/skills)"
    echo
    echo "Examples:"
    echo "  $0                                  # Use default settings"
    echo "  $0 --skills-path /custom/path       # Use custom skills path"
    echo "  SKILLS_PATH=/custom/path $0         # Use env variable"
    echo
    exit 0
}
|
||||
|
||||
# Parse arguments
|
||||
SKIP_PULL=false
while [[ $# -gt 0 ]]; do
    case $1 in
        -h|--help)
            show_help
            ;;
        -c|--cleanup)
            cleanup "$1"
            ;;
        -p|--skip-pull)
            SKIP_PULL=true
            shift
            ;;
        --image)
            # Guard the value: an unguarded `shift 2` with no value fails
            # and, under `set -e`, would abort the script with no message.
            [[ $# -ge 2 ]] || { error "--image requires a value"; exit 1; }
            SANDBOX_IMAGE="$2"
            shift 2
            ;;
        --skills-path)
            [[ $# -ge 2 ]] || { error "--skills-path requires a value"; exit 1; }
            SKILLS_PATH="$2"
            shift 2
            ;;
        *)
            # Previously unknown options were silently dropped, hiding
            # typos like "--skilz-path"; surface them instead.
            warn "Unknown option ignored: $1"
            shift
            ;;
    esac
done
|
||||
|
||||
# Main execution
|
||||
# Main execution
# Orchestrates the full setup: preflight checks, optional image pull,
# resource creation, verification, and final instructions.
main() {
    check_kubectl
    check_cluster

    # Pull image first to avoid Pod startup timeout
    if [[ "$SKIP_PULL" == false ]]; then
        pull_image
    fi

    apply_resources
    verify_deployment
    print_next_steps
}

main
|
||||
@@ -14,6 +14,9 @@ http {
|
||||
access_log /dev/stdout;
|
||||
error_log /dev/stderr;
|
||||
|
||||
# Docker's embedded DNS (resolves Compose service names such as gateway/provisioner)
|
||||
resolver 127.0.0.11 valid=10s ipv6=off;
|
||||
|
||||
# Upstream servers (using Docker service names)
|
||||
upstream gateway {
|
||||
server gateway:8001;
|
||||
@@ -27,9 +30,14 @@ http {
|
||||
server frontend:3000;
|
||||
}
|
||||
|
||||
upstream provisioner {
|
||||
server provisioner:8002;
|
||||
}
|
||||
|
||||
# ── Main server (path-based routing) ─────────────────────────────────
|
||||
server {
|
||||
listen 2026;
|
||||
listen [::]:2026;
|
||||
listen 2026 default_server;
|
||||
listen [::]:2026 default_server;
|
||||
server_name _;
|
||||
|
||||
# Hide CORS headers from upstream to prevent duplicates
|
||||
@@ -180,6 +188,16 @@ http {
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
}
|
||||
|
||||
# ── Provisioner API (sandbox management) ────────────────────────
|
||||
location /api/sandboxes {
|
||||
proxy_pass http://provisioner;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
}
|
||||
|
||||
# All other requests go to frontend
|
||||
location / {
|
||||
proxy_pass http://frontend;
|
||||
|
||||
19
docker/provisioner/Dockerfile
Normal file
19
docker/provisioner/Dockerfile
Normal file
@@ -0,0 +1,19 @@
|
||||
FROM python:3.12-slim

# Unbuffered stdout/stderr so uvicorn logs reach `docker logs` immediately
# (without this, Python buffers output and log lines appear late or are
# lost on crash); skip .pyc files to keep the layer slightly smaller.
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1

# Install system dependencies (curl is required by the compose healthcheck)
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies
RUN pip install --no-cache-dir \
    fastapi \
    "uvicorn[standard]" \
    kubernetes

WORKDIR /app
COPY app.py .

EXPOSE 8002

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8002"]
|
||||
318
docker/provisioner/README.md
Normal file
318
docker/provisioner/README.md
Normal file
@@ -0,0 +1,318 @@
|
||||
# DeerFlow Sandbox Provisioner
|
||||
|
||||
The **Sandbox Provisioner** is a FastAPI service that dynamically manages sandbox Pods in Kubernetes. It provides a REST API for the DeerFlow backend to create, monitor, and destroy isolated sandbox environments for code execution.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌────────────┐ HTTP ┌─────────────┐ K8s API ┌──────────────┐
|
||||
│ Backend │ ─────▸ │ Provisioner │ ────────▸ │ Host K8s │
|
||||
│ (gateway/ │ │ :8002 │ │ API Server │
|
||||
│ langgraph) │ └─────────────┘ └──────┬───────┘
|
||||
└────────────┘ │ creates
|
||||
│
|
||||
┌─────────────┐ ┌────▼─────┐
|
||||
│ Backend │ ──────▸ │ Sandbox │
|
||||
│ (via Docker │ NodePort│ Pod(s) │
|
||||
│ network) │ └──────────┘
|
||||
└─────────────┘
|
||||
```
|
||||
|
||||
### How It Works
|
||||
|
||||
1. **Backend Request**: When the backend needs to execute code, it sends a `POST /api/sandboxes` request with a `sandbox_id` and `thread_id`.
|
||||
|
||||
2. **Pod Creation**: The provisioner creates a dedicated Pod in the `deer-flow` namespace with:
|
||||
- The sandbox container image (all-in-one-sandbox)
|
||||
- HostPath volumes mounted for:
|
||||
- `/mnt/skills` → Read-only access to public skills
|
||||
- `/mnt/user-data` → Read-write access to thread-specific data
|
||||
- Resource limits (CPU, memory, ephemeral storage)
|
||||
- Readiness/liveness probes
|
||||
|
||||
3. **Service Creation**: A NodePort Service is created to expose the Pod, with Kubernetes auto-allocating a port from the NodePort range (typically 30000-32767).
|
||||
|
||||
4. **Access URL**: The provisioner returns `http://host.docker.internal:{NodePort}` to the backend, which the backend containers can reach directly.
|
||||
|
||||
5. **Cleanup**: When the session ends, `DELETE /api/sandboxes/{sandbox_id}` removes both the Pod and Service.
|
||||
|
||||
## Requirements
|
||||
|
||||
Host machine with a running Kubernetes cluster (Docker Desktop K8s, OrbStack, minikube, kind, etc.)
|
||||
|
||||
### Enable Kubernetes in Docker Desktop
|
||||
1. Open Docker Desktop settings
|
||||
2. Go to "Kubernetes" tab
|
||||
3. Check "Enable Kubernetes"
|
||||
4. Click "Apply & Restart"
|
||||
|
||||
### Enable Kubernetes in OrbStack
|
||||
1. Open OrbStack settings
|
||||
2. Go to "Kubernetes" tab
|
||||
3. Check "Enable Kubernetes"
|
||||
|
||||
## API Endpoints
|
||||
|
||||
### `GET /health`
|
||||
Health check endpoint.
|
||||
|
||||
**Response**:
|
||||
```json
|
||||
{
|
||||
"status": "ok"
|
||||
}
|
||||
```
|
||||
|
||||
### `POST /api/sandboxes`
|
||||
Create a new sandbox Pod + Service.
|
||||
|
||||
**Request**:
|
||||
```json
|
||||
{
|
||||
"sandbox_id": "abc-123",
|
||||
"thread_id": "thread-456"
|
||||
}
|
||||
```
|
||||
|
||||
**Response**:
|
||||
```json
|
||||
{
|
||||
"sandbox_id": "abc-123",
|
||||
"sandbox_url": "http://host.docker.internal:32123",
|
||||
"status": "Pending"
|
||||
}
|
||||
```
|
||||
|
||||
**Idempotent**: Calling with the same `sandbox_id` returns the existing sandbox info.
|
||||
|
||||
### `GET /api/sandboxes/{sandbox_id}`
|
||||
Get status and URL of a specific sandbox.
|
||||
|
||||
**Response**:
|
||||
```json
|
||||
{
|
||||
"sandbox_id": "abc-123",
|
||||
"sandbox_url": "http://host.docker.internal:32123",
|
||||
"status": "Running"
|
||||
}
|
||||
```
|
||||
|
||||
**Status Values**: `Pending`, `Running`, `Succeeded`, `Failed`, `Unknown`, `NotFound`
|
||||
|
||||
### `DELETE /api/sandboxes/{sandbox_id}`
|
||||
Destroy a sandbox Pod + Service.
|
||||
|
||||
**Response**:
|
||||
```json
|
||||
{
|
||||
"ok": true,
|
||||
"sandbox_id": "abc-123"
|
||||
}
|
||||
```
|
||||
|
||||
### `GET /api/sandboxes`
|
||||
List all sandboxes currently managed.
|
||||
|
||||
**Response**:
|
||||
```json
|
||||
{
|
||||
"sandboxes": [
|
||||
{
|
||||
"sandbox_id": "abc-123",
|
||||
"sandbox_url": "http://host.docker.internal:32123",
|
||||
"status": "Running"
|
||||
}
|
||||
],
|
||||
"count": 1
|
||||
}
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
The provisioner is configured via environment variables (set in [docker-compose-dev.yaml](../docker-compose-dev.yaml)):
|
||||
|
||||
| Variable | Default | Description |
|
||||
|----------|---------|-------------|
|
||||
| `K8S_NAMESPACE` | `deer-flow` | Kubernetes namespace for sandbox resources |
|
||||
| `SANDBOX_IMAGE` | `enterprise-public-cn-beijing.cr.volces.com/vefaas-public/all-in-one-sandbox:latest` | Container image for sandbox Pods |
|
||||
| `SKILLS_HOST_PATH` | - | **Host machine** path to skills directory (must be absolute) |
|
||||
| `THREADS_HOST_PATH` | - | **Host machine** path to threads data directory (must be absolute) |
|
||||
| `KUBECONFIG_PATH` | `/root/.kube/config` | Path to kubeconfig **inside** the provisioner container |
|
||||
| `NODE_HOST` | `host.docker.internal` | Hostname that backend containers use to reach host NodePorts |
|
||||
| `K8S_API_SERVER` | (from kubeconfig) | Override K8s API server URL (e.g., `https://host.docker.internal:26443`) |
|
||||
|
||||
### Important: K8S_API_SERVER Override
|
||||
|
||||
If your kubeconfig uses `localhost`, `127.0.0.1`, or `0.0.0.0` as the API server address (common with OrbStack, minikube, kind), the provisioner **cannot** reach it from inside the Docker container.
|
||||
|
||||
**Solution**: Set `K8S_API_SERVER` to use `host.docker.internal`:
|
||||
|
||||
```yaml
|
||||
# docker-compose-dev.yaml
|
||||
provisioner:
|
||||
environment:
|
||||
- K8S_API_SERVER=https://host.docker.internal:26443 # Replace 26443 with your API port
|
||||
```
|
||||
|
||||
Check your kubeconfig API server:
|
||||
```bash
|
||||
kubectl config view --minify -o jsonpath='{.clusters[0].cluster.server}'
|
||||
```
|
||||
|
||||
## Prerequisites
|
||||
|
||||
### Host Machine Requirements
|
||||
|
||||
1. **Kubernetes Cluster**:
|
||||
- Docker Desktop with Kubernetes enabled, or
|
||||
- OrbStack (built-in K8s), or
|
||||
- minikube, kind, k3s, etc.
|
||||
|
||||
2. **kubectl Configured**:
|
||||
- `~/.kube/config` must exist and be valid
|
||||
- Current context should point to your local cluster
|
||||
|
||||
3. **Kubernetes Access**:
|
||||
- The provisioner needs permissions to:
|
||||
- Create/read/delete Pods in the `deer-flow` namespace
|
||||
- Create/read/delete Services in the `deer-flow` namespace
|
||||
- Read Namespaces (to create `deer-flow` if missing)
|
||||
|
||||
4. **Host Paths**:
|
||||
- The `SKILLS_HOST_PATH` and `THREADS_HOST_PATH` must be **absolute paths on the host machine**
|
||||
- These paths are mounted into sandbox Pods via K8s HostPath volumes
|
||||
- The paths must exist and be readable by the K8s node
|
||||
|
||||
### Docker Compose Setup
|
||||
|
||||
The provisioner runs as part of the docker-compose-dev stack:
|
||||
|
||||
```bash
|
||||
# Start all services including provisioner
|
||||
make docker-start
|
||||
|
||||
# Or start just the provisioner
|
||||
docker compose -p deer-flow-dev -f docker/docker-compose-dev.yaml up -d provisioner
|
||||
```
|
||||
|
||||
The compose file:
|
||||
- Mounts your host's `~/.kube/config` into the container
|
||||
- Adds `extra_hosts` entry for `host.docker.internal` (required on Linux)
|
||||
- Configures environment variables for K8s access
|
||||
|
||||
## Testing
|
||||
|
||||
### Manual API Testing
|
||||
|
||||
```bash
|
||||
# Health check
|
||||
curl http://localhost:8002/health
|
||||
|
||||
# Create a sandbox (via provisioner container for internal DNS)
|
||||
docker exec deer-flow-provisioner curl -X POST http://localhost:8002/api/sandboxes \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"sandbox_id":"test-001","thread_id":"thread-001"}'
|
||||
|
||||
# Check sandbox status
|
||||
docker exec deer-flow-provisioner curl http://localhost:8002/api/sandboxes/test-001
|
||||
|
||||
# List all sandboxes
|
||||
docker exec deer-flow-provisioner curl http://localhost:8002/api/sandboxes
|
||||
|
||||
# Verify Pod and Service in K8s
|
||||
kubectl get pod,svc -n deer-flow -l sandbox-id=test-001
|
||||
|
||||
# Delete sandbox
|
||||
docker exec deer-flow-provisioner curl -X DELETE http://localhost:8002/api/sandboxes/test-001
|
||||
```
|
||||
|
||||
### Verify from Backend Containers
|
||||
|
||||
Once a sandbox is created, the backend containers (gateway, langgraph) can access it:
|
||||
|
||||
```bash
|
||||
# Get sandbox URL from provisioner
|
||||
SANDBOX_URL=$(docker exec deer-flow-provisioner curl -s http://localhost:8002/api/sandboxes/test-001 | jq -r .sandbox_url)
|
||||
|
||||
# Test from gateway container
|
||||
docker exec deer-flow-gateway curl -s $SANDBOX_URL/v1/sandbox
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Issue: "Kubeconfig not found"
|
||||
|
||||
**Cause**: The kubeconfig file doesn't exist at the mounted path.
|
||||
|
||||
**Solution**:
|
||||
- Ensure `~/.kube/config` exists on your host machine
|
||||
- Run `kubectl config view` to verify
|
||||
- Check the volume mount in docker-compose-dev.yaml
|
||||
|
||||
### Issue: "Connection refused" to K8s API
|
||||
|
||||
**Cause**: The provisioner can't reach the K8s API server.
|
||||
|
||||
**Solution**:
|
||||
1. Check your kubeconfig server address:
|
||||
```bash
|
||||
kubectl config view --minify -o jsonpath='{.clusters[0].cluster.server}'
|
||||
```
|
||||
2. If it's `localhost` or `127.0.0.1`, set `K8S_API_SERVER`:
|
||||
```yaml
|
||||
environment:
|
||||
- K8S_API_SERVER=https://host.docker.internal:PORT
|
||||
```
|
||||
|
||||
### Issue: "Unprocessable Entity" when creating Pod
|
||||
|
||||
**Cause**: HostPath volumes contain invalid paths (e.g., relative paths with `..`).
|
||||
|
||||
**Solution**:
|
||||
- Use absolute paths for `SKILLS_HOST_PATH` and `THREADS_HOST_PATH`
|
||||
- Verify the paths exist on your host machine:
|
||||
```bash
|
||||
ls -la /path/to/skills
|
||||
ls -la /path/to/backend/.deer-flow/threads
|
||||
```
|
||||
|
||||
### Issue: Pod stuck in "ContainerCreating"
|
||||
|
||||
**Cause**: Usually pulling the sandbox image from the registry.
|
||||
|
||||
**Solution**:
|
||||
- Pre-pull the image: `make docker-init`
|
||||
- Check Pod events: `kubectl describe pod sandbox-XXX -n deer-flow`
|
||||
- Check node: `kubectl get nodes`
|
||||
|
||||
### Issue: Cannot access sandbox URL from backend
|
||||
|
||||
**Cause**: NodePort not reachable or `NODE_HOST` misconfigured.
|
||||
|
||||
**Solution**:
|
||||
- Verify the Service exists: `kubectl get svc -n deer-flow`
|
||||
- Test from host: `curl http://localhost:NODE_PORT/v1/sandbox`
|
||||
- Ensure `extra_hosts` is set in docker-compose (Linux)
|
||||
- Check `NODE_HOST` env var matches how backend reaches host
|
||||
|
||||
## Security Considerations
|
||||
|
||||
1. **HostPath Volumes**: The provisioner mounts host directories into sandbox Pods. Ensure these paths contain only trusted data.
|
||||
|
||||
2. **Resource Limits**: Each sandbox Pod has CPU, memory, and storage limits to prevent resource exhaustion.
|
||||
|
||||
3. **Network Isolation**: Sandbox Pods run in the `deer-flow` namespace, and each one is exposed on the host's network through a NodePort Service. Consider NetworkPolicies for stricter isolation.
|
||||
|
||||
4. **kubeconfig Access**: The provisioner has full access to your Kubernetes cluster via the mounted kubeconfig. Run it only in trusted environments.
|
||||
|
||||
5. **Image Trust**: The sandbox image should come from a trusted registry. Review and audit the image contents.
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
- [ ] Support for custom resource requests/limits per sandbox
|
||||
- [ ] PersistentVolume support for larger data requirements
|
||||
- [ ] Automatic cleanup of stale sandboxes (timeout-based)
|
||||
- [ ] Metrics and monitoring (Prometheus integration)
|
||||
- [ ] Multi-cluster support (route to different K8s clusters)
|
||||
- [ ] Pod affinity/anti-affinity rules for better placement
|
||||
- [ ] NetworkPolicy templates for sandbox isolation
|
||||
486
docker/provisioner/app.py
Normal file
486
docker/provisioner/app.py
Normal file
@@ -0,0 +1,486 @@
|
||||
"""DeerFlow Sandbox Provisioner Service.
|
||||
|
||||
Dynamically creates and manages per-sandbox Pods in Kubernetes.
|
||||
Each ``sandbox_id`` gets its own Pod + NodePort Service. The backend
|
||||
accesses sandboxes directly via ``{NODE_HOST}:{NodePort}``.
|
||||
|
||||
The provisioner connects to the host machine's Kubernetes cluster via a
|
||||
mounted kubeconfig (``~/.kube/config``). Sandbox Pods run on the host
|
||||
K8s and are accessed by the backend via ``{NODE_HOST}:{NodePort}``.
|
||||
|
||||
Endpoints:
|
||||
POST /api/sandboxes — Create a sandbox Pod + Service
|
||||
DELETE /api/sandboxes/{sandbox_id} — Destroy a sandbox Pod + Service
|
||||
GET /api/sandboxes/{sandbox_id} — Get sandbox status & URL
|
||||
GET /api/sandboxes — List all sandboxes
|
||||
GET /health — Provisioner health check
|
||||
|
||||
Architecture (docker-compose-dev):
|
||||
┌────────────┐ HTTP ┌─────────────┐ K8s API ┌──────────────┐
|
||||
│ remote │ ─────▸ │ provisioner │ ────────▸ │ host K8s │
|
||||
│ _backend │ │ :8002 │ │ API server │
|
||||
└────────────┘ └─────────────┘ └──────┬───────┘
|
||||
│ creates
|
||||
┌─────────────┐ ┌──────▼───────┐
|
||||
│ backend │ ────────▸ │ sandbox │
|
||||
│ │ direct │ Pod(s) │
|
||||
└─────────────┘ NodePort └──────────────┘
|
||||
"""
|
||||
|
||||
from __future__ import annotations

import asyncio
import logging
import os
import time
from contextlib import asynccontextmanager

import urllib3
from fastapi import FastAPI, HTTPException
from kubernetes import client as k8s_client
from kubernetes import config as k8s_config
from kubernetes.client.rest import ApiException
from pydantic import BaseModel
|
||||
|
||||
# Suppress only the InsecureRequestWarning from urllib3
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
||||
)
|
||||
|
||||
# ── Configuration (all tuneable via environment variables) ───────────────
|
||||
|
||||
# Kubernetes namespace that holds every sandbox Pod and Service.
K8S_NAMESPACE = os.environ.get("K8S_NAMESPACE", "deer-flow")
# Container image used for each sandbox Pod.
SANDBOX_IMAGE = os.environ.get(
    "SANDBOX_IMAGE",
    "enterprise-public-cn-beijing.cr.volces.com/vefaas-public/all-in-one-sandbox:latest",
)
# Host-machine paths mounted into sandbox Pods via K8s HostPath volumes.
# Must be absolute paths that exist on the K8s node.
SKILLS_HOST_PATH = os.environ.get("SKILLS_HOST_PATH", "/skills")
THREADS_HOST_PATH = os.environ.get("THREADS_HOST_PATH", "/.deer-flow/threads")

# Path to the kubeconfig *inside* the provisioner container.
# Typically the host's ~/.kube/config is mounted here.
KUBECONFIG_PATH = os.environ.get("KUBECONFIG_PATH", "/root/.kube/config")

# The hostname / IP that the *backend container* uses to reach NodePort
# services on the host Kubernetes node. On Docker Desktop for macOS this
# is ``host.docker.internal``; on Linux it may be the host's LAN IP.
NODE_HOST = os.environ.get("NODE_HOST", "host.docker.internal")
|
||||
|
||||
# ── K8s client setup ────────────────────────────────────────────────────
|
||||
|
||||
# Shared CoreV1Api client; populated once during FastAPI lifespan startup
# and read by every handler/helper below.
core_v1: k8s_client.CoreV1Api | None = None
|
||||
|
||||
|
||||
def _init_k8s_client() -> k8s_client.CoreV1Api:
    """Load kubeconfig from the mounted host config and return a CoreV1Api.

    Tries the mounted kubeconfig first, then falls back to in-cluster
    config (useful if the provisioner itself runs inside K8s).
    """
    try:
        k8s_config.load_kube_config(config_file=KUBECONFIG_PATH)
        logger.info(f"Loaded kubeconfig from {KUBECONFIG_PATH}")
    # Broad catch is deliberate: any load failure (missing file, bad YAML)
    # falls through to the in-cluster attempt, which raises its own error
    # if it too is unavailable.
    except Exception:
        logger.warning("Could not load kubeconfig from file, trying in-cluster config")
        k8s_config.load_incluster_config()

    # When connecting from inside Docker to the host's K8s API, the
    # kubeconfig may reference ``localhost`` or ``127.0.0.1``. We
    # optionally rewrite the server address so it reaches the host.
    k8s_api_server = os.environ.get("K8S_API_SERVER")
    if k8s_api_server:
        # Copy the default configuration (populated by the load above) so
        # only this client gets the overridden host.
        configuration = k8s_client.Configuration.get_default_copy()
        configuration.host = k8s_api_server
        # Self-signed certs are common for local clusters.
        # NOTE(review): this disables TLS verification for the API server —
        # acceptable for local dev only; confirm before any shared deployment.
        configuration.verify_ssl = False
        api_client = k8s_client.ApiClient(configuration)
        return k8s_client.CoreV1Api(api_client)

    return k8s_client.CoreV1Api()
|
||||
|
||||
|
||||
def _wait_for_kubeconfig(timeout: int = 30) -> None:
    """Block until the kubeconfig file is available.

    Polls every 2 seconds for up to *timeout* seconds.

    Raises:
        RuntimeError: if the file never appears within the timeout.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        if os.path.exists(KUBECONFIG_PATH):
            logger.info(f"Found kubeconfig at {KUBECONFIG_PATH}")
            return
        logger.info(f"Waiting for kubeconfig at {KUBECONFIG_PATH} …")
        time.sleep(2)
    raise RuntimeError(f"Kubeconfig not found at {KUBECONFIG_PATH} after {timeout}s")
|
||||
|
||||
|
||||
def _ensure_namespace() -> None:
    """Create the K8s namespace if it does not yet exist."""
    try:
        core_v1.read_namespace(K8S_NAMESPACE)
    except ApiException as exc:
        # Anything other than "not found" is a real API problem — surface it.
        if exc.status != 404:
            raise
        # Namespace is missing: create it with the standard app labels.
        metadata = k8s_client.V1ObjectMeta(
            name=K8S_NAMESPACE,
            labels={
                "app.kubernetes.io/name": "deer-flow",
                "app.kubernetes.io/component": "sandbox",
            },
        )
        core_v1.create_namespace(k8s_client.V1Namespace(metadata=metadata))
        logger.info(f"Created namespace '{K8S_NAMESPACE}'")
    else:
        logger.info(f"Namespace '{K8S_NAMESPACE}' already exists")
|
||||
|
||||
|
||||
# ── FastAPI lifespan ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(_app: FastAPI):
    """FastAPI lifespan hook: prepare the shared K8s client before serving.

    Order matters: the kubeconfig must exist before the client can load it,
    and ``core_v1`` must be set before the namespace check can run.
    """
    global core_v1
    _wait_for_kubeconfig()
    core_v1 = _init_k8s_client()
    _ensure_namespace()
    logger.info("Provisioner is ready (using host Kubernetes)")
    yield
|
||||
|
||||
|
||||
# FastAPI application; the lifespan hook above initializes the K8s client
# before the first request is served.
app = FastAPI(title="DeerFlow Sandbox Provisioner", lifespan=lifespan)
|
||||
|
||||
|
||||
# ── Request / Response models ───────────────────────────────────────────
|
||||
|
||||
|
||||
class CreateSandboxRequest(BaseModel):
    """Request body for ``POST /api/sandboxes``."""

    # Unique identifier; becomes part of the Pod/Service names ("sandbox-{id}").
    sandbox_id: str
    # Conversation thread whose user-data directory is mounted into the Pod.
    thread_id: str
|
||||
|
||||
|
||||
class SandboxResponse(BaseModel):
    """API representation of a sandbox's location and lifecycle state."""

    sandbox_id: str
    sandbox_url: str  # Direct access URL, e.g. http://host.docker.internal:{NodePort}
    # Pod phase: Pending / Running / Succeeded / Failed / Unknown, or "NotFound".
    status: str
|
||||
|
||||
|
||||
# ── K8s resource helpers ─────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _pod_name(sandbox_id: str) -> str:
|
||||
return f"sandbox-{sandbox_id}"
|
||||
|
||||
|
||||
def _svc_name(sandbox_id: str) -> str:
|
||||
return f"sandbox-{sandbox_id}-svc"
|
||||
|
||||
|
||||
def _sandbox_url(node_port: int) -> str:
    """Build the sandbox URL using the configured NODE_HOST."""
    return "http://" + NODE_HOST + ":" + str(node_port)
|
||||
|
||||
|
||||
def _build_pod(sandbox_id: str, thread_id: str) -> k8s_client.V1Pod:
    """Construct a Pod manifest for a single sandbox.

    The ``sandbox-id`` label is what the companion NodePort Service selects
    on (see ``_build_service``), so it must stay in sync with that selector.
    """
    return k8s_client.V1Pod(
        metadata=k8s_client.V1ObjectMeta(
            name=_pod_name(sandbox_id),
            namespace=K8S_NAMESPACE,
            labels={
                "app": "deer-flow-sandbox",
                "sandbox-id": sandbox_id,
                "app.kubernetes.io/name": "deer-flow",
                "app.kubernetes.io/component": "sandbox",
            },
        ),
        spec=k8s_client.V1PodSpec(
            containers=[
                k8s_client.V1Container(
                    name="sandbox",
                    image=SANDBOX_IMAGE,
                    # Reuse a locally cached image instead of pulling each time.
                    image_pull_policy="IfNotPresent",
                    ports=[
                        k8s_client.V1ContainerPort(
                            name="http",
                            container_port=8080,
                            protocol="TCP",
                        )
                    ],
                    # Both probes hit the sandbox's own /v1/sandbox endpoint;
                    # readiness gates traffic, liveness restarts a hung container.
                    readiness_probe=k8s_client.V1Probe(
                        http_get=k8s_client.V1HTTPGetAction(
                            path="/v1/sandbox",
                            port=8080,
                        ),
                        initial_delay_seconds=5,
                        period_seconds=5,
                        timeout_seconds=3,
                        failure_threshold=3,
                    ),
                    liveness_probe=k8s_client.V1Probe(
                        http_get=k8s_client.V1HTTPGetAction(
                            path="/v1/sandbox",
                            port=8080,
                        ),
                        initial_delay_seconds=10,
                        period_seconds=10,
                        timeout_seconds=3,
                        failure_threshold=3,
                    ),
                    # Caps per-sandbox resource use so one runaway sandbox
                    # cannot exhaust the node.
                    resources=k8s_client.V1ResourceRequirements(
                        requests={
                            "cpu": "100m",
                            "memory": "256Mi",
                            "ephemeral-storage": "500Mi",
                        },
                        limits={
                            "cpu": "1000m",
                            "memory": "1Gi",
                            "ephemeral-storage": "500Mi",
                        },
                    ),
                    volume_mounts=[
                        # Shared skills are read-only inside the sandbox.
                        k8s_client.V1VolumeMount(
                            name="skills",
                            mount_path="/mnt/skills",
                            read_only=True,
                        ),
                        # Thread-scoped workspace is writable.
                        k8s_client.V1VolumeMount(
                            name="user-data",
                            mount_path="/mnt/user-data",
                            read_only=False,
                        ),
                    ],
                    # NOTE(review): privileged=False but
                    # allow_privilege_escalation=True — escalation via setuid
                    # binaries remains possible; confirm this is intentional.
                    security_context=k8s_client.V1SecurityContext(
                        privileged=False,
                        allow_privilege_escalation=True,
                    ),
                )
            ],
            volumes=[
                # "Directory" type: fail fast if the skills path is missing.
                k8s_client.V1Volume(
                    name="skills",
                    host_path=k8s_client.V1HostPathVolumeSource(
                        path=SKILLS_HOST_PATH,
                        type="Directory",
                    ),
                ),
                # "DirectoryOrCreate": the per-thread workspace is created on
                # the node on first use.
                k8s_client.V1Volume(
                    name="user-data",
                    host_path=k8s_client.V1HostPathVolumeSource(
                        path=f"{THREADS_HOST_PATH}/{thread_id}/user-data",
                        type="DirectoryOrCreate",
                    ),
                ),
            ],
            restart_policy="Always",
        ),
    )
|
||||
|
||||
|
||||
def _build_service(sandbox_id: str) -> k8s_client.V1Service:
    """Construct a NodePort Service manifest (port auto-allocated by K8s).

    The selector matches the ``sandbox-id`` label set by ``_build_pod``; the
    allocated NodePort is later read back by ``_get_node_port``.
    """
    return k8s_client.V1Service(
        metadata=k8s_client.V1ObjectMeta(
            name=_svc_name(sandbox_id),
            namespace=K8S_NAMESPACE,
            labels={
                "app": "deer-flow-sandbox",
                "sandbox-id": sandbox_id,
                "app.kubernetes.io/name": "deer-flow",
                "app.kubernetes.io/component": "sandbox",
            },
        ),
        spec=k8s_client.V1ServiceSpec(
            type="NodePort",
            ports=[
                # The "http" name is significant: _get_node_port looks it up.
                k8s_client.V1ServicePort(
                    name="http",
                    port=8080,
                    target_port=8080,
                    protocol="TCP",
                    # nodePort omitted → K8s auto-allocates from the range
                )
            ],
            selector={
                "sandbox-id": sandbox_id,
            },
        ),
    )
|
||||
|
||||
|
||||
def _get_node_port(sandbox_id: str) -> int | None:
    """Read the K8s-allocated NodePort from the sandbox Service, or None."""
    try:
        svc = core_v1.read_namespaced_service(_svc_name(sandbox_id), K8S_NAMESPACE)
    except ApiException:
        # Service missing (or API error) — treat as "no port known".
        return None
    ports = svc.spec.ports or []
    return next((p.node_port for p in ports if p.name == "http"), None)
|
||||
|
||||
|
||||
def _get_pod_phase(sandbox_id: str) -> str:
    """Return the Pod phase (Pending / Running / Succeeded / Failed / Unknown)."""
    try:
        pod = core_v1.read_namespaced_pod(_pod_name(sandbox_id), K8S_NAMESPACE)
    except ApiException:
        # Pod does not exist (or API error) — report it as missing.
        return "NotFound"
    return pod.status.phase or "Unknown"
|
||||
|
||||
|
||||
# ── API endpoints ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@app.get("/health")
async def health():
    """Liveness check for the provisioner process itself."""
    return dict(status="ok")
|
||||
|
||||
|
||||
@app.post("/api/sandboxes", response_model=SandboxResponse)
async def create_sandbox(req: CreateSandboxRequest):
    """Create a sandbox Pod + NodePort Service for *sandbox_id*.

    If the sandbox already exists, returns the existing information
    (idempotent).

    Raises:
        HTTPException(500): if Pod/Service creation fails, or K8s never
            allocates a NodePort within the polling window.
    """
    sandbox_id = req.sandbox_id
    thread_id = req.thread_id

    logger.info(
        f"Received request to create sandbox '{sandbox_id}' for thread '{thread_id}'"
    )

    # ── Fast path: sandbox already exists ────────────────────────────
    existing_port = _get_node_port(sandbox_id)
    if existing_port:
        return SandboxResponse(
            sandbox_id=sandbox_id,
            sandbox_url=_sandbox_url(existing_port),
            status=_get_pod_phase(sandbox_id),
        )

    # ── Create Pod ───────────────────────────────────────────────────
    try:
        core_v1.create_namespaced_pod(K8S_NAMESPACE, _build_pod(sandbox_id, thread_id))
        logger.info(f"Created Pod {_pod_name(sandbox_id)}")
    except ApiException as exc:
        # 409 = AlreadyExists: a concurrent request won the race — fine.
        if exc.status != 409:
            raise HTTPException(
                status_code=500, detail=f"Pod creation failed: {exc.reason}"
            ) from exc

    # ── Create Service ───────────────────────────────────────────────
    try:
        core_v1.create_namespaced_service(K8S_NAMESPACE, _build_service(sandbox_id))
        logger.info(f"Created Service {_svc_name(sandbox_id)}")
    except ApiException as exc:
        if exc.status != 409:
            # Roll back the Pod so we don't leak an unreachable sandbox.
            try:
                core_v1.delete_namespaced_pod(_pod_name(sandbox_id), K8S_NAMESPACE)
            except ApiException:
                pass  # best-effort rollback
            raise HTTPException(
                status_code=500, detail=f"Service creation failed: {exc.reason}"
            ) from exc

    # ── Read the auto-allocated NodePort ─────────────────────────────
    # Poll briefly: the API server assigns the NodePort asynchronously.
    # Use asyncio.sleep (NOT time.sleep): a blocking sleep inside this
    # async handler would stall the whole event loop — every concurrent
    # request — for up to 10 seconds.
    node_port: int | None = None
    for _ in range(20):
        node_port = _get_node_port(sandbox_id)
        if node_port:
            break
        await asyncio.sleep(0.5)

    if not node_port:
        raise HTTPException(
            status_code=500, detail="NodePort was not allocated in time"
        )

    return SandboxResponse(
        sandbox_id=sandbox_id,
        sandbox_url=_sandbox_url(node_port),
        status=_get_pod_phase(sandbox_id),
    )
|
||||
|
||||
|
||||
@app.delete("/api/sandboxes/{sandbox_id}")
async def destroy_sandbox(sandbox_id: str):
    """Destroy a sandbox Pod + Service.

    Missing resources (404) count as already gone; any other K8s failure is
    collected and reported as a single 500 "partial cleanup" error.
    """
    errors: list[str] = []

    cleanup_steps = (
        ("Service", "service", _svc_name(sandbox_id), core_v1.delete_namespaced_service),
        ("Pod", "pod", _pod_name(sandbox_id), core_v1.delete_namespaced_pod),
    )
    for display_kind, err_kind, resource_name, delete_fn in cleanup_steps:
        try:
            delete_fn(resource_name, K8S_NAMESPACE)
            logger.info(f"Deleted {display_kind} {resource_name}")
        except ApiException as exc:
            if exc.status != 404:
                errors.append(f"{err_kind}: {exc.reason}")

    if errors:
        raise HTTPException(
            status_code=500, detail=f"Partial cleanup: {', '.join(errors)}"
        )

    return {"ok": True, "sandbox_id": sandbox_id}
|
||||
|
||||
|
||||
@app.get("/api/sandboxes/{sandbox_id}", response_model=SandboxResponse)
async def get_sandbox(sandbox_id: str):
    """Return current status and URL for a sandbox."""
    port = _get_node_port(sandbox_id)
    if not port:
        # No Service → the sandbox was never created or has been destroyed.
        raise HTTPException(status_code=404, detail=f"Sandbox '{sandbox_id}' not found")
    return SandboxResponse(
        sandbox_id=sandbox_id,
        status=_get_pod_phase(sandbox_id),
        sandbox_url=_sandbox_url(port),
    )
|
||||
|
||||
|
||||
@app.get("/api/sandboxes")
async def list_sandboxes():
    """List every sandbox currently managed in the namespace.

    Returns:
        dict with ``sandboxes`` (list of SandboxResponse) and ``count``.

    Raises:
        HTTPException(500): if the Service listing itself fails.
    """
    try:
        services = core_v1.list_namespaced_service(
            K8S_NAMESPACE,
            label_selector="app=deer-flow-sandbox",
        )
    except ApiException as exc:
        # Chain the cause so the underlying K8s error survives in tracebacks
        # (the original `raise` here dropped the exception context).
        raise HTTPException(
            status_code=500, detail=f"Failed to list services: {exc.reason}"
        ) from exc

    sandboxes: list[SandboxResponse] = []
    for svc in services.items:
        sid = (svc.metadata.labels or {}).get("sandbox-id")
        if not sid:
            # Not one of ours — the label selector should prevent this,
            # but be defensive about unlabeled matches.
            continue
        # The NodePort lives on the port entry named "http" (see _build_service).
        node_port = next(
            (p.node_port for p in svc.spec.ports or [] if p.name == "http"),
            None,
        )
        if node_port:
            sandboxes.append(
                SandboxResponse(
                    sandbox_id=sid,
                    sandbox_url=_sandbox_url(node_port),
                    status=_get_pod_phase(sid),
                )
            )

    return {"sandboxes": sandboxes, "count": len(sandboxes)}
|
||||
Reference in New Issue
Block a user