2026-02-12 11:02:09 +08:00
import logging
2026-01-17 15:09:44 +08:00
import mimetypes
2026-01-31 22:27:06 +08:00
import zipfile
2026-01-16 23:04:38 +08:00
from pathlib import Path
2026-01-29 12:29:13 +08:00
from urllib . parse import quote
2026-01-16 23:04:38 +08:00
2026-02-09 16:24:01 +08:00
from fastapi import APIRouter , HTTPException , Request
2026-03-26 17:44:25 +08:00
from fastapi . responses import FileResponse , PlainTextResponse , Response
2026-01-16 23:04:38 +08:00
refactor: split backend into harness (deerflow.*) and app (app.*) (#1131)
* refactor: extract shared utils to break harness→app cross-layer imports
Move _validate_skill_frontmatter to src/skills/validation.py and
CONVERTIBLE_EXTENSIONS + convert_file_to_markdown to src/utils/file_conversion.py.
This eliminates the two reverse dependencies from client.py (harness layer)
into gateway/routers/ (app layer), preparing for the harness/app package split.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
* refactor: split backend/src into harness (deerflow.*) and app (app.*)
Physically split the monolithic backend/src/ package into two layers:
- **Harness** (`packages/harness/deerflow/`): publishable agent framework
package with import prefix `deerflow.*`. Contains agents, sandbox, tools,
models, MCP, skills, config, and all core infrastructure.
- **App** (`app/`): unpublished application code with import prefix `app.*`.
Contains gateway (FastAPI REST API) and channels (IM integrations).
Key changes:
- Move 13 harness modules to packages/harness/deerflow/ via git mv
- Move gateway + channels to app/ via git mv
- Rename all imports: src.* → deerflow.* (harness) / app.* (app layer)
- Set up uv workspace with deerflow-harness as workspace member
- Update langgraph.json, config.example.yaml, all scripts, Docker files
- Add build-system (hatchling) to harness pyproject.toml
- Add PYTHONPATH=. to gateway startup commands for app.* resolution
- Update ruff.toml with known-first-party for import sorting
- Update all documentation to reflect new directory structure
Boundary rule enforced: harness code never imports from app.
All 429 tests pass. Lint clean.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
* chore: add harness→app boundary check test and update docs
Add test_harness_boundary.py that scans all Python files in
packages/harness/deerflow/ and fails if any `from app.*` or
`import app.*` statement is found. This enforces the architectural
rule that the harness layer never depends on the app layer.
Update CLAUDE.md to document the harness/app split architecture,
import conventions, and the boundary enforcement test.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
* feat: add config versioning with auto-upgrade on startup
When config.example.yaml schema changes, developers' local config.yaml
files can silently become outdated. This adds a config_version field and
auto-upgrade mechanism so breaking changes (like src.* → deerflow.*
renames) are applied automatically before services start.
- Add config_version: 1 to config.example.yaml
- Add startup version check warning in AppConfig.from_file()
- Add scripts/config-upgrade.sh with migration registry for value replacements
- Add `make config-upgrade` target
- Auto-run config-upgrade in serve.sh and start-daemon.sh before starting services
- Add config error hints in service failure messages
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
* fix comments
* fix: update src.* import in test_sandbox_tools_security to deerflow.*
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
* fix: handle empty config and search parent dirs for config.example.yaml
Address Copilot review comments on PR #1131:
- Guard against yaml.safe_load() returning None for empty config files
- Search parent directories for config.example.yaml instead of only
looking next to config.yaml, fixing detection in common setups
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
* fix: correct skills root path depth and config_version type coercion
- loader.py: fix get_skills_root_path() to use 5 parent levels (was 3).
After the harness split the file lives at packages/harness/deerflow/skills/,
so parent×3 resolved to backend/packages/harness/ instead of backend/.
- app_config.py: coerce config_version to int() before comparison in
_check_config_version() to prevent TypeError when YAML stores value
as string (e.g. config_version: "1")
- tests: add regression tests for both fixes
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
* fix: update test imports from src.* to deerflow.*/app.* after harness refactor
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---------
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-14 22:55:52 +08:00
from app . gateway . path_utils import resolve_thread_virtual_path
2026-01-16 23:04:38 +08:00
2026-02-12 11:02:09 +08:00
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Every route registered on this router is mounted under the /api prefix.
router = APIRouter(prefix="/api", tags=["artifacts"])

# MIME types treated as "active" web content. get_artifact always returns
# these as attachments instead of rendering them inline, so that a generated
# artifact cannot run script in the application origin.
ACTIVE_CONTENT_MIME_TYPES = {
    "text/html",
    "application/xhtml+xml",
    "image/svg+xml",
}
def _build_content_disposition ( disposition_type : str , filename : str ) - > str :
""" Build an RFC 5987 encoded Content-Disposition header value. """
return f " { disposition_type } ; filename*=UTF-8 ' ' { quote ( filename ) } "
def _build_attachment_headers(filename: str, extra_headers: dict[str, str] | None = None) -> dict[str, str]:
    """Return response headers that mark the payload as a downloadable attachment.

    Args:
        filename: Name advertised in the Content-Disposition header.
        extra_headers: Optional headers merged on top of the result. An entry
            with the same key replaces the generated one.

    Returns:
        A new dict holding the Content-Disposition header followed by any
        extra headers.
    """
    disposition = _build_content_disposition("attachment", filename)
    return {"Content-Disposition": disposition, **(extra_headers or {})}
2026-01-16 23:04:38 +08:00
2026-01-17 15:09:44 +08:00
def is_text_file_by_content(path: Path, sample_size: int = 8192) -> bool:
    """Guess whether a file is text by looking for NUL bytes in its first bytes.

    Args:
        path: File to inspect.
        sample_size: Maximum number of leading bytes to read.

    Returns:
        True when the sampled prefix contains no NUL byte (an empty file
        therefore counts as text). False when a NUL byte is found or the file
        cannot be read for any reason.
    """
    try:
        with open(path, "rb") as handle:
            sample = handle.read(sample_size)
    except Exception:
        # Best effort: an unreadable file is simply reported as "not text".
        return False
    # Text files are not expected to contain NUL bytes.
    return sample.find(b"\x00") == -1
2026-01-31 22:27:06 +08:00
def _extract_file_from_skill_archive ( zip_path : Path , internal_path : str ) - > bytes | None :
""" Extract a file from a .skill ZIP archive.
Args :
zip_path : Path to the . skill file ( ZIP archive ) .
internal_path : Path to the file inside the archive ( e . g . , " SKILL.md " ) .
Returns :
The file content as bytes , or None if not found .
"""
if not zipfile . is_zipfile ( zip_path ) :
return None
try :
with zipfile . ZipFile ( zip_path , " r " ) as zip_ref :
# List all files in the archive
namelist = zip_ref . namelist ( )
# Try direct path first
if internal_path in namelist :
return zip_ref . read ( internal_path )
# Try with any top-level directory prefix (e.g., "skill-name/SKILL.md")
for name in namelist :
if name . endswith ( " / " + internal_path ) or name == internal_path :
return zip_ref . read ( name )
# Not found
return None
except ( zipfile . BadZipFile , KeyError ) :
return None
2026-01-20 13:20:50 +08:00
@router.get(
    "/threads/{thread_id}/artifacts/{path:path}",
    summary="Get Artifact File",
    description="Retrieve an artifact file generated by the AI agent. Text and binary files can be viewed inline, while active web content is always downloaded.",
)
async def get_artifact(thread_id: str, path: str, request: Request, download: bool = False) -> Response:
    """Get an artifact file by its path.

    The MIME type is guessed from the file name and decides how the content
    is returned. A path containing ".skill/" is treated as a member of a
    .skill ZIP archive: the part up to and including ".skill" names the
    archive and the remainder names the file inside it.

    Args:
        thread_id: The thread ID.
        path: The artifact path with virtual prefix
            (e.g., mnt/user-data/outputs/file.txt).
        request: FastAPI request object (automatically injected; not used by
            this function).
        download: Query parameter. If true, forces an attachment download for
            content that would otherwise be returned inline or as plain text.
            Active HTML/XHTML/SVG content is always downloaded regardless of
            this flag.

    Returns:
        A response chosen by content type:
        - Active content (HTML/XHTML/SVG) or ``download=true``: attachment.
        - ``text/*`` files: plain text with the guessed MIME type. Bytes that
          are not valid UTF-8 are shown as replacement characters.
        - Other files that decode as UTF-8 text: plain text.
        - Everything else: raw bytes (regular artifacts carry an inline
          Content-Disposition header).

    Raises:
        HTTPException:
            - 400 if the path exists but is not a file
            - 404 if the file, the skill archive, or the archive member is
              not found
            - 403 on access denied / path traversal -- presumably raised by
              resolve_thread_virtual_path; not raised directly here (confirm)

    Example:
        - Get text file inline: `/api/threads/abc123/artifacts/mnt/user-data/outputs/notes.txt`
        - Download file: `/api/threads/abc123/artifacts/mnt/user-data/outputs/data.csv?download=true`
        - Active web content such as `.html`, `.xhtml`, and `.svg` artifacts is always downloaded
    """
    # NOTE(review): the file and ZIP reads below are synchronous calls inside
    # an async handler.

    # Request for a file inside a .skill archive (e.g., xxx.skill/SKILL.md).
    skill_marker = ".skill/"
    marker_pos = path.find(skill_marker)
    if marker_pos != -1:
        skill_file_path = path[: marker_pos + len(".skill")]  # e.g., "mnt/user-data/outputs/my-skill.skill"
        internal_path = path[marker_pos + len(skill_marker) :]  # e.g., "SKILL.md"

        actual_skill_path = resolve_thread_virtual_path(thread_id, skill_file_path)

        if not actual_skill_path.exists():
            raise HTTPException(status_code=404, detail=f"Skill file not found: {skill_file_path}")

        if not actual_skill_path.is_file():
            raise HTTPException(status_code=400, detail=f"Path is not a file: {skill_file_path}")

        content = _extract_file_from_skill_archive(actual_skill_path, internal_path)
        if content is None:
            raise HTTPException(status_code=404, detail=f"File '{internal_path}' not found in skill archive")

        # The MIME type is decided by the member name, not the archive name.
        mime_type, _ = mimetypes.guess_type(internal_path)
        # Cache for 5 minutes to avoid repeated ZIP extraction.
        cache_headers = {"Cache-Control": "private, max-age=300"}
        download_name = Path(internal_path).name or actual_skill_path.stem

        if download or mime_type in ACTIVE_CONTENT_MIME_TYPES:
            return Response(content=content, media_type=mime_type or "application/octet-stream", headers=_build_attachment_headers(download_name, cache_headers))

        if mime_type and mime_type.startswith("text/"):
            # The name says "text", so always answer with text: undecodable
            # bytes are replaced instead of raising UnicodeDecodeError (HTTP 500).
            return PlainTextResponse(content=content.decode("utf-8", errors="replace"), media_type=mime_type, headers=cache_headers)

        # Unknown or non-text type: serve as plain text only if it is valid UTF-8.
        try:
            return PlainTextResponse(content=content.decode("utf-8"), media_type="text/plain", headers=cache_headers)
        except UnicodeDecodeError:
            return Response(content=content, media_type=mime_type or "application/octet-stream", headers=cache_headers)

    actual_path = resolve_thread_virtual_path(thread_id, path)

    logger.info(f"Resolving artifact path: thread_id={thread_id}, requested_path={path}, actual_path={actual_path}")

    if not actual_path.exists():
        raise HTTPException(status_code=404, detail=f"Artifact not found: {path}")

    if not actual_path.is_file():
        raise HTTPException(status_code=400, detail=f"Path is not a file: {path}")

    mime_type, _ = mimetypes.guess_type(actual_path)

    # Explicit downloads and active content types are both sent as attachments.
    # Active content is never rendered inline, to prevent script execution in
    # the application origin when users open generated artifacts.
    if download or mime_type in ACTIVE_CONTENT_MIME_TYPES:
        return FileResponse(path=actual_path, filename=actual_path.name, media_type=mime_type, headers=_build_attachment_headers(actual_path.name))

    if mime_type and mime_type.startswith("text/"):
        # Same policy as for archive members: never fail on non-UTF-8 text.
        return PlainTextResponse(content=actual_path.read_text(encoding="utf-8", errors="replace"), media_type=mime_type)

    if is_text_file_by_content(actual_path):
        # The NUL-byte heuristic only samples the start of the file, so the
        # content may still not be UTF-8; in that case fall through and serve
        # the raw bytes instead of failing the request.
        try:
            return PlainTextResponse(content=actual_path.read_text(encoding="utf-8"), media_type=mime_type)
        except UnicodeDecodeError:
            pass

    return Response(content=actual_path.read_bytes(), media_type=mime_type, headers={"Content-Disposition": _build_content_disposition("inline", actual_path.name)})