mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-03 06:12:14 +08:00
feat: citations prompts, path_utils, and citation code cleanup
- Prompt: add citation reminders for web_search and subagent synthesis (lead_agent, general_purpose)
- Gateway: add path_utils for shared thread virtual path resolution; refactor artifacts and skills to use it
- Citations: simplify removeAllCitations (single parse); backend _extract_citation_urls and remove_citations_block cleanup

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -267,6 +267,7 @@ The key AI trends for 2026 include enhanced reasoning capabilities and multimoda
|
||||
|
||||
<critical_reminders>
|
||||
- **Clarification First**: ALWAYS clarify unclear/missing/ambiguous requirements BEFORE starting work - never assume or guess
|
||||
- **Web search citations**: When you use web_search (or synthesize subagent results that used it), you MUST output the `<citations>` block and [Title](url) links as specified in citations_format so citations display for the user.
|
||||
{subagent_reminder}- Skill First: Always load the relevant skill before starting **complex** tasks.
|
||||
- Progressive Loading: Load resources incrementally as referenced in skills
|
||||
- Output Files: Final deliverables must be in `/mnt/user-data/outputs`
|
||||
@@ -340,6 +341,7 @@ def apply_prompt_template(subagent_enabled: bool = False) -> str:
|
||||
# Add subagent reminder to critical_reminders if enabled
|
||||
subagent_reminder = (
|
||||
"- **Orchestrator Mode**: You are a task orchestrator - decompose complex tasks into parallel sub-tasks and launch multiple subagents simultaneously. Synthesize results, don't execute directly.\n"
|
||||
"- **Citations when synthesizing**: When you synthesize subagent results that used web search or cite sources, you MUST include a consolidated `<citations>` block (JSONL format) and use [Title](url) markdown links in your response so citations display correctly.\n"
|
||||
if subagent_enabled
|
||||
else ""
|
||||
)
|
||||
|
||||
44
backend/src/gateway/path_utils.py
Normal file
44
backend/src/gateway/path_utils.py
Normal file
@@ -0,0 +1,44 @@
|
||||
"""Shared path resolution for thread virtual paths (e.g. mnt/user-data/outputs/...)."""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import HTTPException
|
||||
|
||||
from src.agents.middlewares.thread_data_middleware import THREAD_DATA_BASE_DIR
|
||||
|
||||
# Virtual path prefix used in sandbox environments (without leading slash for URL path matching)
|
||||
VIRTUAL_PATH_PREFIX = "mnt/user-data"
|
||||
|
||||
|
||||
def resolve_thread_virtual_path(thread_id: str, virtual_path: str) -> Path:
    """Resolve a virtual path to the actual filesystem path under thread user-data.

    Args:
        thread_id: The thread ID.
        virtual_path: The virtual path (e.g., mnt/user-data/outputs/file.txt).
            Leading slashes are stripped.

    Returns:
        The resolved filesystem path.

    Raises:
        HTTPException: If the path is invalid or outside allowed directories.
    """
    virtual_path = virtual_path.lstrip("/")
    # Require a component boundary after the prefix so e.g. "mnt/user-dataX/f"
    # is rejected instead of being silently remapped under the base dir.
    if virtual_path != VIRTUAL_PATH_PREFIX and not virtual_path.startswith(VIRTUAL_PATH_PREFIX + "/"):
        raise HTTPException(status_code=400, detail=f"Path must start with /{VIRTUAL_PATH_PREFIX}")
    relative_path = virtual_path[len(VIRTUAL_PATH_PREFIX) :].lstrip("/")

    base_dir = Path(os.getcwd()) / THREAD_DATA_BASE_DIR / thread_id / "user-data"
    actual_path = base_dir / relative_path

    # Security check: the resolved path must stay inside the thread's user-data dir.
    try:
        actual_path = actual_path.resolve()
        base_resolved = base_dir.resolve()
        # Path.is_relative_to is component-aware; a plain str.startswith check
        # would wrongly accept sibling dirs such as ".../user-data-evil".
        if not actual_path.is_relative_to(base_resolved):
            raise HTTPException(status_code=403, detail="Access denied: path traversal detected")
    except (ValueError, RuntimeError):
        raise HTTPException(status_code=400, detail="Invalid path") from None

    return actual_path
|
||||
@@ -1,5 +1,5 @@
|
||||
import json
|
||||
import mimetypes
|
||||
import os
|
||||
import re
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
@@ -8,49 +8,11 @@ from urllib.parse import quote
|
||||
from fastapi import APIRouter, HTTPException, Request, Response
|
||||
from fastapi.responses import FileResponse, HTMLResponse, PlainTextResponse
|
||||
|
||||
# Base directory for thread data (relative to backend/)
|
||||
THREAD_DATA_BASE_DIR = ".deer-flow/threads"
|
||||
|
||||
# Virtual path prefix used in sandbox environments (without leading slash for URL path matching)
|
||||
VIRTUAL_PATH_PREFIX = "mnt/user-data"
|
||||
from src.gateway.path_utils import resolve_thread_virtual_path
|
||||
|
||||
router = APIRouter(prefix="/api", tags=["artifacts"])
|
||||
|
||||
|
||||
def _resolve_artifact_path(thread_id: str, artifact_path: str) -> Path:
    """Resolve a virtual artifact path to the actual filesystem path.

    Args:
        thread_id: The thread ID.
        artifact_path: The virtual path (e.g., mnt/user-data/outputs/file.txt).

    Returns:
        The resolved filesystem path.

    Raises:
        HTTPException: If the path is invalid or outside allowed directories.
    """
    # Validate and remove virtual path prefix; require a component boundary so
    # e.g. "mnt/user-dataX/f" is rejected rather than silently remapped.
    if artifact_path != VIRTUAL_PATH_PREFIX and not artifact_path.startswith(VIRTUAL_PATH_PREFIX + "/"):
        raise HTTPException(status_code=400, detail=f"Path must start with /{VIRTUAL_PATH_PREFIX}")
    relative_path = artifact_path[len(VIRTUAL_PATH_PREFIX) :].lstrip("/")

    # Build the actual path
    base_dir = Path(os.getcwd()) / THREAD_DATA_BASE_DIR / thread_id / "user-data"
    actual_path = base_dir / relative_path

    # Security check: ensure the path is within the thread's user-data directory
    try:
        actual_path = actual_path.resolve()
        base_dir = base_dir.resolve()
        # Component-aware containment; str.startswith would wrongly accept
        # sibling directories such as ".../user-data-evil".
        if not actual_path.is_relative_to(base_dir):
            raise HTTPException(status_code=403, detail="Access denied: path traversal detected")
    except (ValueError, RuntimeError):
        raise HTTPException(status_code=400, detail="Invalid path") from None

    return actual_path
|
||||
|
||||
|
||||
def is_text_file_by_content(path: Path, sample_size: int = 8192) -> bool:
|
||||
"""Check if file is text by examining content for null bytes."""
|
||||
try:
|
||||
@@ -62,66 +24,38 @@ def is_text_file_by_content(path: Path, sample_size: int = 8192) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
def remove_citations_block(content: str) -> str:
|
||||
"""Remove ALL citations from markdown content.
|
||||
|
||||
Removes:
|
||||
- <citations>...</citations> blocks (complete and incomplete)
|
||||
- [cite-N] references
|
||||
- Citation markdown links that were converted from [cite-N]
|
||||
|
||||
This is used for downloads to provide clean markdown without any citation references.
|
||||
|
||||
Args:
|
||||
content: The markdown content that may contain citations blocks.
|
||||
|
||||
Returns:
|
||||
Clean content with all citations completely removed.
|
||||
"""
|
||||
if not content:
|
||||
return content
|
||||
|
||||
result = content
|
||||
|
||||
# Step 1: Parse and extract citation URLs before removing blocks
|
||||
citation_urls = set()
|
||||
citations_pattern = r'<citations>([\s\S]*?)</citations>'
|
||||
for match in re.finditer(citations_pattern, content):
|
||||
citations_block = match.group(1)
|
||||
# Extract URLs from JSON lines
|
||||
import json
|
||||
for line in citations_block.split('\n'):
|
||||
def _extract_citation_urls(content: str) -> set[str]:
|
||||
"""Extract URLs from <citations> JSONL blocks. Format must match frontend core/citations/utils.ts."""
|
||||
urls: set[str] = set()
|
||||
for match in re.finditer(r"<citations>([\s\S]*?)</citations>", content):
|
||||
for line in match.group(1).split("\n"):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
if line.startswith("{"):
|
||||
try:
|
||||
citation = json.loads(line)
|
||||
if 'url' in citation:
|
||||
citation_urls.add(citation['url'])
|
||||
obj = json.loads(line)
|
||||
if "url" in obj:
|
||||
urls.add(obj["url"])
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
pass
|
||||
|
||||
# Step 2: Remove complete citations blocks
|
||||
result = re.sub(r'<citations>[\s\S]*?</citations>', '', result)
|
||||
|
||||
# Step 3: Remove incomplete citations blocks (at end of content during streaming)
|
||||
return urls
|
||||
|
||||
|
||||
def remove_citations_block(content: str) -> str:
|
||||
"""Remove ALL citations from markdown (blocks, [cite-N], and citation links). Used for downloads."""
|
||||
if not content:
|
||||
return content
|
||||
|
||||
citation_urls = _extract_citation_urls(content)
|
||||
|
||||
result = re.sub(r"<citations>[\s\S]*?</citations>", "", content)
|
||||
if "<citations>" in result:
|
||||
result = re.sub(r'<citations>[\s\S]*$', '', result)
|
||||
|
||||
# Step 4: Remove all [cite-N] references
|
||||
result = re.sub(r'\[cite-\d+\]', '', result)
|
||||
|
||||
# Step 5: Remove markdown links that point to citation URLs
|
||||
# Pattern: [text](url)
|
||||
if citation_urls:
|
||||
for url in citation_urls:
|
||||
# Escape special regex characters in URL
|
||||
escaped_url = re.escape(url)
|
||||
result = re.sub(rf'\[[^\]]+\]\({escaped_url}\)', '', result)
|
||||
|
||||
# Step 6: Clean up extra whitespace and newlines
|
||||
result = re.sub(r'\n{3,}', '\n\n', result) # Replace 3+ newlines with 2
|
||||
|
||||
return result.strip()
|
||||
result = re.sub(r"<citations>[\s\S]*$", "", result)
|
||||
result = re.sub(r"\[cite-\d+\]", "", result)
|
||||
|
||||
for url in citation_urls:
|
||||
result = re.sub(rf"\[[^\]]+\]\({re.escape(url)}\)", "", result)
|
||||
|
||||
return re.sub(r"\n{3,}", "\n\n", result).strip()
|
||||
|
||||
|
||||
def _extract_file_from_skill_archive(zip_path: Path, internal_path: str) -> bytes | None:
|
||||
@@ -200,7 +134,7 @@ async def get_artifact(thread_id: str, path: str, request: Request) -> FileRespo
|
||||
skill_file_path = path[: marker_pos + len(".skill")] # e.g., "mnt/user-data/outputs/my-skill.skill"
|
||||
internal_path = path[marker_pos + len(skill_marker) :] # e.g., "SKILL.md"
|
||||
|
||||
actual_skill_path = _resolve_artifact_path(thread_id, skill_file_path)
|
||||
actual_skill_path = resolve_thread_virtual_path(thread_id, skill_file_path)
|
||||
|
||||
if not actual_skill_path.exists():
|
||||
raise HTTPException(status_code=404, detail=f"Skill file not found: {skill_file_path}")
|
||||
@@ -226,7 +160,7 @@ async def get_artifact(thread_id: str, path: str, request: Request) -> FileRespo
|
||||
except UnicodeDecodeError:
|
||||
return Response(content=content, media_type=mime_type or "application/octet-stream", headers=cache_headers)
|
||||
|
||||
actual_path = _resolve_artifact_path(thread_id, path)
|
||||
actual_path = resolve_thread_virtual_path(thread_id, path)
|
||||
|
||||
if not actual_path.exists():
|
||||
raise HTTPException(status_code=404, detail=f"Artifact not found: {path}")
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import tempfile
|
||||
@@ -12,6 +11,7 @@ from fastapi import APIRouter, HTTPException
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from src.config.extensions_config import ExtensionsConfig, SkillStateConfig, get_extensions_config, reload_extensions_config
|
||||
from src.gateway.path_utils import resolve_thread_virtual_path
|
||||
from src.skills import Skill, load_skills
|
||||
from src.skills.loader import get_skills_root_path
|
||||
|
||||
@@ -56,53 +56,10 @@ class SkillInstallResponse(BaseModel):
|
||||
message: str = Field(..., description="Installation result message")
|
||||
|
||||
|
||||
# Base directory for thread data (relative to backend/)
|
||||
THREAD_DATA_BASE_DIR = ".deer-flow/threads"
|
||||
|
||||
# Virtual path prefix used in sandbox environments (without leading slash for URL path matching)
|
||||
VIRTUAL_PATH_PREFIX = "mnt/user-data"
|
||||
|
||||
# Allowed properties in SKILL.md frontmatter
|
||||
ALLOWED_FRONTMATTER_PROPERTIES = {"name", "description", "license", "allowed-tools", "metadata"}
|
||||
|
||||
|
||||
def _resolve_skill_file_path(thread_id: str, virtual_path: str) -> Path:
    """Resolve a virtual skill file path to the actual filesystem path.

    Args:
        thread_id: The thread ID.
        virtual_path: The virtual path (e.g., mnt/user-data/outputs/my-skill.skill).

    Returns:
        The resolved filesystem path.

    Raises:
        HTTPException: If the path is invalid or outside allowed directories.
    """
    # Remove leading slash if present
    virtual_path = virtual_path.lstrip("/")

    # Validate and remove virtual path prefix; require a component boundary so
    # e.g. "mnt/user-dataX/f" is rejected rather than silently remapped.
    if virtual_path != VIRTUAL_PATH_PREFIX and not virtual_path.startswith(VIRTUAL_PATH_PREFIX + "/"):
        raise HTTPException(status_code=400, detail=f"Path must start with /{VIRTUAL_PATH_PREFIX}")
    relative_path = virtual_path[len(VIRTUAL_PATH_PREFIX) :].lstrip("/")

    # Build the actual path
    base_dir = Path(os.getcwd()) / THREAD_DATA_BASE_DIR / thread_id / "user-data"
    actual_path = base_dir / relative_path

    # Security check: ensure the path is within the thread's user-data directory
    try:
        actual_path = actual_path.resolve()
        base_dir_resolved = base_dir.resolve()
        # Component-aware containment; str.startswith would wrongly accept
        # sibling directories such as ".../user-data-evil".
        if not actual_path.is_relative_to(base_dir_resolved):
            raise HTTPException(status_code=403, detail="Access denied: path traversal detected")
    except (ValueError, RuntimeError):
        raise HTTPException(status_code=400, detail="Invalid path") from None

    return actual_path
|
||||
|
||||
|
||||
def _validate_skill_frontmatter(skill_dir: Path) -> tuple[bool, str, str | None]:
|
||||
"""Validate a skill directory's SKILL.md frontmatter.
|
||||
|
||||
@@ -414,7 +371,7 @@ async def install_skill(request: SkillInstallRequest) -> SkillInstallResponse:
|
||||
"""
|
||||
try:
|
||||
# Resolve the virtual path to actual file path
|
||||
skill_file_path = _resolve_skill_file_path(request.thread_id, request.path)
|
||||
skill_file_path = resolve_thread_virtual_path(request.thread_id, request.path)
|
||||
|
||||
# Check if file exists
|
||||
if not skill_file_path.exists():
|
||||
|
||||
@@ -24,10 +24,21 @@ Do NOT use for simple, single-step operations.""",
|
||||
- Do NOT ask for clarification - work with the information provided
|
||||
</guidelines>
|
||||
|
||||
<citations_format>
|
||||
If you used web_search (or similar) and cite sources, ALWAYS include citations in your output:
|
||||
1. Start with a `<citations>` block in JSONL format listing all sources (one JSON object per line)
|
||||
2. In content, use FULL markdown link format: [Short Title](full_url)
|
||||
- Every citation MUST be a complete markdown link with URL: [Title](https://...)
|
||||
- Example block:
|
||||
<citations>
|
||||
{"id": "cite-1", "title": "...", "url": "https://...", "snippet": "..."}
|
||||
</citations>
|
||||
</citations_format>
|
||||
|
||||
<output_format>
|
||||
When you complete the task, provide:
|
||||
1. A brief summary of what was accomplished
|
||||
2. Key findings or results
|
||||
2. Key findings or results (with citation links when from web search)
|
||||
3. Any relevant file paths, data, or artifacts created
|
||||
4. Issues encountered (if any)
|
||||
</output_format>
|
||||
|
||||
@@ -187,44 +187,25 @@ export function isCitationsBlockIncomplete(content: string): boolean {
|
||||
/**
|
||||
* Remove ALL citations from content, including:
|
||||
* - <citations> blocks
|
||||
* - [cite-N] references
|
||||
* - Citation markdown links that were converted from [cite-N]
|
||||
*
|
||||
* This is used for copy/download operations where we want clean content without any references.
|
||||
* - [cite-N] references (and their converted markdown links)
|
||||
*
|
||||
* Uses parseCitations once, then strips citation links from cleanContent.
|
||||
* Used for copy/download to produce content without any citation references.
|
||||
*
|
||||
* @param content - The raw content that may contain citations
|
||||
* @returns Content with all citations completely removed
|
||||
*/
|
||||
export function removeAllCitations(content: string): string {
|
||||
if (!content) {
|
||||
return content;
|
||||
}
|
||||
if (!content) return content;
|
||||
|
||||
// Step 1: Remove all <citations> blocks (complete and incomplete)
|
||||
let result = removeCitationsBlocks(content);
|
||||
|
||||
// Step 2: Remove all [cite-N] references
|
||||
result = result.replace(/\[cite-\d+\]/g, "");
|
||||
|
||||
// Step 3: Parse to find citation URLs and remove those specific links
|
||||
const parsed = parseCitations(content);
|
||||
const citationUrls = new Set(parsed.citations.map(c => c.url));
|
||||
|
||||
// Remove markdown links that point to citation URLs
|
||||
// Pattern: [text](url)
|
||||
result = result.replace(/\[([^\]]+)\]\(([^)]+)\)/g, (match, text, url) => {
|
||||
// If this URL is a citation, remove the entire link
|
||||
if (citationUrls.has(url)) {
|
||||
return "";
|
||||
}
|
||||
// Keep non-citation links
|
||||
return match;
|
||||
});
|
||||
const citationUrls = new Set(parsed.citations.map((c) => c.url));
|
||||
|
||||
// Step 4: Clean up extra whitespace and newlines
|
||||
result = result
|
||||
.replace(/\n{3,}/g, "\n\n") // Replace 3+ newlines with 2
|
||||
.trim();
|
||||
// Remove markdown links that point to citation URLs; keep non-citation links
|
||||
const withoutLinks = parsed.cleanContent.replace(
|
||||
/\[([^\]]+)\]\(([^)]+)\)/g,
|
||||
(fullMatch, _text, url) => (citationUrls.has(url) ? "" : fullMatch),
|
||||
);
|
||||
|
||||
return result;
|
||||
return withoutLinks.replace(/\n{3,}/g, "\n\n").trim();
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user