feat: citations prompts, path_utils, and citation code cleanup

- Prompt: add citation reminders for web_search and subagent synthesis (lead_agent, general_purpose)
- Gateway: add path_utils for shared thread virtual path resolution; refactor artifacts and skills to use it
- Citations: simplify removeAllCitations (single parse); backend _extract_citation_urls and remove_citations_block cleanup

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
ruitanglin
2026-02-09 12:55:12 +08:00
parent 2b10b97bb9
commit eb5782b93b
6 changed files with 103 additions and 174 deletions

View File

@@ -267,6 +267,7 @@ The key AI trends for 2026 include enhanced reasoning capabilities and multimoda
<critical_reminders>
- **Clarification First**: ALWAYS clarify unclear/missing/ambiguous requirements BEFORE starting work - never assume or guess
- **Web search citations**: When you use web_search (or synthesize subagent results that used it), you MUST output the `<citations>` block and [Title](url) links as specified in citations_format so citations display for the user.
{subagent_reminder}- Skill First: Always load the relevant skill before starting **complex** tasks.
- Progressive Loading: Load resources incrementally as referenced in skills
- Output Files: Final deliverables must be in `/mnt/user-data/outputs`
@@ -340,6 +341,7 @@ def apply_prompt_template(subagent_enabled: bool = False) -> str:
# Add subagent reminder to critical_reminders if enabled
subagent_reminder = (
"- **Orchestrator Mode**: You are a task orchestrator - decompose complex tasks into parallel sub-tasks and launch multiple subagents simultaneously. Synthesize results, don't execute directly.\n"
"- **Citations when synthesizing**: When you synthesize subagent results that used web search or cite sources, you MUST include a consolidated `<citations>` block (JSONL format) and use [Title](url) markdown links in your response so citations display correctly.\n"
if subagent_enabled
else ""
)

View File

@@ -0,0 +1,44 @@
"""Shared path resolution for thread virtual paths (e.g. mnt/user-data/outputs/...)."""
import os
from pathlib import Path
from fastapi import HTTPException
from src.agents.middlewares.thread_data_middleware import THREAD_DATA_BASE_DIR
# Virtual path prefix used in sandbox environments (without leading slash for URL path matching)
VIRTUAL_PATH_PREFIX = "mnt/user-data"
def resolve_thread_virtual_path(thread_id: str, virtual_path: str) -> Path:
    """Resolve a virtual path to the actual filesystem path under thread user-data.

    Args:
        thread_id: The thread ID.
        virtual_path: The virtual path (e.g., mnt/user-data/outputs/file.txt).
            Leading slashes are stripped.

    Returns:
        The resolved filesystem path.

    Raises:
        HTTPException: If the path is invalid or outside allowed directories.
    """
    virtual_path = virtual_path.lstrip("/")
    if not virtual_path.startswith(VIRTUAL_PATH_PREFIX):
        raise HTTPException(status_code=400, detail=f"Path must start with /{VIRTUAL_PATH_PREFIX}")
    relative_path = virtual_path[len(VIRTUAL_PATH_PREFIX) :].lstrip("/")
    base_dir = Path(os.getcwd()) / THREAD_DATA_BASE_DIR / thread_id / "user-data"
    actual_path = base_dir / relative_path
    # Security check: resolve symlinks and ".." segments, then confirm containment.
    try:
        actual_path = actual_path.resolve()
        base_resolved = base_dir.resolve()
        # Use Path.is_relative_to rather than a string startswith() check:
        # a prefix comparison would wrongly accept sibling directories such as
        # ".../user-data-evil" reached via "../user-data-evil" traversal.
        if not actual_path.is_relative_to(base_resolved):
            raise HTTPException(status_code=403, detail="Access denied: path traversal detected")
    except (ValueError, RuntimeError) as exc:
        raise HTTPException(status_code=400, detail="Invalid path") from exc
    return actual_path

View File

@@ -1,5 +1,5 @@
import json
import mimetypes
import os
import re
import zipfile
from pathlib import Path
@@ -8,49 +8,11 @@ from urllib.parse import quote
from fastapi import APIRouter, HTTPException, Request, Response
from fastapi.responses import FileResponse, HTMLResponse, PlainTextResponse
# Base directory for thread data (relative to backend/)
THREAD_DATA_BASE_DIR = ".deer-flow/threads"
# Virtual path prefix used in sandbox environments (without leading slash for URL path matching)
VIRTUAL_PATH_PREFIX = "mnt/user-data"
from src.gateway.path_utils import resolve_thread_virtual_path
router = APIRouter(prefix="/api", tags=["artifacts"])
def _resolve_artifact_path(thread_id: str, artifact_path: str) -> Path:
    """Resolve a virtual artifact path to the actual filesystem path.

    Args:
        thread_id: The thread ID.
        artifact_path: The virtual path (e.g., mnt/user-data/outputs/file.txt).
            Leading slashes are stripped.

    Returns:
        The resolved filesystem path.

    Raises:
        HTTPException: If the path is invalid or outside allowed directories.
    """
    # Accept an optional leading slash for consistency with other resolvers.
    artifact_path = artifact_path.lstrip("/")
    # Validate and remove virtual path prefix
    if not artifact_path.startswith(VIRTUAL_PATH_PREFIX):
        raise HTTPException(status_code=400, detail=f"Path must start with /{VIRTUAL_PATH_PREFIX}")
    relative_path = artifact_path[len(VIRTUAL_PATH_PREFIX) :].lstrip("/")
    # Build the actual path under the thread's user-data directory.
    base_dir = Path(os.getcwd()) / THREAD_DATA_BASE_DIR / thread_id / "user-data"
    actual_path = base_dir / relative_path
    # Security check: resolve symlinks/".." and confirm containment.
    try:
        actual_path = actual_path.resolve()
        base_resolved = base_dir.resolve()
        # Path.is_relative_to avoids the prefix-string pitfall where e.g.
        # ".../user-data-extra" would pass a startswith(".../user-data") check.
        if not actual_path.is_relative_to(base_resolved):
            raise HTTPException(status_code=403, detail="Access denied: path traversal detected")
    except (ValueError, RuntimeError) as exc:
        raise HTTPException(status_code=400, detail="Invalid path") from exc
    return actual_path
def is_text_file_by_content(path: Path, sample_size: int = 8192) -> bool:
"""Check if file is text by examining content for null bytes."""
try:
@@ -62,66 +24,38 @@ def is_text_file_by_content(path: Path, sample_size: int = 8192) -> bool:
return False
def remove_citations_block(content: str) -> str:
"""Remove ALL citations from markdown content.
Removes:
- <citations>...</citations> blocks (complete and incomplete)
- [cite-N] references
- Citation markdown links that were converted from [cite-N]
This is used for downloads to provide clean markdown without any citation references.
Args:
content: The markdown content that may contain citations blocks.
Returns:
Clean content with all citations completely removed.
"""
if not content:
return content
result = content
# Step 1: Parse and extract citation URLs before removing blocks
citation_urls = set()
citations_pattern = r'<citations>([\s\S]*?)</citations>'
for match in re.finditer(citations_pattern, content):
citations_block = match.group(1)
# Extract URLs from JSON lines
import json
for line in citations_block.split('\n'):
def _extract_citation_urls(content: str) -> set[str]:
"""Extract URLs from <citations> JSONL blocks. Format must match frontend core/citations/utils.ts."""
urls: set[str] = set()
for match in re.finditer(r"<citations>([\s\S]*?)</citations>", content):
for line in match.group(1).split("\n"):
line = line.strip()
if line.startswith('{'):
if line.startswith("{"):
try:
citation = json.loads(line)
if 'url' in citation:
citation_urls.add(citation['url'])
obj = json.loads(line)
if "url" in obj:
urls.add(obj["url"])
except (json.JSONDecodeError, ValueError):
pass
# Step 2: Remove complete citations blocks
result = re.sub(r'<citations>[\s\S]*?</citations>', '', result)
# Step 3: Remove incomplete citations blocks (at end of content during streaming)
return urls
def remove_citations_block(content: str) -> str:
"""Remove ALL citations from markdown (blocks, [cite-N], and citation links). Used for downloads."""
if not content:
return content
citation_urls = _extract_citation_urls(content)
result = re.sub(r"<citations>[\s\S]*?</citations>", "", content)
if "<citations>" in result:
result = re.sub(r'<citations>[\s\S]*$', '', result)
# Step 4: Remove all [cite-N] references
result = re.sub(r'\[cite-\d+\]', '', result)
# Step 5: Remove markdown links that point to citation URLs
# Pattern: [text](url)
if citation_urls:
for url in citation_urls:
# Escape special regex characters in URL
escaped_url = re.escape(url)
result = re.sub(rf'\[[^\]]+\]\({escaped_url}\)', '', result)
# Step 6: Clean up extra whitespace and newlines
result = re.sub(r'\n{3,}', '\n\n', result) # Replace 3+ newlines with 2
return result.strip()
result = re.sub(r"<citations>[\s\S]*$", "", result)
result = re.sub(r"\[cite-\d+\]", "", result)
for url in citation_urls:
result = re.sub(rf"\[[^\]]+\]\({re.escape(url)}\)", "", result)
return re.sub(r"\n{3,}", "\n\n", result).strip()
def _extract_file_from_skill_archive(zip_path: Path, internal_path: str) -> bytes | None:
@@ -200,7 +134,7 @@ async def get_artifact(thread_id: str, path: str, request: Request) -> FileRespo
skill_file_path = path[: marker_pos + len(".skill")] # e.g., "mnt/user-data/outputs/my-skill.skill"
internal_path = path[marker_pos + len(skill_marker) :] # e.g., "SKILL.md"
actual_skill_path = _resolve_artifact_path(thread_id, skill_file_path)
actual_skill_path = resolve_thread_virtual_path(thread_id, skill_file_path)
if not actual_skill_path.exists():
raise HTTPException(status_code=404, detail=f"Skill file not found: {skill_file_path}")
@@ -226,7 +160,7 @@ async def get_artifact(thread_id: str, path: str, request: Request) -> FileRespo
except UnicodeDecodeError:
return Response(content=content, media_type=mime_type or "application/octet-stream", headers=cache_headers)
actual_path = _resolve_artifact_path(thread_id, path)
actual_path = resolve_thread_virtual_path(thread_id, path)
if not actual_path.exists():
raise HTTPException(status_code=404, detail=f"Artifact not found: {path}")

View File

@@ -1,6 +1,5 @@
import json
import logging
import os
import re
import shutil
import tempfile
@@ -12,6 +11,7 @@ from fastapi import APIRouter, HTTPException
from pydantic import BaseModel, Field
from src.config.extensions_config import ExtensionsConfig, SkillStateConfig, get_extensions_config, reload_extensions_config
from src.gateway.path_utils import resolve_thread_virtual_path
from src.skills import Skill, load_skills
from src.skills.loader import get_skills_root_path
@@ -56,53 +56,10 @@ class SkillInstallResponse(BaseModel):
message: str = Field(..., description="Installation result message")
# Base directory for thread data (relative to backend/)
THREAD_DATA_BASE_DIR = ".deer-flow/threads"
# Virtual path prefix used in sandbox environments (without leading slash for URL path matching)
VIRTUAL_PATH_PREFIX = "mnt/user-data"
# Allowed properties in SKILL.md frontmatter
ALLOWED_FRONTMATTER_PROPERTIES = {"name", "description", "license", "allowed-tools", "metadata"}
def _resolve_skill_file_path(thread_id: str, virtual_path: str) -> Path:
    """Resolve a virtual skill file path to the actual filesystem path.

    Args:
        thread_id: The thread ID.
        virtual_path: The virtual path (e.g., mnt/user-data/outputs/my-skill.skill).
            Leading slashes are stripped.

    Returns:
        The resolved filesystem path.

    Raises:
        HTTPException: If the path is invalid or outside allowed directories.
    """
    # Remove leading slash if present
    virtual_path = virtual_path.lstrip("/")
    # Validate and remove virtual path prefix
    if not virtual_path.startswith(VIRTUAL_PATH_PREFIX):
        raise HTTPException(status_code=400, detail=f"Path must start with /{VIRTUAL_PATH_PREFIX}")
    relative_path = virtual_path[len(VIRTUAL_PATH_PREFIX) :].lstrip("/")
    # Build the actual path under the thread's user-data directory.
    base_dir = Path(os.getcwd()) / THREAD_DATA_BASE_DIR / thread_id / "user-data"
    actual_path = base_dir / relative_path
    # Security check: resolve symlinks/".." and confirm containment.
    try:
        actual_path = actual_path.resolve()
        base_dir_resolved = base_dir.resolve()
        # Path.is_relative_to avoids the prefix-string pitfall where e.g.
        # ".../user-data-extra" would pass a startswith(".../user-data") check.
        if not actual_path.is_relative_to(base_dir_resolved):
            raise HTTPException(status_code=403, detail="Access denied: path traversal detected")
    except (ValueError, RuntimeError) as exc:
        raise HTTPException(status_code=400, detail="Invalid path") from exc
    return actual_path
def _validate_skill_frontmatter(skill_dir: Path) -> tuple[bool, str, str | None]:
"""Validate a skill directory's SKILL.md frontmatter.
@@ -414,7 +371,7 @@ async def install_skill(request: SkillInstallRequest) -> SkillInstallResponse:
"""
try:
# Resolve the virtual path to actual file path
skill_file_path = _resolve_skill_file_path(request.thread_id, request.path)
skill_file_path = resolve_thread_virtual_path(request.thread_id, request.path)
# Check if file exists
if not skill_file_path.exists():

View File

@@ -24,10 +24,21 @@ Do NOT use for simple, single-step operations.""",
- Do NOT ask for clarification - work with the information provided
</guidelines>
<citations_format>
If you used web_search (or similar) and cite sources, ALWAYS include citations in your output:
1. Start with a `<citations>` block in JSONL format listing all sources (one JSON object per line)
2. In content, use FULL markdown link format: [Short Title](full_url)
- Every citation MUST be a complete markdown link with URL: [Title](https://...)
- Example block:
<citations>
{"id": "cite-1", "title": "...", "url": "https://...", "snippet": "..."}
</citations>
</citations_format>
<output_format>
When you complete the task, provide:
1. A brief summary of what was accomplished
2. Key findings or results
2. Key findings or results (with citation links when from web search)
3. Any relevant file paths, data, or artifacts created
4. Issues encountered (if any)
</output_format>

View File

@@ -187,44 +187,25 @@ export function isCitationsBlockIncomplete(content: string): boolean {
/**
 * Remove ALL citations from content, including:
 * - <citations> blocks
 * - [cite-N] references (and their converted markdown links)
 *
 * Uses parseCitations once, then strips citation links from cleanContent.
 * Used for copy/download to produce content without any citation references.
 *
 * @param content - The raw content that may contain citations
 * @returns Content with all citations completely removed
 */
export function removeAllCitations(content: string): string {
  if (!content) return content;

  // Single parse: cleanContent has <citations> blocks and [cite-N] refs handled,
  // and parsed.citations carries the source URLs to strip below.
  const parsed = parseCitations(content);
  const citationUrls = new Set(parsed.citations.map((c) => c.url));

  // Remove markdown links that point to citation URLs; keep non-citation links.
  const withoutLinks = parsed.cleanContent.replace(
    /\[([^\]]+)\]\(([^)]+)\)/g,
    (fullMatch, _text, url) => (citationUrls.has(url) ? "" : fullMatch),
  );

  // Collapse 3+ consecutive newlines left behind by the removals.
  return withoutLinks.replace(/\n{3,}/g, "\n\n").trim();
}