From 2a399478307bdd827632bdeddf5baf11c1412d22 Mon Sep 17 00:00:00 2001 From: LofiSu Date: Mon, 9 Feb 2026 12:55:12 +0800 Subject: [PATCH] feat: citations prompts, path_utils, and citation code cleanup - Prompt: add citation reminders for web_search and subagent synthesis (lead_agent, general_purpose) - Gateway: add path_utils for shared thread virtual path resolution; refactor artifacts and skills to use it - Citations: simplify removeAllCitations (single parse); backend _extract_citation_urls and remove_citations_block cleanup Co-authored-by: Cursor --- backend/src/agents/lead_agent/prompt.py | 2 + backend/src/gateway/path_utils.py | 44 ++++++ backend/src/gateway/routers/artifacts.py | 128 +++++------------- backend/src/gateway/routers/skills.py | 47 +------ .../src/subagents/builtins/general_purpose.py | 13 +- frontend/src/core/citations/utils.ts | 43 ++---- 6 files changed, 103 insertions(+), 174 deletions(-) create mode 100644 backend/src/gateway/path_utils.py diff --git a/backend/src/agents/lead_agent/prompt.py b/backend/src/agents/lead_agent/prompt.py index 192e788..ce175c2 100644 --- a/backend/src/agents/lead_agent/prompt.py +++ b/backend/src/agents/lead_agent/prompt.py @@ -267,6 +267,7 @@ The key AI trends for 2026 include enhanced reasoning capabilities and multimoda - **Clarification First**: ALWAYS clarify unclear/missing/ambiguous requirements BEFORE starting work - never assume or guess +- **Web search citations**: When you use web_search (or synthesize subagent results that used it), you MUST output the `<citations>` block and [Title](url) links as specified in citations_format so citations display for the user. {subagent_reminder}- Skill First: Always load the relevant skill before starting **complex** tasks. 
- Progressive Loading: Load resources incrementally as referenced in skills - Output Files: Final deliverables must be in `/mnt/user-data/outputs` @@ -340,6 +341,7 @@ def apply_prompt_template(subagent_enabled: bool = False) -> str: # Add subagent reminder to critical_reminders if enabled subagent_reminder = ( "- **Orchestrator Mode**: You are a task orchestrator - decompose complex tasks into parallel sub-tasks and launch multiple subagents simultaneously. Synthesize results, don't execute directly.\n" + "- **Citations when synthesizing**: When you synthesize subagent results that used web search or cite sources, you MUST include a consolidated `<citations>` block (JSONL format) and use [Title](url) markdown links in your response so citations display correctly.\n" if subagent_enabled else "" ) diff --git a/backend/src/gateway/path_utils.py b/backend/src/gateway/path_utils.py new file mode 100644 index 0000000..119752e --- /dev/null +++ b/backend/src/gateway/path_utils.py @@ -0,0 +1,44 @@ +"""Shared path resolution for thread virtual paths (e.g. mnt/user-data/outputs/...).""" + +import os +from pathlib import Path + +from fastapi import HTTPException + +from src.agents.middlewares.thread_data_middleware import THREAD_DATA_BASE_DIR + +# Virtual path prefix used in sandbox environments (without leading slash for URL path matching) +VIRTUAL_PATH_PREFIX = "mnt/user-data" + + +def resolve_thread_virtual_path(thread_id: str, virtual_path: str) -> Path: + """Resolve a virtual path to the actual filesystem path under thread user-data. + + Args: + thread_id: The thread ID. + virtual_path: The virtual path (e.g., mnt/user-data/outputs/file.txt). + Leading slashes are stripped. + + Returns: + The resolved filesystem path. + + Raises: + HTTPException: If the path is invalid or outside allowed directories. 
+ """ + virtual_path = virtual_path.lstrip("/") + if not virtual_path.startswith(VIRTUAL_PATH_PREFIX): + raise HTTPException(status_code=400, detail=f"Path must start with /{VIRTUAL_PATH_PREFIX}") + relative_path = virtual_path[len(VIRTUAL_PATH_PREFIX) :].lstrip("/") + + base_dir = Path(os.getcwd()) / THREAD_DATA_BASE_DIR / thread_id / "user-data" + actual_path = base_dir / relative_path + + try: + actual_path = actual_path.resolve() + base_resolved = base_dir.resolve() + if not str(actual_path).startswith(str(base_resolved)): + raise HTTPException(status_code=403, detail="Access denied: path traversal detected") + except (ValueError, RuntimeError): + raise HTTPException(status_code=400, detail="Invalid path") + + return actual_path diff --git a/backend/src/gateway/routers/artifacts.py b/backend/src/gateway/routers/artifacts.py index 9798193..a2a13a7 100644 --- a/backend/src/gateway/routers/artifacts.py +++ b/backend/src/gateway/routers/artifacts.py @@ -1,5 +1,5 @@ +import json import mimetypes -import os import re import zipfile from pathlib import Path @@ -8,49 +8,11 @@ from urllib.parse import quote from fastapi import APIRouter, HTTPException, Request, Response from fastapi.responses import FileResponse, HTMLResponse, PlainTextResponse -# Base directory for thread data (relative to backend/) -THREAD_DATA_BASE_DIR = ".deer-flow/threads" - -# Virtual path prefix used in sandbox environments (without leading slash for URL path matching) -VIRTUAL_PATH_PREFIX = "mnt/user-data" +from src.gateway.path_utils import resolve_thread_virtual_path router = APIRouter(prefix="/api", tags=["artifacts"]) -def _resolve_artifact_path(thread_id: str, artifact_path: str) -> Path: - """Resolve a virtual artifact path to the actual filesystem path. - - Args: - thread_id: The thread ID. - artifact_path: The virtual path (e.g., mnt/user-data/outputs/file.txt). - - Returns: - The resolved filesystem path. 
- - Raises: - HTTPException: If the path is invalid or outside allowed directories. - """ - # Validate and remove virtual path prefix - if not artifact_path.startswith(VIRTUAL_PATH_PREFIX): - raise HTTPException(status_code=400, detail=f"Path must start with /{VIRTUAL_PATH_PREFIX}") - relative_path = artifact_path[len(VIRTUAL_PATH_PREFIX) :].lstrip("/") - - # Build the actual path - base_dir = Path(os.getcwd()) / THREAD_DATA_BASE_DIR / thread_id / "user-data" - actual_path = base_dir / relative_path - - # Security check: ensure the path is within the thread's user-data directory - try: - actual_path = actual_path.resolve() - base_dir = base_dir.resolve() - if not str(actual_path).startswith(str(base_dir)): - raise HTTPException(status_code=403, detail="Access denied: path traversal detected") - except (ValueError, RuntimeError): - raise HTTPException(status_code=400, detail="Invalid path") - - return actual_path - - def is_text_file_by_content(path: Path, sample_size: int = 8192) -> bool: """Check if file is text by examining content for null bytes.""" try: @@ -62,66 +24,38 @@ def is_text_file_by_content(path: Path, sample_size: int = 8192) -> bool: return False -def remove_citations_block(content: str) -> str: - """Remove ALL citations from markdown content. - - Removes: - - ... blocks (complete and incomplete) - - [cite-N] references - - Citation markdown links that were converted from [cite-N] - - This is used for downloads to provide clean markdown without any citation references. - - Args: - content: The markdown content that may contain citations blocks. - - Returns: - Clean content with all citations completely removed. 
- """ - if not content: - return content - - result = content - - # Step 1: Parse and extract citation URLs before removing blocks - citation_urls = set() - citations_pattern = r'([\s\S]*?)' - for match in re.finditer(citations_pattern, content): - citations_block = match.group(1) - # Extract URLs from JSON lines - import json - for line in citations_block.split('\n'): +def _extract_citation_urls(content: str) -> set[str]: + """Extract URLs from JSONL blocks. Format must match frontend core/citations/utils.ts.""" + urls: set[str] = set() + for match in re.finditer(r"([\s\S]*?)", content): + for line in match.group(1).split("\n"): line = line.strip() - if line.startswith('{'): + if line.startswith("{"): try: - citation = json.loads(line) - if 'url' in citation: - citation_urls.add(citation['url']) + obj = json.loads(line) + if "url" in obj: + urls.add(obj["url"]) except (json.JSONDecodeError, ValueError): pass - - # Step 2: Remove complete citations blocks - result = re.sub(r'[\s\S]*?', '', result) - - # Step 3: Remove incomplete citations blocks (at end of content during streaming) + return urls + + +def remove_citations_block(content: str) -> str: + """Remove ALL citations from markdown (blocks, [cite-N], and citation links). 
Used for downloads.""" + if not content: + return content + + citation_urls = _extract_citation_urls(content) + + result = re.sub(r"[\s\S]*?", "", content) if "" in result: - result = re.sub(r'[\s\S]*$', '', result) - - # Step 4: Remove all [cite-N] references - result = re.sub(r'\[cite-\d+\]', '', result) - - # Step 5: Remove markdown links that point to citation URLs - # Pattern: [text](url) - if citation_urls: - for url in citation_urls: - # Escape special regex characters in URL - escaped_url = re.escape(url) - result = re.sub(rf'\[[^\]]+\]\({escaped_url}\)', '', result) - - # Step 6: Clean up extra whitespace and newlines - result = re.sub(r'\n{3,}', '\n\n', result) # Replace 3+ newlines with 2 - - return result.strip() + result = re.sub(r"[\s\S]*$", "", result) + result = re.sub(r"\[cite-\d+\]", "", result) + + for url in citation_urls: + result = re.sub(rf"\[[^\]]+\]\({re.escape(url)}\)", "", result) + + return re.sub(r"\n{3,}", "\n\n", result).strip() def _extract_file_from_skill_archive(zip_path: Path, internal_path: str) -> bytes | None: @@ -200,7 +134,7 @@ async def get_artifact(thread_id: str, path: str, request: Request) -> FileRespo skill_file_path = path[: marker_pos + len(".skill")] # e.g., "mnt/user-data/outputs/my-skill.skill" internal_path = path[marker_pos + len(skill_marker) :] # e.g., "SKILL.md" - actual_skill_path = _resolve_artifact_path(thread_id, skill_file_path) + actual_skill_path = resolve_thread_virtual_path(thread_id, skill_file_path) if not actual_skill_path.exists(): raise HTTPException(status_code=404, detail=f"Skill file not found: {skill_file_path}") @@ -226,7 +160,7 @@ async def get_artifact(thread_id: str, path: str, request: Request) -> FileRespo except UnicodeDecodeError: return Response(content=content, media_type=mime_type or "application/octet-stream", headers=cache_headers) - actual_path = _resolve_artifact_path(thread_id, path) + actual_path = resolve_thread_virtual_path(thread_id, path) if not actual_path.exists(): 
raise HTTPException(status_code=404, detail=f"Artifact not found: {path}") diff --git a/backend/src/gateway/routers/skills.py b/backend/src/gateway/routers/skills.py index 67bca69..11c5356 100644 --- a/backend/src/gateway/routers/skills.py +++ b/backend/src/gateway/routers/skills.py @@ -1,6 +1,5 @@ import json import logging -import os import re import shutil import tempfile @@ -12,6 +11,7 @@ from fastapi import APIRouter, HTTPException from pydantic import BaseModel, Field from src.config.extensions_config import ExtensionsConfig, SkillStateConfig, get_extensions_config, reload_extensions_config +from src.gateway.path_utils import resolve_thread_virtual_path from src.skills import Skill, load_skills from src.skills.loader import get_skills_root_path @@ -56,53 +56,10 @@ class SkillInstallResponse(BaseModel): message: str = Field(..., description="Installation result message") -# Base directory for thread data (relative to backend/) -THREAD_DATA_BASE_DIR = ".deer-flow/threads" - -# Virtual path prefix used in sandbox environments (without leading slash for URL path matching) -VIRTUAL_PATH_PREFIX = "mnt/user-data" - # Allowed properties in SKILL.md frontmatter ALLOWED_FRONTMATTER_PROPERTIES = {"name", "description", "license", "allowed-tools", "metadata"} -def _resolve_skill_file_path(thread_id: str, virtual_path: str) -> Path: - """Resolve a virtual skill file path to the actual filesystem path. - - Args: - thread_id: The thread ID. - virtual_path: The virtual path (e.g., mnt/user-data/outputs/my-skill.skill). - - Returns: - The resolved filesystem path. - - Raises: - HTTPException: If the path is invalid or outside allowed directories. 
- """ - # Remove leading slash if present - virtual_path = virtual_path.lstrip("/") - - # Validate and remove virtual path prefix - if not virtual_path.startswith(VIRTUAL_PATH_PREFIX): - raise HTTPException(status_code=400, detail=f"Path must start with /{VIRTUAL_PATH_PREFIX}") - relative_path = virtual_path[len(VIRTUAL_PATH_PREFIX) :].lstrip("/") - - # Build the actual path - base_dir = Path(os.getcwd()) / THREAD_DATA_BASE_DIR / thread_id / "user-data" - actual_path = base_dir / relative_path - - # Security check: ensure the path is within the thread's user-data directory - try: - actual_path = actual_path.resolve() - base_dir_resolved = base_dir.resolve() - if not str(actual_path).startswith(str(base_dir_resolved)): - raise HTTPException(status_code=403, detail="Access denied: path traversal detected") - except (ValueError, RuntimeError): - raise HTTPException(status_code=400, detail="Invalid path") - - return actual_path - - def _validate_skill_frontmatter(skill_dir: Path) -> tuple[bool, str, str | None]: """Validate a skill directory's SKILL.md frontmatter. @@ -414,7 +371,7 @@ async def install_skill(request: SkillInstallRequest) -> SkillInstallResponse: """ try: # Resolve the virtual path to actual file path - skill_file_path = _resolve_skill_file_path(request.thread_id, request.path) + skill_file_path = resolve_thread_virtual_path(request.thread_id, request.path) # Check if file exists if not skill_file_path.exists(): diff --git a/backend/src/subagents/builtins/general_purpose.py b/backend/src/subagents/builtins/general_purpose.py index 1ab6562..0854422 100644 --- a/backend/src/subagents/builtins/general_purpose.py +++ b/backend/src/subagents/builtins/general_purpose.py @@ -24,10 +24,21 @@ Do NOT use for simple, single-step operations.""", - Do NOT ask for clarification - work with the information provided + +If you used web_search (or similar) and cite sources, ALWAYS include citations in your output: +1. 
Start with a `<citations>` block in JSONL format listing all sources (one JSON object per line) +2. In content, use FULL markdown link format: [Short Title](full_url) +- Every citation MUST be a complete markdown link with URL: [Title](https://...) +- Example block: +<citations> +{"id": "cite-1", "title": "...", "url": "https://...", "snippet": "..."} +</citations> + + When you complete the task, provide: 1. A brief summary of what was accomplished -2. Key findings or results +2. Key findings or results (with citation links when from web search) 3. Any relevant file paths, data, or artifacts created 4. Issues encountered (if any) diff --git a/frontend/src/core/citations/utils.ts b/frontend/src/core/citations/utils.ts index d2384a4..965c2a6 100644 --- a/frontend/src/core/citations/utils.ts +++ b/frontend/src/core/citations/utils.ts @@ -187,44 +187,25 @@ export function isCitationsBlockIncomplete(content: string): boolean { /** * Remove ALL citations from content, including: * - <citations> blocks - * - [cite-N] references - * - Citation markdown links that were converted from [cite-N] - * - * This is used for copy/download operations where we want clean content without any references. + * - [cite-N] references (and their converted markdown links) + * + * Uses parseCitations once, then strips citation links from cleanContent. + * Used for copy/download to produce content without any citation references. 
* * @param content - The raw content that may contain citations * @returns Content with all citations completely removed */ export function removeAllCitations(content: string): string { - if (!content) { - return content; - } + if (!content) return content; - // Step 1: Remove all blocks (complete and incomplete) - let result = removeCitationsBlocks(content); - - // Step 2: Remove all [cite-N] references - result = result.replace(/\[cite-\d+\]/g, ""); - - // Step 3: Parse to find citation URLs and remove those specific links const parsed = parseCitations(content); - const citationUrls = new Set(parsed.citations.map(c => c.url)); - - // Remove markdown links that point to citation URLs - // Pattern: [text](url) - result = result.replace(/\[([^\]]+)\]\(([^)]+)\)/g, (match, text, url) => { - // If this URL is a citation, remove the entire link - if (citationUrls.has(url)) { - return ""; - } - // Keep non-citation links - return match; - }); + const citationUrls = new Set(parsed.citations.map((c) => c.url)); - // Step 4: Clean up extra whitespace and newlines - result = result - .replace(/\n{3,}/g, "\n\n") // Replace 3+ newlines with 2 - .trim(); + // Remove markdown links that point to citation URLs; keep non-citation links + const withoutLinks = parsed.cleanContent.replace( + /\[([^\]]+)\]\(([^)]+)\)/g, + (fullMatch, _text, url) => (citationUrls.has(url) ? "" : fullMatch), + ); - return result; + return withoutLinks.replace(/\n{3,}/g, "\n\n").trim(); }