feat: citations prompts, path_utils, and citation code cleanup

- Prompt: add citation reminders for web_search and subagent synthesis (lead_agent, general_purpose)
- Gateway: add path_utils for shared thread virtual path resolution; refactor artifacts and skills to use it
- Citations: simplify removeAllCitations (single parse); backend _extract_citation_urls and remove_citations_block cleanup

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
ruitanglin
2026-02-09 12:55:12 +08:00
parent 2b10b97bb9
commit eb5782b93b
6 changed files with 103 additions and 174 deletions

View File

@@ -267,6 +267,7 @@ The key AI trends for 2026 include enhanced reasoning capabilities and multimoda
<critical_reminders>
- **Clarification First**: ALWAYS clarify unclear/missing/ambiguous requirements BEFORE starting work - never assume or guess
- **Web search citations**: When you use web_search (or synthesize subagent results that used it), you MUST output the `<citations>` block and [Title](url) links as specified in citations_format so citations display for the user.
{subagent_reminder}- Skill First: Always load the relevant skill before starting **complex** tasks.
- Progressive Loading: Load resources incrementally as referenced in skills
- Output Files: Final deliverables must be in `/mnt/user-data/outputs`
@@ -340,6 +341,7 @@ def apply_prompt_template(subagent_enabled: bool = False) -> str:
# Add subagent reminder to critical_reminders if enabled
subagent_reminder = (
"- **Orchestrator Mode**: You are a task orchestrator - decompose complex tasks into parallel sub-tasks and launch multiple subagents simultaneously. Synthesize results, don't execute directly.\n"
"- **Citations when synthesizing**: When you synthesize subagent results that used web search or cite sources, you MUST include a consolidated `<citations>` block (JSONL format) and use [Title](url) markdown links in your response so citations display correctly.\n"
if subagent_enabled
else ""
)

View File

@@ -0,0 +1,44 @@
"""Shared path resolution for thread virtual paths (e.g. mnt/user-data/outputs/...)."""
import os
from pathlib import Path
from fastapi import HTTPException
from src.agents.middlewares.thread_data_middleware import THREAD_DATA_BASE_DIR
# Virtual path prefix used in sandbox environments (without leading slash for URL path matching)
VIRTUAL_PATH_PREFIX = "mnt/user-data"
def resolve_thread_virtual_path(thread_id: str, virtual_path: str) -> Path:
    """Resolve a virtual path to the actual filesystem path under thread user-data.

    Args:
        thread_id: The thread ID.
        virtual_path: The virtual path (e.g., mnt/user-data/outputs/file.txt).
            Leading slashes are stripped.

    Returns:
        The resolved filesystem path.

    Raises:
        HTTPException: If the path is invalid or outside allowed directories.
    """
    virtual_path = virtual_path.lstrip("/")
    if not virtual_path.startswith(VIRTUAL_PATH_PREFIX):
        raise HTTPException(status_code=400, detail=f"Path must start with /{VIRTUAL_PATH_PREFIX}")
    relative_path = virtual_path[len(VIRTUAL_PATH_PREFIX) :].lstrip("/")
    base_dir = Path(os.getcwd()) / THREAD_DATA_BASE_DIR / thread_id / "user-data"
    actual_path = base_dir / relative_path
    # Security check: resolve symlinks and ".." segments, then confirm containment.
    try:
        actual_path = actual_path.resolve()
        base_resolved = base_dir.resolve()
        # Use Path.is_relative_to rather than a string startswith() check:
        # a prefix comparison would wrongly accept sibling directories such as
        # ".../user-data-evil" reached via "../user-data-evil" traversal.
        if not actual_path.is_relative_to(base_resolved):
            raise HTTPException(status_code=403, detail="Access denied: path traversal detected")
    except (ValueError, RuntimeError) as exc:
        raise HTTPException(status_code=400, detail="Invalid path") from exc
    return actual_path

View File

@@ -1,5 +1,5 @@
import json
import mimetypes
import os
import re
import zipfile
from pathlib import Path
@@ -8,49 +8,11 @@ from urllib.parse import quote
from fastapi import APIRouter, HTTPException, Request, Response
from fastapi.responses import FileResponse, HTMLResponse, PlainTextResponse
# Base directory for thread data (relative to backend/)
THREAD_DATA_BASE_DIR = ".deer-flow/threads"
# Virtual path prefix used in sandbox environments (without leading slash for URL path matching)
VIRTUAL_PATH_PREFIX = "mnt/user-data"
from src.gateway.path_utils import resolve_thread_virtual_path
router = APIRouter(prefix="/api", tags=["artifacts"])
def _resolve_artifact_path(thread_id: str, artifact_path: str) -> Path:
    """Resolve a virtual artifact path to the actual filesystem path.

    Args:
        thread_id: The thread ID.
        artifact_path: The virtual path (e.g., mnt/user-data/outputs/file.txt).
            Leading slashes are stripped.

    Returns:
        The resolved filesystem path.

    Raises:
        HTTPException: If the path is invalid or outside allowed directories.
    """
    # Accept an optional leading slash for consistency with other resolvers.
    artifact_path = artifact_path.lstrip("/")
    # Validate and remove virtual path prefix
    if not artifact_path.startswith(VIRTUAL_PATH_PREFIX):
        raise HTTPException(status_code=400, detail=f"Path must start with /{VIRTUAL_PATH_PREFIX}")
    relative_path = artifact_path[len(VIRTUAL_PATH_PREFIX) :].lstrip("/")
    # Build the actual path under the thread's user-data directory.
    base_dir = Path(os.getcwd()) / THREAD_DATA_BASE_DIR / thread_id / "user-data"
    actual_path = base_dir / relative_path
    # Security check: resolve symlinks/".." and confirm containment.
    try:
        actual_path = actual_path.resolve()
        base_resolved = base_dir.resolve()
        # Path.is_relative_to avoids the prefix-string pitfall where e.g.
        # ".../user-data-extra" would pass a startswith(".../user-data") check.
        if not actual_path.is_relative_to(base_resolved):
            raise HTTPException(status_code=403, detail="Access denied: path traversal detected")
    except (ValueError, RuntimeError) as exc:
        raise HTTPException(status_code=400, detail="Invalid path") from exc
    return actual_path
def is_text_file_by_content(path: Path, sample_size: int = 8192) -> bool:
"""Check if file is text by examining content for null bytes."""
try:
@@ -62,66 +24,38 @@ def is_text_file_by_content(path: Path, sample_size: int = 8192) -> bool:
return False
def remove_citations_block(content: str) -> str:
"""Remove ALL citations from markdown content.
Removes:
- <citations>...</citations> blocks (complete and incomplete)
- [cite-N] references
- Citation markdown links that were converted from [cite-N]
This is used for downloads to provide clean markdown without any citation references.
Args:
content: The markdown content that may contain citations blocks.
Returns:
Clean content with all citations completely removed.
"""
if not content:
return content
result = content
# Step 1: Parse and extract citation URLs before removing blocks
citation_urls = set()
citations_pattern = r'<citations>([\s\S]*?)</citations>'
for match in re.finditer(citations_pattern, content):
citations_block = match.group(1)
# Extract URLs from JSON lines
import json
for line in citations_block.split('\n'):
def _extract_citation_urls(content: str) -> set[str]:
"""Extract URLs from <citations> JSONL blocks. Format must match frontend core/citations/utils.ts."""
urls: set[str] = set()
for match in re.finditer(r"<citations>([\s\S]*?)</citations>", content):
for line in match.group(1).split("\n"):
line = line.strip()
if line.startswith('{'):
if line.startswith("{"):
try:
citation = json.loads(line)
if 'url' in citation:
citation_urls.add(citation['url'])
obj = json.loads(line)
if "url" in obj:
urls.add(obj["url"])
except (json.JSONDecodeError, ValueError):
pass
# Step 2: Remove complete citations blocks
result = re.sub(r'<citations>[\s\S]*?</citations>', '', result)
# Step 3: Remove incomplete citations blocks (at end of content during streaming)
return urls
def remove_citations_block(content: str) -> str:
"""Remove ALL citations from markdown (blocks, [cite-N], and citation links). Used for downloads."""
if not content:
return content
citation_urls = _extract_citation_urls(content)
result = re.sub(r"<citations>[\s\S]*?</citations>", "", content)
if "<citations>" in result:
result = re.sub(r'<citations>[\s\S]*$', '', result)
# Step 4: Remove all [cite-N] references
result = re.sub(r'\[cite-\d+\]', '', result)
# Step 5: Remove markdown links that point to citation URLs
# Pattern: [text](url)
if citation_urls:
for url in citation_urls:
# Escape special regex characters in URL
escaped_url = re.escape(url)
result = re.sub(rf'\[[^\]]+\]\({escaped_url}\)', '', result)
# Step 6: Clean up extra whitespace and newlines
result = re.sub(r'\n{3,}', '\n\n', result) # Replace 3+ newlines with 2
return result.strip()
result = re.sub(r"<citations>[\s\S]*$", "", result)
result = re.sub(r"\[cite-\d+\]", "", result)
for url in citation_urls:
result = re.sub(rf"\[[^\]]+\]\({re.escape(url)}\)", "", result)
return re.sub(r"\n{3,}", "\n\n", result).strip()
def _extract_file_from_skill_archive(zip_path: Path, internal_path: str) -> bytes | None:
@@ -200,7 +134,7 @@ async def get_artifact(thread_id: str, path: str, request: Request) -> FileRespo
skill_file_path = path[: marker_pos + len(".skill")] # e.g., "mnt/user-data/outputs/my-skill.skill"
internal_path = path[marker_pos + len(skill_marker) :] # e.g., "SKILL.md"
actual_skill_path = _resolve_artifact_path(thread_id, skill_file_path)
actual_skill_path = resolve_thread_virtual_path(thread_id, skill_file_path)
if not actual_skill_path.exists():
raise HTTPException(status_code=404, detail=f"Skill file not found: {skill_file_path}")
@@ -226,7 +160,7 @@ async def get_artifact(thread_id: str, path: str, request: Request) -> FileRespo
except UnicodeDecodeError:
return Response(content=content, media_type=mime_type or "application/octet-stream", headers=cache_headers)
actual_path = _resolve_artifact_path(thread_id, path)
actual_path = resolve_thread_virtual_path(thread_id, path)
if not actual_path.exists():
raise HTTPException(status_code=404, detail=f"Artifact not found: {path}")

View File

@@ -1,6 +1,5 @@
import json
import logging
import os
import re
import shutil
import tempfile
@@ -12,6 +11,7 @@ from fastapi import APIRouter, HTTPException
from pydantic import BaseModel, Field
from src.config.extensions_config import ExtensionsConfig, SkillStateConfig, get_extensions_config, reload_extensions_config
from src.gateway.path_utils import resolve_thread_virtual_path
from src.skills import Skill, load_skills
from src.skills.loader import get_skills_root_path
@@ -56,53 +56,10 @@ class SkillInstallResponse(BaseModel):
message: str = Field(..., description="Installation result message")
# Base directory for thread data (relative to backend/)
THREAD_DATA_BASE_DIR = ".deer-flow/threads"
# Virtual path prefix used in sandbox environments (without leading slash for URL path matching)
VIRTUAL_PATH_PREFIX = "mnt/user-data"
# Allowed properties in SKILL.md frontmatter
ALLOWED_FRONTMATTER_PROPERTIES = {"name", "description", "license", "allowed-tools", "metadata"}
def _resolve_skill_file_path(thread_id: str, virtual_path: str) -> Path:
    """Resolve a virtual skill file path to the actual filesystem path.

    Args:
        thread_id: The thread ID.
        virtual_path: The virtual path (e.g., mnt/user-data/outputs/my-skill.skill).
            Leading slashes are stripped.

    Returns:
        The resolved filesystem path.

    Raises:
        HTTPException: If the path is invalid or outside allowed directories.
    """
    # Remove leading slash if present
    virtual_path = virtual_path.lstrip("/")
    # Validate and remove virtual path prefix
    if not virtual_path.startswith(VIRTUAL_PATH_PREFIX):
        raise HTTPException(status_code=400, detail=f"Path must start with /{VIRTUAL_PATH_PREFIX}")
    relative_path = virtual_path[len(VIRTUAL_PATH_PREFIX) :].lstrip("/")
    # Build the actual path under the thread's user-data directory.
    base_dir = Path(os.getcwd()) / THREAD_DATA_BASE_DIR / thread_id / "user-data"
    actual_path = base_dir / relative_path
    # Security check: resolve symlinks/".." and confirm containment.
    try:
        actual_path = actual_path.resolve()
        base_dir_resolved = base_dir.resolve()
        # Path.is_relative_to avoids the prefix-string pitfall where e.g.
        # ".../user-data-extra" would pass a startswith(".../user-data") check.
        if not actual_path.is_relative_to(base_dir_resolved):
            raise HTTPException(status_code=403, detail="Access denied: path traversal detected")
    except (ValueError, RuntimeError) as exc:
        raise HTTPException(status_code=400, detail="Invalid path") from exc
    return actual_path
def _validate_skill_frontmatter(skill_dir: Path) -> tuple[bool, str, str | None]:
"""Validate a skill directory's SKILL.md frontmatter.
@@ -414,7 +371,7 @@ async def install_skill(request: SkillInstallRequest) -> SkillInstallResponse:
"""
try:
# Resolve the virtual path to actual file path
skill_file_path = _resolve_skill_file_path(request.thread_id, request.path)
skill_file_path = resolve_thread_virtual_path(request.thread_id, request.path)
# Check if file exists
if not skill_file_path.exists():

View File

@@ -24,10 +24,21 @@ Do NOT use for simple, single-step operations.""",
- Do NOT ask for clarification - work with the information provided
</guidelines>
<citations_format>
If you used web_search (or similar) and cite sources, ALWAYS include citations in your output:
1. Start with a `<citations>` block in JSONL format listing all sources (one JSON object per line)
2. In content, use FULL markdown link format: [Short Title](full_url)
- Every citation MUST be a complete markdown link with URL: [Title](https://...)
- Example block:
<citations>
{"id": "cite-1", "title": "...", "url": "https://...", "snippet": "..."}
</citations>
</citations_format>
<output_format>
When you complete the task, provide:
1. A brief summary of what was accomplished
2. Key findings or results
2. Key findings or results (with citation links when from web search)
3. Any relevant file paths, data, or artifacts created
4. Issues encountered (if any)
</output_format>

View File

@@ -187,44 +187,25 @@ export function isCitationsBlockIncomplete(content: string): boolean {
/**
 * Remove ALL citations from content, including:
 * - <citations> blocks
 * - [cite-N] references (and their converted markdown links)
 *
 * Uses parseCitations once, then strips citation links from cleanContent.
 * Used for copy/download to produce content without any citation references.
 *
 * @param content - The raw content that may contain citations
 * @returns Content with all citations completely removed
 */
export function removeAllCitations(content: string): string {
  if (!content) return content;

  // Single parse: cleanContent has <citations> blocks and [cite-N] refs handled,
  // and parsed.citations carries the source URLs to strip below.
  const parsed = parseCitations(content);
  const citationUrls = new Set(parsed.citations.map((c) => c.url));

  // Remove markdown links that point to citation URLs; keep non-citation links.
  const withoutLinks = parsed.cleanContent.replace(
    /\[([^\]]+)\]\(([^)]+)\)/g,
    (fullMatch, _text, url) => (citationUrls.has(url) ? "" : fullMatch),
  );

  // Collapse 3+ consecutive newlines left behind by the removals.
  return withoutLinks.replace(/\n{3,}/g, "\n\n").trim();
}