diff --git a/backend/src/agents/memory/prompt.py b/backend/src/agents/memory/prompt.py
index 3982a2e..dd27476 100644
--- a/backend/src/agents/memory/prompt.py
+++ b/backend/src/agents/memory/prompt.py
@@ -1,5 +1,6 @@
"""Prompt templates for memory update and injection."""
+import re
from typing import Any
try:
@@ -108,6 +109,9 @@ Important Rules:
- For history sections, integrate new information chronologically into appropriate time period
- Preserve technical accuracy - keep exact names of technologies, companies, projects
- Focus on information useful for future interactions and personalization
+- IMPORTANT: Do NOT record file upload events in memory. Uploaded files are
+ session-specific and ephemeral — they will not be accessible in future sessions.
+ Recording upload events causes confusion in subsequent conversations.
Return ONLY valid JSON, no explanation or markdown."""
@@ -249,6 +253,16 @@ def format_conversation_for_update(messages: list[Any]) -> str:
text_parts = [p.get("text", "") for p in content if isinstance(p, dict) and "text" in p]
content = " ".join(text_parts) if text_parts else str(content)
+ # Strip uploaded_files tags from human messages to avoid persisting
+ # ephemeral file path info into long-term memory. Skip the turn entirely
+ # when nothing remains after stripping (upload-only message).
+ if role == "human":
+            content = re.sub(
+                r"<uploaded_files>[\s\S]*?</uploaded_files>\n*", "", str(content)
+            ).strip()
+ if not content:
+ continue
+
# Truncate very long messages
if len(str(content)) > 1000:
content = str(content)[:1000] + "..."
diff --git a/backend/src/agents/memory/updater.py b/backend/src/agents/memory/updater.py
index a9ad72a..b82da30 100644
--- a/backend/src/agents/memory/updater.py
+++ b/backend/src/agents/memory/updater.py
@@ -1,6 +1,7 @@
"""Memory updater for reading, writing, and updating memory data."""
import json
+import re
import uuid
from datetime import datetime
from pathlib import Path
@@ -135,6 +136,47 @@ def _load_memory_from_file(agent_name: str | None = None) -> dict[str, Any]:
return _create_empty_memory()
+# Matches sentences that describe a file-upload *event* rather than general
+# file-related work. Deliberately narrow to avoid removing legitimate facts
+# such as "User works with CSV files" or "prefers PDF export".
+_UPLOAD_SENTENCE_RE = re.compile(
+ r"[^.!?]*\b(?:"
+ r"upload(?:ed|ing)?(?:\s+\w+){0,3}\s+(?:file|files?|document|documents?|attachment|attachments?)"
+ r"|file\s+upload"
+ r"|/mnt/user-data/uploads/"
+    r"|<uploaded_files>"
+ r")[^.!?]*[.!?]?\s*",
+ re.IGNORECASE,
+)
+
+
+def _strip_upload_mentions_from_memory(memory_data: dict[str, Any]) -> dict[str, Any]:
+ """Remove sentences about file uploads from all memory summaries and facts.
+
+ Uploaded files are session-scoped; persisting upload events in long-term
+ memory causes the agent to search for non-existent files in future sessions.
+ """
+ # Scrub summaries in user/history sections
+ for section in ("user", "history"):
+ section_data = memory_data.get(section, {})
+ for _key, val in section_data.items():
+ if isinstance(val, dict) and "summary" in val:
+ cleaned = _UPLOAD_SENTENCE_RE.sub("", val["summary"]).strip()
+ cleaned = re.sub(r" +", " ", cleaned)
+ val["summary"] = cleaned
+
+ # Also remove any facts that describe upload events
+ facts = memory_data.get("facts", [])
+ if facts:
+ memory_data["facts"] = [
+ f
+ for f in facts
+ if not _UPLOAD_SENTENCE_RE.search(f.get("content", ""))
+ ]
+
+ return memory_data
+
+
def _save_memory_to_file(memory_data: dict[str, Any], agent_name: str | None = None) -> bool:
"""Save memory data to file and update cache.
@@ -244,6 +286,12 @@ class MemoryUpdater:
# Apply updates
updated_memory = self._apply_updates(current_memory, update_data, thread_id)
+ # Strip file-upload mentions from all summaries before saving.
+ # Uploaded files are session-scoped and won't exist in future sessions,
+ # so recording upload events in long-term memory causes the agent to
+ # try (and fail) to locate those files in subsequent conversations.
+ updated_memory = _strip_upload_mentions_from_memory(updated_memory)
+
# Save
return _save_memory_to_file(updated_memory, agent_name)
diff --git a/backend/src/agents/middlewares/memory_middleware.py b/backend/src/agents/middlewares/memory_middleware.py
index c7c74ea..8c88650 100644
--- a/backend/src/agents/middlewares/memory_middleware.py
+++ b/backend/src/agents/middlewares/memory_middleware.py
@@ -1,5 +1,6 @@
"""Middleware for memory mechanism."""
+import re
from typing import Any, override
from langchain.agents import AgentState
@@ -22,10 +23,16 @@ def _filter_messages_for_memory(messages: list[Any]) -> list[Any]:
This filters out:
- Tool messages (intermediate tool call results)
- AI messages with tool_calls (intermediate steps, not final responses)
+    - The <uploaded_files> block injected by UploadsMiddleware into human messages
+ (file paths are session-scoped and must not persist in long-term memory).
+ The user's actual question is preserved; only turns whose content is entirely
+ the upload block (nothing remains after stripping) are dropped along with
+ their paired assistant response.
Only keeps:
- - Human messages (user input)
- - AI messages without tool_calls (final assistant responses)
+ - Human messages (with the ephemeral upload block removed)
+ - AI messages without tool_calls (final assistant responses), unless the
+ paired human turn was upload-only and had no real user text.
Args:
messages: List of all conversation messages.
@@ -33,17 +40,47 @@ def _filter_messages_for_memory(messages: list[Any]) -> list[Any]:
Returns:
Filtered list containing only user inputs and final assistant responses.
"""
+    _UPLOAD_BLOCK_RE = re.compile(
+        r"<uploaded_files>[\s\S]*?</uploaded_files>\n*", re.IGNORECASE
+    )
+
filtered = []
+ skip_next_ai = False
for msg in messages:
msg_type = getattr(msg, "type", None)
if msg_type == "human":
- # Always keep user messages
- filtered.append(msg)
+ content = getattr(msg, "content", "")
+ if isinstance(content, list):
+ content = " ".join(
+ p.get("text", "") for p in content if isinstance(p, dict)
+ )
+ content_str = str(content)
+            if "<uploaded_files>" in content_str:
+ # Strip the ephemeral upload block; keep the user's real question.
+ stripped = _UPLOAD_BLOCK_RE.sub("", content_str).strip()
+ if not stripped:
+ # Nothing left — the entire turn was upload bookkeeping;
+ # skip it and the paired assistant response.
+ skip_next_ai = True
+ continue
+ # Rebuild the message with cleaned content so the user's question
+ # is still available for memory summarisation.
+ from copy import copy
+
+ clean_msg = copy(msg)
+ clean_msg.content = stripped
+ filtered.append(clean_msg)
+ skip_next_ai = False
+ else:
+ filtered.append(msg)
+ skip_next_ai = False
elif msg_type == "ai":
- # Only keep AI messages that are final responses (no tool_calls)
tool_calls = getattr(msg, "tool_calls", None)
if not tool_calls:
+ if skip_next_ai:
+ skip_next_ai = False
+ continue
filtered.append(msg)
# Skip tool messages and AI messages with tool_calls
diff --git a/backend/tests/test_memory_upload_filtering.py b/backend/tests/test_memory_upload_filtering.py
new file mode 100644
index 0000000..e100842
--- /dev/null
+++ b/backend/tests/test_memory_upload_filtering.py
@@ -0,0 +1,232 @@
+"""Tests for upload-event filtering in the memory pipeline.
+
+Covers two functions introduced to prevent ephemeral file-upload context from
+persisting in long-term memory:
+
+ - _filter_messages_for_memory (memory_middleware)
+ - _strip_upload_mentions_from_memory (updater)
+"""
+
+from langchain_core.messages import AIMessage, HumanMessage, ToolMessage
+
+from src.agents.memory.updater import _strip_upload_mentions_from_memory
+from src.agents.middlewares.memory_middleware import _filter_messages_for_memory
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+_UPLOAD_BLOCK = (
+    "<uploaded_files>\n"
+ "The following files have been uploaded and are available for use:\n\n"
+ "- filename: secret.txt\n"
+ " path: /mnt/user-data/uploads/abc123/secret.txt\n"
+ " size: 42 bytes\n"
+    "</uploaded_files>"
+)
+
+
+def _human(text: str) -> HumanMessage:
+ return HumanMessage(content=text)
+
+
+def _ai(text: str, tool_calls=None) -> AIMessage:
+ msg = AIMessage(content=text)
+ if tool_calls:
+ msg.tool_calls = tool_calls
+ return msg
+
+
+# ===========================================================================
+# _filter_messages_for_memory
+# ===========================================================================
+
+
+class TestFilterMessagesForMemory:
+ # --- upload-only turns are excluded ---
+
+ def test_upload_only_turn_is_excluded(self):
+        """A human turn containing only <uploaded_files> (no real question)
+ and its paired AI response must both be dropped."""
+ msgs = [
+ _human(_UPLOAD_BLOCK),
+ _ai("I have read the file. It says: Hello."),
+ ]
+ result = _filter_messages_for_memory(msgs)
+ assert result == []
+
+ def test_upload_with_real_question_preserves_question(self):
+ """When the user asks a question alongside an upload, the question text
+ must reach the memory queue (upload block stripped, AI response kept)."""
+ combined = _UPLOAD_BLOCK + "\n\nWhat does this file contain?"
+ msgs = [
+ _human(combined),
+ _ai("The file contains: Hello DeerFlow."),
+ ]
+ result = _filter_messages_for_memory(msgs)
+
+ assert len(result) == 2
+ human_result = result[0]
+        assert "<uploaded_files>" not in human_result.content
+ assert "What does this file contain?" in human_result.content
+ assert result[1].content == "The file contains: Hello DeerFlow."
+
+ # --- non-upload turns pass through unchanged ---
+
+ def test_plain_conversation_passes_through(self):
+ msgs = [
+ _human("What is the capital of France?"),
+ _ai("The capital of France is Paris."),
+ ]
+ result = _filter_messages_for_memory(msgs)
+ assert len(result) == 2
+ assert result[0].content == "What is the capital of France?"
+ assert result[1].content == "The capital of France is Paris."
+
+ def test_tool_messages_are_excluded(self):
+ """Intermediate tool messages must never reach memory."""
+ msgs = [
+ _human("Search for something"),
+ _ai("Calling search tool", tool_calls=[{"name": "search", "id": "1", "args": {}}]),
+ ToolMessage(content="Search results", tool_call_id="1"),
+ _ai("Here are the results."),
+ ]
+ result = _filter_messages_for_memory(msgs)
+ human_msgs = [m for m in result if m.type == "human"]
+ ai_msgs = [m for m in result if m.type == "ai"]
+ assert len(human_msgs) == 1
+ assert len(ai_msgs) == 1
+ assert ai_msgs[0].content == "Here are the results."
+
+ def test_multi_turn_with_upload_in_middle(self):
+ """Only the upload turn is dropped; surrounding non-upload turns survive."""
+ msgs = [
+ _human("Hello, how are you?"),
+ _ai("I'm doing well, thank you!"),
+ _human(_UPLOAD_BLOCK), # upload-only → dropped
+ _ai("I read the uploaded file."), # paired AI → dropped
+ _human("What is 2 + 2?"),
+ _ai("4"),
+ ]
+ result = _filter_messages_for_memory(msgs)
+ human_contents = [m.content for m in result if m.type == "human"]
+ ai_contents = [m.content for m in result if m.type == "ai"]
+
+ assert "Hello, how are you?" in human_contents
+ assert "What is 2 + 2?" in human_contents
+ assert _UPLOAD_BLOCK not in human_contents
+ assert "I'm doing well, thank you!" in ai_contents
+ assert "4" in ai_contents
+ # The upload-paired AI response must NOT appear
+ assert "I read the uploaded file." not in ai_contents
+
+ def test_multimodal_content_list_handled(self):
+ """Human messages with list-style content (multimodal) are handled."""
+ msg = HumanMessage(content=[
+ {"type": "text", "text": _UPLOAD_BLOCK},
+ ])
+ msgs = [msg, _ai("Done.")]
+ result = _filter_messages_for_memory(msgs)
+ assert result == []
+
+ def test_file_path_not_in_filtered_content(self):
+ """After filtering, no upload file path should appear in any message."""
+ combined = _UPLOAD_BLOCK + "\n\nSummarise the file please."
+ msgs = [_human(combined), _ai("It says hello.")]
+ result = _filter_messages_for_memory(msgs)
+ all_content = " ".join(
+ m.content for m in result if isinstance(m.content, str)
+ )
+ assert "/mnt/user-data/uploads/" not in all_content
+        assert "<uploaded_files>" not in all_content
+
+
+# ===========================================================================
+# _strip_upload_mentions_from_memory
+# ===========================================================================
+
+
+class TestStripUploadMentionsFromMemory:
+ def _make_memory(self, summary: str, facts: list[dict] | None = None) -> dict:
+ return {
+ "user": {"topOfMind": {"summary": summary}},
+ "history": {"recentMonths": {"summary": ""}},
+ "facts": facts or [],
+ }
+
+ # --- summaries ---
+
+ def test_upload_event_sentence_removed_from_summary(self):
+ mem = self._make_memory(
+ "User is interested in AI. "
+ "User uploaded a test file for verification purposes. "
+ "User prefers concise answers."
+ )
+ result = _strip_upload_mentions_from_memory(mem)
+ summary = result["user"]["topOfMind"]["summary"]
+ assert "uploaded a test file" not in summary
+ assert "User is interested in AI" in summary
+ assert "User prefers concise answers" in summary
+
+ def test_upload_path_sentence_removed_from_summary(self):
+ mem = self._make_memory(
+ "User uses Python. "
+ "User uploaded file to /mnt/user-data/uploads/tid/data.csv. "
+ "User likes clean code."
+ )
+ result = _strip_upload_mentions_from_memory(mem)
+ summary = result["user"]["topOfMind"]["summary"]
+ assert "/mnt/user-data/uploads/" not in summary
+ assert "User uses Python" in summary
+
+ def test_legitimate_csv_mention_is_preserved(self):
+ """'User works with CSV files' must NOT be deleted — it's not an upload event."""
+ mem = self._make_memory("User regularly works with CSV files for data analysis.")
+ result = _strip_upload_mentions_from_memory(mem)
+ assert "CSV files" in result["user"]["topOfMind"]["summary"]
+
+ def test_pdf_export_preference_preserved(self):
+ """'Prefers PDF export' is a legitimate preference, not an upload event."""
+ mem = self._make_memory("User prefers PDF export for reports.")
+ result = _strip_upload_mentions_from_memory(mem)
+ assert "PDF export" in result["user"]["topOfMind"]["summary"]
+
+ def test_uploading_a_test_file_removed(self):
+ """'uploading a test file' (with intervening words) must be caught."""
+ mem = self._make_memory(
+ "User conducted a hands-on test by uploading a test file titled "
+ "'test_deerflow_memory_bug.txt'. User is also learning Python."
+ )
+ result = _strip_upload_mentions_from_memory(mem)
+ summary = result["user"]["topOfMind"]["summary"]
+ assert "test_deerflow_memory_bug.txt" not in summary
+ assert "uploading a test file" not in summary
+
+ # --- facts ---
+
+ def test_upload_fact_removed_from_facts(self):
+ facts = [
+ {"content": "User uploaded a file titled secret.txt", "category": "behavior"},
+ {"content": "User prefers dark mode", "category": "preference"},
+ {"content": "User is uploading document attachments regularly", "category": "behavior"},
+ ]
+ mem = self._make_memory("summary", facts=facts)
+ result = _strip_upload_mentions_from_memory(mem)
+ remaining = [f["content"] for f in result["facts"]]
+ assert "User prefers dark mode" in remaining
+ assert not any("uploaded a file" in c for c in remaining)
+ assert not any("uploading document" in c for c in remaining)
+
+ def test_non_upload_facts_preserved(self):
+ facts = [
+ {"content": "User graduated from Peking University", "category": "context"},
+ {"content": "User prefers Python over JavaScript", "category": "preference"},
+ ]
+ mem = self._make_memory("", facts=facts)
+ result = _strip_upload_mentions_from_memory(mem)
+ assert len(result["facts"]) == 2
+
+ def test_empty_memory_handled_gracefully(self):
+ mem = {"user": {}, "history": {}, "facts": []}
+ result = _strip_upload_mentions_from_memory(mem)
+ assert result == {"user": {}, "history": {}, "facts": []}