diff --git a/README.md b/README.md index 0d2def7..b27340d 100644 --- a/README.md +++ b/README.md @@ -437,6 +437,8 @@ Most agents forget everything the moment a conversation ends. DeerFlow remembers Across sessions, DeerFlow builds a persistent memory of your profile, preferences, and accumulated knowledge. The more you use it, the better it knows you — your writing style, your technical stack, your recurring workflows. Memory is stored locally and stays under your control. +Memory updates now skip duplicate fact entries at apply time, so repeated preferences and context do not accumulate endlessly across sessions. + ## Recommended Models DeerFlow is model-agnostic — it works with any LLM that implements the OpenAI-compatible API. That said, it performs best with models that support: diff --git a/backend/CLAUDE.md b/backend/CLAUDE.md index 6c57461..22b77d2 100644 --- a/backend/CLAUDE.md +++ b/backend/CLAUDE.md @@ -312,7 +312,7 @@ Bridges external messaging platforms (Feishu, Slack, Telegram) to the DeerFlow a ### Memory System (`packages/harness/deerflow/agents/memory/`) **Components**: -- `updater.py` - LLM-based memory updates with fact extraction and atomic file I/O +- `updater.py` - LLM-based memory updates with fact extraction, whitespace-normalized fact deduplication (trims leading/trailing whitespace before comparing), and atomic file I/O - `queue.py` - Debounced update queue (per-thread deduplication, configurable wait time) - `prompt.py` - Prompt templates for memory updates @@ -325,9 +325,11 @@ Bridges external messaging platforms (Feishu, Slack, Telegram) to the DeerFlow a 1. `MemoryMiddleware` filters messages (user inputs + final AI responses) and queues conversation 2. Queue debounces (30s default), batches updates, deduplicates per-thread 3. Background thread invokes LLM to extract context updates and facts -4. Applies updates atomically (temp file + rename) with cache invalidation +4. 
Applies updates atomically (temp file + rename) with cache invalidation, skipping duplicate fact content before append 5. Next interaction injects top 15 facts + context into `<memory>` tags in system prompt +Focused regression coverage for the updater lives in `backend/tests/test_memory_updater.py`. + **Configuration** (`config.yaml` → `memory`): - `enabled` / `injection_enabled` - Master switches - `storage_path` - Path to memory.json diff --git a/backend/packages/harness/deerflow/agents/memory/updater.py b/backend/packages/harness/deerflow/agents/memory/updater.py index 19e3e20..9df62ae 100644 --- a/backend/packages/harness/deerflow/agents/memory/updater.py +++ b/backend/packages/harness/deerflow/agents/memory/updater.py @@ -173,6 +173,15 @@ def _strip_upload_mentions_from_memory(memory_data: dict[str, Any]) -> dict[str, return memory_data +def _fact_content_key(content: Any) -> str | None: + if not isinstance(content, str): + return None + stripped = content.strip() + if not stripped: + return None + return stripped + + def _save_memory_to_file(memory_data: dict[str, Any], agent_name: str | None = None) -> bool: """Save memory data to file and update cache. 
@@ -343,19 +352,35 @@ class MemoryUpdater: current_memory["facts"] = [f for f in current_memory.get("facts", []) if f.get("id") not in facts_to_remove] # Add new facts + existing_fact_keys = { + fact_key + for fact_key in ( + _fact_content_key(fact.get("content")) + for fact in current_memory.get("facts", []) + ) + if fact_key is not None + } new_facts = update_data.get("newFacts", []) for fact in new_facts: confidence = fact.get("confidence", 0.5) if confidence >= config.fact_confidence_threshold: + raw_content = fact.get("content", "") + normalized_content = raw_content.strip() + fact_key = _fact_content_key(normalized_content) + if fact_key is not None and fact_key in existing_fact_keys: + continue + fact_entry = { "id": f"fact_{uuid.uuid4().hex[:8]}", - "content": fact.get("content", ""), + "content": normalized_content, "category": fact.get("category", "context"), "confidence": confidence, "createdAt": now, "source": thread_id or "unknown", } current_memory["facts"].append(fact_entry) + if fact_key is not None: + existing_fact_keys.add(fact_key) # Enforce max facts limit if len(current_memory["facts"]) > config.max_facts: diff --git a/backend/tests/test_memory_updater.py b/backend/tests/test_memory_updater.py new file mode 100644 index 0000000..f43b73c --- /dev/null +++ b/backend/tests/test_memory_updater.py @@ -0,0 +1,137 @@ +from unittest.mock import patch + +from deerflow.agents.memory.updater import MemoryUpdater +from deerflow.config.memory_config import MemoryConfig + + +def _make_memory(facts: list[dict[str, object]] | None = None) -> dict[str, object]: + return { + "version": "1.0", + "lastUpdated": "", + "user": { + "workContext": {"summary": "", "updatedAt": ""}, + "personalContext": {"summary": "", "updatedAt": ""}, + "topOfMind": {"summary": "", "updatedAt": ""}, + }, + "history": { + "recentMonths": {"summary": "", "updatedAt": ""}, + "earlierContext": {"summary": "", "updatedAt": ""}, + "longTermBackground": {"summary": "", "updatedAt": ""}, + }, 
+ "facts": facts or [], + } + + +def _memory_config(**overrides: object) -> MemoryConfig: + config = MemoryConfig() + for key, value in overrides.items(): + setattr(config, key, value) + return config + + +def test_apply_updates_skips_existing_duplicate_and_preserves_removals() -> None: + updater = MemoryUpdater() + current_memory = _make_memory( + facts=[ + { + "id": "fact_existing", + "content": "User likes Python", + "category": "preference", + "confidence": 0.9, + "createdAt": "2026-03-18T00:00:00Z", + "source": "thread-a", + }, + { + "id": "fact_remove", + "content": "Old context to remove", + "category": "context", + "confidence": 0.8, + "createdAt": "2026-03-18T00:00:00Z", + "source": "thread-a", + }, + ] + ) + update_data = { + "factsToRemove": ["fact_remove"], + "newFacts": [ + {"content": "User likes Python", "category": "preference", "confidence": 0.95}, + ], + } + + with patch( + "deerflow.agents.memory.updater.get_memory_config", + return_value=_memory_config(max_facts=100, fact_confidence_threshold=0.7), + ): + result = updater._apply_updates(current_memory, update_data, thread_id="thread-b") + + assert [fact["content"] for fact in result["facts"]] == ["User likes Python"] + assert all(fact["id"] != "fact_remove" for fact in result["facts"]) + + +def test_apply_updates_skips_same_batch_duplicates_and_keeps_source_metadata() -> None: + updater = MemoryUpdater() + current_memory = _make_memory() + update_data = { + "newFacts": [ + {"content": "User prefers dark mode", "category": "preference", "confidence": 0.91}, + {"content": "User prefers dark mode", "category": "preference", "confidence": 0.92}, + {"content": "User works on DeerFlow", "category": "context", "confidence": 0.87}, + ], + } + + with patch( + "deerflow.agents.memory.updater.get_memory_config", + return_value=_memory_config(max_facts=100, fact_confidence_threshold=0.7), + ): + result = updater._apply_updates(current_memory, update_data, thread_id="thread-42") + + assert [fact["content"] 
for fact in result["facts"]] == [ + "User prefers dark mode", + "User works on DeerFlow", + ] + assert all(fact["id"].startswith("fact_") for fact in result["facts"]) + assert all(fact["source"] == "thread-42" for fact in result["facts"]) + + +def test_apply_updates_preserves_threshold_and_max_facts_trimming() -> None: + updater = MemoryUpdater() + current_memory = _make_memory( + facts=[ + { + "id": "fact_python", + "content": "User likes Python", + "category": "preference", + "confidence": 0.95, + "createdAt": "2026-03-18T00:00:00Z", + "source": "thread-a", + }, + { + "id": "fact_dark_mode", + "content": "User prefers dark mode", + "category": "preference", + "confidence": 0.8, + "createdAt": "2026-03-18T00:00:00Z", + "source": "thread-a", + }, + ] + ) + update_data = { + "newFacts": [ + {"content": "User prefers dark mode", "category": "preference", "confidence": 0.9}, + {"content": "User uses uv", "category": "context", "confidence": 0.85}, + {"content": "User likes noisy logs", "category": "behavior", "confidence": 0.6}, + ], + } + + with patch( + "deerflow.agents.memory.updater.get_memory_config", + return_value=_memory_config(max_facts=2, fact_confidence_threshold=0.7), + ): + result = updater._apply_updates(current_memory, update_data, thread_id="thread-9") + + assert [fact["content"] for fact in result["facts"]] == [ + "User likes Python", + "User uses uv", + ] + assert all(fact["content"] != "User likes noisy logs" for fact in result["facts"]) + assert result["facts"][1]["source"] == "thread-9"