fix(harness): skip duplicate memory facts (#1193)

* fix(harness): skip duplicate memory facts

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>

* docs: note memory fact deduplication

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>

* Apply suggestions from code review

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
Ryanba
2026-03-18 22:41:13 +08:00
committed by GitHub
parent 423ea59491
commit f67c3d2c9e
4 changed files with 169 additions and 3 deletions

View File

@@ -312,7 +312,7 @@ Bridges external messaging platforms (Feishu, Slack, Telegram) to the DeerFlow a
### Memory System (`packages/harness/deerflow/agents/memory/`)
**Components**:
- `updater.py` - LLM-based memory updates with fact extraction and atomic file I/O
- `updater.py` - LLM-based memory updates with fact extraction, whitespace-normalized fact deduplication (trims leading/trailing whitespace before comparing), and atomic file I/O
- `queue.py` - Debounced update queue (per-thread deduplication, configurable wait time)
- `prompt.py` - Prompt templates for memory updates
@@ -325,9 +325,11 @@ Bridges external messaging platforms (Feishu, Slack, Telegram) to the DeerFlow a
1. `MemoryMiddleware` filters messages (user inputs + final AI responses) and queues conversation
2. Queue debounces (30s default), batches updates, deduplicates per-thread
3. Background thread invokes LLM to extract context updates and facts
4. Applies updates atomically (temp file + rename) with cache invalidation
4. Applies updates atomically (temp file + rename) with cache invalidation, skipping duplicate fact content before append
5. Next interaction injects top 15 facts + context into `<memory>` tags in system prompt
Focused regression coverage for the updater lives in `backend/tests/test_memory_updater.py`.
**Configuration** (`config.yaml``memory`):
- `enabled` / `injection_enabled` - Master switches
- `storage_path` - Path to memory.json

View File

@@ -173,6 +173,15 @@ def _strip_upload_mentions_from_memory(memory_data: dict[str, Any]) -> dict[str,
return memory_data
def _fact_content_key(content: Any) -> str | None:
if not isinstance(content, str):
return None
stripped = content.strip()
if not stripped:
return None
return stripped
def _save_memory_to_file(memory_data: dict[str, Any], agent_name: str | None = None) -> bool:
"""Save memory data to file and update cache.
@@ -343,19 +352,35 @@ class MemoryUpdater:
current_memory["facts"] = [f for f in current_memory.get("facts", []) if f.get("id") not in facts_to_remove]
# Add new facts
existing_fact_keys = {
fact_key
for fact_key in (
_fact_content_key(fact.get("content"))
for fact in current_memory.get("facts", [])
)
if fact_key is not None
}
new_facts = update_data.get("newFacts", [])
for fact in new_facts:
confidence = fact.get("confidence", 0.5)
if confidence >= config.fact_confidence_threshold:
raw_content = fact.get("content", "")
normalized_content = raw_content.strip()
fact_key = _fact_content_key(normalized_content)
if fact_key is not None and fact_key in existing_fact_keys:
continue
fact_entry = {
"id": f"fact_{uuid.uuid4().hex[:8]}",
"content": fact.get("content", ""),
"content": normalized_content,
"category": fact.get("category", "context"),
"confidence": confidence,
"createdAt": now,
"source": thread_id or "unknown",
}
current_memory["facts"].append(fact_entry)
if fact_key is not None:
existing_fact_keys.add(fact_key)
# Enforce max facts limit
if len(current_memory["facts"]) > config.max_facts:

View File

@@ -0,0 +1,137 @@
from unittest.mock import patch
from deerflow.agents.memory.updater import MemoryUpdater
from deerflow.config.memory_config import MemoryConfig
def _make_memory(facts: list[dict[str, object]] | None = None) -> dict[str, object]:
return {
"version": "1.0",
"lastUpdated": "",
"user": {
"workContext": {"summary": "", "updatedAt": ""},
"personalContext": {"summary": "", "updatedAt": ""},
"topOfMind": {"summary": "", "updatedAt": ""},
},
"history": {
"recentMonths": {"summary": "", "updatedAt": ""},
"earlierContext": {"summary": "", "updatedAt": ""},
"longTermBackground": {"summary": "", "updatedAt": ""},
},
"facts": facts or [],
}
def _memory_config(**overrides: object) -> MemoryConfig:
config = MemoryConfig()
for key, value in overrides.items():
setattr(config, key, value)
return config
def test_apply_updates_skips_existing_duplicate_and_preserves_removals() -> None:
updater = MemoryUpdater()
current_memory = _make_memory(
facts=[
{
"id": "fact_existing",
"content": "User likes Python",
"category": "preference",
"confidence": 0.9,
"createdAt": "2026-03-18T00:00:00Z",
"source": "thread-a",
},
{
"id": "fact_remove",
"content": "Old context to remove",
"category": "context",
"confidence": 0.8,
"createdAt": "2026-03-18T00:00:00Z",
"source": "thread-a",
},
]
)
update_data = {
"factsToRemove": ["fact_remove"],
"newFacts": [
{"content": "User likes Python", "category": "preference", "confidence": 0.95},
],
}
with patch(
"deerflow.agents.memory.updater.get_memory_config",
return_value=_memory_config(max_facts=100, fact_confidence_threshold=0.7),
):
result = updater._apply_updates(current_memory, update_data, thread_id="thread-b")
assert [fact["content"] for fact in result["facts"]] == ["User likes Python"]
assert all(fact["id"] != "fact_remove" for fact in result["facts"])
def test_apply_updates_skips_same_batch_duplicates_and_keeps_source_metadata() -> None:
updater = MemoryUpdater()
current_memory = _make_memory()
update_data = {
"newFacts": [
{"content": "User prefers dark mode", "category": "preference", "confidence": 0.91},
{"content": "User prefers dark mode", "category": "preference", "confidence": 0.92},
{"content": "User works on DeerFlow", "category": "context", "confidence": 0.87},
],
}
with patch(
"deerflow.agents.memory.updater.get_memory_config",
return_value=_memory_config(max_facts=100, fact_confidence_threshold=0.7),
):
result = updater._apply_updates(current_memory, update_data, thread_id="thread-42")
assert [fact["content"] for fact in result["facts"]] == [
"User prefers dark mode",
"User works on DeerFlow",
]
assert all(fact["id"].startswith("fact_") for fact in result["facts"])
assert all(fact["source"] == "thread-42" for fact in result["facts"])
def test_apply_updates_preserves_threshold_and_max_facts_trimming() -> None:
updater = MemoryUpdater()
current_memory = _make_memory(
facts=[
{
"id": "fact_python",
"content": "User likes Python",
"category": "preference",
"confidence": 0.95,
"createdAt": "2026-03-18T00:00:00Z",
"source": "thread-a",
},
{
"id": "fact_dark_mode",
"content": "User prefers dark mode",
"category": "preference",
"confidence": 0.8,
"createdAt": "2026-03-18T00:00:00Z",
"source": "thread-a",
},
]
)
update_data = {
"newFacts": [
{"content": "User prefers dark mode", "category": "preference", "confidence": 0.9},
{"content": "User uses uv", "category": "context", "confidence": 0.85},
{"content": "User likes noisy logs", "category": "behavior", "confidence": 0.6},
],
}
with patch(
"deerflow.agents.memory.updater.get_memory_config",
return_value=_memory_config(max_facts=2, fact_confidence_threshold=0.7),
):
result = updater._apply_updates(current_memory, update_data, thread_id="thread-9")
assert [fact["content"] for fact in result["facts"]] == [
"User likes Python",
"User uses uv",
]
assert all(fact["content"] != "User likes noisy logs" for fact in result["facts"])
assert result["facts"][1]["source"] == "thread-9"