Files
deer-flow/tests/unit/citations/test_collector.py
Xun 9a34e32252 chore: Improved citation system (#834)
* improve: Improved citation system

* fix

---------

Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
2026-01-25 15:49:45 +08:00

290 lines
9.3 KiB
Python

# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
"""
Unit tests for CitationCollector optimization with reverse index cache.
Tests the O(1) URL lookup performance optimization via _url_to_index cache.
"""
from src.citations.collector import CitationCollector
class TestCitationCollectorOptimization:
    """Exercise the reverse-index cache (`_url_to_index`) of CitationCollector.

    The cache maps each citation URL to its zero-based position in
    `_citation_order`, so URL-to-citation-number lookups become O(1)
    dictionary hits instead of O(n) list scans.
    """

    def test_url_to_index_cache_initialization(self):
        """A freshly constructed collector exposes an empty _url_to_index dict."""
        coll = CitationCollector()
        assert hasattr(coll, "_url_to_index")
        cache = coll._url_to_index
        assert isinstance(cache, dict)
        assert len(cache) == 0

    def test_add_single_citation_updates_cache(self):
        """A single search result registers its URL in the cache at index 0."""
        coll = CitationCollector()
        entry = {
            "url": "https://example.com",
            "title": "Example",
            "content": "Content",
            "score": 0.9,
        }
        coll.add_from_search_results([entry])
        # The URL must be indexed, and it is the first (0th) citation.
        assert "https://example.com" in coll._url_to_index
        assert coll._url_to_index["https://example.com"] == 0

    def test_add_multiple_citations_updates_cache_correctly(self):
        """Five distinct URLs get cache indices matching their insertion order."""
        coll = CitationCollector()
        batch = []
        for i in range(5):
            batch.append(
                {
                    "url": f"https://example.com/{i}",
                    "title": f"Page {i}",
                    "content": f"Content {i}",
                    "score": 0.9,
                }
            )
        coll.add_from_search_results(batch)
        # Every URL is present, indexed by its insertion position.
        assert len(coll._url_to_index) == 5
        for i in range(5):
            assert coll._url_to_index[f"https://example.com/{i}"] == i

    def test_get_number_uses_cache_for_o1_lookup(self):
        """get_number resolves URLs at arbitrary positions; unknown URLs give None."""
        coll = CitationCollector()
        coll.add_from_search_results(
            [
                {
                    "url": f"https://example.com/{i}",
                    "title": f"Title {i}",
                    "content": f"Content {i}",
                    "score": 0.9,
                }
                for i in range(100)
            ]
        )
        # Citation numbers are 1-based: position i maps to number i + 1.
        for position, number in ((0, 1), (50, 51), (99, 100)):
            assert coll.get_number(f"https://example.com/{position}") == number
        # A URL that was never added yields None rather than raising.
        assert coll.get_number("https://nonexistent.com") is None

    def test_add_from_crawl_result_updates_cache(self):
        """Citations added via add_from_crawl_result are indexed like search results."""
        coll = CitationCollector()
        coll.add_from_crawl_result(
            url="https://crawled.com/page",
            title="Crawled Page",
            content="Crawled content",
        )
        assert "https://crawled.com/page" in coll._url_to_index
        assert coll._url_to_index["https://crawled.com/page"] == 0

    def test_duplicate_url_does_not_change_cache(self):
        """Re-adding a URL keeps its cache index but refreshes its metadata."""
        coll = CitationCollector()
        first = {
            "url": "https://example.com",
            "title": "Title 1",
            "content": "Content 1",
            "score": 0.8,
        }
        coll.add_from_search_results([first])
        assert coll._url_to_index["https://example.com"] == 0
        # Submit the same URL again, this time with a higher score.
        second = {
            "url": "https://example.com",
            "title": "Title 1 Updated",
            "content": "Content 1 Updated",
            "score": 0.95,
        }
        coll.add_from_search_results([second])
        # The index is stable across duplicates...
        assert coll._url_to_index["https://example.com"] == 0
        # ...while the stored citation picks up the improved score.
        assert coll._citations["https://example.com"].relevance_score == 0.95

    def test_merge_with_updates_cache_correctly(self):
        """merge_with appends the other collector's URLs and indexes them."""
        left = CitationCollector()
        right = CitationCollector()
        left.add_from_search_results(
            [
                {
                    "url": "https://a.com",
                    "title": "A",
                    "content": "Content A",
                    "score": 0.9,
                }
            ]
        )
        right.add_from_search_results(
            [
                {
                    "url": "https://b.com",
                    "title": "B",
                    "content": "Content B",
                    "score": 0.9,
                }
            ]
        )
        left.merge_with(right)
        # After merging, both URLs live in left's cache in merge order.
        assert "https://a.com" in left._url_to_index
        assert "https://b.com" in left._url_to_index
        assert left._url_to_index["https://a.com"] == 0
        assert left._url_to_index["https://b.com"] == 1

    def test_from_dict_rebuilds_cache(self):
        """A round-trip through to_dict/from_dict reconstructs the cache exactly."""
        source = CitationCollector()
        source.add_from_search_results(
            [
                {
                    "url": f"https://example.com/{i}",
                    "title": f"Page {i}",
                    "content": f"Content {i}",
                    "score": 0.9,
                }
                for i in range(3)
            ]
        )
        restored = CitationCollector.from_dict(source.to_dict())
        assert len(restored._url_to_index) == 3
        for i in range(3):
            key = f"https://example.com/{i}"
            assert key in restored._url_to_index
            assert restored._url_to_index[key] == i

    def test_clear_resets_cache(self):
        """clear() empties the cache alongside the citation and order stores."""
        coll = CitationCollector()
        coll.add_from_search_results(
            [
                {
                    "url": "https://example.com",
                    "title": "Example",
                    "content": "Content",
                    "score": 0.9,
                }
            ]
        )
        assert len(coll._url_to_index) > 0
        coll.clear()
        # All three internal structures must be reset together.
        assert len(coll._url_to_index) == 0
        assert len(coll._citations) == 0
        assert len(coll._citation_order) == 0

    def test_cache_consistency_with_order_list(self):
        """Every cache index equals the URL's position in _citation_order."""
        coll = CitationCollector()
        coll.add_from_search_results(
            [
                {
                    "url": f"https://example.com/{i}",
                    "title": f"Title {i}",
                    "content": f"Content {i}",
                    "score": 0.9,
                }
                for i in range(10)
            ]
        )
        for position, url in enumerate(coll._citation_order):
            assert coll._url_to_index[url] == position

    def test_mark_used_with_cache(self):
        """mark_used returns the 1-based number and records the URL as used."""
        coll = CitationCollector()
        coll.add_from_search_results(
            [
                {
                    "url": "https://example.com/1",
                    "title": "Page 1",
                    "content": "Content 1",
                    "score": 0.9,
                },
                {
                    "url": "https://example.com/2",
                    "title": "Page 2",
                    "content": "Content 2",
                    "score": 0.9,
                },
            ]
        )
        assert coll.mark_used("https://example.com/2") == 2
        assert "https://example.com/2" in coll._used_citations

    def test_large_collection_cache_performance(self):
        """The cache stays consistent at 1000 entries, at every probed position."""
        coll = CitationCollector()
        total = 1000
        coll.add_from_search_results(
            [
                {
                    "url": f"https://example.com/{i}",
                    "title": f"Title {i}",
                    "content": f"Content {i}",
                    "score": 0.9,
                }
                for i in range(total)
            ]
        )
        assert len(coll._url_to_index) == total
        # Probe the start, middle, and end of the collection.
        for position in (0, 100, 500, 999):
            assert coll.get_number(f"https://example.com/{position}") == position + 1