mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-21 21:24:46 +08:00
chore: Improved citation system (#834)
* improve: Improved citation system * fix --------- Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
This commit is contained in:
289
tests/unit/citations/test_collector.py
Normal file
289
tests/unit/citations/test_collector.py
Normal file
@@ -0,0 +1,289 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

"""
Unit tests for CitationCollector optimization with reverse index cache.

Tests the O(1) URL lookup performance optimization via _url_to_index cache.
"""

from src.citations.collector import CitationCollector


class TestCitationCollectorOptimization:
    """Exercises the reverse-index (_url_to_index) cache of CitationCollector.

    The cache maps each citation URL to its 0-based position in
    _citation_order, giving O(1) lookups in get_number/mark_used.
    """

    @staticmethod
    def _result(url, title, content, score=0.9):
        """Build one search-result dict in the shape the collector expects."""
        return {"url": url, "title": title, "content": content, "score": score}

    def test_url_to_index_cache_initialization(self):
        """A fresh collector starts with an empty _url_to_index dict."""
        fresh = CitationCollector()
        assert hasattr(fresh, "_url_to_index")
        assert isinstance(fresh._url_to_index, dict)
        assert not fresh._url_to_index

    def test_add_single_citation_updates_cache(self):
        """Adding one search result records its URL at index 0."""
        collector = CitationCollector()
        collector.add_from_search_results(
            [self._result("https://example.com", "Example", "Content")]
        )
        assert "https://example.com" in collector._url_to_index
        assert collector._url_to_index["https://example.com"] == 0

    def test_add_multiple_citations_updates_cache_correctly(self):
        """Each URL in a batch is indexed by its insertion position."""
        collector = CitationCollector()
        batch = [
            self._result(f"https://example.com/{i}", f"Page {i}", f"Content {i}")
            for i in range(5)
        ]
        collector.add_from_search_results(batch)

        assert len(collector._url_to_index) == 5
        for position in range(5):
            url = f"https://example.com/{position}"
            assert collector._url_to_index[url] == position

    def test_get_number_uses_cache_for_o1_lookup(self):
        """get_number resolves any stored URL; unknown URLs yield None."""
        collector = CitationCollector()
        batch = [
            self._result(f"https://example.com/{i}", f"Title {i}", f"Content {i}")
            for i in range(100)
        ]
        collector.add_from_search_results(batch)

        # Citation numbers are 1-based relative to insertion order.
        for position in (0, 50, 99):
            url = f"https://example.com/{position}"
            assert collector.get_number(url) == position + 1

        assert collector.get_number("https://nonexistent.com") is None

    def test_add_from_crawl_result_updates_cache(self):
        """add_from_crawl_result populates the cache like a search result."""
        collector = CitationCollector()
        collector.add_from_crawl_result(
            url="https://crawled.com/page",
            title="Crawled Page",
            content="Crawled content",
        )

        assert "https://crawled.com/page" in collector._url_to_index
        assert collector._url_to_index["https://crawled.com/page"] == 0

    def test_duplicate_url_does_not_change_cache(self):
        """Re-adding a URL keeps its index while refreshing its metadata."""
        collector = CitationCollector()
        collector.add_from_search_results(
            [self._result("https://example.com", "Title 1", "Content 1", score=0.8)]
        )
        assert collector._url_to_index["https://example.com"] == 0

        # Same URL again, higher score: index must stay stable but the
        # stored citation should pick up the better relevance score.
        collector.add_from_search_results(
            [
                self._result(
                    "https://example.com",
                    "Title 1 Updated",
                    "Content 1 Updated",
                    score=0.95,
                )
            ]
        )
        assert collector._url_to_index["https://example.com"] == 0
        assert collector._citations["https://example.com"].relevance_score == 0.95

    def test_merge_with_updates_cache_correctly(self):
        """merge_with appends the other collector's URLs to the cache."""
        left = CitationCollector()
        right = CitationCollector()
        left.add_from_search_results(
            [self._result("https://a.com", "A", "Content A")]
        )
        right.add_from_search_results(
            [self._result("https://b.com", "B", "Content B")]
        )

        left.merge_with(right)

        assert "https://a.com" in left._url_to_index
        assert "https://b.com" in left._url_to_index
        assert left._url_to_index["https://a.com"] == 0
        assert left._url_to_index["https://b.com"] == 1

    def test_from_dict_rebuilds_cache(self):
        """A round-trip through to_dict/from_dict reconstructs the cache."""
        source = CitationCollector()
        source.add_from_search_results(
            [
                self._result(f"https://example.com/{i}", f"Page {i}", f"Content {i}")
                for i in range(3)
            ]
        )

        restored = CitationCollector.from_dict(source.to_dict())

        assert len(restored._url_to_index) == 3
        for position in range(3):
            url = f"https://example.com/{position}"
            assert url in restored._url_to_index
            assert restored._url_to_index[url] == position

    def test_clear_resets_cache(self):
        """clear() empties the cache along with the other citation state."""
        collector = CitationCollector()
        collector.add_from_search_results(
            [self._result("https://example.com", "Example", "Content")]
        )
        assert len(collector._url_to_index) > 0

        collector.clear()

        assert len(collector._url_to_index) == 0
        assert len(collector._citations) == 0
        assert len(collector._citation_order) == 0

    def test_cache_consistency_with_order_list(self):
        """Cached indices always agree with positions in _citation_order."""
        collector = CitationCollector()
        collector.add_from_search_results(
            [
                self._result(f"https://example.com/{i}", f"Title {i}", f"Content {i}")
                for i in range(10)
            ]
        )

        for position, url in enumerate(collector._citation_order):
            assert collector._url_to_index[url] == position

    def test_mark_used_with_cache(self):
        """mark_used returns the 1-based number and records the URL as used."""
        collector = CitationCollector()
        collector.add_from_search_results(
            [
                self._result("https://example.com/1", "Page 1", "Content 1"),
                self._result("https://example.com/2", "Page 2", "Content 2"),
            ]
        )

        assert collector.mark_used("https://example.com/2") == 2
        assert "https://example.com/2" in collector._used_citations

    def test_large_collection_cache_performance(self):
        """The cache stays correct across a 1000-entry collection."""
        total = 1000
        collector = CitationCollector()
        collector.add_from_search_results(
            [
                self._result(f"https://example.com/{i}", f"Title {i}", f"Content {i}")
                for i in range(total)
            ]
        )

        assert len(collector._url_to_index) == total
        # Spot-check lookups at the ends and in the middle.
        for position in (0, 100, 500, 999):
            url = f"https://example.com/{position}"
            assert collector.get_number(url) == position + 1
Reference in New Issue
Block a user