mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-13 10:24:44 +08:00
* improve: Improved citation system * fix --------- Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
290 lines
9.3 KiB
Python
290 lines
9.3 KiB
Python
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

"""
Unit tests for CitationCollector optimization with reverse index cache.

Tests the O(1) URL lookup performance optimization via _url_to_index cache.
"""

from src.citations.collector import CitationCollector

|
class TestCitationCollectorOptimization:
    """Tests for the reverse-index cache (``_url_to_index``) optimization.

    The cache maps every collected URL to its position in the insertion-order
    list, so ``get_number`` can resolve a URL in O(1) instead of scanning the
    order list linearly.
    """

    @staticmethod
    def _result(url, title, content, score=0.9):
        """Build one search-result dict in the shape the collector consumes."""
        return {"url": url, "title": title, "content": content, "score": score}

    def test_url_to_index_cache_initialization(self):
        """A fresh collector exposes an empty ``_url_to_index`` dict."""
        cc = CitationCollector()
        assert hasattr(cc, "_url_to_index")
        assert isinstance(cc._url_to_index, dict)
        assert len(cc._url_to_index) == 0

    def test_add_single_citation_updates_cache(self):
        """Adding one search result registers its URL at index 0."""
        cc = CitationCollector()
        cc.add_from_search_results(
            [self._result("https://example.com", "Example", "Content")]
        )

        # The cache must now know the URL and its position.
        assert "https://example.com" in cc._url_to_index
        assert cc._url_to_index["https://example.com"] == 0

    def test_add_multiple_citations_updates_cache_correctly(self):
        """Several results are indexed in insertion order, 0..n-1."""
        cc = CitationCollector()
        batch = [
            self._result(f"https://example.com/{i}", f"Page {i}", f"Content {i}")
            for i in range(5)
        ]

        cc.add_from_search_results(batch)

        # Every URL maps to the position it was inserted at.
        assert len(cc._url_to_index) == 5
        for pos in range(5):
            assert cc._url_to_index[f"https://example.com/{pos}"] == pos

    def test_get_number_uses_cache_for_o1_lookup(self):
        """``get_number`` returns 1-based citation numbers via the cache."""
        cc = CitationCollector()
        links = [f"https://example.com/{i}" for i in range(100)]
        batch = [
            self._result(link, f"Title {i}", f"Content {i}")
            for i, link in enumerate(links)
        ]

        cc.add_from_search_results(batch)

        # Spot-check front, middle, and back of the collection.
        assert cc.get_number("https://example.com/0") == 1
        assert cc.get_number("https://example.com/50") == 51
        assert cc.get_number("https://example.com/99") == 100

        # An unknown URL yields None rather than raising.
        assert cc.get_number("https://nonexistent.com") is None

    def test_add_from_crawl_result_updates_cache(self):
        """``add_from_crawl_result`` feeds the cache just like search results."""
        cc = CitationCollector()

        cc.add_from_crawl_result(
            url="https://crawled.com/page",
            title="Crawled Page",
            content="Crawled content",
        )

        assert "https://crawled.com/page" in cc._url_to_index
        assert cc._url_to_index["https://crawled.com/page"] == 0

    def test_duplicate_url_does_not_change_cache(self):
        """Re-adding a URL keeps its index while refreshing its metadata."""
        cc = CitationCollector()

        # First insertion claims index 0.
        cc.add_from_search_results(
            [self._result("https://example.com", "Title 1", "Content 1", 0.8)]
        )
        assert cc._url_to_index["https://example.com"] == 0

        # Same URL again, now with a higher relevance score.
        cc.add_from_search_results(
            [
                self._result(
                    "https://example.com",
                    "Title 1 Updated",
                    "Content 1 Updated",
                    0.95,
                )
            ]
        )

        # Position is stable; only the stored metadata is refreshed.
        assert cc._url_to_index["https://example.com"] == 0
        assert cc._citations["https://example.com"].relevance_score == 0.95

    def test_merge_with_updates_cache_correctly(self):
        """``merge_with`` appends foreign URLs and indexes them correctly."""
        left = CitationCollector()
        right = CitationCollector()

        left.add_from_search_results(
            [self._result("https://a.com", "A", "Content A")]
        )
        right.add_from_search_results(
            [self._result("https://b.com", "B", "Content B")]
        )

        left.merge_with(right)

        # Both URLs live in the merged cache with sequential indices.
        assert "https://a.com" in left._url_to_index
        assert "https://b.com" in left._url_to_index
        assert left._url_to_index["https://a.com"] == 0
        assert left._url_to_index["https://b.com"] == 1

    def test_from_dict_rebuilds_cache(self):
        """Round-tripping through to_dict/from_dict reconstructs the cache."""
        source = CitationCollector()
        source.add_from_search_results(
            [
                self._result(f"https://example.com/{i}", f"Page {i}", f"Content {i}")
                for i in range(3)
            ]
        )

        # Serialize, then restore into a brand-new collector.
        payload = source.to_dict()
        clone = CitationCollector.from_dict(payload)

        # The restored cache must mirror the original indexing.
        assert len(clone._url_to_index) == 3
        for pos in range(3):
            link = f"https://example.com/{pos}"
            assert link in clone._url_to_index
            assert clone._url_to_index[link] == pos

    def test_clear_resets_cache(self):
        """``clear()`` empties the cache alongside the other containers."""
        cc = CitationCollector()
        cc.add_from_search_results(
            [self._result("https://example.com", "Example", "Content")]
        )

        assert len(cc._url_to_index) > 0

        cc.clear()

        assert len(cc._url_to_index) == 0
        assert len(cc._citations) == 0
        assert len(cc._citation_order) == 0

    def test_cache_consistency_with_order_list(self):
        """Every cached index agrees with the URL's slot in _citation_order."""
        cc = CitationCollector()
        links = [f"https://example.com/{i}" for i in range(10)]
        batch = [
            self._result(link, f"Title {i}", f"Content {i}")
            for i, link in enumerate(links)
        ]

        cc.add_from_search_results(batch)

        # The cache is exactly the inverse mapping of the order list.
        for slot, link in enumerate(cc._citation_order):
            assert cc._url_to_index[link] == slot

    def test_mark_used_with_cache(self):
        """``mark_used`` resolves the citation number and records usage."""
        cc = CitationCollector()
        cc.add_from_search_results(
            [
                self._result("https://example.com/1", "Page 1", "Content 1"),
                self._result("https://example.com/2", "Page 2", "Content 2"),
            ]
        )

        # Marking the second citation yields its 1-based number.
        number = cc.mark_used("https://example.com/2")
        assert number == 2

        # The URL is tracked in the used set afterwards.
        assert "https://example.com/2" in cc._used_citations

    def test_large_collection_cache_performance(self):
        """The cache stays correct at scale (1000 entries)."""
        cc = CitationCollector()
        total = 1000
        batch = [
            self._result(f"https://example.com/{i}", f"Title {i}", f"Content {i}")
            for i in range(total)
        ]

        cc.add_from_search_results(batch)

        # One cache entry per citation.
        assert len(cc._url_to_index) == total

        # Lookups stay correct at the edges and in the middle.
        for probe in (0, 100, 500, 999):
            assert cc.get_number(f"https://example.com/{probe}") == probe + 1