mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-21 21:24:46 +08:00
chore: Improved citation system (#834)
* improve: Improved citation system * fix --------- Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
This commit is contained in:
289
tests/unit/citations/test_collector.py
Normal file
289
tests/unit/citations/test_collector.py
Normal file
@@ -0,0 +1,289 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

"""
Unit tests for CitationCollector optimization with reverse index cache.

Tests the O(1) URL lookup performance optimization via _url_to_index cache.
"""

from src.citations.collector import CitationCollector


class TestCitationCollectorOptimization:
    """Exercises the reverse-index (_url_to_index) cache of CitationCollector.

    The cache maps each citation URL to its 0-based position in
    _citation_order, giving O(1) lookups in get_number/mark_used.
    """

    @staticmethod
    def _result(url, title, content, score=0.9):
        """Build one search-result dict in the shape the collector expects."""
        return {"url": url, "title": title, "content": content, "score": score}

    def test_url_to_index_cache_initialization(self):
        """A fresh collector starts with an empty _url_to_index dict."""
        fresh = CitationCollector()
        assert hasattr(fresh, "_url_to_index")
        assert isinstance(fresh._url_to_index, dict)
        assert not fresh._url_to_index

    def test_add_single_citation_updates_cache(self):
        """Adding one search result records its URL at index 0."""
        collector = CitationCollector()
        collector.add_from_search_results(
            [self._result("https://example.com", "Example", "Content")]
        )
        assert "https://example.com" in collector._url_to_index
        assert collector._url_to_index["https://example.com"] == 0

    def test_add_multiple_citations_updates_cache_correctly(self):
        """Each URL in a batch is indexed by its insertion position."""
        collector = CitationCollector()
        batch = [
            self._result(f"https://example.com/{i}", f"Page {i}", f"Content {i}")
            for i in range(5)
        ]
        collector.add_from_search_results(batch)

        assert len(collector._url_to_index) == 5
        for position in range(5):
            url = f"https://example.com/{position}"
            assert collector._url_to_index[url] == position

    def test_get_number_uses_cache_for_o1_lookup(self):
        """get_number resolves any stored URL; unknown URLs yield None."""
        collector = CitationCollector()
        batch = [
            self._result(f"https://example.com/{i}", f"Title {i}", f"Content {i}")
            for i in range(100)
        ]
        collector.add_from_search_results(batch)

        # Citation numbers are 1-based relative to insertion order.
        for position in (0, 50, 99):
            url = f"https://example.com/{position}"
            assert collector.get_number(url) == position + 1

        assert collector.get_number("https://nonexistent.com") is None

    def test_add_from_crawl_result_updates_cache(self):
        """add_from_crawl_result populates the cache like a search result."""
        collector = CitationCollector()
        collector.add_from_crawl_result(
            url="https://crawled.com/page",
            title="Crawled Page",
            content="Crawled content",
        )

        assert "https://crawled.com/page" in collector._url_to_index
        assert collector._url_to_index["https://crawled.com/page"] == 0

    def test_duplicate_url_does_not_change_cache(self):
        """Re-adding a URL keeps its index while refreshing its metadata."""
        collector = CitationCollector()
        collector.add_from_search_results(
            [self._result("https://example.com", "Title 1", "Content 1", score=0.8)]
        )
        assert collector._url_to_index["https://example.com"] == 0

        # Same URL again, higher score: index must stay stable but the
        # stored citation should pick up the better relevance score.
        collector.add_from_search_results(
            [
                self._result(
                    "https://example.com",
                    "Title 1 Updated",
                    "Content 1 Updated",
                    score=0.95,
                )
            ]
        )
        assert collector._url_to_index["https://example.com"] == 0
        assert collector._citations["https://example.com"].relevance_score == 0.95

    def test_merge_with_updates_cache_correctly(self):
        """merge_with appends the other collector's URLs to the cache."""
        left = CitationCollector()
        right = CitationCollector()
        left.add_from_search_results(
            [self._result("https://a.com", "A", "Content A")]
        )
        right.add_from_search_results(
            [self._result("https://b.com", "B", "Content B")]
        )

        left.merge_with(right)

        assert "https://a.com" in left._url_to_index
        assert "https://b.com" in left._url_to_index
        assert left._url_to_index["https://a.com"] == 0
        assert left._url_to_index["https://b.com"] == 1

    def test_from_dict_rebuilds_cache(self):
        """A round-trip through to_dict/from_dict reconstructs the cache."""
        source = CitationCollector()
        source.add_from_search_results(
            [
                self._result(f"https://example.com/{i}", f"Page {i}", f"Content {i}")
                for i in range(3)
            ]
        )

        restored = CitationCollector.from_dict(source.to_dict())

        assert len(restored._url_to_index) == 3
        for position in range(3):
            url = f"https://example.com/{position}"
            assert url in restored._url_to_index
            assert restored._url_to_index[url] == position

    def test_clear_resets_cache(self):
        """clear() empties the cache along with the other citation state."""
        collector = CitationCollector()
        collector.add_from_search_results(
            [self._result("https://example.com", "Example", "Content")]
        )
        assert len(collector._url_to_index) > 0

        collector.clear()

        assert len(collector._url_to_index) == 0
        assert len(collector._citations) == 0
        assert len(collector._citation_order) == 0

    def test_cache_consistency_with_order_list(self):
        """Cached indices always agree with positions in _citation_order."""
        collector = CitationCollector()
        collector.add_from_search_results(
            [
                self._result(f"https://example.com/{i}", f"Title {i}", f"Content {i}")
                for i in range(10)
            ]
        )

        for position, url in enumerate(collector._citation_order):
            assert collector._url_to_index[url] == position

    def test_mark_used_with_cache(self):
        """mark_used returns the 1-based number and records the URL as used."""
        collector = CitationCollector()
        collector.add_from_search_results(
            [
                self._result("https://example.com/1", "Page 1", "Content 1"),
                self._result("https://example.com/2", "Page 2", "Content 2"),
            ]
        )

        assert collector.mark_used("https://example.com/2") == 2
        assert "https://example.com/2" in collector._used_citations

    def test_large_collection_cache_performance(self):
        """The cache stays correct across a 1000-entry collection."""
        total = 1000
        collector = CitationCollector()
        collector.add_from_search_results(
            [
                self._result(f"https://example.com/{i}", f"Title {i}", f"Content {i}")
                for i in range(total)
            ]
        )

        assert len(collector._url_to_index) == total
        # Spot-check lookups at the ends and in the middle.
        for position in (0, 100, 500, 999):
            url = f"https://example.com/{position}"
            assert collector.get_number(url) == position + 1
Reference in New Issue
Block a user