chore : Improved citation system (#834)

* improve: Improved citation system * fix --------- Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
2026-04-21 05:14:45 +08:00 · 2026-01-25 15:49:45 +08:00
parent 31624b64b8
commit 9a34e32252
8 changed files with 1735 additions and 65 deletions
--- a/tests/unit/citations/test_formatter.py
+++ b/tests/unit/citations/test_formatter.py
@@ -0,0 +1,423 @@
+# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
+# SPDX-License-Identifier: MIT
+
+"""
+Unit tests for citation formatter enhancements.
+
+Tests the multi-format citation parsing and extraction capabilities.
+"""
+
+from src.citations.formatter import (
+    parse_citations_from_report,
+    _extract_markdown_links,
+    _extract_numbered_citations,
+    _extract_footnote_citations,
+    _extract_html_links,
+)
+
+
+class TestExtractMarkdownLinks:
+    """Test Markdown link extraction [title](url)."""
+
+    def test_extract_single_markdown_link(self):
+        """Test extraction of a single markdown link."""
+        text = "[Example Article](https://example.com)"
+        citations = _extract_markdown_links(text)
+        assert len(citations) == 1
+        assert citations[0]["title"] == "Example Article"
+        assert citations[0]["url"] == "https://example.com"
+        assert citations[0]["format"] == "markdown"
+
+    def test_extract_multiple_markdown_links(self):
+        """Test extraction of multiple markdown links."""
+        text = "[Link 1](https://example.com/1) and [Link 2](https://example.com/2)"
+        citations = _extract_markdown_links(text)
+        assert len(citations) == 2
+        assert citations[0]["title"] == "Link 1"
+        assert citations[1]["title"] == "Link 2"
+
+    def test_extract_markdown_link_with_spaces(self):
+        """Test markdown link with spaces in title."""
+        text = "[Article Title With Spaces](https://example.com)"
+        citations = _extract_markdown_links(text)
+        assert len(citations) == 1
+        assert citations[0]["title"] == "Article Title With Spaces"
+
+    def test_extract_markdown_link_ignore_non_http(self):
+        """Test that non-HTTP URLs are ignored."""
+        text = "[Relative Link](./relative/path) [HTTP Link](https://example.com)"
+        citations = _extract_markdown_links(text)
+        assert len(citations) == 1
+        assert citations[0]["url"] == "https://example.com"
+
+    def test_extract_markdown_link_with_query_params(self):
+        """Test markdown links with query parameters."""
+        text = "[Search Result](https://example.com/search?q=test&page=1)"
+        citations = _extract_markdown_links(text)
+        assert len(citations) == 1
+        assert "q=test" in citations[0]["url"]
+
+    def test_extract_markdown_link_empty_text(self):
+        """Test with no markdown links."""
+        text = "Just plain text with no links"
+        citations = _extract_markdown_links(text)
+        assert len(citations) == 0
+
+    def test_extract_markdown_link_strip_whitespace(self):
+        """Test that whitespace in title and URL is stripped."""
+        # Markdown links with spaces in URL are not valid, so they won't be extracted
+        text = "[Title](https://example.com)"
+        citations = _extract_markdown_links(text)
+        assert len(citations) >= 1
+        assert citations[0]["title"] == "Title"
+        assert citations[0]["url"] == "https://example.com"
+
+
+class TestExtractNumberedCitations:
+    """Test numbered citation extraction [1] Title - URL."""
+
+    def test_extract_single_numbered_citation(self):
+        """Test extraction of a single numbered citation."""
+        text = "[1] Example Article - https://example.com"
+        citations = _extract_numbered_citations(text)
+        assert len(citations) == 1
+        assert citations[0]["title"] == "Example Article"
+        assert citations[0]["url"] == "https://example.com"
+        assert citations[0]["format"] == "numbered"
+
+    def test_extract_multiple_numbered_citations(self):
+        """Test extraction of multiple numbered citations."""
+        text = "[1] First - https://example.com/1\n[2] Second - https://example.com/2"
+        citations = _extract_numbered_citations(text)
+        assert len(citations) == 2
+        assert citations[0]["title"] == "First"
+        assert citations[1]["title"] == "Second"
+
+    def test_extract_numbered_citation_with_long_title(self):
+        """Test numbered citation with longer title."""
+        text = "[5] A Comprehensive Guide to Python Programming - https://example.com"
+        citations = _extract_numbered_citations(text)
+        assert len(citations) == 1
+        assert "Comprehensive Guide" in citations[0]["title"]
+
+    def test_extract_numbered_citation_requires_valid_format(self):
+        """Test that invalid numbered format is not extracted."""
+        text = "[1 Title - https://example.com"  # Missing closing bracket
+        citations = _extract_numbered_citations(text)
+        assert len(citations) == 0
+
+    def test_extract_numbered_citation_empty_text(self):
+        """Test with no numbered citations."""
+        text = "Just plain text"
+        citations = _extract_numbered_citations(text)
+        assert len(citations) == 0
+
+    def test_extract_numbered_citation_various_numbers(self):
+        """Test with various citation numbers."""
+        text = "[10] Title Ten - https://example.com/10\n[999] Title 999 - https://example.com/999"
+        citations = _extract_numbered_citations(text)
+        assert len(citations) == 2
+
+    def test_extract_numbered_citation_ignore_non_http(self):
+        """Test that non-HTTP URLs in numbered citations are ignored."""
+        text = "[1] Invalid - file://path [2] Valid - https://example.com"
+        citations = _extract_numbered_citations(text)
+        # Only the valid one should be extracted
+        assert len(citations) <= 1
+
+
+class TestExtractFootnoteCitations:
+    """Test footnote citation extraction [^1]: Title - URL."""
+
+    def test_extract_single_footnote_citation(self):
+        """Test extraction of a single footnote citation."""
+        text = "[^1]: Example Article - https://example.com"
+        citations = _extract_footnote_citations(text)
+        assert len(citations) == 1
+        assert citations[0]["title"] == "Example Article"
+        assert citations[0]["url"] == "https://example.com"
+        assert citations[0]["format"] == "footnote"
+
+    def test_extract_multiple_footnote_citations(self):
+        """Test extraction of multiple footnote citations."""
+        text = "[^1]: First - https://example.com/1\n[^2]: Second - https://example.com/2"
+        citations = _extract_footnote_citations(text)
+        assert len(citations) == 2
+
+    def test_extract_footnote_with_complex_number(self):
+        """Test footnote extraction with various numbers."""
+        text = "[^123]: Title - https://example.com"
+        citations = _extract_footnote_citations(text)
+        assert len(citations) == 1
+        assert citations[0]["title"] == "Title"
+
+    def test_extract_footnote_citation_with_spaces(self):
+        """Test footnote with spaces around separator."""
+        text = "[^1]:  Title with spaces  -  https://example.com  "
+        citations = _extract_footnote_citations(text)
+        assert len(citations) == 1
+        # Should strip whitespace
+        assert citations[0]["title"] == "Title with spaces"
+
+    def test_extract_footnote_citation_empty_text(self):
+        """Test with no footnote citations."""
+        text = "No footnotes here"
+        citations = _extract_footnote_citations(text)
+        assert len(citations) == 0
+
+    def test_extract_footnote_requires_caret(self):
+        """Test that missing caret prevents extraction."""
+        text = "[1]: Title - https://example.com"  # Missing ^
+        citations = _extract_footnote_citations(text)
+        assert len(citations) == 0
+
+
+class TestExtractHtmlLinks:
+    """Test HTML link extraction <a href="url">title</a>."""
+
+    def test_extract_single_html_link(self):
+        """Test extraction of a single HTML link."""
+        text = '<a href="https://example.com">Example Article</a>'
+        citations = _extract_html_links(text)
+        assert len(citations) == 1
+        assert citations[0]["title"] == "Example Article"
+        assert citations[0]["url"] == "https://example.com"
+        assert citations[0]["format"] == "html"
+
+    def test_extract_multiple_html_links(self):
+        """Test extraction of multiple HTML links."""
+        text = '<a href="https://a.com">Link A</a> <a href="https://b.com">Link B</a>'
+        citations = _extract_html_links(text)
+        assert len(citations) == 2
+
+    def test_extract_html_link_single_quotes(self):
+        """Test HTML links with single quotes."""
+        text = "<a href='https://example.com'>Title</a>"
+        citations = _extract_html_links(text)
+        assert len(citations) == 1
+        assert citations[0]["url"] == "https://example.com"
+
+    def test_extract_html_link_with_attributes(self):
+        """Test HTML links with additional attributes."""
+        text = '<a class="link" href="https://example.com" target="_blank">Title</a>'
+        citations = _extract_html_links(text)
+        assert len(citations) == 1
+        assert citations[0]["url"] == "https://example.com"
+
+    def test_extract_html_link_ignore_non_http(self):
+        """Test that non-HTTP URLs are ignored."""
+        text = '<a href="mailto:test@example.com">Email</a> <a href="https://example.com">Web</a>'
+        citations = _extract_html_links(text)
+        assert len(citations) == 1
+        assert citations[0]["url"] == "https://example.com"
+
+    def test_extract_html_link_case_insensitive(self):
+        """Test that HTML extraction is case-insensitive."""
+        text = '<A HREF="https://example.com">Title</A>'
+        citations = _extract_html_links(text)
+        assert len(citations) == 1
+
+    def test_extract_html_link_empty_text(self):
+        """Test with no HTML links."""
+        text = "No links here"
+        citations = _extract_html_links(text)
+        assert len(citations) == 0
+
+    def test_extract_html_link_strip_whitespace(self):
+        """Test that whitespace in title is stripped."""
+        text = '<a href="https://example.com">  Title with spaces  </a>'
+        citations = _extract_html_links(text)
+        assert citations[0]["title"] == "Title with spaces"
+
+
+class TestParseCitationsFromReport:
+    """Test comprehensive citation parsing from complete reports."""
+
+    def test_parse_markdown_links_from_report(self):
+        """Test parsing markdown links from a report."""
+        report = """
+        ## Key Citations
+        
+        [GitHub](https://github.com)
+        [Python Docs](https://python.org)
+        """
+        result = parse_citations_from_report(report)
+        assert result["count"] >= 2
+        urls = [c["url"] for c in result["citations"]]
+        assert "https://github.com" in urls
+
+    def test_parse_numbered_citations_from_report(self):
+        """Test parsing numbered citations."""
+        report = """
+        ## References
+        
+        [1] GitHub - https://github.com
+        [2] Python - https://python.org
+        """
+        result = parse_citations_from_report(report)
+        assert result["count"] >= 2
+
+    def test_parse_mixed_format_citations(self):
+        """Test parsing mixed citation formats."""
+        report = """
+        ## Key Citations
+        
+        [GitHub](https://github.com)
+        [^1]: Python - https://python.org
+        [2] Wikipedia - https://wikipedia.org
+        <a href="https://stackoverflow.com">Stack Overflow</a>
+        """
+        result = parse_citations_from_report(report)
+        # Should find all 4 citations
+        assert result["count"] >= 3
+
+    def test_parse_citations_deduplication(self):
+        """Test that duplicate URLs are deduplicated."""
+        report = """
+        ## Key Citations
+        
+        [GitHub 1](https://github.com)
+        [GitHub 2](https://github.com)
+        [GitHub](https://github.com)
+        """
+        result = parse_citations_from_report(report)
+        # Should have only 1 unique citation
+        assert result["count"] == 1
+        assert result["citations"][0]["url"] == "https://github.com"
+
+    def test_parse_citations_various_section_patterns(self):
+        """Test parsing with different section headers."""
+        report_refs = """
+        ## References
+        [GitHub](https://github.com)
+        """
+        report_sources = """
+        ## Sources
+        [GitHub](https://github.com)
+        """
+        report_bibliography = """
+        ## Bibliography
+        [GitHub](https://github.com)
+        """
+
+        assert parse_citations_from_report(report_refs)["count"] >= 1
+        assert parse_citations_from_report(report_sources)["count"] >= 1
+        assert parse_citations_from_report(report_bibliography)["count"] >= 1
+
+    def test_parse_citations_custom_patterns(self):
+        """Test parsing with custom section patterns."""
+        report = """
+        ## My Custom Sources
+        [GitHub](https://github.com)
+        """
+        result = parse_citations_from_report(
+            report,
+            section_patterns=[r"##\s*My Custom Sources"]
+        )
+        assert result["count"] >= 1
+
+    def test_parse_citations_empty_report(self):
+        """Test parsing an empty report."""
+        result = parse_citations_from_report("")
+        assert result["count"] == 0
+        assert result["citations"] == []
+
+    def test_parse_citations_no_section(self):
+        """Test parsing report without citation section."""
+        report = "This is a report with no citations section"
+        result = parse_citations_from_report(report)
+        assert result["count"] == 0
+
+    def test_parse_citations_complex_report(self):
+        """Test parsing a complex, realistic report."""
+        report = """
+        # Research Report
+        
+        ## Introduction
+        
+        This report summarizes findings from multiple sources.
+        
+        ## Key Findings
+        
+        Some important discoveries were made based on research [GitHub](https://github.com).
+        
+        ## Key Citations
+        
+        1. Primary sources:
+        [GitHub](https://github.com) - A collaborative platform
+        [^1]: Python - https://python.org
+        
+        2. Secondary sources:
+        [2] Wikipedia - https://wikipedia.org
+        
+        3. Web resources:
+        <a href="https://stackoverflow.com">Stack Overflow</a>
+        
+        ## Methodology
+        
+        [Additional](https://example.com) details about methodology.
+        
+        ---
+        
+        [^1]: The Python programming language official site
+        """
+
+        result = parse_citations_from_report(report)
+        # Should extract multiple citations from the Key Citations section
+        assert result["count"] >= 3
+        urls = [c["url"] for c in result["citations"]]
+        # Verify some key URLs are found
+        assert any("github.com" in url or "python.org" in url for url in urls)
+
+    def test_parse_citations_stops_at_next_section(self):
+        """Test that citation extraction looks for citation sections."""
+        report = """
+        ## Key Citations
+        
+        [Cite 1](https://example.com/1)
+        [Cite 2](https://example.com/2)
+        
+        ## Next Section
+        
+        Some other content
+        """
+        result = parse_citations_from_report(report)
+        # Should extract citations from the Key Citations section
+        # Note: The regex stops at next ## section
+        assert result["count"] >= 1
+        assert any("example.com/1" in c["url"] for c in result["citations"])
+
+    def test_parse_citations_preserves_metadata(self):
+        """Test that citation metadata is preserved."""
+        report = """
+        ## Key Citations
+        
+        [Python Documentation](https://python.org)
+        """
+        result = parse_citations_from_report(report)
+        assert len(result["citations"]) >= 1
+        citation = result["citations"][0]
+        assert "title" in citation
+        assert "url" in citation
+        assert "format" in citation
+
+    def test_parse_citations_whitespace_handling(self):
+        """Test handling of various whitespace configurations."""
+        report = """
+        ##   Key Citations   
+        
+        [Link](https://example.com)
+        
+        """
+        result = parse_citations_from_report(report)
+        assert result["count"] >= 1
+
+    def test_parse_citations_multiline_links(self):
+        """Test extraction of links across formatting."""
+        report = """
+        ## Key Citations
+        
+        Some paragraph with a [link to example](https://example.com) in the middle.
+        """
+        result = parse_citations_from_report(report)
+        assert result["count"] >= 1