Files
deer-flow/tests/unit/citations/test_formatter.py
Xun 9a34e32252 chore : Improved citation system (#834)
* improve: Improved citation system

* fix

---------

Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
2026-01-25 15:49:45 +08:00

424 lines
16 KiB
Python

# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
"""
Unit tests for citation formatter enhancements.
Tests the multi-format citation parsing and extraction capabilities.
"""
from src.citations.formatter import (
parse_citations_from_report,
_extract_markdown_links,
_extract_numbered_citations,
_extract_footnote_citations,
_extract_html_links,
)
class TestExtractMarkdownLinks:
    """Test Markdown link extraction [title](url).

    Each test passes a literal string to ``_extract_markdown_links`` and
    inspects the returned citations through their ``"title"``, ``"url"`` and
    ``"format"`` keys.
    """

    def test_extract_single_markdown_link(self):
        """Test extraction of a single markdown link."""
        text = "[Example Article](https://example.com)"
        citations = _extract_markdown_links(text)
        assert len(citations) == 1
        assert citations[0]["title"] == "Example Article"
        assert citations[0]["url"] == "https://example.com"
        assert citations[0]["format"] == "markdown"

    def test_extract_multiple_markdown_links(self):
        """Test that two links on one line are returned in document order."""
        text = "[Link 1](https://example.com/1) and [Link 2](https://example.com/2)"
        citations = _extract_markdown_links(text)
        assert len(citations) == 2
        assert citations[0]["title"] == "Link 1"
        assert citations[1]["title"] == "Link 2"

    def test_extract_markdown_link_with_spaces(self):
        """Test markdown link with spaces in title."""
        text = "[Article Title With Spaces](https://example.com)"
        citations = _extract_markdown_links(text)
        assert len(citations) == 1
        assert citations[0]["title"] == "Article Title With Spaces"

    def test_extract_markdown_link_ignore_non_http(self):
        """Test that non-HTTP URLs are ignored."""
        # The relative link comes first, so the only result must be the https one.
        text = "[Relative Link](./relative/path) [HTTP Link](https://example.com)"
        citations = _extract_markdown_links(text)
        assert len(citations) == 1
        assert citations[0]["url"] == "https://example.com"

    def test_extract_markdown_link_with_query_params(self):
        """Test markdown links with query parameters."""
        text = "[Search Result](https://example.com/search?q=test&page=1)"
        citations = _extract_markdown_links(text)
        assert len(citations) == 1
        assert "q=test" in citations[0]["url"]

    def test_extract_markdown_link_empty_text(self):
        """Test that text without any markdown link yields no citations."""
        text = "Just plain text with no links"
        citations = _extract_markdown_links(text)
        assert len(citations) == 0

    def test_extract_markdown_link_strip_whitespace(self):
        """Test that title and URL are returned without surrounding whitespace.

        NOTE(review): the input contains no padding, so this pins the result
        for clean input only; it does not exercise any stripping logic.
        """
        # Markdown links with spaces in URL are not valid, so they won't be extracted
        text = "[Title](https://example.com)"
        citations = _extract_markdown_links(text)
        # The input holds exactly one link, so require exactly one result
        # instead of the looser "at least one".
        assert len(citations) == 1
        assert citations[0]["title"] == "Title"
        assert citations[0]["url"] == "https://example.com"
class TestExtractNumberedCitations:
    """Test numbered citation extraction [1] Title - URL.

    Each test passes a literal string to ``_extract_numbered_citations`` and
    inspects the returned citations through their ``"title"``, ``"url"`` and
    ``"format"`` keys.
    """

    def test_extract_single_numbered_citation(self):
        """Test extraction of a single numbered citation."""
        text = "[1] Example Article - https://example.com"
        citations = _extract_numbered_citations(text)
        assert len(citations) == 1
        assert citations[0]["title"] == "Example Article"
        assert citations[0]["url"] == "https://example.com"
        assert citations[0]["format"] == "numbered"

    def test_extract_multiple_numbered_citations(self):
        """Test that one citation per line is extracted, in document order."""
        text = "[1] First - https://example.com/1\n[2] Second - https://example.com/2"
        citations = _extract_numbered_citations(text)
        assert len(citations) == 2
        assert citations[0]["title"] == "First"
        assert citations[1]["title"] == "Second"

    def test_extract_numbered_citation_with_long_title(self):
        """Test numbered citation with longer title."""
        text = "[5] A Comprehensive Guide to Python Programming - https://example.com"
        citations = _extract_numbered_citations(text)
        assert len(citations) == 1
        assert "Comprehensive Guide" in citations[0]["title"]

    def test_extract_numbered_citation_requires_valid_format(self):
        """Test that invalid numbered format is not extracted."""
        text = "[1 Title - https://example.com"  # Missing closing bracket
        citations = _extract_numbered_citations(text)
        assert len(citations) == 0

    def test_extract_numbered_citation_empty_text(self):
        """Test that text without numbered citations yields no citations."""
        text = "Just plain text"
        citations = _extract_numbered_citations(text)
        assert len(citations) == 0

    def test_extract_numbered_citation_various_numbers(self):
        """Test that multi-digit citation numbers are accepted."""
        text = "[10] Title Ten - https://example.com/10\n[999] Title 999 - https://example.com/999"
        citations = _extract_numbered_citations(text)
        assert len(citations) == 2

    def test_extract_numbered_citation_ignore_non_http(self):
        """Test that non-HTTP URLs in numbered citations are ignored."""
        text = "[1] Invalid - file://path [2] Valid - https://example.com"
        citations = _extract_numbered_citations(text)
        # Both entries share one line, so the test does not insist that the
        # valid one is found; it only caps the result at one citation.
        assert len(citations) <= 1
        # The count alone would also accept the file:// entry, so check that
        # nothing with a non-HTTP(S) URL was extracted.
        assert all(
            citation["url"].startswith(("http://", "https://"))
            for citation in citations
        )
class TestExtractFootnoteCitations:
    """Exercise ``_extract_footnote_citations`` on ``[^n]: Title - URL`` input."""

    def test_extract_single_footnote_citation(self):
        """One footnote line gives one citation tagged with the footnote format."""
        found = _extract_footnote_citations(
            "[^1]: Example Article - https://example.com"
        )
        assert len(found) == 1
        entry = found[0]
        assert entry["title"] == "Example Article"
        assert entry["url"] == "https://example.com"
        assert entry["format"] == "footnote"

    def test_extract_multiple_footnote_citations(self):
        """Two footnote lines separated by a newline give two citations."""
        footnote_lines = [
            "[^1]: First - https://example.com/1",
            "[^2]: Second - https://example.com/2",
        ]
        found = _extract_footnote_citations("\n".join(footnote_lines))
        assert len(found) == 2

    def test_extract_footnote_with_complex_number(self):
        """A multi-digit footnote label is accepted."""
        found = _extract_footnote_citations("[^123]: Title - https://example.com")
        assert len(found) == 1
        assert found[0]["title"] == "Title"

    def test_extract_footnote_citation_with_spaces(self):
        """A multi-word title and a trailing space still give a clean title."""
        snippet = "[^1]: Title with spaces - https://example.com "
        found = _extract_footnote_citations(snippet)
        assert len(found) == 1
        # The returned title must carry no surrounding whitespace.
        assert found[0]["title"] == "Title with spaces"

    def test_extract_footnote_citation_empty_text(self):
        """Text without any footnote definition gives an empty result."""
        found = _extract_footnote_citations("No footnotes here")
        assert len(found) == 0

    def test_extract_footnote_requires_caret(self):
        """A ``[1]:`` label without the caret is not treated as a footnote."""
        snippet = "[1]: Title - https://example.com"  # Missing ^
        found = _extract_footnote_citations(snippet)
        assert len(found) == 0
class TestExtractHtmlLinks:
    """Test HTML link extraction <a href="url">title</a>.

    Each test passes a literal string to ``_extract_html_links`` and inspects
    the returned citations through their ``"title"``, ``"url"`` and
    ``"format"`` keys.
    """

    def test_extract_single_html_link(self):
        """Test extraction of a single HTML link."""
        text = '<a href="https://example.com">Example Article</a>'
        citations = _extract_html_links(text)
        assert len(citations) == 1
        assert citations[0]["title"] == "Example Article"
        assert citations[0]["url"] == "https://example.com"
        assert citations[0]["format"] == "html"

    def test_extract_multiple_html_links(self):
        """Test extraction of multiple HTML links."""
        text = '<a href="https://a.com">Link A</a> <a href="https://b.com">Link B</a>'
        citations = _extract_html_links(text)
        assert len(citations) == 2

    def test_extract_html_link_single_quotes(self):
        """Test HTML links with single quotes."""
        text = "<a href='https://example.com'>Title</a>"
        citations = _extract_html_links(text)
        assert len(citations) == 1
        assert citations[0]["url"] == "https://example.com"

    def test_extract_html_link_with_attributes(self):
        """Test HTML links with attributes before and after ``href``."""
        text = '<a class="link" href="https://example.com" target="_blank">Title</a>'
        citations = _extract_html_links(text)
        assert len(citations) == 1
        assert citations[0]["url"] == "https://example.com"

    def test_extract_html_link_ignore_non_http(self):
        """Test that non-HTTP URLs are ignored."""
        # The mailto: anchor comes first, so the only result must be the https one.
        text = '<a href="mailto:test@example.com">Email</a> <a href="https://example.com">Web</a>'
        citations = _extract_html_links(text)
        assert len(citations) == 1
        assert citations[0]["url"] == "https://example.com"

    def test_extract_html_link_case_insensitive(self):
        """Test that upper-case tag and attribute names are still matched."""
        text = '<A HREF="https://example.com">Title</A>'
        citations = _extract_html_links(text)
        assert len(citations) == 1

    def test_extract_html_link_empty_text(self):
        """Test that text without any anchor tag yields no citations."""
        text = "No links here"
        citations = _extract_html_links(text)
        assert len(citations) == 0

    def test_extract_html_link_strip_whitespace(self):
        """Test that whitespace in title is stripped."""
        text = '<a href="https://example.com"> Title with spaces </a>'
        citations = _extract_html_links(text)
        # Check the count first so a missing match fails as an assertion
        # rather than as an IndexError on the lookup below.
        assert len(citations) == 1
        assert citations[0]["title"] == "Title with spaces"
class TestParseCitationsFromReport:
    """End-to-end checks of ``parse_citations_from_report`` on whole reports.

    The parser's result is read through its ``"count"`` and ``"citations"``
    keys. Report fixtures are spelled as adjacent string literals, one per
    report line, so the exact text handed to the parser is visible.
    """

    def test_parse_markdown_links_from_report(self):
        """Markdown links under a "Key Citations" heading are picked up."""
        document = (
            "\n"
            "## Key Citations\n"
            "[GitHub](https://github.com)\n"
            "[Python Docs](https://python.org)\n"
        )
        parsed = parse_citations_from_report(document)
        assert parsed["count"] >= 2
        found_urls = [entry["url"] for entry in parsed["citations"]]
        assert "https://github.com" in found_urls

    def test_parse_numbered_citations_from_report(self):
        """Numbered entries under a "References" heading are picked up."""
        document = (
            "\n"
            "## References\n"
            "[1] GitHub - https://github.com\n"
            "[2] Python - https://python.org\n"
        )
        parsed = parse_citations_from_report(document)
        assert parsed["count"] >= 2

    def test_parse_mixed_format_citations(self):
        """A section mixing all four citation formats is parsed."""
        document = (
            "\n"
            "## Key Citations\n"
            "[GitHub](https://github.com)\n"
            "[^1]: Python - https://python.org\n"
            "[2] Wikipedia - https://wikipedia.org\n"
            '<a href="https://stackoverflow.com">Stack Overflow</a>\n'
        )
        parsed = parse_citations_from_report(document)
        # Four citations are present, one per format; the check only
        # requires that at least three of them are found.
        assert parsed["count"] >= 3

    def test_parse_citations_deduplication(self):
        """Three links sharing one URL collapse into a single citation."""
        document = (
            "\n"
            "## Key Citations\n"
            "[GitHub 1](https://github.com)\n"
            "[GitHub 2](https://github.com)\n"
            "[GitHub](https://github.com)\n"
        )
        parsed = parse_citations_from_report(document)
        # Only one unique URL appears, so only one citation may remain.
        assert parsed["count"] == 1
        assert parsed["citations"][0]["url"] == "https://github.com"

    def test_parse_citations_various_section_patterns(self):
        """References, Sources and Bibliography headings are all recognised."""
        with_references = (
            "\n"
            "## References\n"
            "[GitHub](https://github.com)\n"
        )
        with_sources = (
            "\n"
            "## Sources\n"
            "[GitHub](https://github.com)\n"
        )
        with_bibliography = (
            "\n"
            "## Bibliography\n"
            "[GitHub](https://github.com)\n"
        )
        # Checked in the same order as the fixtures are declared.
        for document in (with_references, with_sources, with_bibliography):
            assert parse_citations_from_report(document)["count"] >= 1

    def test_parse_citations_custom_patterns(self):
        """A caller-supplied heading pattern is used to locate the section."""
        document = (
            "\n"
            "## My Custom Sources\n"
            "[GitHub](https://github.com)\n"
        )
        heading_patterns = [r"##\s*My Custom Sources"]
        parsed = parse_citations_from_report(
            document, section_patterns=heading_patterns
        )
        assert parsed["count"] >= 1

    def test_parse_citations_empty_report(self):
        """An empty string gives a zero count and an empty citation list."""
        parsed = parse_citations_from_report("")
        assert parsed["count"] == 0
        assert parsed["citations"] == []

    def test_parse_citations_no_section(self):
        """Plain prose without a citation heading gives a zero count."""
        parsed = parse_citations_from_report(
            "This is a report with no citations section"
        )
        assert parsed["count"] == 0

    def test_parse_citations_complex_report(self):
        """A multi-section report with mixed formats yields several citations."""
        document = (
            "\n"
            "# Research Report\n"
            "## Introduction\n"
            "This report summarizes findings from multiple sources.\n"
            "## Key Findings\n"
            "Some important discoveries were made based on research [GitHub](https://github.com).\n"
            "## Key Citations\n"
            "1. Primary sources:\n"
            "[GitHub](https://github.com) - A collaborative platform\n"
            "[^1]: Python - https://python.org\n"
            "2. Secondary sources:\n"
            "[2] Wikipedia - https://wikipedia.org\n"
            "3. Web resources:\n"
            '<a href="https://stackoverflow.com">Stack Overflow</a>\n'
            "## Methodology\n"
            "[Additional](https://example.com) details about methodology.\n"
            "---\n"
            "[^1]: The Python programming language official site\n"
        )
        parsed = parse_citations_from_report(document)
        # At least three citations are expected from the report.
        assert parsed["count"] >= 3
        found_urls = [entry["url"] for entry in parsed["citations"]]
        # Loose sanity check: at least one URL mentions a known domain.
        expected_domains = ("github.com", "python.org")
        assert any(
            domain in url for url in found_urls for domain in expected_domains
        )

    def test_parse_citations_stops_at_next_section(self):
        """Citations are found when another heading follows the section.

        The trailing section holds no links, so this only verifies that the
        first citation is extracted, not that parsing stops at the heading.
        """
        document = (
            "\n"
            "## Key Citations\n"
            "[Cite 1](https://example.com/1)\n"
            "[Cite 2](https://example.com/2)\n"
            "## Next Section\n"
            "Some other content\n"
        )
        parsed = parse_citations_from_report(document)
        assert parsed["count"] >= 1
        assert any(
            "example.com/1" in entry["url"] for entry in parsed["citations"]
        )

    def test_parse_citations_preserves_metadata(self):
        """The first citation exposes title, url and format keys."""
        document = (
            "\n"
            "## Key Citations\n"
            "[Python Documentation](https://python.org)\n"
        )
        parsed = parse_citations_from_report(document)
        assert len(parsed["citations"]) >= 1
        first = parsed["citations"][0]
        for required_key in ("title", "url", "format"):
            assert required_key in first

    def test_parse_citations_whitespace_handling(self):
        """A minimal section with a single link line is still parsed."""
        document = (
            "\n"
            "## Key Citations\n"
            "[Link](https://example.com)\n"
        )
        parsed = parse_citations_from_report(document)
        assert parsed["count"] >= 1

    def test_parse_citations_multiline_links(self):
        """A link embedded in the middle of a sentence is extracted."""
        document = (
            "\n"
            "## Key Citations\n"
            "Some paragraph with a [link to example](https://example.com) in the middle.\n"
        )
        parsed = parse_citations_from_report(document)
        assert parsed["count"] >= 1