mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-21 05:14:45 +08:00
chore : Improved citation system (#834)
* improve: Improved citation system * fix --------- Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
This commit is contained in:
423
tests/unit/citations/test_formatter.py
Normal file
423
tests/unit/citations/test_formatter.py
Normal file
@@ -0,0 +1,423 @@
|
||||
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
"""
|
||||
Unit tests for citation formatter enhancements.
|
||||
|
||||
Tests the multi-format citation parsing and extraction capabilities.
|
||||
"""
|
||||
|
||||
from src.citations.formatter import (
|
||||
parse_citations_from_report,
|
||||
_extract_markdown_links,
|
||||
_extract_numbered_citations,
|
||||
_extract_footnote_citations,
|
||||
_extract_html_links,
|
||||
)
|
||||
|
||||
|
||||
class TestExtractMarkdownLinks:
    """Tests for ``_extract_markdown_links`` on ``[title](url)`` links."""

    def test_extract_single_markdown_link(self):
        """A lone link yields one citation with title, URL and format tag."""
        text = "[Example Article](https://example.com)"
        citations = _extract_markdown_links(text)
        assert len(citations) == 1
        assert citations[0]["title"] == "Example Article"
        assert citations[0]["url"] == "https://example.com"
        assert citations[0]["format"] == "markdown"

    def test_extract_multiple_markdown_links(self):
        """Two links on one line are both returned, in document order."""
        text = "[Link 1](https://example.com/1) and [Link 2](https://example.com/2)"
        citations = _extract_markdown_links(text)
        assert len(citations) == 2
        assert citations[0]["title"] == "Link 1"
        assert citations[1]["title"] == "Link 2"

    def test_extract_markdown_link_with_spaces(self):
        """Spaces inside the link title are kept as part of the title."""
        text = "[Article Title With Spaces](https://example.com)"
        citations = _extract_markdown_links(text)
        assert len(citations) == 1
        assert citations[0]["title"] == "Article Title With Spaces"

    def test_extract_markdown_link_ignore_non_http(self):
        """A relative target is skipped while the HTTPS link is kept."""
        text = "[Relative Link](./relative/path) [HTTP Link](https://example.com)"
        citations = _extract_markdown_links(text)
        assert len(citations) == 1
        assert citations[0]["url"] == "https://example.com"

    def test_extract_markdown_link_with_query_params(self):
        """The query string of the target URL survives extraction."""
        text = "[Search Result](https://example.com/search?q=test&page=1)"
        citations = _extract_markdown_links(text)
        assert len(citations) == 1
        assert "q=test" in citations[0]["url"]

    def test_extract_markdown_link_empty_text(self):
        """Plain text without any link produces no citations."""
        text = "Just plain text with no links"
        citations = _extract_markdown_links(text)
        assert len(citations) == 0

    def test_extract_markdown_link_strip_whitespace(self):
        """A clean link comes back with an exact, unpadded title and URL.

        The input contains no surrounding whitespace, so this checks that the
        extractor does not introduce any rather than that it removes some.
        """
        # Markdown links with spaces in URL are not valid, so they won't be extracted
        text = "[Title](https://example.com)"
        citations = _extract_markdown_links(text)
        # Exactly one link is present, so anything other than one citation
        # (including spurious extra matches) is a failure.
        assert len(citations) == 1
        assert citations[0]["title"] == "Title"
        assert citations[0]["url"] == "https://example.com"
|
||||
|
||||
|
||||
class TestExtractNumberedCitations:
    """Tests for ``_extract_numbered_citations`` on ``[1] Title - URL`` entries."""

    def test_extract_single_numbered_citation(self):
        """One numbered entry yields one citation with title, URL and format tag."""
        text = "[1] Example Article - https://example.com"
        citations = _extract_numbered_citations(text)
        assert len(citations) == 1
        assert citations[0]["title"] == "Example Article"
        assert citations[0]["url"] == "https://example.com"
        assert citations[0]["format"] == "numbered"

    def test_extract_multiple_numbered_citations(self):
        """Entries on separate lines are each returned, in document order."""
        text = "[1] First - https://example.com/1\n[2] Second - https://example.com/2"
        citations = _extract_numbered_citations(text)
        assert len(citations) == 2
        assert citations[0]["title"] == "First"
        assert citations[1]["title"] == "Second"

    def test_extract_numbered_citation_with_long_title(self):
        """A multi-word title is captured up to the ``-`` separator."""
        text = "[5] A Comprehensive Guide to Python Programming - https://example.com"
        citations = _extract_numbered_citations(text)
        assert len(citations) == 1
        assert "Comprehensive Guide" in citations[0]["title"]

    def test_extract_numbered_citation_requires_valid_format(self):
        """An entry whose ``[n]`` marker is malformed is not extracted."""
        text = "[1 Title - https://example.com"  # Missing closing bracket
        citations = _extract_numbered_citations(text)
        assert len(citations) == 0

    def test_extract_numbered_citation_empty_text(self):
        """Text without any numbered entry produces no citations."""
        text = "Just plain text"
        citations = _extract_numbered_citations(text)
        assert len(citations) == 0

    def test_extract_numbered_citation_various_numbers(self):
        """Multi-digit citation numbers are accepted."""
        text = "[10] Title Ten - https://example.com/10\n[999] Title 999 - https://example.com/999"
        citations = _extract_numbered_citations(text)
        assert len(citations) == 2

    def test_extract_numbered_citation_ignore_non_http(self):
        """An entry pointing at a non-HTTP URL must never be returned."""
        text = "[1] Invalid - file://path [2] Valid - https://example.com"
        citations = _extract_numbered_citations(text)
        # At most the HTTP(S) entry may come back; the bound tolerates an
        # empty result as well.
        assert len(citations) <= 1
        # The count alone cannot tell which entry survived, so also require
        # that nothing with a non-HTTP scheme was extracted.
        assert all(
            c["url"].startswith(("http://", "https://")) for c in citations
        )
|
||||
|
||||
|
||||
class TestExtractFootnoteCitations:
    """Checks for ``_extract_footnote_citations`` on ``[^1]: Title - URL`` entries."""

    def test_extract_single_footnote_citation(self):
        """One footnote definition produces one fully populated citation."""
        found = _extract_footnote_citations(
            "[^1]: Example Article - https://example.com"
        )
        assert len(found) == 1
        entry = found[0]
        assert entry["title"] == "Example Article"
        assert entry["url"] == "https://example.com"
        assert entry["format"] == "footnote"

    def test_extract_multiple_footnote_citations(self):
        """Footnotes on consecutive lines are each picked up."""
        source = "[^1]: First - https://example.com/1\n[^2]: Second - https://example.com/2"
        found = _extract_footnote_citations(source)
        assert len(found) == 2

    def test_extract_footnote_with_complex_number(self):
        """A multi-digit footnote label is accepted."""
        found = _extract_footnote_citations("[^123]: Title - https://example.com")
        assert len(found) == 1
        assert found[0]["title"] == "Title"

    def test_extract_footnote_citation_with_spaces(self):
        """Padding around the separator and after the URL is not part of the title."""
        source = "[^1]: Title with spaces - https://example.com "
        found = _extract_footnote_citations(source)
        assert len(found) == 1
        # The title must come back without surrounding whitespace.
        assert found[0]["title"] == "Title with spaces"

    def test_extract_footnote_citation_empty_text(self):
        """Text containing no footnote definition gives an empty result."""
        found = _extract_footnote_citations("No footnotes here")
        assert len(found) == 0

    def test_extract_footnote_requires_caret(self):
        """A label without the leading ``^`` is not treated as a footnote."""
        source = "[1]: Title - https://example.com"  # no caret in the label
        found = _extract_footnote_citations(source)
        assert len(found) == 0
|
||||
|
||||
|
||||
class TestExtractHtmlLinks:
    """Tests for ``_extract_html_links`` on ``<a href="url">title</a>`` anchors."""

    def test_extract_single_html_link(self):
        """One anchor yields one citation with title, URL and format tag."""
        text = '<a href="https://example.com">Example Article</a>'
        citations = _extract_html_links(text)
        assert len(citations) == 1
        assert citations[0]["title"] == "Example Article"
        assert citations[0]["url"] == "https://example.com"
        assert citations[0]["format"] == "html"

    def test_extract_multiple_html_links(self):
        """Two anchors on one line are both extracted."""
        text = '<a href="https://a.com">Link A</a> <a href="https://b.com">Link B</a>'
        citations = _extract_html_links(text)
        assert len(citations) == 2

    def test_extract_html_link_single_quotes(self):
        """A single-quoted ``href`` value is accepted."""
        text = "<a href='https://example.com'>Title</a>"
        citations = _extract_html_links(text)
        assert len(citations) == 1
        assert citations[0]["url"] == "https://example.com"

    def test_extract_html_link_with_attributes(self):
        """Attributes before and after ``href`` do not prevent extraction."""
        text = '<a class="link" href="https://example.com" target="_blank">Title</a>'
        citations = _extract_html_links(text)
        assert len(citations) == 1
        assert citations[0]["url"] == "https://example.com"

    def test_extract_html_link_ignore_non_http(self):
        """A ``mailto:`` anchor is skipped while the HTTPS anchor is kept."""
        text = '<a href="mailto:test@example.com">Email</a> <a href="https://example.com">Web</a>'
        citations = _extract_html_links(text)
        assert len(citations) == 1
        assert citations[0]["url"] == "https://example.com"

    def test_extract_html_link_case_insensitive(self):
        """Upper-case tag and attribute names are matched as well."""
        text = '<A HREF="https://example.com">Title</A>'
        citations = _extract_html_links(text)
        assert len(citations) == 1

    def test_extract_html_link_empty_text(self):
        """Text without any anchor produces no citations."""
        text = "No links here"
        citations = _extract_html_links(text)
        assert len(citations) == 0

    def test_extract_html_link_strip_whitespace(self):
        """Whitespace surrounding the anchor text is removed from the title."""
        text = '<a href="https://example.com"> Title with spaces </a>'
        citations = _extract_html_links(text)
        # Check the count first so a missed match fails as an assertion
        # rather than as an IndexError on the lookup below.
        assert len(citations) == 1
        assert citations[0]["title"] == "Title with spaces"
|
||||
|
||||
|
||||
class TestParseCitationsFromReport:
    """End-to-end checks for ``parse_citations_from_report`` on whole reports."""

    def test_parse_markdown_links_from_report(self):
        """Markdown links under a "Key Citations" heading are returned."""
        document = """
## Key Citations

[GitHub](https://github.com)
[Python Docs](https://python.org)
"""
        parsed = parse_citations_from_report(document)
        assert parsed["count"] >= 2
        assert any(c["url"] == "https://github.com" for c in parsed["citations"])

    def test_parse_numbered_citations_from_report(self):
        """Numbered entries under a "References" heading are returned."""
        document = """
## References

[1] GitHub - https://github.com
[2] Python - https://python.org
"""
        parsed = parse_citations_from_report(document)
        assert parsed["count"] >= 2

    def test_parse_mixed_format_citations(self):
        """A section mixing all four supported formats is parsed."""
        document = """
## Key Citations

[GitHub](https://github.com)
[^1]: Python - https://python.org
[2] Wikipedia - https://wikipedia.org
<a href="https://stackoverflow.com">Stack Overflow</a>
"""
        parsed = parse_citations_from_report(document)
        # Four entries are present; the test only requires three of them.
        assert parsed["count"] >= 3

    def test_parse_citations_deduplication(self):
        """Several links to the same URL collapse into a single citation."""
        document = """
## Key Citations

[GitHub 1](https://github.com)
[GitHub 2](https://github.com)
[GitHub](https://github.com)
"""
        parsed = parse_citations_from_report(document)
        # All three links share one URL, so exactly one citation is expected.
        assert parsed["count"] == 1
        first = parsed["citations"][0]
        assert first["url"] == "https://github.com"

    def test_parse_citations_various_section_patterns(self):
        """"References", "Sources" and "Bibliography" headings are all recognised."""
        documents = [
            """
## References
[GitHub](https://github.com)
""",
            """
## Sources
[GitHub](https://github.com)
""",
            """
## Bibliography
[GitHub](https://github.com)
""",
        ]

        for document in documents:
            assert parse_citations_from_report(document)["count"] >= 1

    def test_parse_citations_custom_patterns(self):
        """A caller-supplied heading regex selects the citation section."""
        document = """
## My Custom Sources
[GitHub](https://github.com)
"""
        custom_heading = r"##\s*My Custom Sources"
        parsed = parse_citations_from_report(
            document,
            section_patterns=[custom_heading],
        )
        assert parsed["count"] >= 1

    def test_parse_citations_empty_report(self):
        """An empty string yields a zero count and an empty citation list."""
        parsed = parse_citations_from_report("")
        assert parsed["count"] == 0
        assert parsed["citations"] == []

    def test_parse_citations_no_section(self):
        """Prose with no citation heading and no links yields a zero count."""
        parsed = parse_citations_from_report(
            "This is a report with no citations section"
        )
        assert parsed["count"] == 0

    def test_parse_citations_complex_report(self):
        """A multi-section report with mixed citation formats is parsed."""
        document = """
# Research Report

## Introduction

This report summarizes findings from multiple sources.

## Key Findings

Some important discoveries were made based on research [GitHub](https://github.com).

## Key Citations

1. Primary sources:
[GitHub](https://github.com) - A collaborative platform
[^1]: Python - https://python.org

2. Secondary sources:
[2] Wikipedia - https://wikipedia.org

3. Web resources:
<a href="https://stackoverflow.com">Stack Overflow</a>

## Methodology

[Additional](https://example.com) details about methodology.

---

[^1]: The Python programming language official site
"""

        parsed = parse_citations_from_report(document)
        # At least three citations are expected from this report.
        assert parsed["count"] >= 3
        found_urls = [c["url"] for c in parsed["citations"]]
        # At least one URL must point at GitHub or python.org.
        assert any(
            domain in url
            for url in found_urls
            for domain in ("github.com", "python.org")
        )

    def test_parse_citations_stops_at_next_section(self):
        """Citations are found when another heading follows the citation section."""
        document = """
## Key Citations

[Cite 1](https://example.com/1)
[Cite 2](https://example.com/2)

## Next Section

Some other content
"""
        parsed = parse_citations_from_report(document)
        # The trailing section holds no links, so this only confirms that the
        # citations before it are still picked up.
        assert parsed["count"] >= 1
        assert any("example.com/1" in c["url"] for c in parsed["citations"])

    def test_parse_citations_preserves_metadata(self):
        """Each citation carries ``title``, ``url`` and ``format`` keys."""
        document = """
## Key Citations

[Python Documentation](https://python.org)
"""
        parsed = parse_citations_from_report(document)
        assert len(parsed["citations"]) >= 1
        first = parsed["citations"][0]
        for key in ("title", "url", "format"):
            assert key in first

    def test_parse_citations_whitespace_handling(self):
        """Blank lines around the link do not prevent extraction."""
        document = """
## Key Citations

[Link](https://example.com)

"""
        parsed = parse_citations_from_report(document)
        assert parsed["count"] >= 1

    def test_parse_citations_multiline_links(self):
        """A link embedded in the middle of a sentence is still extracted."""
        document = """
## Key Citations

Some paragraph with a [link to example](https://example.com) in the middle.
"""
        parsed = parse_citations_from_report(document)
        assert parsed["count"] >= 1
|
||||
Reference in New Issue
Block a user