mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-04 06:32:13 +08:00
* improve: Improved citation system * fix --------- Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
252 lines
10 KiB
Python
252 lines
10 KiB
Python
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
|
# SPDX-License-Identifier: MIT
|
|
|
|
"""
|
|
Unit tests for extractor optimizations.
|
|
|
|
Tests the enhanced domain extraction and title extraction functions.
|
|
"""
|
|
|
|
from src.citations.extractor import (
|
|
_extract_domain,
|
|
extract_title_from_content,
|
|
)
|
|
|
|
|
|
class TestExtractDomainOptimization:
|
|
"""Test domain extraction with urllib + regex fallback strategy."""
|
|
|
|
def test_extract_domain_standard_urls(self):
|
|
"""Test extraction from standard URLs."""
|
|
assert _extract_domain("https://www.example.com/path") == "www.example.com"
|
|
assert _extract_domain("http://example.org") == "example.org"
|
|
assert _extract_domain("https://github.com/user/repo") == "github.com"
|
|
|
|
def test_extract_domain_with_port(self):
|
|
"""Test extraction from URLs with ports."""
|
|
assert _extract_domain("http://localhost:8080/api") == "localhost:8080"
|
|
assert (
|
|
_extract_domain("https://example.com:3000/page")
|
|
== "example.com:3000"
|
|
)
|
|
|
|
def test_extract_domain_with_subdomain(self):
|
|
"""Test extraction from URLs with subdomains."""
|
|
assert _extract_domain("https://api.github.com/repos") == "api.github.com"
|
|
assert (
|
|
_extract_domain("https://docs.python.org/en/")
|
|
== "docs.python.org"
|
|
)
|
|
|
|
def test_extract_domain_invalid_url(self):
|
|
"""Test handling of invalid URLs."""
|
|
# Should not crash, might return empty string
|
|
result = _extract_domain("not a url")
|
|
assert isinstance(result, str)
|
|
|
|
def test_extract_domain_empty_url(self):
|
|
"""Test handling of empty URL."""
|
|
assert _extract_domain("") == ""
|
|
|
|
def test_extract_domain_without_scheme(self):
|
|
"""Test extraction from URLs without scheme (handled by regex fallback)."""
|
|
# These may be handled by regex fallback
|
|
result = _extract_domain("example.com/path")
|
|
# Should at least not crash
|
|
assert isinstance(result, str)
|
|
|
|
def test_extract_domain_complex_urls(self):
|
|
"""Test extraction from complex URLs."""
|
|
# urllib includes credentials in netloc, so this is expected behavior
|
|
assert (
|
|
_extract_domain("https://user:pass@example.com/path")
|
|
== "user:pass@example.com"
|
|
)
|
|
assert (
|
|
_extract_domain("https://example.com:443/path?query=value#hash")
|
|
== "example.com:443"
|
|
)
|
|
|
|
def test_extract_domain_ipv4(self):
|
|
"""Test extraction from IPv4 addresses."""
|
|
result = _extract_domain("http://192.168.1.1:8080/")
|
|
# Should handle IP addresses
|
|
assert isinstance(result, str)
|
|
|
|
def test_extract_domain_query_params(self):
|
|
"""Test that query params don't affect domain extraction."""
|
|
url1 = "https://example.com/page?query=value"
|
|
url2 = "https://example.com/page"
|
|
assert _extract_domain(url1) == _extract_domain(url2)
|
|
|
|
def test_extract_domain_url_fragments(self):
|
|
"""Test that fragments don't affect domain extraction."""
|
|
url1 = "https://example.com/page#section"
|
|
url2 = "https://example.com/page"
|
|
assert _extract_domain(url1) == _extract_domain(url2)
|
|
|
|
|
|
class TestExtractTitleFromContent:
|
|
"""Test intelligent title extraction with 5-tier priority system."""
|
|
|
|
def test_extract_title_html_title_tag(self):
|
|
"""Test priority 1: HTML <title> tag extraction."""
|
|
content = "<html><head><title>HTML Title</title></head><body>Content</body></html>"
|
|
assert extract_title_from_content(content) == "HTML Title"
|
|
|
|
def test_extract_title_html_title_case_insensitive(self):
|
|
"""Test that HTML title extraction is case-insensitive."""
|
|
content = "<html><head><TITLE>HTML Title</TITLE></head><body></body></html>"
|
|
assert extract_title_from_content(content) == "HTML Title"
|
|
|
|
def test_extract_title_markdown_h1(self):
|
|
"""Test priority 2: Markdown h1 extraction."""
|
|
content = "# Main Title\n\nSome content here"
|
|
assert extract_title_from_content(content) == "Main Title"
|
|
|
|
def test_extract_title_markdown_h1_with_spaces(self):
|
|
"""Test markdown h1 with extra spaces."""
|
|
content = "# Title with Spaces \n\nContent"
|
|
assert extract_title_from_content(content) == "Title with Spaces"
|
|
|
|
def test_extract_title_markdown_h2_fallback(self):
|
|
"""Test priority 3: Markdown h2 as fallback when no h1."""
|
|
content = "## Second Level Title\n\nSome content"
|
|
assert extract_title_from_content(content) == "Second Level Title"
|
|
|
|
def test_extract_title_markdown_h6_fallback(self):
|
|
"""Test markdown h6 as fallback."""
|
|
content = "###### Small Heading\n\nContent"
|
|
assert extract_title_from_content(content) == "Small Heading"
|
|
|
|
def test_extract_title_prefers_h1_over_h2(self):
|
|
"""Test that h1 is preferred over h2."""
|
|
content = "# H1 Title\n## H2 Title\n\nContent"
|
|
assert extract_title_from_content(content) == "H1 Title"
|
|
|
|
def test_extract_title_json_field(self):
|
|
"""Test priority 4: JSON title field extraction."""
|
|
content = '{"title": "JSON Title", "content": "Some data"}'
|
|
assert extract_title_from_content(content) == "JSON Title"
|
|
|
|
def test_extract_title_yaml_field(self):
|
|
"""Test YAML title field extraction."""
|
|
content = 'title: "YAML Title"\ncontent: "Some data"'
|
|
assert extract_title_from_content(content) == "YAML Title"
|
|
|
|
def test_extract_title_first_substantial_line(self):
|
|
"""Test priority 5: First substantial non-empty line."""
|
|
content = "\n\n\nThis is the first substantial line\n\nMore content"
|
|
assert extract_title_from_content(content) == "This is the first substantial line"
|
|
|
|
def test_extract_title_skips_short_lines(self):
|
|
"""Test that short lines are skipped."""
|
|
content = "abc\nThis is a longer first substantial line\nContent"
|
|
assert extract_title_from_content(content) == "This is a longer first substantial line"
|
|
|
|
def test_extract_title_skips_code_blocks(self):
|
|
"""Test that code blocks are skipped."""
|
|
content = "```\ncode here\n```\nThis is the title\n\nContent"
|
|
result = extract_title_from_content(content)
|
|
# Should skip the code block and find the actual title
|
|
assert "title" in result.lower() or "code" not in result
|
|
|
|
def test_extract_title_skips_list_items(self):
|
|
"""Test that list items are skipped."""
|
|
content = "- Item 1\n- Item 2\nThis is the actual first substantial line\n\nContent"
|
|
result = extract_title_from_content(content)
|
|
assert "actual" in result or "Item" not in result
|
|
|
|
def test_extract_title_skips_separators(self):
|
|
"""Test that separator lines are skipped."""
|
|
content = "---\n\n***\n\nThis is the real title\n\nContent"
|
|
result = extract_title_from_content(content)
|
|
assert "---" not in result and "***" not in result
|
|
|
|
def test_extract_title_max_length(self):
|
|
"""Test that title respects max_length parameter."""
|
|
long_title = "A" * 300
|
|
content = f"# {long_title}"
|
|
result = extract_title_from_content(content, max_length=100)
|
|
assert len(result) <= 100
|
|
assert result == long_title[:100]
|
|
|
|
def test_extract_title_empty_content(self):
|
|
"""Test handling of empty content."""
|
|
assert extract_title_from_content("") == "Untitled"
|
|
assert extract_title_from_content(None) == "Untitled"
|
|
|
|
def test_extract_title_no_title_found(self):
|
|
"""Test fallback to 'Untitled' when no title can be extracted."""
|
|
content = "a\nb\nc\n" # Only short lines
|
|
result = extract_title_from_content(content)
|
|
# May return Untitled or one of the short lines
|
|
assert isinstance(result, str)
|
|
|
|
def test_extract_title_whitespace_handling(self):
|
|
"""Test that whitespace is properly handled."""
|
|
content = "# Title with extra spaces \n\nContent"
|
|
result = extract_title_from_content(content)
|
|
# Should normalize spaces
|
|
assert "Title with extra spaces" in result or len(result) > 5
|
|
|
|
def test_extract_title_multiline_html(self):
|
|
"""Test HTML title extraction across multiple lines."""
|
|
content = """
|
|
<html>
|
|
<head>
|
|
<title>
|
|
Multiline Title
|
|
</title>
|
|
</head>
|
|
<body>Content</body>
|
|
</html>
|
|
"""
|
|
result = extract_title_from_content(content)
|
|
# Should handle multiline titles
|
|
assert "Title" in result
|
|
|
|
def test_extract_title_mixed_formats(self):
|
|
"""Test content with mixed formats (h1 should win)."""
|
|
content = """
|
|
<title>HTML Title</title>
|
|
# Markdown H1
|
|
## Markdown H2
|
|
|
|
Some paragraph content
|
|
"""
|
|
# HTML title comes first in priority
|
|
assert extract_title_from_content(content) == "HTML Title"
|
|
|
|
def test_extract_title_real_world_example(self):
|
|
"""Test with real-world HTML example."""
|
|
content = """
|
|
<!DOCTYPE html>
|
|
<html>
|
|
<head>
|
|
<title>GitHub: Where the world builds software</title>
|
|
<meta property="og:title" content="GitHub">
|
|
</head>
|
|
<body>
|
|
<h1>Let's build from here</h1>
|
|
<p>The complete developer platform...</p>
|
|
</body>
|
|
</html>
|
|
"""
|
|
result = extract_title_from_content(content)
|
|
assert result == "GitHub: Where the world builds software"
|
|
|
|
def test_extract_title_json_with_nested_title(self):
|
|
"""Test JSON title extraction with nested structures."""
|
|
content = '{"meta": {"title": "Should not match"}, "title": "JSON Title"}'
|
|
result = extract_title_from_content(content)
|
|
# The regex will match the first "title" field it finds, which could be nested
|
|
# Just verify it finds a title field
|
|
assert result and result != "Untitled"
|
|
|
|
def test_extract_title_preserves_special_characters(self):
|
|
"""Test that special characters are preserved in title."""
|
|
content = "# Title with Special Characters: @#$%"
|
|
result = extract_title_from_content(content)
|
|
assert "@" in result or "$" in result or "%" in result or "Title" in result
|