mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-05 15:10:20 +08:00
* improve: Improved citation system * fix --------- Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
468 lines
16 KiB
Python
468 lines
16 KiB
Python
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
|
# SPDX-License-Identifier: MIT
|
|
|
|
"""
|
|
Unit tests for citation models.
|
|
|
|
Tests the Pydantic BaseModel implementation of CitationMetadata and Citation classes.
|
|
"""
|
|
|
|
import json
|
|
|
|
import pytest
|
|
from pydantic import ValidationError
|
|
|
|
from src.citations.models import Citation, CitationMetadata
|
|
|
|
|
|
class TestCitationMetadata:
|
|
"""Test CitationMetadata Pydantic model."""
|
|
|
|
def test_create_basic_metadata(self):
|
|
"""Test creating basic citation metadata."""
|
|
metadata = CitationMetadata(
|
|
url="https://example.com/article",
|
|
title="Example Article",
|
|
)
|
|
assert metadata.url == "https://example.com/article"
|
|
assert metadata.title == "Example Article"
|
|
assert metadata.domain == "example.com" # Auto-extracted from URL
|
|
assert metadata.description is None
|
|
assert metadata.images == []
|
|
assert metadata.extra == {}
|
|
|
|
def test_metadata_with_all_fields(self):
|
|
"""Test creating metadata with all fields populated."""
|
|
metadata = CitationMetadata(
|
|
url="https://github.com/example/repo",
|
|
title="Example Repository",
|
|
description="A great repository",
|
|
content_snippet="This is a snippet",
|
|
raw_content="Full content here",
|
|
author="John Doe",
|
|
published_date="2025-01-24",
|
|
language="en",
|
|
relevance_score=0.95,
|
|
credibility_score=0.88,
|
|
)
|
|
assert metadata.url == "https://github.com/example/repo"
|
|
assert metadata.domain == "github.com"
|
|
assert metadata.author == "John Doe"
|
|
assert metadata.relevance_score == 0.95
|
|
assert metadata.credibility_score == 0.88
|
|
|
|
def test_metadata_domain_auto_extraction(self):
|
|
"""Test automatic domain extraction from URL."""
|
|
test_cases = [
|
|
("https://www.example.com/path", "www.example.com"),
|
|
("http://github.com/user/repo", "github.com"),
|
|
("https://api.github.com:443/repos", "api.github.com:443"),
|
|
]
|
|
|
|
for url, expected_domain in test_cases:
|
|
metadata = CitationMetadata(url=url, title="Test")
|
|
assert metadata.domain == expected_domain
|
|
|
|
def test_metadata_id_generation(self):
|
|
"""Test unique ID generation from URL."""
|
|
metadata1 = CitationMetadata(
|
|
url="https://example.com/article",
|
|
title="Article",
|
|
)
|
|
metadata2 = CitationMetadata(
|
|
url="https://example.com/article",
|
|
title="Article",
|
|
)
|
|
# Same URL should produce same ID
|
|
assert metadata1.id == metadata2.id
|
|
|
|
metadata3 = CitationMetadata(
|
|
url="https://different.com/article",
|
|
title="Article",
|
|
)
|
|
# Different URL should produce different ID
|
|
assert metadata1.id != metadata3.id
|
|
|
|
def test_metadata_id_length(self):
|
|
"""Test that ID is truncated to 12 characters."""
|
|
metadata = CitationMetadata(
|
|
url="https://example.com",
|
|
title="Test",
|
|
)
|
|
assert len(metadata.id) == 12
|
|
assert metadata.id.isalnum() or all(c in "0123456789abcdef" for c in metadata.id)
|
|
|
|
def test_metadata_from_dict(self):
|
|
"""Test creating metadata from dictionary."""
|
|
data = {
|
|
"url": "https://example.com",
|
|
"title": "Example",
|
|
"description": "A description",
|
|
"author": "John Doe",
|
|
}
|
|
metadata = CitationMetadata.from_dict(data)
|
|
assert metadata.url == "https://example.com"
|
|
assert metadata.title == "Example"
|
|
assert metadata.description == "A description"
|
|
assert metadata.author == "John Doe"
|
|
|
|
def test_metadata_from_dict_removes_id(self):
|
|
"""Test that from_dict removes computed 'id' field."""
|
|
data = {
|
|
"url": "https://example.com",
|
|
"title": "Example",
|
|
"id": "some_old_id", # Should be ignored
|
|
}
|
|
metadata = CitationMetadata.from_dict(data)
|
|
# Should use newly computed ID, not the old one
|
|
assert metadata.id != "some_old_id"
|
|
|
|
def test_metadata_to_dict(self):
|
|
"""Test converting metadata to dictionary."""
|
|
metadata = CitationMetadata(
|
|
url="https://example.com",
|
|
title="Example",
|
|
author="John Doe",
|
|
)
|
|
result = metadata.to_dict()
|
|
|
|
assert result["url"] == "https://example.com"
|
|
assert result["title"] == "Example"
|
|
assert result["author"] == "John Doe"
|
|
assert result["id"] == metadata.id
|
|
assert result["domain"] == "example.com"
|
|
|
|
def test_metadata_from_search_result(self):
|
|
"""Test creating metadata from search result."""
|
|
search_result = {
|
|
"url": "https://example.com/article",
|
|
"title": "Article Title",
|
|
"content": "Article content here",
|
|
"score": 0.92,
|
|
"type": "page",
|
|
}
|
|
metadata = CitationMetadata.from_search_result(
|
|
search_result,
|
|
query="test query",
|
|
)
|
|
|
|
assert metadata.url == "https://example.com/article"
|
|
assert metadata.title == "Article Title"
|
|
assert metadata.description == "Article content here"
|
|
assert metadata.relevance_score == 0.92
|
|
assert metadata.extra["query"] == "test query"
|
|
assert metadata.extra["result_type"] == "page"
|
|
|
|
def test_metadata_pydantic_validation(self):
|
|
"""Test that Pydantic validates required fields."""
|
|
# URL and title are required
|
|
with pytest.raises(ValidationError):
|
|
CitationMetadata() # Missing required fields
|
|
|
|
with pytest.raises(ValidationError):
|
|
CitationMetadata(url="https://example.com") # Missing title
|
|
|
|
def test_metadata_model_dump(self):
|
|
"""Test Pydantic model_dump method."""
|
|
metadata = CitationMetadata(
|
|
url="https://example.com",
|
|
title="Example",
|
|
author="John Doe",
|
|
)
|
|
result = metadata.model_dump()
|
|
|
|
assert isinstance(result, dict)
|
|
assert result["url"] == "https://example.com"
|
|
assert result["title"] == "Example"
|
|
|
|
def test_metadata_model_dump_json(self):
|
|
"""Test Pydantic model_dump_json method."""
|
|
metadata = CitationMetadata(
|
|
url="https://example.com",
|
|
title="Example",
|
|
)
|
|
result = metadata.model_dump_json()
|
|
|
|
assert isinstance(result, str)
|
|
data = json.loads(result)
|
|
assert data["url"] == "https://example.com"
|
|
assert data["title"] == "Example"
|
|
|
|
def test_metadata_with_images_and_extra(self):
|
|
"""Test metadata with list and dict fields."""
|
|
metadata = CitationMetadata(
|
|
url="https://example.com",
|
|
title="Example",
|
|
images=["https://example.com/image1.jpg", "https://example.com/image2.jpg"],
|
|
favicon="https://example.com/favicon.ico",
|
|
extra={"custom_field": "value", "tags": ["tag1", "tag2"]},
|
|
)
|
|
|
|
assert len(metadata.images) == 2
|
|
assert metadata.favicon == "https://example.com/favicon.ico"
|
|
assert metadata.extra["custom_field"] == "value"
|
|
|
|
|
|
class TestCitation:
|
|
"""Test Citation Pydantic model."""
|
|
|
|
def test_create_basic_citation(self):
|
|
"""Test creating a basic citation."""
|
|
metadata = CitationMetadata(
|
|
url="https://example.com",
|
|
title="Example",
|
|
)
|
|
citation = Citation(number=1, metadata=metadata)
|
|
|
|
assert citation.number == 1
|
|
assert citation.metadata == metadata
|
|
assert citation.context is None
|
|
assert citation.cited_text is None
|
|
|
|
def test_citation_properties(self):
|
|
"""Test citation property shortcuts."""
|
|
metadata = CitationMetadata(
|
|
url="https://example.com",
|
|
title="Example Title",
|
|
)
|
|
citation = Citation(number=1, metadata=metadata)
|
|
|
|
assert citation.id == metadata.id
|
|
assert citation.url == "https://example.com"
|
|
assert citation.title == "Example Title"
|
|
|
|
def test_citation_to_markdown_reference(self):
|
|
"""Test markdown reference generation."""
|
|
metadata = CitationMetadata(
|
|
url="https://example.com",
|
|
title="Example",
|
|
)
|
|
citation = Citation(number=1, metadata=metadata)
|
|
|
|
result = citation.to_markdown_reference()
|
|
assert result == "[Example](https://example.com)"
|
|
|
|
def test_citation_to_numbered_reference(self):
|
|
"""Test numbered reference generation."""
|
|
metadata = CitationMetadata(
|
|
url="https://example.com",
|
|
title="Example Article",
|
|
)
|
|
citation = Citation(number=5, metadata=metadata)
|
|
|
|
result = citation.to_numbered_reference()
|
|
assert result == "[5] Example Article - https://example.com"
|
|
|
|
def test_citation_to_inline_marker(self):
|
|
"""Test inline marker generation."""
|
|
metadata = CitationMetadata(
|
|
url="https://example.com",
|
|
title="Example",
|
|
)
|
|
citation = Citation(number=3, metadata=metadata)
|
|
|
|
result = citation.to_inline_marker()
|
|
assert result == "[^3]"
|
|
|
|
def test_citation_to_footnote(self):
|
|
"""Test footnote generation."""
|
|
metadata = CitationMetadata(
|
|
url="https://example.com",
|
|
title="Example Article",
|
|
)
|
|
citation = Citation(number=2, metadata=metadata)
|
|
|
|
result = citation.to_footnote()
|
|
assert result == "[^2]: Example Article - https://example.com"
|
|
|
|
def test_citation_with_context_and_text(self):
|
|
"""Test citation with context and cited text."""
|
|
metadata = CitationMetadata(
|
|
url="https://example.com",
|
|
title="Example",
|
|
)
|
|
citation = Citation(
|
|
number=1,
|
|
metadata=metadata,
|
|
context="This is important context",
|
|
cited_text="Important quote from the source",
|
|
)
|
|
|
|
assert citation.context == "This is important context"
|
|
assert citation.cited_text == "Important quote from the source"
|
|
|
|
def test_citation_from_dict(self):
|
|
"""Test creating citation from dictionary."""
|
|
data = {
|
|
"number": 1,
|
|
"metadata": {
|
|
"url": "https://example.com",
|
|
"title": "Example",
|
|
"author": "John Doe",
|
|
},
|
|
"context": "Test context",
|
|
}
|
|
citation = Citation.from_dict(data)
|
|
|
|
assert citation.number == 1
|
|
assert citation.metadata.url == "https://example.com"
|
|
assert citation.metadata.title == "Example"
|
|
assert citation.metadata.author == "John Doe"
|
|
assert citation.context == "Test context"
|
|
|
|
def test_citation_to_dict(self):
|
|
"""Test converting citation to dictionary."""
|
|
metadata = CitationMetadata(
|
|
url="https://example.com",
|
|
title="Example",
|
|
author="John Doe",
|
|
)
|
|
citation = Citation(
|
|
number=1,
|
|
metadata=metadata,
|
|
context="Test context",
|
|
)
|
|
result = citation.to_dict()
|
|
|
|
assert result["number"] == 1
|
|
assert result["metadata"]["url"] == "https://example.com"
|
|
assert result["metadata"]["author"] == "John Doe"
|
|
assert result["context"] == "Test context"
|
|
|
|
def test_citation_round_trip(self):
|
|
"""Test converting to dict and back."""
|
|
metadata = CitationMetadata(
|
|
url="https://example.com",
|
|
title="Example",
|
|
author="John Doe",
|
|
relevance_score=0.95,
|
|
)
|
|
original = Citation(number=1, metadata=metadata, context="Test")
|
|
|
|
# Convert to dict and back
|
|
dict_repr = original.to_dict()
|
|
restored = Citation.from_dict(dict_repr)
|
|
|
|
assert restored.number == original.number
|
|
assert restored.metadata.url == original.metadata.url
|
|
assert restored.metadata.title == original.metadata.title
|
|
assert restored.metadata.author == original.metadata.author
|
|
assert restored.metadata.relevance_score == original.metadata.relevance_score
|
|
|
|
def test_citation_model_dump(self):
|
|
"""Test Pydantic model_dump method."""
|
|
metadata = CitationMetadata(
|
|
url="https://example.com",
|
|
title="Example",
|
|
)
|
|
citation = Citation(number=1, metadata=metadata)
|
|
result = citation.model_dump()
|
|
|
|
assert isinstance(result, dict)
|
|
assert result["number"] == 1
|
|
assert result["metadata"]["url"] == "https://example.com"
|
|
|
|
def test_citation_model_dump_json(self):
|
|
"""Test Pydantic model_dump_json method."""
|
|
metadata = CitationMetadata(
|
|
url="https://example.com",
|
|
title="Example",
|
|
)
|
|
citation = Citation(number=1, metadata=metadata)
|
|
result = citation.model_dump_json()
|
|
|
|
assert isinstance(result, str)
|
|
data = json.loads(result)
|
|
assert data["number"] == 1
|
|
assert data["metadata"]["url"] == "https://example.com"
|
|
|
|
def test_citation_pydantic_validation(self):
|
|
"""Test that Pydantic validates required fields."""
|
|
# Number and metadata are required
|
|
with pytest.raises(ValidationError):
|
|
Citation() # Missing required fields
|
|
|
|
metadata = CitationMetadata(
|
|
url="https://example.com",
|
|
title="Example",
|
|
)
|
|
with pytest.raises(ValidationError):
|
|
Citation(metadata=metadata) # Missing number
|
|
|
|
|
|
class TestCitationIntegration:
|
|
"""Integration tests for citation models."""
|
|
|
|
def test_search_result_to_citation_workflow(self):
|
|
"""Test complete workflow from search result to citation."""
|
|
search_result = {
|
|
"url": "https://example.com/article",
|
|
"title": "Great Article",
|
|
"content": "This is a great article about testing",
|
|
"score": 0.92,
|
|
}
|
|
|
|
# Create metadata from search result
|
|
metadata = CitationMetadata.from_search_result(search_result, query="testing")
|
|
|
|
# Create citation
|
|
citation = Citation(number=1, metadata=metadata, context="Important source")
|
|
|
|
# Verify the workflow
|
|
assert citation.number == 1
|
|
assert citation.url == "https://example.com/article"
|
|
assert citation.title == "Great Article"
|
|
assert citation.metadata.relevance_score == 0.92
|
|
assert citation.to_markdown_reference() == "[Great Article](https://example.com/article)"
|
|
|
|
def test_multiple_citations_with_different_formats(self):
|
|
"""Test handling multiple citations in different formats."""
|
|
citations = []
|
|
|
|
# Create first citation
|
|
metadata1 = CitationMetadata(
|
|
url="https://example.com/1",
|
|
title="First Article",
|
|
)
|
|
citations.append(Citation(number=1, metadata=metadata1))
|
|
|
|
# Create second citation
|
|
metadata2 = CitationMetadata(
|
|
url="https://example.com/2",
|
|
title="Second Article",
|
|
)
|
|
citations.append(Citation(number=2, metadata=metadata2))
|
|
|
|
# Verify all reference formats
|
|
assert citations[0].to_markdown_reference() == "[First Article](https://example.com/1)"
|
|
assert citations[1].to_numbered_reference() == "[2] Second Article - https://example.com/2"
|
|
|
|
def test_citation_json_serialization_roundtrip(self):
|
|
"""Test JSON serialization and deserialization roundtrip."""
|
|
original_data = {
|
|
"number": 1,
|
|
"metadata": {
|
|
"url": "https://example.com",
|
|
"title": "Example",
|
|
"author": "John Doe",
|
|
"relevance_score": 0.95,
|
|
},
|
|
"context": "Test context",
|
|
"cited_text": "Important quote",
|
|
}
|
|
|
|
# Create from dict
|
|
citation = Citation.from_dict(original_data)
|
|
|
|
# Serialize to JSON
|
|
json_str = citation.model_dump_json()
|
|
|
|
# Deserialize from JSON
|
|
restored = Citation.model_validate_json(json_str)
|
|
|
|
# Verify data integrity
|
|
assert restored.number == original_data["number"]
|
|
assert restored.metadata.url == original_data["metadata"]["url"]
|
|
assert restored.metadata.relevance_score == original_data["metadata"]["relevance_score"]
|
|
assert restored.context == original_data["context"]
|