# deer-flow/tests/unit/citations/test_models.py

# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

"""
Unit tests for citation models.

Tests the Pydantic BaseModel implementation of CitationMetadata and Citation classes.
"""

import json

import pytest
from pydantic import ValidationError

from src.citations.models import Citation, CitationMetadata


class TestCitationMetadata:
    """Test CitationMetadata Pydantic model.

    Covers construction, the derived ``domain`` and ``id`` attributes, the
    ``from_dict`` / ``to_dict`` / ``from_search_result`` helpers, required-field
    validation, and the standard Pydantic dump methods.
    """

    def test_create_basic_metadata(self):
        """Test creating basic citation metadata with only required fields."""
        metadata = CitationMetadata(
            url="https://example.com/article",
            title="Example Article",
        )
        assert metadata.url == "https://example.com/article"
        assert metadata.title == "Example Article"
        assert metadata.domain == "example.com"  # Auto-extracted from URL
        # Optional fields fall back to empty defaults
        assert metadata.description is None
        assert metadata.images == []
        assert metadata.extra == {}

    def test_metadata_with_all_fields(self):
        """Test creating metadata with all fields populated."""
        metadata = CitationMetadata(
            url="https://github.com/example/repo",
            title="Example Repository",
            description="A great repository",
            content_snippet="This is a snippet",
            raw_content="Full content here",
            author="John Doe",
            published_date="2025-01-24",
            language="en",
            relevance_score=0.95,
            credibility_score=0.88,
        )
        assert metadata.url == "https://github.com/example/repo"
        assert metadata.domain == "github.com"
        assert metadata.author == "John Doe"
        assert metadata.relevance_score == 0.95
        assert metadata.credibility_score == 0.88

    def test_metadata_domain_auto_extraction(self):
        """Test automatic domain extraction from URL.

        The expected values show that the subdomain and an explicit port are
        kept as part of the domain.
        """
        test_cases = [
            ("https://www.example.com/path", "www.example.com"),
            ("http://github.com/user/repo", "github.com"),
            ("https://api.github.com:443/repos", "api.github.com:443"),
        ]

        for url, expected_domain in test_cases:
            metadata = CitationMetadata(url=url, title="Test")
            # Name the offending URL so a failure inside the loop can be
            # attributed to a specific case.
            assert metadata.domain == expected_domain, f"unexpected domain for {url!r}"

    def test_metadata_id_generation(self):
        """Test that the ID is derived deterministically from the URL."""
        metadata1 = CitationMetadata(
            url="https://example.com/article",
            title="Article",
        )
        metadata2 = CitationMetadata(
            url="https://example.com/article",
            title="Article",
        )
        # Same URL should produce same ID
        assert metadata1.id == metadata2.id

        metadata3 = CitationMetadata(
            url="https://different.com/article",
            title="Article",
        )
        # Different URL should produce different ID
        assert metadata1.id != metadata3.id

    def test_metadata_id_length(self):
        """Test that the ID is exactly 12 alphanumeric characters."""
        metadata = CitationMetadata(
            url="https://example.com",
            title="Test",
        )
        assert len(metadata.id) == 12
        # The length check above guarantees a non-empty string, so isalnum()
        # alone is equivalent to the former "alnum or lowercase hex" check:
        # every hex digit is itself alphanumeric.
        assert metadata.id.isalnum()

    def test_metadata_from_dict(self):
        """Test creating metadata from dictionary."""
        data = {
            "url": "https://example.com",
            "title": "Example",
            "description": "A description",
            "author": "John Doe",
        }
        metadata = CitationMetadata.from_dict(data)
        assert metadata.url == "https://example.com"
        assert metadata.title == "Example"
        assert metadata.description == "A description"
        assert metadata.author == "John Doe"

    def test_metadata_from_dict_removes_id(self):
        """Test that from_dict ignores a stale 'id' and recomputes it."""
        data = {
            "url": "https://example.com",
            "title": "Example",
            "id": "some_old_id",  # Should be ignored
        }
        metadata = CitationMetadata.from_dict(data)
        # Should use newly computed ID, not the old one
        assert metadata.id != "some_old_id"
        # ...and that ID must be the one any instance with this URL gets
        expected = CitationMetadata(url="https://example.com", title="Example")
        assert metadata.id == expected.id

    def test_metadata_to_dict(self):
        """Test converting metadata to dictionary, including derived fields."""
        metadata = CitationMetadata(
            url="https://example.com",
            title="Example",
            author="John Doe",
        )
        result = metadata.to_dict()

        assert result["url"] == "https://example.com"
        assert result["title"] == "Example"
        assert result["author"] == "John Doe"
        # Derived values are exported alongside the stored fields
        assert result["id"] == metadata.id
        assert result["domain"] == "example.com"

    def test_metadata_from_search_result(self):
        """Test creating metadata from search result.

        Checks the mapping of search-result keys onto metadata fields:
        ``content`` -> ``description``, ``score`` -> ``relevance_score``,
        ``type`` -> ``extra["result_type"]``, and the ``query`` argument ->
        ``extra["query"]``.
        """
        search_result = {
            "url": "https://example.com/article",
            "title": "Article Title",
            "content": "Article content here",
            "score": 0.92,
            "type": "page",
        }
        metadata = CitationMetadata.from_search_result(
            search_result,
            query="test query",
        )

        assert metadata.url == "https://example.com/article"
        assert metadata.title == "Article Title"
        assert metadata.description == "Article content here"
        assert metadata.relevance_score == 0.92
        assert metadata.extra["query"] == "test query"
        assert metadata.extra["result_type"] == "page"

    def test_metadata_pydantic_validation(self):
        """Test that Pydantic validates required fields."""
        # URL and title are required
        with pytest.raises(ValidationError):
            CitationMetadata()  # Missing required fields

        with pytest.raises(ValidationError):
            CitationMetadata(url="https://example.com")  # Missing title

    def test_metadata_model_dump(self):
        """Test Pydantic model_dump method."""
        metadata = CitationMetadata(
            url="https://example.com",
            title="Example",
            author="John Doe",
        )
        result = metadata.model_dump()

        assert isinstance(result, dict)
        assert result["url"] == "https://example.com"
        assert result["title"] == "Example"

    def test_metadata_model_dump_json(self):
        """Test Pydantic model_dump_json method."""
        metadata = CitationMetadata(
            url="https://example.com",
            title="Example",
        )
        result = metadata.model_dump_json()

        assert isinstance(result, str)
        # The output must be parseable JSON carrying the stored fields
        data = json.loads(result)
        assert data["url"] == "https://example.com"
        assert data["title"] == "Example"

    def test_metadata_with_images_and_extra(self):
        """Test metadata with list and dict fields."""
        metadata = CitationMetadata(
            url="https://example.com",
            title="Example",
            images=["https://example.com/image1.jpg", "https://example.com/image2.jpg"],
            favicon="https://example.com/favicon.ico",
            extra={"custom_field": "value", "tags": ["tag1", "tag2"]},
        )

        assert len(metadata.images) == 2
        assert metadata.favicon == "https://example.com/favicon.ico"
        assert metadata.extra["custom_field"] == "value"


class TestCitation:
    """Test Citation Pydantic model.

    Covers construction, the shortcut properties that delegate to the wrapped
    metadata, the reference/footnote formatters, dict and JSON conversion, and
    required-field validation.
    """

    def test_create_basic_citation(self):
        """Test creating a basic citation."""
        metadata = CitationMetadata(
            url="https://example.com",
            title="Example",
        )
        citation = Citation(number=1, metadata=metadata)

        assert citation.number == 1
        assert citation.metadata == metadata
        # Optional fields default to None
        assert citation.context is None
        assert citation.cited_text is None

    def test_citation_properties(self):
        """Test citation property shortcuts mirroring the metadata."""
        metadata = CitationMetadata(
            url="https://example.com",
            title="Example Title",
        )
        citation = Citation(number=1, metadata=metadata)

        assert citation.id == metadata.id
        assert citation.url == "https://example.com"
        assert citation.title == "Example Title"

    def test_citation_to_markdown_reference(self):
        """Test markdown reference generation (``[title](url)``)."""
        metadata = CitationMetadata(
            url="https://example.com",
            title="Example",
        )
        citation = Citation(number=1, metadata=metadata)

        result = citation.to_markdown_reference()
        assert result == "[Example](https://example.com)"

    def test_citation_to_numbered_reference(self):
        """Test numbered reference generation (``[n] title - url``)."""
        metadata = CitationMetadata(
            url="https://example.com",
            title="Example Article",
        )
        citation = Citation(number=5, metadata=metadata)

        result = citation.to_numbered_reference()
        assert result == "[5] Example Article - https://example.com"

    def test_citation_to_inline_marker(self):
        """Test inline marker generation (``[^n]``)."""
        metadata = CitationMetadata(
            url="https://example.com",
            title="Example",
        )
        citation = Citation(number=3, metadata=metadata)

        result = citation.to_inline_marker()
        assert result == "[^3]"

    def test_citation_to_footnote(self):
        """Test footnote generation (``[^n]: title - url``)."""
        metadata = CitationMetadata(
            url="https://example.com",
            title="Example Article",
        )
        citation = Citation(number=2, metadata=metadata)

        result = citation.to_footnote()
        assert result == "[^2]: Example Article - https://example.com"

    def test_citation_with_context_and_text(self):
        """Test citation with context and cited text."""
        metadata = CitationMetadata(
            url="https://example.com",
            title="Example",
        )
        citation = Citation(
            number=1,
            metadata=metadata,
            context="This is important context",
            cited_text="Important quote from the source",
        )

        assert citation.context == "This is important context"
        assert citation.cited_text == "Important quote from the source"

    def test_citation_from_dict(self):
        """Test creating citation from dictionary with nested metadata."""
        data = {
            "number": 1,
            "metadata": {
                "url": "https://example.com",
                "title": "Example",
                "author": "John Doe",
            },
            "context": "Test context",
        }
        citation = Citation.from_dict(data)

        assert citation.number == 1
        assert citation.metadata.url == "https://example.com"
        assert citation.metadata.title == "Example"
        assert citation.metadata.author == "John Doe"
        assert citation.context == "Test context"

    def test_citation_to_dict(self):
        """Test converting citation to dictionary."""
        metadata = CitationMetadata(
            url="https://example.com",
            title="Example",
            author="John Doe",
        )
        citation = Citation(
            number=1,
            metadata=metadata,
            context="Test context",
        )
        result = citation.to_dict()

        assert result["number"] == 1
        # Metadata is exported as a nested dictionary
        assert result["metadata"]["url"] == "https://example.com"
        assert result["metadata"]["author"] == "John Doe"
        assert result["context"] == "Test context"

    def test_citation_round_trip(self):
        """Test converting to dict and back without losing data."""
        metadata = CitationMetadata(
            url="https://example.com",
            title="Example",
            author="John Doe",
            relevance_score=0.95,
        )
        original = Citation(number=1, metadata=metadata, context="Test")

        # Convert to dict and back
        dict_repr = original.to_dict()
        restored = Citation.from_dict(dict_repr)

        assert restored.number == original.number
        assert restored.metadata.url == original.metadata.url
        assert restored.metadata.title == original.metadata.title
        assert restored.metadata.author == original.metadata.author
        assert restored.metadata.relevance_score == original.metadata.relevance_score
        # The citation-level field set on the original must survive as well
        assert restored.context == original.context
        # The ID is derived from the URL, so it must be stable across the trip
        assert restored.id == original.id

    def test_citation_model_dump(self):
        """Test Pydantic model_dump method."""
        metadata = CitationMetadata(
            url="https://example.com",
            title="Example",
        )
        citation = Citation(number=1, metadata=metadata)
        result = citation.model_dump()

        assert isinstance(result, dict)
        assert result["number"] == 1
        assert result["metadata"]["url"] == "https://example.com"

    def test_citation_model_dump_json(self):
        """Test Pydantic model_dump_json method."""
        metadata = CitationMetadata(
            url="https://example.com",
            title="Example",
        )
        citation = Citation(number=1, metadata=metadata)
        result = citation.model_dump_json()

        assert isinstance(result, str)
        # The output must be parseable JSON with the metadata nested inside
        data = json.loads(result)
        assert data["number"] == 1
        assert data["metadata"]["url"] == "https://example.com"

    def test_citation_pydantic_validation(self):
        """Test that Pydantic validates required fields."""
        # Number and metadata are required
        with pytest.raises(ValidationError):
            Citation()  # Missing required fields

        metadata = CitationMetadata(
            url="https://example.com",
            title="Example",
        )
        with pytest.raises(ValidationError):
            Citation(metadata=metadata)  # Missing number


class TestCitationIntegration:
    """Integration tests for citation models.

    Exercises CitationMetadata and Citation together: building a citation
    from a search result, formatting several citations, and a full JSON
    serialization round trip.
    """

    def test_search_result_to_citation_workflow(self):
        """Test complete workflow from search result to citation."""
        search_result = {
            "url": "https://example.com/article",
            "title": "Great Article",
            "content": "This is a great article about testing",
            "score": 0.92,
        }

        # Create metadata from search result
        metadata = CitationMetadata.from_search_result(search_result, query="testing")

        # Create citation
        citation = Citation(number=1, metadata=metadata, context="Important source")

        # Verify the workflow
        assert citation.number == 1
        assert citation.url == "https://example.com/article"
        assert citation.title == "Great Article"
        assert citation.metadata.relevance_score == 0.92
        # Inputs supplied along the way must be retained on the final citation
        assert citation.metadata.extra["query"] == "testing"
        assert citation.context == "Important source"
        assert citation.to_markdown_reference() == "[Great Article](https://example.com/article)"

    def test_multiple_citations_with_different_formats(self):
        """Test handling multiple citations in different formats."""
        citations = []

        # Create first citation
        metadata1 = CitationMetadata(
            url="https://example.com/1",
            title="First Article",
        )
        citations.append(Citation(number=1, metadata=metadata1))

        # Create second citation
        metadata2 = CitationMetadata(
            url="https://example.com/2",
            title="Second Article",
        )
        citations.append(Citation(number=2, metadata=metadata2))

        # Verify all reference formats
        assert citations[0].to_markdown_reference() == "[First Article](https://example.com/1)"
        assert citations[1].to_numbered_reference() == "[2] Second Article - https://example.com/2"

    def test_citation_json_serialization_roundtrip(self):
        """Test JSON serialization and deserialization roundtrip.

        Every value supplied in the input dictionary is checked on the
        restored object, so a field dropped at any stage is detected.
        """
        original_data = {
            "number": 1,
            "metadata": {
                "url": "https://example.com",
                "title": "Example",
                "author": "John Doe",
                "relevance_score": 0.95,
            },
            "context": "Test context",
            "cited_text": "Important quote",
        }

        # Create from dict
        citation = Citation.from_dict(original_data)

        # Serialize to JSON
        json_str = citation.model_dump_json()

        # Deserialize from JSON
        restored = Citation.model_validate_json(json_str)

        # Verify data integrity
        assert restored.number == original_data["number"]
        assert restored.metadata.url == original_data["metadata"]["url"]
        assert restored.metadata.title == original_data["metadata"]["title"]
        assert restored.metadata.author == original_data["metadata"]["author"]
        assert restored.metadata.relevance_score == original_data["metadata"]["relevance_score"]
        assert restored.context == original_data["context"]
        assert restored.cited_text == original_data["cited_text"]