# deer-flow/tests/unit/eval/test_metrics.py

# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

"""Unit tests for report evaluation metrics."""

from src.eval.metrics import (
    compute_metrics,
    count_citations,
    count_images,
    count_words,
    detect_sections,
    extract_domains,
    get_word_count_target,
)


class TestCountWords:
    """Checks on the counts returned by ``count_words``."""

    def test_english_words(self):
        """A six-word English sentence is expected to count as six."""
        sentence = "This is a simple test sentence."
        expected = 6
        assert count_words(sentence) == expected

    def test_chinese_characters(self):
        """A string of six Chinese characters is expected to count as six."""
        expected = 6
        assert count_words("这是一个测试") == expected

    def test_mixed_content(self):
        """English words and Chinese characters are expected to be summed."""
        english_words = 2  # "Hello" and "World"
        chinese_chars = 4  # 你, 好, 世, 界
        sample = "Hello 你好 World 世界"
        assert count_words(sample) == english_words + chinese_chars

    def test_empty_string(self):
        """The empty string is expected to produce a count of zero."""
        nothing = ""
        assert count_words(nothing) == 0


class TestCountCitations:
    """Checks on the number of citations reported by ``count_citations``."""

    def test_markdown_citations(self):
        """Two markdown links with https targets are expected to count as two."""
        body = """
        Check out [Google](https://google.com) and [GitHub](https://github.com).
        """
        found = count_citations(body)
        assert found == 2

    def test_no_citations(self):
        """Text containing no links is expected to yield zero citations."""
        found = count_citations("This is plain text without any links.")
        assert found == 0

    def test_invalid_urls(self):
        """A bare-word target and an ftp:// target are expected not to count."""
        body = "[Link](not-a-url) [Another](ftp://ftp.example.com)"
        found = count_citations(body)
        assert found == 0

    def test_complex_urls(self):
        """A URL with a path and query string is expected to count once."""
        body = "[Article](https://example.com/path/to/article?id=123&ref=test)"
        found = count_citations(body)
        assert found == 1


class TestExtractDomains:
    """Checks on the domain collection returned by ``extract_domains``."""

    def test_extract_multiple_domains(self):
        """Three URLs on different hosts are expected to give three domains.

        The expected entries carry no ``www.`` prefix, while the ``docs.``
        subdomain is expected to be kept.
        """
        page = """
        https://google.com/search
        https://www.github.com/user/repo
        https://docs.python.org/3/
        """
        found = extract_domains(page)
        assert len(found) == 3
        for expected in ("google.com", "github.com", "docs.python.org"):
            assert expected in found

    def test_deduplicate_domains(self):
        """URLs on the same host (with or without ``www.``) collapse to one."""
        page = """
        https://example.com/page1
        https://example.com/page2
        https://www.example.com/page3
        """
        found = extract_domains(page)
        assert len(found) == 1
        assert "example.com" in found

    def test_no_urls(self):
        """Text without URLs is expected to produce an empty list."""
        found = extract_domains("Plain text without URLs")
        assert found == []


class TestCountImages:
    """Checks on the number of images reported by ``count_images``."""

    def test_markdown_images(self):
        """Two markdown images, one with empty alt text, are expected to count as two."""
        body = """
        ![Alt text](https://example.com/image1.png)
        ![](https://example.com/image2.jpg)
        """
        found = count_images(body)
        assert found == 2

    def test_no_images(self):
        """A plain markdown link is expected not to be counted as an image."""
        found = count_images("Text without images [link](url)")
        assert found == 0


class TestDetectSections:
    """Checks on the section flags returned by ``detect_sections``."""

    def test_detect_title(self):
        """A level-one heading is expected to set the ``title`` flag."""
        flags = detect_sections("# My Report Title\n\nSome content here.")
        assert flags.get("title") is True

    def test_detect_key_points(self):
        """A "Key Points" heading is expected to set the ``key_points`` flag."""
        flags = detect_sections("## Key Points\n- Point 1\n- Point 2")
        assert flags.get("key_points") is True

    def test_detect_chinese_sections(self):
        """Chinese headings are expected to set the same flags as English ones."""
        document = """# 报告标题
## 要点
- 要点1
## 概述
这是概述内容
        """
        flags = detect_sections(document)
        for section in ("title", "key_points", "overview"):
            assert flags.get(section) is True

    def test_detect_citations_section(self):
        """A "Key Citations" heading is expected to set ``key_citations``."""
        document = """
        ## Key Citations
        - [Source 1](https://example.com)
        """
        flags = detect_sections(document)
        assert flags.get("key_citations") is True


class TestComputeMetrics:
    """Checks on the aggregate result produced by ``compute_metrics``."""

    def test_complete_report(self):
        """A report with every section, links and one image scores well."""
        document = """
# Research Report Title

## Key Points
- Point 1
- Point 2
- Point 3

## Overview
This is an overview of the research topic.

## Detailed Analysis
Here is the detailed analysis with [source](https://example.com).

![Figure 1](https://example.com/image.png)

## Key Citations
- [Source 1](https://example.com)
- [Source 2](https://another.com)
        """
        outcome = compute_metrics(document)

        # Structural flags: each expected section should be detected.
        assert outcome.has_title is True
        assert outcome.has_key_points is True
        assert outcome.has_overview is True
        assert outcome.has_citations_section is True

        # Content counts and the derived coverage score.
        assert outcome.citation_count >= 2
        assert outcome.image_count == 1
        assert outcome.unique_sources >= 1
        assert outcome.section_coverage_score > 0.5

    def test_minimal_report(self):
        """Unstructured text has no title, no citations and low coverage."""
        outcome = compute_metrics("Just some text without structure.")

        assert outcome.has_title is False
        assert outcome.citation_count == 0
        assert outcome.section_coverage_score < 0.5

    def test_metrics_to_dict(self):
        """``to_dict`` is expected to return a dict exposing the core metric keys."""
        as_mapping = compute_metrics("# Title\n\nSome content").to_dict()

        assert isinstance(as_mapping, dict)
        for key in ("word_count", "citation_count", "section_coverage_score"):
            assert key in as_mapping


class TestGetWordCountTarget:
    """Checks on the min/max bounds returned by ``get_word_count_target``."""

    def test_strategic_investment_target(self):
        """The "strategic_investment" style expects 10000 to 15000 words."""
        bounds = get_word_count_target("strategic_investment")
        assert bounds["min"] == 10_000
        assert bounds["max"] == 15_000

    def test_news_target(self):
        """The "news" style expects 800 to 2000 words."""
        bounds = get_word_count_target("news")
        assert bounds["min"] == 800
        assert bounds["max"] == 2_000

    def test_default_target(self):
        """An unrecognised style is expected to get a 1000 to 5000 word range."""
        bounds = get_word_count_target("unknown_style")
        assert bounds["min"] == 1_000
        assert bounds["max"] == 5_000