fix: the crawling error when encountering PDF URLs (#707)

* fix: the crawling error when encountering PDF URLs

* Add unit tests for the new crawl tool behavior

* fix: address the code review problems

* fix: address the code review problems
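
In short, crawl_tool now short-circuits PDF URLs and returns a structured JSON
error payload instead of raising. A minimal sketch of the new behavior, calling
the tool directly the way the unit tests below do:

    import json
    from src.tools.crawl import crawl_tool, is_pdf_url

    assert is_pdf_url("https://example.com/report.pdf")        # extension check is case-insensitive
    assert not is_pdf_url("https://example.com?file=doc.pdf")  # query string does not count

    result = json.loads(crawl_tool("https://example.com/report.pdf"))
    # -> {"url": "...", "error": "PDF files cannot be crawled directly. ...",
    #     "crawled_content": None, "is_pdf": True}
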
Willem Jiang authored on 2025-11-25 09:24:52 +08:00, committed by GitHub
parent da514337da
commit bec97f02ae
4 changed files with 484 additions and 3 deletions

View File: src/crawler/crawler.py

@@ -1,6 +1,7 @@
 # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
 # SPDX-License-Identifier: MIT
+import re
 import logging

 from .article import Article
@@ -10,6 +11,126 @@ from .readability_extractor import ReadabilityExtractor
 logger = logging.getLogger(__name__)
+
+
+def safe_truncate(text: str, max_length: int = 500) -> str:
+    """
+    Safely truncate text to a maximum length without breaking multi-byte characters.
+
+    Args:
+        text: The text to truncate
+        max_length: Maximum number of characters to keep
+
+    Returns:
+        Truncated text that is safe to use without encoding issues,
+        or None if text is None
+    """
+    if text is None:
+        return None
+    if len(text) <= max_length:
+        return text
+
+    # Ensure max_length is at least 3 to accommodate the placeholder
+    if max_length < 3:
+        return "..."[:max_length]
+
+    # Use Python's built-in textwrap.shorten, which handles Unicode safely
+    try:
+        import textwrap
+        return textwrap.shorten(text, width=max_length, placeholder="...")
+    except (ImportError, TypeError):
+        # Defensive fallback in case textwrap.shorten rejects the input:
+        # truncate to max_length - 3 to make room for "..."
+        truncated = text[:max_length - 3]
+        # Drop any trailing lone surrogate so the result stays encodable
+        while truncated and 0xD800 <= ord(truncated[-1]) <= 0xDFFF:
+            truncated = truncated[:-1]
+        return truncated + "..."
+
+
+def is_html_content(content: str) -> bool:
+    """
+    Check if the provided content is HTML.
+
+    Uses a more robust detection method that checks for common HTML patterns
+    including DOCTYPE declarations, HTML tags, and other HTML markers.
+    """
+    if not content or not content.strip():
+        return False
+
+    content = content.strip()
+
+    # Check for HTML comments
+    if content.startswith('<!--') and '-->' in content:
+        return True
+
+    # Check for DOCTYPE declarations (case insensitive)
+    if re.match(r'^<!DOCTYPE\s+html', content, re.IGNORECASE):
+        return True
+
+    # Check for XML declarations followed by HTML
+    if content.startswith('<?xml') and '<html' in content:
+        return True
+
+    # Check for common HTML tags at the beginning
+    html_start_patterns = [
+        r'^<html',
+        r'^<head',
+        r'^<body',
+        r'^<title',
+        r'^<meta',
+        r'^<link',
+        r'^<script',
+        r'^<style',
+        r'^<div',
+        r'^<p>',
+        r'^<p\s',
+        r'^<span',
+        r'^<h[1-6]',
+        r'^<!DOCTYPE',
+    ]
+    for pattern in html_start_patterns:
+        if re.match(pattern, content, re.IGNORECASE):
+            return True
+
+    # Check for any HTML-like tags in the content (more permissive)
+    if re.search(r'<[^>]+>', content):
+        # Additional check: ensure it's not just XML or other markup.
+        # Look for common HTML attributes or elements.
+        html_indicators = [
+            r'href\s*=',
+            r'src\s*=',
+            r'class\s*=',
+            r'id\s*=',
+            r'<img\s',
+            r'<a\s',
+            r'<div',
+            r'<p>',
+            r'<p\s',
+            r'<!DOCTYPE',
+        ]
+        for indicator in html_indicators:
+            if re.search(indicator, content, re.IGNORECASE):
+                return True
+
+        # Also check for self-closing HTML tags
+        self_closing_tags = [
+            r'<img\s+[^>]*?/>',
+            r'<br\s*/?>',
+            r'<hr\s*/?>',
+            r'<input\s+[^>]*?/>',
+            r'<meta\s+[^>]*?/>',
+            r'<link\s+[^>]*?/>',
+        ]
+        for tag in self_closing_tags:
+            if re.search(tag, content, re.IGNORECASE):
+                return True
+
+    return False
+
+
 class Crawler:
     def crawl(self, url: str) -> Article:
         # To help LLMs better understand content, we extract clean
@@ -29,12 +150,39 @@ class Crawler:
             logger.error(f"Failed to fetch URL {url} from Jina: {repr(e)}")
             raise
+
+        # Check if we got valid HTML content
+        if not html or not html.strip():
+            logger.warning(f"Empty content received from URL {url}")
+            article = Article(
+                title="Empty Content",
+                html_content="<p>No content could be extracted from this page</p>",
+            )
+            article.url = url
+            return article
+
+        # Check if the content is actually HTML using the more robust detection
+        if not is_html_content(html):
+            logger.warning(f"Non-HTML content received from URL {url}, creating fallback article")
+            # Return a simple article with the raw content (safely truncated)
+            article = Article(
+                title="Non-HTML Content",
+                html_content=f"<p>This URL returned content that cannot be parsed as HTML. Raw content: {safe_truncate(html, 500)}</p>",
+            )
+            article.url = url
+            return article
+
         try:
             extractor = ReadabilityExtractor()
             article = extractor.extract_article(html)
         except Exception as e:
             logger.error(f"Failed to extract article from {url}: {repr(e)}")
-            raise
+            # Fall back to a simple article with the raw HTML (safely truncated)
+            article = Article(
+                title="Content Extraction Failed",
+                html_content=f"<p>Content extraction failed. Raw content: {safe_truncate(html, 500)}</p>",
+            )
+            article.url = url
+            return article
         article.url = url
         return article
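
The two helpers above are plain module-level functions, so they can be probed in
isolation. A small usage sketch, assuming the src.crawler.crawler module path
that the tests below use:

    from src.crawler.crawler import is_html_content, safe_truncate

    text = "word " * 200                    # 1000 characters
    short = safe_truncate(text, 50)
    assert len(short) <= 50 and short.endswith("...")

    assert is_html_content("<!DOCTYPE html><html><body>hi</body></html>")
    assert is_html_content('<img src="x.jpg" />')     # self-closing tag plus src= indicator
    assert not is_html_content('{"title": "Some JSON"}')
    assert not is_html_content("plain text, no markup")

One caveat worth noting: textwrap.shorten collapses runs of whitespace and
truncates on word boundaries, so safe_truncate can return noticeably fewer than
max_length characters when the input contains long unbroken tokens.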

View File: src/tools/crawl.py

@@ -3,7 +3,8 @@
 import json
 import logging
-from typing import Annotated
+from typing import Annotated, Optional
+from urllib.parse import urlparse

 from langchain_core.tools import tool
@@ -13,6 +14,14 @@ from .decorators import log_io
 logger = logging.getLogger(__name__)
+
+
+def is_pdf_url(url: Optional[str]) -> bool:
+    """Check if the URL points to a PDF file."""
+    if not url:
+        return False
+    parsed_url = urlparse(url)
+    # Check if the path ends with .pdf (case insensitive)
+    return parsed_url.path.lower().endswith('.pdf')
+
+
 @tool
 @log_io
@@ -20,6 +29,17 @@ def crawl_tool(
     url: Annotated[str, "The url to crawl."],
 ) -> str:
     """Use this to crawl a url and get readable content in markdown format."""
+    # Special handling for PDF URLs
+    if is_pdf_url(url):
+        logger.info(f"PDF URL detected, skipping crawling: {url}")
+        pdf_message = json.dumps({
+            "url": url,
+            "error": "PDF files cannot be crawled directly. Please download and view the PDF manually.",
+            "crawled_content": None,
+            "is_pdf": True,
+        })
+        return pdf_message
+
     try:
         crawler = Crawler()
         article = crawler.crawl(url)
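
Downstream consumers can branch on the is_pdf flag in the returned JSON, since
both the PDF path and the normal path serialize a payload with a url key. A
hedged sketch of such a caller (handle_crawl_result is a hypothetical helper,
not part of this change):

    import json
    from src.tools.crawl import crawl_tool

    def handle_crawl_result(raw: str) -> str:
        # Hypothetical consumer-side helper; not part of this commit.
        result = json.loads(raw)
        if result.get("is_pdf"):
            return f"Skipped PDF: {result['url']} ({result['error']})"
        return result.get("crawled_content") or "No content available"

    print(handle_crawl_result(crawl_tool("https://example.com/paper.pdf")))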

View File: tests for src/crawler/crawler.py

@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: MIT

 import src.crawler as crawler_module
+from src.crawler.crawler import safe_truncate


 def test_crawler_sets_article_url(monkeypatch):
@@ -68,3 +69,232 @@ def test_crawler_calls_dependencies(monkeypatch):
     assert calls["jina"][1] == "html"
     assert "extractor" in calls
     assert calls["extractor"] == "<html>dummy</html>"
+
+
+def test_crawler_handles_empty_content(monkeypatch):
+    """Test that the crawler handles empty content gracefully."""
+
+    class DummyArticle:
+        def __init__(self, title, html_content):
+            self.title = title
+            self.html_content = html_content
+            self.url = None
+
+        def to_markdown(self):
+            return f"# {self.title}"
+
+    class DummyJinaClient:
+        def crawl(self, url, return_format=None):
+            return ""  # Empty content
+
+    class DummyReadabilityExtractor:
+        def extract_article(self, html):
+            # This should not be called for empty content
+            assert False, "ReadabilityExtractor should not be called for empty content"
+
+    monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
+    monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
+
+    crawler = crawler_module.Crawler()
+    url = "http://example.com"
+    article = crawler.crawl(url)
+
+    assert article.url == url
+    assert article.title == "Empty Content"
+    assert "No content could be extracted" in article.html_content
+
+
+def test_crawler_handles_non_html_content(monkeypatch):
+    """Test that the crawler handles non-HTML content gracefully."""
+
+    class DummyArticle:
+        def __init__(self, title, html_content):
+            self.title = title
+            self.html_content = html_content
+            self.url = None
+
+        def to_markdown(self):
+            return f"# {self.title}"
+
+    class DummyJinaClient:
+        def crawl(self, url, return_format=None):
+            return "This is plain text content, not HTML"
+
+    class DummyReadabilityExtractor:
+        def extract_article(self, html):
+            # This should not be called for non-HTML content
+            assert False, "ReadabilityExtractor should not be called for non-HTML content"
+
+    monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
+    monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
+
+    crawler = crawler_module.Crawler()
+    url = "http://example.com"
+    article = crawler.crawl(url)
+
+    assert article.url == url
+    assert article.title == "Non-HTML Content"
+    assert "cannot be parsed as HTML" in article.html_content
+    assert "plain text content" in article.html_content  # Should include a snippet of the original content
+
+
+def test_crawler_handles_extraction_failure(monkeypatch):
+    """Test that the crawler handles readability extraction failure gracefully."""
+
+    class DummyArticle:
+        def __init__(self, title, html_content):
+            self.title = title
+            self.html_content = html_content
+            self.url = None
+
+        def to_markdown(self):
+            return f"# {self.title}"
+
+    class DummyJinaClient:
+        def crawl(self, url, return_format=None):
+            return "<html><body>Valid HTML but extraction will fail</body></html>"
+
+    class DummyReadabilityExtractor:
+        def extract_article(self, html):
+            raise Exception("Extraction failed")
+
+    monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
+    monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
+
+    crawler = crawler_module.Crawler()
+    url = "http://example.com"
+    article = crawler.crawl(url)
+
+    assert article.url == url
+    assert article.title == "Content Extraction Failed"
+    assert "Content extraction failed" in article.html_content
+    assert "Valid HTML but extraction will fail" in article.html_content  # Should include a snippet of the HTML
+
+
+def test_crawler_with_json_like_content(monkeypatch):
+    """Test that the crawler handles JSON-like content gracefully."""
+
+    class DummyArticle:
+        def __init__(self, title, html_content):
+            self.title = title
+            self.html_content = html_content
+            self.url = None
+
+        def to_markdown(self):
+            return f"# {self.title}"
+
+    class DummyJinaClient:
+        def crawl(self, url, return_format=None):
+            return '{"title": "Some JSON", "content": "This is JSON content"}'
+
+    class DummyReadabilityExtractor:
+        def extract_article(self, html):
+            # This should not be called for JSON content
+            assert False, "ReadabilityExtractor should not be called for JSON content"
+
+    monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
+    monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
+
+    crawler = crawler_module.Crawler()
+    url = "http://example.com/api/data"
+    article = crawler.crawl(url)
+
+    assert article.url == url
+    assert article.title == "Non-HTML Content"
+    assert "cannot be parsed as HTML" in article.html_content
+    assert '{"title": "Some JSON"' in article.html_content  # Should include a snippet of the JSON
+
+
+def test_crawler_with_various_html_formats(monkeypatch):
+    """Test that the crawler correctly identifies various HTML formats."""
+
+    class DummyArticle:
+        def __init__(self, title, html_content):
+            self.title = title
+            self.html_content = html_content
+            self.url = None
+
+        def to_markdown(self):
+            return f"# {self.title}"
+
+    # Test case 1: HTML with DOCTYPE
+    class DummyJinaClient1:
+        def crawl(self, url, return_format=None):
+            return "<!DOCTYPE html><html><body><p>Test content</p></body></html>"
+
+    # Test case 2: HTML with leading whitespace
+    class DummyJinaClient2:
+        def crawl(self, url, return_format=None):
+            return "\n\n  <html><body><p>Test content</p></body></html>"
+
+    # Test case 3: HTML with comments
+    class DummyJinaClient3:
+        def crawl(self, url, return_format=None):
+            return "<!-- HTML comment --><html><body><p>Test content</p></body></html>"
+
+    # Test case 4: HTML with self-closing tags
+    class DummyJinaClient4:
+        def crawl(self, url, return_format=None):
+            return '<img src="test.jpg" alt="test" /><p>Test content</p>'
+
+    class DummyReadabilityExtractor:
+        def extract_article(self, html):
+            return DummyArticle("Extracted Article", "<p>Extracted content</p>")
+
+    # Test each HTML format
+    test_cases = [
+        (DummyJinaClient1, "HTML with DOCTYPE"),
+        (DummyJinaClient2, "HTML with leading whitespace"),
+        (DummyJinaClient3, "HTML with comments"),
+        (DummyJinaClient4, "HTML with self-closing tags"),
+    ]
+
+    for JinaClientClass, description in test_cases:
+        monkeypatch.setattr("src.crawler.crawler.JinaClient", JinaClientClass)
+        monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
+
+        crawler = crawler_module.Crawler()
+        url = "http://example.com"
+        article = crawler.crawl(url)
+
+        assert article.url == url
+        assert article.title == "Extracted Article"
+        assert "Extracted content" in article.html_content
+
+
+def test_safe_truncate_function():
+    """Test the safe_truncate function handles various character sets correctly."""
+    # Test None input
+    assert safe_truncate(None) is None
+
+    # Test empty string
+    assert safe_truncate("") == ""
+
+    # Test string shorter than limit
+    assert safe_truncate("Short text") == "Short text"
+
+    # Test ASCII truncation
+    result = safe_truncate("This is a longer text that needs truncation", 20)
+    assert len(result) <= 20
+    assert "..." in result
+
+    # Test Unicode/emoji characters
+    text_with_emoji = "Hello! 🌍 Welcome to the world 🚀"
+    result = safe_truncate(text_with_emoji, 20)
+    assert len(result) <= 20
+    assert "..." in result
+    # Verify it's valid UTF-8
+    assert result.encode('utf-8').decode('utf-8') == result
+
+    # Test very small limits
+    assert safe_truncate("Long text", 1) == "."
+    assert safe_truncate("Long text", 2) == ".."
+    assert safe_truncate("Long text", 3) == "..."
+
+    # Test with Chinese characters
+    chinese_text = "这是一个中文测试文本"
+    result = safe_truncate(chinese_text, 10)
+    assert len(result) <= 10
+    # Verify it's valid UTF-8
+    assert result.encode('utf-8').decode('utf-8') == result

View File: tests for src/tools/crawl.py

@@ -1,7 +1,7 @@
 import json
 from unittest.mock import Mock, patch

-from src.tools.crawl import crawl_tool
+from src.tools.crawl import crawl_tool, is_pdf_url


 class TestCrawlTool:
@@ -131,3 +131,86 @@ class TestCrawlTool:
         assert result_dict["url"] == url
         assert "crawled_content" in result_dict
         assert "No content available" in result_dict["crawled_content"]
+
+
+class TestPDFHandling:
+    """Test PDF URL detection and handling for issue #701."""
+
+    def test_is_pdf_url_with_pdf_urls(self):
+        """Test that PDF URLs are correctly identified."""
+        test_cases = [
+            ("https://example.com/document.pdf", True),
+            ("https://example.com/file.PDF", True),  # Case insensitive
+            ("https://example.com/path/to/report.pdf", True),
+            ("https://pdf.dfcfw.com/pdf/H3_AP202503071644153386_1.pdf", True),  # URL from the issue
+            ("http://site.com/path/document.pdf?param=value", True),  # With query params
+        ]
+        for url, expected in test_cases:
+            assert is_pdf_url(url) == expected, f"Failed for URL: {url}"
+
+    def test_is_pdf_url_with_non_pdf_urls(self):
+        """Test that non-PDF URLs are correctly identified."""
+        test_cases = [
+            ("https://example.com/page.html", False),
+            ("https://example.com/article.php", False),
+            ("https://example.com/", False),
+            ("https://example.com/document.pdfx", False),  # Not exactly .pdf
+            ("https://example.com/document.doc", False),
+            ("https://example.com/document.txt", False),
+            ("https://example.com?file=document.pdf", False),  # Query param, not path
+            ("", False),  # Empty string
+            (None, False),  # None value
+        ]
+        for url, expected in test_cases:
+            assert is_pdf_url(url) == expected, f"Failed for URL: {url}"
+
+    def test_crawl_tool_with_pdf_url(self):
+        """Test that PDF URLs return the expected error structure."""
+        pdf_url = "https://example.com/document.pdf"
+
+        # Act
+        result = crawl_tool(pdf_url)
+
+        # Assert
+        assert isinstance(result, str)
+        result_dict = json.loads(result)
+
+        # Check structure of PDF error response
+        assert result_dict["url"] == pdf_url
+        assert "error" in result_dict
+        assert result_dict["crawled_content"] is None
+        assert result_dict["is_pdf"] is True
+        assert "PDF files cannot be crawled directly" in result_dict["error"]
+
+    def test_crawl_tool_with_issue_pdf_url(self):
+        """Test with the exact PDF URL from issue #701."""
+        issue_pdf_url = "https://pdf.dfcfw.com/pdf/H3_AP202503071644153386_1.pdf"
+
+        # Act
+        result = crawl_tool(issue_pdf_url)
+
+        # Assert
+        result_dict = json.loads(result)
+        assert result_dict["url"] == issue_pdf_url
+        assert result_dict["is_pdf"] is True
+        assert "cannot be crawled directly" in result_dict["error"]
+
+    @patch("src.tools.crawl.Crawler")
+    @patch("src.tools.crawl.logger")
+    def test_crawl_tool_skips_crawler_for_pdfs(self, mock_logger, mock_crawler_class):
+        """Test that the crawler is not instantiated for PDF URLs."""
+        pdf_url = "https://example.com/document.pdf"
+
+        # Act
+        result = crawl_tool(pdf_url)
+
+        # Assert
+        # Crawler should not be instantiated for PDF URLs
+        mock_crawler_class.assert_not_called()
+        mock_logger.info.assert_called_once_with(f"PDF URL detected, skipping crawling: {pdf_url}")
+
+        # Should return proper PDF error structure
+        result_dict = json.loads(result)
+        assert result_dict["is_pdf"] is True
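
With these changes in place, both test modules can be exercised selectively,
assuming pytest as the runner (the monkeypatch fixture used above is pytest's):

    pytest -k "pdf or safe_truncate or crawler" -v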