fix: the crawling error when encountering PDF URLs (#707)

* fix: the crawling error when encountering PDF URLs

* Added unit tests for the new crawl tool feature

* fix: address the code review problems

* fix: address the code review problems
This commit is contained in:
Willem Jiang
2025-11-25 09:24:52 +08:00
committed by GitHub
parent da514337da
commit bec97f02ae
4 changed files with 484 additions and 3 deletions

View File

@@ -1,7 +1,7 @@
import json
from unittest.mock import Mock, patch
from src.tools.crawl import crawl_tool
from src.tools.crawl import crawl_tool, is_pdf_url
class TestCrawlTool:
@@ -131,3 +131,86 @@ class TestCrawlTool:
assert result_dict["url"] == url
assert "crawled_content" in result_dict
assert "No content available" in result_dict["crawled_content"]
class TestPDFHandling:
    """Test PDF URL detection and handling for issue #701."""

    def test_is_pdf_url_with_pdf_urls(self):
        """Test that PDF URLs are correctly identified."""
        # Map each candidate URL to its expected detection result.
        cases = {
            "https://example.com/document.pdf": True,
            "https://example.com/file.PDF": True,  # Case insensitive
            "https://example.com/path/to/report.pdf": True,
            "https://pdf.dfcfw.com/pdf/H3_AP202503071644153386_1.pdf": True,  # URL from issue
            "http://site.com/path/document.pdf?param=value": True,  # With query params
        }
        for candidate, want in cases.items():
            assert is_pdf_url(candidate) == want, f"Failed for URL: {candidate}"

    def test_is_pdf_url_with_non_pdf_urls(self):
        """Test that non-PDF URLs are correctly identified."""
        # Every candidate here must NOT be treated as a PDF.
        cases = [
            "https://example.com/page.html",
            "https://example.com/article.php",
            "https://example.com/",
            "https://example.com/document.pdfx",  # Not exactly .pdf
            "https://example.com/document.doc",
            "https://example.com/document.txt",
            "https://example.com?file=document.pdf",  # Query param, not path
            "",  # Empty string
            None,  # None value
        ]
        for candidate in cases:
            assert is_pdf_url(candidate) == False, f"Failed for URL: {candidate}"

    def test_crawl_tool_with_pdf_url(self):
        """Test that PDF URLs return the expected error structure."""
        pdf_url = "https://example.com/document.pdf"

        # Act
        response = crawl_tool(pdf_url)

        # Assert: the tool returns a JSON string describing the PDF error.
        assert isinstance(response, str)
        payload = json.loads(response)
        # Check structure of PDF error response
        assert payload["url"] == pdf_url
        assert "error" in payload
        assert payload["crawled_content"] is None
        assert payload["is_pdf"] is True
        assert "PDF files cannot be crawled directly" in payload["error"]

    def test_crawl_tool_with_issue_pdf_url(self):
        """Test with the exact PDF URL from issue #701."""
        issue_pdf_url = "https://pdf.dfcfw.com/pdf/H3_AP202503071644153386_1.pdf"

        # Act
        payload = json.loads(crawl_tool(issue_pdf_url))

        # Assert
        assert payload["url"] == issue_pdf_url
        assert payload["is_pdf"] is True
        assert "cannot be crawled directly" in payload["error"]

    @patch("src.tools.crawl.Crawler")
    @patch("src.tools.crawl.logger")
    def test_crawl_tool_skips_crawler_for_pdfs(self, mock_logger, mock_crawler_class):
        """Test that the crawler is not instantiated for PDF URLs."""
        pdf_url = "https://example.com/document.pdf"

        # Act
        response = crawl_tool(pdf_url)

        # Assert: Crawler should not be instantiated for PDF URLs,
        # and the skip should be logged exactly once.
        mock_crawler_class.assert_not_called()
        mock_logger.info.assert_called_once_with(
            f"PDF URL detected, skipping crawling: {pdf_url}"
        )
        # Should return proper PDF error structure
        assert json.loads(response)["is_pdf"] is True