fix: the crawling error when encountering PDF URLs (#707)

* fix: the crawling error when encountering PDF URLs

* Added unit tests for the new crawl tool feature

* fix: address the code review problems

* fix: address the code review problems
This commit is contained in:
Willem Jiang
2025-11-25 09:24:52 +08:00
committed by GitHub
parent da514337da
commit bec97f02ae
4 changed files with 484 additions and 3 deletions

View File

@@ -1,7 +1,7 @@
import json
from unittest.mock import Mock, patch
from src.tools.crawl import crawl_tool
from src.tools.crawl import crawl_tool, is_pdf_url
class TestCrawlTool:
@@ -131,3 +131,86 @@ class TestCrawlTool:
assert result_dict["url"] == url
assert "crawled_content" in result_dict
assert "No content available" in result_dict["crawled_content"]
class TestPDFHandling:
    """Test PDF URL detection and handling for issue #701."""

    def test_is_pdf_url_with_pdf_urls(self):
        """Test that PDF URLs are correctly identified."""
        # Map each candidate URL to its expected detection result.
        cases = {
            "https://example.com/document.pdf": True,
            "https://example.com/file.PDF": True,  # Case insensitive
            "https://example.com/path/to/report.pdf": True,
            "https://pdf.dfcfw.com/pdf/H3_AP202503071644153386_1.pdf": True,  # URL from issue
            "http://site.com/path/document.pdf?param=value": True,  # With query params
        }
        for candidate, want in cases.items():
            assert is_pdf_url(candidate) == want, f"Failed for URL: {candidate}"

    def test_is_pdf_url_with_non_pdf_urls(self):
        """Test that non-PDF URLs are correctly identified."""
        # Every candidate here must NOT be treated as a PDF.
        cases = [
            "https://example.com/page.html",
            "https://example.com/article.php",
            "https://example.com/",
            "https://example.com/document.pdfx",  # Not exactly .pdf
            "https://example.com/document.doc",
            "https://example.com/document.txt",
            "https://example.com?file=document.pdf",  # Query param, not path
            "",  # Empty string
            None,  # None value
        ]
        for candidate in cases:
            assert is_pdf_url(candidate) == False, f"Failed for URL: {candidate}"

    def test_crawl_tool_with_pdf_url(self):
        """Test that PDF URLs return the expected error structure."""
        pdf_url = "https://example.com/document.pdf"

        # Act
        response = crawl_tool(pdf_url)

        # Assert: the tool returns a JSON string describing the PDF error.
        assert isinstance(response, str)
        payload = json.loads(response)
        # Check structure of PDF error response
        assert payload["url"] == pdf_url
        assert "error" in payload
        assert payload["crawled_content"] is None
        assert payload["is_pdf"] is True
        assert "PDF files cannot be crawled directly" in payload["error"]

    def test_crawl_tool_with_issue_pdf_url(self):
        """Test with the exact PDF URL from issue #701."""
        issue_pdf_url = "https://pdf.dfcfw.com/pdf/H3_AP202503071644153386_1.pdf"

        # Act
        payload = json.loads(crawl_tool(issue_pdf_url))

        # Assert
        assert payload["url"] == issue_pdf_url
        assert payload["is_pdf"] is True
        assert "cannot be crawled directly" in payload["error"]

    @patch("src.tools.crawl.Crawler")
    @patch("src.tools.crawl.logger")
    def test_crawl_tool_skips_crawler_for_pdfs(self, mock_logger, mock_crawler_class):
        """Test that the crawler is not instantiated for PDF URLs."""
        pdf_url = "https://example.com/document.pdf"

        # Act
        response = crawl_tool(pdf_url)

        # Assert: Crawler should not be instantiated for PDF URLs,
        # and the skip should be logged exactly once.
        mock_crawler_class.assert_not_called()
        mock_logger.info.assert_called_once_with(
            f"PDF URL detected, skipping crawling: {pdf_url}"
        )
        # Should return proper PDF error structure
        assert json.loads(response)["is_pdf"] is True