fix: the crawling error when encountering PDF URLs (#707)

* fix: the crawling error when encountering PDF URLs

* Added the unit test for the new feature of crawl tool

* fix: address the code review problems

* fix: address the code review problems
This commit is contained in:
Willem Jiang
2025-11-25 09:24:52 +08:00
committed by GitHub
parent da514337da
commit bec97f02ae
4 changed files with 484 additions and 3 deletions

View File

@@ -2,6 +2,7 @@
# SPDX-License-Identifier: MIT
import src.crawler as crawler_module
from src.crawler.crawler import safe_truncate
def test_crawler_sets_article_url(monkeypatch):
@@ -68,3 +69,232 @@ def test_crawler_calls_dependencies(monkeypatch):
assert calls["jina"][1] == "html"
assert "extractor" in calls
assert calls["extractor"] == "<html>dummy</html>"
def test_crawler_handles_empty_content(monkeypatch):
    """Test that the crawler handles empty content gracefully.

    When JinaClient returns an empty payload there is nothing to extract, so
    ReadabilityExtractor must never be invoked; the stub fails loudly if it is.
    The crawler is expected to synthesize a fallback article instead of raising.
    """

    class DummyJinaClient:
        def crawl(self, url, return_format=None):
            return ""  # Empty content

    class DummyReadabilityExtractor:
        def extract_article(self, html):
            # This should not be called for empty content
            assert False, "ReadabilityExtractor should not be called for empty content"

    monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
    monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)

    crawler = crawler_module.Crawler()
    url = "http://example.com"
    article = crawler.crawl(url)

    # Fallback article carries the original URL plus an explanatory placeholder.
    assert article.url == url
    assert article.title == "Empty Content"
    assert "No content could be extracted" in article.html_content
def test_crawler_handles_non_html_content(monkeypatch):
    """Test that the crawler handles non-HTML content gracefully.

    A plain-text payload cannot be parsed as HTML, so ReadabilityExtractor must
    never run; the crawler should instead return a fallback article that embeds
    a snippet of the original payload.
    """

    class DummyJinaClient:
        def crawl(self, url, return_format=None):
            return "This is plain text content, not HTML"

    class DummyReadabilityExtractor:
        def extract_article(self, html):
            # This should not be called for non-HTML content
            assert False, "ReadabilityExtractor should not be called for non-HTML content"

    monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
    monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)

    crawler = crawler_module.Crawler()
    url = "http://example.com"
    article = crawler.crawl(url)

    assert article.url == url
    assert article.title == "Non-HTML Content"
    assert "cannot be parsed as HTML" in article.html_content
    assert "plain text content" in article.html_content  # Should include a snippet of the original content
def test_crawler_handles_extraction_failure(monkeypatch):
    """Test that the crawler handles readability extraction failure gracefully.

    The extractor stub raises unconditionally; the crawler must swallow the
    exception and return a fallback article that includes a snippet of the
    HTML it could not process.
    """

    class DummyJinaClient:
        def crawl(self, url, return_format=None):
            return "<html><body>Valid HTML but extraction will fail</body></html>"

    class DummyReadabilityExtractor:
        def extract_article(self, html):
            raise Exception("Extraction failed")

    monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
    monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)

    crawler = crawler_module.Crawler()
    url = "http://example.com"
    article = crawler.crawl(url)

    assert article.url == url
    assert article.title == "Content Extraction Failed"
    assert "Content extraction failed" in article.html_content
    assert "Valid HTML but extraction will fail" in article.html_content  # Should include a snippet of the HTML
def test_crawler_with_json_like_content(monkeypatch):
    """Test that the crawler handles JSON-like content gracefully.

    A JSON payload (typical for API endpoints) is not HTML, so the extractor
    must never run; the fallback article should embed a snippet of the JSON.
    """

    class DummyJinaClient:
        def crawl(self, url, return_format=None):
            return '{"title": "Some JSON", "content": "This is JSON content"}'

    class DummyReadabilityExtractor:
        def extract_article(self, html):
            # This should not be called for JSON content
            assert False, "ReadabilityExtractor should not be called for JSON content"

    monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
    monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)

    crawler = crawler_module.Crawler()
    url = "http://example.com/api/data"
    article = crawler.crawl(url)

    assert article.url == url
    assert article.title == "Non-HTML Content"
    assert "cannot be parsed as HTML" in article.html_content
    assert '{"title": "Some JSON"' in article.html_content  # Should include a snippet of the JSON
def test_crawler_with_various_html_formats(monkeypatch):
    """Test that the crawler correctly identifies various HTML formats.

    Each payload is valid HTML in a slightly different shape; all of them must
    reach the readability extractor and come back as the extracted article.
    """

    class DummyArticle:
        def __init__(self, title, html_content):
            self.title = title
            self.html_content = html_content
            self.url = None

        def to_markdown(self):
            return f"# {self.title}"

    class DummyReadabilityExtractor:
        def extract_article(self, html):
            return DummyArticle("Extracted Article", "<p>Extracted content</p>")

    def make_jina_client(payload):
        # Build a stub JinaClient class whose crawl() returns *payload*.
        class _Client:
            def crawl(self, url, return_format=None):
                return payload

        return _Client

    samples = [
        ("HTML with DOCTYPE", "<!DOCTYPE html><html><body><p>Test content</p></body></html>"),
        ("HTML with leading whitespace", "\n\n <html><body><p>Test content</p></body></html>"),
        ("HTML with comments", "<!-- HTML comment --><html><body><p>Test content</p></body></html>"),
        ("HTML with self-closing tags", '<img src="test.jpg" alt="test" /><p>Test content</p>'),
    ]

    for description, payload in samples:
        monkeypatch.setattr("src.crawler.crawler.JinaClient", make_jina_client(payload))
        monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
        crawler = crawler_module.Crawler()
        url = "http://example.com"
        article = crawler.crawl(url)
        assert article.url == url
        assert article.title == "Extracted Article"
        assert "Extracted content" in article.html_content
def test_safe_truncate_function():
    """Test the safe_truncate function handles various character sets correctly."""
    # Degenerate inputs pass through untouched.
    assert safe_truncate(None) is None
    assert safe_truncate("") == ""
    assert safe_truncate("Short text") == "Short text"

    # ASCII text beyond the limit is shortened and marked with an ellipsis.
    truncated = safe_truncate("This is a longer text that needs truncation", 20)
    assert len(truncated) <= 20
    assert "..." in truncated

    # Unicode/emoji text: stays within the limit and remains valid UTF-8.
    truncated = safe_truncate("Hello! 🌍 Welcome to the world 🚀", 20)
    assert len(truncated) <= 20
    assert "..." in truncated
    assert truncated.encode('utf-8').decode('utf-8') == truncated

    # Limits smaller than the ellipsis collapse to that many dots.
    for limit in (1, 2, 3):
        assert safe_truncate("Long text", limit) == "." * limit

    # Chinese characters: within the limit and still valid UTF-8.
    truncated = safe_truncate("这是一个中文测试文本", 10)
    assert len(truncated) <= 10
    assert truncated.encode('utf-8').decode('utf-8') == truncated

View File

@@ -1,7 +1,7 @@
import json
from unittest.mock import Mock, patch
from src.tools.crawl import crawl_tool
from src.tools.crawl import crawl_tool, is_pdf_url
class TestCrawlTool:
@@ -131,3 +131,86 @@ class TestCrawlTool:
assert result_dict["url"] == url
assert "crawled_content" in result_dict
assert "No content available" in result_dict["crawled_content"]
class TestPDFHandling:
    """Test PDF URL detection and handling for issue #701."""

    def test_is_pdf_url_with_pdf_urls(self):
        """Test that PDF URLs are correctly identified."""
        cases = (
            ("https://example.com/document.pdf", True),
            ("https://example.com/file.PDF", True),  # Case insensitive
            ("https://example.com/path/to/report.pdf", True),
            ("https://pdf.dfcfw.com/pdf/H3_AP202503071644153386_1.pdf", True),  # URL from issue
            ("http://site.com/path/document.pdf?param=value", True),  # With query params
        )
        for url, expected in cases:
            assert is_pdf_url(url) == expected, f"Failed for URL: {url}"

    def test_is_pdf_url_with_non_pdf_urls(self):
        """Test that non-PDF URLs are correctly identified."""
        cases = (
            ("https://example.com/page.html", False),
            ("https://example.com/article.php", False),
            ("https://example.com/", False),
            ("https://example.com/document.pdfx", False),  # Not exactly .pdf
            ("https://example.com/document.doc", False),
            ("https://example.com/document.txt", False),
            ("https://example.com?file=document.pdf", False),  # Query param, not path
            ("", False),  # Empty string
            (None, False),  # None value
        )
        for url, expected in cases:
            assert is_pdf_url(url) == expected, f"Failed for URL: {url}"

    def test_crawl_tool_with_pdf_url(self):
        """Test that PDF URLs return the expected error structure."""
        pdf_url = "https://example.com/document.pdf"

        payload = crawl_tool(pdf_url)

        # The tool serializes its answer as a JSON string.
        assert isinstance(payload, str)
        parsed = json.loads(payload)

        # Check structure of PDF error response.
        assert parsed["url"] == pdf_url
        assert "error" in parsed
        assert parsed["crawled_content"] is None
        assert parsed["is_pdf"] is True
        assert "PDF files cannot be crawled directly" in parsed["error"]

    def test_crawl_tool_with_issue_pdf_url(self):
        """Test with the exact PDF URL from issue #701."""
        issue_pdf_url = "https://pdf.dfcfw.com/pdf/H3_AP202503071644153386_1.pdf"

        parsed = json.loads(crawl_tool(issue_pdf_url))

        assert parsed["url"] == issue_pdf_url
        assert parsed["is_pdf"] is True
        assert "cannot be crawled directly" in parsed["error"]

    @patch("src.tools.crawl.Crawler")
    @patch("src.tools.crawl.logger")
    def test_crawl_tool_skips_crawler_for_pdfs(self, mock_logger, mock_crawler_class):
        """Test that the crawler is not instantiated for PDF URLs."""
        pdf_url = "https://example.com/document.pdf"

        payload = crawl_tool(pdf_url)

        # Crawler should not be instantiated for PDF URLs; the skip is logged.
        mock_crawler_class.assert_not_called()
        mock_logger.info.assert_called_once_with(f"PDF URL detected, skipping crawling: {pdf_url}")
        # Should return proper PDF error structure.
        assert json.loads(payload)["is_pdf"] is True