mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-03 06:12:14 +08:00
fix: the crawling error when encountering PDF URLs (#707)
* fix: the crawling error when encountering PDF URLs
* Added a unit test for the new crawl-tool feature
* fix: address the code review problems
* fix: address the code review problems
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
import re
|
||||
import logging
|
||||
|
||||
from .article import Article
|
||||
@@ -10,6 +11,126 @@ from .readability_extractor import ReadabilityExtractor
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def safe_truncate(text: str, max_length: int = 500) -> str:
    """
    Safely truncate text to a maximum length without breaking multi-byte characters.

    Python 3 ``str`` slicing operates on code points, so a plain slice can
    never split a multi-byte UTF-8 sequence; the result is always safe to
    re-encode. Unlike ``textwrap.shorten``, this keeps the original
    whitespace intact and does not drop whole words, so the output is a
    true prefix of the input (plus the placeholder).

    Args:
        text: The text to truncate (``None`` is passed through unchanged)
        max_length: Maximum number of characters to keep, including the
            "..." placeholder appended when truncation occurs

    Returns:
        Truncated text that is safe to use without encoding issues
    """
    if text is None:
        return None

    if len(text) <= max_length:
        return text

    # The "..." placeholder alone needs 3 characters; below that,
    # emit whatever prefix of the placeholder fits.
    if max_length < 3:
        return "..."[:max_length]

    # Truncate to max_length - 3 to make room for "...". No surrogate-pair
    # cleanup is needed: slicing a well-formed str cannot create one.
    return text[:max_length - 3] + "..."
|
||||
|
||||
|
||||
def is_html_content(content: str) -> bool:
    """
    Heuristically decide whether the given content looks like HTML.

    Checks, in order: leading HTML comments, DOCTYPE declarations, XML
    prologs followed by an <html> element, well-known opening tags, common
    HTML attributes inside any tag-like markup, and finally typical
    self-closing tags.
    """
    if not content or not content.strip():
        return False

    text = content.strip()

    # A document that opens with a (closed) comment is treated as HTML.
    if text.startswith('<!--') and '-->' in text:
        return True

    # <!DOCTYPE html ...> in any casing.
    if re.match(r'<!DOCTYPE\s+html', text, re.IGNORECASE):
        return True

    # An XML prolog with an <html> element somewhere after it.
    if text.startswith('<?xml') and '<html' in text:
        return True

    # Well-known tags (or a bare DOCTYPE) right at the start of the document.
    if re.match(
        r'<(?:html|head|body|title|meta|link|script|style|div'
        r'|p>|p\s|span|h[1-6]|!DOCTYPE)',
        text,
        re.IGNORECASE,
    ):
        return True

    # Tag-like markup anywhere in the content (more permissive). Require an
    # HTML-specific attribute or element so plain XML or other markup is not
    # misclassified.
    if re.search(r'<[^>]+>', text):
        if re.search(
            r'href\s*=|src\s*=|class\s*=|id\s*='
            r'|<img\s|<a\s|<div|<p>|<p\s|<!DOCTYPE',
            text,
            re.IGNORECASE,
        ):
            return True

    # Typical self-closing HTML tags.
    if re.search(
        r'<img\s+[^>]*?/>|<br\s*/?>|<hr\s*/?>'
        r'|<input\s+[^>]*?/>|<meta\s+[^>]*?/>|<link\s+[^>]*?/>',
        text,
        re.IGNORECASE,
    ):
        return True

    return False
|
||||
|
||||
|
||||
class Crawler:
|
||||
def crawl(self, url: str) -> Article:
|
||||
# To help LLMs better understand content, we extract clean
|
||||
@@ -29,12 +150,39 @@ class Crawler:
|
||||
logger.error(f"Failed to fetch URL {url} from Jina: {repr(e)}")
|
||||
raise
|
||||
|
||||
# Check if we got valid HTML content
|
||||
if not html or not html.strip():
|
||||
logger.warning(f"Empty content received from URL {url}")
|
||||
article = Article(
|
||||
title="Empty Content",
|
||||
html_content="<p>No content could be extracted from this page</p>"
|
||||
)
|
||||
article.url = url
|
||||
return article
|
||||
|
||||
# Check if content is actually HTML using more robust detection
|
||||
if not is_html_content(html):
|
||||
logger.warning(f"Non-HTML content received from URL {url}, creating fallback article")
|
||||
# Return a simple article with the raw content (safely truncated)
|
||||
article = Article(
|
||||
title="Non-HTML Content",
|
||||
html_content=f"<p>This URL returned content that cannot be parsed as HTML. Raw content: {safe_truncate(html, 500)}</p>"
|
||||
)
|
||||
article.url = url
|
||||
return article
|
||||
|
||||
try:
|
||||
extractor = ReadabilityExtractor()
|
||||
article = extractor.extract_article(html)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to extract article from {url}: {repr(e)}")
|
||||
raise
|
||||
# Fall back to a simple article with the raw HTML (safely truncated)
|
||||
article = Article(
|
||||
title="Content Extraction Failed",
|
||||
html_content=f"<p>Content extraction failed. Raw content: {safe_truncate(html, 500)}</p>"
|
||||
)
|
||||
article.url = url
|
||||
return article
|
||||
|
||||
article.url = url
|
||||
return article
|
||||
|
||||
@@ -3,7 +3,8 @@
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import Annotated
|
||||
from typing import Annotated, Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from langchain_core.tools import tool
|
||||
|
||||
@@ -13,6 +14,14 @@ from .decorators import log_io
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def is_pdf_url(url: Optional[str]) -> bool:
    """Return True when the URL's path component names a PDF file."""
    if not url:
        # Covers both None and the empty string.
        return False
    # Only the path is inspected (case-insensitively), so query parameters
    # such as ``?file=x.pdf`` do not count.
    return urlparse(url).path.lower().endswith('.pdf')
|
||||
|
||||
|
||||
@tool
|
||||
@log_io
|
||||
@@ -20,6 +29,17 @@ def crawl_tool(
|
||||
url: Annotated[str, "The url to crawl."],
|
||||
) -> str:
|
||||
"""Use this to crawl a url and get a readable content in markdown format."""
|
||||
# Special handling for PDF URLs
|
||||
if is_pdf_url(url):
|
||||
logger.info(f"PDF URL detected, skipping crawling: {url}")
|
||||
pdf_message = json.dumps({
|
||||
"url": url,
|
||||
"error": "PDF files cannot be crawled directly. Please download and view the PDF manually.",
|
||||
"crawled_content": None,
|
||||
"is_pdf": True
|
||||
})
|
||||
return pdf_message
|
||||
|
||||
try:
|
||||
crawler = Crawler()
|
||||
article = crawler.crawl(url)
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
import src.crawler as crawler_module
|
||||
from src.crawler.crawler import safe_truncate
|
||||
|
||||
|
||||
def test_crawler_sets_article_url(monkeypatch):
|
||||
@@ -68,3 +69,232 @@ def test_crawler_calls_dependencies(monkeypatch):
|
||||
assert calls["jina"][1] == "html"
|
||||
assert "extractor" in calls
|
||||
assert calls["extractor"] == "<html>dummy</html>"
|
||||
|
||||
|
||||
def test_crawler_handles_empty_content(monkeypatch):
    """Test that the crawler handles empty content gracefully."""
    # NOTE: the previous version defined an unused DummyArticle fixture
    # class here; only the client and the extractor are actually patched.

    class DummyJinaClient:
        def crawl(self, url, return_format=None):
            return ""  # Empty content

    class DummyReadabilityExtractor:
        def extract_article(self, html):
            # Must never run: the crawler should bail out before extraction.
            assert False, "ReadabilityExtractor should not be called for empty content"

    monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
    monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)

    crawler = crawler_module.Crawler()
    url = "http://example.com"
    article = crawler.crawl(url)

    assert article.url == url
    assert article.title == "Empty Content"
    assert "No content could be extracted" in article.html_content
|
||||
|
||||
|
||||
def test_crawler_handles_non_html_content(monkeypatch):
    """Test that the crawler handles non-HTML content gracefully."""
    # NOTE: the previous version defined an unused DummyArticle fixture
    # class here; only the client and the extractor are actually patched.

    class DummyJinaClient:
        def crawl(self, url, return_format=None):
            return "This is plain text content, not HTML"

    class DummyReadabilityExtractor:
        def extract_article(self, html):
            # Must never run: non-HTML input short-circuits extraction.
            assert False, "ReadabilityExtractor should not be called for non-HTML content"

    monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
    monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)

    crawler = crawler_module.Crawler()
    url = "http://example.com"
    article = crawler.crawl(url)

    assert article.url == url
    assert article.title == "Non-HTML Content"
    assert "cannot be parsed as HTML" in article.html_content
    # A snippet of the raw content should be embedded in the fallback article.
    assert "plain text content" in article.html_content
|
||||
|
||||
|
||||
def test_crawler_handles_extraction_failure(monkeypatch):
    """Test that the crawler handles readability extraction failure gracefully."""
    # NOTE: the previous version defined an unused DummyArticle fixture
    # class here; only the client and the extractor are actually patched.

    class DummyJinaClient:
        def crawl(self, url, return_format=None):
            return "<html><body>Valid HTML but extraction will fail</body></html>"

    class DummyReadabilityExtractor:
        def extract_article(self, html):
            raise Exception("Extraction failed")

    monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
    monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)

    crawler = crawler_module.Crawler()
    url = "http://example.com"
    article = crawler.crawl(url)

    # Expects crawl() to fall back to a placeholder article instead of
    # re-raising the extractor error.
    assert article.url == url
    assert article.title == "Content Extraction Failed"
    assert "Content extraction failed" in article.html_content
    # A snippet of the original HTML should be embedded in the fallback.
    assert "Valid HTML but extraction will fail" in article.html_content
|
||||
|
||||
|
||||
def test_crawler_with_json_like_content(monkeypatch):
    """Test that the crawler handles JSON-like content gracefully."""
    # NOTE: the previous version defined an unused DummyArticle fixture
    # class here; only the client and the extractor are actually patched.

    class DummyJinaClient:
        def crawl(self, url, return_format=None):
            return '{"title": "Some JSON", "content": "This is JSON content"}'

    class DummyReadabilityExtractor:
        def extract_article(self, html):
            # Must never run: JSON input should be rejected as non-HTML.
            assert False, "ReadabilityExtractor should not be called for JSON content"

    monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
    monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)

    crawler = crawler_module.Crawler()
    url = "http://example.com/api/data"
    article = crawler.crawl(url)

    assert article.url == url
    assert article.title == "Non-HTML Content"
    assert "cannot be parsed as HTML" in article.html_content
    # A snippet of the JSON should be embedded in the fallback article.
    assert '{"title": "Some JSON"' in article.html_content
|
||||
|
||||
|
||||
def test_crawler_with_various_html_formats(monkeypatch):
    """Test that the crawler correctly identifies various HTML formats."""

    class DummyArticle:
        def __init__(self, title, html_content):
            self.title = title
            self.html_content = html_content
            self.url = None

        def to_markdown(self):
            return f"# {self.title}"

    class DummyReadabilityExtractor:
        def extract_article(self, html):
            return DummyArticle("Extracted Article", "<p>Extracted content</p>")

    def make_jina_client(payload):
        # Build a stub JinaClient class whose crawl() always returns payload.
        class StubJinaClient:
            def crawl(self, url, return_format=None):
                return payload

        return StubJinaClient

    test_cases = [
        ("<!DOCTYPE html><html><body><p>Test content</p></body></html>",
         "HTML with DOCTYPE"),
        ("\n\n <html><body><p>Test content</p></body></html>",
         "HTML with leading whitespace"),
        ("<!-- HTML comment --><html><body><p>Test content</p></body></html>",
         "HTML with comments"),
        ('<img src="test.jpg" alt="test" /><p>Test content</p>',
         "HTML with self-closing tags"),
    ]

    # Each payload must be recognized as HTML and routed to the extractor.
    for payload, description in test_cases:
        monkeypatch.setattr("src.crawler.crawler.JinaClient", make_jina_client(payload))
        monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)

        crawler = crawler_module.Crawler()
        url = "http://example.com"
        article = crawler.crawl(url)

        assert article.url == url
        assert article.title == "Extracted Article"
        assert "Extracted content" in article.html_content
|
||||
|
||||
|
||||
def test_safe_truncate_function():
    """Test the safe_truncate function handles various character sets correctly."""

    # Degenerate inputs pass straight through.
    assert safe_truncate(None) is None
    assert safe_truncate("") == ""
    assert safe_truncate("Short text") == "Short text"

    # ASCII truncation honours the limit and marks the cut.
    truncated = safe_truncate("This is a longer text that needs truncation", 20)
    assert len(truncated) <= 20
    assert "..." in truncated

    # Emoji must not be cut mid-character; the result stays valid UTF-8.
    truncated = safe_truncate("Hello! 🌍 Welcome to the world 🚀", 20)
    assert len(truncated) <= 20
    assert "..." in truncated
    assert truncated.encode('utf-8').decode('utf-8') == truncated

    # Limits smaller than the placeholder yield a placeholder prefix.
    for limit, expected in ((1, "."), (2, ".."), (3, "...")):
        assert safe_truncate("Long text", limit) == expected

    # Chinese characters survive truncation as valid UTF-8.
    truncated = safe_truncate("这是一个中文测试文本", 10)
    assert len(truncated) <= 10
    assert truncated.encode('utf-8').decode('utf-8') == truncated
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import json
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
from src.tools.crawl import crawl_tool
|
||||
from src.tools.crawl import crawl_tool, is_pdf_url
|
||||
|
||||
|
||||
class TestCrawlTool:
|
||||
@@ -131,3 +131,86 @@ class TestCrawlTool:
|
||||
assert result_dict["url"] == url
|
||||
assert "crawled_content" in result_dict
|
||||
assert "No content available" in result_dict["crawled_content"]
|
||||
|
||||
|
||||
class TestPDFHandling:
    """Test PDF URL detection and handling for issue #701."""

    def test_is_pdf_url_with_pdf_urls(self):
        """Test that PDF URLs are correctly identified."""
        pdf_urls = [
            "https://example.com/document.pdf",
            "https://example.com/file.PDF",  # Case insensitive
            "https://example.com/path/to/report.pdf",
            "https://pdf.dfcfw.com/pdf/H3_AP202503071644153386_1.pdf",  # URL from issue
            "http://site.com/path/document.pdf?param=value",  # With query params
        ]
        for candidate in pdf_urls:
            assert is_pdf_url(candidate) is True, f"Failed for URL: {candidate}"

    def test_is_pdf_url_with_non_pdf_urls(self):
        """Test that non-PDF URLs are correctly identified."""
        non_pdf_urls = [
            "https://example.com/page.html",
            "https://example.com/article.php",
            "https://example.com/",
            "https://example.com/document.pdfx",  # Not exactly .pdf
            "https://example.com/document.doc",
            "https://example.com/document.txt",
            "https://example.com?file=document.pdf",  # Query param, not path
            "",  # Empty string
            None,  # None value
        ]
        for candidate in non_pdf_urls:
            assert is_pdf_url(candidate) is False, f"Failed for URL: {candidate}"

    def test_crawl_tool_with_pdf_url(self):
        """Test that PDF URLs return the expected error structure."""
        pdf_url = "https://example.com/document.pdf"

        result = crawl_tool(pdf_url)

        assert isinstance(result, str)
        payload = json.loads(result)

        # The PDF short-circuit response carries the url, an error message,
        # no crawled content, and the is_pdf marker.
        assert payload["url"] == pdf_url
        assert "error" in payload
        assert payload["crawled_content"] is None
        assert payload["is_pdf"] is True
        assert "PDF files cannot be crawled directly" in payload["error"]

    def test_crawl_tool_with_issue_pdf_url(self):
        """Test with the exact PDF URL from issue #701."""
        issue_pdf_url = "https://pdf.dfcfw.com/pdf/H3_AP202503071644153386_1.pdf"

        payload = json.loads(crawl_tool(issue_pdf_url))

        assert payload["url"] == issue_pdf_url
        assert payload["is_pdf"] is True
        assert "cannot be crawled directly" in payload["error"]

    @patch("src.tools.crawl.Crawler")
    @patch("src.tools.crawl.logger")
    def test_crawl_tool_skips_crawler_for_pdfs(self, mock_logger, mock_crawler_class):
        """Test that the crawler is not instantiated for PDF URLs."""
        pdf_url = "https://example.com/document.pdf"

        result = crawl_tool(pdf_url)

        # The PDF short-circuit must fire before any Crawler is created,
        # and it must log exactly one info message.
        mock_crawler_class.assert_not_called()
        mock_logger.info.assert_called_once_with(f"PDF URL detected, skipping crawling: {pdf_url}")

        payload = json.loads(result)
        assert payload["is_pdf"] is True
|
||||
|
||||
Reference in New Issue
Block a user