fix: crawling error when encountering PDF URLs (#707)

* fix: crawling error when encountering PDF URLs

* Add unit tests for the new crawl tool behavior

* fix: address code review comments

Author: Willem Jiang
Date: 2025-11-25 09:24:52 +08:00
Committed by: GitHub
Parent: da514337da
Commit: bec97f02ae
4 changed files with 484 additions and 3 deletions


@@ -1,6 +1,7 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
import re
import logging
from .article import Article
@@ -10,6 +11,126 @@ from .readability_extractor import ReadabilityExtractor
logger = logging.getLogger(__name__)
def safe_truncate(text: str, max_length: int = 500) -> str:
    """
    Safely truncate text to a maximum length without breaking multi-byte characters.

    Args:
        text: The text to truncate
        max_length: Maximum number of characters to keep

    Returns:
        Truncated text that is safe to use without encoding issues
    """
    if text is None:
        return None
    if len(text) <= max_length:
        return text
    # Ensure max_length is at least 3 to accommodate the placeholder
    if max_length < 3:
        return "..."[:max_length]
    # Use Python's built-in textwrap.shorten which handles unicode safely
    try:
        import textwrap

        return textwrap.shorten(text, width=max_length, placeholder="...")
    except (ImportError, TypeError):
        # Fallback for older Python versions or if textwrap.shorten has issues
        # Truncate to max_length - 3 to make room for "..."
        truncated = text[:max_length - 3]
        # Remove any incomplete Unicode surrogate pair
        while truncated and ord(truncated[-1]) >= 0xD800 and ord(truncated[-1]) <= 0xDFFF:
            truncated = truncated[:-1]
        return truncated + "..."
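
For reference, a minimal usage sketch of safe_truncate (illustration only, not part of the diff; the import path below is hypothetical):

# Hypothetical import path; adjust to wherever this module actually lives.
from src.crawler.crawler import safe_truncate

assert safe_truncate(None) is None                    # None passes through unchanged
assert safe_truncate("short") == "short"              # under the limit: returned as-is
result = safe_truncate("word " * 200, max_length=50)
assert len(result) <= 50 and result.endswith("...")   # shortened at a word boundary
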
def is_html_content(content: str) -> bool:
    """
    Check if the provided content is HTML.

    Uses a more robust detection method that checks for common HTML patterns
    including DOCTYPE declarations, HTML tags, and other HTML markers.
    """
    if not content or not content.strip():
        return False
    content = content.strip()
    # Check for HTML comments
    if content.startswith('<!--') and '-->' in content:
        return True
    # Check for DOCTYPE declarations (case insensitive)
    if re.match(r'^<!DOCTYPE\s+html', content, re.IGNORECASE):
        return True
    # Check for XML declarations followed by HTML
    if content.startswith('<?xml') and '<html' in content:
        return True
    # Check for common HTML tags at the beginning
    html_start_patterns = [
        r'^<html',
        r'^<head',
        r'^<body',
        r'^<title',
        r'^<meta',
        r'^<link',
        r'^<script',
        r'^<style',
        r'^<div',
        r'^<p>',
        r'^<p\s',
        r'^<span',
        r'^<h[1-6]',
        r'^<!DOCTYPE',
        r'^<\!DOCTYPE',  # Some variations
    ]
    for pattern in html_start_patterns:
        if re.match(pattern, content, re.IGNORECASE):
            return True
    # Check for any HTML-like tags in the content (more permissive)
    if re.search(r'<[^>]+>', content):
        # Additional check: ensure it's not just XML or other markup
        # Look for common HTML attributes or elements
        html_indicators = [
            r'href\s*=',
            r'src\s*=',
            r'class\s*=',
            r'id\s*=',
            r'<img\s',
            r'<a\s',
            r'<div',
            r'<p>',
            r'<p\s',
            r'<!DOCTYPE',
        ]
        for indicator in html_indicators:
            if re.search(indicator, content, re.IGNORECASE):
                return True
        # Also check for self-closing HTML tags
        self_closing_tags = [
            r'<img\s+[^>]*?/>',
            r'<br\s*/?>',
            r'<hr\s*/?>',
            r'<input\s+[^>]*?/>',
            r'<meta\s+[^>]*?/>',
            r'<link\s+[^>]*?/>',
        ]
        for tag in self_closing_tags:
            if re.search(tag, content, re.IGNORECASE):
                return True
    return False
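
A quick illustration (not part of the diff) of how is_html_content classifies typical payloads:

assert is_html_content("<!DOCTYPE html><html><body>hi</body></html>")  # DOCTYPE fast path
assert is_html_content('<a href="https://example.com">link</a>')       # tag plus href indicator
assert not is_html_content("just plain text")                          # no tags at all
assert not is_html_content("")                                         # empty input
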
class Crawler:
    def crawl(self, url: str) -> Article:
        # To help LLMs better understand content, we extract clean
@@ -29,12 +150,39 @@ class Crawler:
logger.error(f"Failed to fetch URL {url} from Jina: {repr(e)}")
raise
# Check if we got valid HTML content
if not html or not html.strip():
logger.warning(f"Empty content received from URL {url}")
article = Article(
title="Empty Content",
html_content="<p>No content could be extracted from this page</p>"
)
article.url = url
return article
# Check if content is actually HTML using more robust detection
if not is_html_content(html):
logger.warning(f"Non-HTML content received from URL {url}, creating fallback article")
# Return a simple article with the raw content (safely truncated)
article = Article(
title="Non-HTML Content",
html_content=f"<p>This URL returned content that cannot be parsed as HTML. Raw content: {safe_truncate(html, 500)}</p>"
)
article.url = url
return article
try:
extractor = ReadabilityExtractor()
article = extractor.extract_article(html)
except Exception as e:
logger.error(f"Failed to extract article from {url}: {repr(e)}")
raise
# Fall back to a simple article with the raw HTML (safely truncated)
article = Article(
title="Content Extraction Failed",
html_content=f"<p>Content extraction failed. Raw content: {safe_truncate(html, 500)}</p>"
)
article.url = url
return article
article.url = url
return article
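
Net effect: crawl() now degrades gracefully instead of raising on empty, non-HTML, or unparseable responses. A hypothetical session, assuming a configured fetcher and network access (the URL is illustrative):

crawler = Crawler()
article = crawler.crawl("https://example.com/api/data.json")  # hypothetical non-HTML endpoint
print(article.title)  # "Non-HTML Content"; empty bodies yield "Empty Content",
                      # extraction failures yield "Content Extraction Failed"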


@@ -3,7 +3,8 @@
import json
import logging
from typing import Annotated, Optional
from urllib.parse import urlparse
from langchain_core.tools import tool
@@ -13,6 +14,14 @@ from .decorators import log_io
logger = logging.getLogger(__name__)
def is_pdf_url(url: Optional[str]) -> bool:
    """Check if the URL points to a PDF file."""
    if not url:
        return False
    parsed_url = urlparse(url)
    # Check if the path ends with .pdf (case insensitive)
    return parsed_url.path.lower().endswith('.pdf')
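
Because urlparse separates the path from the query string, is_pdf_url keys off the path alone. A sanity check (illustration only):

assert is_pdf_url("https://example.com/paper.pdf")
assert is_pdf_url("https://example.com/paper.PDF?download=1")  # case-insensitive; query ignored
assert not is_pdf_url("https://example.com/page?file=x.pdf")   # ".pdf" appears only in the query
assert not is_pdf_url(None)                                    # guarded before parsing
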
@tool
@log_io
@@ -20,6 +29,17 @@ def crawl_tool(
url: Annotated[str, "The url to crawl."],
) -> str:
"""Use this to crawl a url and get a readable content in markdown format."""
# Special handling for PDF URLs
if is_pdf_url(url):
logger.info(f"PDF URL detected, skipping crawling: {url}")
pdf_message = json.dumps({
"url": url,
"error": "PDF files cannot be crawled directly. Please download and view the PDF manually.",
"crawled_content": None,
"is_pdf": True
})
return pdf_message
try:
crawler = Crawler()
article = crawler.crawl(url)
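
Taken together, crawl_tool now short-circuits on PDF links before any network fetch. A minimal sketch of the expected round trip, assuming the tool is invoked directly (the example URL is hypothetical; .invoke() is the standard LangChain tool entry point):

import json

result = crawl_tool.invoke({"url": "https://example.com/whitepaper.pdf"})
payload = json.loads(result)
assert payload["is_pdf"] is True           # flagged without fetching the file
assert payload["crawled_content"] is None  # nothing was crawled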