fix: the crawling error when encountering PDF URLs (#707)

* fix: the crawling error when encountering PDF URLs

* Added the unit test for the new feature of the crawl tool

* fix: address the code review problems

* fix: address the code review problems
This commit is contained in:
Willem Jiang
2025-11-25 09:24:52 +08:00
committed by GitHub
parent da514337da
commit bec97f02ae
4 changed files with 484 additions and 3 deletions

View File

@@ -3,7 +3,8 @@
import json
import logging
from typing import Annotated
from typing import Annotated, Optional
from urllib.parse import urlparse
from langchain_core.tools import tool
@@ -13,6 +14,14 @@ from .decorators import log_io
logger = logging.getLogger(__name__)
def is_pdf_url(url: Optional[str]) -> bool:
    """Check whether the URL's path component ends with ``.pdf``.

    Only the parsed path is inspected, so a query string or fragment that
    follows the file name does not affect the result. The comparison is
    case insensitive.

    Args:
        url: The URL to inspect. May be ``None`` or an empty string.

    Returns:
        ``True`` if the parsed path ends with ``.pdf``; ``False`` otherwise,
        including for ``None``, empty, or unparseable URLs.
    """
    if not url:
        return False
    try:
        parsed_url = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. an unbalanced "[" in
        # the host). Such a URL cannot be identified as a PDF link, so
        # report False instead of letting the exception escape.
        return False
    # Check if the path ends with .pdf (case insensitive)
    return parsed_url.path.lower().endswith(".pdf")
@tool
@log_io
@@ -20,6 +29,17 @@ def crawl_tool(
url: Annotated[str, "The url to crawl."],
) -> str:
"""Use this to crawl a url and get a readable content in markdown format."""
# Special handling for PDF URLs
if is_pdf_url(url):
logger.info(f"PDF URL detected, skipping crawling: {url}")
pdf_message = json.dumps({
"url": url,
"error": "PDF files cannot be crawled directly. Please download and view the PDF manually.",
"crawled_content": None,
"is_pdf": True
})
return pdf_message
try:
crawler = Crawler()
article = crawler.crawl(url)