fix: the crawling error when encountering PDF URLs (#707)

* fix: the crawling error when encountering PDF URLs
* Added the unit test for the new feature of crawl tool
* fix: address the code review problems
* fix: address the code review problems
2026-04-12 18:14:46 +08:00 · 2025-11-25 09:24:52 +08:00
parent da514337da
commit bec97f02ae
4 changed files with 484 additions and 3 deletions
--- a/src/tools/crawl.py
+++ b/src/tools/crawl.py
@@ -3,7 +3,8 @@

 import json
 import logging
-from typing import Annotated
+from typing import Annotated, Optional
+from urllib.parse import urlparse

 from langchain_core.tools import tool

@@ -13,6 +14,14 @@ from .decorators import log_io

 logger = logging.getLogger(__name__)

+def is_pdf_url(url: Optional[str]) -> bool:
+    """Return True when the path component of *url* ends in ``.pdf``.
+
+    The comparison is case-insensitive and looks only at the URL path, so a
+    query string or fragment that merely mentions ``.pdf`` does not count.
+
+    Args:
+        url: The URL to inspect. ``None`` and the empty string are accepted.
+
+    Returns:
+        ``True`` if the URL path ends with ``.pdf``; ``False`` otherwise,
+        including when *url* is empty or cannot be parsed.
+    """
+    if not url:
+        return False
+    try:
+        path = urlparse(url).path
+    except ValueError:
+        # urlparse rejects some malformed URLs (e.g. an unbalanced "[" in the
+        # host). crawl_tool calls this check before entering its try block,
+        # so answer "not a PDF" instead of letting the exception escape here.
+        return False
+    return path.lower().endswith(".pdf")
+

@tool
@log_io
@@ -20,6 +29,17 @@ def crawl_tool(
    url: Annotated[str, "The url to crawl."],
 ) -> str:
    """Use this to crawl a url and get a readable content in markdown format."""
+    # Special handling for PDF URLs
+    if is_pdf_url(url):
+        logger.info(f"PDF URL detected, skipping crawling: {url}")
+        pdf_message = json.dumps({
+            "url": url,
+            "error": "PDF files cannot be crawled directly. Please download and view the PDF manually.",
+            "crawled_content": None,
+            "is_pdf": True
+        })
+        return pdf_message
+    
    try:
        crawler = Crawler()
        article = crawler.crawl(url)