mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-12 18:14:46 +08:00
fix: the crawling error when encountering PDF URLs (#707)
* fix: the crawling error when encountering PDF URLs * Added the unit test for the new feature of crawl tool * fix: address the code review problems * fix: address the code review problems
This commit is contained in:
@@ -3,7 +3,8 @@
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import Annotated
|
||||
from typing import Annotated, Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from langchain_core.tools import tool
|
||||
|
||||
@@ -13,6 +14,14 @@ from .decorators import log_io
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def is_pdf_url(url: Optional[str]) -> bool:
    """Check if the URL points to a PDF file.

    The check is purely lexical: only the path component of the URL is
    inspected, so query strings and fragments are ignored and no network
    request is made.

    Args:
        url: The URL to inspect. ``None`` and the empty string are accepted.

    Returns:
        True if the URL's path ends with ``.pdf`` (case insensitive).
        False for a missing/empty URL, for a URL that cannot be parsed,
        and for any other path.
    """
    if not url:
        return False
    try:
        parsed_url = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed inputs (for example an unbalanced
        # "[" in the host part). A predicate should not raise for those;
        # an unparseable URL is simply not treated as a PDF link.
        return False
    # Check if the path ends with .pdf (case insensitive)
    return parsed_url.path.lower().endswith(".pdf")
|
||||
|
||||
|
||||
@tool
|
||||
@log_io
|
||||
@@ -20,6 +29,17 @@ def crawl_tool(
|
||||
url: Annotated[str, "The url to crawl."],
|
||||
) -> str:
|
||||
"""Use this to crawl a url and get a readable content in markdown format."""
|
||||
# Special handling for PDF URLs
|
||||
if is_pdf_url(url):
|
||||
logger.info(f"PDF URL detected, skipping crawling: {url}")
|
||||
pdf_message = json.dumps({
|
||||
"url": url,
|
||||
"error": "PDF files cannot be crawled directly. Please download and view the PDF manually.",
|
||||
"crawled_content": None,
|
||||
"is_pdf": True
|
||||
})
|
||||
return pdf_message
|
||||
|
||||
try:
|
||||
crawler = Crawler()
|
||||
article = crawler.crawl(url)
|
||||
|
||||
Reference in New Issue
Block a user