diff --git a/src/crawler/crawler.py b/src/crawler/crawler.py index 7e5bfbd..8109586 100644 --- a/src/crawler/crawler.py +++ b/src/crawler/crawler.py @@ -1,6 +1,7 @@ # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates # SPDX-License-Identifier: MIT +import re import logging from .article import Article @@ -10,6 +11,126 @@ from .readability_extractor import ReadabilityExtractor logger = logging.getLogger(__name__) +def safe_truncate(text: str, max_length: int = 500) -> str: + """ + Safely truncate text to a maximum length without breaking multi-byte characters. + + Args: + text: The text to truncate + max_length: Maximum number of characters to keep + + Returns: + Truncated text that is safe to use without encoding issues + """ + if text is None: + return None + + if len(text) <= max_length: + return text + + # Ensure max_length is at least 3 to accommodate the placeholder + if max_length < 3: + return "..."[:max_length] + + # Use Python's built-in textwrap.shorten which handles unicode safely + try: + import textwrap + return textwrap.shorten(text, width=max_length, placeholder="...") + except (ImportError, TypeError): + # Fallback for older Python versions or if textwrap.shorten has issues + # Truncate to max_length - 3 to make room for "..." + truncated = text[:max_length - 3] + # Remove any incomplete Unicode surrogate pair + while truncated and ord(truncated[-1]) >= 0xD800 and ord(truncated[-1]) <= 0xDFFF: + truncated = truncated[:-1] + return truncated + "..." + + +def is_html_content(content: str) -> bool: + """ + Check if the provided content is HTML. + + Uses a more robust detection method that checks for common HTML patterns + including DOCTYPE declarations, HTML tags, and other HTML markers. + """ + if not content or not content.strip(): + return False + + content = content.strip() + + # Check for HTML comments + if content.startswith('' in content: + return True + + # Check for DOCTYPE declarations (case insensitive) + if re.match(r'^', + r'^]+>', content): + # Additional check: ensure it's not just XML or other markup + # Look for common HTML attributes or elements + html_indicators = [ + r'href\s*=', + r'src\s*=', + r'class\s*=', + r'id\s*=', + r'', + r']*?/>', + r'', + r'', + r']*?/>', + r']*?/>', + r']*?/>', + ] + + for tag in self_closing_tags: + if re.search(tag, content, re.IGNORECASE): + return True + + return False + + class Crawler: def crawl(self, url: str) -> Article: # To help LLMs better understand content, we extract clean @@ -29,12 +150,39 @@ class Crawler: logger.error(f"Failed to fetch URL {url} from Jina: {repr(e)}") raise + # Check if we got valid HTML content + if not html or not html.strip(): + logger.warning(f"Empty content received from URL {url}") + article = Article( + title="Empty Content", + html_content="

No content could be extracted from this page

" + ) + article.url = url + return article + + # Check if content is actually HTML using more robust detection + if not is_html_content(html): + logger.warning(f"Non-HTML content received from URL {url}, creating fallback article") + # Return a simple article with the raw content (safely truncated) + article = Article( + title="Non-HTML Content", + html_content=f"

This URL returned content that cannot be parsed as HTML. Raw content: {safe_truncate(html, 500)}

" + ) + article.url = url + return article + try: extractor = ReadabilityExtractor() article = extractor.extract_article(html) except Exception as e: logger.error(f"Failed to extract article from {url}: {repr(e)}") - raise + # Fall back to a simple article with the raw HTML (safely truncated) + article = Article( + title="Content Extraction Failed", + html_content=f"

Content extraction failed. Raw content: {safe_truncate(html, 500)}

" + ) + article.url = url + return article article.url = url return article diff --git a/src/tools/crawl.py b/src/tools/crawl.py index fbb349e..da79f86 100644 --- a/src/tools/crawl.py +++ b/src/tools/crawl.py @@ -3,7 +3,8 @@ import json import logging -from typing import Annotated +from typing import Annotated, Optional +from urllib.parse import urlparse from langchain_core.tools import tool @@ -13,6 +14,14 @@ from .decorators import log_io logger = logging.getLogger(__name__) +def is_pdf_url(url: Optional[str]) -> bool: + """Check if the URL points to a PDF file.""" + if not url: + return False + parsed_url = urlparse(url) + # Check if the path ends with .pdf (case insensitive) + return parsed_url.path.lower().endswith('.pdf') + @tool @log_io @@ -20,6 +29,17 @@ def crawl_tool( url: Annotated[str, "The url to crawl."], ) -> str: """Use this to crawl a url and get a readable content in markdown format.""" + # Special handling for PDF URLs + if is_pdf_url(url): + logger.info(f"PDF URL detected, skipping crawling: {url}") + pdf_message = json.dumps({ + "url": url, + "error": "PDF files cannot be crawled directly. Please download and view the PDF manually.", + "crawled_content": None, + "is_pdf": True + }) + return pdf_message + try: crawler = Crawler() article = crawler.crawl(url) diff --git a/tests/unit/crawler/test_crawler_class.py b/tests/unit/crawler/test_crawler_class.py index 6f6429a..19fef56 100644 --- a/tests/unit/crawler/test_crawler_class.py +++ b/tests/unit/crawler/test_crawler_class.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: MIT import src.crawler as crawler_module +from src.crawler.crawler import safe_truncate def test_crawler_sets_article_url(monkeypatch): @@ -68,3 +69,232 @@ def test_crawler_calls_dependencies(monkeypatch): assert calls["jina"][1] == "html" assert "extractor" in calls assert calls["extractor"] == "dummy" + + +def test_crawler_handles_empty_content(monkeypatch): + """Test that the crawler handles empty content gracefully.""" + + class DummyArticle: + def __init__(self, title, html_content): + self.title = title + self.html_content = html_content + self.url = None + + def to_markdown(self): + return f"# {self.title}" + + class DummyJinaClient: + def crawl(self, url, return_format=None): + return "" # Empty content + + class DummyReadabilityExtractor: + def extract_article(self, html): + # This should not be called for empty content + assert False, "ReadabilityExtractor should not be called for empty content" + + monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient) + monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor) + + crawler = crawler_module.Crawler() + url = "http://example.com" + article = crawler.crawl(url) + + assert article.url == url + assert article.title == "Empty Content" + assert "No content could be extracted" in article.html_content + + +def test_crawler_handles_non_html_content(monkeypatch): + """Test that the crawler handles non-HTML content gracefully.""" + + class DummyArticle: + def __init__(self, title, html_content): + self.title = title + self.html_content = html_content + self.url = None + + def to_markdown(self): + return f"# {self.title}" + + class DummyJinaClient: + def crawl(self, url, return_format=None): + return "This is plain text content, not HTML" + + class DummyReadabilityExtractor: + def extract_article(self, html): + # This should not be called for non-HTML content + assert False, "ReadabilityExtractor should not be called for non-HTML content" + + monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient) + monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor) + + crawler = crawler_module.Crawler() + url = "http://example.com" + article = crawler.crawl(url) + + assert article.url == url + assert article.title == "Non-HTML Content" + assert "cannot be parsed as HTML" in article.html_content + assert "plain text content" in article.html_content # Should include a snippet of the original content + + +def test_crawler_handles_extraction_failure(monkeypatch): + """Test that the crawler handles readability extraction failure gracefully.""" + + class DummyArticle: + def __init__(self, title, html_content): + self.title = title + self.html_content = html_content + self.url = None + + def to_markdown(self): + return f"# {self.title}" + + class DummyJinaClient: + def crawl(self, url, return_format=None): + return "Valid HTML but extraction will fail" + + class DummyReadabilityExtractor: + def extract_article(self, html): + raise Exception("Extraction failed") + + monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient) + monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor) + + crawler = crawler_module.Crawler() + url = "http://example.com" + article = crawler.crawl(url) + + assert article.url == url + assert article.title == "Content Extraction Failed" + assert "Content extraction failed" in article.html_content + assert "Valid HTML but extraction will fail" in article.html_content # Should include a snippet of the HTML + + +def test_crawler_with_json_like_content(monkeypatch): + """Test that the crawler handles JSON-like content gracefully.""" + + class DummyArticle: + def __init__(self, title, html_content): + self.title = title + self.html_content = html_content + self.url = None + + def to_markdown(self): + return f"# {self.title}" + + class DummyJinaClient: + def crawl(self, url, return_format=None): + return '{"title": "Some JSON", "content": "This is JSON content"}' + + class DummyReadabilityExtractor: + def extract_article(self, html): + # This should not be called for JSON content + assert False, "ReadabilityExtractor should not be called for JSON content" + + monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient) + monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor) + + crawler = crawler_module.Crawler() + url = "http://example.com/api/data" + article = crawler.crawl(url) + + assert article.url == url + assert article.title == "Non-HTML Content" + assert "cannot be parsed as HTML" in article.html_content + assert '{"title": "Some JSON"' in article.html_content # Should include a snippet of the JSON + + +def test_crawler_with_various_html_formats(monkeypatch): + """Test that the crawler correctly identifies various HTML formats.""" + + class DummyArticle: + def __init__(self, title, html_content): + self.title = title + self.html_content = html_content + self.url = None + + def to_markdown(self): + return f"# {self.title}" + + # Test case 1: HTML with DOCTYPE + class DummyJinaClient1: + def crawl(self, url, return_format=None): + return "

Test content

" + + # Test case 2: HTML with leading whitespace + class DummyJinaClient2: + def crawl(self, url, return_format=None): + return "\n\n

Test content

" + + # Test case 3: HTML with comments + class DummyJinaClient3: + def crawl(self, url, return_format=None): + return "

Test content

" + + # Test case 4: HTML with self-closing tags + class DummyJinaClient4: + def crawl(self, url, return_format=None): + return 'test

Test content

' + + class DummyReadabilityExtractor: + def extract_article(self, html): + return DummyArticle("Extracted Article", "

Extracted content

") + + # Test each HTML format + test_cases = [ + (DummyJinaClient1, "HTML with DOCTYPE"), + (DummyJinaClient2, "HTML with leading whitespace"), + (DummyJinaClient3, "HTML with comments"), + (DummyJinaClient4, "HTML with self-closing tags"), + ] + + for JinaClientClass, description in test_cases: + monkeypatch.setattr("src.crawler.crawler.JinaClient", JinaClientClass) + monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor) + + crawler = crawler_module.Crawler() + url = "http://example.com" + article = crawler.crawl(url) + + assert article.url == url + assert article.title == "Extracted Article" + assert "Extracted content" in article.html_content + + +def test_safe_truncate_function(): + """Test the safe_truncate function handles various character sets correctly.""" + + # Test None input + assert safe_truncate(None) is None + + # Test empty string + assert safe_truncate("") == "" + + # Test string shorter than limit + assert safe_truncate("Short text") == "Short text" + + # Test ASCII truncation + result = safe_truncate("This is a longer text that needs truncation", 20) + assert len(result) <= 20 + assert "..." in result + + # Test Unicode/emoji characters + text_with_emoji = "Hello! 🌍 Welcome to the world 🚀" + result = safe_truncate(text_with_emoji, 20) + assert len(result) <= 20 + assert "..." in result + # Verify it's valid UTF-8 + assert result.encode('utf-8').decode('utf-8') == result + + # Test very small limit + assert safe_truncate("Long text", 1) == "." + assert safe_truncate("Long text", 2) == ".." + assert safe_truncate("Long text", 3) == "..." + + # Test with Chinese characters + chinese_text = "这是一个中文测试文本" + result = safe_truncate(chinese_text, 10) + assert len(result) <= 10 + # Verify it's valid UTF-8 + assert result.encode('utf-8').decode('utf-8') == result diff --git a/tests/unit/tools/test_crawl.py b/tests/unit/tools/test_crawl.py index 15e510a..e75ffb3 100644 --- a/tests/unit/tools/test_crawl.py +++ b/tests/unit/tools/test_crawl.py @@ -1,7 +1,7 @@ import json from unittest.mock import Mock, patch -from src.tools.crawl import crawl_tool +from src.tools.crawl import crawl_tool, is_pdf_url class TestCrawlTool: @@ -131,3 +131,86 @@ class TestCrawlTool: assert result_dict["url"] == url assert "crawled_content" in result_dict assert "No content available" in result_dict["crawled_content"] + + +class TestPDFHandling: + """Test PDF URL detection and handling for issue #701.""" + + def test_is_pdf_url_with_pdf_urls(self): + """Test that PDF URLs are correctly identified.""" + test_cases = [ + ("https://example.com/document.pdf", True), + ("https://example.com/file.PDF", True), # Case insensitive + ("https://example.com/path/to/report.pdf", True), + ("https://pdf.dfcfw.com/pdf/H3_AP202503071644153386_1.pdf", True), # URL from issue + ("http://site.com/path/document.pdf?param=value", True), # With query params + ] + + for url, expected in test_cases: + assert is_pdf_url(url) == expected, f"Failed for URL: {url}" + + def test_is_pdf_url_with_non_pdf_urls(self): + """Test that non-PDF URLs are correctly identified.""" + test_cases = [ + ("https://example.com/page.html", False), + ("https://example.com/article.php", False), + ("https://example.com/", False), + ("https://example.com/document.pdfx", False), # Not exactly .pdf + ("https://example.com/document.doc", False), + ("https://example.com/document.txt", False), + ("https://example.com?file=document.pdf", False), # Query param, not path + ("", False), # Empty string + (None, False), # None value + ] + + for url, expected in test_cases: + assert is_pdf_url(url) == expected, f"Failed for URL: {url}" + + def test_crawl_tool_with_pdf_url(self): + """Test that PDF URLs return the expected error structure.""" + pdf_url = "https://example.com/document.pdf" + + # Act + result = crawl_tool(pdf_url) + + # Assert + assert isinstance(result, str) + result_dict = json.loads(result) + + # Check structure of PDF error response + assert result_dict["url"] == pdf_url + assert "error" in result_dict + assert result_dict["crawled_content"] is None + assert result_dict["is_pdf"] is True + assert "PDF files cannot be crawled directly" in result_dict["error"] + + def test_crawl_tool_with_issue_pdf_url(self): + """Test with the exact PDF URL from issue #701.""" + issue_pdf_url = "https://pdf.dfcfw.com/pdf/H3_AP202503071644153386_1.pdf" + + # Act + result = crawl_tool(issue_pdf_url) + + # Assert + result_dict = json.loads(result) + assert result_dict["url"] == issue_pdf_url + assert result_dict["is_pdf"] is True + assert "cannot be crawled directly" in result_dict["error"] + + @patch("src.tools.crawl.Crawler") + @patch("src.tools.crawl.logger") + def test_crawl_tool_skips_crawler_for_pdfs(self, mock_logger, mock_crawler_class): + """Test that the crawler is not instantiated for PDF URLs.""" + pdf_url = "https://example.com/document.pdf" + + # Act + result = crawl_tool(pdf_url) + + # Assert + # Crawler should not be instantiated for PDF URLs + mock_crawler_class.assert_not_called() + mock_logger.info.assert_called_once_with(f"PDF URL detected, skipping crawling: {pdf_url}") + + # Should return proper PDF error structure + result_dict = json.loads(result) + assert result_dict["is_pdf"] is True