diff --git a/src/crawler/article.py b/src/crawler/article.py index fd0c95e..a56df67 100644 --- a/src/crawler/article.py +++ b/src/crawler/article.py @@ -18,20 +18,36 @@ class Article: markdown = "" if including_title: markdown += f"# {self.title}\n\n" - markdown += md(self.html_content) + + if self.html_content is None or not str(self.html_content).strip(): + markdown += "*No content available*\n" + else: + markdown += md(self.html_content) + return markdown def to_message(self) -> list[dict]: image_pattern = r"!\[.*?\]\((.*?)\)" content: list[dict[str, str]] = [] - parts = re.split(image_pattern, self.to_markdown()) + markdown = self.to_markdown() + + if not markdown or not markdown.strip(): + return [{"type": "text", "text": "No content available"}] + + parts = re.split(image_pattern, markdown) for i, part in enumerate(parts): if i % 2 == 1: image_url = urljoin(self.url, part.strip()) content.append({"type": "image_url", "image_url": {"url": image_url}}) else: - content.append({"type": "text", "text": part.strip()}) + text_part = part.strip() + if text_part: + content.append({"type": "text", "text": text_part}) + # If after processing all parts, content is still empty, provide a fallback message. + if not content: + content = [{"type": "text", "text": "No content available"}] + return content diff --git a/src/crawler/crawler.py b/src/crawler/crawler.py index 01a2a6e..7e5bfbd 100644 --- a/src/crawler/crawler.py +++ b/src/crawler/crawler.py @@ -1,11 +1,14 @@ # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates # SPDX-License-Identifier: MIT +import logging from .article import Article from .jina_client import JinaClient from .readability_extractor import ReadabilityExtractor +logger = logging.getLogger(__name__) + class Crawler: def crawl(self, url: str) -> Article: @@ -19,9 +22,19 @@ class Crawler: # # Instead of using Jina's own markdown converter, we'll use # our own solution to get better readability results. - jina_client = JinaClient() - html = jina_client.crawl(url, return_format="html") - extractor = ReadabilityExtractor() - article = extractor.extract_article(html) + try: + jina_client = JinaClient() + html = jina_client.crawl(url, return_format="html") + except Exception as e: + logger.error(f"Failed to fetch URL {url} from Jina: {repr(e)}") + raise + + try: + extractor = ReadabilityExtractor() + article = extractor.extract_article(html) + except Exception as e: + logger.error(f"Failed to extract article from {url}: {repr(e)}") + raise + article.url = url return article diff --git a/src/crawler/jina_client.py b/src/crawler/jina_client.py index 8cd81ed..5cb5a08 100644 --- a/src/crawler/jina_client.py +++ b/src/crawler/jina_client.py @@ -23,4 +23,11 @@ class JinaClient: ) data = {"url": url} response = requests.post("https://r.jina.ai/", headers=headers, json=data) + + if response.status_code != 200: + raise ValueError(f"Jina API returned status {response.status_code}: {response.text}") + + if not response.text or not response.text.strip(): + raise ValueError("Jina API returned empty response") + return response.text diff --git a/src/crawler/readability_extractor.py b/src/crawler/readability_extractor.py index a3a22d0..87b3b97 100644 --- a/src/crawler/readability_extractor.py +++ b/src/crawler/readability_extractor.py @@ -1,15 +1,28 @@ # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates # SPDX-License-Identifier: MIT +import logging from readabilipy import simple_json_from_html_string from .article import Article +logger = logging.getLogger(__name__) + class ReadabilityExtractor: def extract_article(self, html: str) -> Article: article = simple_json_from_html_string(html, use_readability=True) + + content = article.get("content") + if not content or not str(content).strip(): + logger.warning("Readability extraction returned empty content") + content = "

No content could be extracted from this page

" + + title = article.get("title") + if not title or not str(title).strip(): + title = "Untitled" + return Article( - title=article.get("title"), - html_content=article.get("content"), + title=title, + html_content=content, ) diff --git a/tests/unit/crawler/test_article.py b/tests/unit/crawler/test_article.py index d9bd384..0b3cddf 100644 --- a/tests/unit/crawler/test_article.py +++ b/tests/unit/crawler/test_article.py @@ -71,3 +71,43 @@ def test_to_message_handles_empty_html(): result = article.to_message() assert isinstance(result, list) assert result[0]["type"] == "text" + + +def test_to_markdown_handles_none_content(): + article = Article("Test Title", None) + result = article.to_markdown(including_title=True) + assert "# Test Title" in result + assert "No content available" in result + + +def test_to_markdown_handles_empty_string(): + article = Article("Test Title", "") + result = article.to_markdown(including_title=True) + assert "# Test Title" in result + assert "No content available" in result + + +def test_to_markdown_handles_whitespace_only(): + article = Article("Test Title", " \n \t ") + result = article.to_markdown(including_title=True) + assert "# Test Title" in result + assert "No content available" in result + + +def test_to_message_handles_none_content(): + article = Article("Title", None) + article.url = "http://test/" + result = article.to_message() + assert isinstance(result, list) + assert len(result) > 0 + assert result[0]["type"] == "text" + assert "No content available" in result[0]["text"] + + +def test_to_message_handles_whitespace_only_content(): + article = Article("Title", " \n ") + article.url = "http://test/" + result = article.to_message() + assert isinstance(result, list) + assert result[0]["type"] == "text" + assert "No content available" in result[0]["text"] diff --git a/tests/unit/crawler/test_jina_client.py b/tests/unit/crawler/test_jina_client.py new file mode 100644 index 0000000..94edade --- /dev/null +++ b/tests/unit/crawler/test_jina_client.py @@ -0,0 +1,106 @@ +# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates +# SPDX-License-Identifier: MIT + +import pytest +from unittest.mock import patch, Mock +from src.crawler.jina_client import JinaClient + + +class TestJinaClient: + @patch("src.crawler.jina_client.requests.post") + def test_crawl_success(self, mock_post): + # Arrange + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = "Test" + mock_post.return_value = mock_response + + client = JinaClient() + + # Act + result = client.crawl("https://example.com") + + # Assert + assert result == "Test" + mock_post.assert_called_once() + + @patch("src.crawler.jina_client.requests.post") + def test_crawl_http_error(self, mock_post): + # Arrange + mock_response = Mock() + mock_response.status_code = 500 + mock_response.text = "Internal Server Error" + mock_post.return_value = mock_response + + client = JinaClient() + + # Act & Assert + with pytest.raises(ValueError) as exc_info: + client.crawl("https://example.com") + + assert "status 500" in str(exc_info.value) + + @patch("src.crawler.jina_client.requests.post") + def test_crawl_empty_response(self, mock_post): + # Arrange + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = "" + mock_post.return_value = mock_response + + client = JinaClient() + + # Act & Assert + with pytest.raises(ValueError) as exc_info: + client.crawl("https://example.com") + + assert "empty response" in str(exc_info.value) + + @patch("src.crawler.jina_client.requests.post") + def test_crawl_whitespace_only_response(self, mock_post): + # Arrange + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = " \n \t " + mock_post.return_value = mock_response + + client = JinaClient() + + # Act & Assert + with pytest.raises(ValueError) as exc_info: + client.crawl("https://example.com") + + assert "empty response" in str(exc_info.value) + + @patch("src.crawler.jina_client.requests.post") + def test_crawl_not_found(self, mock_post): + # Arrange + mock_response = Mock() + mock_response.status_code = 404 + mock_response.text = "Not Found" + mock_post.return_value = mock_response + + client = JinaClient() + + # Act & Assert + with pytest.raises(ValueError) as exc_info: + client.crawl("https://example.com") + + assert "status 404" in str(exc_info.value) + + @patch.dict("os.environ", {}, clear=True) + @patch("src.crawler.jina_client.requests.post") + def test_crawl_without_api_key_logs_warning(self, mock_post): + # Arrange + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = "Test" + mock_post.return_value = mock_response + + client = JinaClient() + + # Act + result = client.crawl("https://example.com") + + # Assert + assert result == "Test" diff --git a/tests/unit/crawler/test_readability_extractor.py b/tests/unit/crawler/test_readability_extractor.py new file mode 100644 index 0000000..0e375fa --- /dev/null +++ b/tests/unit/crawler/test_readability_extractor.py @@ -0,0 +1,103 @@ +# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates +# SPDX-License-Identifier: MIT + +from unittest.mock import patch +from src.crawler.readability_extractor import ReadabilityExtractor + + +class TestReadabilityExtractor: + @patch("src.crawler.readability_extractor.simple_json_from_html_string") + def test_extract_article_with_valid_content(self, mock_simple_json): + # Arrange + mock_simple_json.return_value = { + "title": "Test Article", + "content": "

Article content

", + } + extractor = ReadabilityExtractor() + + # Act + article = extractor.extract_article("test") + + # Assert + assert article.title == "Test Article" + assert article.html_content == "

Article content

" + + @patch("src.crawler.readability_extractor.simple_json_from_html_string") + def test_extract_article_with_none_content(self, mock_simple_json): + # Arrange + mock_simple_json.return_value = { + "title": "Test Article", + "content": None, + } + extractor = ReadabilityExtractor() + + # Act + article = extractor.extract_article("test") + + # Assert + assert article.title == "Test Article" + assert article.html_content == "

No content could be extracted from this page

" + + @patch("src.crawler.readability_extractor.simple_json_from_html_string") + def test_extract_article_with_empty_content(self, mock_simple_json): + # Arrange + mock_simple_json.return_value = { + "title": "Test Article", + "content": "", + } + extractor = ReadabilityExtractor() + + # Act + article = extractor.extract_article("test") + + # Assert + assert article.title == "Test Article" + assert article.html_content == "

No content could be extracted from this page

" + + @patch("src.crawler.readability_extractor.simple_json_from_html_string") + def test_extract_article_with_whitespace_only_content(self, mock_simple_json): + # Arrange + mock_simple_json.return_value = { + "title": "Test Article", + "content": " \n \t ", + } + extractor = ReadabilityExtractor() + + # Act + article = extractor.extract_article("test") + + # Assert + assert article.title == "Test Article" + assert article.html_content == "

No content could be extracted from this page

" + + @patch("src.crawler.readability_extractor.simple_json_from_html_string") + def test_extract_article_with_none_title(self, mock_simple_json): + # Arrange + mock_simple_json.return_value = { + "title": None, + "content": "

Article content

", + } + extractor = ReadabilityExtractor() + + # Act + article = extractor.extract_article("test") + + # Assert + assert article.title == "Untitled" + assert article.html_content == "

Article content

" + + @patch("src.crawler.readability_extractor.simple_json_from_html_string") + def test_extract_article_with_empty_title(self, mock_simple_json): + # Arrange + mock_simple_json.return_value = { + "title": "", + "content": "

Article content

", + } + extractor = ReadabilityExtractor() + + # Act + article = extractor.extract_article("test") + + # Assert + assert article.title == "Untitled" + assert article.html_content == "

Article content

" diff --git a/tests/unit/tools/test_crawl.py b/tests/unit/tools/test_crawl.py index a21c426..15e510a 100644 --- a/tests/unit/tools/test_crawl.py +++ b/tests/unit/tools/test_crawl.py @@ -110,3 +110,24 @@ class TestCrawlTool: assert "Failed to crawl" in result assert "Markdown conversion error" in result mock_logger.error.assert_called_once() + + @patch("src.tools.crawl.Crawler") + def test_crawl_tool_with_none_content(self, mock_crawler_class): + # Arrange + mock_crawler = Mock() + mock_article = Mock() + mock_article.to_markdown.return_value = "# Article\n\n*No content available*\n" + mock_crawler.crawl.return_value = mock_article + mock_crawler_class.return_value = mock_crawler + + url = "https://example.com" + + # Act + result = crawl_tool(url) + + # Assert + assert isinstance(result, str) + result_dict = json.loads(result) + assert result_dict["url"] == url + assert "crawled_content" in result_dict + assert "No content available" in result_dict["crawled_content"]