diff --git a/src/crawler/__init__.py b/src/crawler/__init__.py index 4f6a6e7..0747da2 100644 --- a/src/crawler/__init__.py +++ b/src/crawler/__init__.py @@ -3,8 +3,7 @@ from .article import Article from .crawler import Crawler +from .jina_client import JinaClient +from .readability_extractor import ReadabilityExtractor -__all__ = [ - "Article", - "Crawler", -] +__all__ = ["Article", "Crawler", "JinaClient", "ReadabilityExtractor"] diff --git a/src/crawler/crawler.py b/src/crawler/crawler.py index fe7ddfc..9f3632d 100644 --- a/src/crawler/crawler.py +++ b/src/crawler/crawler.py @@ -26,13 +26,3 @@ class Crawler: article = extractor.extract_article(html) article.url = url return article - - -if __name__ == "__main__": - if len(sys.argv) == 2: - url = sys.argv[1] - else: - url = "https://fintel.io/zh-hant/s/br/nvdc34" - crawler = Crawler() - article = crawler.crawl(url) - print(article.to_markdown()) diff --git a/tests/unit/crawler/test_article.py b/tests/unit/crawler/test_article.py new file mode 100644 index 0000000..b2aa918 --- /dev/null +++ b/tests/unit/crawler/test_article.py @@ -0,0 +1,74 @@ +# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates +# SPDX-License-Identifier: MIT +import pytest +from src.crawler.article import Article + + +class DummyMarkdownify: + """A dummy markdownify replacement for patching if needed.""" + + @staticmethod + def markdownify(html): + return html + + +def test_to_markdown_includes_title(monkeypatch): + article = Article("Test Title", "
Hello world!
") + result = article.to_markdown(including_title=True) + assert result.startswith("# Test Title") + assert "Hello" in result + + +def test_to_markdown_excludes_title(): + article = Article("Test Title", "Hello world!
") + result = article.to_markdown(including_title=False) + assert not result.startswith("# Test Title") + assert "Hello" in result + + +def test_to_message_with_text_only(): + article = Article("Test Title", "Hello world!
") + article.url = "https://example.com/" + result = article.to_message() + assert isinstance(result, list) + assert any(item["type"] == "text" for item in result) + assert all("type" in item for item in result) + + +def test_to_message_with_image(monkeypatch): + html = 'Intro
'
+ article = Article("Title", html)
+ article.url = "https://host.com/path/"
+ # The markdownify library will convert Start

Mid
End'
+ article = Article("Title", html)
+ article.url = "http://x/"
+ result = article.to_message()
+ image_urls = [
+ item["image_url"]["url"] for item in result if item["type"] == "image_url"
+ ]
+ assert "http://x/a.png" in image_urls
+ assert "http://x/b.jpg" in image_urls
+ text_items = [item for item in result if item["type"] == "text"]
+ assert any("Start" in item["text"] for item in text_items)
+ assert any("Mid" in item["text"] for item in text_items)
+
+
+def test_to_message_handles_empty_html():
+    """Empty HTML still produces a list whose first entry is a text item."""
+    blank = Article("Empty", "")
+    blank.url = "http://test/"
+    parts = blank.to_message()
+    assert isinstance(parts, list) and parts[0]["type"] == "text"
diff --git a/tests/unit/crawler/test_crawler_class.py b/tests/unit/crawler/test_crawler_class.py
new file mode 100644
index 0000000..eba2148
--- /dev/null
+++ b/tests/unit/crawler/test_crawler_class.py
@@ -0,0 +1,72 @@
+# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
+# SPDX-License-Identifier: MIT
+
+import pytest
+import src.crawler as crawler_module
+from src.crawler import Crawler
+
+
+def test_crawler_sets_article_url(monkeypatch):
+    """Crawler.crawl stamps the requested URL onto the article it returns."""
+    target_url = "http://example.com"
+
+    class StubArticle:
+        # Starts without a URL so the final assertion proves crawl() set it.
+        url = None
+
+        def to_markdown(self):
+            return "# Dummy"
+
+    class StubJinaClient:
+        def crawl(self, url, return_format=None):
+            return "dummy"
+
+    class StubReadabilityExtractor:
+        def extract_article(self, html):
+            return StubArticle()
+
+    # Swap both collaborators for stubs under their src.crawler.crawler names.
+    monkeypatch.setattr("src.crawler.crawler.JinaClient", StubJinaClient)
+    extractor_path = "src.crawler.crawler.ReadabilityExtractor"
+    monkeypatch.setattr(extractor_path, StubReadabilityExtractor)
+
+    fetched = crawler_module.Crawler().crawl(target_url)
+
+    assert fetched.url == target_url
+    assert fetched.to_markdown() == "# Dummy"
+
+
+def test_crawler_calls_dependencies(monkeypatch):
+    """Crawler.crawl calls JinaClient.crawl and passes its result to extract_article."""
+    recorded = {}
+    target_url = "http://example.com"
+
+    class StubArticle:
+        url = None
+
+        def to_markdown(self):
+            return "# Dummy"
+
+    class StubJinaClient:
+        def crawl(self, url, return_format=None):
+            # Capture the arguments so the request can be checked afterwards.
+            recorded["jina"] = (url, return_format)
+            return "dummy"
+
+    class StubReadabilityExtractor:
+        def extract_article(self, html):
+            # Capture the payload the crawler hands to the extractor.
+            recorded["extractor"] = html
+            return StubArticle()
+
+    monkeypatch.setattr("src.crawler.crawler.JinaClient", StubJinaClient)
+    extractor_path = "src.crawler.crawler.ReadabilityExtractor"
+    monkeypatch.setattr(extractor_path, StubReadabilityExtractor)
+
+    crawler_module.Crawler().crawl(target_url)
+
+    assert "jina" in recorded
+    requested_url, requested_format = recorded["jina"]
+    assert requested_url == target_url
+    assert requested_format == "html"
+    assert recorded.get("extractor") == "dummy"