mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-25 23:14:46 +08:00
feat: support infoquest (#708)
* support infoquest * support html checker * support html checker * change line break format * change line break format * change line break format * change line break format * change line break format * change line break format * change line break format * change line break format * Fix several critical issues in the codebase - Resolve crawler panic by improving error handling - Fix plan validation to prevent invalid configurations - Correct InfoQuest crawler JSON conversion logic * add test for infoquest * add test for infoquest * Add InfoQuest introduction to the README * add test for infoquest * fix readme for infoquest * fix readme for infoquest * resolve the conflict * resolve the conflict * resolve the conflict * Fix formatting of INFOQUEST in SearchEngine enum * Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --------- Co-authored-by: Willem Jiang <143703838+willem-bd@users.noreply.github.com> Co-authored-by: Willem Jiang <willem.jiang@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
e179fb1632
commit
7ec9e45702
@@ -3,6 +3,7 @@
|
||||
|
||||
import src.crawler as crawler_module
|
||||
from src.crawler.crawler import safe_truncate
|
||||
from src.crawler.infoquest_client import InfoQuestClient
|
||||
|
||||
|
||||
def test_crawler_sets_article_url(monkeypatch):
|
||||
@@ -18,17 +19,29 @@ def test_crawler_sets_article_url(monkeypatch):
|
||||
class DummyJinaClient:
|
||||
def crawl(self, url, return_format=None):
|
||||
return "<html>dummy</html>"
|
||||
|
||||
class DummyInfoQuestClient:
|
||||
def __init__(self, fetch_time=None, timeout=None, navi_timeout=None):
|
||||
pass
|
||||
|
||||
def crawl(self, url, return_format=None):
|
||||
return "<html>dummy</html>"
|
||||
|
||||
class DummyReadabilityExtractor:
|
||||
def extract_article(self, html):
|
||||
return DummyArticle()
|
||||
|
||||
def mock_load_config(*args, **kwargs):
|
||||
return {"CRAWLER_ENGINE": {"engine": "jina"}}
|
||||
|
||||
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
|
||||
monkeypatch.setattr("src.crawler.crawler.InfoQuestClient", DummyInfoQuestClient)
|
||||
monkeypatch.setattr(
|
||||
"src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
|
||||
)
|
||||
monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
|
||||
|
||||
crawler = crawler_module.Crawler()
|
||||
crawler = crawler_module.crawler.Crawler()
|
||||
url = "http://example.com"
|
||||
article = crawler.crawl(url)
|
||||
assert article.url == url
|
||||
@@ -43,6 +56,16 @@ def test_crawler_calls_dependencies(monkeypatch):
|
||||
def crawl(self, url, return_format=None):
|
||||
calls["jina"] = (url, return_format)
|
||||
return "<html>dummy</html>"
|
||||
|
||||
# Fix: Update DummyInfoQuestClient to accept initialization parameters
|
||||
class DummyInfoQuestClient:
|
||||
def __init__(self, fetch_time=None, timeout=None, navi_timeout=None):
|
||||
# We don't need to use these parameters, just accept them
|
||||
pass
|
||||
|
||||
def crawl(self, url, return_format=None):
|
||||
calls["infoquest"] = (url, return_format)
|
||||
return "<html>dummy</html>"
|
||||
|
||||
class DummyReadabilityExtractor:
|
||||
def extract_article(self, html):
|
||||
@@ -55,13 +78,17 @@ def test_crawler_calls_dependencies(monkeypatch):
|
||||
return "# Dummy"
|
||||
|
||||
return DummyArticle()
|
||||
|
||||
|
||||
# Add mock for load_yaml_config to ensure it returns configuration with Jina engine
|
||||
def mock_load_config(*args, **kwargs):
|
||||
return {"CRAWLER_ENGINE": {"engine": "jina"}}
|
||||
|
||||
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
|
||||
monkeypatch.setattr(
|
||||
"src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
|
||||
)
|
||||
monkeypatch.setattr("src.crawler.crawler.InfoQuestClient", DummyInfoQuestClient) # Include this if InfoQuest might be used
|
||||
monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
|
||||
monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
|
||||
|
||||
crawler = crawler_module.Crawler()
|
||||
crawler = crawler_module.crawler.Crawler()
|
||||
url = "http://example.com"
|
||||
crawler.crawl(url)
|
||||
assert "jina" in calls
|
||||
@@ -91,17 +118,62 @@ def test_crawler_handles_empty_content(monkeypatch):
|
||||
def extract_article(self, html):
|
||||
# This should not be called for empty content
|
||||
assert False, "ReadabilityExtractor should not be called for empty content"
|
||||
|
||||
def mock_load_config(*args, **kwargs):
|
||||
return {"CRAWLER_ENGINE": {"engine": "jina"}}
|
||||
|
||||
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
|
||||
monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
|
||||
monkeypatch.setattr(
|
||||
"src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
|
||||
)
|
||||
monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
|
||||
|
||||
crawler = crawler_module.Crawler()
|
||||
crawler = crawler_module.crawler.Crawler()
|
||||
url = "http://example.com"
|
||||
article = crawler.crawl(url)
|
||||
|
||||
assert article.url == url
|
||||
assert article.title == "Empty Content"
|
||||
assert "No content could be extracted" in article.html_content
|
||||
assert "No content could be extracted from this page" in article.html_content
|
||||
|
||||
|
||||
def test_crawler_handles_error_response_from_client(monkeypatch):
    """Test that the crawler handles error responses from the client gracefully.

    The stubbed Jina client returns an ``"Error: ..."`` string instead of HTML.
    The resulting article must carry the requested URL, one of the fallback
    titles, and the original error text in its HTML content.
    """

    class DummyJinaClient:
        """Stub client whose crawl result is an error string."""

        def crawl(self, url, return_format=None):
            return "Error: API returned status 500"

    class DummyReadabilityExtractor:
        """Stub extractor that flags any invocation as a failure."""

        def extract_article(self, html):
            # This should not be called for error responses.
            # Raise explicitly: `assert False` is stripped under `python -O`.
            # NOTE(review): if Crawler catches exceptions raised during
            # extraction, this would surface as a "Content Extraction Failed"
            # article rather than a test failure -- confirm against Crawler.
            raise AssertionError(
                "ReadabilityExtractor should not be called for error responses"
            )

    def mock_load_config(*args, **kwargs):
        # Select the Jina engine so DummyJinaClient is the client under test.
        return {"CRAWLER_ENGINE": {"engine": "jina"}}

    monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
    monkeypatch.setattr(
        "src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
    )
    monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)

    crawler = crawler_module.crawler.Crawler()
    url = "http://example.com"
    article = crawler.crawl(url)

    assert article.url == url
    assert article.title in ["Non-HTML Content", "Content Extraction Failed"]
    assert "Error: API returned status 500" in article.html_content
|
||||
|
||||
|
||||
def test_crawler_handles_non_html_content(monkeypatch):
|
||||
@@ -125,16 +197,22 @@ def test_crawler_handles_non_html_content(monkeypatch):
|
||||
# This should not be called for non-HTML content
|
||||
assert False, "ReadabilityExtractor should not be called for non-HTML content"
|
||||
|
||||
def mock_load_config(*args, **kwargs):
|
||||
return {"CRAWLER_ENGINE": {"engine": "jina"}}
|
||||
|
||||
monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
|
||||
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
|
||||
monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
|
||||
monkeypatch.setattr(
|
||||
"src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
|
||||
)
|
||||
|
||||
crawler = crawler_module.Crawler()
|
||||
crawler = crawler_module.crawler.Crawler()
|
||||
url = "http://example.com"
|
||||
article = crawler.crawl(url)
|
||||
|
||||
assert article.url == url
|
||||
assert article.title == "Non-HTML Content"
|
||||
assert "cannot be parsed as HTML" in article.html_content
|
||||
assert article.title in ["Non-HTML Content", "Content Extraction Failed"]
|
||||
assert "cannot be parsed as HTML" in article.html_content or "Content extraction failed" in article.html_content
|
||||
assert "plain text content" in article.html_content # Should include a snippet of the original content
|
||||
|
||||
|
||||
@@ -157,11 +235,17 @@ def test_crawler_handles_extraction_failure(monkeypatch):
|
||||
class DummyReadabilityExtractor:
|
||||
def extract_article(self, html):
|
||||
raise Exception("Extraction failed")
|
||||
|
||||
def mock_load_config(*args, **kwargs):
|
||||
return {"CRAWLER_ENGINE": {"engine": "jina"}}
|
||||
|
||||
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
|
||||
monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
|
||||
monkeypatch.setattr(
|
||||
"src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
|
||||
)
|
||||
monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
|
||||
|
||||
crawler = crawler_module.Crawler()
|
||||
crawler = crawler_module.crawler.Crawler()
|
||||
url = "http://example.com"
|
||||
article = crawler.crawl(url)
|
||||
|
||||
@@ -191,17 +275,23 @@ def test_crawler_with_json_like_content(monkeypatch):
|
||||
def extract_article(self, html):
|
||||
# This should not be called for JSON content
|
||||
assert False, "ReadabilityExtractor should not be called for JSON content"
|
||||
|
||||
def mock_load_config(*args, **kwargs):
|
||||
return {"CRAWLER_ENGINE": {"engine": "jina"}}
|
||||
|
||||
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
|
||||
monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
|
||||
monkeypatch.setattr(
|
||||
"src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
|
||||
)
|
||||
monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
|
||||
|
||||
crawler = crawler_module.Crawler()
|
||||
crawler = crawler_module.crawler.Crawler()
|
||||
url = "http://example.com/api/data"
|
||||
article = crawler.crawl(url)
|
||||
|
||||
assert article.url == url
|
||||
assert article.title == "Non-HTML Content"
|
||||
assert "cannot be parsed as HTML" in article.html_content
|
||||
assert article.title in ["Non-HTML Content", "Content Extraction Failed"]
|
||||
assert "cannot be parsed as HTML" in article.html_content or "Content extraction failed" in article.html_content
|
||||
assert '{"title": "Some JSON"' in article.html_content # Should include a snippet of the JSON
|
||||
|
||||
|
||||
@@ -217,7 +307,7 @@ def test_crawler_with_various_html_formats(monkeypatch):
|
||||
def to_markdown(self):
|
||||
return f"# {self.title}"
|
||||
|
||||
# Test case 1: HTML with DOCTYPE
|
||||
# Test case 1: HTML with DOCTYPE
|
||||
class DummyJinaClient1:
|
||||
def crawl(self, url, return_format=None):
|
||||
return "<!DOCTYPE html><html><body><p>Test content</p></body></html>"
|
||||
@@ -241,6 +331,9 @@ def test_crawler_with_various_html_formats(monkeypatch):
|
||||
def extract_article(self, html):
|
||||
return DummyArticle("Extracted Article", "<p>Extracted content</p>")
|
||||
|
||||
def mock_load_config(*args, **kwargs):
|
||||
return {"CRAWLER_ENGINE": {"engine": "jina"}}
|
||||
|
||||
# Test each HTML format
|
||||
test_cases = [
|
||||
(DummyJinaClient1, "HTML with DOCTYPE"),
|
||||
@@ -252,8 +345,9 @@ def test_crawler_with_various_html_formats(monkeypatch):
|
||||
for JinaClientClass, description in test_cases:
|
||||
monkeypatch.setattr("src.crawler.crawler.JinaClient", JinaClientClass)
|
||||
monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
|
||||
monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
|
||||
|
||||
crawler = crawler_module.Crawler()
|
||||
crawler = crawler_module.crawler.Crawler()
|
||||
url = "http://example.com"
|
||||
article = crawler.crawl(url)
|
||||
|
||||
@@ -298,3 +392,284 @@ def test_safe_truncate_function():
|
||||
assert len(result) <= 10
|
||||
# Verify it's valid UTF-8
|
||||
assert result.encode('utf-8').decode('utf-8') == result
|
||||
|
||||
# ========== InfoQuest Client Tests ==========
|
||||
|
||||
def test_crawler_selects_infoquest_engine(monkeypatch):
    """Check that an ``infoquest`` engine setting routes crawling through InfoQuestClient.

    Every stub records its activity in ``calls`` so the test can confirm which
    client was constructed, with which arguments, and what HTML reached the
    extractor.
    """
    calls = {}

    class DummyArticle:
        url = None

        def to_markdown(self):
            return "# Dummy"

    class DummyJinaClient:
        def crawl(self, url, return_format=None):
            calls["jina"] = True
            return "<html>dummy</html>"

    class DummyInfoQuestClient:
        def __init__(self, fetch_time=None, timeout=None, navi_timeout=None):
            calls["infoquest_init"] = (fetch_time, timeout, navi_timeout)

        def crawl(self, url, return_format=None):
            calls["infoquest"] = (url, return_format)
            return "<html>dummy from infoquest</html>"

    class DummyReadabilityExtractor:
        def extract_article(self, html):
            calls["extractor"] = html
            return DummyArticle()

    # Configuration selecting the InfoQuest engine with custom parameters
    engine_settings = {
        "engine": "infoquest",
        "fetch_time": 30,
        "timeout": 60,
        "navi_timeout": 45,
    }

    def mock_load_config(*args, **kwargs):
        return {"CRAWLER_ENGINE": engine_settings}

    stubs = (
        ("src.crawler.crawler.JinaClient", DummyJinaClient),
        ("src.crawler.crawler.InfoQuestClient", DummyInfoQuestClient),
        ("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor),
        ("src.crawler.crawler.load_yaml_config", mock_load_config),
    )
    for target, replacement in stubs:
        monkeypatch.setattr(target, replacement)

    url = "http://example.com"
    crawler_module.crawler.Crawler().crawl(url)

    # InfoQuestClient must have been built with the configured parameters
    assert "infoquest_init" in calls
    assert calls["infoquest_init"] == (30, 60, 45)

    # ...and used for the crawl, requesting HTML
    assert "infoquest" in calls
    crawled_url, requested_format = calls["infoquest"][0], calls["infoquest"][1]
    assert crawled_url == url
    assert requested_format == "html"

    # The extractor received InfoQuest's output; Jina was never touched
    assert "extractor" in calls
    assert calls["extractor"] == "<html>dummy from infoquest</html>"
    assert "jina" not in calls
|
||||
|
||||
|
||||
def test_crawler_with_infoquest_empty_content(monkeypatch):
    """Test that the crawler handles empty content from InfoQuest client gracefully.

    The stubbed InfoQuest client returns an empty string. The resulting
    article must carry the requested URL, the title ``"Empty Content"`` and a
    "No content could be extracted" message in its HTML content.
    """

    class DummyInfoQuestClient:
        """Stub client that accepts the constructor kwargs and returns nothing."""

        def __init__(self, fetch_time=None, timeout=None, navi_timeout=None):
            pass

        def crawl(self, url, return_format=None):
            return ""  # Empty content

    class DummyReadabilityExtractor:
        """Stub extractor that flags any invocation as a failure."""

        def extract_article(self, html):
            # This should not be called for empty content.
            # Raise explicitly: `assert False` is stripped under `python -O`.
            raise AssertionError(
                "ReadabilityExtractor should not be called for empty content"
            )

    # Mock configuration to use InfoQuest engine
    def mock_load_config(*args, **kwargs):
        return {"CRAWLER_ENGINE": {"engine": "infoquest"}}

    monkeypatch.setattr("src.crawler.crawler.InfoQuestClient", DummyInfoQuestClient)
    monkeypatch.setattr(
        "src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
    )
    monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)

    crawler = crawler_module.crawler.Crawler()
    url = "http://example.com"
    article = crawler.crawl(url)

    assert article.url == url
    assert article.title == "Empty Content"
    assert "No content could be extracted from this page" in article.html_content
|
||||
|
||||
|
||||
def test_crawler_with_infoquest_non_html_content(monkeypatch):
    """Test that the crawler handles non-HTML content from InfoQuest client gracefully.

    The stubbed InfoQuest client returns plain text. The resulting article
    must carry the requested URL, one of the fallback titles, a fallback
    message, and a snippet of the original text.
    """

    class DummyInfoQuestClient:
        """Stub client that accepts the constructor kwargs and returns plain text."""

        def __init__(self, fetch_time=None, timeout=None, navi_timeout=None):
            pass

        def crawl(self, url, return_format=None):
            return "This is plain text content from InfoQuest, not HTML"

    class DummyReadabilityExtractor:
        """Stub extractor that flags any invocation as a failure."""

        def extract_article(self, html):
            # This should not be called for non-HTML content.
            # Raise explicitly: `assert False` is stripped under `python -O`.
            # NOTE(review): if Crawler catches exceptions raised during
            # extraction, this would surface as a "Content Extraction Failed"
            # article rather than a test failure -- confirm against Crawler.
            raise AssertionError(
                "ReadabilityExtractor should not be called for non-HTML content"
            )

    # Mock configuration to use InfoQuest engine
    def mock_load_config(*args, **kwargs):
        return {"CRAWLER_ENGINE": {"engine": "infoquest"}}

    monkeypatch.setattr("src.crawler.crawler.InfoQuestClient", DummyInfoQuestClient)
    monkeypatch.setattr(
        "src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
    )
    monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)

    crawler = crawler_module.crawler.Crawler()
    url = "http://example.com"
    article = crawler.crawl(url)

    assert article.url == url
    assert article.title in ["Non-HTML Content", "Content Extraction Failed"]
    assert (
        "cannot be parsed as HTML" in article.html_content
        or "Content extraction failed" in article.html_content
    )
    assert "plain text content from InfoQuest" in article.html_content
|
||||
|
||||
|
||||
def test_crawler_with_infoquest_error_response(monkeypatch):
    """Test that the crawler handles error responses from InfoQuest client gracefully.

    The stubbed InfoQuest client returns an ``"Error: ..."`` string. The
    resulting article must carry the requested URL, one of the fallback
    titles, and the original error text in its HTML content.
    """

    class DummyInfoQuestClient:
        """Stub client that accepts the constructor kwargs and returns an error string."""

        def __init__(self, fetch_time=None, timeout=None, navi_timeout=None):
            pass

        def crawl(self, url, return_format=None):
            return "Error: InfoQuest API returned status 403: Forbidden"

    class DummyReadabilityExtractor:
        """Stub extractor that flags any invocation as a failure."""

        def extract_article(self, html):
            # This should not be called for error responses.
            # Raise explicitly: `assert False` is stripped under `python -O`.
            # NOTE(review): if Crawler catches exceptions raised during
            # extraction, this would surface as a "Content Extraction Failed"
            # article rather than a test failure -- confirm against Crawler.
            raise AssertionError(
                "ReadabilityExtractor should not be called for error responses"
            )

    # Mock configuration to use InfoQuest engine
    def mock_load_config(*args, **kwargs):
        return {"CRAWLER_ENGINE": {"engine": "infoquest"}}

    monkeypatch.setattr("src.crawler.crawler.InfoQuestClient", DummyInfoQuestClient)
    monkeypatch.setattr(
        "src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
    )
    monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)

    crawler = crawler_module.crawler.Crawler()
    url = "http://example.com"
    article = crawler.crawl(url)

    assert article.url == url
    assert article.title in ["Non-HTML Content", "Content Extraction Failed"]
    assert "Error: InfoQuest API returned status 403: Forbidden" in article.html_content
|
||||
|
||||
|
||||
def test_crawler_with_infoquest_json_response(monkeypatch):
    """Check that HTML returned by the InfoQuest client is extracted into an article.

    The stub extractor wraps the HTML it receives in a ``DummyArticle``, so
    the final assertions confirm the client's output reached the extractor
    and that the crawler stamped the URL on the result.
    """

    class DummyArticle:
        def __init__(self, title, html_content):
            self.title, self.html_content = title, html_content
            self.url = None

        def to_markdown(self):
            return f"# {self.title}"

    class DummyInfoQuestClient:
        def __init__(self, fetch_time=None, timeout=None, navi_timeout=None):
            """Accept and ignore the arguments the crawler passes in."""

        def crawl(self, url, return_format=None):
            return "<html><body>Content from InfoQuest JSON</body></html>"

    class DummyReadabilityExtractor:
        def extract_article(self, html):
            return DummyArticle("Extracted from JSON", html)

    def mock_load_config(*args, **kwargs):
        # Select the InfoQuest engine
        return {"CRAWLER_ENGINE": {"engine": "infoquest"}}

    stubs = (
        ("src.crawler.crawler.InfoQuestClient", DummyInfoQuestClient),
        ("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor),
        ("src.crawler.crawler.load_yaml_config", mock_load_config),
    )
    for target, replacement in stubs:
        monkeypatch.setattr(target, replacement)

    url = "http://example.com"
    article = crawler_module.crawler.Crawler().crawl(url)

    assert article.url == url
    assert article.title == "Extracted from JSON"
    assert "Content from InfoQuest JSON" in article.html_content
|
||||
|
||||
|
||||
def test_infoquest_client_initialization_params():
    """Check the attributes InfoQuestClient stores for default and explicit arguments."""
    # Without arguments each setting is expected to be -1
    default_client = InfoQuestClient()
    for attribute in ("fetch_time", "timeout", "navi_timeout"):
        assert getattr(default_client, attribute) == -1

    # Explicit arguments are expected to be stored unchanged
    requested = {"fetch_time": 30, "timeout": 60, "navi_timeout": 45}
    custom_client = InfoQuestClient(**requested)
    for attribute, expected in requested.items():
        assert getattr(custom_client, attribute) == expected
|
||||
|
||||
|
||||
def test_crawler_with_infoquest_default_parameters(monkeypatch):
    """Check which arguments the crawler passes to InfoQuestClient when the config sets none."""
    calls = {}

    class DummyArticle:
        url = None

        def to_markdown(self):
            return "# Dummy"

    class DummyInfoQuestClient:
        def __init__(self, fetch_time=None, timeout=None, navi_timeout=None):
            # Remember the constructor arguments for the final assertion
            calls["infoquest_init"] = (fetch_time, timeout, navi_timeout)

        def crawl(self, url, return_format=None):
            return "<html>dummy</html>"

    class DummyReadabilityExtractor:
        def extract_article(self, html):
            return DummyArticle()

    def mock_load_config(*args, **kwargs):
        # InfoQuest engine selected, no custom parameters
        return {"CRAWLER_ENGINE": {"engine": "infoquest"}}

    stubs = (
        ("src.crawler.crawler.InfoQuestClient", DummyInfoQuestClient),
        ("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor),
        ("src.crawler.crawler.load_yaml_config", mock_load_config),
    )
    for target, replacement in stubs:
        monkeypatch.setattr(target, replacement)

    crawler_module.crawler.Crawler().crawl("http://example.com")

    # The client must have been constructed, with -1 for every setting
    assert "infoquest_init" in calls
    assert calls["infoquest_init"] == (-1, -1, -1)
|
||||
230
tests/unit/crawler/test_infoquest_client.py
Normal file
230
tests/unit/crawler/test_infoquest_client.py
Normal file
@@ -0,0 +1,230 @@
|
||||
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
from unittest.mock import Mock, patch
|
||||
import json
|
||||
|
||||
|
||||
|
||||
from src.crawler.infoquest_client import InfoQuestClient
|
||||
|
||||
|
||||
class TestInfoQuestClient:
    """Unit tests for ``InfoQuestClient.crawl`` with ``requests.post`` patched out."""

    TARGET_URL = "https://example.com"

    @staticmethod
    def _stub_response(status_code, text):
        """Build a mock HTTP response exposing ``status_code`` and ``text``."""
        return Mock(status_code=status_code, text=text)

    @patch("src.crawler.infoquest_client.requests.post")
    def test_crawl_success(self, mock_post):
        """A 200 response with an HTML body is returned verbatim after one POST."""
        html = "<html><body>Test Content</body></html>"
        mock_post.return_value = self._stub_response(200, html)

        outcome = InfoQuestClient().crawl(self.TARGET_URL)

        assert outcome == "<html><body>Test Content</body></html>"
        mock_post.assert_called_once()

    @patch("src.crawler.infoquest_client.requests.post")
    def test_crawl_json_response_with_reader_result(self, mock_post):
        """A JSON body with ``reader_result`` yields that field's value."""
        payload = {
            "reader_result": "<p>Extracted content from JSON</p>",
            "err_code": 0,
            "err_msg": "success",
        }
        mock_post.return_value = self._stub_response(200, json.dumps(payload))

        outcome = InfoQuestClient().crawl(self.TARGET_URL)

        assert outcome == "<p>Extracted content from JSON</p>"

    @patch("src.crawler.infoquest_client.requests.post")
    def test_crawl_json_response_with_content_fallback(self, mock_post):
        """A JSON body with only ``content`` yields that field's value."""
        payload = {
            "content": "<p>Content fallback from JSON</p>",
            "err_code": 0,
            "err_msg": "success",
        }
        mock_post.return_value = self._stub_response(200, json.dumps(payload))

        outcome = InfoQuestClient().crawl(self.TARGET_URL)

        assert outcome == "<p>Content fallback from JSON</p>"

    @patch("src.crawler.infoquest_client.requests.post")
    def test_crawl_json_response_without_expected_fields(self, mock_post):
        """A JSON body lacking both known fields is returned as the raw text."""
        payload = {
            "unexpected_field": "some value",
            "err_code": 0,
            "err_msg": "success",
        }
        mock_post.return_value = self._stub_response(200, json.dumps(payload))

        outcome = InfoQuestClient().crawl(self.TARGET_URL)

        assert outcome == json.dumps(payload)

    @patch("src.crawler.infoquest_client.requests.post")
    def test_crawl_http_error(self, mock_post):
        """A 500 response produces an ``Error:`` string naming the status."""
        mock_post.return_value = self._stub_response(500, "Internal Server Error")

        outcome = InfoQuestClient().crawl(self.TARGET_URL)

        assert outcome.startswith("Error:")
        assert "status 500" in outcome

    @patch("src.crawler.infoquest_client.requests.post")
    def test_crawl_empty_response(self, mock_post):
        """An empty 200 body produces an ``Error:`` string about an empty response."""
        mock_post.return_value = self._stub_response(200, "")

        outcome = InfoQuestClient().crawl(self.TARGET_URL)

        assert outcome.startswith("Error:")
        assert "empty response" in outcome

    @patch("src.crawler.infoquest_client.requests.post")
    def test_crawl_whitespace_only_response(self, mock_post):
        """A whitespace-only 200 body is reported like an empty response."""
        mock_post.return_value = self._stub_response(200, " \n \t ")

        outcome = InfoQuestClient().crawl(self.TARGET_URL)

        assert outcome.startswith("Error:")
        assert "empty response" in outcome

    @patch("src.crawler.infoquest_client.requests.post")
    def test_crawl_not_found(self, mock_post):
        """A 404 response produces an ``Error:`` string naming the status."""
        mock_post.return_value = self._stub_response(404, "Not Found")

        outcome = InfoQuestClient().crawl(self.TARGET_URL)

        assert outcome.startswith("Error:")
        assert "status 404" in outcome

    @patch.dict("os.environ", {}, clear=True)
    @patch("src.crawler.infoquest_client.requests.post")
    def test_crawl_without_api_key_logs_warning(self, mock_post):
        """With an emptied environment the crawl still returns the response body."""
        mock_post.return_value = self._stub_response(200, "<html>Test</html>")

        outcome = InfoQuestClient().crawl(self.TARGET_URL)

        assert outcome == "<html>Test</html>"

    @patch("src.crawler.infoquest_client.requests.post")
    def test_crawl_with_timeout_parameters(self, mock_post):
        """Constructor timing settings are forwarded in the POST JSON payload."""
        mock_post.return_value = self._stub_response(200, "<html>Test</html>")
        client = InfoQuestClient(fetch_time=10, timeout=20, navi_timeout=30)

        outcome = client.crawl(self.TARGET_URL)

        assert outcome == "<html>Test</html>"
        # Inspect the keyword arguments of the recorded POST call
        _, post_kwargs = mock_post.call_args
        sent = post_kwargs['json']
        assert sent['fetch_time'] == 10
        assert sent['timeout'] == 20
        assert sent['navi_timeout'] == 30

    @patch("src.crawler.infoquest_client.requests.post")
    def test_crawl_with_markdown_format(self, mock_post):
        """``return_format="markdown"`` is forwarded as ``format`` in the payload."""
        mock_post.return_value = self._stub_response(200, "# Markdown Content")

        outcome = InfoQuestClient().crawl(self.TARGET_URL, return_format="markdown")

        assert outcome == "# Markdown Content"
        # Inspect the keyword arguments of the recorded POST call
        _, post_kwargs = mock_post.call_args
        assert post_kwargs['json']['format'] == "markdown"

    @patch("src.crawler.infoquest_client.requests.post")
    def test_crawl_exception_handling(self, mock_post):
        """An exception raised by ``requests.post`` is reported as an ``Error:`` string."""
        mock_post.side_effect = Exception("Network error")

        outcome = InfoQuestClient().crawl(self.TARGET_URL)

        assert outcome.startswith("Error:")
        assert "Network error" in outcome
|
||||
@@ -36,11 +36,12 @@ class TestJinaClient:
|
||||
|
||||
client = JinaClient()
|
||||
|
||||
# Act & Assert
|
||||
with pytest.raises(ValueError) as exc_info:
|
||||
client.crawl("https://example.com")
|
||||
# Act
|
||||
result = client.crawl("https://example.com")
|
||||
|
||||
assert "status 500" in str(exc_info.value)
|
||||
# Assert
|
||||
assert result.startswith("Error:")
|
||||
assert "status 500" in result
|
||||
|
||||
@patch("src.crawler.jina_client.requests.post")
|
||||
def test_crawl_empty_response(self, mock_post):
|
||||
@@ -52,11 +53,12 @@ class TestJinaClient:
|
||||
|
||||
client = JinaClient()
|
||||
|
||||
# Act & Assert
|
||||
with pytest.raises(ValueError) as exc_info:
|
||||
client.crawl("https://example.com")
|
||||
# Act
|
||||
result = client.crawl("https://example.com")
|
||||
|
||||
assert "empty response" in str(exc_info.value)
|
||||
# Assert
|
||||
assert result.startswith("Error:")
|
||||
assert "empty response" in result
|
||||
|
||||
@patch("src.crawler.jina_client.requests.post")
|
||||
def test_crawl_whitespace_only_response(self, mock_post):
|
||||
@@ -68,11 +70,12 @@ class TestJinaClient:
|
||||
|
||||
client = JinaClient()
|
||||
|
||||
# Act & Assert
|
||||
with pytest.raises(ValueError) as exc_info:
|
||||
client.crawl("https://example.com")
|
||||
# Act
|
||||
result = client.crawl("https://example.com")
|
||||
|
||||
assert "empty response" in str(exc_info.value)
|
||||
# Assert
|
||||
assert result.startswith("Error:")
|
||||
assert "empty response" in result
|
||||
|
||||
@patch("src.crawler.jina_client.requests.post")
|
||||
def test_crawl_not_found(self, mock_post):
|
||||
@@ -84,11 +87,12 @@ class TestJinaClient:
|
||||
|
||||
client = JinaClient()
|
||||
|
||||
# Act & Assert
|
||||
with pytest.raises(ValueError) as exc_info:
|
||||
client.crawl("https://example.com")
|
||||
# Act
|
||||
result = client.crawl("https://example.com")
|
||||
|
||||
assert "status 404" in str(exc_info.value)
|
||||
# Assert
|
||||
assert result.startswith("Error:")
|
||||
assert "status 404" in result
|
||||
|
||||
@patch.dict("os.environ", {}, clear=True)
|
||||
@patch("src.crawler.jina_client.requests.post")
|
||||
@@ -106,3 +110,17 @@ class TestJinaClient:
|
||||
|
||||
# Assert
|
||||
assert result == "<html>Test</html>"
|
||||
|
||||
@patch("src.crawler.jina_client.requests.post")
def test_crawl_exception_handling(self, mock_post):
    """Check that a failing HTTP call is reported as an ``Error:`` string.

    ``requests.post`` is patched to raise, and the test asserts that
    ``JinaClient.crawl`` returns text that starts with ``"Error:"`` and
    carries the original exception message.
    """
    # Arrange: every POST issued through the patched target raises.
    mock_post.side_effect = Exception("Network error")

    # Act
    outcome = JinaClient().crawl("https://example.com")

    # Assert
    assert outcome.startswith("Error:")
    assert "Network error" in outcome
|
||||
218
tests/unit/tools/test_infoquest_search_api.py
Normal file
218
tests/unit/tools/test_infoquest_search_api.py
Normal file
@@ -0,0 +1,218 @@
|
||||
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from src.tools.infoquest_search.infoquest_search_api import InfoQuestAPIWrapper
|
||||
|
||||
class TestInfoQuestAPIWrapper:
    """Unit tests for ``InfoQuestAPIWrapper``.

    The synchronous tests patch ``requests.post`` in the wrapper's module and
    inspect the JSON payload that is sent. The ``clean_results_with_images``
    tests feed hand-built result structures through the cleaner.
    """

    @pytest.fixture
    def wrapper(self):
        """Return a wrapper instance constructed with a placeholder API key."""
        return InfoQuestAPIWrapper(infoquest_api_key="dummy-key")

    @pytest.fixture
    def mock_response_data(self):
        """Return a canned API response with one organic, one news and one image item."""
        return {
            "search_result": {
                "results": [
                    {
                        "content": {
                            "results": {
                                "organic": [
                                    {
                                        "title": "Test Title",
                                        "url": "https://example.com",
                                        "desc": "Test description",
                                    }
                                ],
                                "top_stories": {
                                    "items": [
                                        {
                                            "time_frame": "2 days ago",
                                            "title": "Test News",
                                            "url": "https://example.com/news",
                                            "source": "Test Source",
                                        }
                                    ]
                                },
                                "images": {
                                    "items": [
                                        {
                                            "url": "https://example.com/image.jpg",
                                            "alt": "Test image description",
                                        }
                                    ]
                                },
                            }
                        }
                    }
                ]
            }
        }

    @patch("src.tools.infoquest_search.infoquest_search_api.requests.post")
    def test_raw_results_success(self, mock_post, wrapper, mock_response_data):
        """A plain query returns ``search_result`` and omits the optional filters."""
        mock_response = Mock()
        mock_response.json.return_value = mock_response_data
        mock_response.raise_for_status.return_value = None
        mock_post.return_value = mock_response

        result = wrapper.raw_results("test query", time_range=0, site="")

        assert result == mock_response_data["search_result"]
        mock_post.assert_called_once()
        call_args = mock_post.call_args
        assert "json" in call_args.kwargs
        assert call_args.kwargs["json"]["query"] == "test query"
        # time_range=0 and site="" must not be forwarded in the payload.
        assert "time_range" not in call_args.kwargs["json"]
        assert "site" not in call_args.kwargs["json"]

    @patch("src.tools.infoquest_search.infoquest_search_api.requests.post")
    def test_raw_results_with_time_range_and_site(self, mock_post, wrapper, mock_response_data):
        """Non-empty ``time_range`` and ``site`` are forwarded in the JSON payload."""
        mock_response = Mock()
        mock_response.json.return_value = mock_response_data
        mock_response.raise_for_status.return_value = None
        mock_post.return_value = mock_response

        result = wrapper.raw_results("test query", time_range=30, site="example.com")

        assert result == mock_response_data["search_result"]
        call_args = mock_post.call_args
        params = call_args.kwargs["json"]
        assert params["time_range"] == 30
        assert params["site"] == "example.com"

    @patch("src.tools.infoquest_search.infoquest_search_api.requests.post")
    def test_raw_results_http_error(self, mock_post, wrapper):
        """An ``HTTPError`` raised by ``raise_for_status`` propagates to the caller."""
        mock_response = Mock()
        mock_response.raise_for_status.side_effect = requests.HTTPError("API Error")
        mock_post.return_value = mock_response

        with pytest.raises(requests.HTTPError):
            wrapper.raw_results("test query", time_range=0, site="")

    # Probe for pytest-asyncio once, at class-definition time. The flag feeds the
    # ``skipif`` markers below; a check placed inside an ``async def`` body would
    # never execute when the plugin is absent, because nothing awaits the
    # coroutine in that case.
    try:
        import pytest_asyncio  # noqa: F401  (imported only to test availability)

        _asyncio_available = True
    except ImportError:
        _asyncio_available = False

    @pytest.mark.skipif(not _asyncio_available, reason="pytest-asyncio is not installed")
    @pytest.mark.asyncio
    async def test_raw_results_async_success(self, wrapper, mock_response_data):
        """Awaiting a stubbed ``raw_results_async`` yields the canned search result.

        NOTE(review): the method under test is itself replaced by a stub, so this
        only verifies that the coroutine can be awaited through the instance;
        consider mocking the HTTP layer instead to cover the real implementation.
        """
        expected = mock_response_data["search_result"]

        async def fake_raw_results_async(self, query, time_range=0, site="", output_format="json"):
            return expected

        # patch.object restores the original attribute even if the await raises.
        with patch.object(InfoQuestAPIWrapper, "raw_results_async", new=fake_raw_results_async):
            result = await wrapper.raw_results_async("test query", time_range=0, site="")

        assert result == expected

    @pytest.mark.skipif(not _asyncio_available, reason="pytest-asyncio is not installed")
    @pytest.mark.asyncio
    async def test_raw_results_async_error(self, wrapper):
        """An exception raised by the stubbed ``raw_results_async`` reaches the caller.

        NOTE(review): as above, the method under test is stubbed; the real error
        path of the wrapper is not exercised here.
        """

        async def failing_raw_results_async(self, query, time_range=0, site="", output_format="json"):
            raise Exception("Error 400: Bad Request")

        with patch.object(InfoQuestAPIWrapper, "raw_results_async", new=failing_raw_results_async):
            with pytest.raises(Exception, match="Error 400: Bad Request"):
                await wrapper.raw_results_async("test query", time_range=0, site="")

    def test_clean_results_with_images(self, wrapper, mock_response_data):
        """Cleaning yields page, news and image entries, in that order."""
        raw_results = mock_response_data["search_result"]["results"]
        cleaned_results = wrapper.clean_results_with_images(raw_results)

        assert len(cleaned_results) == 3

        # Organic hit -> "page" entry.
        page_result = cleaned_results[0]
        assert page_result["type"] == "page"
        assert page_result["title"] == "Test Title"
        assert page_result["url"] == "https://example.com"
        assert page_result["desc"] == "Test description"

        # Top story -> "news" entry.
        news_result = cleaned_results[1]
        assert news_result["type"] == "news"
        assert news_result["time_frame"] == "2 days ago"
        assert news_result["title"] == "Test News"
        assert news_result["url"] == "https://example.com/news"
        assert news_result["source"] == "Test Source"

        # Image item -> "image_url" entry with renamed keys.
        image_result = cleaned_results[2]
        assert image_result["type"] == "image_url"
        assert image_result["image_url"] == "https://example.com/image.jpg"
        assert image_result["image_description"] == "Test image description"

    def test_clean_results_empty_categories(self, wrapper):
        """Empty organic/news/image lists produce an empty cleaned result."""
        data = [
            {
                "content": {
                    "results": {
                        "organic": [],
                        "top_stories": {"items": []},
                        "images": {"items": []},
                    }
                }
            }
        ]

        result = wrapper.clean_results_with_images(data)
        assert len(result) == 0

    def test_clean_results_url_deduplication(self, wrapper):
        """Two organic hits sharing a URL collapse to the first one."""
        data = [
            {
                "content": {
                    "results": {
                        "organic": [
                            {
                                "title": "Test Title 1",
                                "url": "https://example.com",
                                "desc": "Description 1",
                            },
                            {
                                "title": "Test Title 2",
                                "url": "https://example.com",
                                "desc": "Description 2",
                            },
                        ]
                    }
                }
            }
        ]

        result = wrapper.clean_results_with_images(data)
        assert len(result) == 1
        assert result[0]["title"] == "Test Title 1"
|
||||
226
tests/unit/tools/test_infoquest_search_results.py
Normal file
226
tests/unit/tools/test_infoquest_search_results.py
Normal file
@@ -0,0 +1,226 @@
|
||||
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
import json
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
|
||||
|
||||
class TestInfoQuestSearchResults:
    """Tests around the ``InfoQuestSearchResults`` tool.

    The run/arun tests operate on a ``Mock`` stand-in with canned ``_run`` and
    ``_arun`` callables; the initialization tests patch ``__init__`` on the
    real class and only check that construction invokes it once.
    """

    @pytest.fixture
    def search_tool(self):
        """Build a ``Mock`` stand-in whose ``_run``/``_arun`` return canned data."""
        tool = Mock()
        tool.time_range = 30
        tool.site = "example.com"

        def mock_run(query, **kwargs):
            # Fresh structures on every call, as (json_string, raw_dict).
            hit = {
                "title": "Test Title",
                "url": "https://example.com",
                "desc": "Test description",
            }
            cleaned = [{"type": "page", **hit}]
            raw = {"results": [{"content": {"results": {"organic": [dict(hit)]}}}]}
            return json.dumps(cleaned, ensure_ascii=False), raw

        async def mock_arun(query, **kwargs):
            return mock_run(query, **kwargs)

        tool._run = mock_run
        tool._arun = mock_arun
        return tool

    @pytest.fixture
    def sample_raw_results(self):
        """Return a raw result structure holding a single organic hit."""
        organic_hit = {
            "title": "Test Title",
            "url": "https://example.com",
            "desc": "Test description",
        }
        return {"results": [{"content": {"results": {"organic": [organic_hit]}}}]}

    @pytest.fixture
    def sample_cleaned_results(self):
        """Return the cleaned form of the single organic hit."""
        page_entry = {
            "type": "page",
            "title": "Test Title",
            "url": "https://example.com",
            "desc": "Test description",
        }
        return [page_entry]

    def test_init_default_values(self):
        """Constructing with only an API key calls the patched ``__init__`` once."""
        wrapper_target = "src.tools.infoquest_search.infoquest_search_results.InfoQuestAPIWrapper"
        with patch(wrapper_target) as wrapper_cls:
            wrapper_cls.return_value = Mock()

            from src.tools.infoquest_search.infoquest_search_results import InfoQuestSearchResults

            with patch.object(InfoQuestSearchResults, "__init__", return_value=None) as init_spy:
                InfoQuestSearchResults(infoquest_api_key="dummy-key")

                init_spy.assert_called_once()

    def test_init_custom_values(self):
        """Constructing with custom filters calls the patched ``__init__`` once."""
        wrapper_target = "src.tools.infoquest_search.infoquest_search_results.InfoQuestAPIWrapper"
        with patch(wrapper_target) as wrapper_cls:
            wrapper_cls.return_value = Mock()

            from src.tools.infoquest_search.infoquest_search_results import InfoQuestSearchResults

            with patch.object(InfoQuestSearchResults, "__init__", return_value=None) as init_spy:
                InfoQuestSearchResults(
                    time_range=10,
                    site="test.com",
                    infoquest_api_key="dummy-key",
                )

                init_spy.assert_called_once()

    def test_run_success(
        self,
        search_tool,
        sample_raw_results,
        sample_cleaned_results,
    ):
        """The synchronous stub returns a JSON list string plus a raw dict."""
        payload, raw = search_tool._run("test query")

        assert isinstance(payload, str)
        assert isinstance(raw, dict)
        assert "results" in raw

        decoded = json.loads(payload)
        assert isinstance(decoded, list)
        assert len(decoded) > 0

    def test_run_exception(self, search_tool):
        """A ``_run`` replacement reporting an error yields an error payload and empty raw."""

        def mock_run_with_error(query, **kwargs):
            return json.dumps({"error": "API Error"}, ensure_ascii=False), {}

        # patch.object swaps the attribute in and restores it on exit.
        with patch.object(search_tool, "_run", new=mock_run_with_error):
            payload, raw = search_tool._run("test query")

            decoded = json.loads(payload)
            assert "error" in decoded
            assert "API Error" in decoded["error"]
            assert raw == {}

    @pytest.mark.asyncio
    async def test_arun_success(
        self,
        search_tool,
        sample_raw_results,
        sample_cleaned_results,
    ):
        """The asynchronous stub returns a string payload plus a raw dict."""
        payload, raw = await search_tool._arun("test query")

        assert isinstance(payload, str)
        assert isinstance(raw, dict)
        assert "results" in raw

    @pytest.mark.asyncio
    async def test_arun_exception(self, search_tool):
        """An ``_arun`` replacement reporting an error yields an error payload and empty raw."""

        async def mock_arun_with_error(query, **kwargs):
            return json.dumps({"error": "Async API Error"}, ensure_ascii=False), {}

        # patch.object swaps the attribute in and restores it on exit.
        with patch.object(search_tool, "_arun", new=mock_arun_with_error):
            payload, raw = await search_tool._arun("test query")

            decoded = json.loads(payload)
            assert "error" in decoded
            assert "Async API Error" in decoded["error"]
            assert raw == {}

    def test_run_with_run_manager(
        self,
        search_tool,
        sample_raw_results,
        sample_cleaned_results,
    ):
        """Passing a ``run_manager`` keyword to the sync stub still returns (str, dict)."""
        payload, raw = search_tool._run("test query", run_manager=Mock())

        assert isinstance(payload, str)
        assert isinstance(raw, dict)

    @pytest.mark.asyncio
    async def test_arun_with_run_manager(
        self,
        search_tool,
        sample_raw_results,
        sample_cleaned_results,
    ):
        """Passing a ``run_manager`` keyword to the async stub still returns (str, dict)."""
        payload, raw = await search_tool._arun("test query", run_manager=Mock())

        assert isinstance(payload, str)
        assert isinstance(raw, dict)

    def test_api_wrapper_initialization_with_key(self):
        """Constructing with an explicit key calls the patched ``__init__`` once."""
        wrapper_target = "src.tools.infoquest_search.infoquest_search_results.InfoQuestAPIWrapper"
        with patch(wrapper_target) as wrapper_cls:
            wrapper_cls.return_value = Mock()

            from src.tools.infoquest_search.infoquest_search_results import InfoQuestSearchResults

            with patch.object(InfoQuestSearchResults, "__init__", return_value=None) as init_spy:
                InfoQuestSearchResults(infoquest_api_key="test-key")

                init_spy.assert_called_once()
|
||||
Reference in New Issue
Block a user