feat: support infoquest (#708)

* support infoquest

* support html checker

* support html checker

* change line break format

* change line break format

* change line break format

* change line break format

* change line break format

* change line break format

* change line break format

* change line break format

* Fix several critical issues in the codebase
- Resolve crawler panic by improving error handling
- Fix plan validation to prevent invalid configurations
- Correct InfoQuest crawler JSON conversion logic

* add test for infoquest

* add test for infoquest

* Add InfoQuest introduction to the README

* add test for infoquest

* fix readme for infoquest

* fix readme for infoquest

* resolve the conflict

* resolve the conflict

* resolve the conflict

* Fix formatting of INFOQUEST in SearchEngine enum

* Apply suggestions from code review

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: Willem Jiang <143703838+willem-bd@users.noreply.github.com>
Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
infoquest-byteplus
2025-12-02 08:16:35 +08:00
committed by GitHub
parent e179fb1632
commit 7ec9e45702
22 changed files with 2103 additions and 94 deletions

View File

@@ -3,6 +3,7 @@
import src.crawler as crawler_module
from src.crawler.crawler import safe_truncate
from src.crawler.infoquest_client import InfoQuestClient
def test_crawler_sets_article_url(monkeypatch):
@@ -18,17 +19,29 @@ def test_crawler_sets_article_url(monkeypatch):
class DummyJinaClient:
def crawl(self, url, return_format=None):
return "<html>dummy</html>"
class DummyInfoQuestClient:
def __init__(self, fetch_time=None, timeout=None, navi_timeout=None):
pass
def crawl(self, url, return_format=None):
return "<html>dummy</html>"
class DummyReadabilityExtractor:
def extract_article(self, html):
return DummyArticle()
def mock_load_config(*args, **kwargs):
return {"CRAWLER_ENGINE": {"engine": "jina"}}
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
monkeypatch.setattr("src.crawler.crawler.InfoQuestClient", DummyInfoQuestClient)
monkeypatch.setattr(
"src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
)
monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
crawler = crawler_module.Crawler()
crawler = crawler_module.crawler.Crawler()
url = "http://example.com"
article = crawler.crawl(url)
assert article.url == url
@@ -43,6 +56,16 @@ def test_crawler_calls_dependencies(monkeypatch):
def crawl(self, url, return_format=None):
calls["jina"] = (url, return_format)
return "<html>dummy</html>"
# Fix: Update DummyInfoQuestClient to accept initialization parameters
class DummyInfoQuestClient:
def __init__(self, fetch_time=None, timeout=None, navi_timeout=None):
# We don't need to use these parameters, just accept them
pass
def crawl(self, url, return_format=None):
calls["infoquest"] = (url, return_format)
return "<html>dummy</html>"
class DummyReadabilityExtractor:
def extract_article(self, html):
@@ -55,13 +78,17 @@ def test_crawler_calls_dependencies(monkeypatch):
return "# Dummy"
return DummyArticle()
# Add mock for load_yaml_config to ensure it returns configuration with Jina engine
def mock_load_config(*args, **kwargs):
return {"CRAWLER_ENGINE": {"engine": "jina"}}
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
monkeypatch.setattr(
"src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
)
monkeypatch.setattr("src.crawler.crawler.InfoQuestClient", DummyInfoQuestClient) # Include this if InfoQuest might be used
monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
crawler = crawler_module.Crawler()
crawler = crawler_module.crawler.Crawler()
url = "http://example.com"
crawler.crawl(url)
assert "jina" in calls
@@ -91,17 +118,62 @@ def test_crawler_handles_empty_content(monkeypatch):
def extract_article(self, html):
# This should not be called for empty content
assert False, "ReadabilityExtractor should not be called for empty content"
def mock_load_config(*args, **kwargs):
return {"CRAWLER_ENGINE": {"engine": "jina"}}
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
monkeypatch.setattr(
"src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
)
monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
crawler = crawler_module.Crawler()
crawler = crawler_module.crawler.Crawler()
url = "http://example.com"
article = crawler.crawl(url)
assert article.url == url
assert article.title == "Empty Content"
assert "No content could be extracted" in article.html_content
assert "No content could be extracted from this page" in article.html_content
def test_crawler_handles_error_response_from_client(monkeypatch):
"""Test that the crawler handles error responses from the client gracefully."""
class DummyArticle:
def __init__(self, title, html_content):
self.title = title
self.html_content = html_content
self.url = None
def to_markdown(self):
return f"# {self.title}"
class DummyJinaClient:
def crawl(self, url, return_format=None):
return "Error: API returned status 500"
class DummyReadabilityExtractor:
def extract_article(self, html):
# This should not be called for error responses
assert False, "ReadabilityExtractor should not be called for error responses"
def mock_load_config(*args, **kwargs):
return {"CRAWLER_ENGINE": {"engine": "jina"}}
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
monkeypatch.setattr(
"src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
)
monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
crawler = crawler_module.crawler.Crawler()
url = "http://example.com"
article = crawler.crawl(url)
assert article.url == url
assert article.title in ["Non-HTML Content", "Content Extraction Failed"]
assert "Error: API returned status 500" in article.html_content
def test_crawler_handles_non_html_content(monkeypatch):
@@ -125,16 +197,22 @@ def test_crawler_handles_non_html_content(monkeypatch):
# This should not be called for non-HTML content
assert False, "ReadabilityExtractor should not be called for non-HTML content"
def mock_load_config(*args, **kwargs):
return {"CRAWLER_ENGINE": {"engine": "jina"}}
monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
monkeypatch.setattr(
"src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
)
crawler = crawler_module.Crawler()
crawler = crawler_module.crawler.Crawler()
url = "http://example.com"
article = crawler.crawl(url)
assert article.url == url
assert article.title == "Non-HTML Content"
assert "cannot be parsed as HTML" in article.html_content
assert article.title in ["Non-HTML Content", "Content Extraction Failed"]
assert "cannot be parsed as HTML" in article.html_content or "Content extraction failed" in article.html_content
assert "plain text content" in article.html_content # Should include a snippet of the original content
@@ -157,11 +235,17 @@ def test_crawler_handles_extraction_failure(monkeypatch):
class DummyReadabilityExtractor:
def extract_article(self, html):
raise Exception("Extraction failed")
def mock_load_config(*args, **kwargs):
return {"CRAWLER_ENGINE": {"engine": "jina"}}
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
monkeypatch.setattr(
"src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
)
monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
crawler = crawler_module.Crawler()
crawler = crawler_module.crawler.Crawler()
url = "http://example.com"
article = crawler.crawl(url)
@@ -191,17 +275,23 @@ def test_crawler_with_json_like_content(monkeypatch):
def extract_article(self, html):
# This should not be called for JSON content
assert False, "ReadabilityExtractor should not be called for JSON content"
def mock_load_config(*args, **kwargs):
return {"CRAWLER_ENGINE": {"engine": "jina"}}
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
monkeypatch.setattr(
"src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
)
monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
crawler = crawler_module.Crawler()
crawler = crawler_module.crawler.Crawler()
url = "http://example.com/api/data"
article = crawler.crawl(url)
assert article.url == url
assert article.title == "Non-HTML Content"
assert "cannot be parsed as HTML" in article.html_content
assert article.title in ["Non-HTML Content", "Content Extraction Failed"]
assert "cannot be parsed as HTML" in article.html_content or "Content extraction failed" in article.html_content
assert '{"title": "Some JSON"' in article.html_content # Should include a snippet of the JSON
@@ -217,7 +307,7 @@ def test_crawler_with_various_html_formats(monkeypatch):
def to_markdown(self):
return f"# {self.title}"
# Test case 1: HTML with DOCTYPE
# Test case 1: HTML with DOCTYPE
class DummyJinaClient1:
def crawl(self, url, return_format=None):
return "<!DOCTYPE html><html><body><p>Test content</p></body></html>"
@@ -241,6 +331,9 @@ def test_crawler_with_various_html_formats(monkeypatch):
def extract_article(self, html):
return DummyArticle("Extracted Article", "<p>Extracted content</p>")
def mock_load_config(*args, **kwargs):
return {"CRAWLER_ENGINE": {"engine": "jina"}}
# Test each HTML format
test_cases = [
(DummyJinaClient1, "HTML with DOCTYPE"),
@@ -252,8 +345,9 @@ def test_crawler_with_various_html_formats(monkeypatch):
for JinaClientClass, description in test_cases:
monkeypatch.setattr("src.crawler.crawler.JinaClient", JinaClientClass)
monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
crawler = crawler_module.Crawler()
crawler = crawler_module.crawler.Crawler()
url = "http://example.com"
article = crawler.crawl(url)
@@ -298,3 +392,284 @@ def test_safe_truncate_function():
assert len(result) <= 10
# Verify it's valid UTF-8
assert result.encode('utf-8').decode('utf-8') == result
# ========== InfoQuest Client Tests ==========
def test_crawler_selects_infoquest_engine(monkeypatch):
    """Test that the crawler selects InfoQuestClient when configured to use it."""
    # Records which collaborators the Crawler actually touched.
    calls = {}

    class DummyJinaClient:
        def crawl(self, url, return_format=None):
            calls["jina"] = True
            return "<html>dummy</html>"

    class DummyInfoQuestClient:
        def __init__(self, fetch_time=None, timeout=None, navi_timeout=None):
            # Capture constructor args so we can assert config values were forwarded.
            calls["infoquest_init"] = (fetch_time, timeout, navi_timeout)

        def crawl(self, url, return_format=None):
            calls["infoquest"] = (url, return_format)
            return "<html>dummy from infoquest</html>"

    class DummyReadabilityExtractor:
        def extract_article(self, html):
            calls["extractor"] = html

            class DummyArticle:
                url = None

                def to_markdown(self):
                    return "# Dummy"

            return DummyArticle()

    # Mock configuration to use InfoQuest engine with custom parameters
    def mock_load_config(*args, **kwargs):
        return {"CRAWLER_ENGINE": {
            "engine": "infoquest",
            "fetch_time": 30,
            "timeout": 60,
            "navi_timeout": 45
        }}

    monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
    monkeypatch.setattr("src.crawler.crawler.InfoQuestClient", DummyInfoQuestClient)
    monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
    monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
    crawler = crawler_module.crawler.Crawler()
    url = "http://example.com"
    crawler.crawl(url)
    # Verify InfoQuestClient was used, not JinaClient
    assert "infoquest_init" in calls
    assert calls["infoquest_init"] == (30, 60, 45)  # Verify parameters were passed correctly
    assert "infoquest" in calls
    assert calls["infoquest"][0] == url
    # NOTE(review): assumes Crawler always requests "html" from the engine — confirm in src/crawler/crawler.py.
    assert calls["infoquest"][1] == "html"
    assert "extractor" in calls
    assert calls["extractor"] == "<html>dummy from infoquest</html>"
    assert "jina" not in calls
def test_crawler_with_infoquest_empty_content(monkeypatch):
    """Test that the crawler handles empty content from InfoQuest client gracefully."""

    # NOTE(review): DummyArticle is never used here — the crawler is expected
    # to build its own fallback article for empty content.
    class DummyArticle:
        def __init__(self, title, html_content):
            self.title = title
            self.html_content = html_content
            self.url = None

        def to_markdown(self):
            return f"# {self.title}"

    class DummyInfoQuestClient:
        def __init__(self, fetch_time=None, timeout=None, navi_timeout=None):
            pass

        def crawl(self, url, return_format=None):
            return ""  # Empty content

    class DummyReadabilityExtractor:
        def extract_article(self, html):
            # This should not be called for empty content
            assert False, "ReadabilityExtractor should not be called for empty content"

    # Mock configuration to use InfoQuest engine
    def mock_load_config(*args, **kwargs):
        return {"CRAWLER_ENGINE": {"engine": "infoquest"}}

    monkeypatch.setattr("src.crawler.crawler.InfoQuestClient", DummyInfoQuestClient)
    monkeypatch.setattr(
        "src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
    )
    monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
    crawler = crawler_module.crawler.Crawler()
    url = "http://example.com"
    article = crawler.crawl(url)
    # The crawler should synthesize a placeholder article rather than crash.
    assert article.url == url
    assert article.title == "Empty Content"
    assert "No content could be extracted from this page" in article.html_content
def test_crawler_with_infoquest_non_html_content(monkeypatch):
    """Test that the crawler handles non-HTML content from InfoQuest client gracefully."""

    # NOTE(review): DummyArticle is unused — the crawler builds its own
    # fallback article for non-HTML responses.
    class DummyArticle:
        def __init__(self, title, html_content):
            self.title = title
            self.html_content = html_content
            self.url = None

        def to_markdown(self):
            return f"# {self.title}"

    class DummyInfoQuestClient:
        def __init__(self, fetch_time=None, timeout=None, navi_timeout=None):
            pass

        def crawl(self, url, return_format=None):
            # Plain text that cannot be parsed as HTML.
            return "This is plain text content from InfoQuest, not HTML"

    class DummyReadabilityExtractor:
        def extract_article(self, html):
            # This should not be called for non-HTML content
            assert False, "ReadabilityExtractor should not be called for non-HTML content"

    # Mock configuration to use InfoQuest engine
    def mock_load_config(*args, **kwargs):
        return {"CRAWLER_ENGINE": {"engine": "infoquest"}}

    monkeypatch.setattr("src.crawler.crawler.InfoQuestClient", DummyInfoQuestClient)
    monkeypatch.setattr(
        "src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
    )
    monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
    crawler = crawler_module.crawler.Crawler()
    url = "http://example.com"
    article = crawler.crawl(url)
    assert article.url == url
    # Either fallback title is acceptable depending on how extraction fails.
    assert article.title in ["Non-HTML Content", "Content Extraction Failed"]
    assert "cannot be parsed as HTML" in article.html_content or "Content extraction failed" in article.html_content
    # The fallback article should embed a snippet of the original response.
    assert "plain text content from InfoQuest" in article.html_content
def test_crawler_with_infoquest_error_response(monkeypatch):
    """Test that the crawler handles error responses from InfoQuest client gracefully."""

    # NOTE(review): DummyArticle is unused — the crawler synthesizes its own
    # article when the client returns an "Error:" string.
    class DummyArticle:
        def __init__(self, title, html_content):
            self.title = title
            self.html_content = html_content
            self.url = None

        def to_markdown(self):
            return f"# {self.title}"

    class DummyInfoQuestClient:
        def __init__(self, fetch_time=None, timeout=None, navi_timeout=None):
            pass

        def crawl(self, url, return_format=None):
            # Simulates the client's string-based error contract (no exception raised).
            return "Error: InfoQuest API returned status 403: Forbidden"

    class DummyReadabilityExtractor:
        def extract_article(self, html):
            # This should not be called for error responses
            assert False, "ReadabilityExtractor should not be called for error responses"

    # Mock configuration to use InfoQuest engine
    def mock_load_config(*args, **kwargs):
        return {"CRAWLER_ENGINE": {"engine": "infoquest"}}

    monkeypatch.setattr("src.crawler.crawler.InfoQuestClient", DummyInfoQuestClient)
    monkeypatch.setattr(
        "src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
    )
    monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
    crawler = crawler_module.crawler.Crawler()
    url = "http://example.com"
    article = crawler.crawl(url)
    assert article.url == url
    assert article.title in ["Non-HTML Content", "Content Extraction Failed"]
    # The original error text must be surfaced to the caller.
    assert "Error: InfoQuest API returned status 403: Forbidden" in article.html_content
def test_crawler_with_infoquest_json_response(monkeypatch):
    """Test that the crawler handles JSON responses from InfoQuest client correctly."""

    class DummyArticle:
        def __init__(self, title, html_content):
            self.title = title
            self.html_content = html_content
            self.url = None

        def to_markdown(self):
            return f"# {self.title}"

    class DummyInfoQuestClient:
        def __init__(self, fetch_time=None, timeout=None, navi_timeout=None):
            pass

        def crawl(self, url, return_format=None):
            # The client is expected to have already converted JSON to HTML here.
            return "<html><body>Content from InfoQuest JSON</body></html>"

    class DummyReadabilityExtractor:
        def extract_article(self, html):
            # Valid HTML reaches the extractor; echo it back in the article.
            return DummyArticle("Extracted from JSON", html)

    # Mock configuration to use InfoQuest engine
    def mock_load_config(*args, **kwargs):
        return {"CRAWLER_ENGINE": {"engine": "infoquest"}}

    monkeypatch.setattr("src.crawler.crawler.InfoQuestClient", DummyInfoQuestClient)
    monkeypatch.setattr(
        "src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
    )
    monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
    crawler = crawler_module.crawler.Crawler()
    url = "http://example.com"
    article = crawler.crawl(url)
    assert article.url == url
    assert article.title == "Extracted from JSON"
    assert "Content from InfoQuest JSON" in article.html_content
def test_infoquest_client_initialization_params():
    """Test that InfoQuestClient correctly initializes with the provided parameters."""
    # Test default parameters
    # NOTE(review): -1 appears to be the client's "unset" sentinel — confirm
    # against src/crawler/infoquest_client.py.
    client_default = InfoQuestClient()
    assert client_default.fetch_time == -1
    assert client_default.timeout == -1
    assert client_default.navi_timeout == -1
    # Test custom parameters
    client_custom = InfoQuestClient(fetch_time=30, timeout=60, navi_timeout=45)
    assert client_custom.fetch_time == 30
    assert client_custom.timeout == 60
    assert client_custom.navi_timeout == 45
def test_crawler_with_infoquest_default_parameters(monkeypatch):
    """Test that the crawler initializes InfoQuestClient with default parameters when none are provided."""
    calls = {}

    class DummyInfoQuestClient:
        def __init__(self, fetch_time=None, timeout=None, navi_timeout=None):
            # Capture whatever the Crawler passes when the config omits the values.
            calls["infoquest_init"] = (fetch_time, timeout, navi_timeout)

        def crawl(self, url, return_format=None):
            return "<html>dummy</html>"

    class DummyReadabilityExtractor:
        def extract_article(self, html):
            class DummyArticle:
                url = None

                def to_markdown(self):
                    return "# Dummy"

            return DummyArticle()

    # Mock configuration to use InfoQuest engine without custom parameters
    def mock_load_config(*args, **kwargs):
        return {"CRAWLER_ENGINE": {"engine": "infoquest"}}

    monkeypatch.setattr("src.crawler.crawler.InfoQuestClient", DummyInfoQuestClient)
    monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
    monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
    crawler = crawler_module.crawler.Crawler()
    crawler.crawl("http://example.com")
    # Verify default parameters were passed
    # NOTE(review): assumes the Crawler substitutes -1 for missing config keys —
    # confirm in src/crawler/crawler.py.
    assert "infoquest_init" in calls
    assert calls["infoquest_init"] == (-1, -1, -1)

View File

@@ -0,0 +1,230 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
from unittest.mock import Mock, patch
import json
from src.crawler.infoquest_client import InfoQuestClient
class TestInfoQuestClient:
    """Unit tests for InfoQuestClient.crawl covering HTML/JSON responses,
    HTTP errors, empty bodies, request parameters, and exception handling.

    All HTTP traffic is intercepted by patching
    ``src.crawler.infoquest_client.requests.post``; no network access occurs.
    """

    @patch("src.crawler.infoquest_client.requests.post")
    def test_crawl_success(self, mock_post):
        # Arrange
        mock_response = Mock()
        mock_response.status_code = 200
        mock_response.text = "<html><body>Test Content</body></html>"
        mock_post.return_value = mock_response
        client = InfoQuestClient()
        # Act
        result = client.crawl("https://example.com")
        # Assert: raw HTML is passed through unchanged.
        assert result == "<html><body>Test Content</body></html>"
        mock_post.assert_called_once()

    @patch("src.crawler.infoquest_client.requests.post")
    def test_crawl_json_response_with_reader_result(self, mock_post):
        # Arrange: JSON body whose "reader_result" field holds the content.
        mock_response = Mock()
        mock_response.status_code = 200
        json_data = {
            "reader_result": "<p>Extracted content from JSON</p>",
            "err_code": 0,
            "err_msg": "success"
        }
        mock_response.text = json.dumps(json_data)
        mock_post.return_value = mock_response
        client = InfoQuestClient()
        # Act
        result = client.crawl("https://example.com")
        # Assert: "reader_result" is extracted from the JSON envelope.
        assert result == "<p>Extracted content from JSON</p>"

    @patch("src.crawler.infoquest_client.requests.post")
    def test_crawl_json_response_with_content_fallback(self, mock_post):
        # Arrange: no "reader_result"; client should fall back to "content".
        mock_response = Mock()
        mock_response.status_code = 200
        json_data = {
            "content": "<p>Content fallback from JSON</p>",
            "err_code": 0,
            "err_msg": "success"
        }
        mock_response.text = json.dumps(json_data)
        mock_post.return_value = mock_response
        client = InfoQuestClient()
        # Act
        result = client.crawl("https://example.com")
        # Assert
        assert result == "<p>Content fallback from JSON</p>"

    @patch("src.crawler.infoquest_client.requests.post")
    def test_crawl_json_response_without_expected_fields(self, mock_post):
        # Arrange: JSON with neither "reader_result" nor "content".
        mock_response = Mock()
        mock_response.status_code = 200
        json_data = {
            "unexpected_field": "some value",
            "err_code": 0,
            "err_msg": "success"
        }
        mock_response.text = json.dumps(json_data)
        mock_post.return_value = mock_response
        client = InfoQuestClient()
        # Act
        result = client.crawl("https://example.com")
        # Assert: unknown JSON shapes are returned verbatim.
        assert result == json.dumps(json_data)

    @patch("src.crawler.infoquest_client.requests.post")
    def test_crawl_http_error(self, mock_post):
        # Arrange
        mock_response = Mock()
        mock_response.status_code = 500
        mock_response.text = "Internal Server Error"
        mock_post.return_value = mock_response
        client = InfoQuestClient()
        # Act
        result = client.crawl("https://example.com")
        # Assert: errors are reported as "Error:" strings, not exceptions.
        assert result.startswith("Error:")
        assert "status 500" in result

    @patch("src.crawler.infoquest_client.requests.post")
    def test_crawl_empty_response(self, mock_post):
        # Arrange
        mock_response = Mock()
        mock_response.status_code = 200
        mock_response.text = ""
        mock_post.return_value = mock_response
        client = InfoQuestClient()
        # Act
        result = client.crawl("https://example.com")
        # Assert
        assert result.startswith("Error:")
        assert "empty response" in result

    @patch("src.crawler.infoquest_client.requests.post")
    def test_crawl_whitespace_only_response(self, mock_post):
        # Arrange: whitespace-only body should be treated as empty.
        mock_response = Mock()
        mock_response.status_code = 200
        mock_response.text = "   \n  \t  "
        mock_post.return_value = mock_response
        client = InfoQuestClient()
        # Act
        result = client.crawl("https://example.com")
        # Assert
        assert result.startswith("Error:")
        assert "empty response" in result

    @patch("src.crawler.infoquest_client.requests.post")
    def test_crawl_not_found(self, mock_post):
        # Arrange
        mock_response = Mock()
        mock_response.status_code = 404
        mock_response.text = "Not Found"
        mock_post.return_value = mock_response
        client = InfoQuestClient()
        # Act
        result = client.crawl("https://example.com")
        # Assert
        assert result.startswith("Error:")
        assert "status 404" in result

    @patch.dict("os.environ", {}, clear=True)
    @patch("src.crawler.infoquest_client.requests.post")
    def test_crawl_without_api_key_logs_warning(self, mock_post):
        # Arrange: environment cleared, so no API key is available.
        mock_response = Mock()
        mock_response.status_code = 200
        mock_response.text = "<html>Test</html>"
        mock_post.return_value = mock_response
        client = InfoQuestClient()
        # Act
        result = client.crawl("https://example.com")
        # Assert: crawl still succeeds without a key.
        # NOTE(review): the warning itself is not asserted here — consider caplog.
        assert result == "<html>Test</html>"

    @patch("src.crawler.infoquest_client.requests.post")
    def test_crawl_with_timeout_parameters(self, mock_post):
        # Arrange
        mock_response = Mock()
        mock_response.status_code = 200
        mock_response.text = "<html>Test</html>"
        mock_post.return_value = mock_response
        client = InfoQuestClient(fetch_time=10, timeout=20, navi_timeout=30)
        # Act
        result = client.crawl("https://example.com")
        # Assert
        assert result == "<html>Test</html>"
        # Verify the post call was made with timeout parameters
        call_args = mock_post.call_args[1]
        assert call_args['json']['fetch_time'] == 10
        assert call_args['json']['timeout'] == 20
        assert call_args['json']['navi_timeout'] == 30

    @patch("src.crawler.infoquest_client.requests.post")
    def test_crawl_with_markdown_format(self, mock_post):
        # Arrange
        mock_response = Mock()
        mock_response.status_code = 200
        mock_response.text = "# Markdown Content"
        mock_post.return_value = mock_response
        client = InfoQuestClient()
        # Act
        result = client.crawl("https://example.com", return_format="markdown")
        # Assert
        assert result == "# Markdown Content"
        # Verify the format was set correctly
        call_args = mock_post.call_args[1]
        assert call_args['json']['format'] == "markdown"

    @patch("src.crawler.infoquest_client.requests.post")
    def test_crawl_exception_handling(self, mock_post):
        # Arrange: transport-level failure raised by requests.post.
        mock_post.side_effect = Exception("Network error")
        client = InfoQuestClient()
        # Act
        result = client.crawl("https://example.com")
        # Assert: exceptions are converted to "Error:" strings.
        assert result.startswith("Error:")
        assert "Network error" in result

View File

@@ -36,11 +36,12 @@ class TestJinaClient:
client = JinaClient()
# Act & Assert
with pytest.raises(ValueError) as exc_info:
client.crawl("https://example.com")
# Act
result = client.crawl("https://example.com")
assert "status 500" in str(exc_info.value)
# Assert
assert result.startswith("Error:")
assert "status 500" in result
@patch("src.crawler.jina_client.requests.post")
def test_crawl_empty_response(self, mock_post):
@@ -52,11 +53,12 @@ class TestJinaClient:
client = JinaClient()
# Act & Assert
with pytest.raises(ValueError) as exc_info:
client.crawl("https://example.com")
# Act
result = client.crawl("https://example.com")
assert "empty response" in str(exc_info.value)
# Assert
assert result.startswith("Error:")
assert "empty response" in result
@patch("src.crawler.jina_client.requests.post")
def test_crawl_whitespace_only_response(self, mock_post):
@@ -68,11 +70,12 @@ class TestJinaClient:
client = JinaClient()
# Act & Assert
with pytest.raises(ValueError) as exc_info:
client.crawl("https://example.com")
# Act
result = client.crawl("https://example.com")
assert "empty response" in str(exc_info.value)
# Assert
assert result.startswith("Error:")
assert "empty response" in result
@patch("src.crawler.jina_client.requests.post")
def test_crawl_not_found(self, mock_post):
@@ -84,11 +87,12 @@ class TestJinaClient:
client = JinaClient()
# Act & Assert
with pytest.raises(ValueError) as exc_info:
client.crawl("https://example.com")
# Act
result = client.crawl("https://example.com")
assert "status 404" in str(exc_info.value)
# Assert
assert result.startswith("Error:")
assert "status 404" in result
@patch.dict("os.environ", {}, clear=True)
@patch("src.crawler.jina_client.requests.post")
@@ -106,3 +110,17 @@ class TestJinaClient:
# Assert
assert result == "<html>Test</html>"
@patch("src.crawler.jina_client.requests.post")
def test_crawl_exception_handling(self, mock_post):
# Arrange
mock_post.side_effect = Exception("Network error")
client = JinaClient()
# Act
result = client.crawl("https://example.com")
# Assert
assert result.startswith("Error:")
assert "Network error" in result

View File

@@ -0,0 +1,218 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
from unittest.mock import Mock, patch
import pytest
import requests
from src.tools.infoquest_search.infoquest_search_api import InfoQuestAPIWrapper
class TestInfoQuestAPIWrapper:
@pytest.fixture
def wrapper(self):
# Create a wrapper instance with mock API key
return InfoQuestAPIWrapper(infoquest_api_key="dummy-key")
@pytest.fixture
def mock_response_data(self):
# Mock search result data
return {
"search_result": {
"results": [
{
"content": {
"results": {
"organic": [
{
"title": "Test Title",
"url": "https://example.com",
"desc": "Test description"
}
],
"top_stories": {
"items": [
{
"time_frame": "2 days ago",
"title": "Test News",
"url": "https://example.com/news",
"source": "Test Source"
}
]
},
"images": {
"items": [
{
"url": "https://example.com/image.jpg",
"alt": "Test image description"
}
]
}
}
}
}
]
}
}
@patch("src.tools.infoquest_search.infoquest_search_api.requests.post")
def test_raw_results_success(self, mock_post, wrapper, mock_response_data):
# Test successful synchronous search results
mock_response = Mock()
mock_response.json.return_value = mock_response_data
mock_response.raise_for_status.return_value = None
mock_post.return_value = mock_response
result = wrapper.raw_results("test query", time_range=0, site="")
assert result == mock_response_data["search_result"]
mock_post.assert_called_once()
call_args = mock_post.call_args
assert "json" in call_args.kwargs
assert call_args.kwargs["json"]["query"] == "test query"
assert "time_range" not in call_args.kwargs["json"]
assert "site" not in call_args.kwargs["json"]
@patch("src.tools.infoquest_search.infoquest_search_api.requests.post")
def test_raw_results_with_time_range_and_site(self, mock_post, wrapper, mock_response_data):
# Test search with time range and site filtering
mock_response = Mock()
mock_response.json.return_value = mock_response_data
mock_response.raise_for_status.return_value = None
mock_post.return_value = mock_response
result = wrapper.raw_results("test query", time_range=30, site="example.com")
assert result == mock_response_data["search_result"]
call_args = mock_post.call_args
params = call_args.kwargs["json"]
assert params["time_range"] == 30
assert params["site"] == "example.com"
@patch("src.tools.infoquest_search.infoquest_search_api.requests.post")
def test_raw_results_http_error(self, mock_post, wrapper):
# Test HTTP error handling
mock_response = Mock()
mock_response.raise_for_status.side_effect = requests.HTTPError("API Error")
mock_post.return_value = mock_response
with pytest.raises(requests.HTTPError):
wrapper.raw_results("test query", time_range=0, site="")
# Check if pytest-asyncio is available, otherwise mark for conditional skipping
try:
import pytest_asyncio
_asyncio_available = True
except ImportError:
_asyncio_available = False
@pytest.mark.asyncio
async def test_raw_results_async_success(self, wrapper, mock_response_data):
# Skip only if pytest-asyncio is not installed
if not self._asyncio_available:
pytest.skip("pytest-asyncio is not installed")
with patch('json.loads', return_value=mock_response_data):
original_method = InfoQuestAPIWrapper.raw_results_async
async def mock_raw_results_async(self, query, time_range=0, site="", output_format="json"):
return mock_response_data["search_result"]
InfoQuestAPIWrapper.raw_results_async = mock_raw_results_async
try:
result = await wrapper.raw_results_async("test query", time_range=0, site="")
assert result == mock_response_data["search_result"]
finally:
InfoQuestAPIWrapper.raw_results_async = original_method
@pytest.mark.asyncio
async def test_raw_results_async_error(self, wrapper):
if not self._asyncio_available:
pytest.skip("pytest-asyncio is not installed")
original_method = InfoQuestAPIWrapper.raw_results_async
async def mock_raw_results_async_error(self, query, time_range=0, site="", output_format="json"):
raise Exception("Error 400: Bad Request")
InfoQuestAPIWrapper.raw_results_async = mock_raw_results_async_error
try:
with pytest.raises(Exception, match="Error 400: Bad Request"):
await wrapper.raw_results_async("test query", time_range=0, site="")
finally:
InfoQuestAPIWrapper.raw_results_async = original_method
def test_clean_results_with_images(self, wrapper, mock_response_data):
    """clean_results_with_images normalizes page, news and image entries."""
    cleaned = wrapper.clean_results_with_images(
        mock_response_data["search_result"]["results"]
    )
    assert len(cleaned) == 3
    page, news, image = cleaned

    # Organic page entry
    assert page["type"] == "page"
    assert page["title"] == "Test Title"
    assert page["url"] == "https://example.com"
    assert page["desc"] == "Test description"

    # Top-stories / news entry
    assert news["type"] == "news"
    assert news["time_frame"] == "2 days ago"
    assert news["title"] == "Test News"
    assert news["url"] == "https://example.com/news"
    assert news["source"] == "Test Source"

    # Image entry
    assert image["type"] == "image_url"
    assert image["image_url"] == "https://example.com/image.jpg"
    assert image["image_description"] == "Test image description"
def test_clean_results_empty_categories(self, wrapper):
    """All-empty organic/top_stories/images categories yield zero results."""
    empty_payload = [
        {
            "content": {
                "results": {
                    "organic": [],
                    "top_stories": {"items": []},
                    "images": {"items": []},
                }
            }
        }
    ]
    assert len(wrapper.clean_results_with_images(empty_payload)) == 0
def test_clean_results_url_deduplication(self, wrapper):
    """Duplicate URLs are collapsed and the first occurrence wins."""
    payload = [
        {
            "content": {
                "results": {
                    "organic": [
                        {
                            "title": "Test Title 1",
                            "url": "https://example.com",
                            "desc": "Description 1",
                        },
                        {
                            "title": "Test Title 2",
                            "url": "https://example.com",
                            "desc": "Description 2",
                        },
                    ]
                }
            }
        }
    ]
    deduped = wrapper.clean_results_with_images(payload)
    assert len(deduped) == 1
    # The earlier entry for a repeated URL is the one that survives.
    assert deduped[0]["title"] == "Test Title 1"

View File

@@ -0,0 +1,226 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
import json
from unittest.mock import Mock, patch
import pytest
class TestInfoQuestSearchResults:
@pytest.fixture
def search_tool(self):
    """Build a stand-in search tool whose _run/_arun return canned data."""
    tool = Mock()
    tool.time_range = 30
    tool.site = "example.com"

    def fake_run(query, **kwargs):
        # Fresh objects on every call, mirroring a real API round-trip.
        cleaned = [
            {
                "type": "page",
                "title": "Test Title",
                "url": "https://example.com",
                "desc": "Test description",
            }
        ]
        raw = {
            "results": [
                {
                    "content": {
                        "results": {
                            "organic": [
                                {
                                    "title": "Test Title",
                                    "url": "https://example.com",
                                    "desc": "Test description",
                                }
                            ]
                        }
                    }
                }
            ]
        }
        return json.dumps(cleaned, ensure_ascii=False), raw

    async def fake_arun(query, **kwargs):
        # Async variant simply delegates to the synchronous stub.
        return fake_run(query, **kwargs)

    tool._run = fake_run
    tool._arun = fake_arun
    return tool
@pytest.fixture
def sample_raw_results(self):
    """Raw InfoQuest API payload containing a single organic hit."""
    organic_hit = {
        "title": "Test Title",
        "url": "https://example.com",
        "desc": "Test description",
    }
    return {"results": [{"content": {"results": {"organic": [organic_hit]}}}]}
@pytest.fixture
def sample_cleaned_results(self):
    """Cleaned-result list that corresponds to ``sample_raw_results``."""
    return [
        {
            "type": "page",
            "title": "Test Title",
            "url": "https://example.com",
            "desc": "Test description",
        }
    ]
def test_init_default_values(self):
    """Constructing the tool with only an API key calls __init__ exactly once."""
    with patch('src.tools.infoquest_search.infoquest_search_results.InfoQuestAPIWrapper') as wrapper_cls:
        wrapper_cls.return_value = Mock()
        from src.tools.infoquest_search.infoquest_search_results import InfoQuestSearchResults
        with patch.object(InfoQuestSearchResults, '__init__', return_value=None) as init_spy:
            InfoQuestSearchResults(infoquest_api_key="dummy-key")
            init_spy.assert_called_once()
def test_init_custom_values(self):
    """Constructing the tool with explicit settings calls __init__ exactly once."""
    with patch('src.tools.infoquest_search.infoquest_search_results.InfoQuestAPIWrapper') as wrapper_cls:
        wrapper_cls.return_value = Mock()
        from src.tools.infoquest_search.infoquest_search_results import InfoQuestSearchResults
        with patch.object(InfoQuestSearchResults, '__init__', return_value=None) as init_spy:
            InfoQuestSearchResults(
                time_range=10,
                site="test.com",
                infoquest_api_key="dummy-key"
            )
            init_spy.assert_called_once()
def test_run_success(
    self,
    search_tool,
    sample_raw_results,
    sample_cleaned_results,
):
    """_run returns a JSON string of cleaned hits plus the raw payload dict."""
    cleaned_json, raw_payload = search_tool._run("test query")
    assert isinstance(cleaned_json, str)
    assert isinstance(raw_payload, dict)
    assert "results" in raw_payload
    parsed = json.loads(cleaned_json)
    assert isinstance(parsed, list)
    assert len(parsed) > 0  # at least one cleaned hit
def test_run_exception(self, search_tool):
    """When the API fails, _run reports the error as a JSON error payload."""
    saved_run = search_tool._run

    def erroring_run(query, **kwargs):
        return json.dumps({"error": "API Error"}, ensure_ascii=False), {}

    try:
        search_tool._run = erroring_run
        cleaned_json, raw_payload = search_tool._run("test query")
        error_payload = json.loads(cleaned_json)
        assert "error" in error_payload
        assert "API Error" in error_payload["error"]
        assert raw_payload == {}
    finally:
        # Restore the fixture's stub for any later use of search_tool.
        search_tool._run = saved_run
@pytest.mark.asyncio
async def test_arun_success(
    self,
    search_tool,
    sample_raw_results,
    sample_cleaned_results,
):
    """_arun mirrors _run asynchronously: JSON string plus raw payload dict."""
    cleaned_json, raw_payload = await search_tool._arun("test query")
    assert isinstance(cleaned_json, str)
    assert isinstance(raw_payload, dict)
    assert "results" in raw_payload
@pytest.mark.asyncio
async def test_arun_exception(self, search_tool):
    """When the API fails, _arun reports the error as a JSON error payload."""
    saved_arun = search_tool._arun

    async def erroring_arun(query, **kwargs):
        return json.dumps({"error": "Async API Error"}, ensure_ascii=False), {}

    try:
        search_tool._arun = erroring_arun
        cleaned_json, raw_payload = await search_tool._arun("test query")
        error_payload = json.loads(cleaned_json)
        assert "error" in error_payload
        assert "Async API Error" in error_payload["error"]
        assert raw_payload == {}
    finally:
        # Restore the fixture's stub for any later use of search_tool.
        search_tool._arun = saved_arun
def test_run_with_run_manager(
    self,
    search_tool,
    sample_raw_results,
    sample_cleaned_results,
):
    """Passing a callback manager must not change _run's return shape."""
    callback_manager = Mock()
    cleaned_json, raw_payload = search_tool._run("test query", run_manager=callback_manager)
    assert isinstance(cleaned_json, str)
    assert isinstance(raw_payload, dict)
@pytest.mark.asyncio
async def test_arun_with_run_manager(
    self,
    search_tool,
    sample_raw_results,
    sample_cleaned_results,
):
    """Passing a callback manager must not change _arun's return shape."""
    callback_manager = Mock()
    cleaned_json, raw_payload = await search_tool._arun("test query", run_manager=callback_manager)
    assert isinstance(cleaned_json, str)
    assert isinstance(raw_payload, dict)
def test_api_wrapper_initialization_with_key(self):
    """Constructing the tool with a key routes through __init__ exactly once."""
    with patch('src.tools.infoquest_search.infoquest_search_results.InfoQuestAPIWrapper') as wrapper_cls:
        wrapper_cls.return_value = Mock()
        from src.tools.infoquest_search.infoquest_search_results import InfoQuestSearchResults
        with patch.object(InfoQuestSearchResults, '__init__', return_value=None) as init_spy:
            InfoQuestSearchResults(infoquest_api_key="test-key")
            init_spy.assert_called_once()