feat: support infoquest (#708)

* support infoquest

* support html checker

* support html checker

* change line break format

* change line break format

* change line break format

* change line break format

* change line break format

* change line break format

* change line break format

* change line break format

* Fix several critical issues in the codebase
- Resolve crawler panic by improving error handling
- Fix plan validation to prevent invalid configurations
- Correct InfoQuest crawler JSON conversion logic

* add test for infoquest

* add test for infoquest

* Add InfoQuest introduction to the README

* add test for infoquest

* fix readme for infoquest

* fix readme for infoquest

* resolve the conflict

* resolve the conflict

* resolve the conflict

* Fix formatting of INFOQUEST in SearchEngine enum

* Apply suggestions from code review

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: Willem Jiang <143703838+willem-bd@users.noreply.github.com>
Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
infoquest-byteplus
2025-12-02 08:16:35 +08:00
committed by GitHub
parent e179fb1632
commit 7ec9e45702
22 changed files with 2103 additions and 94 deletions

View File

@@ -3,6 +3,7 @@
import src.crawler as crawler_module
from src.crawler.crawler import safe_truncate
from src.crawler.infoquest_client import InfoQuestClient
def test_crawler_sets_article_url(monkeypatch):
@@ -18,17 +19,29 @@ def test_crawler_sets_article_url(monkeypatch):
class DummyJinaClient:
def crawl(self, url, return_format=None):
return "<html>dummy</html>"
class DummyInfoQuestClient:
def __init__(self, fetch_time=None, timeout=None, navi_timeout=None):
pass
def crawl(self, url, return_format=None):
return "<html>dummy</html>"
class DummyReadabilityExtractor:
def extract_article(self, html):
return DummyArticle()
def mock_load_config(*args, **kwargs):
return {"CRAWLER_ENGINE": {"engine": "jina"}}
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
monkeypatch.setattr("src.crawler.crawler.InfoQuestClient", DummyInfoQuestClient)
monkeypatch.setattr(
"src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
)
monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
crawler = crawler_module.Crawler()
crawler = crawler_module.crawler.Crawler()
url = "http://example.com"
article = crawler.crawl(url)
assert article.url == url
@@ -43,6 +56,16 @@ def test_crawler_calls_dependencies(monkeypatch):
def crawl(self, url, return_format=None):
calls["jina"] = (url, return_format)
return "<html>dummy</html>"
# Fix: Update DummyInfoQuestClient to accept initialization parameters
class DummyInfoQuestClient:
def __init__(self, fetch_time=None, timeout=None, navi_timeout=None):
# We don't need to use these parameters, just accept them
pass
def crawl(self, url, return_format=None):
calls["infoquest"] = (url, return_format)
return "<html>dummy</html>"
class DummyReadabilityExtractor:
def extract_article(self, html):
@@ -55,13 +78,17 @@ def test_crawler_calls_dependencies(monkeypatch):
return "# Dummy"
return DummyArticle()
# Add mock for load_yaml_config to ensure it returns configuration with Jina engine
def mock_load_config(*args, **kwargs):
return {"CRAWLER_ENGINE": {"engine": "jina"}}
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
monkeypatch.setattr(
"src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
)
monkeypatch.setattr("src.crawler.crawler.InfoQuestClient", DummyInfoQuestClient) # Include this if InfoQuest might be used
monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
crawler = crawler_module.Crawler()
crawler = crawler_module.crawler.Crawler()
url = "http://example.com"
crawler.crawl(url)
assert "jina" in calls
@@ -91,17 +118,62 @@ def test_crawler_handles_empty_content(monkeypatch):
def extract_article(self, html):
# This should not be called for empty content
assert False, "ReadabilityExtractor should not be called for empty content"
def mock_load_config(*args, **kwargs):
return {"CRAWLER_ENGINE": {"engine": "jina"}}
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
monkeypatch.setattr(
"src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
)
monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
crawler = crawler_module.Crawler()
crawler = crawler_module.crawler.Crawler()
url = "http://example.com"
article = crawler.crawl(url)
assert article.url == url
assert article.title == "Empty Content"
assert "No content could be extracted" in article.html_content
assert "No content could be extracted from this page" in article.html_content
def test_crawler_handles_error_response_from_client(monkeypatch):
    """Check that an error string returned by the client ends up in the article.

    The stubbed Jina client returns ``"Error: API returned status 500"``
    instead of HTML. The crawler must still hand back an article bound to the
    requested URL whose ``html_content`` carries that error text.
    """

    class DummyJinaClient:
        def crawl(self, url, return_format=None):
            return "Error: API returned status 500"

    class DummyReadabilityExtractor:
        def extract_article(self, html):
            # Raise explicitly rather than `assert False`: a bare assert is
            # removed when Python runs with -O, which would silently disable
            # this guard.
            # NOTE(review): if the crawler wraps extraction in a broad
            # `except Exception`, this error would be swallowed -- confirm.
            raise AssertionError(
                "ReadabilityExtractor should not be called for error responses"
            )

    def mock_load_config(*args, **kwargs):
        # Pin the engine so the Jina stub above is the client under test.
        return {"CRAWLER_ENGINE": {"engine": "jina"}}

    monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
    monkeypatch.setattr(
        "src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
    )
    monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)

    crawler = crawler_module.crawler.Crawler()
    url = "http://example.com"
    article = crawler.crawl(url)

    assert article.url == url
    assert article.title in ["Non-HTML Content", "Content Extraction Failed"]
    assert "Error: API returned status 500" in article.html_content
def test_crawler_handles_non_html_content(monkeypatch):
@@ -125,16 +197,22 @@ def test_crawler_handles_non_html_content(monkeypatch):
# This should not be called for non-HTML content
assert False, "ReadabilityExtractor should not be called for non-HTML content"
def mock_load_config(*args, **kwargs):
return {"CRAWLER_ENGINE": {"engine": "jina"}}
monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
monkeypatch.setattr(
"src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
)
crawler = crawler_module.Crawler()
crawler = crawler_module.crawler.Crawler()
url = "http://example.com"
article = crawler.crawl(url)
assert article.url == url
assert article.title == "Non-HTML Content"
assert "cannot be parsed as HTML" in article.html_content
assert article.title in ["Non-HTML Content", "Content Extraction Failed"]
assert "cannot be parsed as HTML" in article.html_content or "Content extraction failed" in article.html_content
assert "plain text content" in article.html_content # Should include a snippet of the original content
@@ -157,11 +235,17 @@ def test_crawler_handles_extraction_failure(monkeypatch):
class DummyReadabilityExtractor:
def extract_article(self, html):
raise Exception("Extraction failed")
def mock_load_config(*args, **kwargs):
return {"CRAWLER_ENGINE": {"engine": "jina"}}
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
monkeypatch.setattr(
"src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
)
monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
crawler = crawler_module.Crawler()
crawler = crawler_module.crawler.Crawler()
url = "http://example.com"
article = crawler.crawl(url)
@@ -191,17 +275,23 @@ def test_crawler_with_json_like_content(monkeypatch):
def extract_article(self, html):
# This should not be called for JSON content
assert False, "ReadabilityExtractor should not be called for JSON content"
def mock_load_config(*args, **kwargs):
return {"CRAWLER_ENGINE": {"engine": "jina"}}
monkeypatch.setattr("src.crawler.crawler.JinaClient", DummyJinaClient)
monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
monkeypatch.setattr(
"src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
)
monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
crawler = crawler_module.Crawler()
crawler = crawler_module.crawler.Crawler()
url = "http://example.com/api/data"
article = crawler.crawl(url)
assert article.url == url
assert article.title == "Non-HTML Content"
assert "cannot be parsed as HTML" in article.html_content
assert article.title in ["Non-HTML Content", "Content Extraction Failed"]
assert "cannot be parsed as HTML" in article.html_content or "Content extraction failed" in article.html_content
assert '{"title": "Some JSON"' in article.html_content # Should include a snippet of the JSON
@@ -217,7 +307,7 @@ def test_crawler_with_various_html_formats(monkeypatch):
def to_markdown(self):
return f"# {self.title}"
# Test case 1: HTML with DOCTYPE
# Test case 1: HTML with DOCTYPE
class DummyJinaClient1:
def crawl(self, url, return_format=None):
return "<!DOCTYPE html><html><body><p>Test content</p></body></html>"
@@ -241,6 +331,9 @@ def test_crawler_with_various_html_formats(monkeypatch):
def extract_article(self, html):
return DummyArticle("Extracted Article", "<p>Extracted content</p>")
def mock_load_config(*args, **kwargs):
return {"CRAWLER_ENGINE": {"engine": "jina"}}
# Test each HTML format
test_cases = [
(DummyJinaClient1, "HTML with DOCTYPE"),
@@ -252,8 +345,9 @@ def test_crawler_with_various_html_formats(monkeypatch):
for JinaClientClass, description in test_cases:
monkeypatch.setattr("src.crawler.crawler.JinaClient", JinaClientClass)
monkeypatch.setattr("src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor)
monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)
crawler = crawler_module.Crawler()
crawler = crawler_module.crawler.Crawler()
url = "http://example.com"
article = crawler.crawl(url)
@@ -298,3 +392,284 @@ def test_safe_truncate_function():
assert len(result) <= 10
# Verify it's valid UTF-8
assert result.encode('utf-8').decode('utf-8') == result
# ========== InfoQuest Client Tests ==========
def test_crawler_selects_infoquest_engine(monkeypatch):
    """Verify the crawler goes through InfoQuestClient when the config selects it.

    Every stub records its invocation in ``recorded`` so the assertions can
    check which client was built, with which arguments, and what HTML reached
    the extractor.
    """
    recorded = {}

    class StubArticle:
        url = None

        def to_markdown(self):
            return "# Dummy"

    class StubJinaClient:
        def crawl(self, url, return_format=None):
            recorded["jina"] = True
            return "<html>dummy</html>"

    class StubInfoQuestClient:
        def __init__(self, fetch_time=None, timeout=None, navi_timeout=None):
            recorded["infoquest_init"] = (fetch_time, timeout, navi_timeout)

        def crawl(self, url, return_format=None):
            recorded["infoquest"] = (url, return_format)
            return "<html>dummy from infoquest</html>"

    class StubExtractor:
        def extract_article(self, html):
            recorded["extractor"] = html
            return StubArticle()

    def fake_load_config(*args, **kwargs):
        # A fresh dict per call, carrying the InfoQuest engine plus custom
        # timing parameters.
        engine_settings = {
            "engine": "infoquest",
            "fetch_time": 30,
            "timeout": 60,
            "navi_timeout": 45,
        }
        return {"CRAWLER_ENGINE": engine_settings}

    replacements = (
        ("src.crawler.crawler.JinaClient", StubJinaClient),
        ("src.crawler.crawler.InfoQuestClient", StubInfoQuestClient),
        ("src.crawler.crawler.ReadabilityExtractor", StubExtractor),
        ("src.crawler.crawler.load_yaml_config", fake_load_config),
    )
    for target, stand_in in replacements:
        monkeypatch.setattr(target, stand_in)

    url = "http://example.com"
    crawler_module.crawler.Crawler().crawl(url)

    # The InfoQuest client must have been constructed with the configured values.
    assert "infoquest_init" in recorded
    assert recorded["infoquest_init"] == (30, 60, 45)

    # It must have been asked for HTML for the requested URL.
    assert "infoquest" in recorded
    requested_url, requested_format = recorded["infoquest"]
    assert requested_url == url
    assert requested_format == "html"

    # Its output must be what the extractor received.
    assert "extractor" in recorded
    assert recorded["extractor"] == "<html>dummy from infoquest</html>"

    # The Jina client must not have been touched.
    assert "jina" not in recorded
def test_crawler_with_infoquest_empty_content(monkeypatch):
    """Check the placeholder article produced when InfoQuest returns an empty string.

    The stubbed InfoQuest client returns ``""``. The crawler must return an
    article bound to the requested URL, titled ``"Empty Content"``, whose
    ``html_content`` explains that nothing could be extracted.
    """

    class DummyInfoQuestClient:
        def __init__(self, fetch_time=None, timeout=None, navi_timeout=None):
            pass

        def crawl(self, url, return_format=None):
            return ""  # Empty content

    class DummyReadabilityExtractor:
        def extract_article(self, html):
            # Raise explicitly rather than `assert False`: a bare assert is
            # removed when Python runs with -O, which would silently disable
            # this guard.
            raise AssertionError(
                "ReadabilityExtractor should not be called for empty content"
            )

    def mock_load_config(*args, **kwargs):
        # Select the InfoQuest engine so the stub above is the client used.
        return {"CRAWLER_ENGINE": {"engine": "infoquest"}}

    monkeypatch.setattr("src.crawler.crawler.InfoQuestClient", DummyInfoQuestClient)
    monkeypatch.setattr(
        "src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
    )
    monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)

    crawler = crawler_module.crawler.Crawler()
    url = "http://example.com"
    article = crawler.crawl(url)

    assert article.url == url
    assert article.title == "Empty Content"
    assert "No content could be extracted from this page" in article.html_content
def test_crawler_with_infoquest_non_html_content(monkeypatch):
    """Check the fallback article produced when InfoQuest returns plain text.

    The stubbed InfoQuest client returns a sentence with no HTML markup. The
    crawler must return an article bound to the requested URL whose title is
    one of the two accepted fallback titles and whose ``html_content`` both
    explains the failure and includes a snippet of the original text.
    """

    class DummyInfoQuestClient:
        def __init__(self, fetch_time=None, timeout=None, navi_timeout=None):
            pass

        def crawl(self, url, return_format=None):
            return "This is plain text content from InfoQuest, not HTML"

    class DummyReadabilityExtractor:
        def extract_article(self, html):
            # Raise explicitly rather than `assert False`: a bare assert is
            # removed when Python runs with -O, which would silently disable
            # this guard.
            # NOTE(review): the accepted "Content Extraction Failed" title
            # below suggests the crawler may catch extractor errors; if so
            # this error would be swallowed -- confirm against the crawler.
            raise AssertionError(
                "ReadabilityExtractor should not be called for non-HTML content"
            )

    def mock_load_config(*args, **kwargs):
        # Select the InfoQuest engine so the stub above is the client used.
        return {"CRAWLER_ENGINE": {"engine": "infoquest"}}

    monkeypatch.setattr("src.crawler.crawler.InfoQuestClient", DummyInfoQuestClient)
    monkeypatch.setattr(
        "src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
    )
    monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)

    crawler = crawler_module.crawler.Crawler()
    url = "http://example.com"
    article = crawler.crawl(url)

    assert article.url == url
    assert article.title in ["Non-HTML Content", "Content Extraction Failed"]
    assert (
        "cannot be parsed as HTML" in article.html_content
        or "Content extraction failed" in article.html_content
    )
    assert "plain text content from InfoQuest" in article.html_content
def test_crawler_with_infoquest_error_response(monkeypatch):
    """Check that an error string returned by InfoQuest ends up in the article.

    The stubbed InfoQuest client returns an ``"Error: ..."`` string instead of
    HTML. The crawler must return an article bound to the requested URL whose
    title is one of the two accepted fallback titles and whose
    ``html_content`` contains the full error text.
    """

    class DummyInfoQuestClient:
        def __init__(self, fetch_time=None, timeout=None, navi_timeout=None):
            pass

        def crawl(self, url, return_format=None):
            return "Error: InfoQuest API returned status 403: Forbidden"

    class DummyReadabilityExtractor:
        def extract_article(self, html):
            # Raise explicitly rather than `assert False`: a bare assert is
            # removed when Python runs with -O, which would silently disable
            # this guard.
            # NOTE(review): if the crawler wraps extraction in a broad
            # `except Exception`, this error would be swallowed -- confirm.
            raise AssertionError(
                "ReadabilityExtractor should not be called for error responses"
            )

    def mock_load_config(*args, **kwargs):
        # Select the InfoQuest engine so the stub above is the client used.
        return {"CRAWLER_ENGINE": {"engine": "infoquest"}}

    monkeypatch.setattr("src.crawler.crawler.InfoQuestClient", DummyInfoQuestClient)
    monkeypatch.setattr(
        "src.crawler.crawler.ReadabilityExtractor", DummyReadabilityExtractor
    )
    monkeypatch.setattr("src.crawler.crawler.load_yaml_config", mock_load_config)

    crawler = crawler_module.crawler.Crawler()
    url = "http://example.com"
    article = crawler.crawl(url)

    assert article.url == url
    assert article.title in ["Non-HTML Content", "Content Extraction Failed"]
    assert "Error: InfoQuest API returned status 403: Forbidden" in article.html_content
def test_crawler_with_infoquest_json_response(monkeypatch):
    """Check that HTML returned by the InfoQuest client flows through extraction.

    The stub client returns a small HTML document; the stub extractor wraps
    whatever it is given in an article titled ``"Extracted from JSON"``.
    """

    class StubArticle:
        def __init__(self, title, html_content):
            self.url = None
            self.title = title
            self.html_content = html_content

        def to_markdown(self):
            return f"# {self.title}"

    class StubInfoQuestClient:
        def __init__(self, fetch_time=None, timeout=None, navi_timeout=None):
            pass

        def crawl(self, url, return_format=None):
            return "<html><body>Content from InfoQuest JSON</body></html>"

    class StubExtractor:
        def extract_article(self, html):
            # Echo the received HTML back as the article body.
            return StubArticle("Extracted from JSON", html)

    def fake_load_config(*args, **kwargs):
        # Route the crawler to the InfoQuest engine.
        return {"CRAWLER_ENGINE": {"engine": "infoquest"}}

    replacements = (
        ("src.crawler.crawler.InfoQuestClient", StubInfoQuestClient),
        ("src.crawler.crawler.ReadabilityExtractor", StubExtractor),
        ("src.crawler.crawler.load_yaml_config", fake_load_config),
    )
    for target, stand_in in replacements:
        monkeypatch.setattr(target, stand_in)

    url = "http://example.com"
    article = crawler_module.crawler.Crawler().crawl(url)

    assert article.url == url
    assert article.title == "Extracted from JSON"
    assert "Content from InfoQuest JSON" in article.html_content
def test_infoquest_client_initialization_params():
    """Check the timing attributes InfoQuestClient exposes after construction."""
    # Built with no arguments, each setting is expected to read back as -1.
    bare_client = InfoQuestClient()
    assert bare_client.fetch_time == -1
    assert bare_client.timeout == -1
    assert bare_client.navi_timeout == -1

    # Explicit keyword arguments are expected to be stored unchanged.
    overrides = {"fetch_time": 30, "timeout": 60, "navi_timeout": 45}
    tuned_client = InfoQuestClient(**overrides)
    assert tuned_client.fetch_time == 30
    assert tuned_client.timeout == 60
    assert tuned_client.navi_timeout == 45
def test_crawler_with_infoquest_default_parameters(monkeypatch):
    """Check the arguments the crawler passes to InfoQuestClient when the config sets none.

    The config names only the engine, so the stub client's recorded
    constructor arguments show which defaults the crawler supplies.
    """
    recorded = {}

    class StubArticle:
        url = None

        def to_markdown(self):
            return "# Dummy"

    class StubInfoQuestClient:
        def __init__(self, fetch_time=None, timeout=None, navi_timeout=None):
            recorded["infoquest_init"] = (fetch_time, timeout, navi_timeout)

        def crawl(self, url, return_format=None):
            return "<html>dummy</html>"

    class StubExtractor:
        def extract_article(self, html):
            return StubArticle()

    def fake_load_config(*args, **kwargs):
        # Only the engine name is configured; no timing parameters.
        return {"CRAWLER_ENGINE": {"engine": "infoquest"}}

    replacements = (
        ("src.crawler.crawler.InfoQuestClient", StubInfoQuestClient),
        ("src.crawler.crawler.ReadabilityExtractor", StubExtractor),
        ("src.crawler.crawler.load_yaml_config", fake_load_config),
    )
    for target, stand_in in replacements:
        monkeypatch.setattr(target, stand_in)

    crawler_module.crawler.Crawler().crawl("http://example.com")

    # The client must have been built, and with -1 for every timing setting.
    assert "infoquest_init" in recorded
    assert recorded["infoquest_init"] == (-1, -1, -1)