feat: support InfoQuest (#708)

* support infoquest

* support html checker

* support html checker

* change line break format

* change line break format

* change line break format

* change line break format

* change line break format

* change line break format

* change line break format

* change line break format

* Fix several critical issues in the codebase
- Resolve crawler panic by improving error handling
- Fix plan validation to prevent invalid configurations
- Correct InfoQuest crawler JSON conversion logic

* add test for infoquest

* add test for infoquest

* Add InfoQuest introduction to the README

* add test for infoquest

* fix README for InfoQuest

* fix README for InfoQuest

* resolve the conflict

* resolve the conflict

* resolve the conflict

* Fix formatting of INFOQUEST in SearchEngine enum

* Apply suggestions from code review

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: Willem Jiang <143703838+willem-bd@users.noreply.github.com>
Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
infoquest-byteplus
2025-12-02 08:16:35 +08:00
committed by GitHub
parent e179fb1632
commit 7ec9e45702
22 changed files with 2103 additions and 94 deletions

View File

@@ -36,11 +36,12 @@ class TestJinaClient:
client = JinaClient()
# Act & Assert
with pytest.raises(ValueError) as exc_info:
client.crawl("https://example.com")
# Act
result = client.crawl("https://example.com")
assert "status 500" in str(exc_info.value)
# Assert
assert result.startswith("Error:")
assert "status 500" in result
@patch("src.crawler.jina_client.requests.post")
def test_crawl_empty_response(self, mock_post):
@@ -52,11 +53,12 @@ class TestJinaClient:
client = JinaClient()
# Act & Assert
with pytest.raises(ValueError) as exc_info:
client.crawl("https://example.com")
# Act
result = client.crawl("https://example.com")
assert "empty response" in str(exc_info.value)
# Assert
assert result.startswith("Error:")
assert "empty response" in result
@patch("src.crawler.jina_client.requests.post")
def test_crawl_whitespace_only_response(self, mock_post):
@@ -68,11 +70,12 @@ class TestJinaClient:
client = JinaClient()
# Act & Assert
with pytest.raises(ValueError) as exc_info:
client.crawl("https://example.com")
# Act
result = client.crawl("https://example.com")
assert "empty response" in str(exc_info.value)
# Assert
assert result.startswith("Error:")
assert "empty response" in result
@patch("src.crawler.jina_client.requests.post")
def test_crawl_not_found(self, mock_post):
@@ -84,11 +87,12 @@ class TestJinaClient:
client = JinaClient()
# Act & Assert
with pytest.raises(ValueError) as exc_info:
client.crawl("https://example.com")
# Act
result = client.crawl("https://example.com")
assert "status 404" in str(exc_info.value)
# Assert
assert result.startswith("Error:")
assert "status 404" in result
@patch.dict("os.environ", {}, clear=True)
@patch("src.crawler.jina_client.requests.post")
@@ -106,3 +110,17 @@ class TestJinaClient:
# Assert
assert result == "<html>Test</html>"
@patch("src.crawler.jina_client.requests.post")
def test_crawl_exception_handling(self, mock_post):
    """An exception raised by ``requests.post`` is reported as an error string.

    The test expects ``crawl`` to return text that starts with ``"Error:"``
    and contains the exception message, rather than propagating the
    exception to the caller.
    """
    # Arrange: every POST attempt fails with a generic exception.
    mock_post.side_effect = Exception("Network error")

    # Act
    outcome = JinaClient().crawl("https://example.com")

    # Assert
    assert outcome.startswith("Error:")
    assert "Network error" in outcome