feat: support infoquest (#708)

* support infoquest

* support html checker

* support html checker

* change line break format

* change line break format

* change line break format

* change line break format

* change line break format

* change line break format

* change line break format

* change line break format

* Fix several critical issues in the codebase
- Resolve crawler panic by improving error handling
- Fix plan validation to prevent invalid configurations
- Correct InfoQuest crawler JSON conversion logic

* add test for infoquest

* add test for infoquest

* Add InfoQuest introduction to the README

* add test for infoquest

* fix readme for infoquest

* fix readme for infoquest

* resolve the conflict

* resolve the conflict

* resolve the conflict

* Fix formatting of INFOQUEST in SearchEngine enum

* Apply suggestions from code review

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: Willem Jiang <143703838+willem-bd@users.noreply.github.com>
Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
infoquest-byteplus
2025-12-02 08:16:35 +08:00
committed by GitHub
parent e179fb1632
commit 7ec9e45702
22 changed files with 2103 additions and 94 deletions

View File

@@ -4,9 +4,12 @@
import re
import logging
from .article import Article
from .jina_client import JinaClient
from .readability_extractor import ReadabilityExtractor
from src.config.tools import CrawlerEngine
from src.config import load_yaml_config
from src.crawler.article import Article
from src.crawler.infoquest_client import InfoQuestClient
from src.crawler.jina_client import JinaClient
from src.crawler.readability_extractor import ReadabilityExtractor
logger = logging.getLogger(__name__)
@@ -14,11 +17,11 @@ logger = logging.getLogger(__name__)
def safe_truncate(text: str, max_length: int = 500) -> str:
"""
Safely truncate text to a maximum length without breaking multi-byte characters.
Args:
text: The text to truncate
max_length: Maximum number of characters to keep
Returns:
Truncated text that is safe to use without encoding issues
"""
@@ -49,7 +52,7 @@ def safe_truncate(text: str, max_length: int = 500) -> str:
def is_html_content(content: str) -> bool:
"""
Check if the provided content is HTML.
Uses a more robust detection method that checks for common HTML patterns
including DOCTYPE declarations, HTML tags, and other HTML markers.
"""
@@ -138,17 +141,21 @@ class Crawler:
# them into text and image blocks for one single and unified
# LLM message.
#
# Jina is not the best crawler on readability, however it's
# much easier and free to use.
# The system supports multiple crawler engines:
# - Jina: An accessible solution, though with some limitations in readability extraction
# - InfoQuest: A BytePlus product offering advanced capabilities with configurable parameters
# like fetch_time, timeout, and navi_timeout.
#
# Instead of using Jina's own markdown converter, we'll use
# our own solution to get better readability results.
try:
jina_client = JinaClient()
html = jina_client.crawl(url, return_format="html")
except Exception as e:
logger.error(f"Failed to fetch URL {url} from Jina: {repr(e)}")
raise
# Get crawler configuration
config = load_yaml_config("conf.yaml")
crawler_config = config.get("CRAWLER_ENGINE", {})
# Get the selected crawler tool based on configuration
crawler_client = self._select_crawler_tool(crawler_config)
html = self._crawl_with_tool(crawler_client, url)
# Check if we got valid HTML content
if not html or not html.strip():
@@ -186,3 +193,44 @@ class Crawler:
article.url = url
return article
def _select_crawler_tool(self, crawler_config: dict):
    """Select and construct the crawler client based on configuration.

    Args:
        crawler_config: The ``CRAWLER_ENGINE`` section of the YAML config.
            Recognized keys: ``engine`` (defaults to Jina) and, for the
            InfoQuest engine only, the optional ``fetch_time``, ``timeout``
            and ``navi_timeout`` values.

    Returns:
        A ``JinaClient`` or ``InfoQuestClient`` instance.

    Raises:
        ValueError: If ``engine`` names an unsupported crawler.
    """
    # Only check engine from configuration file
    engine = crawler_config.get("engine", CrawlerEngine.JINA.value)
    if engine == CrawlerEngine.JINA.value:
        logger.info("Selecting Jina crawler engine")
        return JinaClient()
    elif engine == CrawlerEngine.INFOQUEST.value:
        logger.info("Selecting InfoQuest crawler engine")
        # Read timeout parameters directly from crawler_config root level.
        # These parameters are only effective when engine is "infoquest";
        # -1 means "no client-supplied override".
        fetch_time = crawler_config.get("fetch_time", -1)
        timeout = crawler_config.get("timeout", -1)
        navi_timeout = crawler_config.get("navi_timeout", -1)
        # Log the configuration being used (lazy %-args: only formatted
        # when DEBUG is enabled).
        if fetch_time > 0 or timeout > 0 or navi_timeout > 0:
            logger.debug(
                "Initializing InfoQuestCrawler with parameters: "
                "fetch_time=%s, timeout=%s, navi_timeout=%s",
                fetch_time,
                timeout,
                navi_timeout,
            )
        # Initialize InfoQuestClient with the parameters from configuration
        return InfoQuestClient(
            fetch_time=fetch_time,
            timeout=timeout,
            navi_timeout=navi_timeout,
        )
    else:
        raise ValueError(f"Unsupported crawler engine: {engine}")
def _crawl_with_tool(self, crawler_client, url: str) -> str:
    """Fetch *url* as HTML using the given crawler client.

    Args:
        crawler_client: Any client exposing ``crawl(url, return_format)``.
        url: The URL to fetch.

    Returns:
        The raw HTML string returned by the client.

    Raises:
        Exception: Re-raises whatever the underlying client raised, after
            logging it, so the caller decides how to recover.
    """
    # Lazy %-args instead of f-strings so formatting only happens when the
    # record is actually emitted; rendered message is unchanged.
    logger.info(
        "Crawling URL: %s using %s", url, crawler_client.__class__.__name__
    )
    try:
        return crawler_client.crawl(url, return_format="html")
    except Exception as e:
        # %r renders repr(e), matching the previous message exactly.
        logger.error(
            "Failed to fetch URL %s using %s: %r",
            url,
            crawler_client.__class__.__name__,
            e,
        )
        raise

View File

@@ -0,0 +1,153 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
"""Util that calls InfoQuest Crawler API.
In order to set this up, follow instructions at:
https://docs.byteplus.com/en/docs/InfoQuest/What_is_Info_Quest
"""
import json
import logging
import os
from typing import Dict, Any
import requests
logger = logging.getLogger(__name__)
class InfoQuestClient:
    """Client for interacting with the InfoQuest web crawling API."""

    # Upper bound (seconds) for the HTTP round trip itself, so a dead or
    # unresponsive server cannot hang the crawl forever. Kept generous on
    # purpose: fetch_time/timeout/navi_timeout are forwarded to the remote
    # crawler in the request body and their units are not documented here —
    # NOTE(review): confirm against the InfoQuest API docs.
    _HTTP_TIMEOUT = 300

    def __init__(self, fetch_time: int = -1, timeout: int = -1, navi_timeout: int = -1):
        """Store crawl-timeout settings and log the effective configuration.

        Args:
            fetch_time: Remote fetch timeout; -1 means use the API default.
            timeout: Remote overall timeout; -1 means use the API default.
            navi_timeout: Remote navigation timeout; -1 means use the API default.
        """
        logger.info(
            "\n============================================\n"
            "🚀 BytePlus InfoQuest Crawler Initialization 🚀\n"
            "============================================"
        )
        self.fetch_time = fetch_time
        self.timeout = timeout
        self.navi_timeout = navi_timeout
        # Snapshot of whether the key was present at construction time
        # (used for logging only); _prepare_headers() re-reads the
        # environment on every request.
        self.api_key_set = bool(os.getenv("INFOQUEST_API_KEY"))
        config_details = (
            f"\n📋 Configuration Details:\n"
            f"├── Fetch Timeout: {fetch_time} {'(Default: No timeout)' if fetch_time == -1 else '(Custom)'}\n"
            f"├── Timeout: {timeout} {'(Default: No timeout)' if timeout == -1 else '(Custom)'}\n"
            f"├── Navigation Timeout: {navi_timeout} {'(Default: No timeout)' if navi_timeout == -1 else '(Custom)'}\n"
            f"└── API Key: {'✅ Configured' if self.api_key_set else '❌ Not set'}"
        )
        logger.info(config_details)
        logger.info("\n" + "*" * 70 + "\n")

    def crawl(self, url: str, return_format: str = "html") -> str:
        """Crawl *url* via the InfoQuest reader endpoint.

        Returns the extracted content: the ``reader_result`` field of a JSON
        response, falling back to ``content``, falling back to the raw body.
        On any failure, returns a string starting with ``"Error: "`` instead
        of raising.
        """
        logger.debug("Preparing request for URL: %s", url)
        headers = self._prepare_headers()
        data = self._prepare_request_data(url, return_format)
        logger.debug(
            "InfoQuest Crawler request prepared: endpoint=https://reader.infoquest.bytepluses.com, "
            "format=%s, has_api_key=%s",
            data.get("format"), self.api_key_set
        )
        logger.debug("Sending crawl request to InfoQuest API")
        try:
            # Bound the HTTP call: requests.post without a timeout can block
            # indefinitely on a hung connection.
            response = requests.post(
                "https://reader.infoquest.bytepluses.com",
                headers=headers,
                json=data,
                timeout=self._HTTP_TIMEOUT,
            )
            # Non-200 responses are reported, not raised.
            if response.status_code != 200:
                error_message = f"InfoQuest API returned status {response.status_code}: {response.text}"
                logger.error(error_message)
                return f"Error: {error_message}"
            # Guard against an empty body.
            if not response.text or not response.text.strip():
                error_message = "InfoQuest Crawler API returned empty response"
                logger.error("BytePlus InfoQuest Crawler returned empty response for URL: %s", url)
                return f"Error: {error_message}"
            # Prefer the structured fields when the body parses as JSON.
            try:
                response_data = json.loads(response.text)
                if "reader_result" in response_data:
                    logger.debug("Successfully extracted reader_result from JSON response")
                    return response_data["reader_result"]
                elif "content" in response_data:
                    # Fallback to content field if reader_result is not available
                    logger.debug("Using content field as fallback")
                    return response_data["content"]
                else:
                    # Neither field present: fall through and return the raw body.
                    logger.warning("Neither reader_result nor content field found in JSON response")
            except json.JSONDecodeError:
                # Not JSON at all: return the original text below.
                logger.debug("Response is not in JSON format, returning as-is")
            # Log a short sample of the payload for debugging.
            if logger.isEnabledFor(logging.DEBUG):
                response_sample = response.text[:200] + ("..." if len(response.text) > 200 else "")
                logger.debug(
                    "Successfully received response, content length: %d bytes, first 200 chars: %s",
                    len(response.text), response_sample
                )
            return response.text
        except Exception as e:
            error_message = f"Request to InfoQuest API failed: {str(e)}"
            logger.error(error_message)
            return f"Error: {error_message}"

    def _prepare_headers(self) -> Dict[str, str]:
        """Prepare request headers, attaching the API key when available."""
        headers = {
            "Content-Type": "application/json",
        }
        # Read the key once per request instead of hitting the environment twice.
        api_key = os.getenv("INFOQUEST_API_KEY")
        if api_key:
            headers["Authorization"] = f"Bearer {api_key}"
            logger.debug("API key added to request headers")
        else:
            logger.warning(
                "InfoQuest API key is not set. Provide your own key for authentication."
            )
        return headers

    def _prepare_request_data(self, url: str, return_format: str) -> Dict[str, Any]:
        """Prepare request data with formatted parameters."""
        # Normalize return_format: "html" in any case becomes "HTML";
        # other formats pass through unchanged.
        if return_format and return_format.lower() == "html":
            normalized_format = "HTML"
        else:
            normalized_format = return_format
        data = {"url": url, "format": normalized_format}
        # Only forward timeouts the user explicitly set (positive values);
        # -1 defaults are omitted so the API uses its own defaults.
        timeout_params = {}
        if self.fetch_time > 0:
            timeout_params["fetch_time"] = self.fetch_time
        if self.timeout > 0:
            timeout_params["timeout"] = self.timeout
        if self.navi_timeout > 0:
            timeout_params["navi_timeout"] = self.navi_timeout
        if timeout_params:
            logger.debug("Applying timeout parameters: %s", timeout_params)
            data.update(timeout_params)
        return data

View File

@@ -22,12 +22,21 @@ class JinaClient:
"Jina API key is not set. Provide your own key to access a higher rate limit. See https://jina.ai/reader for more information."
)
data = {"url": url}
response = requests.post("https://r.jina.ai/", headers=headers, json=data)
if response.status_code != 200:
raise ValueError(f"Jina API returned status {response.status_code}: {response.text}")
if not response.text or not response.text.strip():
raise ValueError("Jina API returned empty response")
return response.text
try:
response = requests.post("https://r.jina.ai/", headers=headers, json=data)
if response.status_code != 200:
error_message = f"Jina API returned status {response.status_code}: {response.text}"
logger.error(error_message)
return f"Error: {error_message}"
if not response.text or not response.text.strip():
error_message = "Jina API returned empty response"
logger.error(error_message)
return f"Error: {error_message}"
return response.text
except Exception as e:
error_message = f"Request to Jina API failed: {str(e)}"
logger.error(error_message)
return f"Error: {error_message}"