mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-16 03:14:45 +08:00
153 lines
6.2 KiB
Python
153 lines
6.2 KiB
Python
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
|
# SPDX-License-Identifier: MIT
|
|
|
|
"""Util that calls InfoQuest Crawler API.
|
|
|
|
In order to set this up, follow instructions at:
|
|
https://docs.byteplus.com/en/docs/InfoQuest/What_is_Info_Quest
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
from typing import Dict, Any
|
|
|
|
import requests
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class InfoQuestClient:
|
|
"""Client for interacting with the InfoQuest web crawling API."""
|
|
|
|
def __init__(self, fetch_time: int = -1, timeout: int = -1, navi_timeout: int = -1):
|
|
logger.info(
|
|
"\n============================================\n"
|
|
"🚀 BytePlus InfoQuest Crawler Initialization 🚀\n"
|
|
"============================================"
|
|
)
|
|
|
|
self.fetch_time = fetch_time
|
|
self.timeout = timeout
|
|
self.navi_timeout = navi_timeout
|
|
self.api_key_set = bool(os.getenv("INFOQUEST_API_KEY"))
|
|
|
|
config_details = (
|
|
f"\n📋 Configuration Details:\n"
|
|
f"├── Fetch Timeout: {fetch_time} {'(Default: No timeout)' if fetch_time == -1 else '(Custom)'}\n"
|
|
f"├── Timeout: {timeout} {'(Default: No timeout)' if timeout == -1 else '(Custom)'}\n"
|
|
f"├── Navigation Timeout: {navi_timeout} {'(Default: No timeout)' if navi_timeout == -1 else '(Custom)'}\n"
|
|
f"└── API Key: {'✅ Configured' if self.api_key_set else '❌ Not set'}"
|
|
)
|
|
|
|
logger.info(config_details)
|
|
logger.info("\n" + "*" * 70 + "\n")
|
|
|
|
def crawl(self, url: str, return_format: str = "html") -> str:
|
|
logger.debug("Preparing request for URL: %s", url)
|
|
|
|
# Prepare headers
|
|
headers = self._prepare_headers()
|
|
|
|
# Prepare request data
|
|
data = self._prepare_request_data(url, return_format)
|
|
|
|
# Log request details
|
|
logger.debug(
|
|
"InfoQuest Crawler request prepared: endpoint=https://reader.infoquest.bytepluses.com, "
|
|
"format=%s",
|
|
data.get("format")
|
|
)
|
|
|
|
logger.debug("Sending crawl request to InfoQuest API")
|
|
try:
|
|
response = requests.post(
|
|
"https://reader.infoquest.bytepluses.com",
|
|
headers=headers,
|
|
json=data
|
|
)
|
|
|
|
# Check if status code is not 200
|
|
if response.status_code != 200:
|
|
error_message = f"InfoQuest API returned status {response.status_code}: {response.text}"
|
|
logger.error(error_message)
|
|
return f"Error: {error_message}"
|
|
|
|
# Check for empty response
|
|
if not response.text or not response.text.strip():
|
|
error_message = "InfoQuest Crawler API returned empty response"
|
|
logger.error("BytePlus InfoQuest Crawler returned empty response for URL: %s", url)
|
|
return f"Error: {error_message}"
|
|
|
|
# Try to parse response as JSON and extract reader_result
|
|
try:
|
|
response_data = json.loads(response.text)
|
|
# Extract reader_result if it exists
|
|
if "reader_result" in response_data:
|
|
logger.debug("Successfully extracted reader_result from JSON response")
|
|
return response_data["reader_result"]
|
|
elif "content" in response_data:
|
|
# Fallback to content field if reader_result is not available
|
|
logger.debug("Using content field as fallback")
|
|
return response_data["content"]
|
|
else:
|
|
# If neither field exists, return the original response
|
|
logger.warning("Neither reader_result nor content field found in JSON response")
|
|
except json.JSONDecodeError:
|
|
# If response is not JSON, return the original text
|
|
logger.debug("Response is not in JSON format, returning as-is")
|
|
|
|
# Print partial response for debugging
|
|
if logger.isEnabledFor(logging.DEBUG):
|
|
response_sample = response.text[:200] + ("..." if len(response.text) > 200 else "")
|
|
logger.debug(
|
|
"Successfully received response, content length: %d bytes, first 200 chars: %s",
|
|
len(response.text), response_sample
|
|
)
|
|
return response.text
|
|
except Exception as e:
|
|
error_message = f"Request to InfoQuest API failed: {str(e)}"
|
|
logger.error(error_message)
|
|
return f"Error: {error_message}"
|
|
|
|
def _prepare_headers(self) -> Dict[str, str]:
|
|
"""Prepare request headers."""
|
|
headers = {
|
|
"Content-Type": "application/json",
|
|
}
|
|
|
|
# Add API key if available
|
|
if os.getenv("INFOQUEST_API_KEY"):
|
|
headers["Authorization"] = f"Bearer {os.getenv('INFOQUEST_API_KEY')}"
|
|
logger.debug("API key added to request headers")
|
|
else:
|
|
logger.warning(
|
|
"InfoQuest API key is not set. Provide your own key for authentication."
|
|
)
|
|
|
|
return headers
|
|
|
|
def _prepare_request_data(self, url: str, return_format: str) -> Dict[str, Any]:
|
|
"""Prepare request data with formatted parameters."""
|
|
# Normalize return_format
|
|
if return_format and return_format.lower() == "html":
|
|
normalized_format = "HTML"
|
|
else:
|
|
normalized_format = return_format
|
|
|
|
data = {"url": url, "format": normalized_format}
|
|
|
|
# Add timeout parameters if set to positive values
|
|
timeout_params = {}
|
|
if self.fetch_time > 0:
|
|
timeout_params["fetch_time"] = self.fetch_time
|
|
if self.timeout > 0:
|
|
timeout_params["timeout"] = self.timeout
|
|
if self.navi_timeout > 0:
|
|
timeout_params["navi_timeout"] = self.navi_timeout
|
|
|
|
# Log applied timeout parameters
|
|
if timeout_params:
|
|
logger.debug("Applying timeout parameters: %s", timeout_params)
|
|
data.update(timeout_params)
|
|
|
|
return data |