# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
"""Util that calls InfoQuest Crawler API.

In order to set this up, follow instructions at:
https://docs.byteplus.com/en/docs/InfoQuest/What_is_Info_Quest
"""
import json
import logging
import os
from typing import Any, Dict

import requests

logger = logging.getLogger(__name__)

# Single source of truth for the API endpoint (previously duplicated as a
# literal in both the request call and the debug log text).
_API_ENDPOINT = "https://reader.infoquest.bytepluses.com"

# Hard upper bound (seconds) on the HTTP round-trip itself. This is distinct
# from the crawler-side fetch/navigation timeouts sent in the request body.
_HTTP_REQUEST_TIMEOUT = 600


class InfoQuestClient:
    """Client for interacting with the InfoQuest web crawling API."""

    def __init__(self, fetch_time: int = -1, timeout: int = -1, navi_timeout: int = -1):
        """Initialize the client and log its configuration.

        Args:
            fetch_time: Crawler fetch timeout forwarded to the API;
                -1 means "use the service default" (parameter is omitted).
            timeout: Crawler timeout forwarded to the API; -1 omits it.
            navi_timeout: Page-navigation timeout forwarded to the API;
                -1 omits it.
        """
        logger.info(
            "\n============================================\n"
            "šŸš€ BytePlus InfoQuest Crawler Initialization šŸš€\n"
            "============================================"
        )
        self.fetch_time = fetch_time
        self.timeout = timeout
        self.navi_timeout = navi_timeout
        # Record only whether a key is present; the key value itself is read
        # again at request time so it is never stored on the instance.
        self.api_key_set = bool(os.getenv("INFOQUEST_API_KEY"))

        config_details = (
            f"\nšŸ“‹ Configuration Details:\n"
            f"ā”œā”€ā”€ Fetch Timeout: {fetch_time} {'(Default: No timeout)' if fetch_time == -1 else '(Custom)'}\n"
            f"ā”œā”€ā”€ Timeout: {timeout} {'(Default: No timeout)' if timeout == -1 else '(Custom)'}\n"
            f"ā”œā”€ā”€ Navigation Timeout: {navi_timeout} {'(Default: No timeout)' if navi_timeout == -1 else '(Custom)'}\n"
            f"└── API Key: {'āœ… Configured' if self.api_key_set else 'āŒ Not set'}"
        )
        logger.info(config_details)
        logger.info("\n" + "*" * 70 + "\n")

    def crawl(self, url: str, return_format: str = "html") -> str:
        """Crawl *url* via the InfoQuest API and return the extracted content.

        Returns the ``reader_result`` field of the JSON response when present,
        falling back to ``content``, then to the raw response body. Errors are
        never raised to the caller; instead a string prefixed with ``"Error:"``
        is returned.

        Args:
            url: The page to crawl.
            return_format: Desired output format; ``"html"`` (any case) is
                normalized to ``"HTML"`` by ``_prepare_request_data``.
        """
        logger.debug("Preparing request for URL: %s", url)

        headers = self._prepare_headers()
        data = self._prepare_request_data(url, return_format)

        logger.debug(
            "InfoQuest Crawler request prepared: endpoint=https://reader.infoquest.bytepluses.com, "
            "format=%s",
            data.get("format"),
        )

        logger.debug("Sending crawl request to InfoQuest API")
        try:
            response = requests.post(
                _API_ENDPOINT,
                headers=headers,
                json=data,
                # BUGFIX: the original call had no timeout, so a stalled
                # endpoint would block the caller indefinitely. This bounds
                # the HTTP round-trip only; crawler-side timeouts are the
                # fetch_time/timeout/navi_timeout fields in the payload.
                timeout=_HTTP_REQUEST_TIMEOUT,
            )

            # Non-200 responses are reported as error strings, not raised.
            if response.status_code != 200:
                error_message = f"InfoQuest API returned status {response.status_code}: {response.text}"
                logger.error(error_message)
                return f"Error: {error_message}"

            # An empty body is also treated as an error condition.
            if not response.text or not response.text.strip():
                error_message = "InfoQuest Crawler API returned empty response"
                logger.error("BytePlus InfoQuest Crawler returned empty response for URL: %s", url)
                return f"Error: {error_message}"

            # Prefer the structured fields of a JSON response; fall through to
            # the raw body when the response is not JSON or lacks both fields.
            try:
                response_data = json.loads(response.text)
                if "reader_result" in response_data:
                    logger.debug("Successfully extracted reader_result from JSON response")
                    return response_data["reader_result"]
                elif "content" in response_data:
                    # Fallback to content field if reader_result is not available
                    logger.debug("Using content field as fallback")
                    return response_data["content"]
                else:
                    # Neither field exists: drop through and return the raw text.
                    logger.warning("Neither reader_result nor content field found in JSON response")
            except json.JSONDecodeError:
                # Response is not JSON at all; return it verbatim below.
                logger.debug("Response is not in JSON format, returning as-is")

            # Log a truncated sample of the body for debugging (guarded so the
            # slice/concat work is skipped when DEBUG logging is off).
            if logger.isEnabledFor(logging.DEBUG):
                response_sample = response.text[:200] + ("..." if len(response.text) > 200 else "")
                logger.debug(
                    "Successfully received response, content length: %d bytes, first 200 chars: %s",
                    len(response.text),
                    response_sample,
                )
            return response.text

        except Exception as e:
            # Catch-all boundary: network failures, timeouts, etc. are folded
            # into the same "Error:" string contract the caller expects.
            error_message = f"Request to InfoQuest API failed: {str(e)}"
            logger.error(error_message)
            return f"Error: {error_message}"

    def _prepare_headers(self) -> Dict[str, str]:
        """Prepare request headers, attaching the API key when configured."""
        headers = {
            "Content-Type": "application/json",
        }

        # Read the key once (the original called os.getenv twice).
        api_key = os.getenv("INFOQUEST_API_KEY")
        if api_key:
            headers["Authorization"] = f"Bearer {api_key}"
            logger.debug("API key added to request headers")
        else:
            logger.warning(
                "InfoQuest API key is not set. Provide your own key for authentication."
            )

        return headers

    def _prepare_request_data(self, url: str, return_format: str) -> Dict[str, Any]:
        """Prepare request data with formatted parameters.

        Normalizes ``return_format`` ("html" in any case becomes "HTML") and
        includes the fetch/navigation timeout fields only when they were set
        to positive values at construction time.
        """
        # Normalize return_format: the API expects the uppercase spelling.
        if return_format and return_format.lower() == "html":
            normalized_format = "HTML"
        else:
            normalized_format = return_format

        data = {"url": url, "format": normalized_format}

        # Add timeout parameters only when explicitly configured (> 0);
        # -1 (the default) means "let the service decide".
        timeout_params = {}
        if self.fetch_time > 0:
            timeout_params["fetch_time"] = self.fetch_time
        if self.timeout > 0:
            timeout_params["timeout"] = self.timeout
        if self.navi_timeout > 0:
            timeout_params["navi_timeout"] = self.navi_timeout

        if timeout_params:
            logger.debug("Applying timeout parameters: %s", timeout_params)
        data.update(timeout_params)

        return data