feat: integrate volcengine tts functionality

2026-04-16 11:24:45 +08:00 · 2025-04-18 15:28:31 +08:00
parent b2f14d1737
commit a6ab97c970
6 changed files with 251 additions and 8 deletions
--- a/src/tools/init.py
+++ b/src/tools/init.py
@@ -1,6 +1,8 @@
 # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
 # SPDX-License-Identifier: MIT

+import os
+
 from .crawl import crawl_tool
 from .python_repl import python_repl_tool
 from .search import (
@@ -9,6 +11,7 @@ from .search import (
    brave_search_tool,
    arxiv_search_tool,
 )
+from .tts import VolcengineTTS
 from src.config import SELECTED_SEARCH_ENGINE, SearchEngine

 # Map search engine names to their respective tools
@@ -25,4 +28,5 @@ __all__ = [
    "crawl_tool",
    "web_search_tool",
    "python_repl_tool",
+    "VolcengineTTS",
 ]
--- a/src/tools/tts.py
+++ b/src/tools/tts.py
@@ -0,0 +1,151 @@
+# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
+# SPDX-License-Identifier: MIT
+
+"""
+Text-to-Speech module using volcengine TTS API.
+"""
+
+import json
+import uuid
+import logging
+import requests
+from typing import Optional, Dict, Any
+
+logger = logging.getLogger(__name__)
+
+
+class VolcengineTTS:
+    """
+    Client for volcengine Text-to-Speech API.
+
+    ``text_to_speech`` reports API and transport failures in its returned
+    dictionary instead of raising them.
+    """
+
+    def __init__(
+        self,
+        appid: str,
+        access_token: str,
+        cluster: str = "volcano_tts",
+        voice_type: str = "BV700_V2_streaming",
+        host: str = "openspeech.bytedance.com",
+        timeout: float = 30.0,
+    ):
+        """
+        Initialize the volcengine TTS client.
+
+        Args:
+            appid: Platform application ID
+            access_token: Access token for authentication
+            cluster: TTS cluster name
+            voice_type: Voice type to use
+            host: API host
+            timeout: Timeout in seconds applied to each API request
+        """
+        self.appid = appid
+        self.access_token = access_token
+        self.cluster = cluster
+        self.voice_type = voice_type
+        self.host = host
+        self.timeout = timeout
+        self.api_url = f"https://{host}/api/v1/tts"
+        # The semicolon after "Bearer" is the format this API expects.
+        self.header = {"Authorization": f"Bearer;{access_token}"}
+
+    def text_to_speech(
+        self,
+        text: str,
+        encoding: str = "mp3",
+        speed_ratio: float = 1.0,
+        volume_ratio: float = 1.0,
+        pitch_ratio: float = 1.0,
+        text_type: str = "plain",
+        with_frontend: int = 1,
+        frontend_type: str = "unitTson",
+        uid: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        """
+        Convert text to speech using volcengine TTS API.
+
+        Args:
+            text: Text to convert to speech
+            encoding: Audio encoding format
+            speed_ratio: Speech speed ratio
+            volume_ratio: Speech volume ratio
+            pitch_ratio: Speech pitch ratio
+            text_type: Text type (plain or ssml)
+            with_frontend: Whether to use frontend processing
+            frontend_type: Frontend type
+            uid: User ID (generated if not provided)
+
+        Returns:
+            Dictionary with "success" and "audio_data" (base64-encoded audio,
+            None on failure), plus "response" on success or "error" otherwise
+        """
+        if not text:
+            # Nothing to synthesize, so skip the network round trip.
+            return {"success": False, "error": "No text provided", "audio_data": None}
+
+        request_json = {
+            "app": {
+                "appid": self.appid,
+                "token": self.access_token,
+                "cluster": self.cluster,
+            },
+            "user": {"uid": uid or str(uuid.uuid4())},
+            "audio": {
+                "voice_type": self.voice_type,
+                "encoding": encoding,
+                "speed_ratio": speed_ratio,
+                "volume_ratio": volume_ratio,
+                "pitch_ratio": pitch_ratio,
+            },
+            "request": {
+                "reqid": str(uuid.uuid4()),
+                "text": text,
+                "text_type": text_type,
+                "operation": "query",
+                "with_frontend": with_frontend,
+                "frontend_type": frontend_type,
+            },
+        }
+
+        try:
+            logger.debug("Sending TTS request for text: %s...", text[:50])
+            # requests waits indefinitely unless a timeout is given.
+            response = requests.post(
+                self.api_url,
+                data=json.dumps(request_json),
+                headers=self.header,
+                timeout=self.timeout,
+            )
+            try:
+                response_json = response.json()
+            except ValueError:
+                # Error pages from proxies or gateways may not be JSON.
+                response_json = None
+
+            if response.status_code != 200:
+                error = response.text if response_json is None else response_json
+                logger.error("TTS API error: %s", error)
+                return {"success": False, "error": error, "audio_data": None}
+
+            # An empty or missing payload cannot be decoded by callers.
+            if not isinstance(response_json, dict) or not response_json.get("data"):
+                logger.error("TTS API returned no data: %s", response_json)
+                return {
+                    "success": False,
+                    "error": "No audio data returned",
+                    "audio_data": None,
+                }
+
+            return {
+                "success": True,
+                "response": response_json,
+                "audio_data": response_json["data"],  # Base64 encoded audio data
+            }
+
+        except Exception as e:
+            # Keep the no-raise contract: transport errors become error results.
+            logger.exception("Error in TTS API call: %s", e)
+            return {"success": False, "error": str(e), "audio_data": None}