diff --git a/.gitignore b/.gitignore index cd6f659..1bfd7f6 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,11 @@ ehthumbs.db Thumbs.db +# Python cache +__pycache__/ +*.pyc +*.pyo + # Virtual environments .venv venv/ diff --git a/backend/src/sandbox/local/local_sandbox.py b/backend/src/sandbox/local/local_sandbox.py index 4bb44ba..9e46283 100644 --- a/backend/src/sandbox/local/local_sandbox.py +++ b/backend/src/sandbox/local/local_sandbox.py @@ -142,7 +142,7 @@ class LocalSandbox(Sandbox): shell=True, capture_output=True, text=True, - timeout=30, + timeout=600, ) output = result.stdout if result.stderr: diff --git a/skills/public/podcast-generation/SKILL.md b/skills/public/podcast-generation/SKILL.md new file mode 100644 index 0000000..8143e21 --- /dev/null +++ b/skills/public/podcast-generation/SKILL.md @@ -0,0 +1,124 @@ +--- +name: podcast-generation +description: Use this skill when the user requests to generate, create, or produce podcasts from text content. Converts written content into a two-host conversational podcast audio format with natural dialogue. +--- + +# Podcast Generation Skill + +## Overview + +This skill generates high-quality podcast audio from text content using a multi-stage pipeline. The workflow includes script generation (converting input to conversational dialogue), text-to-speech synthesis, and audio mixing to produce the final podcast. + +## Core Capabilities + +- Convert any text content (articles, reports, documentation) into podcast scripts +- Generate natural two-host conversational dialogue (male and female hosts) +- Synthesize speech audio using text-to-speech +- Mix audio chunks into a final podcast MP3 file +- Support both English and Chinese content + +## Workflow + +### Step 1: Understand Requirements + +When a user requests podcast generation, identify: + +- Source content: The text/article/report to convert into a podcast +- Language: English or Chinese (auto-detected from content) +- Output location: Where to save the generated podcast +- You don't need to check the folder under `/mnt/user-data` + +### Step 2: Prepare Input Content + +The input content should be plain text or markdown. Save it to a text file in `/mnt/user-data/workspace/` with naming pattern: `{descriptive-name}-content.md` + +### Step 3: Execute Generation + +Call the Python script directly without any concerns about timeout or the need for pre-testing: + +```bash +python /mnt/skills/public/podcast-generation/scripts/generate.py \ + --input-file /mnt/user-data/workspace/content-file.md \ + --output-file /mnt/user-data/outputs/generated-podcast.mp3 \ + --locale en +``` + +Parameters: + +- `--input-file`: Absolute path to input text/markdown file (required) +- `--output-file`: Absolute path to output MP3 file (required) +- `--locale`: Language locale - "en" for English or "zh" for Chinese (optional, auto-detected if not specified) + +> [!IMPORTANT] +> - Execute the script in one complete call. Do NOT split the workflow into separate steps (e.g., testing script generation first, then TTS). +> - The script handles all external API calls and audio generation internally with proper timeout management. +> - Do NOT read the Python file, just call it with the parameters. + +## Podcast Generation Example + +User request: "Generate a podcast about the history of artificial intelligence" + +Step 1: Create content file `/mnt/user-data/workspace/ai-history-content.md` with the source text: +```markdown +# The History of Artificial Intelligence + +Artificial intelligence has a rich history spanning over seven decades... + +## Early Beginnings (1950s) +The term "artificial intelligence" was coined by John McCarthy in 1956... + +## The First AI Winter (1970s) +After initial enthusiasm, AI research faced significant setbacks... + +## Modern Era (2010s-Present) +Deep learning revolutionized the field with breakthrough results... +``` + +Step 2: Execute generation: +```bash +python /mnt/skills/public/podcast-generation/scripts/generate.py \ + --input-file /mnt/user-data/workspace/ai-history-content.md \ + --output-file /mnt/user-data/outputs/ai-history-podcast.mp3 \ + --locale en +``` + +## Specific Templates + +Read the following template file only when matching the user request. + +- [Tech Explainer](templates/tech-explainer.md) - For converting technical documentation and tutorials + +## Output Format + +The generated podcast follows the "Hello Deer" format: +- Two hosts: one male, one female +- Natural conversational dialogue +- Starts with "Hello Deer" greeting +- Target duration: approximately 10 minutes +- Alternating speakers for engaging flow + +## Output Handling + +After generation: + +- Podcasts are saved in `/mnt/user-data/outputs/` +- Share generated podcast with user using `present_files` tool +- Provide brief description of the generation result (topic, duration, hosts) +- Offer to regenerate if adjustments needed + +## Requirements + +The following environment variables must be set: +- `OPENAI_API_KEY` or equivalent LLM API key for script generation +- `VOLCENGINE_TTS_APPID`: Volcengine TTS application ID +- `VOLCENGINE_TTS_ACCESS_TOKEN`: Volcengine TTS access token +- `VOLCENGINE_TTS_CLUSTER`: Volcengine TTS cluster (optional, defaults to "volcano_tts") + +## Notes + +- **Always execute the full pipeline in one call** - no need to test individual steps or worry about timeouts +- Input content language is auto-detected and matched in output +- The script generation uses LLM to create natural conversational dialogue +- Technical content is automatically simplified for audio accessibility +- Complex notations (formulas, code) are translated to plain language +- Long content may result in longer podcasts diff --git a/skills/public/podcast-generation/scripts/generate.py b/skills/public/podcast-generation/scripts/generate.py new file mode 100644 index 0000000..5ad7ea9 --- /dev/null +++ b/skills/public/podcast-generation/scripts/generate.py @@ -0,0 +1,349 @@ +import argparse +import base64 +import json +import logging +import os +import re +import uuid +from typing import Literal, Optional + +import requests + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +# Types +class ScriptLine: + def __init__(self, speaker: Literal["male", "female"] = "male", paragraph: str = ""): + self.speaker = speaker + self.paragraph = paragraph + + +class Script: + def __init__(self, locale: Literal["en", "zh"] = "en", lines: list[ScriptLine] = None): + self.locale = locale + self.lines = lines or [] + + @classmethod + def from_dict(cls, data: dict) -> "Script": + script = cls(locale=data.get("locale", "en")) + for line in data.get("lines", []): + script.lines.append( + ScriptLine( + speaker=line.get("speaker", "male"), + paragraph=line.get("paragraph", ""), + ) + ) + return script + + +# Prompt template for script generation +SCRIPT_WRITER_PROMPT = """You are a skilled podcast script writer for "Hello Deer", a conversational podcast show with two hosts. + +Transform the provided content into an engaging podcast script following these guidelines: + +## Format Requirements +- Output as JSON with this structure: {{"locale": "en" or "zh", "lines": [{{"speaker": "male" or "female", "paragraph": "dialogue text"}}]}} +- Only two hosts: male and female, alternating naturally +- Target runtime: approximately 10 minutes of dialogue +- Start with the male host saying a greeting that includes "Hello Deer" + +## Tone & Style +- Natural, conversational dialogue - like two friends chatting +- Use casual expressions and conversational transitions +- Avoid overly formal language or academic tone +- Include reactions, follow-up questions, and natural interjections + +## Content Guidelines +- Frequent back-and-forth between hosts +- Keep sentences short and easy to follow when spoken +- Plain text only - no markdown formatting in the output +- Translate technical concepts into accessible language +- No mathematical formulas, code, or complex notation +- Make content engaging and accessible for audio-only listeners +- Exclude meta information like dates, author names, or document structure + +## Language +- Match the locale of the input content +- Use "{locale}" for the output locale + +Now transform this content into a podcast script: + +{content} +""" + + +def extract_json_from_text(text: str) -> dict: + """Extract JSON from text that might contain markdown code blocks or extra content.""" + # Try to find JSON in markdown code blocks first + json_block_pattern = r"```(?:json)?\s*(\{[\s\S]*?\})\s*```" + match = re.search(json_block_pattern, text) + if match: + return json.loads(match.group(1)) + + # Try to find raw JSON object + json_pattern = r"\{[\s\S]*\}" + match = re.search(json_pattern, text) + if match: + return json.loads(match.group(0)) + + # Last resort: try parsing the whole text + return json.loads(text) + + +def generate_script(content: str, locale: str) -> Script: + """Generate podcast script from content using LLM.""" + logger.info("Generating podcast script...") + + api_key = os.getenv("OPENAI_API_KEY") + base_url = os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1") + model = os.getenv("OPENAI_MODEL", "gpt-4o") + + if not api_key: + raise ValueError("OPENAI_API_KEY environment variable is not set") + + prompt = SCRIPT_WRITER_PROMPT.format(content=content, locale=locale) + + # First try with JSON mode + try: + response = requests.post( + f"{base_url}/chat/completions", + headers={ + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + }, + json={ + "model": model, + "messages": [ + {"role": "system", "content": "You are a podcast script writer. Always respond with valid JSON only, no markdown formatting."}, + {"role": "user", "content": prompt}, + ], + "response_format": {"type": "json_object"}, + }, + ) + + if response.status_code != 200: + raise Exception(f"LLM API error: {response.status_code} - {response.text}") + + result = response.json() + logger.info(f"API response keys: {result.keys()}") + if "error" in result: + raise Exception(f"API error: {result['error']}") + response_content = result["choices"][0]["message"]["content"] + logger.info(f"LLM response preview: {response_content[:200]}...") + script_json = json.loads(response_content) + + except (json.JSONDecodeError, KeyError) as e: + # Fallback: try without JSON mode for models that don't support it + logger.warning(f"JSON mode failed ({e}), trying without response_format...") + + response = requests.post( + f"{base_url}/chat/completions", + headers={ + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + }, + json={ + "model": model, + "messages": [ + {"role": "system", "content": "You are a podcast script writer. Respond with valid JSON only, no markdown or extra text."}, + {"role": "user", "content": prompt}, + ], + }, + ) + + if response.status_code != 200: + raise Exception(f"LLM API error: {response.status_code} - {response.text}") + + result = response.json() + response_content = result["choices"][0]["message"]["content"] + logger.debug(f"LLM response (fallback): {response_content[:500]}...") + script_json = extract_json_from_text(response_content) + + # Validate structure + if "lines" not in script_json: + raise ValueError(f"Invalid script format: missing 'lines' key. Got keys: {list(script_json.keys())}") + + script = Script.from_dict(script_json) + + logger.info(f"Generated script with {len(script.lines)} lines") + return script + + +def text_to_speech(text: str, voice_type: str) -> Optional[bytes]: + """Convert text to speech using Volcengine TTS.""" + app_id = os.getenv("VOLCENGINE_TTS_APPID") + access_token = os.getenv("VOLCENGINE_TTS_ACCESS_TOKEN") + cluster = os.getenv("VOLCENGINE_TTS_CLUSTER", "volcano_tts") + + if not app_id or not access_token: + raise ValueError( + "VOLCENGINE_TTS_APPID and VOLCENGINE_TTS_ACCESS_TOKEN environment variables must be set" + ) + + url = "https://openspeech.bytedance.com/api/v1/tts" + + # Authentication: Bearer token with semicolon separator + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer;{access_token}", + } + + payload = { + "app": { + "appid": app_id, + "token": "access_token", # literal string, not the actual token + "cluster": cluster, + }, + "user": {"uid": "podcast-generator"}, + "audio": { + "voice_type": voice_type, + "encoding": "mp3", + "speed_ratio": 1.2, + }, + "request": { + "reqid": str(uuid.uuid4()), # must be unique UUID + "text": text, + "text_type": "plain", + "operation": "query", + }, + } + + try: + response = requests.post(url, json=payload, headers=headers) + + if response.status_code != 200: + logger.error(f"TTS API error: {response.status_code} - {response.text}") + return None + + result = response.json() + if result.get("code") != 3000: + logger.error(f"TTS error: {result.get('message')} (code: {result.get('code')})") + return None + + audio_data = result.get("data") + if audio_data: + return base64.b64decode(audio_data) + + except Exception as e: + logger.error(f"TTS error: {str(e)}") + + return None + + +def tts_node(script: Script) -> list[bytes]: + """Convert script lines to audio chunks using TTS.""" + logger.info("Converting script to audio...") + audio_chunks = [] + + for i, line in enumerate(script.lines): + # Select voice based on speaker gender + if line.speaker == "male": + voice_type = "zh_male_yangguangqingnian_moon_bigtts" # Male voice + else: + voice_type = "zh_female_sajiaonvyou_moon_bigtts" # Female voice + + logger.info(f"Processing line {i + 1}/{len(script.lines)} ({line.speaker})") + audio = text_to_speech(line.paragraph, voice_type) + + if audio: + audio_chunks.append(audio) + else: + logger.warning(f"Failed to generate audio for line {i + 1}") + + logger.info(f"Generated {len(audio_chunks)} audio chunks") + return audio_chunks + + +def mix_audio(audio_chunks: list[bytes]) -> bytes: + """Combine audio chunks into a single audio file.""" + logger.info("Mixing audio chunks...") + output = b"".join(audio_chunks) + logger.info("Audio mixing complete") + return output + + +def detect_locale(content: str) -> str: + """Auto-detect content locale based on character analysis.""" + chinese_chars = sum(1 for char in content if "\u4e00" <= char <= "\u9fff") + total_chars = len(content) + + if total_chars > 0 and chinese_chars / total_chars > 0.1: + return "zh" + return "en" + + +def generate_podcast( + input_file: str, + output_file: str, + locale: Optional[str] = None, +) -> str: + """Generate a podcast from input content.""" + + # Read input content + with open(input_file, "r", encoding="utf-8") as f: + content = f.read() + + if not content.strip(): + raise ValueError("Input file is empty") + + # Auto-detect locale if not specified + if not locale: + locale = detect_locale(content) + logger.info(f"Auto-detected locale: {locale}") + + # Step 1: Generate script + script = generate_script(content, locale) + + # Step 2: Convert to audio + audio_chunks = tts_node(script) + + if not audio_chunks: + raise Exception("Failed to generate any audio") + + # Step 3: Mix audio + output_audio = mix_audio(audio_chunks) + + # Save output + output_dir = os.path.dirname(output_file) + if output_dir: + os.makedirs(output_dir, exist_ok=True) + with open(output_file, "wb") as f: + f.write(output_audio) + + return f"Successfully generated podcast to {output_file}" + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Generate podcast from text content") + parser.add_argument( + "--input-file", + required=True, + help="Absolute path to input text/markdown file", + ) + parser.add_argument( + "--output-file", + required=True, + help="Output path for generated podcast MP3", + ) + parser.add_argument( + "--locale", + choices=["en", "zh"], + default=None, + help="Language locale (auto-detected if not specified)", + ) + + args = parser.parse_args() + + try: + result = generate_podcast( + args.input_file, + args.output_file, + args.locale, + ) + print(result) + except Exception as e: + import traceback + print(f"Error generating podcast: {e}") + traceback.print_exc() diff --git a/skills/public/podcast-generation/templates/tech-explainer.md b/skills/public/podcast-generation/templates/tech-explainer.md new file mode 100644 index 0000000..8dff4af --- /dev/null +++ b/skills/public/podcast-generation/templates/tech-explainer.md @@ -0,0 +1,63 @@ +# Tech Explainer Podcast Template + +Use this template when converting technical documentation, API guides, or developer tutorials into podcasts. + +## Input Preparation + +When the user wants to convert technical content to a podcast, help them structure the input: + +1. **Simplify Code Examples**: Replace code snippets with plain language descriptions + - Instead of showing actual code, describe what the code does + - Focus on concepts rather than syntax + +2. **Remove Complex Notation**: + - Mathematical formulas should be explained in words + - API endpoints described by function rather than URL paths + - Configuration examples summarized as settings descriptions + +3. **Add Context**: + - Explain why the technology matters + - Include real-world use cases + - Add analogies for complex concepts + +## Example Transformation + +### Original Technical Content: +```markdown +# Using the API + +POST /api/v1/users +{ + "name": "John", + "email": "john@example.com" +} + +Response: 201 Created +``` + +### Podcast-Ready Content: +```markdown +# Creating Users with the API + +The user creation feature allows applications to register new users in the system. +When you want to add a new user, you send their name and email address to the server. +If everything goes well, the server confirms the user was created successfully. +This is commonly used in signup flows, admin dashboards, or when importing users from other systems. +``` + +## Generation Command + +```bash +python /mnt/skills/public/podcast-generation/scripts/generate.py \ + --input-file /mnt/user-data/workspace/tech-content.md \ + --output-file /mnt/user-data/outputs/tech-explainer-podcast.mp3 \ + --locale en +``` + +## Tips for Technical Podcasts + +- Keep episodes focused on one main concept +- Use analogies to explain abstract concepts +- Include practical "why this matters" context +- Avoid jargon without explanation +- Make the dialogue accessible to beginners