feat: add image and video generation skills

2026-04-02 22:02:13 +08:00 · 2026-01-25 21:57:44 +08:00
parent af4fc800ee
commit ae0e7de3b7
6 changed files with 542 additions and 81 deletions
--- a/skills/public/doraemon-comic-aigc/scripts/generate.py
+++ b/skills/public/doraemon-comic-aigc/scripts/generate.py
@@ -1,49 +0,0 @@
-import argparse
-import base64
-import os
-
-import requests
-
-
-def generate_image(prompt: str, output_path: str) -> str:
-    api_key = os.getenv("GEMINI_API_KEY")
-    if not api_key:
-        return "GEMINI_API_KEY is not set"
-    response = requests.post(
-        "https://generativelanguage.googleapis.com/v1beta/models/gemini-3-pro-image-preview:generateContent",
-        headers={
-            "x-goog-api-key": api_key,
-            "Content-Type": "application/json",
-        },
-        json={
-            "generationConfig": {"imageConfig": {"aspectRatio": "9:16"}},
-            "contents": [{"parts": [{"text": prompt}]}],
-        },
-    )
-    parts: list[dict] = response.json()["candidates"][0]["content"]["parts"]
-    image_parts = [part for part in parts if part.get("inlineData", False)]
-    if len(image_parts) == 1:
-        base64_image = image_parts[0]["inlineData"]["data"]
-        # Save the image to a file
-        with open(output_path, "wb") as f:
-            f.write(base64.b64decode(base64_image))
-        return f"Successfully generated image to {output_path}"
-    else:
-        return "Failed to generate image"
-
-
-def main(input_path: str, output_path: str):
-    with open(
-        input_path,
-        "r",
-    ) as f:
-        raw = f.read()
-        print(generate_image(raw, output_path))
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Generate Doraemon comic image")
-    parser.add_argument("--input_path", required=True, help="Path to the input prompt JSON file")
-    parser.add_argument("--output_path", required=True, help="Path to save the output image")
-    args = parser.parse_args()
-    main(args.input_path, args.output_path)
--- a/skills/public/image-generation/SKILL.md
+++ b/skills/public/image-generation/SKILL.md
@@ -0,0 +1,167 @@
+---
+name: image-generation
+description: Use this skill when the user requests to generate, create, imagine, or visualize images including characters, scenes, products, or any visual content. Supports structured prompts and reference images for guided generation.
+---
+
+# Image Generation Skill
+
+## Overview
+
+This skill generates high-quality images using structured prompts and a Python script. The workflow includes creating JSON-formatted prompts and executing image generation with optional reference images.
+
+## Core Capabilities
+
+- Create structured JSON prompts for AIGC image generation
+- Support multiple reference images for style/composition guidance
+- Generate images through automated Python script execution
+- Handle various image generation scenarios (character design, scenes, products, etc.)
+
+## Workflow
+
+### Step 1: Understand Requirements
+
+When a user requests image generation, identify:
+
+- Subject/content: What should be in the image
+- Style preferences: Art style, mood, color palette
+- Technical specs: Aspect ratio, composition, lighting
+- Reference images: Any images to guide generation
+- You don't need to check the folder under `/mnt/user-data`
+
+### Step 2: Create Structured Prompt
+
+Generate a structured JSON file in `/mnt/user-data/workspace/` with naming pattern: `{descriptive-name}.json`
+
+### Step 3: Execute Generation
+
+Call the Python script:
+```bash
+python /mnt/skills/public/image-generation/scripts/generate.py \
+  --prompt-file /mnt/user-data/workspace/prompt-file.json \
+  --reference-images /path/to/ref1.jpg /path/to/ref2.png \
+  --output-file /mnt/user-data/outputs/generated-image.jpg \
+  --aspect-ratio 16:9
+```
+
+Parameters:
+
+- `--prompt-file`: Absolute path to JSON prompt file (required)
+- `--reference-images`: Absolute paths to reference images (optional, space-separated)
+- `--output-file`: Absolute path to output image file (required)
+- `--aspect-ratio`: Aspect ratio of the generated image (optional, default: 16:9)
+
+> [!NOTE]
+> Do NOT read the python file, just call it with the parameters.
+
+## Character Generation Example
+
+User request: "Create a Tokyo street style woman character in 1990s"
+
+Create prompt file: `/mnt/user-data/workspace/asian-woman.json`
+```json
+{
+  "characters": [{
+    "gender": "female",
+    "age": "mid-20s",
+    "ethnicity": "Japanese",
+    "body_type": "slender, elegant",
+    "facial_features": "delicate features, expressive eyes, subtle makeup with emphasis on lips, long dark hair partially wet from rain",
+    "clothing": "stylish trench coat, designer handbag, high heels, contemporary Tokyo street fashion",
+    "accessories": "minimal jewelry, statement earrings, leather handbag",
+    "era": "1990s"
+  }],
+  "negative_prompt": "blurry face, deformed, low quality, overly sharp digital look, oversaturated colors, artificial lighting, studio setting, posed, selfie angle",
+  "style": "Leica M11 street photography aesthetic, film-like rendering, natural color palette with slight warmth, bokeh background blur, analog photography feel",
+  "composition": "medium shot, rule of thirds, subject slightly off-center, environmental context of Tokyo street visible, shallow depth of field isolating subject",
+  "lighting": "neon lights from signs and storefronts, wet pavement reflections, soft ambient city glow, natural street lighting, rim lighting from background neons",
+  "color_palette": "muted naturalistic tones, warm skin tones, cool blue and magenta neon accents, desaturated compared to digital photography, film grain texture"
+}
+```
+
+Execute generation:
+```bash
+python /mnt/skills/public/image-generation/scripts/generate.py \
+  --prompt-file /mnt/user-data/workspace/asian-woman.json \
+  --output-file /mnt/user-data/outputs/asian-woman-01.jpg \
+  --aspect-ratio 2:3
+```
+
+With reference images:
+```json
+{
+  "characters": [{
+    "gender": "based on [Image 1]",
+    "age": "based on [Image 1]",
+    "ethnicity": "human from [Image 1] adapted to Star Wars universe",
+    "body_type": "based on [Image 1]",
+    "facial_features": "matching [Image 1] with slight weathered look from space travel",
+    "clothing": "Star Wars style outfit - worn leather jacket with utility vest, cargo pants with tactical pouches, scuffed boots, belt with holster",
+    "accessories": "blaster pistol on hip, comlink device on wrist, goggles pushed up on forehead, satchel with supplies, personal vehicle based on [Image 2]",
+    "era": "Star Wars universe, post-Empire era"
+  }],
+  "prompt": "Character inspired by [Image 1] standing next to a vehicle inspired by [Image 2] on a bustling alien planet street in Star Wars universe aesthetic. Character wearing worn leather jacket with utility vest, cargo pants with tactical pouches, scuffed boots, belt with blaster holster. The vehicle adapted to Star Wars aesthetic with weathered metal panels, repulsor engines, desert dust covering, parked on the street. Exotic alien marketplace street with multi-level architecture, weathered metal structures, hanging market stalls with colorful awnings, alien species walking by as background characters. Twin suns casting warm golden light, atmospheric dust particles in air, moisture vaporators visible in distance. Gritty lived-in Star Wars aesthetic, practical effects look, film grain texture, cinematic composition.",
+  "negative_prompt": "clean futuristic look, sterile environment, overly CGI appearance, fantasy medieval elements, Earth architecture, modern city",
+  "style": "Star Wars original trilogy aesthetic, lived-in universe, practical effects inspired, cinematic film look, slightly desaturated with warm tones",
+  "composition": "medium wide shot, character in foreground with alien street extending into background, environmental storytelling, rule of thirds",
+  "lighting": "warm golden hour lighting from twin suns, rim lighting on character, atmospheric haze, practical light sources from market stalls",
+  "color_palette": "warm sandy tones, ochre and sienna, dusty blues, weathered metals, muted earth colors with pops of alien market colors",
+  "technical": {
+    "aspect_ratio": "9:16",
+    "quality": "high",
+    "detail_level": "highly detailed with film-like texture"
+  }
+}
+```
+```bash
+python /mnt/skills/public/image-generation/scripts/generate.py \
+  --prompt-file /mnt/user-data/workspace/star-wars-scene.json \
+  --reference-images /mnt/user-data/uploads/character-ref.jpg /mnt/user-data/uploads/vehicle-ref.jpg \
+  --output-file /mnt/user-data/outputs/star-wars-scene-01.jpg \
+  --aspect-ratio 16:9
+```
+
+## Common Scenarios
+
+Use different JSON schemas for different scenarios.
+
+**Character Design**:
+- Physical attributes (gender, age, ethnicity, body type)
+- Facial features and expressions
+- Clothing and accessories
+- Historical era or setting
+- Pose and context
+
+**Scene Generation**:
+- Environment description
+- Time of day, weather
+- Mood and atmosphere
+- Focal points and composition
+
+**Product Visualization**:
+- Product details and materials
+- Lighting setup
+- Background and context
+- Presentation angle
+
+## Specific Templates
+
+Read the following template file only when matching the user request.
+
+- [Doraemon Comic](templates/doraemon.md)
+
+## Output Handling
+
+After generation:
+
+- Images are typically saved in `/mnt/user-data/outputs/`
+- Share generated images with user using present_files tool
+- Provide brief description of the generation result
+- Offer to iterate if adjustments needed
+
+## Notes
+
+- Always use English for prompts regardless of user's language
+- JSON format ensures structured, parsable prompts
+- Reference images enhance generation quality significantly
+- Iterative refinement is normal for optimal results
+- For character generation, include the detailed character object plus a consolidated prompt field
--- a/skills/public/image-generation/scripts/generate.py
+++ b/skills/public/image-generation/scripts/generate.py
@@ -0,0 +1,96 @@
+import base64
+import os
+
+import requests
+
+
+def generate_image(
+    prompt_file: str,
+    reference_images: list[str],
+    output_file: str,
+    aspect_ratio: str = "16:9",
+) -> str:
+    """Generate one image with the Gemini image model and save it to disk.
+
+    Args:
+        prompt_file: Path to a file whose raw text is sent as the prompt.
+        reference_images: Paths to images sent ahead of the prompt as inline
+            data; may be empty.
+        output_file: Path the decoded image bytes are written to.
+        aspect_ratio: Value sent as ``generationConfig.imageConfig.aspectRatio``.
+
+    Returns:
+        A status message: a success message naming ``output_file``, or
+        "GEMINI_API_KEY is not set" when the environment variable is missing.
+
+    Raises:
+        requests.HTTPError: If the API answers with an error status.
+        Exception: If the response does not contain exactly one image part.
+    """
+    import mimetypes
+
+    # Fail fast: no point reading and encoding files without credentials.
+    api_key = os.getenv("GEMINI_API_KEY")
+    if not api_key:
+        return "GEMINI_API_KEY is not set"
+
+    with open(prompt_file, "r", encoding="utf-8") as f:
+        prompt = f.read()
+
+    request_parts = []
+    for reference_image in reference_images:
+        with open(reference_image, "rb") as f:
+            image_b64 = base64.b64encode(f.read()).decode("utf-8")
+        # Declare the file's real type (e.g. image/png); keep the previous
+        # JPEG label only as a fallback for unknown extensions.
+        guessed, _ = mimetypes.guess_type(reference_image)
+        if guessed and guessed.startswith("image/"):
+            mime_type = guessed
+        else:
+            mime_type = "image/jpeg"
+        request_parts.append(
+            {
+                "inlineData": {
+                    "mimeType": mime_type,
+                    "data": image_b64,
+                }
+            }
+        )
+    # The text prompt goes last, after all reference images.
+    request_parts.append({"text": prompt})
+
+    response = requests.post(
+        "https://generativelanguage.googleapis.com/v1beta/models/gemini-3-pro-image-preview:generateContent",
+        headers={
+            "x-goog-api-key": api_key,
+            "Content-Type": "application/json",
+        },
+        json={
+            "generationConfig": {"imageConfig": {"aspectRatio": aspect_ratio}},
+            "contents": [{"parts": request_parts}],
+        },
+    )
+    # Surface API errors directly instead of as a KeyError further down.
+    response.raise_for_status()
+    payload = response.json()
+    try:
+        response_parts: list[dict] = payload["candidates"][0]["content"]["parts"]
+    except (KeyError, IndexError, TypeError) as err:
+        raise Exception(f"Failed to generate image: unexpected response {payload}") from err
+
+    image_parts = [part for part in response_parts if part.get("inlineData")]
+    if len(image_parts) != 1:
+        raise Exception("Failed to generate image")
+
+    # Save the image to a file
+    with open(output_file, "wb") as f:
+        f.write(base64.b64decode(image_parts[0]["inlineData"]["data"]))
+    return f"Successfully generated image to {output_file}"
+
+
+if __name__ == "__main__":
+    import argparse
+
+    # Command-line entry point: parse the paths and aspect ratio, run the
+    # generation once, and print the resulting status message.
+    parser = argparse.ArgumentParser(description="Generate images using Gemini API")
+    parser.add_argument(
+        "--prompt-file",
+        required=True,
+        help="Absolute path to JSON prompt file",
+    )
+    parser.add_argument(
+        "--reference-images",
+        nargs="*",
+        default=[],
+        help="Absolute paths to reference images (space-separated)",
+    )
+    parser.add_argument(
+        "--output-file",
+        required=True,
+        help="Output path for generated image",
+    )
+    parser.add_argument(
+        "--aspect-ratio",
+        required=False,
+        default="16:9",
+        help="Aspect ratio of the generated image",
+    )
+
+    args = parser.parse_args()
+
+    try:
+        print(
+            generate_image(
+                args.prompt_file,
+                args.reference_images,
+                args.output_file,
+                args.aspect_ratio,
+            )
+        )
+    except Exception as e:
+        # Report the cause so the caller can tell a bad path from an API
+        # failure; this matches the video-generation script's error output.
+        print(f"Error while generating image: {e}")
--- a/skills/public/image-generation/templates/doraemon.md
+++ b/skills/public/image-generation/templates/doraemon.md
@@ -1,19 +1,10 @@
---
-name: doraemon-comic-aigc
-description: Generate 8-panel Doraemon comic strip on a single 9:16 canvas. Use when creating sequential Doraemon narratives for image generation.
---
-
 # Doraemon 8-Panel Comic Generator

-Generate JSON spec for 8 panels arranged on ONE 9:16 vertical canvas (1080x1920).
-
 ## Workflow

 1. Extract story context (theme, gadget, conflict, punchline)
 2. Map to 8 narrative beats
-3. Output JSON to `/mnt/user-data/outputs/prompt.json`
-4. Run `python /mnt/skills/custom/doraemon-comic-aigc/scripts/generate.py --input_path /mnt/user-data/outputs/prompt.json --output_path /mnt/user-data/outputs/doraemon.png `
-5. Directly present the output image as well as the `prompt.json` using the `present_files` tool without checking the file existence
+3. Use the provided prompt template to generate the JSON prompt file

 ## Panel Layout

@@ -33,15 +24,13 @@ Right column: x=540, width=450

 ## Characters

-| Name | Primary Color | Key Feature |
-|------|---------------|-------------|
-| Doraemon | #0095D9 | Blue robot cat, red nose, yellow bell |
-| Nobita | #FFD700 | Round glasses, yellow shirt |
-| Shizuka | #FFB6C1 | Pink dress, brown hair |
-| Giant | #FFA500 | Orange shirt, large build |
-| Suneo | #98FB98 | Green outfit, pompadour |
+* Doraemon
+* Nobita
+* Shizuka
+* Giant
+* Suneo

-## Output JSON Schema
+## Prompt Template

 ```json
 {
@@ -55,14 +44,14 @@ Right column: x=540, width=450
      "text": "[Story Title]",
      "position": { "x": 540, "y": 100 },
      "style": {
-        "fontFamily": "Doraemon, sans-serif",
-        "fontSize": 56,
-        "fontWeight": "bold",
+        "font_family": "Doraemon, sans-serif",
+        "font_size": 56,
+        "font_weight": "bold",
        "color": "#0095D9",
-        "textAlign": "center",
+        "text_align": "center",
        "stroke": "#FFFFFF",
-        "strokeWidth": 4,
-        "textShadow": "3px 3px 0px #FFD700"
+        "stroke_width": 4,
+        "text_shadow": "3px 3px 0px #FFD700"
      }
    }
  },
@@ -89,11 +78,11 @@ Right column: x=540, width=450
            "text": "[Dialogue text]",
            "position": { "x": 0, "y": 0 },
            "style": {
-              "bubbleType": "speech",
+              "bubble_type": "speech",
              "backgroundColor": "#FFFFFF",
-              "borderColor": "#000000",
-              "fontSize": 22,
-              "textAlign": "center"
+              "border_color": "#000000",
+              "font_size": 22,
+              "text_align": "center"
            }
          }
        ],
@@ -105,16 +94,19 @@ Right column: x=540, width=450
    "text": "[Closing note] - Doraemon",
    "position": { "x": 540, "y": 1860 },
    "style": {
-      "fontFamily": "Doraemon, sans-serif",
-      "fontSize": 24,
+      "font_family": "Doraemon, sans-serif",
+      "font_size": 24,
      "color": "#0095D9",
-      "textAlign": "center"
+      "text_align": "center"
    }
  },
-  "soundEffects": []
 }
 ```

 ## Story Pattern

 Setup → Problem → Gadget → Misuse → Backfire → Chaos → Consequence → Ironic Punchline
+
+## Aspect Ratio
+
+9:16
--- a/skills/public/video-generation/SKILL.md
+++ b/skills/public/video-generation/SKILL.md
@@ -0,0 +1,139 @@
+---
+name: video-generation
+description: Use this skill when the user requests to generate, create, or imagine videos. Supports structured prompts and reference image for guided generation.
+---
+
+# Video Generation Skill
+
+## Overview
+
+This skill generates high-quality videos using structured prompts and a Python script. The workflow includes creating JSON-formatted prompts and executing video generation with an optional reference image.
+
+## Core Capabilities
+
+- Create structured JSON prompts for AIGC video generation
+- Support reference image as guidance or the first/last frame of the video
+- Generate videos through automated Python script execution
+
+## Workflow
+
+### Step 1: Understand Requirements
+
+When a user requests video generation, identify:
+
+- Subject/content: What should be in the video
+- Style preferences: Art style, mood, color palette
+- Technical specs: Aspect ratio, composition, lighting
+- Reference image: Any image to guide generation
+- You don't need to check the folder under `/mnt/user-data`
+
+### Step 2: Create Structured Prompt
+
+Generate a structured JSON file in `/mnt/user-data/workspace/` with naming pattern: `{descriptive-name}.json`
+
+### Step 3: Create Reference Image (Optional when image-generation skill is available)
+
+Generate reference image for the video generation.
+
+- If only 1 image is provided, use it as the guided frame of the video
+
+### Step 4: Execute Generation
+
+Call the Python script:
+```bash
+python /mnt/skills/public/video-generation/scripts/generate.py \
+  --prompt-file /mnt/user-data/workspace/prompt-file.json \
+  --reference-images /path/to/ref1.jpg \
+  --output-file /mnt/user-data/outputs/generated-video.mp4 \
+  --aspect-ratio 16:9
+```
+
+Parameters:
+
+- `--prompt-file`: Absolute path to JSON prompt file (required)
+- `--reference-images`: Absolute path to the reference image (optional)
+- `--output-file`: Absolute path to output video file (required)
+- `--aspect-ratio`: Aspect ratio of the generated video (optional, default: 16:9)
+
+> [!NOTE]
+> Do NOT read the python file, instead just call it with the parameters.
+
+## Video Generation Example
+
+User request: "Generate a short video clip depicting the opening scene from 'The Chronicles of Narnia: The Lion, the Witch and the Wardrobe'"
+
+Step 1: Search for the opening scene of "The Chronicles of Narnia: The Lion, the Witch and the Wardrobe" online
+
+Step 2: Create a JSON prompt file with the following content:
+
+```json
+{
+  "title": "The Chronicles of Narnia - Train Station Farewell",
+  "background": {
+    "description": "World War II evacuation scene at a crowded London train station. Steam and smoke fill the air as children are being sent to the countryside to escape the Blitz.",
+    "era": "1940s wartime Britain",
+    "location": "London railway station platform"
+  },
+  "characters": ["Mrs. Pevensie", "Lucy Pevensie"],
+  "camera": {
+    "type": "Close-up two-shot",
+    "movement": "Static with subtle handheld movement",
+    "angle": "Profile view, intimate framing",
+    "focus": "Both faces in focus, background soft bokeh"
+  },
+  "dialogue": [
+    {
+      "character": "Mrs. Pevensie",
+      "text": "You must be brave for me, darling. I'll come for you... I promise."
+    },
+    {
+      "character": "Lucy Pevensie",
+      "text": "I will be, mother. I promise."
+    }
+  ],
+  "audio": [
+    {
+      "type": "Train whistle blows (signaling departure)",
+      "volume": 1
+    },
+    {
+      "type": "Strings swell emotionally, then fade",
+      "volume": 0.5
+    },
+    {
+      "type": "Ambient sound of the train station",
+      "volume": 0.5
+    }
+  ]
+}
+```
+
+Step 3: Use the image-generation skill to generate the reference image
+
+Load the image-generation skill and generate a single reference image `narnia-farewell-scene-01.jpg` according to the skill.
+
+Step 4: Use the generate.py script to generate the video
+```bash
+python /mnt/skills/public/video-generation/scripts/generate.py \
+  --prompt-file /mnt/user-data/workspace/narnia-farewell-scene.json \
+  --reference-images /mnt/user-data/outputs/narnia-farewell-scene-01.jpg \
+  --output-file /mnt/user-data/outputs/narnia-farewell-scene-01.mp4 \
+  --aspect-ratio 16:9
+```
+> Do NOT read the python file, just call it with the parameters.
+
+## Output Handling
+
+After generation:
+
+- Videos are typically saved in `/mnt/user-data/outputs/`
+- Share the generated video first, followed by the generated reference image if applicable, using the `present_files` tool
+- Provide brief description of the generation result
+- Offer to iterate if adjustments needed
+
+## Notes
+
+- Always use English for prompts regardless of user's language
+- JSON format ensures structured, parsable prompts
+- Reference images enhance generation quality significantly
+- Iterative refinement is normal for optimal results
--- a/skills/public/video-generation/scripts/generate.py
+++ b/skills/public/video-generation/scripts/generate.py
@@ -0,0 +1,116 @@
+import base64
+import os
+import time
+
+import requests
+
+
+def generate_video(
+    prompt_file: str,
+    reference_images: list[str],
+    output_file: str,
+    aspect_ratio: str = "16:9",
+) -> str:
+    """Generate a video with the Veo model and save it to disk.
+
+    Starts a long-running generation operation, polls it every 3 seconds
+    until it reports ``done``, then downloads the first generated sample.
+
+    Args:
+        prompt_file: Path to a file whose raw text is sent as the prompt.
+        reference_images: Paths to images attached as ``referenceImages`` of
+            type "asset"; may be empty.
+        output_file: Path the downloaded video is written to.
+        aspect_ratio: Value sent as ``parameters.aspectRatio``.
+
+    Returns:
+        A status message: a success message naming ``output_file``, or
+        "GEMINI_API_KEY is not set" when the environment variable is missing.
+
+    Raises:
+        requests.HTTPError: If the initial request is rejected.
+        Exception: If the operation reports an error.
+    """
+    import mimetypes
+
+    # Fail fast: no point reading and encoding files without credentials.
+    api_key = os.getenv("GEMINI_API_KEY")
+    if not api_key:
+        return "GEMINI_API_KEY is not set"
+
+    with open(prompt_file, "r", encoding="utf-8") as f:
+        prompt = f.read()
+
+    instance = {"prompt": prompt}
+    reference_payloads = []
+    for reference_image in reference_images:
+        with open(reference_image, "rb") as f:
+            image_b64 = base64.b64encode(f.read()).decode("utf-8")
+        # Declare the file's real type (e.g. image/png); keep the previous
+        # JPEG label only as a fallback for unknown extensions.
+        guessed, _ = mimetypes.guess_type(reference_image)
+        if guessed and guessed.startswith("image/"):
+            mime_type = guessed
+        else:
+            mime_type = "image/jpeg"
+        reference_payloads.append(
+            {
+                "image": {"mimeType": mime_type, "bytesBase64Encoded": image_b64},
+                "referenceType": "asset",
+            }
+        )
+    # Only attach the key when there is at least one reference image.
+    if reference_payloads:
+        instance["referenceImages"] = reference_payloads
+
+    response = requests.post(
+        "https://generativelanguage.googleapis.com/v1beta/models/veo-3.1-generate-preview:predictLongRunning",
+        headers={
+            "x-goog-api-key": api_key,
+            "Content-Type": "application/json",
+        },
+        json={
+            "instances": [instance],
+            # Previously the requested aspect ratio was accepted but never sent.
+            "parameters": {"aspectRatio": aspect_ratio},
+        },
+    )
+    response.raise_for_status()
+    operation_name = response.json()["name"]
+
+    while True:
+        response = requests.get(
+            f"https://generativelanguage.googleapis.com/v1beta/{operation_name}",
+            headers={
+                "x-goog-api-key": api_key,
+            },
+        )
+        operation = response.json()
+        # A failed operation has no "response" key; report the error instead
+        # of raising a KeyError or polling forever.
+        if "error" in operation:
+            raise Exception(f"Video generation failed: {operation['error']}")
+        if operation.get("done", False):
+            sample = operation["response"]["generateVideoResponse"]["generatedSamples"][0]
+            download(sample["video"]["uri"], output_file)
+            break
+        time.sleep(3)
+    return f"The video has been generated successfully to {output_file}"
+
+
+def download(url: str, output_file: str):
+    """Download ``url`` to ``output_file`` using the Gemini API key.
+
+    Args:
+        url: Address of the generated video file.
+        output_file: Path the response body is written to.
+
+    Returns:
+        "GEMINI_API_KEY is not set" when the environment variable is
+        missing; otherwise ``None`` after the file has been written.
+
+    Raises:
+        requests.HTTPError: If the server answers with an error status.
+    """
+    api_key = os.getenv("GEMINI_API_KEY")
+    if not api_key:
+        return "GEMINI_API_KEY is not set"
+    with requests.get(
+        url,
+        headers={
+            "x-goog-api-key": api_key,
+        },
+        stream=True,
+    ) as response:
+        # Without this check an HTTP error body would be saved as the video.
+        response.raise_for_status()
+        with open(output_file, "wb") as f:
+            # Stream in 1 MiB chunks so the video is not held fully in memory.
+            for chunk in response.iter_content(chunk_size=1024 * 1024):
+                f.write(chunk)
+
+
+if __name__ == "__main__":
+    import argparse
+
+    # Command-line entry point: parse the paths and aspect ratio, run the
+    # generation once, and print the resulting status message.
+    parser = argparse.ArgumentParser(description="Generate videos using Gemini API")
+    parser.add_argument(
+        "--prompt-file",
+        required=True,
+        help="Absolute path to JSON prompt file",
+    )
+    parser.add_argument(
+        "--reference-images",
+        nargs="*",
+        default=[],
+        help="Absolute paths to reference images (space-separated)",
+    )
+    parser.add_argument(
+        "--output-file",
+        required=True,
+        help="Output path for generated video",
+    )
+    parser.add_argument(
+        "--aspect-ratio",
+        required=False,
+        default="16:9",
+        help="Aspect ratio of the generated video",
+    )
+
+    args = parser.parse_args()
+
+    try:
+        print(
+            generate_video(
+                args.prompt_file,
+                args.reference_images,
+                args.output_file,
+                args.aspect_ratio,
+            )
+        )
+    except Exception as e:
+        # Top-level boundary: report the cause rather than a traceback.
+        print(f"Error while generating video: {e}")