直播与短视频
直播与短视频知识分享
AI短视频自动化生产技术方案:从脚本生成到多模态合成的完整流水线
AI短视频自动化生产技术方案:从脚本生成到多模态合成的完整流水线
# AI短视频自动化生产技术方案:从脚本生成到多模态合成的完整流水线
## 摘要
短视频生产效率是2026年内容创作的核心竞争力。AI自动化生产流水线将人工3小时的工作压缩到10分钟以内——从热点抓取、脚本生成、素材匹配、AI配音、字幕生成到视频合成全流程自动化。
## 一、整体架构设计
```
┌─────────────────────────────────────────────┐
│ AI短视频自动化生产流水线 │
├─────────────────────────────────────────────┤
│ 1. 选题引擎 │
│ 热点抓取 → 趋势分析 → 选题推荐 │
├─────────────────────────────────────────────┤
│ 2. 脚本工厂 │
│ LLM生成 → 分镜规划 → 多版本草稿 │
├─────────────────────────────────────────────┤
│ 3. 素材中心 │
│ 库存检索 → AI生成 → 版权检测 │
├─────────────────────────────────────────────┤
│ 4. 合成引擎 │
│ AI配音 → 字幕 → 转场 → 视频渲染 │
├─────────────────────────────────────────────┤
│ 5. 质量检测 │
│ 画质检测 → 内容合规 → A/B封面 │
└─────────────────────────────────────────────┘
```
## 二、实战:全流程代码实现
### 2.1 热点抓取与选题
```python
import requests
from readability import Document
from bs4 import BeautifulSoup
class TrendingFetcher:
    """Fetch trending topics from Chinese social platforms (Weibo, Zhihu).

    NOTE(review): ``self.headers`` is built but never attached to any
    request, and the "douyin" source is configured but never fetched —
    confirm whether both are intentional.
    """

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.headers = {"Authorization": f"Bearer {api_key}"}
        # Hot-list endpoints per platform.
        self.sources = {
            "weibo": "https://weibo.com/ajax/side/hotSearch",
            "zhihu": "https://www.zhihu.com/api/v4/search/top_search",
            "douyin": "https://www.douyin.com/hot",
        }

    def fetch_all_trends(self) -> list[dict]:
        """Fetch Weibo and Zhihu hot lists, merged and sorted by heat.

        Returns:
            At most 20 dicts of ``{"title", "heat", "source"}``, hottest first.

        FIX: a network/JSON failure on one platform no longer aborts the
        whole fetch (the original let any exception propagate).
        NOTE(review): "heat" scales differ per platform, so the merged sort
        compares incomparable magnitudes — confirm this is acceptable.
        """
        trends: list[dict] = []
        trends.extend(self._fetch_weibo())
        trends.extend(self._fetch_zhihu())
        return sorted(trends, key=lambda x: x["heat"], reverse=True)[:20]

    def _fetch_weibo(self) -> list[dict]:
        """Top-20 Weibo realtime hot searches; [] on any network/JSON error."""
        try:
            resp = requests.get(self.sources["weibo"], timeout=10)
            if resp.status_code != 200:
                return []
            data = resp.json()
        except (requests.RequestException, ValueError):
            return []
        return [
            {"title": item["word"], "heat": item["num"], "source": "weibo"}
            for item in data.get("data", {}).get("realtime", [])[:20]
        ]

    def _fetch_zhihu(self) -> list[dict]:
        """Top-20 Zhihu top searches; [] on any network/JSON error."""
        try:
            resp = requests.get(self.sources["zhihu"], timeout=10)
            if resp.status_code != 200:
                return []
            data = resp.json()
        except (requests.RequestException, ValueError):
            return []
        return [
            {
                "title": item["query"],
                "heat": item["normalized_hot_value"],
                "source": "zhihu",
            }
            for item in data.get("data", [])[:20]
        ]

    def screen_by_keywords(self, trends: list[dict], keywords: list[str]) -> list[dict]:
        """Keep trends whose title matches any keyword (case-insensitive).

        Keywords are treated as regular expressions, as in the original
        implementation; input order of ``trends`` is preserved.
        """
        import re

        screened = []
        for trend in trends:
            if any(re.search(kw, trend["title"], re.IGNORECASE) for kw in keywords):
                screened.append(trend)
        return screened
```
### 2.2 脚本生成(LLM调用)
```python
import json
import os

from openai import OpenAI

# FIX(review): the original created a default OpenAI client (api.openai.com)
# but requested model "qwen-max", which that endpoint does not serve.  Qwen
# models are reached through DashScope's OpenAI-compatible endpoint, so a
# base_url is required.  The key is read from the environment instead of a
# hard-coded placeholder; the placeholder remains the fallback.
client = OpenAI(
    api_key=os.environ.get("DASHSCOPE_API_KEY", "your-api-key"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

SCRIPT_SYSTEM_PROMPT = """你是一位资深短视频脚本策划,擅长制造爆款。
输出JSON格式:
{
"title": "吸引人的标题(15字内)",
"hook": "前3秒钩子(一句话)",
"scenes": [
{
"duration": 5,
"visual": "画面描述",
"audio": "旁白文案",
"text_overlay": "屏幕文字",
"bgm_cue": "背景音乐提示"
}
],
"cta": "行动号召(关注/点赞)",
"duration_total": 60
}"""


def generate_script(topic: str, duration: int = 60) -> dict:
    """Generate a short-video script via the LLM.

    Args:
        topic: subject of the video.
        duration: target length in seconds (default 60).

    Returns:
        Parsed dict following the JSON schema in SCRIPT_SYSTEM_PROMPT
        (title/hook/scenes/cta/duration_total).

    Raises:
        json.JSONDecodeError: if the model output is not valid JSON
            (response_format requests a JSON object, so this is rare).
    """
    response = client.chat.completions.create(
        model="qwen-max",
        messages=[
            {"role": "system", "content": SCRIPT_SYSTEM_PROMPT},
            {"role": "user", "content": f"主题:{topic}\n时长:{duration}秒\n风格:实用干货"},
        ],
        response_format={"type": "json_object"},
        temperature=0.7,  # some creativity while keeping structure stable
    )
    return json.loads(response.choices[0].message.content)
```
### 2.3 素材匹配与AI生成
```python
from PIL import Image
import requests
from io import BytesIO
class MaterialProvider:
    """Finds visual materials: free stock search first, AI generation second."""

    def __init__(self):
        # Default themes for curating a local stock library.
        self.stock_keywords = ["科技", "AI", "数据", "代码"]

    def search_stock(self, query: str, count: int = 5) -> list[str]:
        """Search Pexels for up to ``count`` photos and return their URLs.

        FIX: the original interpolated ``query`` raw into the URL, so spaces
        or CJK characters produced an invalid request; ``params=`` lets
        requests URL-encode it.  A timeout is also added so a hung API call
        cannot stall the pipeline.
        """
        url = "https://api.pexels.com/v1/search"
        headers = {"Authorization": "YOUR_PEXELS_KEY"}
        resp = requests.get(
            url,
            headers=headers,
            params={"query": query, "per_page": count},
            timeout=10,
        ).json()
        return [photo["src"]["large"] for photo in resp.get("photos", [])]

    def generate_ai_image(self, prompt: str, style: str = "cinematic") -> str:
        """Generate a 16:9 image with DALL-E 3 and return its URL.

        NOTE(review): relies on a module-level ``client`` (OpenAI) defined
        elsewhere in the file — confirm it is in scope where this is used.
        """
        response = client.images.generate(
            model="dall-e-3",
            prompt=f"{prompt}, {style} style, high quality, 16:9",
            size="1792x1024",
            quality="hd",
            n=1,
        )
        return response.data[0].url

    def download_and_resize(self, url: str, size=(1920, 1080)) -> Image.Image:
        """Download an image and resize it to ``size`` (default 1080p)."""
        resp = requests.get(url, timeout=10)
        img = Image.open(BytesIO(resp.content))
        return img.resize(size, Image.LANCZOS)
```
### 2.4 AI配音(TTS)
```python
from elevenlabs import generate, save, set_api_key
class TTSEngine:
    """Wraps ElevenLabs text-to-speech synthesis."""

    def __init__(self, elevenlabs_key: str):
        set_api_key(elevenlabs_key)

    def synthesize(self, text: str, voice: str = "zh-CN-Standard-A") -> str:
        """Synthesize ``text`` to an MP3 file and return its path.

        NOTE(review): the default voice name looks like a Google Cloud TTS
        voice id, not an ElevenLabs one — confirm against the account's
        available voices.
        """
        import hashlib
        import os
        import tempfile

        # Alternative: Edge TTS (free, good Mandarin quality)
        # import edge_tts
        # await edge_tts.Communicate(text, "zh-CN-XiaoxiaoNeural").save(output_path)
        audio = generate(
            text=text,
            voice=voice,
            model="eleven_multilingual_v2"
        )
        # FIX: the original named the file with hash(text), which is
        # randomized per process (PYTHONHASHSEED), so the same text produced
        # a different filename on every run; a content digest is stable.
        # tempfile.gettempdir() replaces the POSIX-only hard-coded "/tmp".
        digest = hashlib.md5(text.encode("utf-8")).hexdigest()[:16]
        output_path = os.path.join(tempfile.gettempdir(), f"tts_{digest}.mp3")
        save(audio, output_path)
        return output_path
```
### 2.5 字幕生成(ASR + 时间轴)
```python
from aip import AipSpeech # 百度语音识别
class SubtitleGenerator:
    """Turns a voice-over audio file into SRT subtitles via Baidu ASR."""

    def __init__(self, app_id, api_key, secret_key):
        # Baidu AI speech client (credentials from the Baidu AI console).
        self.client = AipSpeech(app_id, api_key, secret_key)

    def generate_subtitles(self, audio_path: str) -> list[dict]:
        """Run ASR on ``audio_path`` and return timed subtitle entries.

        Returns:
            List of dicts with keys ``index``, ``start``, ``end`` and
            ``text``; [] when the response carries no segments.

        NOTE(review): Baidu's standard ``asr`` API normally returns only
        {"result": [text]} without per-segment timestamps — confirm the
        configured service actually emits a "segments" list.
        """
        with open(audio_path, "rb") as f:
            audio_data = f.read()
        result = self.client.asr(audio_data, "mp3", 16000, {
            "dev_pid": 1537,  # Mandarin model
            "cuid": "short-video-bot",
        })
        # FIX: the original guarded on `"result" in result` but then read
        # result["segments"] — a response carrying segments without a
        # "result" key was silently dropped.  Guard on the key actually
        # consumed (``.get`` already yields [] when it is absent).
        subtitles = []
        for i, segment in enumerate(result.get("segments", [])):
            subtitles.append({
                "index": i + 1,
                "start": segment["start"],
                "end": segment["end"],
                "text": segment["text"],
            })
        return subtitles

    def to_srt(self, subtitles: list[dict]) -> str:
        """Serialize subtitle dicts to SRT text (HH:MM:SS,mmm timestamps)."""
        def fmt(seconds):
            # Split whole seconds into h/m/s, then recover fractional ms.
            h, rem = divmod(int(seconds), 3600)
            m, s = divmod(rem, 60)
            ms = int((seconds - int(seconds)) * 1000)
            return f"{h:02}:{m:02}:{s:02},{ms:03}"
        lines = []
        for sub in subtitles:
            lines.append(str(sub["index"]))
            lines.append(f"{fmt(sub['start'])} --> {fmt(sub['end'])}")
            lines.append(sub["text"])
            lines.append("")  # blank line separates SRT cues
        return "\n".join(lines)
```
### 2.6 视频合成(MoviePy)
```python
from moviepy.editor import *
from moviepy.audio.fx import audio_fadein, audio_fadeout
class VideoComposer:
    """Assembles scene images, text overlays and voice-over into an MP4."""

    def compose(self, script: dict, materials: list, voice_path: str, subtitle_path: str) -> str:
        """Render the final video described by ``script``.

        Args:
            script: dict with "title" and "scenes" (each scene has
                "duration" and "text_overlay").
            materials: per-scene image sources for ImageClip; scenes beyond
                the list get a dark color card.
            voice_path: voice-over audio file.
            subtitle_path: NOTE(review) — accepted but never used; the
                burned-in text comes from each scene's "text_overlay".
                Confirm whether SRT muxing was intended.

        Returns:
            Path of the rendered MP4.
        """
        import re

        clips = []
        total_duration = 0
        for i, scene in enumerate(script["scenes"]):
            # Scene background: supplied material, or a dark color card when
            # the material list is shorter than the scene list.
            if i < len(materials):
                base = ImageClip(materials[i]).set_duration(scene["duration"])
            else:
                base = ColorClip(size=(1920, 1080), color=(30, 30, 30), duration=scene["duration"])
            # FIX: the original guarded the overlay with
            # `if i < len(script["scenes"])`, which is always true inside a
            # loop over script["scenes"]; the bare-image else branch was dead.
            overlay = TextClip(
                scene["text_overlay"],
                font="SimHei",
                fontsize=60,
                color="white",
                stroke_color="black",
                stroke_width=2,
            ).set_position(("center", 0.8), relative=True).set_duration(scene["duration"])
            clips.append(CompositeVideoClip([base, overlay]))
            total_duration += scene["duration"]
        # Concatenate scenes, then attach the faded-in voice-over.
        video = concatenate_videoclips(clips, method="compose")
        voice = AudioFileClip(voice_path).set_duration(total_duration)
        voice = audio_fadein(voice, 0.5)
        video = video.set_audio(voice)
        # Optional background-music mix (kept from the original, disabled):
        # bgm = AudioFileClip("bgm.mp3").set_duration(total_duration).volumex(0.1)
        # video = video.set_audio(CompositeAudioClip([voice, bgm]))
        # FIX: titles may contain path separators or other characters that
        # are illegal in filenames; sanitize before building the path.
        safe_title = re.sub(r"[^\w\u4e00-\u9fff-]+", "_", script["title"])
        output_path = f"/tmp/output_{safe_title}.mp4"
        video.write_videofile(
            output_path,
            fps=30,
            codec="libx264",
            audio_codec="aac",
            temp_audiofile="temp-audio.m4a",
            remove_temp=True,
            preset="fast",  # trades a little size for render speed
        )
        return output_path
```
## 三、流水线编排
```python
from dataclasses import dataclass
from typing import List
@dataclass
class VideoTask:
    """A single short-video production job tracked through the pipeline."""
    topic: str  # subject fed to script generation
    duration: int = 60  # target video length in seconds
    status: str = "pending" # pending/processing/done/failed
class VideoProductionPipeline:
    """End-to-end orchestration: script -> materials -> TTS -> subtitles -> render."""

    def __init__(self):
        # NOTE(review): placeholder credentials ("...") must be filled in
        # before production use.
        self.trend_fetcher = TrendingFetcher(api_key="...")
        self.material = MaterialProvider()
        self.tts = TTSEngine(elevenlabs_key="...")
        self.subtitle = SubtitleGenerator(...)
        self.composer = VideoComposer()

    def produce(self, task: VideoTask) -> str:
        """Run the full pipeline for one task and return the output path.

        Updates ``task.status`` to "processing", then "done" on success or
        "failed" on error (FIX: the "failed" state declared on VideoTask was
        never set by the original; exceptions are re-raised after marking).
        """
        task.status = "processing"
        try:
            # Step 1: script.  FIX: the original called an undefined
            # ScriptGenerator class; section 2.2 defines generate_script().
            print(f"[{task.topic}] 生成脚本...")
            script = generate_script(task.topic, task.duration)
            # Step 2: materials.  FIX: indexing [0] raised IndexError when
            # the stock search returned nothing; fall back to AI generation.
            print(f"[{task.topic}] 获取素材...")
            materials = []
            for scene in script["scenes"]:
                hits = self.material.search_stock(scene["visual"])
                img_url = hits[0] if hits else self.material.generate_ai_image(scene["visual"])
                materials.append(self.material.download_and_resize(img_url))
            # Step 3: voice-over for the concatenated scene narration.
            print(f"[{task.topic}] AI配音...")
            full_text = " ".join(s["audio"] for s in script["scenes"])
            voice_path = self.tts.synthesize(full_text)
            # Step 4: subtitles from the synthesized audio.
            print(f"[{task.topic}] 生成字幕...")
            subtitles = self.subtitle.generate_subtitles(voice_path)
            srt_content = self.subtitle.to_srt(subtitles)
            # Step 5: final render.
            print(f"[{task.topic}] 合成视频...")
            output = self.composer.compose(script, materials, voice_path, srt_content)
        except Exception:
            task.status = "failed"
            raise
        task.status = "done"
        print(f"[{task.topic}] 完成!输出: {output}")
        return output
# Batch production.  FIX: the original ran at import time; the __main__
# guard keeps importing this module from triggering network-heavy rendering.
if __name__ == "__main__":
    pipeline = VideoProductionPipeline()
    tasks = [
        VideoTask(topic="Python异步编程入门"),
        VideoTask(topic="Docker容器实战技巧"),
        VideoTask(topic="Rust vs Go性能对比"),
    ]
    for task in tasks:
        pipeline.produce(task)
```
## 四、成本与性能优化
| 环节 | 方案 | 单次成本 | 优化方案 |
|------|------|---------|---------|
| 脚本生成 | Qwen Max | ¥0.02 | 用Qwen-Turbo ¥0.002 |
| 素材获取 | Pexels免费 | ¥0 | 自建素材库 |
| AI配音 | Edge TTS | 免费 | 已是免费方案;追求音质可换ElevenLabs(按量计费) |
| 字幕生成 | 百度ASR | ¥0.01/分钟 | Whisper本地部署 |
| 视频合成 | MoviePy CPU | 算力成本 | GPU加速(快5倍) |
| **合计** | | **¥0.03/条** | **¥0.01/条** |
## 总结
AI短视频自动化生产已从概念变成可落地的工程方案。核心是选对工具链(LLM脚本+免费TTS+MoviePy/FFmpeg合成),并用流水线并行化生产。单条成本可控制在¥0.01以内,是人肉生产的1/300。
---
*本文由北科信息日采集系统自动生成,发布日期:2026-05-05*