| import gradio as gr |
| import torch |
| import torchaudio |
| import numpy as np |
| import os |
| import sys |
| from pathlib import Path |
| import tempfile |
| import soundfile as sf |
|
|
| |
# Page heading and introductory blurb; both are rendered through gr.Markdown
# when the Blocks UI is built further down in this file.
title = "CosyVoice - 语音处理模型"
description = """
CosyVoice是一个先进的语音处理模型,支持语音识别、语音合成等功能。

上传音频文件或使用麦克风录制语音,体验CosyVoice的强大功能。
"""


# Module-level cache managed by load_cosyvoice_model():
#   cosyvoice_model - the loaded model entry, or None when nothing usable
#                     could be loaded (the app then runs in demo mode).
#   model_loaded    - True once a load attempt has finished, whether or not it
#                     succeeded, so that later calls return the cached result.
cosyvoice_model = None
model_loaded = False
|
|
| |
# Hugging Face repository every loading strategy pulls from.
_COSYVOICE_REPO_ID = "FunAudioLLM/CosyVoice-300M"


def _load_official_cosyvoice():
    """Load the model through the official ``cosyvoice`` package.

    Returns:
        dict: model entry with the keys ``model``, ``type``, ``has_inference``
        and ``sample_rate``.

    Raises:
        ImportError: when ``cosyvoice`` or ``huggingface_hub`` is missing.
        Exception: whatever the download or the model constructor raises.
    """
    # A vendored Matcha-TTS checkout next to this file is put on sys.path
    # before the cosyvoice import is attempted.
    third_party_path = os.path.join(os.path.dirname(__file__), 'third_party', 'Matcha-TTS')
    if os.path.exists(third_party_path):
        sys.path.insert(0, third_party_path)

    from cosyvoice.cli.cosyvoice import CosyVoice

    print(f"从 {_COSYVOICE_REPO_ID} 加载...")

    from huggingface_hub import snapshot_download
    model_dir = snapshot_download(repo_id=_COSYVOICE_REPO_ID, cache_dir="./models")

    cosyvoice = CosyVoice(model_dir=model_dir)
    return {
        'model': cosyvoice,
        'type': 'cosyvoice_official',
        'has_inference': True,
        'sample_rate': getattr(cosyvoice, 'sample_rate', 22050),
    }


def _load_with_transformers():
    """Load the model through ``transformers.AutoModel``.

    Returns:
        dict: model entry shaped like the one from the official loader, with
        ``type`` set to ``'transformers'``.

    Raises:
        ValueError: when the loaded object exposes neither ``inference_sft``
            nor ``inference_zero_shot``.
        Exception: whatever the import or ``from_pretrained`` raises.
    """
    from transformers import AutoModel

    print(f"从 {_COSYVOICE_REPO_ID} 加载...")

    model = AutoModel.from_pretrained(
        _COSYVOICE_REPO_ID,
        trust_remote_code=True,
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
    )
    model.eval()

    has_inference_sft = hasattr(model, 'inference_sft')
    has_inference_zero_shot = hasattr(model, 'inference_zero_shot')
    has_inference_cross_lingual = hasattr(model, 'inference_cross_lingual')

    print(f"模型类型: {type(model).__name__}")
    print("推理方法:")
    print(f" - inference_sft: {has_inference_sft}")
    print(f" - inference_zero_shot: {has_inference_zero_shot}")
    print(f" - inference_cross_lingual: {has_inference_cross_lingual}")

    if not (has_inference_sft or has_inference_zero_shot):
        print("⚠ 模型缺少必要的推理方法")
        raise ValueError("Model missing inference methods")

    return {
        'model': model,
        'type': 'transformers',
        'has_inference': True,
        'sample_rate': getattr(model, 'sample_rate', 22050),
    }


def _download_model_files():
    """Download the raw model files and explain that no engine can run them."""
    print("\n尝试下载模型文件...")
    from huggingface_hub import snapshot_download

    model_dir = snapshot_download(
        repo_id=_COSYVOICE_REPO_ID,
        allow_patterns=["*.pt", "*.pth", "*.bin", "*.json", "*.yaml", "*.txt", "*.safetensors"],
        cache_dir="./models",
    )

    print(f"✓ 模型文件已下载到: {model_dir}")
    print("\n⚠ 注意: 模型文件已下载,但无法加载推理引擎")
    print(" 建议:")
    print(" 1. 安装完整的 CosyVoice 包: pip install cosyvoice")
    print(" 2. 或在 Hugging Face Space 中使用演示模式")


def load_cosyvoice_model():
    """Load the CosyVoice model once and cache the outcome.

    Strategies are tried in order: the official ``cosyvoice`` package, then
    ``transformers.AutoModel``, and finally a plain download of the model
    files (which leaves the app in demo mode). A failure of one strategy, of
    any exception type, moves on to the next one.

    The result is stored in the module globals ``cosyvoice_model`` and
    ``model_loaded``; once ``model_loaded`` is set, later calls return the
    cached value without retrying, even if loading failed.

    Returns:
        dict | None: the model entry, or ``None`` when no inference engine
        could be loaded.
    """
    global cosyvoice_model, model_loaded

    if model_loaded:
        return cosyvoice_model

    import traceback

    print("\n" + "="*60)
    print("正在加载CosyVoice模型...")
    print("="*60)

    loaded = None
    try:
        print("\n尝试使用官方 CosyVoice 包...")
        try:
            loaded = _load_official_cosyvoice()
            print("✓ 成功使用官方 CosyVoice 包加载模型")
        except ImportError as ie:
            print(f"⚠ 官方 CosyVoice 包不可用: {ie}")
            print(" 尝试其他加载方式...")
        except Exception as oe:
            # The package imported but the download or constructor failed;
            # keep going so the remaining strategies still get their chance.
            print(f"⚠ 官方 CosyVoice 包加载失败: {oe}")
            traceback.print_exc()
            print(" 尝试其他加载方式...")

        if loaded is None:
            print("\n尝试使用 transformers AutoModel...")
            try:
                loaded = _load_with_transformers()
                print("✓ 成功使用 transformers 加载模型")
            except Exception as te:
                print(f"⚠ transformers 加载失败: {te}")
                traceback.print_exc()

        if loaded is None:
            _download_model_files()

        print("="*60 + "\n")

    except Exception as e:
        print(f"✗ 模型加载失败: {e}")
        print(f"详细错误:\n{traceback.format_exc()}")

        print("\n⚠ 使用演示模式")
        print("提示: 要使用完整功能,请:")
        print(" 1. 确保网络连接正常")
        print(" 2. 确保有足够的磁盘空间(约2GB)")
        print(" 3. 安装 CosyVoice: pip install cosyvoice")
        print("="*60 + "\n")
        loaded = None

    cosyvoice_model = loaded
    model_loaded = True
    return cosyvoice_model
|
|
def process_audio(audio_file):
    """Describe an uploaded or recorded audio clip.

    No speech recognition is performed: the function reports the clip's
    sample rate, duration and array details together with whether a
    CosyVoice model is loaded.

    Args:
        audio_file: ``None``, a ``(sample_rate, samples)`` tuple, or anything
            ``soundfile.read`` accepts (such as a file path).

    Returns:
        str: a human-readable report, or an error message.
    """
    if audio_file is None:
        return "请上传音频文件"

    try:
        loaded_model = load_cosyvoice_model()

        if isinstance(audio_file, tuple):
            rate, samples = audio_file
        else:
            # Anything that is not a tuple is handed to soundfile for decoding.
            import soundfile as sf
            samples, rate = sf.read(audio_file)

        seconds = len(samples) / rate

        if loaded_model is None:
            # Demo mode: only the basic clip facts are reported.
            return f"""
音频信息:
- 采样率: {rate} Hz
- 时长: {seconds:.2f} 秒
- 数据点数: {len(samples)}

⚠ 演示模式(模型未加载)
提示: 请确保安装CosyVoice模型以使用完整功能
"""

        try:
            return f"""
✓ 音频处理成功

音频信息:
- 采样率: {rate} Hz
- 时长: {seconds:.2f} 秒
- 数据形状: {samples.shape}
- 数据类型: {samples.dtype}

模型状态: CosyVoice模型已加载
注意: CosyVoice主要用于语音合成(TTS),如需语音识别请使用ASR模型
"""
        except Exception as err:
            return f"模型推理失败: {str(err)}"
    except Exception as err:
        return f"处理失败: {str(err)}"
|
|
def _tts_as_waveform(audio):
    """Convert a tensor or array-like into a flat float32 NumPy array."""
    if torch.is_tensor(audio):
        audio = audio.detach().cpu().numpy()
    return np.asarray(audio).reshape(-1).astype(np.float32)


def _tts_pick_speech(outputs):
    """Pull the waveform object out of whatever an inference call returned.

    Dicts are searched for ``tts_speech``, ``audio`` and ``waveform`` (in that
    order) before falling back to their first value; lists and tuples yield
    their first element; anything else is returned unchanged.
    """
    if isinstance(outputs, dict):
        for key in ('tts_speech', 'audio', 'waveform'):
            if key in outputs:
                return outputs[key]
        return next(iter(outputs.values()))
    if isinstance(outputs, (list, tuple)):
        return outputs[0]
    return outputs


def _tts_join_chunks(outputs):
    """Concatenate the pieces yielded by an inference call.

    Returns a flat float32 array, or ``None`` when nothing was yielded.
    """
    chunks = [
        item['tts_speech'] if isinstance(item, dict) and 'tts_speech' in item else item
        for item in outputs
    ]
    if not chunks:
        return None
    if torch.is_tensor(chunks[0]):
        joined = torch.cat(chunks, dim=-1)
    else:
        joined = np.concatenate(chunks, axis=-1)
    return _tts_as_waveform(joined)


def _tts_official(entry, text, speaker, prompt_audio, prompt_text):
    """Synthesize with the official CosyVoice API (zero-shot, else SFT)."""
    import traceback

    cosyvoice = entry['model']
    sample_rate = entry.get('sample_rate', 22050)

    print(f"使用官方CosyVoice API: text={text[:50]}...")

    try:
        # An untouched text field arrives as "" rather than None, so both
        # (and whitespace-only input) select the built-in prompt text.
        if not prompt_text or not prompt_text.strip():
            prompt_text = 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。'

        if not prompt_audio:
            # No upload: look for a bundled reference recording.
            possible_prompt_paths = [
                './asset/zero_shot_prompt.wav',
                './CosyVoice/asset/zero_shot_prompt.wav',
                './zero_shot_prompt.wav',
            ]
            for path in possible_prompt_paths:
                if os.path.exists(path):
                    prompt_audio = path
                    print(f"使用提示音频: {path}")
                    break

        if prompt_audio and os.path.exists(prompt_audio):
            print(f"使用 inference_zero_shot: text={text[:30]}, prompt={prompt_text[:50]}")
            outputs = cosyvoice.inference_zero_shot(
                text,
                prompt_text,
                prompt_audio,
                stream=False
            )
            status = f"✓ 语音合成成功\n文本: {text}\n模式: Zero-shot\n模型: CosyVoice (官方API)"
        else:
            print(f"使用 inference_sft: text={text[:30]}, speaker={speaker}")
            outputs = cosyvoice.inference_sft(text, speaker, stream=False)
            status = f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: CosyVoice (官方API - SFT)"

        audio_data = _tts_join_chunks(outputs)
        if audio_data is None:
            # Without this the callback would return None and the UI, which
            # expects two outputs, would fail.
            return generate_demo_audio(text, speaker, error="模型未返回任何音频数据")
        return (sample_rate, audio_data), status

    except Exception as e:
        print(f"CosyVoice API 调用失败: {e}")
        traceback.print_exc()
        return generate_demo_audio(text, speaker, error=f"API调用失败: {str(e)}")


def _tts_transformers(entry, text, speaker):
    """Synthesize with a model that was loaded through transformers."""
    tts_model = entry['model']
    # Use the rate recorded at load time instead of assuming 22050 Hz.
    sample_rate = entry.get('sample_rate', 22050)

    with torch.no_grad():
        if hasattr(tts_model, 'inference_sft'):
            print(f"使用inference_sft方法: text={text}, speaker={speaker}")
            outputs = tts_model.inference_sft(text, speaker)
        elif hasattr(tts_model, 'inference'):
            print(f"使用inference方法: text={text}, speaker={speaker}")
            outputs = tts_model.inference(text, speaker)
        elif hasattr(tts_model, 'generate'):
            print(f"使用generate方法: text={text}")
            outputs = tts_model.generate(text=text, speaker=speaker)
        else:
            print("模型没有可用的推理方法")
            print(f"可用方法: {[m for m in dir(tts_model) if not m.startswith('_')][:20]}")
            return generate_demo_audio(text, speaker, error="模型缺少推理方法 (inference_sft/inference/generate)")

        audio_data = _tts_as_waveform(_tts_pick_speech(outputs))

    return (sample_rate, audio_data), f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: CosyVoice (transformers)"


def _tts_pytorch(entry, text, speaker):
    """Synthesize with a model entry of type ``'pytorch'``."""
    pytorch_model = entry['model']
    try:
        if hasattr(pytorch_model, 'inference_sft'):
            output = pytorch_model.inference_sft(text, speaker)
        elif hasattr(pytorch_model, 'inference'):
            output = pytorch_model.inference(text, speaker)
        else:
            return generate_demo_audio(text, speaker, error="PyTorch模型缺少推理方法")

        audio_data = _tts_as_waveform(_tts_pick_speech(output))
        return (22050, audio_data), f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: PyTorch"
    except Exception as e:
        return generate_demo_audio(text, speaker, error=f"PyTorch推理失败: {str(e)}")


def _tts_raw_model(model, text, speaker):
    """Synthesize with a bare model object (not wrapped in an entry dict)."""
    if hasattr(model, 'inference_sft'):
        output = model.inference_sft(text, speaker)
        status = f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: CosyVoice官方"
    elif hasattr(model, 'inference'):
        output = model.inference(text, speaker)
        status = f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}"
    else:
        return generate_demo_audio(text, speaker)
    return (22050, _tts_as_waveform(_tts_pick_speech(output))), status


def text_to_speech(text, speaker="中文女", prompt_audio=None, prompt_text=None):
    """Convert text to speech with CosyVoice.

    When no model is available, or inference fails, a synthetic demo tone is
    returned together with a status message that says so.

    Args:
        text: text to synthesize.
        speaker: speaker name (used in SFT mode).
        prompt_audio: path of the reference recording (zero-shot mode). When
            empty, a bundled ``zero_shot_prompt.wav`` is used if one exists;
            otherwise SFT mode is used.
        prompt_text: transcript of the reference recording (zero-shot mode).
            Empty or whitespace-only values select a built-in default.

    Returns:
        tuple: ``((sample_rate, float32 waveform), status message)``, or
        ``(None, message)`` when the text is empty or an unexpected error
        occurs.
    """
    if not text or not text.strip():
        return None, "请输入要转换的文本"

    try:
        model = load_cosyvoice_model()
        if model is None:
            return generate_demo_audio(text, speaker)

        try:
            if not isinstance(model, dict):
                return _tts_raw_model(model, text, speaker)

            model_type = model.get('type', 'unknown')
            if model_type == 'cosyvoice_official':
                return _tts_official(model, text, speaker, prompt_audio, prompt_text)
            if model_type == 'transformers':
                return _tts_transformers(model, text, speaker)
            if model_type == 'pytorch':
                return _tts_pytorch(model, text, speaker)
            # 'downloaded' and unknown entry types have no inference engine.
            return generate_demo_audio(text, speaker)

        except Exception as e:
            print(f"模型推理错误: {str(e)}")
            return generate_demo_audio(text, speaker, error=str(e))

    except Exception as e:
        return None, f"语音合成失败: {str(e)}"
|
|
def generate_demo_audio(text, speaker, error=None):
    """Generate a placeholder tone for when real synthesis is unavailable.

    The tone mixes 440 Hz and 660 Hz sine waves, lasts 0.2 s per character of
    ``text`` (at least 0.2 s, at most 5 s) and is faded in and out.

    Args:
        text: the text that was meant to be synthesized; it determines the
            tone length and is echoed in the status message.
        speaker: speaker name, echoed in the status message.
        error: optional error description appended to the status message.

    Returns:
        tuple: ``((22050, float32 waveform), status message)``.
    """
    sample_rate = 22050
    # Empty text still gets one character's worth of tone; a zero-length
    # signal would make the fade below fail.
    duration = min(max(len(text or ""), 1) * 0.2, 5.0)
    t = np.linspace(0, duration, int(sample_rate * duration), False)

    frequency = 440
    audio_data = 0.3 * np.sin(2 * np.pi * frequency * t)
    audio_data += 0.2 * np.sin(2 * np.pi * frequency * 1.5 * t)

    # 100 ms fade at each end, clamped so the two ramps can never overlap or
    # exceed the signal length.
    fade_samples = min(int(sample_rate * 0.1), audio_data.size // 2)
    if fade_samples > 0:
        audio_data[:fade_samples] *= np.linspace(0, 1, fade_samples)
        audio_data[-fade_samples:] *= np.linspace(1, 0, fade_samples)

    audio_tuple = (sample_rate, audio_data.astype(np.float32))

    status_msg = f"⚠ 演示模式\n文本: {text}\n说话人: {speaker}\n"
    if error:
        status_msg += f"错误: {error}\n"
    status_msg += "提示: 这是演示音频,不是真实的语音合成结果。请确保模型正确加载。"

    return audio_tuple, status_msg
|
|
| |
# Load the model once at import time so the first request does not pay for it.
load_cosyvoice_model()

# Short status label and colour describing the outcome of that load.
model_status_msg, model_status_color = (
    ("✓ CosyVoice模型已成功加载", "green")
    if cosyvoice_model is not None
    else ("⚠ 演示模式(模型未加载)", "orange")
)
|
|
| |
# Prefer the Soft theme and fall back to the default look when it cannot be
# constructed. Only ordinary errors are caught, so Ctrl-C and interpreter
# shutdown are not swallowed here.
try:
    theme = gr.themes.Soft()
except Exception:
    theme = None
|
|
# Gradio UI: a heading, a model-status banner and three tabs (audio
# inspection, text-to-speech, about).
# NOTE(review): the nesting below was reconstructed from a copy of the file
# whose indentation had been lost - confirm the layout against the original.
with gr.Blocks() as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)

    # Banner content depends on whether load_cosyvoice_model() left a model
    # object in the module-level cache.
    if cosyvoice_model is not None:
        status_emoji = "✅"
        status_text = "CosyVoice模型已成功加载并可用"
        status_style = "background-color: #d4edda; padding: 10px; border-radius: 5px; border-left: 4px solid #28a745;"
    else:
        status_emoji = "⚠️"
        status_text = "演示模式 - 模型未加载。要使用完整功能,请安装CosyVoice模型。"
        status_style = "background-color: #fff3cd; padding: 10px; border-radius: 5px; border-left: 4px solid #ffc107;"

    gr.HTML(f'<div style="{status_style}"><strong>{status_emoji} 模型状态:</strong> {status_text}</div>')

    # Tab 1: upload or record audio; process_audio() fills the text box.
    with gr.Tab("语音识别"):
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    label="上传音频文件或录制语音",
                    type="numpy",
                    sources=["upload", "microphone"]
                )
                process_btn = gr.Button("处理音频", variant="primary")

            with gr.Column():
                output_text = gr.Textbox(
                    label="识别结果",
                    lines=5,
                    placeholder="识别结果将显示在这里..."
                )

        process_btn.click(
            fn=process_audio,
            inputs=audio_input,
            outputs=output_text
        )

    # Tab 2: text-to-speech. The four inputs are passed positionally to
    # text_to_speech(text, speaker, prompt_audio, prompt_text).
    with gr.Tab("文本转语音"):
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="输入文本",
                    placeholder="请输入要转换为语音的文本...",
                    lines=3
                )
                speaker_input = gr.Dropdown(
                    label="选择说话人(SFT模式)",
                    choices=["中文女", "中文男", "英文女", "英文男", "粤语女", "粤语男", "日语男", "韩语女"],
                    value="中文女"
                )

                # Optional zero-shot inputs: a reference recording (delivered
                # as a file path) and its transcript.
                with gr.Accordion("高级选项 - Zero-shot 声音克隆", open=False):
                    prompt_audio_input = gr.Audio(
                        label="上传提示音频(3-10秒)",
                        type="filepath",
                        sources=["upload"]
                    )
                    prompt_text_input = gr.Textbox(
                        label="提示文本(音频对应的文字)",
                        placeholder="You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。",
                        lines=2
                    )
                    gr.Markdown("""
**使用说明:**
- 上传一段3-10秒的参考音频
- 输入音频对应的文字内容
- 格式:`You are a helpful assistant.<|endofprompt|>音频对应的文字`
- 系统将克隆该音频的音色来合成新文本
""")

                tts_btn = gr.Button("生成语音", variant="primary")

            with gr.Column():
                audio_output = gr.Audio(label="生成的语音")
                tts_status = gr.Textbox(label="状态")

        # text_to_speech returns (audio, status message), matching the two
        # output components in order.
        tts_btn.click(
            fn=text_to_speech,
            inputs=[text_input, speaker_input, prompt_audio_input, prompt_text_input],
            outputs=[audio_output, tts_status]
        )

    # Tab 3: static description of the model and usage notes.
    with gr.Tab("关于"):
        gr.Markdown("""
## CosyVoice 模型

CosyVoice是一个先进的语音处理模型,具有以下特点:

- 高质量的语音识别
- 自然的语音合成
- 多语言支持
- 实时处理能力

### 使用方法
1. 在"语音识别"标签页上传音频文件进行识别
2. 在"文本转语音"标签页输入文本生成语音
3. 支持麦克风实时录制

### 技术特性
- 基于Transformer架构
- 支持多种音频格式
- 高精度识别和合成
""")
|
|
# Start the Gradio server only when this file is executed as a script; the
# theme chosen earlier in the module is forwarded to launch().
# NOTE(review): confirm the installed Gradio version accepts `theme` on
# launch() rather than on gr.Blocks().
if __name__ == "__main__":
    demo.launch(theme=theme)