HQ-SVC / gradio_app.py

Update gradio_app.py

c7b3ab7 verified 2 days ago

9.46 kB

	import os
	import sys
	import torch
	import numpy as np
	import gradio as gr
	import soundfile as sf
	import tempfile
	import hashlib
	import requests
	import socket
	from huggingface_hub import snapshot_download

	# ================= 1. Environment and smart sync logic (works fully offline) =================
	# Sets HF_HUB_ENABLE_HF_TRANSFER to "0" for this process.
	# NOTE(review): presumably meant to switch off the optional hf_transfer download backend.
	# huggingface_hub is already imported above this line, so confirm the library still
	# honours the value when it is set this late.
	os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"

	def sync_model_files():
	    """Make sure the pretrained weights exist locally, downloading them only when missing.

	    If the core weight paths already exist (relative to the current working
	    directory) nothing is downloaded, so the app can start without network
	    access. Otherwise a best-effort ``snapshot_download`` from the
	    ``shawnpi/HQ-SVC`` repo is attempted. A failed download never raises: the
	    outcome is printed and start-up continues.

	    Returns:
	        None.
	    """
	    repo_id = "shawnpi/HQ-SVC"

	    # Core weight locations checked before deciding whether to download.
	    model_pth = "utils/pretrain/250000_step_val_loss_0.50.pth"
	    vocoder_dir = "utils/pretrain/nsf_hifigan/model"
	    # NOTE(review): the original also named "utils/pretrain/rmvpe/model.pt" here but never
	    # used it in the check; it is still left out to keep the trigger condition unchanged.
	    # Confirm whether it should be required too.

	    def weights_present():
	        """Return True when both core weight paths exist on disk."""
	        return os.path.exists(model_pth) and os.path.exists(vocoder_dir)

	    if weights_present():
	        print(">>> [离线模式] 检测到本地权重已完整")
	        return

	    # Local files are incomplete: try to fetch them over the network.
	    print(">>> [同步模式] 本地权重不完整，正在检测网络以获取权重...")

	    try:
	        snapshot_download(
	            repo_id=repo_id,
	            allow_patterns=["utils/pretrain/*", "config.json"],
	            local_dir=".",
	            local_dir_use_symlinks=False,
	            resume_download=True,
	        )
	    except Exception as e:
	        # Deliberately best-effort. Re-check the disk instead of reusing the earlier
	        # result: that result is always False at this point, and an interrupted
	        # download may still have delivered the files that matter.
	        if weights_present():
	            print(f">>> 同步失败但本地已有文件，将尝试继续运行。错误: {e}")
	        else:
	            print(f">>> [严重错误] 同步失败且本地缺少权重，程序可能无法运行: {e}")
	    else:
	        print(">>> 权重同步完成。")

	# Run the weight sync before anything else is set up.
	sync_model_files()

	# ================= 2. Paths and model loading =================
	# Put the app directory and its utils/ folder on sys.path for the project imports that follow.
	now_dir = os.path.dirname(os.path.abspath(__file__))
	utils_path = os.path.join(now_dir, 'utils')
	sys.path.append(now_dir)
	if not any(entry == utils_path for entry in sys.path):
	    sys.path.append(utils_path)

	from logger.utils import load_config
	from utils.models.models_v2_beta import load_hq_svc
	from utils.vocoder import Vocoder
	from utils.data_preprocessing import load_facodec, load_f0_extractor, load_volume_extractor, get_processed_file

	# Module-level caches.
	# NET_G, VOCODER, ARGS and PREPROCESSORS are assigned by initialize_models().
	NET_G = None
	VOCODER = None
	ARGS = None
	PREPROCESSORS = {}
	# Updated by predict(): features of the most recent set of target reference files,
	# keyed by an MD5 of their names.
	TARGET_CACHE = {"file_hash": None, "spk_ave": None, "all_tar_f0": None}

	def initialize_models(config_path):
	    """Load config, vocoder, conversion network and preprocessors into the module globals.

	    Args:
	        config_path: Path passed to ``load_config``; it is also stored on the
	            loaded config object as ``config``.

	    Side effects:
	        Rebinds ``ARGS``, ``VOCODER``, ``NET_G`` and ``PREPROCESSORS``.
	    """
	    global NET_G, VOCODER, ARGS, PREPROCESSORS

	    ARGS = load_config(config_path)
	    ARGS.config = config_path
	    target_device = ARGS.device

	    # Vocoder first, then the conversion network, which is switched to eval mode.
	    VOCODER = Vocoder(
	        vocoder_type='nsf-hifigan',
	        vocoder_ckpt='utils/pretrain/nsf_hifigan/model',
	        device=target_device,
	    )
	    NET_G = load_hq_svc(
	        mode='infer',
	        device=target_device,
	        model_path=ARGS.model_path,
	        args=ARGS,
	    )
	    NET_G.eval()

	    # Feature extractors used by predict() when preprocessing audio.
	    codec_encoder, codec_decoder = load_facodec(target_device)
	    pitch_extractor = load_f0_extractor(ARGS)
	    loudness_extractor = load_volume_extractor(ARGS)
	    PREPROCESSORS = {
	        "fa_encoder": codec_encoder,
	        "fa_decoder": codec_decoder,
	        "f0_extractor": pitch_extractor,
	        "volume_extractor": loudness_extractor,
	        "content_encoder": None,
	        "spk_encoder": None,
	    }

	# ================= 3. Inference logic (kept robust) =================
	def _resolve_upload_path(item):
	    """Return ``item.name`` when the upload has a ``name`` attribute, else ``item`` itself."""
	    return item.name if hasattr(item, 'name') else item


	def _extract_features(audio_path):
	    """Run ``get_processed_file`` on one audio file with the globally loaded models."""
	    return get_processed_file(
	        audio_path, ARGS.sample_rate, ARGS.encoder_sr, VOCODER,
	        PREPROCESSORS["volume_extractor"], PREPROCESSORS["f0_extractor"],
	        PREPROCESSORS["fa_encoder"], PREPROCESSORS["fa_decoder"],
	        None, None, device=ARGS.device,
	    )


	def _collect_target_features(target_paths, device):
	    """Process up to 20 reference files and combine their features.

	    Returns ``(spk_ave, all_tar_f0)``: the mean of the stacked ``'spk'`` tensors
	    and the concatenated ``'f0_origin'`` arrays. Returns ``(None, None)`` when no
	    reference could be processed.
	    """
	    spk_list, f0_list = [], []
	    for f_path in target_paths[:20]:
	        if not f_path or not os.path.exists(f_path):
	            continue
	        t_data = _extract_features(f_path)
	        if t_data:
	            spk_list.append(t_data['spk'])
	            f0_list.append(t_data['f0_origin'])
	    if not spk_list:
	        return None, None
	    spk_ave = torch.stack(spk_list).mean(dim=0).squeeze().to(device)
	    return spk_ave, np.concatenate(f0_list)


	def predict(source_audio, target_files, shift_key, adjust_f0):
	    """Convert (or, with no references, reconstruct) the source audio and write a WAV file.

	    Args:
	        source_audio: Filesystem path of the source recording, or None.
	        target_files: Optional list of reference uploads (objects with ``.name``
	            or plain paths). Empty or None selects reconstruction mode, where
	            the source's own features are used as the target.
	        shift_key: Pitch shift in semitones; None is treated as 0.
	        adjust_f0: When true and references are given, ``shift_key`` is replaced
	            by the rounded semitone gap between mean target and mean source f0.

	    Returns:
	        ``(status_message, wav_path)``; ``wav_path`` is None on any failure.
	    """
	    global TARGET_CACHE
	    if source_audio is None:
	        return "⚠️ 系统提示：未检测到源音频。请确保文件已上传完毕。", None

	    if not os.path.exists(source_audio):
	        return "❌ 系统错误：找不到音频文件，请重新上传。", None

	    # The UI can be launched without initialize_models() having run; report that
	    # instead of failing with an AttributeError on ARGS.
	    if ARGS is None or NET_G is None or VOCODER is None:
	        return "❌ 系统错误：模型尚未初始化，请检查配置文件。", None

	    # A cleared number field arrives as None.
	    if shift_key is None:
	        shift_key = 0

	    device = ARGS.device

	    try:
	        with torch.no_grad():
	            target_paths = [_resolve_upload_path(f) for f in (target_files or [])]
	            is_reconstruction = not target_paths
	            src_data = None

	            if is_reconstruction:
	                # The source doubles as the target, so its features are computed once
	                # and reused below.
	                # NOTE(review): assumes get_processed_file is deterministic — confirm.
	                src_data = _extract_features(source_audio)
	                spk_ave = src_data['spk'].squeeze().to(device)
	                all_tar_f0 = src_data['f0_origin']
	                status = "✨ Super-Resolution"
	            else:
	                # Newline-joined so different name lists cannot collapse to one key.
	                current_hash = hashlib.md5("\n".join(target_paths).encode()).hexdigest()
	                if TARGET_CACHE["file_hash"] == current_hash:
	                    spk_ave, all_tar_f0 = TARGET_CACHE["spk_ave"], TARGET_CACHE["all_tar_f0"]
	                    status = "🚀 Cache Loaded"
	                else:
	                    spk_ave, all_tar_f0 = _collect_target_features(target_paths, device)
	                    if spk_ave is None:
	                        return "❌ 终端提示：参考音频处理失败。", None
	                    TARGET_CACHE.update({"file_hash": current_hash, "spk_ave": spk_ave, "all_tar_f0": all_tar_f0})
	                    status = "✅ VOICE CONVERSION"

	            if src_data is None:
	                src_data = _extract_features(source_audio)
	            f0 = src_data['f0'].unsqueeze(0).to(device)

	            if adjust_f0 and not is_reconstruction:
	                # Only frames with f0 > 0 take part in the means.
	                src_f0_valid = src_data['f0_origin'][src_data['f0_origin'] > 0]
	                tar_f0_valid = all_tar_f0[all_tar_f0 > 0]
	                if len(src_f0_valid) > 0 and len(tar_f0_valid) > 0:
	                    shift_key = round(12 * np.log2(tar_f0_valid.mean() / src_f0_valid.mean()))

	            f0 = f0 * 2 ** (float(shift_key) / 12)
	            mel_g = NET_G(
	                src_data['vq_post'].unsqueeze(0).to(device),
	                f0,
	                src_data['vol'].unsqueeze(0).to(device),
	                spk_ave,
	                gt_spec=None,
	                infer=True,
	                infer_speedup=ARGS.infer_speedup,
	                method=ARGS.infer_method,
	                vocoder=VOCODER,
	            )
	            wav_g = VOCODER.infer(mel_g, f0) if ARGS.vocoder == 'nsf-hifigan' else VOCODER.infer(mel_g)

	            # mktemp() only returns a name and is race-prone; create the file for real
	            # and keep it (delete=False) so the caller can read it afterwards.
	            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
	                out_p = tmp_wav.name
	            # NOTE(review): the output rate is hard-coded to 44100 rather than taken from
	            # ARGS.sample_rate — confirm the two always agree.
	            sf.write(out_p, wav_g.squeeze().cpu().numpy(), 44100)
	            return f"{status} | Pitch Shifted: {shift_key}", out_p
	    except Exception as e:
	        # UI boundary: surface the failure in the status box instead of raising.
	        return f"❌ 推理运行出错：{str(e)}", None

	# Stylesheet passed to gr.Blocks(css=...) in build_ui(): pixel-style web font, square
	# corners, a darkened background image, thick borders with hard shadows, and a hidden footer.
	custom_css = """
	@import url('https://fonts.googleapis.com/css2?family=Press+Start+2P&display=swap');
	:root { --font: 'Press Start 2P', cursive !important; }
	* { font-family: 'Press Start 2P', cursive !important; border-radius: 0px !important; }
	.gradio-container {
	background: linear-gradient(rgba(0,0,0,0.85), rgba(0,0,0,0.85)),
	url('https://img.moegirl.org.cn/common/d/d3/K-ON_key_visual_2.jpg');
	background-size: cover;
	}
	.gr-box, .gr-input, .gr-button { border: 4px solid #000 !important; box-shadow: 8px 8px 0px #000 !important; }
	label, p, .time-info { color: #f36c18 !important; font-size: 10px !important; text-transform: uppercase; }
	h1 { color: #FFFF00 !important; text-shadow: 4px 4px 0px #000 !important; text-align: center; }
	button.primary { background-color: #ff69b4 !important; color: #fff !important; }
	footer { display: none !important; }
	"""

	# ================= 4. UI =================
	def build_ui():
	    """Assemble the Gradio Blocks page and wire the convert button to ``predict``.

	    Returns:
	        The constructed ``gr.Blocks`` app. It is not launched here.
	    """
	    banner_html = '<div style="text-align:center; margin:20px 0;"><img src="file/images/kon-new.gif" style="max-width:400px; border:4px solid #000; box-shadow:8px 8px 0px #000;"></div>'

	    with gr.Blocks(css=custom_css, title="HQ-SVC Pixel Pro") as demo:
	        gr.HTML(banner_html)
	        gr.Markdown("# 🎸HQ-SVC: SINGING VOICE CONVERSION AND SUPER-RESOLUTION🍰")

	        with gr.Row():
	            # Left column: inputs and the trigger button.
	            with gr.Column():
	                source_input = gr.Audio(label="STEP 1: SOURCE VOICE", type="filepath")
	                reference_input = gr.File(label="STEP 2: TARGET REFERENCE", file_count="multiple")
	                with gr.Row():
	                    pitch_input = gr.Number(label="PITCH SHIFT", value=0)
	                    auto_pitch_input = gr.Checkbox(label="AUTO PITCH", value=False)
	                convert_button = gr.Button("🎤 START CONVERSION!", variant="primary")

	            # Right column: status text and the converted audio.
	            with gr.Column():
	                terminal_output = gr.Textbox(label="SYSTEM TERMINAL", interactive=False)
	                audio_output = gr.Audio(label="OUTPUT (44.1kHz HQ)")

	        # Input order matches predict(source_audio, target_files, shift_key, adjust_f0).
	        predict_inputs = [source_input, reference_input, pitch_input, auto_pitch_input]
	        predict_outputs = [terminal_output, audio_output]
	        convert_button.click(predict, predict_inputs, predict_outputs)

	    return demo

	if __name__ == "__main__":
	    # Script entry point: load the models when the config is present, then serve the UI.
	    config_p = "configs/hq_svc_infer.yaml"
	    if os.path.exists(config_p):
	        initialize_models(config_p)
	    else:
	        # This case used to be skipped silently, leaving a UI that could not run inference.
	        print(f">>> [严重错误] 未找到配置文件 {config_p}，模型未初始化，推理将不可用。")

	    demo = build_ui()
	    temp_dir = tempfile.gettempdir()
	    # dirname(__file__) can be '' when the script is started by a relative name, so go
	    # through abspath first to make every allowed path absolute.
	    app_dir = os.path.dirname(os.path.abspath(__file__))
	    demo.launch(
	        share=True,
	        allowed_paths=[os.path.join(app_dir, "images"), app_dir, temp_dir],
	    )