BigVGAN是NVIDIA开发的一款通用神经声码器,能将梅尔频谱图转换为高保真音频波形,支持零样本生成未见过的说话人、语言、音乐等音频。
核心创新点:
代码仓库:NVIDIA/BigVGAN
| 配套软件 | 版本要求 |
|---|---|
| Python | 3.11.10 |
| torch | 2.5.1 |
| torch_npu | 2.5.1 |
| torchvision | 0.16.0 |
| torchaudio | 2.5.1 |
# Make the installer packages executable
chmod +x ./Ascend-cann-toolkit_{version}_linux-{arch}.run
chmod +x ./Ascend-cann-kernels-{soc}_{version}_linux.run
# Verify the installer packages
./Ascend-cann-toolkit_{version}_linux-{arch}.run --check
./Ascend-cann-kernels-{soc}_{version}_linux.run --check
# Run the installation
./Ascend-cann-toolkit_{version}_linux-{arch}.run --install
./Ascend-cann-kernels-{soc}_{version}_linux.run --install
# Set environment variables
source /usr/local/Ascend/ascend-toolkit/set_env.sh
git clone https://github.com/NVIDIA/BigVGAN
cd BigVGAN
pip install -r requirements.txt

device = 'npu'
import torch
import bigvgan
import librosa
from meldataset import get_mel_spectrogram
import torch_npu
from torch_npu.contrib import transfer_to_npu
from zhl_bigvgan import BigVGAN
# NPU configuration: disallow internal formats and turn off JIT compile mode.
torch_npu.npu.config.allow_internal_format = False
torch_npu.npu.set_compile_mode(jit_compile=False)
# Load the pretrained model with the custom CUDA kernel disabled.
model = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_24khz_100band_256x', use_cuda_kernel=False)
# Prepare the model: remove weight norm, switch to eval mode, move to `device`.
model.remove_weight_norm()
model = model.eval().to(device)
# Audio pipeline: load the input as mono at the model's sampling rate.
wav_path = '/path/to/your/audio.wav'
wav, sr = librosa.load(wav_path, sr=model.h.sampling_rate, mono=True)
wav = torch.FloatTensor(wav).unsqueeze(0)  # add a leading dimension
# Compute the mel spectrogram and move it to `device`.
mel = get_mel_spectrogram(wav, model.h).to(device)
# Generate the waveform. The forward pass must be indented under the
# `with` statement; otherwise the script fails with an IndentationError.
with torch.inference_mode():
    wav_gen = model(mel)
# Drop the leading dimension and bring the result back to the CPU.
wav_gen_float = wav_gen.squeeze(0).cpu()
# Format conversion: scale by 32767 and cast to int16.
wav_gen_int16 = (wav_gen_float * 32767.0).numpy().astype('int16')

优化效果: