模型详情

这是一个 CRNN 声音事件检测模型，先在 AudioSet 上进行预训练，然后在 AudioSet-strong 上进行微调。该模型包含 8 个卷积层和一个 GRU，时间分辨率为 40ms，总参数约 640 万。

用法

import torch
from transformers import AutoModel
import torchaudio

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The model code is loaded from the Hub repository, hence trust_remote_code.
model = AutoModel.from_pretrained(
    "wsntxxn/cnn8rnn-audioset-sed",
    trust_remote_code=True
).to(device)

# Load each file, resample it to the rate the model was configured with, and
# reduce it to a 1-D mono waveform (average the channels if there are several).
wav1, sr1 = torchaudio.load("/path/to/file1.wav")
wav1 = torchaudio.functional.resample(wav1, sr1, model.config.sample_rate)
wav1 = wav1.mean(0) if wav1.size(0) > 1 else wav1[0]

wav2, sr2 = torchaudio.load("/path/to/file2.wav")
wav2 = torchaudio.functional.resample(wav2, sr2, model.config.sample_rate)
wav2 = wav2.mean(0) if wav2.size(0) > 1 else wav2[0]

# Zero-pad the shorter waveform so both fit in one (batch, time) tensor, then
# move the batch to the same device as the model to avoid a device mismatch.
wav_batch = torch.nn.utils.rnn.pad_sequence(
    [wav1, wav2], batch_first=True
).to(device)

with torch.no_grad():
    output = model(waveform=wav_batch)
    # output: {
    #     "framewise_output": (2, 447, n_frames),
    #     "clipwise_output": (2, 447)
    # }

# The class names are available in `model.classes`.
# For example, the frame-level probability sequence of male speech is:
male_speech_idx = model.classes.index("Male speech, man speaking")
# `output` is a dict, so select the frame-level tensor before indexing it.
male_speech_prob = output["framewise_output"][:, male_speech_idx, :]

Ascend NPU 用法（昇腾 NPU 适配）

import torch
from transformers import AutoModel
import torchaudio
from npu_compat import get_device, model_to_device

# Device selection is delegated to npu_compat (auto-detect NPU > CUDA > CPU).
device = get_device()
on_npu = device.type == 'npu'

model = AutoModel.from_pretrained(
    "wsntxxn/cnn8rnn-audioset-sed",
    trust_remote_code=True
)
model = model_to_device(model, device)
if on_npu:
    # Ascend910 GRU requires float16, so cast the weights to half precision.
    model = model.half()
model.eval()

# Load the file and resample it to the rate the model was configured with.
wav1, sr1 = torchaudio.load("/path/to/file1.wav")
wav1 = torchaudio.functional.resample(wav1, sr1, model.config.sample_rate)
# Reduce to a 1-D mono waveform: average the channels if there are several.
if wav1.size(0) > 1:
    wav1 = wav1.mean(0)
else:
    wav1 = wav1[0]

# Build a batch of one waveform, shaped (batch, time).
wav_batch = torch.nn.utils.rnn.pad_sequence([wav1], batch_first=True)
if on_npu:
    # The input dtype must match the half-precision model weights.
    wav_batch = wav_batch.half()

with torch.no_grad():
    # The batch is moved to the model's device only for the forward pass.
    output = model(waveform=wav_batch.to(device))

用法

import torch
from transformers import AutoModel
import torchaudio

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The model code is loaded from the Hub repository, hence trust_remote_code.
model = AutoModel.from_pretrained(
    "wsntxxn/cnn8rnn-audioset-sed",
    trust_remote_code=True
).to(device)

# Load each file, resample it to the rate the model was configured with, and
# reduce it to a 1-D mono waveform (average the channels if there are several).
wav1, sr1 = torchaudio.load("/path/to/file1.wav")
wav1 = torchaudio.functional.resample(wav1, sr1, model.config.sample_rate)
wav1 = wav1.mean(0) if wav1.size(0) > 1 else wav1[0]

wav2, sr2 = torchaudio.load("/path/to/file2.wav")
wav2 = torchaudio.functional.resample(wav2, sr2, model.config.sample_rate)
wav2 = wav2.mean(0) if wav2.size(0) > 1 else wav2[0]

# Zero-pad the shorter waveform so both fit in one (batch, time) tensor, then
# move the batch to the same device as the model to avoid a device mismatch.
wav_batch = torch.nn.utils.rnn.pad_sequence(
    [wav1, wav2], batch_first=True
).to(device)

with torch.no_grad():
    output = model(waveform=wav_batch)
    # output: {
    #     "framewise_output": (2, 447, n_frames),
    #     "clipwise_output": (2, 447)
    # }

# The class names are available in `model.classes`.
# For example, the frame-level probability sequence of male speech is:
male_speech_idx = model.classes.index("Male speech, man speaking")
# `output` is a dict, so select the frame-level tensor before indexing it.
male_speech_prob = output["framewise_output"][:, male_speech_idx, :]

Ascend NPU 用法（昇腾 NPU 适配）

import torch
from transformers import AutoModel
import torchaudio
from npu_compat import get_device, model_to_device

# Device selection is delegated to npu_compat (auto-detect NPU > CUDA > CPU).
device = get_device()
on_npu = device.type == 'npu'

model = AutoModel.from_pretrained(
    "wsntxxn/cnn8rnn-audioset-sed",
    trust_remote_code=True
)
model = model_to_device(model, device)
if on_npu:
    # Ascend910 GRU requires float16, so cast the weights to half precision.
    model = model.half()
model.eval()

# Load the file and resample it to the rate the model was configured with.
wav1, sr1 = torchaudio.load("/path/to/file1.wav")
wav1 = torchaudio.functional.resample(wav1, sr1, model.config.sample_rate)
# Reduce to a 1-D mono waveform: average the channels if there are several.
if wav1.size(0) > 1:
    wav1 = wav1.mean(0)
else:
    wav1 = wav1[0]

# Build a batch of one waveform, shaped (batch, time).
wav_batch = torch.nn.utils.rnn.pad_sequence([wav1], batch_first=True)
if on_npu:
    # The input dtype must match the half-precision model weights.
    wav_batch = wav_batch.half()

with torch.no_grad():
    # The batch is moved to the model's device only for the forward pass.
    output = model(waveform=wav_batch.to(device))