Table of Contents
- Importing the required packages
- 1. Data preprocessing
- 1.1 Downloading the dataset
- 1.2 Audio segmentation
- 1.3 Padding audio to a common length
- 1.4 Custom dataset class
- 2. Building the convolutional neural network
- 3. Training and testing
- 4. Results
Importing the required packages
import os
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torchaudio
from torchaudio.datasets import YESNO
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import librosa
1. Data preprocessing
1.1 Downloading the dataset
# The YESNO dataset is downloaded into the ./dataset/waves_yesno/ folder.
# It contains 60 audio files with names like 0_0_1_1_0_1_0_1.wav; each recording contains the
# eight words (yes or no) given in the file name, where 0 stands for yes and 1 stands for no.
yesno_data = YESNO(root='./dataset', download=True)
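Each item of the dataset is a (waveform, sample_rate, labels) tuple, so the download can be sanity-checked by indexing the dataset directly. A minimal check (the printed sample count is only an example; it varies per file):
# Inspect the first recording: a (1, num_samples) tensor, the 8 kHz sample rate,
# and the list of eight 0/1 values taken from the file name.
wav, sr, word_labels = yesno_data[0]
print(wav.shape, sr, word_labels)   # e.g. torch.Size([1, 50901]) 8000 [0, 0, 0, 0, 1, 1, 1, 1]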
1.2 Audio segmentation
Each downloaded recording contains several words, but we want one audio file per word, which turns the task into a simple binary classification problem. The segmentation procedure is as follows:
- Load each recording's waveform (the sequence of sample amplitudes over time); the labels (0 / 1) of its parts are encoded in the file name.
pre_len = 0  # running count of segments saved so far (see the note below)
os.makedirs('./dataset/waves_yesno_seg', exist_ok=True)  # output folder for the single-word clips
for fi, filename in enumerate(os.listdir('./dataset/waves_yesno')):
    waveform, sr = torchaudio.load(f'./dataset/waves_yesno/{filename}')
- Set a threshold; in this experiment the threshold is the sum of the mean and the standard deviation of the waveform.
    threshold = waveform[0].mean() + waveform[0].std()
- Scan the waveform and record the sample index at which each word starts and ends, using the following rules:
- When the amplitude rises above the threshold, record the current sample as a start point.
- When the amplitude stays below the threshold for many consecutive samples (here 1000 samples), record the current sample as an end point.
    start_times = []
    end_times = []
    segment_start = None
    count = 0
    for i in range(len(waveform[0])):
        if abs(waveform[0][i].item()) > threshold:
            count = 0
            if segment_start is None:
                segment_start = i
        elif abs(waveform[0][i].item()) < threshold and segment_start is not None:
            if count < 1000:
                count = count + 1
                continue
            else:
                start_times.append(segment_start)
                end_times.append(i)
                segment_start = None
                count = 0
- Split the waveform according to the recorded start and end points.
    segments = []
    for i in range(len(start_times)):
        segment = waveform[0][start_times[i]:end_times[i]]
        segments.append(segment)
    labels = filename.split('.')[0].split('_')
    # Any extra segments beyond the eight labelled words get their index as a placeholder label
    for i in range(len(labels), len(segments)):
        labels.append(i)
- Save the segments; each file is named <segment index>_<label>.wav.
    fi = pre_len
    for i, segment in enumerate(segments):
        torchaudio.save(f'./dataset/waves_yesno_seg/{fi}_{labels[i]}.wav', segment.view(1, segment.size()[0]), sample_rate=8000)
        fi = fi + 1
    pre_len = pre_len + len(segments)
Note: For a few recordings the procedure above does not split the file into exactly 8 segments, so the segment index cannot simply be computed from the file index; instead we keep a running count of how many segments have already been saved. The variable pre_len stores this running count.
1.3 Padding audio to a common length
Since the size of the network's input layer is fixed in advance and cannot change during training, all inputs must have the same length. The segmented clips inevitably differ in length, so PyTorch's pad_sequence is used to pad them to a common length.
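As a quick illustration of what pad_sequence does (a toy example with made-up tensors, not part of the pipeline), shorter sequences are zero-padded on the right to the length of the longest one:
a = torch.tensor([1., 2., 3.])
b = torch.tensor([4., 5.])
c = torch.tensor([6.])
padded = pad_sequence([a, b, c], batch_first=True)
print(padded)
# tensor([[1., 2., 3.],
#         [4., 5., 0.],
#         [6., 0., 0.]])
print(padded.shape)  # torch.Size([3, 3])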
waveforms = []
labels = []
for fi, filename in enumerate(os.listdir('./dataset/waves_yesno_seg/')):
    waveform, sr = torchaudio.load(f'./dataset/waves_yesno_seg/{filename}')
    waveforms.append(waveform[0].view(-1))
    labels.append(filename.split('.')[0].split('_')[-1])
waveforms = pad_sequence(waveforms, batch_first=True)
# After padding, every waveform has the same length (13283 samples in this run)
os.makedirs('./dataset/waves_yesno_pad', exist_ok=True)
for i, waveform in enumerate(waveforms):
    torchaudio.save(f'./dataset/waves_yesno_pad/{i}_{labels[i]}.wav', waveform.view(1, -1), 8000)
1.4 Custom dataset class
The audio is converted into mel spectrograms so that it can be fed into a convolutional neural network for training.
# Custom dataset class
class AudioDataset(Dataset):
    def __init__(self, file_list, label_list):
        self.file_list = file_list
        self.label_list = label_list

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        file_path = self.file_list[idx]
        label = self.label_list[idx]
        # Load the audio file with librosa and convert it to a mel spectrogram
        y, sr = librosa.load(file_path, sr=16000)
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=1024, hop_length=512, n_mels=80)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        # Add a channel dimension so the shape becomes (1, n_mels, time)
        mel_spec_db = np.expand_dims(mel_spec_db, axis=0)
        # Return the mel spectrogram and the label
        return mel_spec_db, label
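With the dataset class in place, one item can be inspected to see the feature shape the network has to accept. This is only a quick check that assumes the padded files from step 1.3 exist; the probe_* names are used nowhere else:
probe_files = ['./dataset/waves_yesno_pad/' + f for f in os.listdir('./dataset/waves_yesno_pad/')]
probe_labels = [int(f.split('.')[0].split('_')[-1]) for f in os.listdir('./dataset/waves_yesno_pad/')]
probe_set = AudioDataset(probe_files, probe_labels)
spec, label = probe_set[0]
print(spec.shape, label)   # roughly (1, 80, 52): 1 channel, 80 mel bands, ~52 time frames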
2. Building the convolutional neural network
class AudioCNN(nn.Module):
    def __init__(self):
        super(AudioCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=(3,3), stride=(1,1), padding=(1,1))
        self.bn1 = nn.BatchNorm2d(32)
        self.relu1 = nn.ReLU(inplace=True)
        self.pool1 = nn.MaxPool2d(kernel_size=(2,2), stride=(2,2))
        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3,3), stride=(1,1), padding=(1,1))
        self.bn2 = nn.BatchNorm2d(64)
        self.relu2 = nn.ReLU(inplace=True)
        self.pool2 = nn.MaxPool2d(kernel_size=(2,2), stride=(2,2))
        self.fc1 = nn.Linear(16640, 128)   # 64 channels * 20 * 13 for an 80x52 input spectrogram
        self.bn3 = nn.BatchNorm1d(128)
        self.relu3 = nn.ReLU(inplace=True)
        self.fc2 = nn.Linear(128, 2)       # two classes: yes / no

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.pool2(x)
        x = x.view(x.size(0), -1)   # flatten to (batch, 16640)
        x = self.fc1(x)
        x = self.bn3(x)
        x = self.relu3(x)
        x = self.fc2(x)
        return x
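Where the 16640 of fc1 comes from: the 80x52 input spectrogram is halved twice by the pooling layers, leaving 64 feature maps of size 20x13, and 64 * 20 * 13 = 16640. A dummy forward pass (a shape check only, not part of the training code) confirms this:
dummy = torch.randn(4, 1, 80, 52)   # (batch, channels, n_mels, frames)
print(AudioCNN()(dummy).shape)      # torch.Size([4, 2]): one score per class for each item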
3. Training and testing
- Build the training and test sets: the first 300 clips are used for training and the remaining clips for testing.
train_files = ['./dataset/waves_yesno_pad/' + filename for filename in os.listdir('./dataset/waves_yesno_pad/')[:300]]
train_labels = [int(filename.split('.')[0].split('_')[-1]) for filename in os.listdir('./dataset/waves_yesno_pad/')[:300]]
test_files = ['./dataset/waves_yesno_pad/' + filename for filename in os.listdir('./dataset/waves_yesno_pad/')[300:]]
test_labels = [int(filename.split('.')[0].split('_')[-1]) for filename in os.listdir('./dataset/waves_yesno_pad/')[300:]]
train_dataset = AudioDataset(train_files, train_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataset = AudioDataset(test_files, test_labels)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
- Instantiate the network, and define the loss function (cross-entropy) and the optimizer (Adam).
model = AudioCNN()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
- Move the model to the GPU if one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
- Training
def train(epoch):
    model.train()  # make sure BatchNorm uses batch statistics during training
    for idx, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        if idx % 10 == 0:
            print('epoch: {}, loss: {}'.format(epoch, loss.item()))
            os.makedirs('./model', exist_ok=True)
            torch.save(model.state_dict(), './model/yesno_net.pkl')
            torch.save(optimizer.state_dict(), './model/yesno_optimizer.pkl')
- Testing
def test():
    model.eval()  # use the running statistics in the BatchNorm layers
    loss_list = []
    sample_num = 0
    acc_num = 0
    for idx, (inputs, labels) in enumerate(test_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            outputs = model(inputs)
            cur_loss = criterion(outputs, labels).item()
            # Each row of outputs holds two class scores (logits); the index of the larger one is the prediction
            pred = outputs.argmax(dim=1, keepdim=True)
            # Count the correct predictions
            acc_num = acc_num + pred.eq(labels.view_as(pred)).sum().item()
            # Count the number of evaluated samples
            sample_num = sample_num + labels.size()[0]
            loss_list.append(cur_loss)
    print('average loss: {}, average accuracy: {}'.format(np.mean(loss_list), acc_num / sample_num))
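Neither function is called above, so a short driver is needed to actually run the experiment. A minimal sketch (the 10 epochs are an arbitrary choice, not taken from the original run):
for epoch in range(10):   # number of epochs is an assumption; adjust as needed
    train(epoch)
test()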
4. Results
average loss: 0.07645493745803833, average accuracy: 0.9671052631578947