1D-CNNでSpeaker Recognition
1D-CNNを使って、音声データから発話者の識別を行います。
下のデータセットを使います。
Speaker Recognition Dataset|Kaggle Dataset
作成のきっかけKerasのSpeaker Recognitionです。こちらは、FFT(高速フーリエ変換)をしていますが、ここでは変換なしで行います。(Normalizeすらしてなかったので、後でします。)
PyTorchのコード
import time import os import glob from pathlib import Path import numpy as np import matplotlib.pyplot as plt import librosa import librosa.display %matplotlib inline import torch import torch.nn as nn import torch.optim as optim from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score
ラベル
speaker_label = {'Jens_Stoltenberg' : 0,
'Benjamin_Netanyau': 1,
'Julia_Gillard' : 2,
'Magaret_Tarcher' : 3,
'Nelson_Mandela' : 4,
}
データの読み込み
def Read_file_label(path_name, speaker_label):
file_label_list = []
file_list = []
label_list = []
for label in speaker_label.keys():
n_file = len(glob.glob(os.path.join(path_name, list(speaker_label.keys())[0])+"/*.wav"))
for i in range(n_file):
file_label_list.append((os.path.join(path_name, label, f"{i}.wav"), speaker_label[label]))
file_list.append(os.path.join(path_name, label, f"{i}.wav"))
label_list.append(speaker_label[label])
return file_label_list, file_list, label_list
class Audio_Datasets(torch.utils.data.Dataset):
def __init__(self, file_label_list, sr):
self.path = file_label_list
self.sr = sr
def __len__(self):
return len(self.path)
def __getitem__(self, idx):
data, _ = librosa.load(self.path[idx][0], sr=self.sr)
#data = data[:self.sr]
data = data.reshape(1, self.sr)
label = np.array(self.path[idx][1])
return data, label
file_label_list, file_list, label_list = Read_file_label(path_name, speaker_label)
X_train, X_valid, y_train, y_valid = train_test_split(file_list, label_list, test_size=0.2, random_state=31)
train = [(X_train[idx], y_train[idx]) for idx in range(len(X_train))]
valid = [(X_valid[idx], y_valid[idx]) for idx in range(len(X_valid))]
train_loader = torch.utils.data.DataLoader(Audio_Datasets(train, sr), batch_size=128, shuffle = True)
valid_loader = torch.utils.data.DataLoader(Audio_Datasets(valid, sr), batch_size=128, shuffle = True)
ネットワーク
ネットワークの構成は適当なので、参考程度に…
後で、しっかりしたものにしたい…
class Net1D(nn.Module):
def __init__(self):
super(Net1D,self).__init__()
self.conv1 = nn.Sequential(nn.Conv1d(1, 8, kernel_size=3, stride=1),
nn.BatchNorm1d(8),
nn.ReLU(inplace=True),
nn.MaxPool1d(kernel_size=3, stride=2),
)
self.conv2 = nn.Sequential(nn.Conv1d(8, 32, kernel_size=5, stride=1),
nn.BatchNorm1d(32),
nn.ReLU(inplace=True),
nn.MaxPool1d(kernel_size=5, stride=2),
)
self.conv3 = nn.Sequential(nn.Conv1d(32, 64, kernel_size=7, stride=1),
nn.BatchNorm1d(64),
nn.ReLU(inplace=True),
nn.MaxPool1d(kernel_size=7, stride=2),
)
self.conv4 = nn.Sequential(nn.Conv1d(64, 128, kernel_size=9, stride=1),
nn.BatchNorm1d(128),
nn.ReLU(inplace=True),
nn.MaxPool1d(kernel_size=9, stride=2),
)
self.dense = nn.Sequential(nn.Linear(126464, 512),
nn.ReLU(inplace=True),
nn.Dropout(0.5),
nn.Linear(512,128),
nn.ReLU(inplace=True),
nn.Dropout(0.5),
nn.Linear(128, 5),
)
def forward(self,x):
x = self.conv1(x)
x = self.conv2(x)
x = self.conv3(x)
x = self.conv4(x)
x = x.view(x.size(0),-1)
x = self.dense(x)
return x
def check_size(self, x):
x = self.conv1(x)
x = self.conv2(x)
x = self.conv3(x)
x = self.conv4(x)
x = x.view(x.size(0),-1)
return x
LossとAccuracy

コード全体
GitHub - betashort/Speaker_Recognition
...
参考

Keras documentation: Speaker Recognition
...
Just a moment...
