Baby Cry Recognition with PaddleSpeech in Python
I. Baby Cry Recognition Based on PaddleSpeech
1. Project Background
For an infant, crying is a means of communication, a very limited one, yet analogous to how adults converse. It is also a biological alarm, conveying the infant's physiological and psychological needs to the outside world. The information carried in the sound waves of a cry can be used to assess a baby's physical condition and to detect illness. Effectively recognizing cries and successfully "translating" them into "adult language" so that we can understand their meaning therefore has significant practical value.
2. Data Description
- 1. The training set contains six classes of cries; noise has been added artificially:
A: awake
B: diaper (needs changing)
C: hug (wants to be held)
D: hungry
E: sleepy
F: uncomfortable
- 2. The noise data come from the NOISEX-92 standard database.
II. PaddleSpeech Environment Setup
# Environment setup: install paddlespeech and paddleaudio
!python -m pip install -q -U pip --user
!pip install paddlespeech paddleaudio -U -q
!pip list|grep paddle
import warnings
warnings.filterwarnings("ignore")
import IPython
import numpy as np
import matplotlib.pyplot as plt
import paddle
%matplotlib inline
III. Data Preprocessing
1. Unzip the Data
# !unzip -qoa data/data41960/dddd.zip
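After extraction, a quick look at the directory layout helps confirm that all six classes are in place. This is a sketch assuming the archive unpacks to train/&lt;label&gt;/*.wav, matching the classes listed above:

# Count the wav files per class directory (layout assumed: train/<label>/*.wav)
import glob
import os

for label_dir in sorted(glob.glob('train/*')):
    n_wavs = len(glob.glob(os.path.join(label_dir, '*.wav')))
    print(f'{os.path.basename(label_dir)}: {n_wavs} files')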
2. Inspect the Audio Files
from paddleaudio import load
data, sr = load(file='train/awake/awake_0.wav', mono=True, dtype='float32')  # mono, float32 samples
print('wav shape: {}'.format(data.shape))
print('sample rate: {}'.format(sr))
# Plot the waveform
plt.figure()
plt.plot(data)
plt.show()
from paddleaudio import load
data, sr = load(file='train/diaper/diaper_0.wav', mono=True, dtype='float32')  # mono, float32 samples
print('wav shape: {}'.format(data.shape))
print('sample rate: {}'.format(sr))
# Plot the waveform
plt.figure()
plt.plot(data)
plt.show()
!paddlespeech cls --input train/awake/awake_0.wav
!paddlespeech help
3. Audio Length Normalization
# Get the duration of a wav file in seconds
import contextlib
import wave

def get_sound_len(file_path):
    with contextlib.closing(wave.open(file_path, 'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        wav_length = frames / float(rate)
    return wav_length
# Enumerate all wav files under train/
import glob
sound_files = glob.glob('train/*/*.wav')
print(sound_files[0])
print(len(sound_files))
# Find the longest and shortest clips
sounds_len = []
for sound in sound_files:
    sounds_len.append(get_sound_len(sound))
print("max duration:", max(sounds_len), "s")
print("min duration:", min(sounds_len), "s")
!cp train/hungry/hungry_0.wav ~/
!pip install pydub -q
# Inspect basic audio properties
import soundfile as sf

data, samplerate = sf.read('hungry_0.wav')
channels = len(data.shape)
length_s = len(data) / float(samplerate)
print(f"channels: {channels}")
print(f"length_s: {length_s}")
print(f"samplerate: {samplerate}")
# Pad to 34 s by repeating the clip, then trim
from pydub import AudioSegment

audio = AudioSegment.from_wav('hungry_0.wav')
print(audio.duration_seconds)
i = 1
padded = audio
while padded.duration_seconds * 1000 < 34000:
    i = i + 1
    padded = audio * i
padded[0:34000].set_frame_rate(16000).export('padded-file.wav', format='wav')
# Re-check the padded file
data, samplerate = sf.read('padded-file.wav')
channels = len(data.shape)
length_s = len(data) / float(samplerate)
print(f"channels: {channels}")
print(f"length_s: {length_s}")
print(f"samplerate: {samplerate}")
# If a clip is shorter than the target, repeat it until it exceeds 34 s,
# then trim the result to exactly 34 s
from pydub import AudioSegment

def convert_sound_len(filename):
    audio = AudioSegment.from_wav(filename)
    i = 1
    padded = audio * i
    while padded.duration_seconds * 1000 < 34000:
        i = i + 1
        padded = audio * i
    padded[0:34000].set_frame_rate(16000).export(filename, format='wav')
# Normalize every audio file to the fixed length
for sound in sound_files:
    convert_sound_len(sound)
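As a sanity check, one can verify that every clip now matches the 34 s target. This is a sketch reusing get_sound_len, with a small tolerance for frame rounding:

# Every clip should now be ~34 s after padding and trimming
bad = [s for s in sound_files if abs(get_sound_len(s) - 34.0) > 0.1]
print(f'{len(bad)} files deviate from the 34 s target')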
4. Custom Dataset
import os
from paddlespeech.audio.datasets.dataset import AudioClassificationDataset

class CustomDataset(AudioClassificationDataset):
    # List all the class labels
    label_list = [
        'awake',
        'diaper',
        'hug',
        'hungry',
        'sleepy',
        'uncomfortable'
    ]
    train_data_dir = './train/'

    def __init__(self, **kwargs):
        files, labels = self._get_data()
        super(CustomDataset, self).__init__(
            files=files, labels=labels, feat_type='raw', **kwargs)

    # Return the audio files and their label indices
    def _get_data(self):
        '''
        This method offers information on wave files and labels.
        '''
        files = []
        labels = []
        for i in range(len(self.label_list)):
            single_class_path = os.path.join(self.train_data_dir, self.label_list[i])
            for sound in os.listdir(single_class_path):
                if 'wav' in sound:
                    sound = os.path.join(single_class_path, sound)
                    files.append(sound)
                    labels.append(i)
        return files, labels
# Define the dataloader
import paddle
from paddlespeech.audio.features import LogMelSpectrogram

# The feature config should be aligned with the pretrained model
sample_rate = 16000
feat_conf = {
    'sr': sample_rate,
    'n_fft': 1024,
    'hop_length': 320,
    'window': 'hann',
    'win_length': 1024,
    'f_min': 50.0,
    'f_max': 14000.0,
    'n_mels': 64,
}
train_ds = CustomDataset(sample_rate=sample_rate)
feature_extractor = LogMelSpectrogram(**feat_conf)
train_sampler = paddle.io.DistributedBatchSampler(
    train_ds, batch_size=64, shuffle=True, drop_last=False)
train_loader = paddle.io.DataLoader(
    train_ds,
    batch_sampler=train_sampler,
    return_list=True,
    use_buffer_reader=True)
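Before training, it helps to pull a single batch and confirm the tensor shapes the model will see. This is a sketch using the loader and extractor defined above:

# Fetch one batch: waveforms should be [batch_size, num_samples], labels [batch_size]
waveforms, labels = next(iter(train_loader))
print('waveforms:', waveforms.shape)
print('labels:', labels.shape)
feats = feature_extractor(waveforms)
print('log-mel feats:', feats.shape)  # expected [B, n_mels, T]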
IV. Model Training
1. Select a Pretrained Model
Use cnn14 as the backbone to extract audio features:
from paddlespeech.cls.models import cnn14

backbone = cnn14(pretrained=True, extract_embedding=True)
2. Build the Classification Model
SoundClassifier takes cnn14 as the backbone model and builds a downstream classification network on top of it:
import paddle.nn as nn

class SoundClassifier(nn.Layer):
    def __init__(self, backbone, num_class, dropout=0.1):
        super().__init__()
        self.backbone = backbone
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(self.backbone.emb_size, num_class)

    def forward(self, x):
        x = x.unsqueeze(1)
        x = self.backbone(x)
        x = self.dropout(x)
        logits = self.fc(x)
        return logits

model = SoundClassifier(backbone, num_class=len(train_ds.label_list))
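Optionally, one can check how many parameters the new head adds, or freeze the backbone so that only the head is finetuned. This is a sketch, not part of the original training recipe:

# Count trainable parameters of the full model and of the new head
total = sum(int(np.prod(p.shape)) for p in model.parameters())
head = sum(int(np.prod(p.shape)) for p in model.fc.parameters())
print(f'total params: {total}, classifier head params: {head}')
# for p in model.backbone.parameters():
#     p.stop_gradient = True  # uncomment to finetune the head only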
3. Finetune
# Define the optimizer and loss
optimizer = paddle.optimizer.Adam(learning_rate=1e-4, parameters=model.parameters())
criterion = paddle.nn.loss.CrossEntropyLoss()
from paddleaudio.utils import logger

epochs = 20
steps_per_epoch = len(train_loader)
log_freq = 10
eval_freq = 10

for epoch in range(1, epochs + 1):
    model.train()
    avg_loss = 0
    num_corrects = 0
    num_samples = 0
    for batch_idx, batch in enumerate(train_loader):
        waveforms, labels = batch
        feats = feature_extractor(waveforms)
        feats = paddle.transpose(feats, [0, 2, 1])  # [B, N, T] -> [B, T, N]
        logits = model(feats)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        if isinstance(optimizer._learning_rate,
                      paddle.optimizer.lr.LRScheduler):
            optimizer._learning_rate.step()
        optimizer.clear_grad()

        # Calculate loss
        avg_loss += loss.numpy()[0]

        # Calculate metrics
        preds = paddle.argmax(logits, axis=1)
        num_corrects += (preds == labels).numpy().sum()
        num_samples += feats.shape[0]

        if (batch_idx + 1) % log_freq == 0:
            lr = optimizer.get_lr()
            avg_loss /= log_freq
            avg_acc = num_corrects / num_samples
            print_msg = 'Epoch={}/{}, Step={}/{}'.format(
                epoch, epochs, batch_idx + 1, steps_per_epoch)
            print_msg += ' loss={:.4f}'.format(avg_loss)
            print_msg += ' acc={:.4f}'.format(avg_acc)
            print_msg += ' lr={:.6f}'.format(lr)
            logger.train(print_msg)
            avg_loss = 0
            num_corrects = 0
            num_samples = 0
[2022-08-24 02:20:49,381] [ TRAIN] - Epoch=17/20, Step=10/15 loss=1.3319 acc=0.4875 lr=0.000100
[2022-08-24 02:21:08,107] [ TRAIN] - Epoch=18/20, Step=10/15 loss=1.3222 acc=0.4719 lr=0.000100
[2022-08-24 02:21:26,884] [ TRAIN] - Epoch=19/20, Step=10/15 loss=1.2539 acc=0.5125 lr=0.000100
[2022-08-24 02:21:45,579] [ TRAIN] - Epoch=20/20, Step=10/15 loss=1.2021 acc=0.5281 lr=0.000100
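After training, the weights can be persisted so inference can reload them later. A minimal sketch; the filenames are illustrative:

# Save model and optimizer state for later reuse
paddle.save(model.state_dict(), 'sound_classifier.pdparams')
paddle.save(optimizer.state_dict(), 'sound_classifier.pdopt')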
V. Model Prediction
top_k = 3
wav_file = 'test/test_0.wav'
sr = 16000
n_fft = 1024
win_length = 1024
hop_length = 320
f_min = 50.0
f_max = 14000.0  # keep aligned with the training feature config above
waveform, sr = load(wav_file, sr=sr)
feature_extractor = LogMelSpectrogram(
    sr=sr,
    n_fft=n_fft,
    hop_length=hop_length,
    win_length=win_length,
    window='hann',
    f_min=f_min,
    f_max=f_max,
    n_mels=64)
feats = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0))
feats = paddle.transpose(feats, [0, 2, 1])  # [B, N, T] -> [B, T, N]
model.eval()  # disable dropout for inference
logits = model(feats)
probs = nn.functional.softmax(logits, axis=1).numpy()
sorted_indices = probs[0].argsort()
msg = f'[{wav_file}]\n'
for idx in sorted_indices[-1:-top_k - 1:-1]:
    msg += f'{train_ds.label_list[idx]}: {probs[0][idx]:.5f}\n'
print(msg)
[test/test_0.wav]
diaper: 0.50155
sleepy: 0.41397
hug: 0.05912
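For repeated use, the steps above can be wrapped into a small helper. A sketch reusing the feature_extractor, model, and label list defined earlier; the path is illustrative:

# Predict the top-k cry classes for a single wav file
def predict(wav_file, top_k=3):
    waveform, _ = load(wav_file, sr=16000)
    feats = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0))
    feats = paddle.transpose(feats, [0, 2, 1])
    probs = nn.functional.softmax(model(feats), axis=1).numpy()[0]
    for idx in probs.argsort()[-1:-top_k - 1:-1]:
        print(f'{train_ds.label_list[idx]}: {probs[idx]:.5f}')

predict('test/test_0.wav')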
VI. Notes
- 1. For a custom dataset, follow the format described in the documentation;
- 2. Normalize the audio properties (e.g. clip duration and sample rate) across the whole dataset; a resampling sketch follows below.
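For point 2, a minimal resample-and-downmix sketch (the file paths are illustrative):

# Resample to 16 kHz and downmix to mono before training
import librosa
import soundfile as sf

y, sr = librosa.load('some_clip.wav', sr=16000, mono=True)
sf.write('some_clip_16k.wav', y, 16000)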
This concludes the detailed walkthrough of baby cry recognition with PaddleSpeech in Python. For more material on the topic, see the other related articles on 腳本之家.