
Python PaddleSpeech implementation of baby cry recognition

I. Infant Cry Recognition Based on PaddleSpeech

1. Project background

For an infant, crying is a form of communication: a limited but language-like way of conveying needs, and a biological alarm that signals the infant's physical and psychological state to the outside world. Based on the information carried by the sound waves of a cry, the infant's physical condition can be assessed and diseases can be detected. It is therefore of great practical value to recognize cries effectively and to "translate" infant cries into "adult language" so that their meaning can be understood.

2. Data description

  • 1. The training dataset contains six types of cries, to which noise has been added manually; the expected directory layout (train/<label>/*.wav) is sketched after this list.

A: awake

B: diaper (diaper changing)

C: hug

D: hungry

E: sleepy

F: uncomfortable

  • 2. The noise data comes from the NOISEX-92 standard database.
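
The class folders are expected to follow the layout train/<label>/*.wav, which the preprocessing and dataset code below relies on. As a quick check (a minimal sketch, assuming the archive has already been unpacked into ./train/), the files per class can be counted like this:

# Count the wav files in each class folder (assumes the layout train/<label>/*.wav)
import glob
import os
label_list = ['awake', 'diaper', 'hug', 'hungry', 'sleepy', 'uncomfortable']
for label in label_list:
    wav_files = glob.glob(os.path.join('train', label, '*.wav'))
    print(f'{label}: {len(wav_files)} files')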

II. PaddleSpeech environment preparation

# Environment preparation: install paddlespeech and paddleaudio
!python -m pip install -q -U pip --user
!pip install paddlespeech paddleaudio -U -q
!pip list|grep paddle
import warnings
("ignore")
import IPython
import numpy as np
import matplotlib.pyplot as plt
import paddle
%matplotlib inline

III. Data pre-processing

1. Data decompression

# !unzip -qoa data/data41960/

2. View sound files

from paddleaudio import load
data, sr = load(file='train/awake/awake_0.wav', mono=True, dtype='float32')  # Single channel, float32 audio sample points
print('wav shape: {}'.format(data.shape))
print('sample rate: {}'.format(sr))
# Show audio waveforms
plt.figure()
plt.plot(data)
plt.show()
from paddleaudio import load
data, sr = load(file='train/diaper/diaper_0.wav', mono=True, dtype='float32')  # Single channel, float32 audio sample points
print('wav shape: {}'.format(data.shape))
print('sample rate: {}'.format(sr))
# Show audio waveforms
plt.figure()
plt.plot(data)
plt.show()
!paddlespeech cls --input train/awake/awake_0.wav
!paddlespeech help
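
Since IPython is imported during environment preparation, a sample can also be played back directly in the notebook. A minimal sketch, reusing the data and sr loaded above:

# Play the loaded waveform inline in the notebook
import IPython.display as ipd
ipd.Audio(data=data, rate=sr)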

3. Audio file length processing

# Check the audio length
import contextlib
import wave
def get_sound_len(file_path):
    with contextlib.closing(wave.open(file_path, 'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        wav_length = frames / float(rate)
    return wav_length
# Collect all wav files
import glob
sound_files = glob.glob('train/*/*.wav')
print(sound_files[0])
print(len(sound_files))
# Counting the longest and shortest audio
sounds_len=[]
for sound in sound_files:
    sounds_len.append(get_sound_len(sound))
print("Maximum length of audio:",max(sounds_len),"Seconds.")
print("Minimum audio length:",min(sounds_len),"Seconds.")
!cp train/hungry/hungry_0.wav ~/
!pip install pydub -q
# View audio information
import math
import soundfile as sf
import numpy as np
import librosa
data, samplerate = sf.read('hungry_0.wav')
channels = len(data.shape)  # 1 for mono audio
length_s = len(data)/float(samplerate)
format_rate=16000
print(f"channels: {channels}")
print(f"length_s: {length_s}")
print(f"samplerate: {samplerate}")
# Harmonize to 34 s by repetition and truncation
from pydub import AudioSegment
audio = AudioSegment.from_wav('hungry_0.wav')
print(str(audio.duration_seconds))
i = 1
padded = audio
while padded.duration_seconds * 1000 < 34000:
    padded = audio * i
    i = i + 1
padded[0:34000].set_frame_rate(16000).export('hungry_0_pad.wav', format='wav')  # output filename assumed (elided in the original)
import math
import soundfile as sf
import numpy as np
import librosa
data, samplerate = sf.read('hungry_0_pad.wav')  # same assumed filename as the export above
channels = len(data.shape)  # 1 for mono audio
length_s = len(data)/float(samplerate)
format_rate=16000
print(f"channels: {channels}")
print(f"length_s: {length_s}")
print(f"samplerate: {samplerate}")
# Define a function that repeats the audio until it is at least 34 s long, then truncates it to exactly 34 s at 16 kHz
from pydub import AudioSegment
def convert_sound_len(filename):
    audio = AudioSegment.from_wav(filename)
    i = 1
    padded = audio*i
    while padded.duration_seconds * 1000 < 34000:
        i = i + 1
        padded = audio * i
    padded[0:34000].set_frame_rate(16000).export(filename, format='wav')
# Harmonize all audio to fixed length
for sound in sound_files:
    convert_sound_len(sound)
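
To confirm the harmonization worked, the length statistics can be recomputed with the get_sound_len helper defined above; every file should now report roughly 34 seconds:

# Re-check the length statistics after padding/truncation
sounds_len = [get_sound_len(sound) for sound in sound_files]
print("Maximum length of audio:", max(sounds_len), "Seconds.")
print("Minimum audio length:", min(sounds_len), "Seconds.")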

4. Custom dataset

import os
from paddleaudio.datasets.dataset import AudioClassificationDataset
class CustomDataset(AudioClassificationDataset):
    # List all the class labels
    label_list = [
        'awake',
        'diaper',
        'hug',
        'hungry',
        'sleepy',
        'uncomfortable'
    ]
    train_data_dir='./train/'
    def __init__(self, **kwargs):
        files, labels = self._get_data()
        super(CustomDataset, self).__init__(
            files=files, labels=labels, feat_type='raw', **kwargs)
    # Return audio file, label value
    def _get_data(self):
        '''
        This method offer information of wave files and labels.
        '''
        files = []
        labels = []
        for i in range(len(self.label_list)):
            single_class_path = os.path.join(self.train_data_dir, self.label_list[i])
            for sound in os.listdir(single_class_path):
                # print(sound)
                if 'wav' in sound:
                    sound = os.path.join(single_class_path, sound)
                    files.append(sound)
                    labels.append(i)
        return files, labels
# Define dataloader
import paddle
from paddleaudio.features import LogMelSpectrogram
# Feature config should be aligned with the pretrained model
sample_rate = 16000
feat_conf = {
  'sr': sample_rate,
  'n_fft': 1024,
  'hop_length': 320,
  'window': 'hann',
  'win_length': 1024,
  'f_min': 50.0,
  'f_max': 14000.0,
  'n_mels': 64,
}
train_ds = CustomDataset(sample_rate=sample_rate)
feature_extractor = LogMelSpectrogram(**feat_conf)
train_sampler = paddle.io.BatchSampler(
    train_ds, batch_size=64, shuffle=True, drop_last=False)
train_loader = paddle.io.DataLoader(
    train_ds,
    batch_sampler=train_sampler,
    return_list=True,
    use_buffer_reader=True)
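
Before training, it can help to pull a single batch from the dataloader and pass it through the feature extractor to confirm the tensor shapes. A quick sanity check, using the objects defined above (the shape comments follow the [B, N, T] convention used in the training loop):

# Inspect one batch: raw waveforms and extracted log-mel features
waveforms, labels = next(iter(train_loader))
print('waveforms:', waveforms.shape)        # [batch_size, num_samples]
feats = feature_extractor(waveforms)
print('features:', feats.shape)             # [batch_size, n_mels, num_frames]
feats = paddle.transpose(feats, [0, 2, 1])  # [B, N, T] -> [B, T, N]
print('transposed:', feats.shape)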

IV. Model training

1. Selection of pre-training models

The pretrained cnn14 model is selected as the backbone for extracting audio features:

from paddlespeech.cls.models import cnn14
backbone = cnn14(pretrained=True, extract_embedding=True)

2. Constructing a classification model

SoundClassifier takes cnn14 as its backbone and adds a downstream classification head:

import paddle.nn as nn
class SoundClassifier(nn.Layer):
    def __init__(self, backbone, num_class, dropout=0.1):
        super().__init__()
        self.backbone = backbone
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(self.backbone.emb_size, num_class)
    def forward(self, x):
        # x: [batch, time, n_mels] log-mel features; add a channel dim for the CNN backbone
        x = x.unsqueeze(1)
        x = self.backbone(x)
        x = self.dropout(x)
        logits = self.fc(x)
        return logits
model = SoundClassifier(backbone, num_class=len(train_ds.label_list))

# Define the optimizer and loss
optimizer = paddle.optimizer.Adam(learning_rate=1e-4, parameters=model.parameters())
criterion = paddle.nn.loss.CrossEntropyLoss()
from paddleaudio.utils import logger
epochs = 20
steps_per_epoch = len(train_loader)
log_freq = 10
eval_freq = 10
for epoch in range(1, epochs + 1):
    model.train()
    avg_loss = 0
    num_corrects = 0
    num_samples = 0
    for batch_idx, batch in enumerate(train_loader):
        waveforms, labels = batch
        feats = feature_extractor(waveforms)
        feats = paddle.transpose(feats, [0, 2, 1])  # [B, N, T] -> [B, T, N]
        logits = model(feats)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        if isinstance(optimizer._learning_rate,
                      paddle.optimizer.lr.LRScheduler):
            optimizer._learning_rate.step()
        optimizer.clear_grad()
        # Calculate loss
        avg_loss += loss.numpy()[0]
        # Calculate metrics
        preds = paddle.argmax(logits, axis=1)
        num_corrects += (preds == labels).numpy().sum()
        num_samples += feats.shape[0]
        if (batch_idx + 1) % log_freq == 0:
            lr = optimizer.get_lr()
            avg_loss /= log_freq
            avg_acc = num_corrects / num_samples
            print_msg = 'Epoch={}/{}, Step={}/{}'.format(
                epoch, epochs, batch_idx + 1, steps_per_epoch)
            print_msg += ' loss={:.4f}'.format(avg_loss)
            print_msg += ' acc={:.4f}'.format(avg_acc)
            print_msg += ' lr={:.6f}'.format(lr)
            logger.train(print_msg)
            avg_loss = 0
            num_corrects = 0
            num_samples = 0

[2022-08-24 02:20:49,381] [   TRAIN] - Epoch=17/20, Step=10/15 loss=1.3319 acc=0.4875 lr=0.000100
[2022-08-24 02:21:08,107] [   TRAIN] - Epoch=18/20, Step=10/15 loss=1.3222 acc=0.4719 lr=0.000100
[2022-08-24 02:21:26,884] [   TRAIN] - Epoch=19/20, Step=10/15 loss=1.2539 acc=0.5125 lr=0.000100
[2022-08-24 02:21:45,579] [   TRAIN] - Epoch=20/20, Step=10/15 loss=1.2021 acc=0.5281 lr=0.000100
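
The trained model stays in memory for the prediction step below. To persist it between sessions, a minimal sketch using paddle.save (the checkpoint directory is a placeholder, not part of the original walkthrough):

# Save the trained weights and optimizer state (paths are placeholders)
import os
ckpt_dir = './checkpoint'
os.makedirs(ckpt_dir, exist_ok=True)
paddle.save(model.state_dict(), os.path.join(ckpt_dir, 'model.pdparams'))
paddle.save(optimizer.state_dict(), os.path.join(ckpt_dir, 'model.pdopt'))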

V. Model prediction

top_k = 3
wav_file = 'test/test_0.wav'
n_fft = 1024
win_length = 1024
hop_length = 320
f_min=50.0
f_max=16000.0
waveform, sr = load(wav_file, sr=sr)
feature_extractor = LogMelSpectrogram(
    sr=sr, 
    n_fft=n_fft, 
    hop_length=hop_length, 
    win_length=win_length, 
    window='hann', 
    f_min=f_min, 
    f_max=f_max, 
    n_mels=64)
import paddle.nn.functional as F
feats = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0))
feats = paddle.transpose(feats, [0, 2, 1])  # [B, N, T] -> [B, T, N]
logits = model(feats)
probs = F.softmax(logits, axis=1).numpy()
sorted_indices = probs[0].argsort()
msg = f'[{wav_file}]\n'
for idx in sorted_indices[-1:-top_k-1:-1]:
    msg += f'{train_ds.label_list[idx]}: {probs[0][idx]:.5f}\n'
print(msg)    

 [test/test_0.wav]
diaper: 0.50155
sleepy: 0.41397
hug: 0.05912

VI. Precautions

  • 1. When customizing a dataset, follow the format described in the PaddleSpeech documentation;
  • 2. Harmonize the audio dimensions (e.g. audio length and sampling rate); a resampling sketch follows this list.
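
For point 2, resampling to a common sampling rate can be done with librosa and soundfile. A minimal sketch (file names are placeholders; the pipeline above already fixes the length to 34 s and the rate to 16 kHz with pydub):

# Resample a wav file to 16 kHz so it matches the rest of the dataset (paths are placeholders)
import librosa
import soundfile as sf
y, orig_sr = librosa.load('some_cry.wav', sr=None, mono=True)  # keep the original sample rate
y_16k = librosa.resample(y, orig_sr=orig_sr, target_sr=16000)
sf.write('some_cry_16k.wav', y_16k, 16000)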

That concludes this walkthrough of baby cry recognition with Python and PaddleSpeech. For more on PaddleSpeech and baby cry recognition, please see my other related articles!