#!/usr/bin/env python3
"""
@author: Jithin Sasikumar
Load, preprocess audio dataset and extract features. The audio files are loaded,
each file is preprocessed, and dumped as `.npy` files which is convenient to work
with. Thus, dumped `.npy` files can be loaded and performed with some additional
preprocessing steps and can be used for training.
"""
import os
from dataclasses import dataclass
from typing import List, Tuple

import librosa
import numpy as np
from tqdm import tqdm
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

from src.exception_handler import DirectoryError, ValueError


@dataclass
class Dataset:
"""
Dataclass that represent a dataset which is flexible to be used
for any model training.
"""
x_train: np.ndarray = None
y_train: np.array = None
x_test: np.ndarray = None
y_test: np.array = None
@dataclass
class Preprocess:
"""Preprocess audio dataset to be used for training.
"""
dataset_: Dataset = None
train_dir: str = "./dataset/train/"
n_mfcc: int = 49
mfcc_length: int = 40
sampling_rate: int = 8000
extension: str = ".npy"

    def __post_init__(self) -> None:
        """
        Dunder method that validates the training directory upon initialization.

        Returns:
            None.

        Raises
        ------
        DirectoryError: Exception
            If self.train_dir does not exist.
        """
        if not os.path.exists(self.train_dir):
            raise DirectoryError(
                f"{self.train_dir} doesn't exist. Please enter a valid path!!!")

    @property
    def labels(self) -> List:
        """
        Class property returning the labels inferred from the `.npy`
        filenames in the training directory.

        Returns
        -------
        List of labels.
        """
        return ['.'.join(file_.split('.')[:-1])
                for file_ in os.listdir(self.train_dir)
                if os.path.isfile(os.path.join(self.train_dir, file_))
                and check_fileType(filename = file_, extension = self.extension)]

    def __load_dataset(self, labels: List,
                       load_format: str = ".npy") -> Tuple[np.ndarray, np.ndarray]:
        """
        Private method to load `.npy` files to preprocess.

        Parameters
        ----------
        labels: List
            List of labels.
        load_format: str
            Format to load from disk. Defaults to `.npy`.

        Returns
        -------
        data, targets: Tuple[np.ndarray, np.ndarray]
            Tuple representing data(X) and its labels(y).
        """
        # The first label's file seeds the arrays; each subsequent file is
        # stacked below it, and its rows are tagged with that label's index.
        # A separate `targets` array avoids shadowing the `labels` parameter.
        data = np.load(self.train_dir + labels[0] + load_format)
        targets = np.zeros(data.shape[0])
        for index, label in enumerate(labels[1:]):
            x = np.load(self.train_dir + label + load_format)
            data = np.vstack((data, x))
            targets = np.append(targets, np.full(x.shape[0],
                                                 fill_value = (index + 1)))
        return data, targets
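    # For example (hypothetical label names, assuming yes.npy and no.npy exist
    # in train_dir): __load_dataset(["yes", "no"]) stacks both arrays and
    # returns targets of 0 for every "yes" row and 1 for every "no" row.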

    def preprocess_dataset(self, labels: List,
                           test_split_percent: float) -> Dataset:
        """
        Preprocess the loaded dataset.

        Parameters
        ----------
        labels: List
            List of labels.
        test_split_percent: float
            Fraction of the data held out for the test set.

        Returns
        -------
        instanceof(Dataset):
            Instance of Dataset after preprocessing.
            The labels are one-hot encoded.

        Raises
        ------
        ValueError: Exception
            If the loaded dataset is empty or null.
        """
        X, y = self.__load_dataset(labels)
        x_train, x_test, y_train, y_test = train_test_split(X, y,
                                                            test_size = test_split_percent,
                                                            random_state = 42,
                                                            shuffle = True)
        for name, data in zip(("x_train", "x_test", "y_train", "y_test"),
                              (x_train, x_test, y_train, y_test)):
            if data is None or len(data) == 0:
                raise ValueError(f"{name} is empty. Please check and preprocess again!!!")
        return Dataset(x_train, to_categorical(y_train, num_classes = len(labels)),
                       x_test, to_categorical(y_test, num_classes = len(labels)))
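    # For example, with 10 labels y_train has shape (n_train, 10):
    # to_categorical turns a class index such as 3 into
    # [0, 0, 0, 1, 0, 0, 0, 0, 0, 0].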

    def dump_audio_files(self, audio_files_dir: str, labels: List, n_mfcc: int,
                         mfcc_length: int, sampling_rate: int,
                         save_format: str = ".npy") -> None:
        """
        Method to load, process and dump audio files as `.npy` for training.
        This method is `optional` and is used only when raw audio files are
        available. If not, skip this method and use preprocess_dataset()
        directly for training.

        Parameters
        ----------
        audio_files_dir: str
            Directory holding one sub-directory of audio files per label.
        labels: List
            List of labels.
        n_mfcc: int
            Number of MFCCs to return.
        mfcc_length: int
            Length of MFCC features for each audio input.
        sampling_rate: int
            Target sampling rate.
        save_format: str
            Format to save to disk. Defaults to `.npy`.

        Returns
        -------
        None.
        """
        for label in labels:
            mfcc_features_np = list()
            # os.path.join keeps the paths valid whether or not
            # audio_files_dir ends with a separator.
            audio_files = [os.path.join(audio_files_dir, label, audio_file)
                           for audio_file in os.listdir(os.path.join(audio_files_dir, label))]
            for audio_file in tqdm(audio_files):
                mfcc_features = convert_audio_to_mfcc(audio_file, n_mfcc,
                                                      mfcc_length, sampling_rate)
                mfcc_features_np.append(mfcc_features)
            np.save(self.train_dir + label + save_format, mfcc_features_np)
        print(f".npy files dumped to {self.train_dir}")

    def wrap_labels(self) -> List:
        """
        Wrapper function to read the labels from a file.
        This is not a generic approach, but it is required for inference: the
        large `.npy` files cannot be committed due to Git's file-size limits,
        and Git LFS is not feasible for this application, so this function is
        a small workaround.

        Returns
        -------
        labels: List
        """
        # The `with` block closes the file automatically; no explicit
        # close() call is needed.
        with open(f"{self.train_dir}/labels.txt", "r") as file:
            file_data: str = file.read()
            labels: List = file_data.split(",")
        return labels
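    # For example, a labels.txt containing "yes,no,up,down" (hypothetical
    # contents for illustration) yields ["yes", "no", "up", "down"].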


def convert_audio_to_mfcc(audio_file_path: str,
                          n_mfcc: int, mfcc_length: int,
                          sampling_rate: int) -> np.ndarray:
    """
    Helper function to convert a single audio file to MFCC features. It is
    a generic function which can be called without an instance.

    Parameters
    ----------
    audio_file_path: str
        Path of the audio file.
    n_mfcc: int
        Number of MFCCs to return.
    mfcc_length: int
        Length of MFCC features for each audio input.
    sampling_rate: int
        Target sampling rate.

    Returns
    -------
    mfcc_features: np.ndarray
        Extracted MFCC features of the audio file.
    """
    audio, sampling_rate = librosa.load(audio_file_path, sr = sampling_rate)
    # Keyword arguments keep this call compatible with librosa >= 0.10,
    # where the positional audio argument was removed.
    mfcc_features: np.ndarray = librosa.feature.mfcc(y = audio,
                                                     sr = sampling_rate,
                                                     n_mfcc = n_mfcc)
    # Pad short clips with zeros and truncate long ones, so every file
    # yields a fixed (n_mfcc, mfcc_length) matrix.
    if mfcc_length > mfcc_features.shape[1]:
        padding_width = mfcc_length - mfcc_features.shape[1]
        mfcc_features = np.pad(mfcc_features,
                               pad_width = ((0, 0), (0, padding_width)),
                               mode = 'constant')
    else:
        mfcc_features = mfcc_features[:, :mfcc_length]
    return mfcc_features
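# For example, with n_mfcc = 49 and mfcc_length = 40, a clip producing a
# (49, 32) matrix is zero-padded to (49, 40), while one producing (49, 55)
# is truncated to (49, 40).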


def check_fileType(filename: str, extension: str) -> bool:
    """
    Helper function to check the extension of a file.

    Parameters
    ----------
    filename: str
        Input filename.
    extension: str
        File extension to check.

    Returns
    -------
    bool: True if the filename has the given extension, else False.
    """
    # Compare the suffix exactly; a substring test would wrongly accept
    # e.g. "model.py" for ".npy".
    return '.' in filename and \
        filename.rsplit('.', 1)[1].lower() == extension.lstrip('.').lower()
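# For example, check_fileType("yes.npy", ".npy") is True, while
# check_fileType("yes.wav", ".npy") is False.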


def print_shape(name: str, arr: np.ndarray) -> None:
    """
    Helper function to print the shape of an input numpy array.

    Note:
        To avoid boilerplate code!

    Parameters
    ----------
    name: str
        Name of the input array.
    arr: np.ndarray
        The input array itself.

    Returns
    -------
    None
    """
    print(f"Shape of {name}: {arr.shape}")