-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathVocalseparation.py
89 lines (71 loc) · 2.97 KB
/
Vocalseparation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Code source: Brian McFee
# License: ISC
##################
# Standard imports
from __future__ import print_function

import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile
import librosa
import librosa.display
# Load the input recording; ``duration=120`` caps how much of it is read.
y, sr = librosa.load('/home/pranav/Desktop/myrepo/Test/ak.mp3', duration=120)

# Split the STFT of the signal into its magnitude and phase parts.
S_full, phase = librosa.magphase(librosa.stft(y))

# Frame range for the 30 s - 35 s excerpt; reused whenever a spectrogram
# is displayed.
start_frame, end_frame = librosa.time_to_frames([30, 35], sr=sr)
idx = slice(start_frame, end_frame)

# Show the excerpt of the unprocessed spectrogram in dB relative to its peak.
plt.figure(figsize=(12, 4))
excerpt_db = librosa.amplitude_to_db(S_full[:, idx], ref=np.max)
librosa.display.specshow(excerpt_db, y_axis='log', x_axis='time', sr=sr)
plt.colorbar()
plt.tight_layout()
# Frames are compared with cosine similarity, and similar frames are
# aggregated by taking their per-frequency median.
#
# To avoid being biased by local continuity, frames only count as similar
# when they are at least 2 seconds apart.
#
# This suppresses sparse/non-repetitive deviations from the average spectrum,
# and works well to discard vocal elements.
min_separation_frames = int(librosa.time_to_frames(2, sr=sr))
S_filter = librosa.decompose.nn_filter(
    S_full,
    aggregate=np.median,
    metric='cosine',
    width=min_separation_frames,
)

# If the signals are assumed to be additive, the filter output should never
# exceed the input.  Taking the pointwise minimum with the input spectrum
# enforces that.
S_filter = np.minimum(S_full, S_filter)
# A margin on each mask reduces bleed between the vocals and instrumentation
# masks.  Note: the background and foreground margins need not be equal.
margin_b = 2
margin_f = 10
power = 2

# What is left of the input after removing the filtered estimate; both masks
# use it, so compute it once.
S_residual = S_full - S_filter

mask_b = librosa.util.softmask(S_filter, margin_b * S_residual, power=power)
mask_f = librosa.util.softmask(S_residual, margin_f * S_filter, power=power)
# Once we have the masks, simply multiply them with the input spectrum
# to separate the components.  These stay magnitude-only because the
# spectrogram plots feed them straight into amplitude_to_db.
S_foreground = mask_f * S_full
S_background = mask_b * S_full

# Re-attach the phase of the original STFT before inverting.  The masked
# spectrograms are real-valued magnitudes; inverting them on their own would
# rebuild the audio with every bin at zero phase instead of the recording's
# actual phase.
back = librosa.istft(S_background * phase)
fore = librosa.istft(S_foreground * phase)

# librosa.output.write_wav was removed in librosa 0.8, so write the files
# with scipy instead.  Note the argument order: (path, rate, data).
wavfile.write('/home/pranav/Desktop/myrepo/back.wav', sr, back)
wavfile.write('/home/pranav/Desktop/myrepo/fore.wav', sr, fore)
# sphinx_gallery_thumbnail_number = 2
# Stack the full, background and foreground spectrograms for the same excerpt.
plt.figure(figsize=(12, 8))
panels = [
    ('Full spectrum', S_full),
    ('Background', S_background),
    ('Foreground', S_foreground),
]
for row, (title, spec) in enumerate(panels, start=1):
    plt.subplot(len(panels), 1, row)
    axis_kwargs = {'y_axis': 'log', 'sr': sr}
    if row == len(panels):
        # Only the bottom panel is given a time axis.
        axis_kwargs['x_axis'] = 'time'
    librosa.display.specshow(
        librosa.amplitude_to_db(spec[:, idx], ref=np.max), **axis_kwargs)
    plt.title(title)
    plt.colorbar()
plt.tight_layout()
plt.show()