-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_masking_threshold.py
149 lines (120 loc) · 4.97 KB
/
generate_masking_threshold.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import scipy.io.wavfile as wav
import numpy as np
from scipy.fftpack import fft
from scipy.fftpack import ifft
from scipy import signal
import scipy
import librosa
def compute_PSD_matrix(audio, n_fft, hop_length, win_length):
"""
First, perform STFT.
Then, compute the PSD.
Last, normalize PSD.
"""
# win = np.sqrt(8.0 / 3.) * librosa.core.stft(audio, n_fft=hp.n_fft, hop_length=hp.hop_length, win_length=hp.win_length, center=False)
win = np.sqrt(8.0 / 3.) * librosa.core.stft(audio, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
z = abs(win / win_length)
psd_max = np.max(z * z)
psd = 10 * np.log10(z * z + 0.0000000000000000001)
PSD = 96 - np.max(psd) + psd
return PSD, psd_max
def Bark(f):
"""returns the bark-scale value for input frequency f (in Hz)"""
return 13 * np.arctan(0.00076 * f) + 3.5 * np.arctan(pow(f / 7500.0, 2))
def quiet(f):
"""returns threshold in quiet measured in SPL at frequency f with an offset 12(in Hz)"""
thresh = 3.64 * pow(f * 0.001, -0.8) - 6.5 * np.exp(-0.6 * pow(0.001 * f - 3.3, 2)) + 0.001 * pow(0.001 * f, 4) - 12
return thresh
def two_slops(bark_psd, delta_TM, bark_maskee):
"""
returns the masking threshold for each masker using two slopes as the spread function
"""
Ts = []
for tone_mask in range(bark_psd.shape[0]):
bark_masker = bark_psd[tone_mask, 0]
dz = bark_maskee - bark_masker
zero_index = np.argmax(dz > 0)
sf = np.zeros(len(dz))
sf[:zero_index] = 27 * dz[:zero_index]
sf[zero_index:] = (-27 + 0.37 * max(bark_psd[tone_mask, 1] - 40, 0)) * dz[zero_index:]
T = bark_psd[tone_mask, 1] + delta_TM[tone_mask] + sf
Ts.append(T)
return Ts
def compute_th(PSD, barks, ATH, freqs):
""" returns the global masking threshold
"""
# Identification of tonal maskers
# find the index of maskers that are the local maxima
length = len(PSD)
masker_index = signal.argrelextrema(PSD, np.greater)[0]
# delete the boundary of maskers for smoothing
if 0 in masker_index:
masker_index = np.delete(0)
if length - 1 in masker_index:
masker_index = np.delete(length - 1)
num_local_max = len(masker_index)
# treat all the maskers as tonal (conservative way)
# smooth the PSD
p_k = pow(10, PSD[masker_index] / 10.)
p_k_prev = pow(10, PSD[masker_index - 1] / 10.)
p_k_post = pow(10, PSD[masker_index + 1] / 10.)
P_TM = 10 * np.log10(p_k_prev + p_k + p_k_post)
# P_TM = 10 * np.log10(p_k)
# bark_psd: the first column bark, the second column: P_TM, the third column: the index of points
_BARK = 0
_PSD = 1
_INDEX = 2
bark_psd = np.zeros([num_local_max, 3])
bark_psd[:, _BARK] = barks[masker_index]
bark_psd[:, _PSD] = P_TM
bark_psd[:, _INDEX] = masker_index
# delete the masker that doesn't have the highest PSD within 0.5 Bark around its frequency
for i in range(num_local_max):
next = i + 1
if next >= bark_psd.shape[0]:
break
while bark_psd[next, _BARK] - bark_psd[i, _BARK] < 0.5:
# masker must be higher than quiet threshold
if quiet(freqs[int(bark_psd[i, _INDEX])]) > bark_psd[i, _PSD]:
bark_psd = np.delete(bark_psd, (i), axis=0)
if next == bark_psd.shape[0]:
break
if bark_psd[i, _PSD] < bark_psd[next, _PSD]:
bark_psd = np.delete(bark_psd, (i), axis=0)
else:
bark_psd = np.delete(bark_psd, (next), axis=0)
if next == bark_psd.shape[0]:
break
# compute the individual masking threshold
delta_TM = 1 * (-6.025 - 0.275 * bark_psd[:, 0])
Ts = two_slops(bark_psd, delta_TM, barks)
Ts = np.array(Ts)
# compute the global masking threshold
# theta_x = np.row_stack((Ts, ATH.reshape(1, -1)))
# theta_x = np.max(theta_x, axis=0)
theta_x = 10 * np.log10(np.sum(pow(10, Ts / 10.), axis=0) + pow(10, ATH / 10.))
return theta_x
def generate_th(
audio: np.array,
sample_rate: int,
n_fft: int,
hop_length: int,
win_length: int,
):
"""
returns the masking threshold theta_xs and the max psd of the audio
"""
PSD, psd_max = compute_PSD_matrix(audio, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
freqs = librosa.core.fft_frequencies(sr=sample_rate, n_fft=n_fft)
barks = Bark(freqs)
# compute the quiet threshold
ATH = np.zeros(len(barks)) - np.inf
bark_ind = np.argmax(barks > 1)
ATH[bark_ind:] = quiet(freqs[bark_ind:])
# compute the global masking threshold theta_xs
theta_xs = []
# compute the global masking threshold in each window
for i in range(PSD.shape[1]):
theta_xs.append(compute_th(PSD[:, i], barks, ATH, freqs))
theta_xs = np.array(theta_xs)
return theta_xs, psd_max