dspeech.feature source code

1 /**
2    Speech feature extraction module
3 
4    See also:
5    - "Chapter 5 Speech Input/Output," HTK book 3.5 alpha1, pp. 89 - 1
6    - https://github.com/jameslyons/python_speech_features
7 
8    TODO: preemphasis and dither
9 */
10 module dspeech.feature;
11 
12 /** Compute Wave -> Spectrogram -> FBANK -> MFCC eventually
13 
14 - preemphasised and dithered wave
15 
16 $(IMG dspeech.feature.speech_wav.png )
17 
18 - power spectrogram
19 
20 $(IMG dspeech.feature.speech_spectrogram.png )
21 
22 - log-Mel filterbank (FBANK)
23 
24 $(IMG dspeech.feature.speech_fbank.png )
25 
26 - Mel-frequency cepstral coefficients (MFCC)
27 
28 $(IMG dspeech.feature.speech_mfcc.png )
29 
30  */
31 unittest
32 {
33     import dspeech.plot : docDir, docWav, plotVector, plotMatrix;
34     import dffmpeg : Audio;
35     import mir.ndslice : map, sliced;
36 
37     auto raw = Audio!short().load(docWav);
38     auto wav = raw.data.sliced.dither(1.0).preemphasis(0.97);
39     wav.plotVector.save(docDir ~ "dspeech.feature.speech_wav.png", 400, 200);
40     auto sp = wav.spectrogram(512, 256);
41     sp.plotMatrix.save(docDir ~ "dspeech.feature.speech_spectrogram.png", 400, 200);
42     auto fbanks = sp.toFbank(melMatrix(sp.length!0, 80, cast(double) raw.sample_rate));
43     fbanks.plotMatrix.save(docDir ~ "dspeech.feature.speech_fbank.png", 400, 200);
44     auto mfccs = fbanks.toMfcc;
45     mfccs.plotMatrix.save(docDir ~ "dspeech.feature.speech_mfcc.png", 400, 200);
46 }
47 
48 
49 
50 import numir.signal : hann;
51 
52 /// Dither quantized wave
53 auto dither(S)(S wave, double stddev = 1.0)
54 {
55     import mir.ndslice : map;
56     import numir.random : RNG;
57     import mir.random.variable: NormalVariable;
58 
59     return wave.map!(a => a + NormalVariable!double(0, stddev)(RNG.get));
60 }
61 
62 
63 /// Preemphasis wave signal
64 auto preemphasis(S)(S wave, double factor = 0.97)
65 {
66     return wave[1 .. $] - factor * wave[0 .. $-1];
67 }
68 
69 /**
70    Computes a magnitude 2-D spectrogram from a time-domain 1-D signal
71 
72 Params:
73     windowFun = window function
74     xs = 1-D slice signal
75     nperseg = (default 256) short-time frame width for each FFT segment
76     noverlap = (default nperseg / 2) short-time frame overlapped length for each FFT segment
77 
78 Returns: a magnitude 2D spectrogram
79 */
80 auto spectrogram(alias windowFun=hann, bool power=false, S)(S xs, size_t nperseg = 256, size_t noverlap = 128)
81 {
82     import numir.signal : stft;
83     import mir.math.common : sqrt;
84     import mir.ndslice.topology : map;
85     import mir.ndslice : transposed;
86 
87     auto stfs = stft!windowFun(xs, nperseg, noverlap);
88     auto upper = stfs.transposed[nperseg / 2 .. $];
89     static if (power)
90     {
91         return upper.map!(a => (a.re * a.re + a.im * a.im));
92     }
93     else
94     {
95         return upper.map!(a => sqrt(a.re * a.re + a.im * a.im));
96     }
97 }
98 
99 /// example spectrogram of freq-modulated sin(a t + b cos(c t))
100 /// its plot should be simple sinwave in freq domain
101 /// $(IMG dspeech.feature.spectrogram.png )
102 unittest
103 {
104     import mir.ndslice : as, map, iota;
105     import mir.math : sin, cos;
106     import std.math : PI;
107     import dspeech.plot : plotMatrix, docDir;
108 
109     auto time = iota(50000).as!double / 10e3;
110     auto mod = 1e3 * map!cos(2.0 * PI * 0.5 * time);
111     auto xs = map!sin(2.0 * PI * 3e3 * time  + mod);
112     auto sp = spectrogram(xs);
113     auto fig = plotMatrix(sp);
114     fig.save(docDir ~ "dspeech.feature.spectrogram.png",
115              cast(int) sp.length!1 * 2, cast(int) sp.length!0 * 2);
116 }
117 
118 @nogc @safe nothrow pure melScale(double freq)
119 {
120     import mir.math.common : log;
121     return 1127.0 * log(1.0 + freq / 700.0);
122 }
123 
124 
125 
126 /**
127 Computes Mel-scale filterbank matrix
128 
129 Params:
130     nFreq: the number of FFT bins
131     nMel: the number of filterbank bins
132     sampleFreq: sampling frequency of FFT signal
133     lowFreq: lowest frequency threshold of the filterbank
134     highFreq: highest (relative from Nyquist freq) frequency threshold of the filterbank
135 
136 TODO:
137     make this function @nogc
138  */
139 @safe nothrow pure melMatrix(Dtype = double)(size_t nFreq, size_t nMel, Dtype sampleFreq, Dtype lowFreq= 20, Dtype highFreq = 0)
140 {
141     import mir.ndslice : iota, as, sliced, map;
142     import numir : zeros;
143     const nyquist = 0.5 * sampleFreq;
144     highFreq += nyquist;
145     const lowMel = lowFreq.melScale;
146     const highMel = highFreq.melScale;
147     const deltaMel = (highMel - lowMel) / nMel;
148     const deltaFreq = nyquist / nFreq;
149     const binsMel = iota(nMel + 2).as!Dtype * deltaMel + lowMel;
150 
151     return iota(nFreq, nMel).map!(
152         (i) {
153             const iFreq = i / nMel;
154             const iMel = i % nMel;
155             const leftMel = binsMel[iMel];
156             const centerMel = binsMel[iMel + 1];
157             const rightMel = binsMel[iMel + 2];
158             const mel = melScale(deltaFreq * iFreq);
159             if (mel >= rightMel)
160             {
161                 assert(iFreq > 0, "too large nMel you set");
162                 return cast(Dtype) 0;
163             }
164             if (mel > leftMel)
165             {
166                 return (mel <= centerMel ? mel - leftMel : rightMel - mel) / deltaMel;
167             }
168             return cast(Dtype) 0;
169         });
170 }
171 
172 /// $(IMG dspeech.feature.mel.png )
173 unittest
174 {
175     import mir.ndslice : transposed;
176     import dspeech.plot : plotMatrix, docDir;
177     auto m = melMatrix!double(200, 40, 16000.0);
178     m.transposed.plotMatrix.save(
179         docDir ~ "dspeech.feature.mel.png",
180         cast(int) m.length!0 * 3, cast(int) m.length!1 * 9);
181 }
182 
183 /**
184 Computes type-II DCT on 1-d signal:
185 
186 $(MATH y[k] = 2 f \sum_{n=0}^{N-1} x[n] \cos \pi k \frac{2n + 1}{2 N} ),
187 
188 where the scaling factor f = sqrt(1 / 4 N) if k = 0,  f = sqrt(1 / 2 N) otherwise,
189 
190 Params:
191     xs = input array
192 
193 Returns:
194     DCT transformed 1-d array
195 
196 See_also: https://docs.scipy.org/doc/scipy/reference/generated/scipy.fftpack.dct.html
197  */
198 auto dct(alias sumKind = "precise", S)(S xs)
199 {
200     import std.math : PI;
201     import mir.math.common;
202     import mir.math.sum;
203     import mir.primitives : DimensionCount;
204     static assert(DimensionCount!S == 1);
205     import mir.ndslice;
206     const N = xs.length;
207     const f0 = 2 * sqrt(0.25 / N);
208     const f = 2 * sqrt(0.5 / N);
209     return  iota(N).map!(k => (k == 0 ? f0 : f) * iota(N).map!(n => xs[n] * cos(PI * k * (2 * n + 1) / (2 * N))).sum!sumKind);
210 }
211 
212 
213 /// compare with scipy.fftpack.dct
214 unittest
215 {
216     import mir.ndslice;
217     import numir.testing : approxEqual;
218     auto y = dct([4.0, 3.0, 5.0, 10.0].sliced);
219     // python: scipy.fftpack.dct([4., 3., 5., 10.], type=2, norm="ortho")
220     assert(approxEqual([11.        , -4.46088499,  3.        , -0.31702534].sliced, y.slice));
221 }
222 
223 
224 /// Computes log-Mel filterbank (FBANK) feature from spectrogram signal and Mel matrix
225 auto toFbank(S, M)(S spect, M melmat)
226 {
227     import lubeck : mtimes;
228     import mir.ndslice : map, transposed;
229     import mir.math.common : log, fmax;
230 
231     return melmat.transposed.mtimes(spect).map!(x => fmax(x, typeof(x).epsilon).log);
232 }
233 
234 
235 /// Computes MFCC feature from log-Mel filterbank (FBANK) feature
236 auto toMfcc(S)(S fbanks, size_t nceps = 13, double lifter = 22)
237 {
238     import numir : alongDim;
239     import mir.ndslice : map, iota, ndiota, transposed, slice;
240     import mir.math.common : sin;
241     import std.math : PI;
242 
243     auto l = 1.0 + 0.5 * iota(nceps).map!(i => lifter * sin(PI * i / lifter));
244     auto N = fbanks.length!1;
245     return ndiota(nceps, N).map!(i => dct(fbanks[0 .. $, i[1]])[i[0] + 1] * l[i[0]]);
246 }