1 /** 2 Speech feature extraction module 3 4 See also: 5 - "Chapter 5 Speech Input/Output," HTK book 3.5 alpha1, pp. 89 - 1 6 - https://github.com/jameslyons/python_speech_features 7 8 TODO: preemphasis and dither 9 */ 10 module dspeech.feature; 11 12 /** Compute Wave -> Spectrogram -> FBANK -> MFCC eventually 13 14 - preemphasised and dithered wave 15 16 $(IMG dspeech.feature.speech_wav.png ) 17 18 - power spectrogram 19 20 $(IMG dspeech.feature.speech_spectrogram.png ) 21 22 - log-Mel filterbank (FBANK) 23 24 $(IMG dspeech.feature.speech_fbank.png ) 25 26 - Mel-frequency cepstral coefficients (MFCC) 27 28 $(IMG dspeech.feature.speech_mfcc.png ) 29 30 */ 31 unittest 32 { 33 import dspeech.plot : docDir, docWav, plotVector, plotMatrix; 34 import dffmpeg : Audio; 35 import mir.ndslice : map, sliced; 36 37 auto raw = Audio!short().load(docWav); 38 auto wav = raw.data.sliced.dither(1.0).preemphasis(0.97); 39 wav.plotVector.save(docDir ~ "dspeech.feature.speech_wav.png", 400, 200); 40 auto sp = wav.spectrogram(512, 256); 41 sp.plotMatrix.save(docDir ~ "dspeech.feature.speech_spectrogram.png", 400, 200); 42 auto fbanks = sp.toFbank(melMatrix(sp.length!0, 80, cast(double) raw.sample_rate)); 43 fbanks.plotMatrix.save(docDir ~ "dspeech.feature.speech_fbank.png", 400, 200); 44 auto mfccs = fbanks.toMfcc; 45 mfccs.plotMatrix.save(docDir ~ "dspeech.feature.speech_mfcc.png", 400, 200); 46 } 47 48 49 50 import numir.signal : hann; 51 52 /// Dither quantized wave 53 auto dither(S)(S wave, double stddev = 1.0) 54 { 55 import mir.ndslice : map; 56 import numir.random : RNG; 57 import mir.random.variable: NormalVariable; 58 59 return wave.map!(a => a + NormalVariable!double(0, stddev)(RNG.get)); 60 } 61 62 63 /// Preemphasis wave signal 64 auto preemphasis(S)(S wave, double factor = 0.97) 65 { 66 return wave[1 .. $] - factor * wave[0 .. $-1]; 67 } 68 69 /** 70 Computes a magnitude 2-D spectrogram from a time-domain 1-D signal 71 72 Params: 73 windowFun = window function 74 xs = 1-D slice signal 75 nperseg = (default 256) short-time frame width for each FFT segment 76 noverlap = (default nperseg / 2) short-time frame overlapped length for each FFT segment 77 78 Returns: a magnitude 2D spectrogram 79 */ 80 auto spectrogram(alias windowFun=hann, bool power=false, S)(S xs, size_t nperseg = 256, size_t noverlap = 128) 81 { 82 import numir.signal : stft; 83 import mir.math.common : sqrt; 84 import mir.ndslice.topology : map; 85 import mir.ndslice : transposed; 86 87 auto stfs = stft!windowFun(xs, nperseg, noverlap); 88 auto upper = stfs.transposed[nperseg / 2 .. $]; 89 static if (power) 90 { 91 return upper.map!(a => (a.re * a.re + a.im * a.im)); 92 } 93 else 94 { 95 return upper.map!(a => sqrt(a.re * a.re + a.im * a.im)); 96 } 97 } 98 99 /// example spectrogram of freq-modulated sin(a t + b cos(c t)) 100 /// its plot should be simple sinwave in freq domain 101 /// $(IMG dspeech.feature.spectrogram.png ) 102 unittest 103 { 104 import mir.ndslice : as, map, iota; 105 import mir.math : sin, cos; 106 import std.math : PI; 107 import dspeech.plot : plotMatrix, docDir; 108 109 auto time = iota(50000).as!double / 10e3; 110 auto mod = 1e3 * map!cos(2.0 * PI * 0.5 * time); 111 auto xs = map!sin(2.0 * PI * 3e3 * time + mod); 112 auto sp = spectrogram(xs); 113 auto fig = plotMatrix(sp); 114 fig.save(docDir ~ "dspeech.feature.spectrogram.png", 115 cast(int) sp.length!1 * 2, cast(int) sp.length!0 * 2); 116 } 117 118 @nogc @safe nothrow pure melScale(double freq) 119 { 120 import mir.math.common : log; 121 return 1127.0 * log(1.0 + freq / 700.0); 122 } 123 124 125 126 /** 127 Computes Mel-scale filterbank matrix 128 129 Params: 130 nFreq: the number of FFT bins 131 nMel: the number of filterbank bins 132 sampleFreq: sampling frequency of FFT signal 133 lowFreq: lowest frequency threshold of the filterbank 134 highFreq: highest (relative from Nyquist freq) frequency threshold of the filterbank 135 136 TODO: 137 make this function @nogc 138 */ 139 @safe nothrow pure melMatrix(Dtype = double)(size_t nFreq, size_t nMel, Dtype sampleFreq, Dtype lowFreq= 20, Dtype highFreq = 0) 140 { 141 import mir.ndslice : iota, as, sliced, map; 142 import numir : zeros; 143 const nyquist = 0.5 * sampleFreq; 144 highFreq += nyquist; 145 const lowMel = lowFreq.melScale; 146 const highMel = highFreq.melScale; 147 const deltaMel = (highMel - lowMel) / nMel; 148 const deltaFreq = nyquist / nFreq; 149 const binsMel = iota(nMel + 2).as!Dtype * deltaMel + lowMel; 150 151 return iota(nFreq, nMel).map!( 152 (i) { 153 const iFreq = i / nMel; 154 const iMel = i % nMel; 155 const leftMel = binsMel[iMel]; 156 const centerMel = binsMel[iMel + 1]; 157 const rightMel = binsMel[iMel + 2]; 158 const mel = melScale(deltaFreq * iFreq); 159 if (mel >= rightMel) 160 { 161 assert(iFreq > 0, "too large nMel you set"); 162 return cast(Dtype) 0; 163 } 164 if (mel > leftMel) 165 { 166 return (mel <= centerMel ? mel - leftMel : rightMel - mel) / deltaMel; 167 } 168 return cast(Dtype) 0; 169 }); 170 } 171 172 /// $(IMG dspeech.feature.mel.png ) 173 unittest 174 { 175 import mir.ndslice : transposed; 176 import dspeech.plot : plotMatrix, docDir; 177 auto m = melMatrix!double(200, 40, 16000.0); 178 m.transposed.plotMatrix.save( 179 docDir ~ "dspeech.feature.mel.png", 180 cast(int) m.length!0 * 3, cast(int) m.length!1 * 9); 181 } 182 183 /** 184 Computes type-II DCT on 1-d signal: 185 186 $(MATH y[k] = 2 f \sum_{n=0}^{N-1} x[n] \cos \pi k \frac{2n + 1}{2 N} ), 187 188 where the scaling factor f = sqrt(1 / 4 N) if k = 0, f = sqrt(1 / 2 N) otherwise, 189 190 Params: 191 xs = input array 192 193 Returns: 194 DCT transformed 1-d array 195 196 See_also: https://docs.scipy.org/doc/scipy/reference/generated/scipy.fftpack.dct.html 197 */ 198 auto dct(alias sumKind = "precise", S)(S xs) 199 { 200 import std.math : PI; 201 import mir.math.common; 202 import mir.math.sum; 203 import mir.primitives : DimensionCount; 204 static assert(DimensionCount!S == 1); 205 import mir.ndslice; 206 const N = xs.length; 207 const f0 = 2 * sqrt(0.25 / N); 208 const f = 2 * sqrt(0.5 / N); 209 return iota(N).map!(k => (k == 0 ? f0 : f) * iota(N).map!(n => xs[n] * cos(PI * k * (2 * n + 1) / (2 * N))).sum!sumKind); 210 } 211 212 213 /// compare with scipy.fftpack.dct 214 unittest 215 { 216 import mir.ndslice; 217 import numir.testing : approxEqual; 218 auto y = dct([4.0, 3.0, 5.0, 10.0].sliced); 219 // python: scipy.fftpack.dct([4., 3., 5., 10.], type=2, norm="ortho") 220 assert(approxEqual([11. , -4.46088499, 3. , -0.31702534].sliced, y.slice)); 221 } 222 223 224 /// Computes log-Mel filterbank (FBANK) feature from spectrogram signal and Mel matrix 225 auto toFbank(S, M)(S spect, M melmat) 226 { 227 import lubeck : mtimes; 228 import mir.ndslice : map, transposed; 229 import mir.math.common : log, fmax; 230 231 return melmat.transposed.mtimes(spect).map!(x => fmax(x, typeof(x).epsilon).log); 232 } 233 234 235 /// Computes MFCC feature from log-Mel filterbank (FBANK) feature 236 auto toMfcc(S)(S fbanks, size_t nceps = 13, double lifter = 22) 237 { 238 import numir : alongDim; 239 import mir.ndslice : map, iota, ndiota, transposed, slice; 240 import mir.math.common : sin; 241 import std.math : PI; 242 243 auto l = 1.0 + 0.5 * iota(nceps).map!(i => lifter * sin(PI * i / lifter)); 244 auto N = fbanks.length!1; 245 return ndiota(nceps, N).map!(i => dct(fbanks[0 .. $, i[1]])[i[0] + 1] * l[i[0]]); 246 }