1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
/**
* Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
*
* See LICENSE for clarification regarding multiple authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// This file is copied/modified from kaldi/src/feat/mel-computations.h
#ifndef KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_
#define KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_
#include <cmath>
#include <cstdint>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
#include "feature-window.h"
namespace knf {
struct FrameExtractionOptions;
struct MelBanksOptions {
int32_t num_bins = 25; // e.g. 25; number of triangular bins
float low_freq = 20; // e.g. 20; lower frequency cutoff
// an upper frequency cutoff; 0 -> no cutoff, negative
// ->added to the Nyquist frequency to get the cutoff.
float high_freq = 0;
float vtln_low = 100; // vtln lower cutoff of warping function.
// vtln upper cutoff of warping function: if negative, added
// to the Nyquist frequency to get the cutoff.
float vtln_high = -500;
bool debug_mel = false;
// htk_mode is a "hidden" config, it does not show up on command line.
// Enables more exact compatibility with HTK, for testing purposes. Affects
// mel-energy flooring and reproduces a bug in HTK.
bool htk_mode = false;
// Note that if you set is_librosa, you probably need to set
// low_freq to 0.
// Please see
// https://librosa.org/doc/main/generated/librosa.filters.mel.html
bool is_librosa = false;
// used only when is_librosa=true
// Possible values: "", slaney. We don't support a numeric value here, but
// it can be added on demand.
// See https://librosa.org/doc/main/generated/librosa.filters.mel.html
std::string norm = "slaney";
// used only when is_librosa is true
bool use_slaney_mel_scale = true;
// used only when is_librosa is true
bool floor_to_int_bin = false;
std::string ToString() const {
std::ostringstream os;
os << "num_bins: " << num_bins << "\n";
os << "low_freq: " << low_freq << "\n";
os << "high_freq: " << high_freq << "\n";
os << "vtln_low: " << vtln_low << "\n";
os << "vtln_high: " << vtln_high << "\n";
os << "debug_mel: " << debug_mel << "\n";
os << "htk_mode: " << htk_mode << "\n";
os << "is_librosa: " << is_librosa << "\n";
os << "norm: " << norm << "\n";
os << "use_slaney_mel_scale: " << use_slaney_mel_scale << "\n";
os << "floor_to_int_bin: " << floor_to_int_bin << "\n";
return os.str();
}
};
std::ostream &operator<<(std::ostream &os, const MelBanksOptions &opts);
class MelBanks {
public:
// see also https://en.wikipedia.org/wiki/Mel_scale
// htk, mel to hz
static inline float InverseMelScale(float mel_freq) {
return 700.0f * (expf(mel_freq / 1127.0f) - 1.0f);
}
// htk, hz to mel
static inline float MelScale(float freq) {
return 1127.0f * logf(1.0f + freq / 700.0f);
}
// slaney, mel to hz
static inline float InverseMelScaleSlaney(float mel_freq) {
if (mel_freq <= 15) {
return 200.0f / 3 * mel_freq;
}
// return 1000 * expf((mel_freq - 15) * logf(6.4f) / 27);
// Note: log(6.4)/27 = 0.06875177742094911
return 1000 * expf((mel_freq - 15) * 0.06875177742094911f);
}
// slaney, hz to mel
static inline float MelScaleSlaney(float freq) {
if (freq <= 1000) {
return freq * 3 / 200.0f;
}
// return 15 + 27 * logf(freq / 1000) / logf(6.4f)
//
// Note: 27/log(6.4) = 14.545078505785561
return 15 + 14.545078505785561f * logf(freq / 1000);
}
static float VtlnWarpFreq(
float vtln_low_cutoff,
float vtln_high_cutoff, // discontinuities in warp func
float low_freq,
float high_freq, // upper+lower frequency cutoffs in
// the mel computation
float vtln_warp_factor, float freq);
static float VtlnWarpMelFreq(float vtln_low_cutoff, float vtln_high_cutoff,
float low_freq, float high_freq,
float vtln_warp_factor, float mel_freq);
// TODO(fangjun): Remove vtln_warp_factor
MelBanks(const MelBanksOptions &opts,
const FrameExtractionOptions &frame_opts, float vtln_warp_factor);
// Initialize with a 2-d weights matrix
// @param weights Pointer to the start address of the matrix
// @param num_rows It equls to number of mel bins
// @param num_cols It equals to (number of fft bins)/2+1
MelBanks(const float *weights, int32_t num_rows, int32_t num_cols);
/// Compute Mel energies (note: not log energies).
/// At input, "fft_energies" contains the FFT energies (not log).
///
/// @param fft_energies 1-D array of size num_fft_bins/2+1
/// @param mel_energies_out 1-D array of size num_mel_bins
void Compute(const float *fft_energies, float *mel_energies_out) const;
int32_t NumBins() const { return bins_.size(); }
private:
// for kaldi-compatible
void InitKaldiMelBanks(const MelBanksOptions &opts,
const FrameExtractionOptions &frame_opts,
float vtln_warp_factor);
// for librosa-compatible
// See https://librosa.org/doc/main/generated/librosa.filters.mel.html
void InitLibrosaMelBanks(const MelBanksOptions &opts,
const FrameExtractionOptions &frame_opts,
float vtln_warp_factor);
private:
// the "bins_" vector is a vector, one for each bin, of a pair:
// (the first nonzero fft-bin), (the vector of weights).
std::vector<std::pair<int32_t, std::vector<float>>> bins_;
// TODO(fangjun): Remove debug_ and htk_mode_
bool debug_ = false;
bool htk_mode_ = false;
};
// Compute liftering coefficients (scaling on cepstral coeffs)
// coeffs are numbered slightly differently from HTK: the zeroth
// index is C0, which is not affected.
void ComputeLifterCoeffs(float Q, std::vector<float> *coeffs);
} // namespace knf
#endif // KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_