voxudio 0.7.2

A real-time audio processing library with ONNX runtime support
Documentation
// kaldi-native-fbank/csrc/feature-window.cc
//
// Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)

// This file is copied/modified from kaldi/src/feat/feature-window.cc

#include "feature-window.h"

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <limits>
#include <vector>
#include <iostream>
#include "kaldi-math.h"

namespace knf {

std::ostream &operator<<(std::ostream &os, const FrameExtractionOptions &opts) {
  os << opts.ToString();
  return os;
}

std::vector<float> GetWindow(const std::string &window_type,
                             int32_t window_size,
                             float blackman_coeff /*= 0.42*/) {
  std::vector<float> window(window_size);
  int32_t frame_length = window_size;
  KNF_CHECK_GT(frame_length, 0);

  float *window_data = window.data();

  double a = M_2PI / (frame_length - 1);
  if (window_type == "hann") {
    // see https://pytorch.org/docs/stable/generated/torch.hann_window.html
    // We assume periodic is true
    a = M_2PI / frame_length;
  }

  for (int32_t i = 0; i < frame_length; i++) {
    double i_fl = static_cast<double>(i);
    if (window_type == "hanning") {
      window_data[i] = 0.5 - 0.5 * cos(a * i_fl);
    } else if (window_type == "sine") {
      // when you are checking ws wikipedia, please
      // note that 0.5 * a = M_PI/(frame_length-1)
      window_data[i] = sin(0.5 * a * i_fl);
    } else if (window_type == "hamming") {
      window_data[i] = 0.54 - 0.46 * cos(a * i_fl);
    } else if (window_type == "hann") {
      window_data[i] = 0.50 - 0.50 * cos(a * i_fl);
    } else if (window_type == "povey") {
      // like hamming but goes to zero at edges.
      window_data[i] = pow(0.5 - 0.5 * cos(a * i_fl), 0.85);
    } else if (window_type == "rectangular") {
      window_data[i] = 1.0;
    } else if (window_type == "blackman") {
      window_data[i] = blackman_coeff - 0.5 * cos(a * i_fl) +
                       (0.5 - blackman_coeff) * cos(2 * a * i_fl);
    } else {
      fprintf(stderr, "Invalid window type '%s'\n", window_type.c_str());
      exit(-1);
    }
  }

  return window;
}

FeatureWindowFunction::FeatureWindowFunction(const FrameExtractionOptions &opts)
    : FeatureWindowFunction(opts.window_type, opts.WindowSize(),
                            opts.blackman_coeff) {}

FeatureWindowFunction::FeatureWindowFunction(const std::string &window_type,
                                             int32_t window_size,
                                             float blackman_coeff /*= 0.42*/)
    : window_(knf::GetWindow(window_type, window_size, blackman_coeff)) {}

FeatureWindowFunction::FeatureWindowFunction(const std::vector<float> &window)
    : window_(window) {}

void FeatureWindowFunction::Apply(float *wave) const {
  int32_t window_size = window_.size();
  const float *p = window_.data();
  for (int32_t k = 0; k != window_size; ++k) {
    wave[k] *= p[k];
  }
}

int64_t FirstSampleOfFrame(int32_t frame, const FrameExtractionOptions &opts) {
  int64_t frame_shift = opts.WindowShift();
  if (opts.snip_edges) {
    return frame * frame_shift;
  } else {
    int64_t midpoint_of_frame = frame_shift * frame + frame_shift / 2,
            beginning_of_frame = midpoint_of_frame - opts.WindowSize() / 2;
    return beginning_of_frame;
  }
}

int32_t NumFrames(int64_t num_samples, const FrameExtractionOptions &opts,
                  bool flush /*= true*/) {
  int64_t frame_shift = opts.WindowShift();
  int64_t frame_length = opts.WindowSize();
  if (opts.snip_edges) {
    // with --snip-edges=true (the default), we use a HTK-like approach to
    // determining the number of frames-- all frames have to fit completely into
    // the waveform, and the first frame begins at sample zero.
    if (num_samples < frame_length)
      return 0;
    else
      return (1 + ((num_samples - frame_length) / frame_shift));
    // You can understand the expression above as follows: 'num_samples -
    // frame_length' is how much room we have to shift the frame within the
    // waveform; 'frame_shift' is how much we shift it each time; and the ratio
    // is how many times we can shift it (integer arithmetic rounds down).
  } else {
    // if --snip-edges=false, the number of frames is determined by rounding the
    // (file-length / frame-shift) to the nearest integer.  The point of this
    // formula is to make the number of frames an obvious and predictable
    // function of the frame shift and signal length, which makes many
    // segmentation-related questions simpler.
    //
    // Because integer division in C++ rounds toward zero, we add (half the
    // frame-shift minus epsilon) before dividing, to have the effect of
    // rounding towards the closest integer.
    int32_t num_frames = (num_samples + (frame_shift / 2)) / frame_shift;

    if (flush) return num_frames;

    // note: 'end' always means the last plus one, i.e. one past the last.
    int64_t end_sample_of_last_frame =
        FirstSampleOfFrame(num_frames - 1, opts) + frame_length;

    // the following code is optimized more for clarity than efficiency.
    // If flush == false, we can't output frames that extend past the end
    // of the signal.
    while (num_frames > 0 && end_sample_of_last_frame > num_samples) {
      num_frames--;
      end_sample_of_last_frame -= frame_shift;
    }
    return num_frames;
  }
}

void ExtractWindow(int64_t sample_offset, const std::vector<float> &wave,
                   int32_t f, const FrameExtractionOptions &opts,
                   const FeatureWindowFunction &window_function,
                   std::vector<float> *window,
                   float *log_energy_pre_window /*= nullptr*/) {
  KNF_CHECK(sample_offset >= 0 && wave.size() != 0);

  int32_t frame_length = opts.WindowSize();
  int32_t frame_length_padded = opts.PaddedWindowSize();

  int64_t num_samples = sample_offset + wave.size();
  int64_t start_sample = FirstSampleOfFrame(f, opts);
  int64_t end_sample = start_sample + frame_length;

  if (opts.snip_edges) {
    KNF_CHECK(start_sample >= sample_offset && end_sample <= num_samples);
  } else {
    KNF_CHECK(sample_offset == 0 || start_sample >= sample_offset);
  }

  if (window->size() != frame_length_padded) {
    window->resize(frame_length_padded);
  }

  // wave_start and wave_end are start and end indexes into 'wave', for the
  // piece of wave that we're trying to extract.
  int32_t wave_start = int32_t(start_sample - sample_offset);
  int32_t wave_end = wave_start + frame_length;

  if (wave_start >= 0 && wave_end <= wave.size()) {
    // the normal case-- no edge effects to consider.
    std::copy(wave.begin() + wave_start,
              wave.begin() + wave_start + frame_length, window->data());
  } else {
    // Deal with any end effects by reflection, if needed.  This code will only
    // be reached for about two frames per utterance, so we don't concern
    // ourselves excessively with efficiency.
    int32_t wave_dim = wave.size();
    for (int32_t s = 0; s < frame_length; ++s) {
      int32_t s_in_wave = s + wave_start;
      while (s_in_wave < 0 || s_in_wave >= wave_dim) {
        // reflect around the beginning or end of the wave.
        // e.g. -1 -> 0, -2 -> 1.
        // dim -> dim - 1, dim + 1 -> dim - 2.
        // the code supports repeated reflections, although this
        // would only be needed in pathological cases.
        if (s_in_wave < 0)
          s_in_wave = -s_in_wave - 1;
        else
          s_in_wave = 2 * wave_dim - 1 - s_in_wave;
      }
      (*window)[s] = wave[s_in_wave];
    }
  }
  ProcessWindow(opts, window_function, window->data(), log_energy_pre_window);
}

static void RemoveDcOffset(float *d, int32_t n) {
  float sum = 0;
  for (int32_t i = 0; i != n; ++i) {
    sum += d[i];
  }

  float mean = sum / n;

  for (int32_t i = 0; i != n; ++i) {
    d[i] -= mean;
  }
}

float InnerProduct(const float *a, const float *b, int32_t n) {
  float sum = 0;
  for (int32_t i = 0; i != n; ++i) {
    sum += a[i] * b[i];
  }
  return sum;
}

void Dither(float *d, int32_t n, float dither_value) {
  if (dither_value == 0.0) {
    return;
  }

  RandomState rstate;
  for (int32_t i = 0; i < n; ++i) {
    d[i] += RandGauss(&rstate) * dither_value;
  }
}

static void Preemphasize(float *d, int32_t n, float preemph_coeff) {
  if (preemph_coeff == 0.0) {
    return;
  }

  KNF_CHECK(preemph_coeff >= 0.0 && preemph_coeff <= 1.0);

  for (int32_t i = n - 1; i > 0; --i) {
    d[i] -= preemph_coeff * d[i - 1];
  }
  d[0] -= preemph_coeff * d[0];
}

void ProcessWindow(const FrameExtractionOptions &opts,
                   const FeatureWindowFunction &window_function, float *window,
                   float *log_energy_pre_window /*= nullptr*/) {
  int32_t frame_length = opts.WindowSize();

  if (opts.dither != 0.0) {
    Dither(window, frame_length, opts.dither);
  }

  if (opts.remove_dc_offset) {
    RemoveDcOffset(window, frame_length);
  }

  if (log_energy_pre_window != NULL) {
    float energy = std::max<float>(InnerProduct(window, window, frame_length),
                                   std::numeric_limits<float>::epsilon());
    *log_energy_pre_window = std::log(energy);
  }

  if (opts.preemph_coeff != 0.0) {
    Preemphasize(window, frame_length, opts.preemph_coeff);
  }

  window_function.Apply(window);
}

}  // namespace knf