#pragma once
#include "ggml.h"
#include "clip-model.h"
#include <cstdint>
#include <vector>
#include <string>
#define MTMD_INTERNAL_HEADER
struct mtmd_audio_mel {
int n_len;
int n_len_org;
int n_mel;
std::vector<float> data;
};
struct mtmd_audio_mel_filters {
int32_t n_mel;
int32_t n_fft;
std::vector<float> data;
};
struct mtmd_audio_cache {
std::vector<float> sin_vals;
std::vector<float> cos_vals;
std::vector<float> hann_window;
mtmd_audio_mel_filters filters;
void fill_sin_cos_table(uint32_t n);
void fill_hann_window(uint32_t length, bool periodic);
void fill_mel_filterbank_matrix(int n_mel,
int n_fft,
int sample_rate, float fmin = 0.0f, float fmax = -1.0f, bool slaney_area_norm = true,
float scale = 1.0f,
bool use_htk = false
);
};
struct mtmd_audio_preprocessor {
const clip_hparams & hparams;
mtmd_audio_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}
virtual ~mtmd_audio_preprocessor() = default;
virtual void initialize() = 0; virtual bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) = 0;
};
struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
void initialize() override;
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
private:
mtmd_audio_cache cache;
};
struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor {
mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
void initialize() override;
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
private:
mtmd_audio_cache cache;
};
struct mtmd_audio_preprocessor_gemma4a : mtmd_audio_preprocessor {
mtmd_audio_preprocessor_gemma4a(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
void initialize() override;
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
private:
mtmd_audio_cache cache;
};
struct mtmd_audio_streaming_istft {
mtmd_audio_streaming_istft(int n_fft, int hop_length);
void reset();
std::vector<float> process_frame(const float * frame_spectrum);
std::vector<float> flush();
private:
int n_fft;
int hop_length;
int n_fft_bins;
mtmd_audio_cache cache;
std::vector<float> overlap_buffer;
std::vector<float> window_sum_buffer;
int padding_to_remove;
std::vector<float> ifft_in;
std::vector<float> ifft_out;
};