#pragma once
#include "ggml.h"
#include "clip-model.h"
#include <vector>
#include <string>
#define MTMD_INTERNAL_HEADER
struct mtmd_image_preprocessor {
const clip_hparams & hparams;
mtmd_image_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}
virtual ~mtmd_image_preprocessor() = default;
virtual bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) = 0;
void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]);
void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst);
};
struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor {
mtmd_image_preprocessor_llava_uhd(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
struct slice_coordinates {
int x;
int y;
clip_image_size size;
};
struct slice_instructions {
clip_image_size overview_size; clip_image_size refined_size; clip_image_size grid_size; std::vector<slice_coordinates> slices;
};
virtual slice_instructions get_slice_instructions(const clip_image_size & original_size);
std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 & img, const slice_instructions & inst, bool overview_first = true);
private:
clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false);
clip_image_size resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max);
clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions);
int ensure_divide(int length, int patch_size);
clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false);
clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio);
};
struct mtmd_image_preprocessor_fixed_size : mtmd_image_preprocessor {
mtmd_image_preprocessor_fixed_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
};
struct mtmd_image_preprocessor_dyn_size : mtmd_image_preprocessor {
mtmd_image_preprocessor_dyn_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
};
struct mtmd_image_preprocessor_longest_edge : mtmd_image_preprocessor {
mtmd_image_preprocessor_longest_edge(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
};
struct mtmd_image_preprocessor_lfm2 : mtmd_image_preprocessor_llava_uhd {
static constexpr int min_tiles = 2;
static constexpr int max_tiles = 10;
static constexpr float max_pixels_tolerance = 2.0f;
static constexpr int tile_size = 512;
using mtmd_image_preprocessor_llava_uhd::mtmd_image_preprocessor_llava_uhd;
slice_instructions get_slice_instructions(const clip_image_size & original_size) override;
private:
clip_image_size find_closest_aspect_ratio(
float aspect_ratio,
const std::vector<clip_image_size> & target_ratios,
int width, int height);
std::vector<clip_image_size> get_target_ratios();
clip_image_size get_grid_layout(int height, int width);
};
struct mtmd_image_preprocessor_idefics3 : mtmd_image_preprocessor_llava_uhd {
mtmd_image_preprocessor_idefics3(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
};
struct mtmd_image_preprocessor_internvl : mtmd_image_preprocessor_llava_uhd {
mtmd_image_preprocessor_internvl(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
};
struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor {
mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
};
struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor {
mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
};