sipp-sys 0.1.0

Native llama.cpp FFI layer for Sipp
// cxx bridge for the native layer. This module contains declarations only:
// every type and function below is implemented on the C++ side, in namespace
// `sipp::sys`, and is expected to be declared by the header `sipp_cxx.h`.
//
// Conventions visible in the signatures:
// * `self: &T` receivers bind to const C++ member functions, while
//   `self: Pin<&mut T>` receivers bind to non-const member functions (cxx
//   requires `Pin` for mutable access to opaque C++ types).
// * A `Result<..>` return type makes cxx catch a C++ exception and hand it back
//   as an `Err`. Functions declared WITHOUT `Result` have no such translation;
//   per the cxx documentation an exception escaping one of them terminates the
//   process.
// * Several mutating calls return a bare `bool` or `i32` instead of `Result`.
//   NOTE(review): the meaning of those values (presumably `true` = success and
//   a status code, respectively) is defined in the C++ code and is not visible
//   here; confirm before relying on it.
#[cxx::bridge(namespace = "sipp::sys")]
pub mod ffi {
    // The `unsafe` keyword on the block is the author's assertion that these
    // signatures match the C++ declarations; in exchange cxx exposes each
    // function as a safe Rust fn.
    unsafe extern "C++" {
        include!("sipp_cxx.h");

        /// Opaque C++ type; Rust only handles it via references or `UniquePtr`.
        type NativeRuntime;
        /// Opaque C++ type; Rust only handles it via references or `UniquePtr`.
        type NativeBatch;
        /// Opaque C++ type; Rust only handles it via references or `UniquePtr`.
        type CommonSampler;

        // ---- Free functions that take no runtime/batch/sampler handle ----
        // NOTE(review): any required call order relative to
        // `load_native_runtime` is not visible from this file; check the C++ side.
        fn backend_init();
        fn backend_load_all();
        fn set_llama_log_quiet(quiet: bool);
        /// Returns an owned `String`; by its name it presumably holds JSON (TODO confirm schema).
        fn backend_observability_json(include_details: bool) -> String;
        fn mtmd_default_marker() -> String;

        // ---- NativeRuntime: construction ----
        /// Creates a runtime and returns ownership of it as a `UniquePtr`.
        ///
        /// `args_json` is passed through as a string; its schema is defined by
        /// the C++ implementation (presumably a JSON object; TODO confirm).
        ///
        /// # Errors
        /// A C++ exception thrown during loading is returned as `Err`.
        fn load_native_runtime(
            model_path: &str,
            args_json: &str,
        ) -> Result<UniquePtr<NativeRuntime>>;

        // ---- NativeRuntime: read-only queries (const member functions) ----
        fn n_ctx(self: &NativeRuntime) -> i32;
        fn n_batch(self: &NativeRuntime) -> i32;
        fn n_ubatch(self: &NativeRuntime) -> i32;
        fn n_seq_max(self: &NativeRuntime) -> i32;
        fn n_threads(self: &NativeRuntime) -> i32;
        fn n_threads_batch(self: &NativeRuntime) -> i32;
        fn n_embd_out(self: &NativeRuntime) -> i32;
        fn n_cls_out(self: &NativeRuntime) -> i32;
        /// Returned as a raw `i32`; presumably an enum discriminant from the
        /// C++ side (TODO confirm the mapping).
        fn pooling_type(self: &NativeRuntime) -> i32;
        fn has_encoder(self: &NativeRuntime) -> bool;
        fn has_decoder(self: &NativeRuntime) -> bool;
        fn has_chat_template(self: &NativeRuntime) -> bool;
        fn is_recurrent(self: &NativeRuntime) -> bool;
        fn is_hybrid(self: &NativeRuntime) -> bool;
        fn kv_unified(self: &NativeRuntime) -> bool;
        // The next four return owned strings rather than typed values; the set
        // of possible values is defined on the C++ side.
        fn flash_attention(self: &NativeRuntime) -> String;
        fn cache_type_k(self: &NativeRuntime) -> String;
        fn cache_type_v(self: &NativeRuntime) -> String;
        fn chat_template_source(self: &NativeRuntime) -> String;
        // Token ids are plain `i32` throughout this bridge.
        fn bos_token(self: &NativeRuntime) -> i32;
        fn eos_token(self: &NativeRuntime) -> i32;
        fn decoder_start_token(self: &NativeRuntime) -> i32;
        fn is_eog(self: &NativeRuntime, token: i32) -> bool;
        fn mtmd_ready(self: &NativeRuntime) -> bool;

        // ---- NativeRuntime: text <-> token conversion (const, fallible) ----
        /// Converts `text` into a vector of `i32` token ids.
        ///
        /// # Errors
        /// A C++ exception is returned as `Err`.
        fn tokenize(
            self: &NativeRuntime,
            text: &str,
            add_special: bool,
            parse_special: bool,
        ) -> Result<Vec<i32>>;
        /// Returns the piece for `token` as a Rust `String`; see
        /// `token_to_piece_bytes_into` for the byte-oriented variant.
        fn token_to_piece(self: &NativeRuntime, token: i32, special: bool) -> Result<String>;
        /// Byte-oriented variant that writes into the caller-supplied `out`
        /// instead of returning a new value.
        ///
        /// NOTE(review): whether `out` is cleared first or appended to is
        /// decided by the C++ implementation; confirm before reusing a buffer.
        fn token_to_piece_bytes_into(
            self: &NativeRuntime,
            token: i32,
            special: bool,
            out: &mut Vec<u8>,
        ) -> Result<()>;
        /// `messages_json` is passed as a string; its expected shape is defined
        /// by the C++ implementation (TODO confirm schema).
        fn apply_chat_template_json(
            self: &NativeRuntime,
            messages_json: &str,
            add_assistant: bool,
        ) -> Result<String>;

        // ---- NativeRuntime: mutating operations (non-const member functions) ----
        /// Takes the runtime mutably and the batch by shared reference.
        /// NOTE(review): the meaning of the returned `i32` is not visible here.
        fn decode(self: Pin<&mut NativeRuntime>, batch: &NativeBatch) -> Result<i32>;
        /// Same shape as `decode`; see the note there about the `i32` result.
        fn encode(self: Pin<&mut NativeRuntime>, batch: &NativeBatch) -> Result<i32>;
        fn synchronize(self: Pin<&mut NativeRuntime>) -> bool;
        // NOTE(review): `p0`/`p1` look like a position range; inclusivity and any
        // sentinel values (e.g. negative = open-ended) must be checked in C++.
        fn clear_sequence(self: Pin<&mut NativeRuntime>, seq_id: i32, p0: i32, p1: i32) -> bool;
        /// Unlike its neighbours this returns nothing, so the bridge gives the
        /// caller no success/failure signal for it.
        fn add_sequence_delta(
            self: Pin<&mut NativeRuntime>,
            seq_id: i32,
            p0: i32,
            p1: i32,
            delta: i32,
        );
        fn embeddings_seq(self: &NativeRuntime, seq_id: i32) -> Result<Vec<f32>>;
        /// Returns a byte vector for `seq_id`; `set_state_seq` takes a byte
        /// slice for a sequence (presumably the inverse operation; TODO confirm).
        fn state_seq(self: &NativeRuntime, seq_id: i32) -> Result<Vec<u8>>;
        fn set_state_seq(self: Pin<&mut NativeRuntime>, seq_id: i32, data: &[u8]) -> bool;

        // ---- NativeRuntime: mtmd-related calls ----
        fn init_mtmd(
            self: Pin<&mut NativeRuntime>,
            projector_path: &str,
            use_gpu: bool,
            n_threads: i32,
        ) -> bool;
        fn mtmd_support_vision(self: &NativeRuntime) -> bool;
        /// Images are passed as two flat slices because the signature has no
        /// nested-slice parameter.
        ///
        /// NOTE(review): presumably `image_bytes` is the concatenation of all
        /// images and `image_sizes` holds each image's byte length in order;
        /// confirm against the C++ implementation. The meaning of the returned
        /// `i32` is likewise not visible here.
        fn mtmd_eval_images(
            self: Pin<&mut NativeRuntime>,
            prompt: &str,
            image_bytes: &[u8],
            image_sizes: &[i32],
            add_special: bool,
            parse_special: bool,
            n_past: i32,
            seq_id: i32,
            n_batch: i32,
            logits_last: bool,
        ) -> Result<i32>;

        // ---- NativeBatch ----
        /// Creates a batch and returns ownership of it as a `UniquePtr`.
        /// Infallible as declared (no `Result`).
        fn make_native_batch() -> UniquePtr<NativeBatch>;
        /// # Errors
        /// A C++ exception is returned as `Err`.
        fn ensure_capacity(
            self: Pin<&mut NativeBatch>,
            max_tokens: i32,
            max_sequences: i32,
        ) -> Result<()>;
        fn reset(self: Pin<&mut NativeBatch>);
        /// Returns `bool` rather than `Result`.
        /// NOTE(review): presumably `false` means the token was not added
        /// (e.g. capacity exhausted); confirm in the C++ code.
        fn add_token(
            self: Pin<&mut NativeBatch>,
            token: i32,
            pos: i32,
            seq_id: i32,
            logits: bool,
        ) -> bool;
        fn n_tokens(self: &NativeBatch) -> i32;
        // Per-entry accessors addressed by `index`. They are declared
        // infallible, so any bounds handling happens on the C++ side.
        // NOTE(review): behaviour for an out-of-range `index` is not visible here.
        fn token(self: &NativeBatch, index: i32) -> i32;
        fn pos(self: &NativeBatch, index: i32) -> i32;
        fn seq_id(self: &NativeBatch, index: i32) -> i32;
        fn logits(self: &NativeBatch, index: i32) -> bool;
        fn clear_logits(self: Pin<&mut NativeBatch>);
        fn set_last_logits(self: Pin<&mut NativeBatch>);

        // ---- CommonSampler ----
        /// Free function (no `self`): creates a sampler from a runtime
        /// reference plus three string arguments and returns ownership of it.
        ///
        /// NOTE(review): how `grammar` and `json_schema` interact, and whether
        /// an empty string means "unset", is decided on the C++ side.
        ///
        /// # Errors
        /// A C++ exception is returned as `Err`.
        fn create_sampler(
            runtime: &NativeRuntime,
            sampling_json: &str,
            grammar: &str,
            json_schema: &str,
        ) -> Result<UniquePtr<CommonSampler>>;
        // The next three are member functions of `CommonSampler`.
        fn sampler_accept(self: Pin<&mut CommonSampler>, token: i32, accept_grammar: bool) -> bool;
        fn sampler_reset(self: Pin<&mut CommonSampler>);
        fn sampler_backend_sampling(self: &CommonSampler) -> bool;
        // The remaining three are FREE functions: the sampler and/or runtime are
        // ordinary named parameters, not `self`, so they are called as
        // `sampler_sample(sampler, runtime, idx)` rather than with method syntax.
        /// Takes both the sampler and the runtime mutably and returns an `i32`.
        /// NOTE(review): presumably the sampled token id; `idx` semantics are
        /// defined in C++ (TODO confirm).
        fn sampler_sample(
            sampler: Pin<&mut CommonSampler>,
            runtime: Pin<&mut NativeRuntime>,
            idx: i32,
        ) -> i32;
        fn sampler_attach(
            sampler: Pin<&mut CommonSampler>,
            runtime: Pin<&mut NativeRuntime>,
            seq_id: i32,
        ) -> bool;
        /// Takes only the runtime and a `seq_id`; no sampler handle is passed.
        fn sampler_detach(runtime: Pin<&mut NativeRuntime>, seq_id: i32) -> bool;
    }
}

pub use ffi::*;