baracuda_flashinfer_sys/lib.rs
//! Raw C-ABI FFI surface for the vendored FlashInfer inference kernels.
//!
//! `baracuda-flashinfer` wraps this with a safe, typed API. Use this
//! crate directly only if you need a function that the safe layer
//! hasn't wrapped yet (in which case please file a bug).
//!
//! FlashInfer (`flashinfer-ai/flashinfer`, Apache-2.0) is header-only /
//! template-heavy, so there is no shared library to dynamically load.
//! Instead, baracuda compiles thin C-ABI launcher shims around the
//! vendored FlashInfer headers inside `baracuda-kernels-sys`; this
//! crate re-exports those `extern "C"` symbols under a dedicated crate
//! name so downstream code can depend on the FlashInfer FFI surface
//! without pulling the whole kernels-sys symbol table into scope.
//!
//! Almost all callers should prefer the safe, typed wrappers in the
//! sibling crate `baracuda-flashinfer` over these raw symbols.
//!
//! # Feature gating
//!
//! Every symbol is behind the `flashinfer` cargo feature (OFF by
//! default), which transitively enables `baracuda-kernels-sys/flashinfer`
//! and compiles the vendored launcher `.cu` files. With the feature
//! off, this crate is empty.
//!
//! # Symbol families
//!
//! All symbols share the `baracuda_kernels_flashinfer_` prefix. Every
//! family exports `*_run` entry points alongside `*_can_implement`
//! symbols; paged decode additionally exports `*_workspace_size`.
//!
//! - `*_paged_decode_*` — batched paged-KV decode
//!   (`BatchDecodeWithPagedKVCacheDispatched`). f16 / bf16 / f32.
//! - `*_paged_kv_append_decode_*` — decode-time KV-cache append.
//!   f16 / bf16 / f32.
//! - `*_merge_state_in_place_*` / `*_merge_states_*` — cascade /
//!   prefix-cache LSE-aware attention-state merge. f16 / bf16 / f32.
//! - `*_top_k_sampling_*` / `*_top_p_sampling_*` /
//!   `*_min_p_sampling_*` / `*_top_k_top_p_sampling_*` — sort-free
//!   sampling from a row-normalized probability tensor. f32 only.

#![no_std]

// Everything re-exported below is compiled and defined in
// `baracuda-kernels-sys`; this crate only forwards it. Symbols keep their raw
// `baracuda_kernels_flashinfer_*` names so the crate remains a faithful `-sys`
// facade. There is one `pub use` per symbol family, and each one is gated on
// the `flashinfer` cargo feature, so the crate exports nothing when the
// feature is disabled.

// Family: paged-KV decode.
#[cfg(feature = "flashinfer")]
pub use baracuda_kernels_sys::{
    baracuda_kernels_flashinfer_paged_decode_bf16_run,
    baracuda_kernels_flashinfer_paged_decode_can_implement,
    baracuda_kernels_flashinfer_paged_decode_f16_run,
    baracuda_kernels_flashinfer_paged_decode_f32_run,
    baracuda_kernels_flashinfer_paged_decode_workspace_size,
};

// Family: paged-KV append at decode time (one token per request).
#[cfg(feature = "flashinfer")]
pub use baracuda_kernels_sys::{
    baracuda_kernels_flashinfer_paged_kv_append_decode_bf16_run,
    baracuda_kernels_flashinfer_paged_kv_append_decode_can_implement,
    baracuda_kernels_flashinfer_paged_kv_append_decode_f16_run,
    baracuda_kernels_flashinfer_paged_kv_append_decode_f32_run,
};

// Family: cascade / prefix-cache state merge (in-place and multi-state forms).
#[cfg(feature = "flashinfer")]
pub use baracuda_kernels_sys::{
    baracuda_kernels_flashinfer_merge_state_in_place_bf16_run,
    baracuda_kernels_flashinfer_merge_state_in_place_can_implement,
    baracuda_kernels_flashinfer_merge_state_in_place_f16_run,
    baracuda_kernels_flashinfer_merge_state_in_place_f32_run,
    baracuda_kernels_flashinfer_merge_states_bf16_run,
    baracuda_kernels_flashinfer_merge_states_can_implement,
    baracuda_kernels_flashinfer_merge_states_f16_run,
    baracuda_kernels_flashinfer_merge_states_f32_run,
};

// Family: sort-free sampling (only f32 variants are exported).
#[cfg(feature = "flashinfer")]
pub use baracuda_kernels_sys::{
    baracuda_kernels_flashinfer_min_p_sampling_f32_can_implement,
    baracuda_kernels_flashinfer_min_p_sampling_f32_run,
    baracuda_kernels_flashinfer_top_k_sampling_f32_can_implement,
    baracuda_kernels_flashinfer_top_k_sampling_f32_run,
    baracuda_kernels_flashinfer_top_k_top_p_sampling_f32_can_implement,
    baracuda_kernels_flashinfer_top_k_top_p_sampling_f32_run,
    baracuda_kernels_flashinfer_top_p_sampling_f32_can_implement,
    baracuda_kernels_flashinfer_top_p_sampling_f32_run,
};