Skip to main content

baracuda_flashinfer_sys/
lib.rs

1//! Raw C-ABI FFI surface for the vendored FlashInfer inference kernels.
2//!
3//! `baracuda-flashinfer` wraps this with a safe, typed API. Use this
4//! crate directly only if you need a function that the safe layer
5//! hasn't wrapped yet (in which case please file a bug).
6//!
//! FlashInfer (`flashinfer-ai/flashinfer`, Apache-2.0) is header-only /
//! template-heavy, so there is no shared library to dynamically load.
//! Instead, baracuda compiles thin C-ABI launcher shims around the
//! vendored FlashInfer headers inside `baracuda-kernels-sys`; this
//! crate re-exports those `extern "C"` symbols under a dedicated crate
//! name so downstream code can depend on the FlashInfer FFI surface
//! without pulling the whole kernels-sys symbol table into scope.
14//!
//! Almost all callers should prefer the safe, typed wrappers in
//! `baracuda-flashinfer` (the sibling crate) over these raw symbols.
17//!
18//! # Feature gating
19//!
20//! Every symbol is behind the `flashinfer` cargo feature (OFF by
21//! default), which transitively enables `baracuda-kernels-sys/flashinfer`
22//! and compiles the vendored launcher `.cu` files. With the feature
23//! off, this crate is empty.
24//!
25//! # Symbol families
26//!
27//! - `*_paged_decode_*` — batched paged-KV decode
28//!   (`BatchDecodeWithPagedKVCacheDispatched`). f16 / bf16 / f32.
29//! - `*_paged_kv_append_decode_*` — decode-time KV-cache append.
30//! - `*_merge_state_in_place_*` / `*_merge_states_*` — cascade /
31//!   prefix-cache LSE-aware attention-state merge.
32//! - `*_top_k_sampling_*` / `*_top_p_sampling_*` /
33//!   `*_min_p_sampling_*` / `*_top_k_top_p_sampling_*` — sort-free
34//!   sampling from a row-normalized probability tensor.
35
36#![no_std]
37
// FlashInfer's raw C-ABI entry points are compiled and defined in
// `baracuda-kernels-sys`. They are re-exported below under their
// original names, so this crate stays an honest `-sys` facade. One
// `pub use` per symbol family keeps each group easy to scan.

// Paged-KV decode.
#[cfg(feature = "flashinfer")]
pub use baracuda_kernels_sys::{
    baracuda_kernels_flashinfer_paged_decode_bf16_run,
    baracuda_kernels_flashinfer_paged_decode_can_implement,
    baracuda_kernels_flashinfer_paged_decode_f16_run,
    baracuda_kernels_flashinfer_paged_decode_f32_run,
    baracuda_kernels_flashinfer_paged_decode_workspace_size,
};

// Paged-KV append (decode-time, one token per request).
#[cfg(feature = "flashinfer")]
pub use baracuda_kernels_sys::{
    baracuda_kernels_flashinfer_paged_kv_append_decode_bf16_run,
    baracuda_kernels_flashinfer_paged_kv_append_decode_can_implement,
    baracuda_kernels_flashinfer_paged_kv_append_decode_f16_run,
    baracuda_kernels_flashinfer_paged_kv_append_decode_f32_run,
};

// Cascade / prefix-cache state merge.
#[cfg(feature = "flashinfer")]
pub use baracuda_kernels_sys::{
    baracuda_kernels_flashinfer_merge_state_in_place_bf16_run,
    baracuda_kernels_flashinfer_merge_state_in_place_can_implement,
    baracuda_kernels_flashinfer_merge_state_in_place_f16_run,
    baracuda_kernels_flashinfer_merge_state_in_place_f32_run,
    baracuda_kernels_flashinfer_merge_states_bf16_run,
    baracuda_kernels_flashinfer_merge_states_can_implement,
    baracuda_kernels_flashinfer_merge_states_f16_run,
    baracuda_kernels_flashinfer_merge_states_f32_run,
};

// Sort-free sampling.
#[cfg(feature = "flashinfer")]
pub use baracuda_kernels_sys::{
    baracuda_kernels_flashinfer_min_p_sampling_f32_can_implement,
    baracuda_kernels_flashinfer_min_p_sampling_f32_run,
    baracuda_kernels_flashinfer_top_k_sampling_f32_can_implement,
    baracuda_kernels_flashinfer_top_k_sampling_f32_run,
    baracuda_kernels_flashinfer_top_k_top_p_sampling_f32_can_implement,
    baracuda_kernels_flashinfer_top_k_top_p_sampling_f32_run,
    baracuda_kernels_flashinfer_top_p_sampling_f32_can_implement,
    baracuda_kernels_flashinfer_top_p_sampling_f32_run,
};