// mistralrs-core 0.8.1
// Fast, flexible LLM inference.
//! Multi-head Latent Attention (MLA) module for efficient decode kernels.
//!
//! This module provides shared infrastructure for MLA-based attention mechanisms
//! used in models like DeepSeek V2/V3 and GLM4 MoE Lite.

// Private submodules: code outside this module cannot name `forward` or
// `weights` directly and must go through the re-exports below.
mod forward;
mod weights;

// Public surface re-exported from `forward`.
// NOTE(review): judging by the names only, the `should_use_*` items look like
// checks that gate the matching `mla_*_forward` paths — confirm in `forward`.
pub use forward::{
    mla_cache_forward, mla_decode_forward, should_use_mla_cache, should_use_mla_decode,
};
// Public surface re-exported from `weights`.
pub use weights::MlaWeights;