// mlxrs 0.1.0 - Docs.rs

//! Tokenizer support — granular, capability-named feature gates.
//!
//! The public module API is composed from independently selectable,
//! minimal-dependency features (named `tokenizer` or `tokenizer-*`; the
//! umbrella features `embeddings` / `lm` / `vlm` / `audio` only combine them):
//!
//! - `tokenizer` — [`Tokenizer`](crate::tokenizer::Tokenizer) load +
//!   `encode`/`decode` + special tokens read from `tokenizer.json`. Pulls
//!   **only** the `tokenizers` crate (no `serde_json`, no `minijinja`).
//! - `tokenizer-config` — parse `tokenizer_config.json` (bos/eos/unk,
//!   `chat_template`, added tokens). Adds `serde_json`.
//! - `tokenizer-stream` — `StreamingDetokenizer` +
//!   `NaiveStreamingDetokenizer` (no `serde_json`).
//! - `tokenizer-gpt2` — GPT-2 bytes↔unicode table (committed
//!   `cargo xtask-codegen` artifact).
//! - `tokenizer-bpe` — GPT-2 byte-level streaming detok + decoder-class
//!   inference (pulls `tokenizer-stream` + `tokenizer-gpt2`).
//! - `tokenizer-spm` — SentencePiece (+no-space) streaming detok +
//!   decoder-class inference.
//! - `tokenizer-chat` — jinja `apply_chat_template` (pulls `tokenizer-config`).
//! - `tokenizer-deepseek-v32` — the one shipped chat-template override.
//! - `tokenizer-tools` — tool-call parsers + `infer_tool_parser`.
//!
//! When a model's `tokenizer.json` decoder wants the SPM or BPE streaming
//! detokenizer but that feature is disabled, the
//! [`Tokenizer::detokenizer`](crate::tokenizer::Tokenizer::detokenizer)
//! factory falls back to the naive detokenizer (and warns once) — it never
//! panics or hard-errors.
//!
//! A faithful Rust port of `mlx-lm`'s tokenizer surface, cross-referenced
//! against `mlx-swift-lm`'s `MLXLMCommon` abstractions. Intentionally **not**
//! ported: the Python `NewlineTokenizer` + `AutoTokenizer.register(...)` —
//! model-specific tokenizer registration is per-model architecture and out of
//! scope. Loading is local-path only; no Hugging Face Hub network download.

// Chat-template support. Per the module docs above, the `tokenizer-chat`
// feature provides jinja `apply_chat_template`; this module is compiled only
// when that feature is enabled.
#[cfg(feature = "tokenizer-chat")]
#[cfg_attr(docsrs, doc(cfg(feature = "tokenizer-chat")))]
pub mod chat;
// Always compiled (no feature gate here). Home of `EncodeOptions` /
// `Encoded`, which are re-exported at this module's root below.
pub mod encode_options;
/// Committed codegen tables (`@generated by cargo xtask-codegen` from
/// `mlxrs/data/tokenizer/`). Replaces the old `build.rs` + `OUT_DIR`
/// `include!`s so a normal build never compiles `tokenizers`/`serde_json`/
/// `toml`. Only compiled when a consuming capability feature is enabled.
// Private module: the `any(...)` list below enumerates the features that
// consume the generated tables, so adding a new consumer means extending it.
#[cfg(any(
  feature = "tokenizer-gpt2",
  feature = "tokenizer-bpe",
  feature = "tokenizer-tools",
  feature = "tokenizer-deepseek-v32"
))]
mod generated;
/// SentencePiece Unigram / BPE tokenizer (protobuf reader + Viterbi
/// lattice + byte-fallback). Standalone, self-contained — uses neither
/// the `tokenizers` crate nor [`wrapper::Tokenizer`]; loads from a raw
/// `*.model` protobuf or the JSON `tokenizer.json` model subtree (with
/// `tokenizer-config`). Gated under `audio` for now — the first caller
/// is `crate::audio::stt::streaming`; promote to a standalone feature
/// when a non-audio caller needs it.
#[cfg(feature = "audio")]
#[cfg_attr(docsrs, doc(cfg(feature = "audio")))]
pub mod sentencepiece;
// Streaming detokenizers. Compiled only with `tokenizer-stream`; the
// BPE/SPM detokenizer re-exports further down carry their own, narrower
// feature gates on top of this one.
#[cfg(feature = "tokenizer-stream")]
#[cfg_attr(docsrs, doc(cfg(feature = "tokenizer-stream")))]
pub mod stream;
// Tool-call parsing (parsers + `infer_tool_parser`, per the module docs
// above). Compiled only with `tokenizer-tools`.
#[cfg(feature = "tokenizer-tools")]
#[cfg_attr(docsrs, doc(cfg(feature = "tokenizer-tools")))]
pub mod tools;
// Always compiled (no feature gate here). Provides `Tokenizer` and
// `no_bos_or_eos`, re-exported at this module's root below.
// NOTE(review): the module docs attribute `Tokenizer` to the `tokenizer`
// feature — presumably the parent gates this whole module on it; confirm.
pub mod wrapper;

/// Re-export the `encode_options` types at the module root. Not feature
/// gated, so they are available whenever this module is compiled.
pub use encode_options::{EncodeOptions, Encoded};
/// Re-export the SPM Unigram/BPE tokenizer top-level surface — gated
/// under `audio` for now (see [`sentencepiece`] for the rationale).
#[cfg(feature = "audio")]
#[cfg_attr(docsrs, doc(cfg(feature = "audio")))]
pub use sentencepiece::{
  SentencePieceModelType, SentencePiecePieceType, SentencePieceToken, SentencePieceTokenizer,
};
/// BPE streaming detokenizer — gated on its own `tokenizer-bpe` feature
/// (the naive detokenizer + trait come with bare `tokenizer-stream`).
#[cfg(feature = "tokenizer-bpe")]
#[cfg_attr(docsrs, doc(cfg(feature = "tokenizer-bpe")))]
pub use stream::BpeStreamingDetokenizer;
/// SPM streaming detokenizer — gated on its own `tokenizer-spm` feature
/// (the naive detokenizer + trait come with bare `tokenizer-stream`).
// NOTE(review): `stream` itself only exists under `tokenizer-stream`. The
// module docs state that `tokenizer-bpe` pulls `tokenizer-stream`, but say
// nothing for `tokenizer-spm` — confirm in Cargo.toml that it does too,
// otherwise this path fails to resolve with `tokenizer-spm` alone.
#[cfg(feature = "tokenizer-spm")]
#[cfg_attr(docsrs, doc(cfg(feature = "tokenizer-spm")))]
pub use stream::SpmStreamingDetokenizer;
/// Decoder-class inference needs `serde_json` (parses the `decoder` node), so
/// it is available only when `tokenizer-spm` or `tokenizer-bpe` is enabled.
#[cfg(any(feature = "tokenizer-spm", feature = "tokenizer-bpe"))]
#[cfg_attr(
  docsrs,
  doc(cfg(any(feature = "tokenizer-spm", feature = "tokenizer-bpe")))
)]
pub use stream::infer_detokenizer_class;
/// Baseline streaming-detokenizer surface: everything from `stream` that
/// needs only bare `tokenizer-stream` (no SPM/BPE feature required).
#[cfg(feature = "tokenizer-stream")]
#[cfg_attr(docsrs, doc(cfg(feature = "tokenizer-stream")))]
pub use stream::{
  Detokenizer, DetokenizerClass, NaiveHfDetokenizer, NaiveStreamingDetokenizer,
  StreamingDetokenizer,
};
/// Tool-call parsing (`tokenizer-tools`) — the per-format parsers, the
/// streaming [`tools::ToolCallProcessor`], and the selectors.
#[cfg(feature = "tokenizer-tools")]
#[cfg_attr(docsrs, doc(cfg(feature = "tokenizer-tools")))]
pub use tools::{ToolCall, ToolCallProcessor, ToolParser, infer_tool_parser, parser_by_name};
/// Re-export the `wrapper` surface at the module root. Not feature gated
/// here, so it is available whenever this module is compiled.
pub use wrapper::{Tokenizer, no_bos_or_eos};