rlx_text/tokenizer.rs
1// RLX — versatile ML compiler + runtime.
2// Copyright (C) 2026 Eugene Hauptmann, Nataliya Kosmyna.
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, version 3.
7//
8// This program is distributed in the hope that it will be useful,
9// but WITHOUT ANY WARRANTY; without even the implied warranty of
10// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11// GNU General Public License for more details.
12//
13// You should have received a copy of the GNU General Public License
14// along with this program. If not, see <https://www.gnu.org/licenses/>.
15
16//! Thin wrapper around the `tokenizers` crate.
17//!
18//! Many `rlx-<family>` runners import `tokenizers::Tokenizer` directly
19//! and re-implement the same "load from path → encode → decode" dance.
20//! This module collapses that to one entry point.
21
22use anyhow::{Context, Result};
23use std::path::Path;
24
25pub use tokenizers::Tokenizer as RawTokenizer;
26
27/// Owned tokenizer handle. Use [`load_tokenizer`] to construct.
28pub struct TokenizerHandle {
29 inner: RawTokenizer,
30}
31
32impl TokenizerHandle {
33 pub fn from_raw(raw: RawTokenizer) -> Self {
34 Self { inner: raw }
35 }
36
37 pub fn raw(&self) -> &RawTokenizer {
38 &self.inner
39 }
40
41 pub fn raw_mut(&mut self) -> &mut RawTokenizer {
42 &mut self.inner
43 }
44
45 /// Encode `text` and return token ids.
46 pub fn encode(&self, text: &str, add_special: bool) -> Result<Vec<u32>> {
47 let enc = self
48 .inner
49 .encode(text, add_special)
50 .map_err(|e| anyhow::anyhow!("tokenizer encode: {e}"))?;
51 Ok(enc.get_ids().to_vec())
52 }
53
54 /// Decode ids back to a string.
55 pub fn decode(&self, ids: &[u32], skip_special: bool) -> Result<String> {
56 self.inner
57 .decode(ids, skip_special)
58 .map_err(|e| anyhow::anyhow!("tokenizer decode: {e}"))
59 }
60}
61
62/// Load a HF-format `tokenizer.json` from disk.
63pub fn load_tokenizer(path: &Path) -> Result<TokenizerHandle> {
64 let raw = RawTokenizer::from_file(path)
65 .map_err(|e| anyhow::anyhow!("loading tokenizer at {path:?}: {e}"))
66 .with_context(|| format!("tokenizer.json at {path:?}"))?;
67 Ok(TokenizerHandle::from_raw(raw))
68}
69
70/// Standalone decode helper (no handle needed).
71pub fn decode_ids(t: &TokenizerHandle, ids: &[u32], skip_special: bool) -> Result<String> {
72 t.decode(ids, skip_special)
73}