Skip to main content

rlx_text/
tokenizer.rs

1// RLX — versatile ML compiler + runtime.
2// Copyright (C) 2026 Eugene Hauptmann, Nataliya Kosmyna.
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, version 3.
7//
8// This program is distributed in the hope that it will be useful,
9// but WITHOUT ANY WARRANTY; without even the implied warranty of
10// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11// GNU General Public License for more details.
12//
13// You should have received a copy of the GNU General Public License
14// along with this program. If not, see <https://www.gnu.org/licenses/>.
15
16//! Thin wrapper around the `tokenizers` crate.
17//!
18//! Many `rlx-<family>` runners import `tokenizers::Tokenizer` directly
19//! and re-implement the same "load from path → encode → decode" dance.
20//! This module collapses that to one entry point.
21
22use anyhow::{Context, Result};
23use std::path::Path;
24
25pub use tokenizers::Tokenizer as RawTokenizer;
26
27/// Owned tokenizer handle. Use [`load_tokenizer`] to construct.
28pub struct TokenizerHandle {
29    inner: RawTokenizer,
30}
31
32impl TokenizerHandle {
33    pub fn from_raw(raw: RawTokenizer) -> Self {
34        Self { inner: raw }
35    }
36
37    pub fn raw(&self) -> &RawTokenizer {
38        &self.inner
39    }
40
41    pub fn raw_mut(&mut self) -> &mut RawTokenizer {
42        &mut self.inner
43    }
44
45    /// Encode `text` and return token ids.
46    pub fn encode(&self, text: &str, add_special: bool) -> Result<Vec<u32>> {
47        let enc = self
48            .inner
49            .encode(text, add_special)
50            .map_err(|e| anyhow::anyhow!("tokenizer encode: {e}"))?;
51        Ok(enc.get_ids().to_vec())
52    }
53
54    /// Decode ids back to a string.
55    pub fn decode(&self, ids: &[u32], skip_special: bool) -> Result<String> {
56        self.inner
57            .decode(ids, skip_special)
58            .map_err(|e| anyhow::anyhow!("tokenizer decode: {e}"))
59    }
60}
61
62/// Load a HF-format `tokenizer.json` from disk.
63pub fn load_tokenizer(path: &Path) -> Result<TokenizerHandle> {
64    let raw = RawTokenizer::from_file(path)
65        .map_err(|e| anyhow::anyhow!("loading tokenizer at {path:?}: {e}"))
66        .with_context(|| format!("tokenizer.json at {path:?}"))?;
67    Ok(TokenizerHandle::from_raw(raw))
68}
69
70/// Standalone decode helper (no handle needed).
71pub fn decode_ids(t: &TokenizerHandle, ids: &[u32], skip_special: bool) -> Result<String> {
72    t.decode(ids, skip_special)
73}