// oxibonsai_tokenizer/lib.rs
//! # oxibonsai-tokenizer
//!
//! Pure Rust BPE tokenizer for OxiBonsai — MeCrab-compatible, WASM-safe.
//!
//! This crate is a **production-ready** BPE implementation that can load
//! HuggingFace `tokenizer.json` files (Qwen3, Llama-3, Mistral, Gemma, ...)
//! directly without pulling in the `tokenizers` crate. Features:
//!
//! - [`OxiTokenizer`] — high-level encode/decode API
//! - [`Vocabulary`] — bidirectional token ↔ ID mapping with special-token support
//! - [`BpeMerges`] — ordered BPE merge table
//! - [`bpe_encode`] / [`pretokenize`] — core BPE primitives
//! - [`byte_fallback_id`] — `<0xHH>` byte-fallback helper
//! - [`TokenizerError`] / [`TokenizerResult`] — error types
//! - [`hf_format::HfTokenizerJson`] — HuggingFace `tokenizer.json` parser
//! - [`streaming::StreamingDecoder`] — UTF-8-safe streaming decoder
//! - [`chat_templates::ChatTemplateKind`] — canned templates for ChatML,
//!   Llama-3, Mistral, Gemma and Qwen
//!
//! ## Quick start (character-level mode — no trained vocab required)
//!
//! ```rust
//! use oxibonsai_tokenizer::OxiTokenizer;
//!
//! let tok = OxiTokenizer::char_level_stub(256);
//! let ids = tok.encode("Hello!").expect("encode should succeed");
//! assert!(!ids.is_empty());
//! ```
//!
//! ## Loading from JSON vocab + merges
//!
//! ```rust
//! use oxibonsai_tokenizer::{OxiTokenizer, TokenizerConfig};
//!
//! let vocab_json = r#"{"a":10,"b":11,"ab":20,"<unk>":0,"<bos>":1,"<eos>":2,"<pad>":3}"#;
//! let merges_json = r#"[["a","b"]]"#;
//! let tok = OxiTokenizer::from_json(vocab_json, merges_json, TokenizerConfig::default())
//!     .expect("loading should succeed");
//! assert_eq!(tok.vocab_size(), 7);
//! ```
//!
//! ## Loading from a HuggingFace `tokenizer.json`
//!
//! ```no_run
//! use oxibonsai_tokenizer::OxiTokenizer;
//!
//! let tok = OxiTokenizer::from_json_file("tokenizer.json")
//!     .expect("HF tokenizer should load");
//! let ids = tok.encode("Hello!").expect("encode");
//! let text = tok.decode(&ids).expect("decode");
//! assert_eq!(text, "Hello!");
//! ```

54pub mod bpe;
55pub mod chat_templates;
56pub mod error;
57pub mod hf_format;
58pub mod serialization;
59pub mod streaming;
60pub mod tests;
61pub mod tokenizer;
62pub mod trainer;
63pub mod unigram;
64pub mod utils;
65pub mod vocab;
66pub mod wordpiece;
67
68// Re-export the most commonly used types at the crate root.
69pub use bpe::{bpe_encode, byte_fallback_id, pretokenize, BpeMerges};
70pub use chat_templates::{ChatMessage, ChatTemplateKind};
71pub use error::{TokenizerError, TokenizerResult};
72pub use hf_format::{
73 byte_to_unicode, bytes_to_unicode_map, unicode_to_byte, HfModelType, HfTokenizerJson,
74};
75pub use serialization::{
76 base64_decode, base64_encode, SerializationError, TokenizerState, FORMAT_MAGIC,
77};
78pub use streaming::StreamingDecoder;
79pub use tokenizer::{OxiTokenizer, TokenizerConfig};
80pub use trainer::{
81 BpeTrainer, MergeRule, SymbolPair, TrainedTokenizer, TrainerConfig, TrainerError, TrainingStats,
82};
83pub use unigram::{UnigramError, UnigramVocab};
84pub use vocab::Vocabulary;
85pub use wordpiece::{WordPieceError, WordPieceVocab, WORDPIECE_CONTINUATION_PREFIX};