// oxibonsai_tokenizer/error.rs
//! Error types for the OxiBonsai tokenizer.

use thiserror::Error;

5/// All errors that can occur during tokenization operations.
6///
7/// This enum is marked `#[non_exhaustive]` so that new variants can be added
8/// in future minor releases without a breaking semver change.  Consumers must
9/// always include a catch-all arm when matching on [`TokenizerError`].
10#[derive(Debug, Error, Clone, PartialEq, Eq)]
11#[non_exhaustive]
12pub enum TokenizerError {
13    /// A token string was not found in the vocabulary.
14    #[error("unknown token: {0:?}")]
15    UnknownToken(String),
16
17    /// The vocabulary data is malformed or inconsistent.
18    #[error("invalid vocabulary: {0}")]
19    InvalidVocab(String),
20
21    /// Encoding of input text failed.
22    #[error("encode failed: {0}")]
23    EncodeFailed(String),
24
25    /// Decoding of token IDs failed.
26    #[error("decode failed: {0}")]
27    DecodeFailed(String),
28
29    /// JSON deserialization failed.
30    #[error("invalid JSON: {0}")]
31    InvalidJson(String),
32
33    /// A HuggingFace `tokenizer.json` file could not be parsed or interpreted.
34    ///
35    /// Includes missing required fields (`model`, `vocab`, `merges`), unsupported
36    /// BPE types, and malformed merge entries.
37    #[error("HF tokenizer format error: {0}")]
38    HfFormat(String),
39
40    /// A streaming decoder received token IDs that together do not form a
41    /// complete UTF-8 sequence and further bytes are required to finish.
42    ///
43    /// This variant is primarily returned by [`crate::streaming::StreamingDecoder::finish`]
44    /// when the stream ends mid-character.
45    #[error("incomplete UTF-8 sequence at end of stream")]
46    IncompleteUtf8,
47
48    /// Rendering a chat-template failed (missing variable, bad syntax, ...).
49    #[error("template render failed: {0}")]
50    TemplateRender(String),
51
52    /// An underlying I/O operation (file read, etc.) failed.
53    ///
54    /// We wrap the `io::Error` as a `String` so that `TokenizerError` can
55    /// continue to derive `Clone, PartialEq, Eq` — `std::io::Error` itself
56    /// does not implement those traits.
57    #[error("I/O error: {0}")]
58    Io(String),
59}
60
61impl From<std::io::Error> for TokenizerError {
62    fn from(err: std::io::Error) -> Self {
63        Self::Io(err.to_string())
64    }
65}
66
67/// Convenience result alias for tokenizer operations.
68pub type TokenizerResult<T> = Result<T, TokenizerError>;
69
#[cfg(test)]
mod tests {
    use super::*;

    /// The `Display` output must include the offending token text.
    #[test]
    fn display_unknown_token() {
        let e = TokenizerError::UnknownToken("foo".to_owned());
        let s = format!("{e}");
        assert!(s.contains("foo"));
    }

    /// The `Display` output must include both the detail and the "HF" prefix.
    #[test]
    fn display_hf_format() {
        let e = TokenizerError::HfFormat("bad merges".to_owned());
        let s = format!("{e}");
        assert!(s.contains("bad merges"));
        assert!(s.contains("HF"));
    }

    /// The unit variant's message must mention UTF-8 (case-insensitively).
    #[test]
    fn display_incomplete_utf8() {
        let e = TokenizerError::IncompleteUtf8;
        let s = format!("{e}");
        assert!(s.to_ascii_lowercase().contains("utf-8"));
    }

    #[test]
    fn display_template_render() {
        let e = TokenizerError::TemplateRender("no such var".to_owned());
        let s = format!("{e}");
        assert!(s.contains("no such var"));
    }

    /// `From<io::Error>` must map to the `Io` variant and keep the message.
    #[test]
    fn io_error_conversion_preserves_message() {
        let io_err = std::io::Error::new(std::io::ErrorKind::NotFound, "missing");
        let tok_err: TokenizerError = io_err.into();
        match tok_err {
            TokenizerError::Io(msg) => assert!(msg.contains("missing")),
            other => panic!("expected Io variant, got {other:?}"),
        }
    }

    /// Sanity-check the derived `Clone` + `PartialEq` impls.
    #[test]
    fn tokenizer_error_is_clone() {
        let e = TokenizerError::InvalidVocab("oops".to_owned());
        let c = e.clone();
        assert_eq!(e, c);
    }

    /// Equality is structural: same variant + same payload.
    #[test]
    fn tokenizer_error_equality() {
        let a = TokenizerError::EncodeFailed("x".to_owned());
        let b = TokenizerError::EncodeFailed("x".to_owned());
        let c = TokenizerError::EncodeFailed("y".to_owned());
        assert_eq!(a, b);
        assert_ne!(a, c);
    }
}