use std::sync::Mutex;
/// Process-wide tokenizer fallback flag, initialised to `false`.
///
/// Read through [`tokenizer_fallback_enabled`] and written through
/// [`set_tokenizer_fallback`]; both take the mutex for the duration of the
/// access.
static TOKENIZER_FALLBACK: Mutex<bool> = Mutex::new(false);
/// Reports whether the tokenizer fallback flag is currently switched on.
///
/// # Panics
///
/// Panics if the mutex protecting the flag has been poisoned.
pub fn tokenizer_fallback_enabled() -> bool {
    let flag = TOKENIZER_FALLBACK
        .lock()
        .expect("tokenizer_fallback flag mutex poisoned");
    *flag
}
/// Stores `enabled` in the tokenizer fallback flag and hands back the value
/// the flag held immediately before the write.
///
/// The read and the write happen under a single lock acquisition, so the
/// returned value is exactly the one that was replaced.
///
/// # Panics
///
/// Panics if the mutex protecting the flag has been poisoned.
pub fn set_tokenizer_fallback(enabled: bool) -> bool {
    let mut flag = TOKENIZER_FALLBACK
        .lock()
        .expect("tokenizer_fallback flag mutex poisoned");
    // Swap in the new value and return the old one in a single step.
    std::mem::replace(&mut *flag, enabled)
}
/// Scope guard for the tokenizer fallback flag.
///
/// Created by [`TokenizerFallbackGuard::set`], which overrides the flag and
/// records the value it replaced. When the guard is dropped, that recorded
/// value is written back to the flag.
///
/// The guard must be bound to a named variable (`let _guard = ...`): an
/// unbound guard is dropped at the end of the statement, which undoes the
/// override immediately.
#[derive(Debug)]
#[must_use = "the flag is restored as soon as the guard is dropped; bind it to a variable"]
pub struct TokenizerFallbackGuard {
    /// Flag value that was in effect before this guard overrode it.
    previous: bool,
}
impl TokenizerFallbackGuard {
    /// Overrides the tokenizer fallback flag with `enabled` and returns a
    /// guard that remembers the value the flag held beforehand.
    pub fn set(enabled: bool) -> Self {
        Self {
            // The setter returns the value it replaced, which is exactly
            // what the guard needs to remember.
            previous: set_tokenizer_fallback(enabled),
        }
    }
}
impl Drop for TokenizerFallbackGuard {
    /// Writes the remembered value back into the tokenizer fallback flag.
    fn drop(&mut self) {
        let remembered = self.previous;
        // The value being replaced is of no interest here.
        let _ = set_tokenizer_fallback(remembered);
    }
}
/// Splits `text` into token-level string chunks using the `cl100k_base`
/// tokenizer from `axon_csys::tokens`.
///
/// Each token id is decoded to bytes on its own. When a token's bytes are
/// valid UTF-8 by themselves they become one chunk. When a token ends in the
/// middle of a multi-byte character, its bytes are held back and merged with
/// the following token(s) until the buffered bytes form valid UTF-8, so a
/// character split across tokens is emitted intact instead of as U+FFFD
/// replacement characters.
///
/// # Returns
///
/// The non-empty chunks in token order. An empty vector is returned when
/// `text` is empty, when the tokenizer cannot be obtained, or when encoding
/// fails. Token ids that fail to decode are skipped.
pub fn bpe_chunk_text(text: &str) -> Vec<String> {
    if text.is_empty() {
        return Vec::new();
    }
    let Ok(tokenizer) = axon_csys::tokens::cl100k_base() else {
        return Vec::new();
    };
    let Ok(token_ids) = tokenizer.encode_ordinary(text) else {
        return Vec::new();
    };

    let mut chunks = Vec::with_capacity(token_ids.len());
    // Bytes of tokens seen so far that do not yet form complete UTF-8.
    let mut pending: Vec<u8> = Vec::new();

    for id in &token_ids {
        let Ok(bytes) = tokenizer.decode_bytes(&[*id]) else {
            continue;
        };
        pending.extend_from_slice(&bytes);

        let flushed = match std::str::from_utf8(&pending) {
            Ok(piece) => {
                if !piece.is_empty() {
                    chunks.push(piece.to_owned());
                }
                true
            }
            // `error_len() == None` means the buffer merely ends inside a
            // multi-byte character: wait for the next token to complete it.
            Err(err) if err.error_len().is_none() => false,
            // Genuinely invalid bytes cannot be repaired by further input;
            // emit them lossily rather than stalling every later chunk.
            Err(_) => {
                chunks.push(String::from_utf8_lossy(&pending).into_owned());
                true
            }
        };
        if flushed {
            pending.clear();
        }
    }

    // A dangling partial character at the end is still surfaced, lossily.
    if !pending.is_empty() {
        chunks.push(String::from_utf8_lossy(&pending).into_owned());
    }
    chunks
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Serialises the tests that read or write the shared fallback flag.
    static FLAG_TEST_LOCK: Mutex<()> = Mutex::new(());

    /// Acquires [`FLAG_TEST_LOCK`], reclaiming it if a previous holder
    /// panicked and left it poisoned.
    fn serialize_flag_tests() -> std::sync::MutexGuard<'static, ()> {
        match FLAG_TEST_LOCK.lock() {
            Ok(guard) => guard,
            Err(poisoned) => poisoned.into_inner(),
        }
    }

    /// After an explicit reset the flag reads as off.
    #[test]
    fn flag_default_is_off() {
        let _serial = serialize_flag_tests();
        set_tokenizer_fallback(false);
        assert!(!tokenizer_fallback_enabled());
    }

    /// The setter reports the value it replaced.
    #[test]
    fn set_returns_previous_value() {
        let _serial = serialize_flag_tests();
        set_tokenizer_fallback(false);

        let before_enable = set_tokenizer_fallback(true);
        assert!(!before_enable);

        let before_disable = set_tokenizer_fallback(false);
        assert!(before_disable);
    }

    /// Dropping the guard undoes the override.
    #[test]
    fn guard_restores_flag_on_drop() {
        let _serial = serialize_flag_tests();
        set_tokenizer_fallback(false);

        let scoped = TokenizerFallbackGuard::set(true);
        assert!(tokenizer_fallback_enabled());
        drop(scoped);

        assert!(!tokenizer_fallback_enabled(), "guard must restore on drop");
    }

    /// The guard restores whatever was set before it, not a fixed default.
    #[test]
    fn guard_restores_to_previous_not_default() {
        let _serial = serialize_flag_tests();
        set_tokenizer_fallback(true);

        let scoped = TokenizerFallbackGuard::set(false);
        assert!(!tokenizer_fallback_enabled());
        drop(scoped);

        assert!(
            tokenizer_fallback_enabled(),
            "guard restores to PREVIOUS (true), not default (false)"
        );
        // Leave the flag off for whichever test runs next.
        set_tokenizer_fallback(false);
    }

    /// Empty input yields no chunks.
    #[test]
    fn bpe_chunk_empty_text_returns_empty_vec() {
        assert!(bpe_chunk_text("").is_empty());
    }

    /// A short English phrase is chunked and reassembles exactly.
    #[test]
    fn bpe_chunk_english_produces_token_level_granularity() {
        let pieces = bpe_chunk_text("Hello world");
        assert!(
            !pieces.is_empty(),
            "BPE on 'Hello world' must produce ≥1 chunk"
        );
        let rebuilt: String = pieces.join("");
        assert_eq!(rebuilt, "Hello world");
    }

    /// BPE chunking is finer-grained than grouping words in threes.
    #[test]
    fn bpe_chunk_finer_than_whitespace_for_long_text() {
        let text = "The quick brown fox jumps over the lazy dog repeatedly.";
        let pieces = bpe_chunk_text(text);
        let triple_word_groups = text.split_whitespace().count().div_ceil(3);
        assert!(
            pieces.len() > triple_word_groups,
            "BPE ({}) must be finer than whitespace chunks-of-3 ({})",
            pieces.len(),
            triple_word_groups
        );
        let rebuilt: String = pieces.join("");
        assert_eq!(rebuilt, text);
    }

    /// Concatenating the chunks reproduces the input, punctuation included.
    #[test]
    fn bpe_chunk_round_trip_preserves_content() {
        let text = "axon for axon — four-pillar streaming language.";
        let rebuilt: String = bpe_chunk_text(text).join("");
        assert_eq!(rebuilt, text);
    }
}