//! Tokenizers: a whitespace-level `Tokenizer`, a `BPETokenizer` driven by
//! ordered merge rules, and a unigram-style `SentencePieceTokenizer` that
//! segments text with a Viterbi search.

use std::collections::HashMap;

use crate::error::{RealizarError, Result};
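/// Bidirectional mapping between tokens and their integer `u32` IDs.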
#[derive(Debug, Clone)]
pub struct Vocabulary {
token_to_id: HashMap<String, u32>,
id_to_token: HashMap<u32, String>,
}
impl Vocabulary {
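/// Builds a vocabulary from an ordered token list; IDs are assigned by
/// position.
///
/// # Errors
///
/// Returns `RealizarError::UnsupportedOperation` if `tokens` is empty,
/// contains duplicates, or has more entries than `u32` can index.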
pub fn from_tokens(tokens: Vec<String>) -> Result<Self> {
if tokens.is_empty() {
return Err(RealizarError::UnsupportedOperation {
operation: "create_vocabulary".to_string(),
reason: "Vocabulary cannot be empty".to_string(),
});
}
let mut token_to_id = HashMap::new();
let mut id_to_token = HashMap::new();
for (id, token) in tokens.into_iter().enumerate() {
let id = u32::try_from(id).map_err(|_| RealizarError::UnsupportedOperation {
operation: "convert_token_id".to_string(),
reason: format!("Token ID {id} exceeds u32 limit"),
})?;
if token_to_id.contains_key(&token) {
return Err(RealizarError::UnsupportedOperation {
operation: "create_vocabulary".to_string(),
reason: format!("Duplicate token: {token}"),
});
}
token_to_id.insert(token.clone(), id);
id_to_token.insert(id, token);
}
Ok(Self {
token_to_id,
id_to_token,
})
}
#[must_use]
pub fn get_id(&self, token: &str) -> Option<u32> {
self.token_to_id.get(token).copied()
}
#[must_use]
pub fn get_token(&self, id: u32) -> Option<&str> {
self.id_to_token.get(&id).map(String::as_str)
}
#[must_use]
pub fn size(&self) -> usize {
self.token_to_id.len()
}
}
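/// Whitespace tokenizer over a fixed `Vocabulary`; out-of-vocabulary words
/// map to a designated unknown token.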
#[derive(Debug, Clone)]
pub struct Tokenizer {
vocab: Vocabulary,
unk_token_id: u32,
}
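/// Byte-pair-encoding tokenizer: words are split into characters and then
/// rebuilt by applying an ordered list of merge rules.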
#[derive(Debug, Clone)]
pub struct BPETokenizer {
token_to_id: HashMap<String, u32>,
id_to_token: HashMap<u32, String>,
merges: Vec<(String, String)>,
unk_token_id: u32,
}
impl BPETokenizer {
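/// Builds a BPE tokenizer from an ordered vocabulary, a list of merge
/// rules (applied in order during encoding), and the unknown token.
///
/// # Errors
///
/// Returns `RealizarError::UnsupportedOperation` if the vocabulary is
/// empty or `unk_token` is not in it. Note that duplicate tokens are not
/// rejected here: a later duplicate silently overwrites the earlier ID in
/// `token_to_id`.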
pub fn new(vocab: Vec<String>, merges: Vec<(String, String)>, unk_token: &str) -> Result<Self> {
if vocab.is_empty() {
return Err(RealizarError::UnsupportedOperation {
operation: "create_bpe_tokenizer".to_string(),
reason: "Vocabulary cannot be empty".to_string(),
});
}
let mut token_to_id = HashMap::new();
let mut id_to_token = HashMap::new();
for (id, token) in vocab.into_iter().enumerate() {
let id = u32::try_from(id).map_err(|_| RealizarError::UnsupportedOperation {
operation: "convert_token_id".to_string(),
reason: format!("Token ID {id} exceeds u32 limit"),
})?;
token_to_id.insert(token.clone(), id);
id_to_token.insert(id, token);
}
let unk_token_id =
*token_to_id
.get(unk_token)
.ok_or_else(|| RealizarError::UnsupportedOperation {
operation: "create_bpe_tokenizer".to_string(),
reason: format!("Unknown token '{unk_token}' not in vocabulary"),
})?;
Ok(Self {
token_to_id,
id_to_token,
merges,
unk_token_id,
})
}
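/// Encodes `text` by splitting on single spaces (re-attaching a leading
/// space to every word after the first), exploding each word into
/// characters, and applying the merge rules in order. Tokens still
/// unknown after merging map to the unknown-token ID.
///
/// Note: empty fragments from repeated or trailing spaces are dropped,
/// so such spacing does not round-trip through `decode`.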
#[must_use]
pub fn encode(&self, text: &str) -> Vec<u32> {
if text.is_empty() {
return Vec::new();
}
// Pre-tokenize on single spaces; every word after the first keeps a
// leading-space prefix so that `decode` can concatenate tokens directly.
// Empty fragments from repeated or trailing spaces are dropped.
let words: Vec<String> = text
.split(' ')
.enumerate()
.flat_map(|(i, word)| {
if word.is_empty() {
vec![]
} else if i == 0 {
vec![word.to_string()]
} else {
vec![format!(" {word}")]
}
})
.collect();
let mut result = Vec::new();
for word in words {
let mut tokens: Vec<String> = word.chars().map(|c| c.to_string()).collect();
for (first, second) in &self.merges {
tokens = Self::apply_merge(&tokens, first, second);
}
for token in tokens {
let id = self
.token_to_id
.get(&token)
.copied()
.unwrap_or(self.unk_token_id);
result.push(id);
}
}
result
}
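/// Performs one left-to-right pass over `tokens`, replacing each
/// non-overlapping adjacent (`first`, `second`) pair with their
/// concatenation.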
fn apply_merge(tokens: &[String], first: &str, second: &str) -> Vec<String> {
if tokens.len() < 2 {
return tokens.to_vec();
}
let mut result = Vec::new();
let mut i = 0;
while i < tokens.len() {
if i + 1 < tokens.len() && tokens[i] == first && tokens[i + 1] == second {
result.push(format!("{first}{second}"));
i += 2;
} else {
result.push(tokens[i].clone());
i += 1;
}
}
result
}
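/// Concatenates the tokens for `token_ids` into a string.
///
/// # Errors
///
/// Returns `RealizarError::UnsupportedOperation` if any ID is not in the
/// vocabulary.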
pub fn decode(&self, token_ids: &[u32]) -> Result<String> {
let mut result = String::new();
for &id in token_ids {
let token =
self.id_to_token
.get(&id)
.ok_or_else(|| RealizarError::UnsupportedOperation {
operation: "decode_bpe_token".to_string(),
reason: format!("Invalid token ID: {id}"),
})?;
result.push_str(token);
}
Ok(result)
}
#[must_use]
pub fn vocab_size(&self) -> usize {
self.token_to_id.len()
}
#[must_use]
pub fn get_token_id(&self, token: &str) -> Option<u32> {
self.token_to_id.get(token).copied()
}
#[must_use]
pub fn get_token(&self, id: u32) -> Option<&str> {
self.id_to_token.get(&id).map(String::as_str)
}
}
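/// Forward-pass state of the Viterbi search: per-position best scores and
/// the best token ending at each position.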
type ViterbiState = (Vec<f32>, Vec<Option<String>>);
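/// Unigram-style tokenizer: each vocabulary piece carries a score
/// (typically a log probability), and encoding picks the segmentation
/// with the highest total score.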
#[derive(Debug, Clone)]
pub struct SentencePieceTokenizer {
token_to_id: HashMap<String, u32>,
id_to_token: HashMap<u32, String>,
scores: HashMap<String, f32>,
unk_token_id: u32,
}
impl SentencePieceTokenizer {
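/// Builds a tokenizer from `(token, score)` pairs and the unknown token.
///
/// # Errors
///
/// Returns `RealizarError::UnsupportedOperation` if the vocabulary is
/// empty or `unk_token` is not in it.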
pub fn new(vocab: Vec<(String, f32)>, unk_token: &str) -> Result<Self> {
if vocab.is_empty() {
return Err(RealizarError::UnsupportedOperation {
operation: "create_sentencepiece_tokenizer".to_string(),
reason: "Vocabulary cannot be empty".to_string(),
});
}
let mut token_to_id = HashMap::new();
let mut id_to_token = HashMap::new();
let mut scores = HashMap::new();
for (id, (token, score)) in vocab.into_iter().enumerate() {
let id = u32::try_from(id).map_err(|_| RealizarError::UnsupportedOperation {
operation: "convert_token_id".to_string(),
reason: format!("Token ID {id} exceeds u32 limit"),
})?;
token_to_id.insert(token.clone(), id);
id_to_token.insert(id, token.clone());
scores.insert(token, score);
}
let unk_token_id =
*token_to_id
.get(unk_token)
.ok_or_else(|| RealizarError::UnsupportedOperation {
operation: "create_sentencepiece_tokenizer".to_string(),
reason: format!("Unknown token '{unk_token}' not in vocabulary"),
})?;
Ok(Self {
token_to_id,
id_to_token,
scores,
unk_token_id,
})
}
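/// Segments `text` into the highest-scoring sequence of vocabulary
/// pieces via a Viterbi search; characters not covered by any piece
/// incur a fixed penalty and map to the unknown-token ID.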
#[must_use]
pub fn encode(&self, text: &str) -> Vec<u32> {
if text.is_empty() {
return Vec::new();
}
let chars: Vec<char> = text.chars().collect();
let (_best_score, best_token) = self.viterbi_forward(&chars);
let tokens = Self::viterbi_backtrack(&chars, &best_token);
tokens
.into_iter()
.map(|t| {
self.token_to_id
.get(&t)
.copied()
.unwrap_or(self.unk_token_id)
})
.collect()
}
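/// Forward pass of the Viterbi search: `best_score[i]` holds the best
/// total score over segmentations of `chars[..i]`, and `best_token[i]`
/// the token ending at position `i` on that best path.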
fn viterbi_forward(&self, chars: &[char]) -> ViterbiState {
let n = chars.len();
let mut best_score = vec![f32::NEG_INFINITY; n + 1];
let mut best_token: Vec<Option<String>> = vec![None; n + 1];
best_score[0] = 0.0;
for end in 1..=n {
for start in 0..end {
let substr: String = chars[start..end].iter().collect();
if let Some(&score) = self.scores.get(&substr) {
let new_score = best_score[start] + score;
if new_score > best_score[end] {
best_score[end] = new_score;
best_token[end] = Some(substr);
}
}
}
// No vocabulary piece ends at this position: fall back to consuming a
// single character at a fixed penalty. The character is absent from the
// vocabulary, so `encode` maps it to the unknown-token ID.
if best_token[end].is_none() && best_score[end - 1] > f32::NEG_INFINITY {
let char_str: String = chars[end - 1..end].iter().collect();
best_score[end] = best_score[end - 1] - 100.0;
best_token[end] = Some(char_str);
}
}
(best_score, best_token)
}
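/// Walks `best_token` backwards from the end of the input, emitting the
/// token chosen at each boundary. The `None` arm is a defensive fallback;
/// it should be unreachable because the forward pass fills `best_token`
/// for every reachable position.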
fn viterbi_backtrack(chars: &[char], best_token: &[Option<String>]) -> Vec<String> {
let n = chars.len();
let mut tokens = Vec::new();
let mut pos = n;
while pos > 0 {
if let Some(token) = &best_token[pos] {
tokens.push(token.clone());
pos -= token.chars().count();
} else {
let char_str: String = chars[pos - 1..pos].iter().collect();
tokens.push(char_str);
pos -= 1;
}
}
tokens.reverse();
tokens
}
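/// Concatenates the tokens for `token_ids` into a string.
///
/// # Errors
///
/// Returns `RealizarError::UnsupportedOperation` if any ID is not in the
/// vocabulary.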
pub fn decode(&self, token_ids: &[u32]) -> Result<String> {
let mut result = String::new();
for &id in token_ids {
let token =
self.id_to_token
.get(&id)
.ok_or_else(|| RealizarError::UnsupportedOperation {
operation: "decode_sentencepiece_token".to_string(),
reason: format!("Invalid token ID: {id}"),
})?;
result.push_str(token);
}
Ok(result)
}
#[must_use]
pub fn vocab_size(&self) -> usize {
self.token_to_id.len()
}
#[must_use]
pub fn get_token_id(&self, token: &str) -> Option<u32> {
self.token_to_id.get(token).copied()
}
#[must_use]
pub fn get_token(&self, id: u32) -> Option<&str> {
self.id_to_token.get(&id).map(String::as_str)
}
#[must_use]
pub fn get_score(&self, token: &str) -> Option<f32> {
self.scores.get(token).copied()
}
}
impl Tokenizer {
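/// Builds a tokenizer over `vocab`, resolving `unk_token` to its ID.
///
/// # Errors
///
/// Returns `RealizarError::UnsupportedOperation` if `unk_token` is not in
/// the vocabulary.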
pub fn new(vocab: Vocabulary, unk_token: &str) -> Result<Self> {
let unk_token_id =
vocab
.get_id(unk_token)
.ok_or_else(|| RealizarError::UnsupportedOperation {
operation: "create_tokenizer".to_string(),
reason: format!("Unknown token '{unk_token}' not in vocabulary"),
})?;
Ok(Self {
vocab,
unk_token_id,
})
}
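/// Splits `text` on whitespace and maps each word to its ID, or to the
/// unknown-token ID if absent.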
#[must_use]
pub fn encode(&self, text: &str) -> Vec<u32> {
text.split_whitespace()
.map(|word| self.vocab.get_id(word).unwrap_or(self.unk_token_id))
.collect()
}
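/// Joins the tokens for `token_ids` with single spaces.
///
/// # Errors
///
/// Returns `RealizarError::UnsupportedOperation` if any ID is not in the
/// vocabulary.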
pub fn decode(&self, token_ids: &[u32]) -> Result<String> {
let tokens: Result<Vec<&str>> = token_ids
.iter()
.map(|&id| {
self.vocab
.get_token(id)
.ok_or_else(|| RealizarError::UnsupportedOperation {
operation: "decode_token".to_string(),
reason: format!("Invalid token ID: {id}"),
})
})
.collect();
Ok(tokens?.join(" "))
}
#[must_use]
pub fn vocab_size(&self) -> usize {
self.vocab.size()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_vocabulary_from_tokens() {
let tokens = vec![
"<unk>".to_string(),
"hello".to_string(),
"world".to_string(),
];
let vocab = Vocabulary::from_tokens(tokens).unwrap();
assert_eq!(vocab.size(), 3);
assert_eq!(vocab.get_id("<unk>"), Some(0));
assert_eq!(vocab.get_id("hello"), Some(1));
assert_eq!(vocab.get_id("world"), Some(2));
assert_eq!(vocab.get_token(0), Some("<unk>"));
assert_eq!(vocab.get_token(1), Some("hello"));
assert_eq!(vocab.get_token(2), Some("world"));
}
#[test]
fn test_vocabulary_empty_error() {
let result = Vocabulary::from_tokens(vec![]);
assert!(result.is_err());
}
#[test]
fn test_vocabulary_duplicate_error() {
let tokens = vec![
"hello".to_string(),
"world".to_string(),
"hello".to_string(), ];
let result = Vocabulary::from_tokens(tokens);
assert!(result.is_err());
}
#[test]
fn test_vocabulary_get_missing() {
let tokens = vec!["hello".to_string()];
let vocab = Vocabulary::from_tokens(tokens).unwrap();
assert_eq!(vocab.get_id("world"), None);
assert_eq!(vocab.get_token(999), None);
}
#[test]
fn test_tokenizer_encode_decode() {
let tokens = vec![
"<unk>".to_string(),
"hello".to_string(),
"world".to_string(),
];
let vocab = Vocabulary::from_tokens(tokens).unwrap();
let tokenizer = Tokenizer::new(vocab, "<unk>").unwrap();
let encoded = tokenizer.encode("hello world");
assert_eq!(encoded, vec![1, 2]);
let decoded = tokenizer.decode(&encoded).unwrap();
assert_eq!(decoded, "hello world");
}
#[test]
fn test_tokenizer_unknown_token() {
let tokens = vec!["<unk>".to_string(), "hello".to_string()];
let vocab = Vocabulary::from_tokens(tokens).unwrap();
let tokenizer = Tokenizer::new(vocab, "<unk>").unwrap();
let encoded = tokenizer.encode("hello foo");
assert_eq!(encoded, vec![1, 0]);
}
#[test]
fn test_tokenizer_invalid_unk_token() {
let tokens = vec!["hello".to_string()];
let vocab = Vocabulary::from_tokens(tokens).unwrap();
let result = Tokenizer::new(vocab, "<unk>");
assert!(result.is_err());
}
#[test]
fn test_tokenizer_decode_invalid_id() {
let tokens = vec!["<unk>".to_string(), "hello".to_string()];
let vocab = Vocabulary::from_tokens(tokens).unwrap();
let tokenizer = Tokenizer::new(vocab, "<unk>").unwrap();
let result = tokenizer.decode(&[1, 999]);
assert!(result.is_err());
}
#[test]
fn test_tokenizer_empty_string() {
let tokens = vec!["<unk>".to_string()];
let vocab = Vocabulary::from_tokens(tokens).unwrap();
let tokenizer = Tokenizer::new(vocab, "<unk>").unwrap();
let encoded = tokenizer.encode("");
assert_eq!(encoded, Vec::<u32>::new());
let decoded = tokenizer.decode(&[]).unwrap();
assert_eq!(decoded, "");
}
#[test]
fn test_tokenizer_vocab_size() {
let tokens = vec![
"<unk>".to_string(),
"hello".to_string(),
"world".to_string(),
];
let vocab = Vocabulary::from_tokens(tokens).unwrap();
let tokenizer = Tokenizer::new(vocab, "<unk>").unwrap();
assert_eq!(tokenizer.vocab_size(), 3);
}
#[test]
fn test_bpe_tokenizer_creation() {
let vocab = vec![
"<unk>".to_string(),
"h".to_string(),
"e".to_string(),
"l".to_string(),
"o".to_string(),
"he".to_string(),
"ll".to_string(),
"hel".to_string(),
"hello".to_string(),
];
let merges = vec![
("h".to_string(), "e".to_string()),
("l".to_string(), "l".to_string()),
("he".to_string(), "l".to_string()),
("hel".to_string(), "lo".to_string()),
];
let tokenizer = BPETokenizer::new(vocab, merges, "<unk>").unwrap();
assert_eq!(tokenizer.vocab_size(), 9);
}
#[test]
fn test_bpe_tokenizer_empty_vocab_error() {
let result = BPETokenizer::new(vec![], vec![], "<unk>");
assert!(result.is_err());
}
#[test]
fn test_bpe_tokenizer_invalid_unk_token_error() {
let vocab = vec!["hello".to_string()];
let result = BPETokenizer::new(vocab, vec![], "<unk>");
assert!(result.is_err());
}
#[test]
fn test_bpe_encode_no_merges() {
let vocab = vec!["<unk>".to_string(), "h".to_string(), "i".to_string()];
let tokenizer = BPETokenizer::new(vocab, vec![], "<unk>").unwrap();
let encoded = tokenizer.encode("hi");
assert_eq!(encoded, vec![1, 2]);
}
#[test]
fn test_bpe_encode_with_merges() {
let vocab = vec![
"<unk>".to_string(),
"h".to_string(),
"e".to_string(),
"l".to_string(),
"o".to_string(),
"he".to_string(),
"ll".to_string(),
];
let merges = vec![
("h".to_string(), "e".to_string()),
("l".to_string(), "l".to_string()),
];
let tokenizer = BPETokenizer::new(vocab, merges, "<unk>").unwrap();
let encoded = tokenizer.encode("hello");
assert_eq!(encoded, vec![5, 6, 4]);
}
#[test]
fn test_bpe_encode_unknown_char() {
let vocab = vec!["<unk>".to_string(), "h".to_string(), "i".to_string()];
let tokenizer = BPETokenizer::new(vocab, vec![], "<unk>").unwrap();
let encoded = tokenizer.encode("hix");
assert_eq!(encoded, vec![1, 2, 0]);
}
#[test]
fn test_bpe_encode_empty_string() {
let vocab = vec!["<unk>".to_string()];
let tokenizer = BPETokenizer::new(vocab, vec![], "<unk>").unwrap();
let encoded = tokenizer.encode("");
assert!(encoded.is_empty());
}
#[test]
fn test_bpe_encode_multiple_words() {
let vocab = vec![
"<unk>".to_string(),
"h".to_string(),
"i".to_string(),
" ".to_string(),
" h".to_string(),
];
let merges = vec![(" ".to_string(), "h".to_string())];
let tokenizer = BPETokenizer::new(vocab, merges, "<unk>").unwrap();
let encoded = tokenizer.encode("hi hi");
assert_eq!(encoded, vec![1, 2, 4, 2]);
}
#[test]
fn test_bpe_decode() {
let vocab = vec!["<unk>".to_string(), "hel".to_string(), "lo".to_string()];
let tokenizer = BPETokenizer::new(vocab, vec![], "<unk>").unwrap();
let decoded = tokenizer.decode(&[1, 2]).unwrap();
assert_eq!(decoded, "hello");
}
#[test]
fn test_bpe_decode_empty() {
let vocab = vec!["<unk>".to_string()];
let tokenizer = BPETokenizer::new(vocab, vec![], "<unk>").unwrap();
let decoded = tokenizer.decode(&[]).unwrap();
assert_eq!(decoded, "");
}
#[test]
fn test_bpe_decode_invalid_id_error() {
let vocab = vec!["<unk>".to_string(), "hi".to_string()];
let tokenizer = BPETokenizer::new(vocab, vec![], "<unk>").unwrap();
let result = tokenizer.decode(&[1, 999]);
assert!(result.is_err());
}
#[test]
fn test_bpe_encode_decode_roundtrip() {
let vocab = vec![
"<unk>".to_string(),
"h".to_string(),
"e".to_string(),
"l".to_string(),
"o".to_string(),
"he".to_string(),
"ll".to_string(),
"lo".to_string(),
"hel".to_string(),
"hello".to_string(),
];
let merges = vec![
("h".to_string(), "e".to_string()),
("l".to_string(), "l".to_string()),
("l".to_string(), "o".to_string()),
("he".to_string(), "l".to_string()),
("hel".to_string(), "lo".to_string()),
];
let tokenizer = BPETokenizer::new(vocab, merges, "<unk>").unwrap();
let encoded = tokenizer.encode("hello");
let decoded = tokenizer.decode(&encoded).unwrap();
assert_eq!(decoded, "hello");
}
#[test]
fn test_bpe_get_token_methods() {
let vocab = vec!["<unk>".to_string(), "hello".to_string()];
let tokenizer = BPETokenizer::new(vocab, vec![], "<unk>").unwrap();
assert_eq!(tokenizer.get_token_id("hello"), Some(1));
assert_eq!(tokenizer.get_token_id("world"), None);
assert_eq!(tokenizer.get_token(1), Some("hello"));
assert_eq!(tokenizer.get_token(999), None);
}
#[test]
fn test_bpe_multiple_consecutive_merges() {
let vocab = vec![
"<unk>".to_string(),
"a".to_string(),
"b".to_string(),
"ab".to_string(),
"abab".to_string(),
];
let merges = vec![
("a".to_string(), "b".to_string()),
("ab".to_string(), "ab".to_string()),
];
let tokenizer = BPETokenizer::new(vocab, merges, "<unk>").unwrap();
let encoded = tokenizer.encode("abab");
assert_eq!(encoded, vec![4]);
}
#[test]
fn test_sentencepiece_tokenizer_creation() {
let vocab = vec![
("<unk>".to_string(), 0.0),
("hello".to_string(), -1.0),
("world".to_string(), -1.5),
];
let tokenizer = SentencePieceTokenizer::new(vocab, "<unk>").unwrap();
assert_eq!(tokenizer.vocab_size(), 3);
}
#[test]
fn test_sentencepiece_empty_vocab_error() {
let result = SentencePieceTokenizer::new(vec![], "<unk>");
assert!(result.is_err());
}
#[test]
fn test_sentencepiece_invalid_unk_token_error() {
let vocab = vec![("hello".to_string(), -1.0)];
let result = SentencePieceTokenizer::new(vocab, "<unk>");
assert!(result.is_err());
}
#[test]
fn test_sentencepiece_encode_empty() {
let vocab = vec![("<unk>".to_string(), 0.0)];
let tokenizer = SentencePieceTokenizer::new(vocab, "<unk>").unwrap();
let encoded = tokenizer.encode("");
assert!(encoded.is_empty());
}
#[test]
fn test_sentencepiece_encode_single_token() {
let vocab = vec![("<unk>".to_string(), 0.0), ("hello".to_string(), -1.0)];
let tokenizer = SentencePieceTokenizer::new(vocab, "<unk>").unwrap();
let encoded = tokenizer.encode("hello");
assert_eq!(encoded, vec![1]);
}
#[test]
fn test_sentencepiece_encode_prefers_higher_score() {
let vocab = vec![
("<unk>".to_string(), 0.0),
("h".to_string(), -5.0),
("e".to_string(), -5.0),
("l".to_string(), -5.0),
("o".to_string(), -5.0),
("hel".to_string(), -2.0),
("lo".to_string(), -2.0),
("hello".to_string(), -1.0),
];
let tokenizer = SentencePieceTokenizer::new(vocab, "<unk>").unwrap();
let encoded = tokenizer.encode("hello");
assert_eq!(encoded, vec![7]);
}
#[test]
fn test_sentencepiece_encode_subwords() {
let vocab = vec![
("<unk>".to_string(), 0.0),
("h".to_string(), -1.0),
("e".to_string(), -1.0),
("l".to_string(), -1.0),
("o".to_string(), -1.0),
("he".to_string(), -0.5),
("llo".to_string(), -0.5),
];
let tokenizer = SentencePieceTokenizer::new(vocab, "<unk>").unwrap();
let encoded = tokenizer.encode("hello");
assert_eq!(encoded, vec![5, 6]);
}
#[test]
fn test_sentencepiece_decode() {
let vocab = vec![
("<unk>".to_string(), 0.0),
("hel".to_string(), -1.0),
("lo".to_string(), -1.0),
];
let tokenizer = SentencePieceTokenizer::new(vocab, "<unk>").unwrap();
let decoded = tokenizer.decode(&[1, 2]).unwrap();
assert_eq!(decoded, "hello");
}
#[test]
fn test_sentencepiece_decode_empty() {
let vocab = vec![("<unk>".to_string(), 0.0)];
let tokenizer = SentencePieceTokenizer::new(vocab, "<unk>").unwrap();
let decoded = tokenizer.decode(&[]).unwrap();
assert_eq!(decoded, "");
}
#[test]
fn test_sentencepiece_decode_invalid_id_error() {
let vocab = vec![("<unk>".to_string(), 0.0), ("hi".to_string(), -1.0)];
let tokenizer = SentencePieceTokenizer::new(vocab, "<unk>").unwrap();
let result = tokenizer.decode(&[1, 999]);
assert!(result.is_err());
}
#[test]
fn test_sentencepiece_encode_decode_roundtrip() {
let vocab = vec![
("<unk>".to_string(), 0.0),
("h".to_string(), -2.0),
("e".to_string(), -2.0),
("l".to_string(), -2.0),
("o".to_string(), -2.0),
("hello".to_string(), -1.0),
];
let tokenizer = SentencePieceTokenizer::new(vocab, "<unk>").unwrap();
let encoded = tokenizer.encode("hello");
let decoded = tokenizer.decode(&encoded).unwrap();
assert_eq!(decoded, "hello");
}
#[test]
fn test_sentencepiece_get_methods() {
let vocab = vec![("<unk>".to_string(), 0.0), ("hello".to_string(), -1.5)];
let tokenizer = SentencePieceTokenizer::new(vocab, "<unk>").unwrap();
assert_eq!(tokenizer.get_token_id("hello"), Some(1));
assert_eq!(tokenizer.get_token_id("world"), None);
assert_eq!(tokenizer.get_token(1), Some("hello"));
assert_eq!(tokenizer.get_token(999), None);
assert!((tokenizer.get_score("hello").unwrap() - (-1.5)).abs() < 1e-6);
assert_eq!(tokenizer.get_score("world"), None);
}
#[test]
fn test_sentencepiece_unknown_character() {
let vocab = vec![
("<unk>".to_string(), 0.0),
("h".to_string(), -1.0),
("i".to_string(), -1.0),
];
let tokenizer = SentencePieceTokenizer::new(vocab, "<unk>").unwrap();
let encoded = tokenizer.encode("hix");
assert_eq!(encoded.len(), 3);
assert_eq!(encoded[0], 1);
assert_eq!(encoded[1], 2);
assert_eq!(encoded[2], 0);
}
#[test]
fn test_sentencepiece_multiple_words() {
let vocab = vec![
("<unk>".to_string(), 0.0),
("hello".to_string(), -1.0),
(" ".to_string(), -0.5),
("world".to_string(), -1.0),
];
let tokenizer = SentencePieceTokenizer::new(vocab, "<unk>").unwrap();
let encoded = tokenizer.encode("hello world");
assert_eq!(encoded, vec![1, 2, 3]);
}
}