use crate::tokenizer::Tokenizer;
/// Character-level tokenizer: every `char` of the input text is one token.
///
/// The type carries no state, so all values are interchangeable; it is
/// therefore cheap to copy, compare and default-construct.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct CharacterTokenizer {}
impl CharacterTokenizer {
pub fn new() -> Self {
CharacterTokenizer {}
}
}
impl Tokenizer for CharacterTokenizer {
    /// Encodes `text` as one token per `char`, in order.
    ///
    /// Each token id is the Unicode scalar value of the corresponding
    /// character (e.g. `'a'` becomes `97`).
    fn encode(&self, text: &str) -> Vec<usize> {
        text.chars().map(|c| c as usize).collect()
    }

    /// Decodes token ids back into a string, one `char` per token.
    ///
    /// Any id that is not a valid Unicode scalar value (a surrogate code
    /// point, or anything above `char::MAX`) is rendered as `'⍰'` instead of
    /// failing.
    fn decode(&self, tokens: &[usize]) -> String {
        const REPLACEMENT: char = '⍰';

        tokens
            .iter()
            .map(|&token| {
                // Range-check before narrowing: a bare `token as u32` wraps ids
                // above `u32::MAX` on 64-bit targets, so an out-of-range id could
                // alias a valid character instead of hitting the fallback.
                if token <= char::MAX as usize {
                    char::from_u32(token as u32).unwrap_or(REPLACEMENT)
                } else {
                    REPLACEMENT
                }
            })
            .collect()
    }

    /// Returns the number of tokens `encode` produces for `text`, i.e. its
    /// number of `char`s, without building the token vector.
    fn count_tokens(&self, text: &str) -> usize {
        text.chars().count()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// ASCII text encodes to the code point of each character, in order.
    #[test]
    fn test_tokenizer_encode() {
        let tokenizer = CharacterTokenizer::new();
        // `encode` takes `&str`, so string literals are passed directly
        // rather than allocating a temporary `String`.
        assert_eq!(tokenizer.encode("abc"), vec![97, 98, 99]);
        assert_eq!(tokenizer.encode("abcde"), vec![97, 98, 99, 100, 101]);
    }

    /// Decoding the ids produced by `encode` yields the original text.
    #[test]
    fn test_tokenizer_decode() {
        let tokenizer = CharacterTokenizer::new();
        assert_eq!(tokenizer.encode("abc"), vec![97, 98, 99]);
        assert_eq!(tokenizer.decode(&[97, 98, 99]), "abc");
    }

    /// Non-ASCII characters are single tokens and survive a round trip.
    #[test]
    fn test_tokenizer_non_ascii_round_trip() {
        let tokenizer = CharacterTokenizer::new();
        let text = "h\u{e9}llo";
        let tokens = tokenizer.encode(text);
        assert_eq!(tokens, vec![104, 0xE9, 108, 108, 111]);
        assert_eq!(tokenizer.decode(&tokens), text);
    }

    /// Ids that are not Unicode scalar values decode to the placeholder glyph.
    #[test]
    fn test_tokenizer_decode_invalid_token() {
        let tokenizer = CharacterTokenizer::new();
        // 0xD800 is a surrogate code point, which `char` cannot represent.
        assert_eq!(tokenizer.decode(&[97, 0xD800, 98]), "a⍰b");
        assert_eq!(tokenizer.decode(&[]), "");
    }

    /// `count_tokens` counts characters, not bytes, and matches `encode`.
    #[test]
    fn test_tokenizer_count_tokens() {
        let tokenizer = CharacterTokenizer::new();
        assert_eq!(tokenizer.count_tokens(""), 0);
        assert_eq!(tokenizer.count_tokens("abc"), 3);
        let text = "h\u{e9}llo";
        assert_eq!(tokenizer.count_tokens(text), 5);
        assert_eq!(tokenizer.count_tokens(text), tokenizer.encode(text).len());
    }
}