1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
use crate::tokenizer::{NormalizedString, Offsets, PreTokenizer, Result};

/// Pre-tokenizer that cuts the input into words at every occurrence of a
/// single delimiter character.
pub struct CharDelimiterSplit {
    /// Character that separates words; it never appears in the produced words.
    delimiter: char,
}

impl CharDelimiterSplit {
    /// Builds a splitter that breaks text on `delimiter`.
    pub fn new(delimiter: char) -> Self {
        Self { delimiter }
    }
}

impl PreTokenizer for CharDelimiterSplit {
    /// Splits the content of `normalized` on the configured delimiter.
    ///
    /// Each returned entry is a word paired with its `(start, end)` offsets.
    /// Offsets are counted in `char`s from the start of the string (not in
    /// bytes), and `end` is exclusive. Runs of consecutive delimiters, as
    /// well as leading or trailing delimiters, never produce empty words.
    ///
    /// This implementation always returns `Ok`.
    fn pre_tokenize(&self, normalized: &mut NormalizedString) -> Result<Vec<(String, Offsets)>> {
        /// Initial capacity reserved for the buffer holding the word in progress.
        const WORD_BUFFER_CAPACITY: usize = 1000;

        /// Moves the buffered characters (if any) into `out` as one word
        /// ending at char position `end`, leaving `buffer` empty for reuse.
        fn flush_word(buffer: &mut Vec<char>, end: usize, out: &mut Vec<(String, Offsets)>) {
            if buffer.is_empty() {
                return;
            }
            let span = (end - buffer.len(), end);
            let text = buffer.drain(..).collect::<String>();
            out.push((text, span));
        }

        let mut pieces = Vec::new();
        let mut pending: Vec<char> = Vec::with_capacity(WORD_BUFFER_CAPACITY);
        // Number of chars consumed so far, i.e. the char index of the current one.
        let mut position = 0;

        for current in normalized.get().chars() {
            if current == self.delimiter {
                // The delimiter closes the word in progress and is itself dropped.
                flush_word(&mut pending, position, &mut pieces);
            } else {
                pending.push(current);
            }
            position += 1;
        }

        // Emit the trailing word when the input does not end with a delimiter.
        flush_word(&mut pending, position, &mut pieces);

        Ok(pieces)
    }
}