use crate::tokenization::hf_tokenizers::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
use unicode_categories::UnicodeCategories;
fn is_punc(x: char) -> bool {
char::is_ascii_punctuation(&x) || x.is_punctuation()
}
#[derive(Copy, Clone, Debug)]
pub struct Punctuation;
impl_serde_unit_struct!(PunctuationVisitor, Punctuation);
impl PreTokenizer for Punctuation {
fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
pretokenized.split(|_, s| s.split(is_punc, SplitDelimiterBehavior::Isolated))
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::tokenization::hf_tokenizers::{OffsetReferential, OffsetType};
#[test]
fn punctuation_basic() {
let pretok = Punctuation;
let mut pretokenized: PreTokenizedString = "Hey friend! How are you?!?".into();
pretok.pre_tokenize(&mut pretokenized).unwrap();
assert_eq!(
pretokenized
.get_splits(OffsetReferential::Original, OffsetType::Byte)
.into_iter()
.map(|(s, o, _)| (s, o))
.collect::<Vec<_>>(),
vec![
("Hey friend", (0, 10)),
("!", (10, 11)),
(" How are you", (11, 27)),
("?", (27, 28)),
("!", (28, 29)),
("?", (29, 30)),
]
);
}
}