tokenizers/pre_tokenizers/
delimiter.rs

1use serde::{Deserialize, Serialize};
2
3use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
4use crate::utils::macro_rules_attribute;
5
6#[derive(Copy, Clone, Debug, PartialEq, Eq)]
7#[non_exhaustive]
8#[macro_rules_attribute(impl_serde_type!)]
9pub struct CharDelimiterSplit {
10    pub delimiter: char,
11}
12
13impl CharDelimiterSplit {
14    pub fn new(delimiter: char) -> Self {
15        Self { delimiter }
16    }
17}
18
19impl PreTokenizer for CharDelimiterSplit {
20    fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
21        // TODO: Maybe add the option to specify the behavior
22        pretokenized.split(|_, normalized| {
23            normalized.split(self.delimiter, SplitDelimiterBehavior::Removed)
24        })
25    }
26}