pub(crate) mod byte_level;
mod split;
use crate::{
json_structs::{PreTokenizerConfig, PreTokenizerKind},
pre_tokenized::PreTokenizedString,
};
pub use self::{
byte_level::ByteLevel,
split::{Split, SplitBehavior},
};
pub(crate) use self::byte_level::BYTE_TO_CHAR;
#[derive(Debug, thiserror::Error)]
pub enum Error {
#[error("invalid config value: {0}")]
Json(#[from] serde_json::Error),
#[error("regex error: {0}")]
Regex(#[from] fancy_regex::Error),
#[error("unsupported pre-tokenizer type: {0}")]
Unsupported(String),
}
#[derive(Clone, Debug)]
pub enum PreTokenizer {
ByteLevel(ByteLevel),
Split(Split),
Sequence(Vec<PreTokenizer>),
}
impl PreTokenizer {
pub fn from_config(config: PreTokenizerConfig) -> Result<Self, Error> {
match config {
PreTokenizerConfig::ByteLevel(bl) => Ok(Self::ByteLevel(bl)),
PreTokenizerConfig::Split(s) => Ok(Self::Split(s)),
PreTokenizerConfig::Sequence { pretokenizers } => {
let steps = pretokenizers
.into_iter()
.map(Self::from_config)
.collect::<Result<Vec<_>, _>>()?;
Ok(Self::Sequence(steps))
}
PreTokenizerConfig::Other(v) => {
let typ = v.get("type").and_then(|t| t.as_str()).unwrap_or("unknown");
Err(Error::Unsupported(typ.to_string()))
}
other => {
let kind = PreTokenizerKind::from(&other);
Err(Error::Unsupported(kind.to_string()))
}
}
}
pub fn pre_tokenize(&self, pts: &mut PreTokenizedString) -> Result<(), Error> {
match self {
Self::ByteLevel(bl) => bl.pre_tokenize(pts),
Self::Split(s) => s.pre_tokenize(pts),
Self::Sequence(steps) => {
for step in steps {
step.pre_tokenize(pts)?;
}
Ok(())
}
}
}
}