use crate::{
join_patterns,
support::regex::ConstRegexPattern,
};
/// Unoptimized form of the GPT-2 style word-splitting pattern.
///
/// Every contraction suffix is spelled out as its own fragment and only
/// ordinary greedy quantifiers are used; compare [`OA_GPT2_PATTERN`], which
/// groups the contractions and uses possessive quantifiers.
///
/// NOTE(review): `join_patterns!` presumably combines the fragments into one
/// alternation, tried in the order listed — confirm against the macro.
pub const OA_GPT2_PATTERN_SLOW: ConstRegexPattern = ConstRegexPattern::Fancy(join_patterns!(
    // Contraction suffixes, lowercase only.
    r"'s",
    r"'t",
    r"'re",
    r"'ve",
    r"'m",
    r"'ll",
    r"'d",
    // Optional single leading space, then a run of letters.
    r" ?[\p{L}]+",
    // Optional single leading space, then a run of numeric characters.
    r" ?[\p{N}]+",
    // Optional single leading space, then a run of characters that are
    // neither whitespace, letters, nor numbers.
    r" ?[^\s\p{L}\p{N}]+",
    // Whitespace run that is not immediately followed by non-whitespace
    // (negative look-ahead).
    r"\s+(?!\S)",
    // Any remaining whitespace run.
    r"\s+",
));
/// GPT-2 style word-splitting pattern written with possessive quantifiers.
///
/// Differences from [`OA_GPT2_PATTERN_SLOW`] visible in the fragments: the
/// contraction suffixes are folded into one non-capturing group, the letter /
/// number / symbol runs use possessive `++`, trailing whitespace at the end of
/// input has its own `\s++$` fragment, and the final fallback consumes a
/// single whitespace character rather than a run.
///
/// NOTE(review): `join_patterns!` presumably combines the fragments into one
/// alternation, tried in the order listed — confirm against the macro.
pub const OA_GPT2_PATTERN: ConstRegexPattern = ConstRegexPattern::Fancy(join_patterns!(
    // Contraction suffixes: 's 'd 'm 't 'll 've 're (lowercase only).
    r"'(?:[sdmt]|ll|ve|re)",
    // Optional single leading space, then a possessive run of letters.
    r" ?\p{L}++",
    // Optional single leading space, then a possessive run of numeric characters.
    r" ?\p{N}++",
    // Optional single leading space, then a possessive run of characters that
    // are neither whitespace, letters, nor numbers.
    r" ?[^\s\p{L}\p{N}]++",
    // Whitespace run reaching the end of the input.
    r"\s++$",
    // Whitespace run that is not immediately followed by non-whitespace.
    r"\s+(?!\S)",
    // Single whitespace character as the last resort.
    r"\s",
));
/// Word-splitting pattern exported under the R50K name.
///
/// This is a plain alias: its value is exactly [`OA_GPT2_PATTERN`].
pub const OA_R50K_BASE_PATTERN: ConstRegexPattern = OA_GPT2_PATTERN;
/// Word-splitting pattern exported under the P50K name.
///
/// This is a plain alias: its value is exactly [`OA_R50K_BASE_PATTERN`]
/// (which in turn is [`OA_GPT2_PATTERN`]).
pub const OA_P50K_BASE_PATTERN: ConstRegexPattern = OA_R50K_BASE_PATTERN;
/// Word-splitting pattern exported under the CL100K name.
///
/// Uses possessive quantifiers and a negative look-ahead, hence the `Fancy`
/// variant (NOTE(review): presumably selects a regex engine supporting those
/// constructs — confirm against `ConstRegexPattern`).
///
/// NOTE(review): `join_patterns!` presumably combines the fragments into one
/// alternation, tried in the order listed — confirm against the macro.
pub const OA_CL100K_BASE_PATTERN: ConstRegexPattern = ConstRegexPattern::Fancy(join_patterns!(
    // Contraction suffixes, matched case-insensitively.
    r"'(?i:[sdmt]|ll|ve|re)",
    // Optional single prefix character (anything but CR, LF, a letter, or a
    // number), then a possessive run of letters.
    r"[^\r\n\p{L}\p{N}]?+\p{L}++",
    // One to three numeric characters.
    r"\p{N}{1,3}+",
    // Optional single leading space, a possessive run of characters that are
    // neither whitespace, letters, nor numbers, then any trailing CR/LF.
    r" ?[^\s\p{L}\p{N}]++[\r\n]*+",
    // Whitespace run reaching the end of the input.
    r"\s++$",
    // Optional whitespace ending in a single CR or LF.
    r"\s*[\r\n]",
    // Whitespace run that is not immediately followed by non-whitespace.
    r"\s+(?!\S)",
    // Single whitespace character as the last resort.
    r"\s",
));
/// Word-splitting pattern exported under the O200K name.
///
/// Words are matched by two case-aware fragments (one requiring at least one
/// character from the lowercase-side class, one requiring at least one from
/// the uppercase-side class), each optionally followed by a case-insensitive
/// contraction suffix.
///
/// NOTE(review): `join_patterns!` presumably combines the fragments into one
/// alternation, tried in the order listed — confirm against the macro.
pub const OA_O200K_BASE_PATTERN: ConstRegexPattern = ConstRegexPattern::Fancy(join_patterns!(
    // Optional prefix character, zero or more of {Lu, Lt, Lm, Lo, M}, one or
    // more of {Ll, Lm, Lo, M}, then an optional contraction suffix.
    r"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
    // Optional prefix character, one or more of {Lu, Lt, Lm, Lo, M}, zero or
    // more of {Ll, Lm, Lo, M}, then an optional contraction suffix.
    r"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
    // One to three numeric characters.
    r"\p{N}{1,3}",
    // Optional single leading space, a run of characters that are neither
    // whitespace, letters, nor numbers, then any trailing CR, LF, or '/'.
    r" ?[^\s\p{L}\p{N}]+[\r\n/]*",
    // Optional whitespace ending in one or more CR/LF characters.
    r"\s*[\r\n]+",
    // Whitespace run that is not immediately followed by non-whitespace.
    r"\s+(?!\S)",
    // Any remaining whitespace run.
    r"\s+"
));
/// Look-around-free string form of the R50K word-splitting pattern.
///
/// Compared with [`OA_GPT2_PATTERN`], the fragments use ordinary greedy
/// quantifiers instead of possessive ones, the `\s+(?!\S)` look-ahead fragment
/// is absent, and the final fallback is `\s+` rather than `\s`.
///
/// NOTE(review): the `_RA` suffix presumably marks a variant for an engine
/// without look-around support, with callers compensating for the missing
/// look-ahead — confirm at the use sites.
pub(crate) const OA_R50K_BASE_PATTERN_RA: &str = join_patterns!(
    // Contraction suffixes (lowercase only).
    r"'(?:[sdmt]|ll|ve|re)",
    // Optional single leading space, then a run of letters.
    r" ?\p{L}+",
    // Optional single leading space, then a run of numeric characters.
    r" ?\p{N}+",
    // Optional single leading space, then a run of characters that are
    // neither whitespace, letters, nor numbers.
    r" ?[^\s\p{L}\p{N}]+",
    // Whitespace run reaching the end of the input.
    r"\s+$",
    // Any remaining whitespace run.
    r"\s+",
);
/// Look-around-free string form of the CL100K word-splitting pattern.
///
/// Compared with [`OA_CL100K_BASE_PATTERN`], the fragments use ordinary greedy
/// quantifiers instead of possessive ones, the `\s+(?!\S)` look-ahead fragment
/// is absent, and the final fallback is `\s+` rather than `\s`.
///
/// NOTE(review): the `_RA` suffix presumably marks a variant for an engine
/// without look-around support, with callers compensating for the missing
/// look-ahead — confirm at the use sites.
pub(crate) const OA_CL100K_BASE_PATTERN_RA: &str = join_patterns!(
    // Contraction suffixes, matched case-insensitively.
    r"'(?i:[sdmt]|ll|ve|re)",
    // Optional single prefix character (anything but CR, LF, a letter, or a
    // number), then a run of letters.
    r"[^\r\n\p{L}\p{N}]?\p{L}+",
    // One to three numeric characters.
    r"\p{N}{1,3}",
    // Optional single leading space, a run of characters that are neither
    // whitespace, letters, nor numbers, then any trailing CR/LF.
    r" ?[^\s\p{L}\p{N}]+[\r\n]*",
    // Whitespace run reaching the end of the input.
    r"\s+$",
    // Optional whitespace ending in a single CR or LF.
    r"\s*[\r\n]",
    // Any remaining whitespace run.
    r"\s+",
);
/// Look-around-free string form of the O200K word-splitting pattern.
///
/// The fragments are the same as those of [`OA_O200K_BASE_PATTERN`] except
/// that the `\s+(?!\S)` look-ahead fragment is absent.
///
/// NOTE(review): the `_RA` suffix presumably marks a variant for an engine
/// without look-around support, with callers compensating for the missing
/// look-ahead — confirm at the use sites.
pub(crate) const OA_O200K_BASE_PATTERN_RA: &str = join_patterns!(
    // Optional prefix character, zero or more of {Lu, Lt, Lm, Lo, M}, one or
    // more of {Ll, Lm, Lo, M}, then an optional contraction suffix.
    r"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
    // Optional prefix character, one or more of {Lu, Lt, Lm, Lo, M}, zero or
    // more of {Ll, Lm, Lo, M}, then an optional contraction suffix.
    r"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
    // One to three numeric characters.
    r"\p{N}{1,3}",
    // Optional single leading space, a run of characters that are neither
    // whitespace, letters, nor numbers, then any trailing CR, LF, or '/'.
    r" ?[^\s\p{L}\p{N}]+[\r\n/]*",
    // Optional whitespace ending in one or more CR/LF characters.
    r"\s*[\r\n]+",
    // Any remaining whitespace run.
    r"\s+",
);
#[cfg(test)]
mod test {
    use super::*;

    /// Every exported `ConstRegexPattern` constant must compile.
    ///
    /// Each constant is asserted exactly once and by its own name, so a
    /// failure points at the offending pattern and a future change that
    /// turns an alias into a distinct pattern stays covered.
    #[test]
    fn test_patterns_compile() {
        assert!(
            OA_GPT2_PATTERN_SLOW.compile().is_ok(),
            "OA_GPT2_PATTERN_SLOW failed to compile"
        );
        assert!(
            OA_GPT2_PATTERN.compile().is_ok(),
            "OA_GPT2_PATTERN failed to compile"
        );
        assert!(
            OA_R50K_BASE_PATTERN.compile().is_ok(),
            "OA_R50K_BASE_PATTERN failed to compile"
        );
        assert!(
            OA_P50K_BASE_PATTERN.compile().is_ok(),
            "OA_P50K_BASE_PATTERN failed to compile"
        );
        assert!(
            OA_CL100K_BASE_PATTERN.compile().is_ok(),
            "OA_CL100K_BASE_PATTERN failed to compile"
        );
        assert!(
            OA_O200K_BASE_PATTERN.compile().is_ok(),
            "OA_O200K_BASE_PATTERN failed to compile"
        );
    }
}