use std::sync::OnceLock;
use crate::process::{
process_type::ProcessType,
transform::{
constants::*,
delete::{DeleteFilter, DeleteMatcher},
filter::FilterIterator,
normalize::{NormalizeFilter, NormalizeMatcher},
romanize::{RomanizeFilter, RomanizeMatcher},
variant_norm::{VariantNormFilter, VariantNormMatcher},
},
};
/// A single text-transformation step together with the matcher that performs it.
///
/// Instances are built by `build_transform_step` and handed out through
/// `get_transform_step`.
#[derive(Clone)]
pub(crate) enum TransformStep {
/// Step that carries no matcher; `apply` and `filter_bytes` both return `None` for it.
None,
/// Step backed by a [`VariantNormMatcher`].
VariantNorm(VariantNormMatcher),
/// Step backed by a [`DeleteMatcher`].
Delete(DeleteMatcher),
/// Step backed by a [`NormalizeMatcher`].
Normalize(NormalizeMatcher),
/// Step backed by a [`RomanizeMatcher`].
Romanize(RomanizeMatcher),
/// Step that reuses [`RomanizeMatcher`]; `build_transform_step` constructs it from the
/// same tables as `Romanize` but with the trailing constructor flag set to `true`.
RomanizeChar(RomanizeMatcher),
/// Step that reuses [`RomanizeMatcher`]; `build_transform_step` constructs it from the
/// `EMOJI_NORM_*` tables.
EmojiNorm(RomanizeMatcher),
}
/// Byte-yielding iterator returned by `TransformStep::filter_bytes`.
///
/// Each variant wraps a [`FilterIterator`] parameterised with the filter type of the
/// step that produced it. There is no dedicated `RomanizeChar` variant: both
/// `TransformStep::Romanize` and `TransformStep::RomanizeChar` produce `Romanize`.
pub(crate) enum TransformFilter<'a> {
/// Iterator produced by a `Delete` step.
Delete(FilterIterator<'a, DeleteFilter<'a>>),
/// Iterator produced by a `Normalize` step.
Normalize(FilterIterator<'a, NormalizeFilter<'a>>),
/// Iterator produced by a `VariantNorm` step.
VariantNorm(FilterIterator<'a, VariantNormFilter<'a>>),
/// Iterator produced by a `Romanize` or `RomanizeChar` step.
Romanize(FilterIterator<'a, RomanizeFilter<'a>>),
}
impl Iterator for TransformFilter<'_> {
type Item = u8;
#[inline(always)]
fn next(&mut self) -> Option<u8> {
match self {
Self::Delete(i) => i.next(),
Self::Normalize(i) => i.next(),
Self::VariantNorm(i) => i.next(),
Self::Romanize(i) => i.next(),
}
}
}
impl TransformStep {
    /// Returns `true` for every variant except `Delete` and `Normalize`.
    ///
    /// Judging by the name, these are the steps that cannot alter ASCII-only input;
    /// the same set of variants short-circuits to `None` in [`Self::apply`] when the
    /// parent density is at least `1.0`.
    #[inline(always)]
    pub(crate) fn is_noop_on_ascii_input(&self) -> bool {
        match self {
            Self::Delete(_) | Self::Normalize(_) => false,
            Self::None
            | Self::VariantNorm(_)
            | Self::Romanize(_)
            | Self::RomanizeChar(_)
            | Self::EmojiNorm(_) => true,
        }
    }

    /// Returns `true` only for the `Delete` step.
    #[inline(always)]
    pub(crate) fn is_non_bijective(&self) -> bool {
        match self {
            Self::Delete(_) => true,
            Self::None
            | Self::VariantNorm(_)
            | Self::Normalize(_)
            | Self::Romanize(_)
            | Self::RomanizeChar(_)
            | Self::EmojiNorm(_) => false,
        }
    }

    /// Builds a byte iterator over `text` using this step's matcher.
    ///
    /// Returns `None` for the `None` and `EmojiNorm` steps; every other step yields
    /// the matching [`TransformFilter`] variant (`RomanizeChar` shares the
    /// `Romanize` variant).
    #[inline(always)]
    pub(crate) fn filter_bytes<'a>(&'a self, text: &'a str) -> Option<TransformFilter<'a>> {
        let stream = match self {
            // NOTE(review): `EmojiNorm` wraps a `RomanizeMatcher` just like `Romanize`,
            // yet it is not streamed here — confirm this is intentional.
            Self::None | Self::EmojiNorm(_) => return None,
            Self::Delete(matcher) => TransformFilter::Delete(matcher.filter_bytes(text)),
            Self::Normalize(matcher) => TransformFilter::Normalize(matcher.filter_bytes(text)),
            Self::VariantNorm(matcher) => {
                TransformFilter::VariantNorm(matcher.filter_bytes(text))
            }
            Self::Romanize(matcher) | Self::RomanizeChar(matcher) => {
                TransformFilter::Romanize(matcher.filter_bytes(text))
            }
        };
        Some(stream)
    }

    /// Runs this step over `text` and pairs the rewritten string with a density value.
    ///
    /// Returns `None` when the step has nothing to produce: always for the `None`
    /// step, for every step listed by [`Self::is_noop_on_ascii_input`] when
    /// `parent_density >= 1.0`, and whenever the underlying matcher call itself
    /// returns `None`.
    ///
    /// The density in the result is `1.0` for the romanize-family steps and for any
    /// step applied to a parent whose density is at least `1.0`; otherwise
    /// `parent_density` is passed through unchanged.
    #[inline(always)]
    pub(crate) fn apply(&self, text: &str, parent_density: f32) -> Option<(String, f32)> {
        let full_density = parent_density >= 1.0;

        // Fast path: at full density only `Delete` and `Normalize` do any work.
        if full_density && self.is_noop_on_ascii_input() {
            return None;
        }

        // Density reported by the steps that do not force the result to `1.0`.
        let carried = if full_density { 1.0 } else { parent_density };

        match self {
            Self::None => None,
            Self::Delete(m) => m.delete(text).map(|out| (out, carried)),
            Self::VariantNorm(m) => m.replace(text).map(|out| (out, carried)),
            Self::Normalize(m) => m.replace(text).map(|out| (out, carried)),
            Self::Romanize(m) | Self::RomanizeChar(m) | Self::EmojiNorm(m) => {
                m.replace(text).map(|out| (out, 1.0))
            }
        }
    }
}
/// Lazily initialised cache of transform steps, one slot per bit position.
///
/// `get_transform_step` indexes this array with the number of trailing zero bits of
/// the requested `ProcessType` and fills the slot on first use.
// NOTE(review): eight slots presumably mirror an 8-bit `ProcessType` representation;
// confirm against the `ProcessType` definition.
static TRANSFORM_STEP_CACHE: [OnceLock<TransformStep>; 8] = [
OnceLock::new(),
OnceLock::new(),
OnceLock::new(),
OnceLock::new(),
OnceLock::new(),
OnceLock::new(),
OnceLock::new(),
OnceLock::new(),
];
/// Returns the process-wide cached [`TransformStep`] for a single `ProcessType` bit.
///
/// The step is built by `build_transform_step` the first time its slot is requested
/// and reused afterwards.
///
/// # Panics
///
/// In debug builds, panics if `process_type_bit` is neither a single bit nor
/// `ProcessType::None`. In all builds, indexing panics if the computed slot falls
/// outside `TRANSFORM_STEP_CACHE`.
pub(crate) fn get_transform_step(process_type_bit: ProcessType) -> &'static TransformStep {
    let raw_bits = process_type_bit.bits();
    debug_assert!(
        raw_bits.is_power_of_two() || process_type_bit == ProcessType::None,
        "get_transform_step requires a single-bit ProcessType, got {:?}",
        process_type_bit
    );

    // The position of the (single) set bit selects the cache slot.
    let slot = raw_bits.trailing_zeros() as usize;
    debug_assert!(slot < TRANSFORM_STEP_CACHE.len());

    let cell = &TRANSFORM_STEP_CACHE[slot];
    cell.get_or_init(|| build_transform_step(process_type_bit))
}
fn build_transform_step(process_type_bit: ProcessType) -> TransformStep {
match process_type_bit {
ProcessType::None => TransformStep::None,
ProcessType::VariantNorm => TransformStep::VariantNorm(VariantNormMatcher::new(
VARIANT_NORM_L1_BYTES,
VARIANT_NORM_L2_BYTES,
)),
ProcessType::Delete => TransformStep::Delete(DeleteMatcher::new(DELETE_BITSET_BYTES)),
ProcessType::Normalize => TransformStep::Normalize(NormalizeMatcher::new(
NORMALIZE_L1_BYTES,
NORMALIZE_L2_BYTES,
NORMALIZE_STR_BYTES,
)),
ProcessType::Romanize => TransformStep::Romanize(RomanizeMatcher::new(
ROMANIZE_L1_BYTES,
ROMANIZE_L2_BYTES,
ROMANIZE_STR_BYTES,
false,
)),
ProcessType::RomanizeChar => TransformStep::RomanizeChar(RomanizeMatcher::new(
ROMANIZE_L1_BYTES,
ROMANIZE_L2_BYTES,
ROMANIZE_STR_BYTES,
true,
)),
ProcessType::EmojiNorm => TransformStep::EmojiNorm(RomanizeMatcher::new(
EMOJI_NORM_L1_BYTES,
EMOJI_NORM_L2_BYTES,
EMOJI_NORM_STR_BYTES,
false,
)),
_ => unreachable!("unsupported single-bit ProcessType"),
}
}