#[cfg(feature = "runtime_build")]
use ahash::AHashMap;
use std::sync::OnceLock;
use crate::process::process_type::ProcessType;
use crate::process::transform::constants::*;
use crate::process::transform::delete::DeleteMatcher;
use crate::process::transform::replace::{FanjianMatcher, NormalizeMatcher, PinyinMatcher};
/// Outcome of running a single [`TransformStep`] over a piece of text.
pub(crate) struct StepOutput {
    /// The rewritten text, or `None` when the step left the input as it was.
    pub(crate) changed: Option<String>,
    /// ASCII flag that accompanies the result (presumably "the resulting text is pure
    /// ASCII" — confirm against the matchers that compute it).
    pub(crate) is_ascii: bool,
}

impl StepOutput {
    /// Creates an output that carries no rewritten text, only the given ASCII flag.
    #[inline(always)]
    pub(crate) fn unchanged(is_ascii: bool) -> Self {
        StepOutput {
            is_ascii,
            changed: None,
        }
    }

    /// Creates an output that owns the rewritten text `changed` together with its ASCII flag.
    #[inline(always)]
    pub(crate) fn changed(changed: String, is_ascii: bool) -> Self {
        let changed = Some(changed);
        StepOutput { is_ascii, changed }
    }
}
/// One text-transformation stage, holding the matcher that performs it.
///
/// Instances are produced by `build_transform_step` and executed through
/// [`TransformStep::apply`].
#[derive(Clone)]
pub(crate) enum TransformStep {
/// No-op stage: `apply` always reports the text as unchanged.
None,
/// Character replacement driven by a `FanjianMatcher` (presumably Traditional → Simplified
/// Chinese — confirm against the matcher).
Fanjian(FanjianMatcher),
/// Character removal driven by a `DeleteMatcher`.
Delete(DeleteMatcher),
/// String normalisation driven by a `NormalizeMatcher`.
Normalize(NormalizeMatcher),
/// Pinyin replacement; the matcher is constructed with its boolean flag set to `false`.
PinYin(PinyinMatcher),
/// Pinyin replacement; same matcher type as `PinYin`, constructed with the flag set to `true`.
PinYinChar(PinyinMatcher),
}
impl TransformStep {
    /// Runs this step over `text` and reports the outcome.
    ///
    /// `parent_is_ascii` is the ASCII flag the caller holds for `text`. When the step does
    /// not rewrite the text the result is [`StepOutput::unchanged`] carrying that same flag.
    /// When it does rewrite, the flag of the result is:
    ///
    /// - `Fanjian`: always `false`;
    /// - `Delete`: `parent_is_ascii` OR the flag reported by the matcher;
    /// - `Normalize`, `PinYin`, `PinYinChar`: exactly the flag reported by the matcher.
    #[inline(always)]
    pub(crate) fn apply(&self, text: &str, parent_is_ascii: bool) -> StepOutput {
        // Normalise every variant to "Some((new_text, new_flag)) if rewritten, else None".
        let rewritten = match self {
            Self::None => None,
            Self::Fanjian(matcher) => matcher.replace(text).map(|output| (output, false)),
            Self::Delete(matcher) => matcher
                .delete(text)
                .map(|(output, is_ascii)| (output, parent_is_ascii || is_ascii)),
            Self::Normalize(matcher) => matcher.replace(text),
            Self::PinYin(matcher) | Self::PinYinChar(matcher) => matcher.replace(text),
        };

        match rewritten {
            Some((output, is_ascii)) => StepOutput::changed(output, is_ascii),
            None => StepOutput::unchanged(parent_is_ascii),
        }
    }
}
/// Lazily initialised cache of built [`TransformStep`]s.
///
/// `get_transform_step` indexes this array by the position of the flag's set bit
/// (`bits().trailing_zeros()`), so there is one slot per bit position 0..=7. Each slot is
/// filled at most once, on first request, via `OnceLock::get_or_init`.
static TRANSFORM_STEP_CACHE: [OnceLock<TransformStep>; 8] = [
OnceLock::new(),
OnceLock::new(),
OnceLock::new(),
OnceLock::new(),
OnceLock::new(),
OnceLock::new(),
OnceLock::new(),
OnceLock::new(),
];
/// Returns the shared [`TransformStep`] for a single `ProcessType` flag, building it on the
/// first request and serving the cached instance afterwards.
///
/// The cache slot is the index of the lowest set bit of `process_type_bit`. The single-bit
/// precondition and the slot range are only verified by `debug_assert!`, i.e. in builds with
/// debug assertions enabled.
///
/// # Panics
///
/// Panics on an out-of-range slot (array bounds check), when a debug assertion fails, or when
/// `build_transform_step` rejects the flag.
pub(crate) fn get_transform_step(process_type_bit: ProcessType) -> &'static TransformStep {
    let bits = process_type_bit.bits();
    debug_assert!(
        bits.is_power_of_two() || process_type_bit == ProcessType::None,
        "get_transform_step requires a single-bit ProcessType, got {:?}",
        process_type_bit
    );

    // Bit position of the flag doubles as the cache slot.
    let slot = bits.trailing_zeros() as usize;
    debug_assert!(slot < TRANSFORM_STEP_CACHE.len());

    let cell = &TRANSFORM_STEP_CACHE[slot];
    cell.get_or_init(|| build_transform_step(process_type_bit))
}
/// Builds the [`TransformStep`] for a single `ProcessType` flag by parsing the embedded
/// tab-separated text tables (compiled only with the `runtime_build` feature).
///
/// - `None`: the no-op step.
/// - `Fanjian`: code point → code point map read from `FANJIAN`; rows whose key and value
///   are the same character are skipped.
/// - `Delete`: matcher built from `TEXT_DELETE` and `WHITE_SPACE`.
/// - `Normalize`: string → string dictionary read from `NORM` and then `NUM_NORM` (a later
///   row overrides an earlier one with the same key); identity entries are removed at the end.
/// - `PinYin` / `PinYinChar`: code point → string map read from `PINYIN`; the two variants
///   differ only in the boolean handed to `PinyinMatcher::from_map` (`false` / `true`).
///
/// Only the first two tab-separated fields of a row are used, and where a code point is
/// needed only the first character of the field is taken.
///
/// # Panics
///
/// Panics if a table row has no tab-separated value field, if a field that must supply a
/// character is empty, or if `process_type_bit` is not one of the flags listed above.
#[cfg(feature = "runtime_build")]
fn build_transform_step(process_type_bit: ProcessType) -> TransformStep {
    // Returns the first two tab-separated fields of a table row; further fields are ignored.
    fn split_row(row: &str) -> (&str, &str) {
        let mut fields = row.split('\t');
        let key = fields.next().expect("table row has no key field");
        let value = fields
            .next()
            .expect("table row is missing its tab-separated value field");
        (key, value)
    }

    // Code point of the first character of a table field.
    fn first_code_point(field: &str) -> u32 {
        let first = field
            .chars()
            .next()
            .expect("table field that must hold a character is empty");
        first as u32
    }

    // PinYin and PinYinChar are built from the same table, so parse it in one place.
    let pinyin_map = || {
        let mut map = AHashMap::new();
        map.extend(PINYIN.trim().lines().map(|row| {
            let (key, value) = split_row(row);
            (first_code_point(key), value)
        }));
        map
    };

    match process_type_bit {
        ProcessType::None => TransformStep::None,
        ProcessType::Fanjian => {
            let mut map = AHashMap::new();
            map.extend(
                FANJIAN
                    .trim()
                    .lines()
                    .map(|row| {
                        let (key, value) = split_row(row);
                        (first_code_point(key), first_code_point(value))
                    })
                    // A character mapped to itself needs no replacement entry.
                    .filter(|(key, value)| key != value),
            );
            TransformStep::Fanjian(FanjianMatcher::from_map(map))
        }
        ProcessType::Delete => {
            TransformStep::Delete(DeleteMatcher::from_sources(TEXT_DELETE, WHITE_SPACE))
        }
        ProcessType::Normalize => {
            let mut dict = AHashMap::new();
            dict.extend(
                NORM.trim()
                    .lines()
                    .chain(NUM_NORM.trim().lines())
                    .map(split_row),
            );
            // Identity entries are dropped only after both tables are merged, so a later
            // identity row still cancels an earlier non-identity row for the same key.
            dict.retain(|&key, value| key != *value);
            TransformStep::Normalize(NormalizeMatcher::from_dict(dict))
        }
        ProcessType::PinYin => {
            TransformStep::PinYin(PinyinMatcher::from_map(pinyin_map(), false))
        }
        ProcessType::PinYinChar => {
            TransformStep::PinYinChar(PinyinMatcher::from_map(pinyin_map(), true))
        }
        _ => unreachable!("unsupported single-bit ProcessType"),
    }
}
/// Builds the [`TransformStep`] for a single `ProcessType` flag from the prebuilt constant
/// tables (compiled when the `runtime_build` feature is disabled).
///
/// `PinYin` and `PinYinChar` use the same three pinyin tables and differ only in the final
/// boolean passed to `PinyinMatcher::new` (`false` / `true`).
///
/// # Panics
///
/// Panics if `process_type_bit` is not one of the flags handled below.
#[cfg(not(feature = "runtime_build"))]
fn build_transform_step(process_type_bit: ProcessType) -> TransformStep {
    // Shared construction for both pinyin variants; only the flag varies.
    let pinyin_matcher = |flag: bool| {
        PinyinMatcher::new(PINYIN_L1_BYTES, PINYIN_L2_BYTES, PINYIN_STR_BYTES, flag)
    };

    match process_type_bit {
        ProcessType::None => TransformStep::None,
        ProcessType::Fanjian => {
            let matcher = FanjianMatcher::new(FANJIAN_L1_BYTES, FANJIAN_L2_BYTES);
            TransformStep::Fanjian(matcher)
        }
        ProcessType::Delete => {
            let matcher = DeleteMatcher::new(DELETE_BITSET_BYTES);
            TransformStep::Delete(matcher)
        }
        ProcessType::Normalize => {
            // One line per pattern, with the replacement list supplied line by line as well.
            let patterns = NORMALIZE_PROCESS_LIST_STR.lines();
            let matcher = NormalizeMatcher::new(patterns)
                .with_replacements(NORMALIZE_PROCESS_REPLACE_LIST_STR.lines().collect());
            TransformStep::Normalize(matcher)
        }
        ProcessType::PinYin => TransformStep::PinYin(pinyin_matcher(false)),
        ProcessType::PinYinChar => TransformStep::PinYinChar(pinyin_matcher(true)),
        _ => unreachable!("unsupported single-bit ProcessType"),
    }
}