#[cfg(feature = "runtime_build")]
use ahash::AHashMap;
use std::sync::OnceLock;
use crate::process::process_type::ProcessType;
use crate::process::transform::constants::*;
use crate::process::transform::delete::DeleteMatcher;
use crate::process::transform::replace::{FanjianMatcher, NormalizeMatcher, PinyinMatcher};
/// Result of running a single transform step over a piece of text.
pub(crate) struct StepOutput {
    /// The rewritten text, or `None` when the step left the input as it was.
    pub(crate) changed: Option<String>,
    /// Density value reported for the step's output.
    pub(crate) output_density: f32,
}

impl StepOutput {
    /// Builds an output signalling that the input text was not modified.
    ///
    /// `density` is stored verbatim as `output_density`.
    #[inline(always)]
    pub(crate) fn unchanged(density: f32) -> Self {
        Self::from_parts(None, density)
    }

    /// Builds an output carrying the rewritten text `changed`.
    ///
    /// `density` is stored verbatim as `output_density`.
    #[inline(always)]
    pub(crate) fn changed(changed: String, density: f32) -> Self {
        Self::from_parts(Some(changed), density)
    }

    /// Single place where the two public constructors assemble the struct.
    #[inline(always)]
    fn from_parts(changed: Option<String>, output_density: f32) -> Self {
        Self {
            changed,
            output_density,
        }
    }
}
/// How a [`TransformStep`] treats input on the density-`0.0` fast path of
/// `TransformStep::apply`.
///
/// NOTE(review): the naming suggests that a density of `0.0` means the text is
/// pure ASCII — confirm against the code that computes densities.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum AsciiInputBehavior {
/// The step returns such input unchanged without consulting its matcher.
NoOp,
/// The step still runs its matcher; any rewritten output is reported with
/// density `0.0`.
MayChangeButStaysAscii,
}
/// A single text transformation, paired with the matcher that performs it.
///
/// Instances are built by `build_transform_step` and handed out through
/// `get_transform_step`.
#[derive(Clone)]
pub(crate) enum TransformStep {
/// Performs no transformation; `apply` always reports the text as unchanged.
None,
/// Rewrites text through a [`FanjianMatcher`].
Fanjian(FanjianMatcher),
/// Removes content through a [`DeleteMatcher`].
Delete(DeleteMatcher),
/// Rewrites text through a [`NormalizeMatcher`].
Normalize(NormalizeMatcher),
/// Rewrites text through a [`PinyinMatcher`] constructed with its boolean
/// flag set to `false`.
PinYin(PinyinMatcher),
/// Rewrites text through a [`PinyinMatcher`] constructed with its boolean
/// flag set to `true`.
PinYinChar(PinyinMatcher),
}
impl TransformStep {
    /// Classifies what this step does on the density-`0.0` fast path of
    /// [`TransformStep::apply`].
    ///
    /// Only `Delete` and `Normalize` may rewrite such input; every other
    /// variant is a no-op there.
    #[inline(always)]
    pub(crate) fn ascii_input_behavior(&self) -> AsciiInputBehavior {
        match self {
            Self::Delete(_) | Self::Normalize(_) => AsciiInputBehavior::MayChangeButStaysAscii,
            Self::None | Self::Fanjian(_) | Self::PinYin(_) | Self::PinYinChar(_) => {
                AsciiInputBehavior::NoOp
            }
        }
    }

    /// Returns `true` when [`TransformStep::ascii_input_behavior`] is
    /// [`AsciiInputBehavior::NoOp`].
    #[inline(always)]
    pub(crate) fn is_noop_on_ascii_input(&self) -> bool {
        self.ascii_input_behavior() == AsciiInputBehavior::NoOp
    }

    /// Decides the "use bytewise" flag for this step's output.
    ///
    /// Both pinyin variants force the flag to `true`; every other variant
    /// passes `parent_use_bytewise` through untouched.
    #[inline(always)]
    pub(crate) fn output_use_bytewise(&self, parent_use_bytewise: bool) -> bool {
        let forces_bytewise = matches!(self, Self::PinYin(_) | Self::PinYinChar(_));
        forces_bytewise || parent_use_bytewise
    }

    /// Runs this step over `text` and reports the outcome.
    ///
    /// `parent_density` is the density value associated with `text`.
    /// NOTE(review): the naming suggests `0.0` means "pure ASCII" — confirm
    /// with the code that produces densities.
    ///
    /// * When `parent_density == 0.0`, steps that are no-ops on such input
    ///   return immediately; `Delete` / `Normalize` run their matcher, but the
    ///   density the matcher returns is discarded and `0.0` is reported.
    /// * Otherwise every matcher is consulted. `Fanjian` output inherits
    ///   `parent_density` (its matcher yields only the rewritten text); the
    ///   other matchers supply the output density themselves.
    ///
    /// In all cases a matcher result of `None` is reported as "unchanged".
    #[inline(always)]
    pub(crate) fn apply(&self, text: &str, parent_density: f32) -> StepOutput {
        if parent_density == 0.0 {
            if self.is_noop_on_ascii_input() {
                return StepOutput::unchanged(0.0);
            }
            let rewritten = match self {
                Self::Delete(matcher) => matcher.delete(text),
                Self::Normalize(matcher) => matcher.replace(text),
                _ => unreachable!("ASCII behavior and step variant must agree"),
            };
            // The matcher-provided density is deliberately ignored on this path.
            return match rewritten {
                Some((changed, _)) => StepOutput::changed(changed, 0.0),
                None => StepOutput::unchanged(0.0),
            };
        }

        let rewritten = match self {
            Self::None => None,
            // Fanjian's matcher yields only the text, so pair it with the
            // incoming density.
            Self::Fanjian(matcher) => matcher
                .replace(text)
                .map(|changed| (changed, parent_density)),
            Self::Delete(matcher) => matcher.delete(text),
            Self::Normalize(matcher) => matcher.replace(text),
            Self::PinYin(matcher) | Self::PinYinChar(matcher) => matcher.replace(text),
        };
        match rewritten {
            Some((changed, density)) => StepOutput::changed(changed, density),
            None => StepOutput::unchanged(parent_density),
        }
    }
}
/// Lazily initialised cache of [`TransformStep`] values, one slot per bit
/// position of a `ProcessType` flag.
///
/// `get_transform_step` indexes it with `bits().trailing_zeros()` and fills a
/// slot on first use via `OnceLock::get_or_init`.
///
/// NOTE(review): eight slots presumably match the bit width of the flag
/// type — confirm against the `ProcessType` definition.
static TRANSFORM_STEP_CACHE: [OnceLock<TransformStep>; 8] = [
OnceLock::new(),
OnceLock::new(),
OnceLock::new(),
OnceLock::new(),
OnceLock::new(),
OnceLock::new(),
OnceLock::new(),
OnceLock::new(),
];
/// Returns the process-wide cached [`TransformStep`] for a single-bit
/// `ProcessType`, building it on first request.
///
/// The cache slot is selected by the position of the lowest set bit of
/// `process_type_bit`.
///
/// # Panics
///
/// In debug builds, panics when `process_type_bit` is neither a power of two
/// nor `ProcessType::None`, or when the computed slot lies outside the cache.
/// In any build, an out-of-range slot panics on the array index, and the
/// first-use construction panics for flags `build_transform_step` does not
/// support.
pub(crate) fn get_transform_step(process_type_bit: ProcessType) -> &'static TransformStep {
    let bits = process_type_bit.bits();
    debug_assert!(
        bits.is_power_of_two() || process_type_bit == ProcessType::None,
        "get_transform_step requires a single-bit ProcessType, got {:?}",
        process_type_bit
    );

    let slot = bits.trailing_zeros() as usize;
    debug_assert!(slot < TRANSFORM_STEP_CACHE.len());

    let cell = &TRANSFORM_STEP_CACHE[slot];
    cell.get_or_init(|| build_transform_step(process_type_bit))
}
/// Builds the [`TransformStep`] for one `ProcessType` flag by parsing the
/// text tables that are in scope (`FANJIAN`, `TEXT_DELETE`, `NORM`,
/// `NUM_NORM`, `PINYIN`).
///
/// The `FANJIAN`, `NORM`, `NUM_NORM` and `PINYIN` tables are read line by
/// line, each line split on tabs; only the first two columns are used.
/// Identity mappings (key equal to value) are dropped for the Fanjian and
/// Normalize tables.
///
/// # Panics
///
/// Panics when a table row lacks a second column, when a Fanjian key/value or
/// a Pinyin key is not exactly one character, when a Pinyin value is empty,
/// or when `process_type_bit` is not one of the supported flags.
#[cfg(feature = "runtime_build")]
fn build_transform_step(process_type_bit: ProcessType) -> TransformStep {
    // Both pinyin variants are backed by the same table; parse it in one place.
    let load_pinyin_table = || {
        let mut table = AHashMap::new();
        for row in PINYIN.trim().lines() {
            let mut columns = row.split('\t');
            let key = columns.next().unwrap();
            assert!(
                key.chars().count() == 1,
                "PINYIN key must be exactly one character: {key:?}"
            );
            let key = u32::from(key.chars().next().unwrap());
            let value = columns.next().unwrap();
            assert!(
                !value.is_empty(),
                "PINYIN value must not be empty for key U+{key:04X}"
            );
            table.insert(key, value);
        }
        table
    };

    match process_type_bit {
        ProcessType::None => TransformStep::None,
        ProcessType::Fanjian => {
            let mut mapping = AHashMap::new();
            for row in FANJIAN.trim().lines() {
                let mut columns = row.split('\t');
                let (key, value) = (columns.next().unwrap(), columns.next().unwrap());
                assert!(
                    key.chars().count() == 1,
                    "FANJIAN key must be exactly one character: {key:?}"
                );
                assert!(
                    value.chars().count() == 1,
                    "FANJIAN value must be exactly one character: {value:?}"
                );
                let source = u32::from(key.chars().next().unwrap());
                let target = u32::from(value.chars().next().unwrap());
                // Skip identity mappings.
                if source == target {
                    continue;
                }
                mapping.insert(source, target);
            }
            TransformStep::Fanjian(FanjianMatcher::from_map(mapping))
        }
        ProcessType::Delete => {
            let matcher = DeleteMatcher::from_sources(TEXT_DELETE);
            TransformStep::Delete(matcher)
        }
        ProcessType::Normalize => {
            let mut replacements = AHashMap::new();
            // Later tables override earlier ones on duplicate keys.
            for table in [NORM, NUM_NORM] {
                for row in table.trim().lines() {
                    let mut columns = row.split('\t');
                    let from = columns.next().unwrap();
                    let to = columns.next().unwrap();
                    replacements.insert(from, to);
                }
            }
            // Drop identity mappings once all tables are merged.
            replacements.retain(|from, to| *from != *to);
            TransformStep::Normalize(NormalizeMatcher::from_dict(replacements))
        }
        ProcessType::PinYin => {
            TransformStep::PinYin(PinyinMatcher::from_map(load_pinyin_table(), false))
        }
        ProcessType::PinYinChar => {
            TransformStep::PinYinChar(PinyinMatcher::from_map(load_pinyin_table(), true))
        }
        _ => unreachable!("unsupported single-bit ProcessType"),
    }
}
/// Builds the [`TransformStep`] for one `ProcessType` flag from the
/// ready-made table constants that are in scope (`*_BYTES` / `*_STR`).
///
/// NOTE(review): these constants are presumably generated ahead of time so
/// that no text parsing is needed here — confirm against the build setup.
///
/// # Panics
///
/// Panics when `process_type_bit` is not one of the supported flags.
#[cfg(not(feature = "runtime_build"))]
fn build_transform_step(process_type_bit: ProcessType) -> TransformStep {
    // The two pinyin variants share all tables and differ only in the flag.
    let pinyin_matcher = |flag: bool| {
        PinyinMatcher::new(PINYIN_L1_BYTES, PINYIN_L2_BYTES, PINYIN_STR_BYTES, flag)
    };

    match process_type_bit {
        ProcessType::None => TransformStep::None,
        ProcessType::Fanjian => {
            let matcher = FanjianMatcher::new(FANJIAN_L1_BYTES, FANJIAN_L2_BYTES);
            TransformStep::Fanjian(matcher)
        }
        ProcessType::Delete => {
            let matcher = DeleteMatcher::new(DELETE_BITSET_BYTES);
            TransformStep::Delete(matcher)
        }
        ProcessType::Normalize => {
            let base = NormalizeMatcher::new(NORMALIZE_PROCESS_LIST_STR.lines());
            let matcher =
                base.with_replacements(NORMALIZE_PROCESS_REPLACE_LIST_STR.lines().collect());
            TransformStep::Normalize(matcher)
        }
        ProcessType::PinYin => TransformStep::PinYin(pinyin_matcher(false)),
        ProcessType::PinYinChar => TransformStep::PinYinChar(pinyin_matcher(true)),
        _ => unreachable!("unsupported single-bit ProcessType"),
    }
}