use std::borrow::Cow;
#[cfg(feature = "runtime_build")]
use std::collections::HashMap;
use std::sync::OnceLock;
use crate::process::process_type::ProcessType;
use crate::process::string_pool::{get_string_from_pool, return_string_to_pool};
use crate::process::transform::constants::*;
use crate::process::transform::multi_char_matcher::MultiCharMatcher;
use crate::process::transform::single_char_matcher::{SingleCharMatch, SingleCharMatcher};
/// Lazily initialised matcher slots, one per bit position of a `ProcessType`.
///
/// `get_process_matcher` picks the slot from the position of the lowest set bit of the
/// requested type and fills it on first use; every slot starts out empty.
static PROCESS_MATCHER_CACHE: [OnceLock<ProcessMatcher>; 8] = [const { OnceLock::new() }; 8];
/// A ready-to-use matcher for one text-processing step.
///
/// Instances are built and cached by `get_process_matcher`; which variant is used depends on
/// the process type being served.
#[derive(Clone)]
pub(crate) enum ProcessMatcher {
/// Multi-character pattern matcher with a parallel replacement list; in this file it is
/// built for `ProcessType::Normalize` and as the empty placeholder matcher.
MultiChar(MultiCharMatcher),
/// Per-character matcher; in this file it is built for the Fanjian, Pinyin and Delete
/// process types.
SingleChar(SingleCharMatcher),
}
impl ProcessMatcher {
    /// Rebuilds `text` with every span yielded by `iter` replaced.
    ///
    /// `iter` yields `(start, end, payload)` triples where `start..end` is a byte range of
    /// `text`. The ranges are used to slice `text` directly, so they must arrive in ascending,
    /// non-overlapping order and fall on char boundaries (slicing panics otherwise). For each
    /// triple the untouched text before the span is copied, then `push_replacement` appends
    /// whatever should stand in for the span; appending nothing deletes the span.
    ///
    /// Returns `None` when `iter` yields no match, so callers can keep using the original text.
    /// Otherwise returns a buffer obtained from `get_string_from_pool`.
    #[inline(always)]
    fn replace_scan<I, M, F>(text: &str, mut iter: I, mut push_replacement: F) -> Option<String>
    where
        I: Iterator<Item = (usize, usize, M)>,
        F: FnMut(&mut String, M),
    {
        // Pull the first match before touching the pool: no match means no buffer is taken.
        let (first_start, first_end, first_match) = iter.next()?;

        let mut result = get_string_from_pool(text.len());
        result.push_str(&text[..first_start]);
        push_replacement(&mut result, first_match);

        let mut last_end = first_end;
        for (start, end, m) in iter {
            result.push_str(&text[last_end..start]);
            push_replacement(&mut result, m);
            last_end = end;
        }
        // Copy whatever follows the final match.
        result.push_str(&text[last_end..]);
        Some(result)
    }

    /// Applies this matcher's replacements to `text`.
    ///
    /// Returns `None` when nothing in `text` matched. Calling this on a `Delete` matcher is a
    /// programming error: it trips a debug assertion and yields `None` in release builds.
    #[inline(always)]
    pub(crate) fn replace_all(&self, text: &str) -> Option<String> {
        match self {
            ProcessMatcher::SingleChar(matcher) => match matcher {
                SingleCharMatcher::Fanjian { .. } => Self::replace_all_fanjian(matcher, text),
                SingleCharMatcher::Pinyin { .. } => {
                    Self::replace_scan(text, matcher.pinyin_iter(text), |result, m| {
                        if let SingleCharMatch::Str(s) = m {
                            result.push_str(s);
                        }
                    })
                }
                SingleCharMatcher::Delete { .. } => {
                    debug_assert!(false, "replace_all called on Delete matcher");
                    None
                }
            },
            ProcessMatcher::MultiChar(mc) => {
                // Each match carries an index into the matcher's replacement list.
                let replacements = mc.replace_list();
                Self::replace_scan(text, mc.find_iter(text), |result, idx| {
                    result.push_str(replacements[idx]);
                })
            }
        }
    }

    /// Fanjian replacement with an in-place fast path.
    ///
    /// While every replacement char encodes to the same number of bytes as the span it
    /// replaces, a copy of `text` is patched in place. The first replacement with a different
    /// encoded length abandons that copy (handing it back to the pool) and redoes the whole
    /// pass through [`Self::replace_scan`].
    ///
    /// Returns `None` when no span was replaced on the in-place path, or when the fallback
    /// scan finds no match.
    fn replace_all_fanjian(matcher: &SingleCharMatcher, text: &str) -> Option<String> {
        let mut result: Option<String> = None;
        for (start, end, m) in matcher.fanjian_iter(text) {
            // Only char payloads are acted on here; any other payload leaves the text as is.
            let SingleCharMatch::Char(c) = m else {
                continue;
            };
            let span_len = end - start;
            if c.len_utf8() != span_len {
                // Length changes cannot be patched in place: drop the partial copy and rebuild.
                if let Some(s) = result.take() {
                    return_string_to_pool(s);
                }
                return Self::replace_scan(text, matcher.fanjian_iter(text), |r, m| {
                    if let SingleCharMatch::Char(c) = m {
                        r.push(c);
                    }
                });
            }
            // The copy is made lazily so an input without matches costs no buffer.
            let buf = result.get_or_insert_with(|| {
                let mut s = get_string_from_pool(text.len());
                s.push_str(text);
                s
            });
            // SAFETY: `buf` started as a byte-for-byte copy of `text`, and every earlier write
            // swapped a span for a char of identical encoded length, so offsets in `buf` still
            // line up with `text`. Writing the exactly `span_len`-byte encoding of `c` over
            // `start..end` keeps `buf` valid UTF-8 as long as the span covers whole characters
            // of `text`; this relies on `fanjian_iter` reporting char-aligned spans.
            // NOTE(review): that iterator contract is not visible here - confirm.
            unsafe { c.encode_utf8(&mut buf.as_bytes_mut()[start..end]) };
        }
        result
    }

    /// Removes every span reported by the matcher's delete iterator from `text`.
    ///
    /// Returns `None` when nothing was removed. Only a `SingleChar` matcher holding the
    /// `Delete` variant is accepted; anything else is a programming error that trips a debug
    /// assertion and yields `None` in release builds.
    #[inline(always)]
    pub(crate) fn delete_all(&self, text: &str) -> Option<String> {
        // Require the Delete variant itself, not merely any single-char matcher, so the guard
        // matches its own message and mirrors the Delete check in `replace_all`.
        let ProcessMatcher::SingleChar(matcher @ SingleCharMatcher::Delete { .. }) = self else {
            debug_assert!(false, "delete_all called on non-Delete matcher");
            return None;
        };
        // Appending nothing for a span is what deletes it.
        Self::replace_scan(text, matcher.delete_iter(text), |_, _| {})
    }
}
pub(crate) fn get_process_matcher(process_type_bit: ProcessType) -> &'static ProcessMatcher {
let index = process_type_bit.bits().trailing_zeros() as usize;
debug_assert!(index < 8, "ProcessType bit index out of bounds");
PROCESS_MATCHER_CACHE[index].get_or_init(|| {
#[cfg(feature = "runtime_build")]
{
match process_type_bit {
ProcessType::Fanjian => {
let mut map = HashMap::new();
for line in FANJIAN.trim().lines() {
let mut split = line.split('\t');
let k = split.next().unwrap().chars().next().unwrap() as u32;
let v = split.next().unwrap().chars().next().unwrap() as u32;
if k != v {
map.insert(k, v);
}
}
ProcessMatcher::SingleChar(SingleCharMatcher::fanjian_from_map(map))
}
ProcessType::PinYin | ProcessType::PinYinChar => {
let mut map = HashMap::new();
for line in PINYIN.trim().lines() {
let mut split = line.split('\t');
let k = split.next().unwrap().chars().next().unwrap() as u32;
let v = split.next().unwrap();
map.insert(k, v);
}
ProcessMatcher::SingleChar(SingleCharMatcher::pinyin_from_map(
map,
process_type_bit == ProcessType::PinYinChar,
))
}
ProcessType::Delete => ProcessMatcher::SingleChar(
SingleCharMatcher::delete_from_sources(TEXT_DELETE, WHITE_SPACE),
),
ProcessType::Normalize => {
let mut process_dict: HashMap<&'static str, &'static str> = HashMap::new();
for process_map in [NORM, NUM_NORM] {
process_dict.extend(process_map.trim().lines().map(|pair_str| {
let mut split = pair_str.split('\t');
(split.next().unwrap(), split.next().unwrap())
}));
}
process_dict.retain(|&key, &mut value| key != value);
ProcessMatcher::MultiChar(MultiCharMatcher::new_from_dict(process_dict))
}
_ => ProcessMatcher::MultiChar(MultiCharMatcher::new_empty()),
}
}
#[cfg(not(feature = "runtime_build"))]
{
match process_type_bit {
ProcessType::None => ProcessMatcher::MultiChar(MultiCharMatcher::new_empty()),
ProcessType::Fanjian => ProcessMatcher::SingleChar(SingleCharMatcher::fanjian(
Cow::Borrowed(FANJIAN_L1_BYTES),
Cow::Borrowed(FANJIAN_L2_BYTES),
)),
ProcessType::Delete => ProcessMatcher::SingleChar(SingleCharMatcher::delete(
Cow::Borrowed(DELETE_BITSET_BYTES),
)),
ProcessType::Normalize => {
#[cfg(feature = "dfa")]
{
ProcessMatcher::MultiChar(
MultiCharMatcher::new(NORMALIZE_PROCESS_LIST_STR.lines())
.with_replace_list(
NORMALIZE_PROCESS_REPLACE_LIST_STR.lines().collect(),
),
)
}
#[cfg(not(feature = "dfa"))]
{
ProcessMatcher::MultiChar(
MultiCharMatcher::deserialize_from(NORMALIZE_PROCESS_MATCHER_BYTES)
.with_replace_list(
NORMALIZE_PROCESS_REPLACE_LIST_STR.lines().collect(),
),
)
}
}
ProcessType::PinYin => ProcessMatcher::SingleChar(SingleCharMatcher::pinyin(
Cow::Borrowed(PINYIN_L1_BYTES),
Cow::Borrowed(PINYIN_L2_BYTES),
Cow::Borrowed(PINYIN_STR_BYTES),
false,
)),
ProcessType::PinYinChar => ProcessMatcher::SingleChar(SingleCharMatcher::pinyin(
Cow::Borrowed(PINYIN_L1_BYTES),
Cow::Borrowed(PINYIN_L2_BYTES),
Cow::Borrowed(PINYIN_STR_BYTES),
true,
)),
_ => unreachable!(),
}
}
})
}
/// Runs every step contained in `process_type` over `text`, in iteration order, and returns
/// the final text.
///
/// The result borrows `text` when no step changed anything. Whenever a step produces a new
/// string, the previously owned intermediate (if any) is handed back to the string pool.
#[inline(always)]
pub fn text_process<'a>(process_type: ProcessType, text: &'a str) -> Cow<'a, str> {
    let mut current = Cow::Borrowed(text);
    for step in process_type.iter() {
        let matcher = get_process_matcher(step);
        // Pick the operation for this step; `None` steps are skipped outright.
        let rewritten = match step {
            ProcessType::None => continue,
            ProcessType::Delete => matcher.delete_all(current.as_ref()),
            _ => matcher.replace_all(current.as_ref()),
        };
        // A step that changed nothing leaves the current text untouched.
        let Some(next) = rewritten else {
            continue;
        };
        if let Cow::Owned(previous) = std::mem::replace(&mut current, Cow::Owned(next)) {
            return_string_to_pool(previous);
        }
    }
    current
}
fn reduce_text_process_inner<'a>(
process_type: ProcessType,
text: &'a str,
overwrite_replace: bool,
) -> Vec<Cow<'a, str>> {
let mut text_list: Vec<Cow<'a, str>> = Vec::new();
text_list.push(Cow::Borrowed(text));
for process_type_bit in process_type.iter() {
let pm = get_process_matcher(process_type_bit);
let current_text = text_list
.last_mut()
.expect("text_list is never empty (seeded with original text)");
match process_type_bit {
ProcessType::None => continue,
ProcessType::Delete => {
if let Some(processed) = pm.delete_all(current_text.as_ref()) {
text_list.push(Cow::Owned(processed));
}
}
_ => {
if let Some(processed) = pm.replace_all(current_text.as_ref()) {
if overwrite_replace {
*current_text = Cow::Owned(processed);
} else {
text_list.push(Cow::Owned(processed));
}
}
}
}
}
text_list
}
/// Applies the steps of `process_type` to `text` and returns every intermediate variant.
///
/// The list starts with the original text and gains one entry for each step that actually
/// changed the text.
#[inline(always)]
pub fn reduce_text_process<'a>(process_type: ProcessType, text: &'a str) -> Vec<Cow<'a, str>> {
    // Every changing step gets its own entry; nothing is overwritten.
    let overwrite_replace = false;
    reduce_text_process_inner(process_type, text, overwrite_replace)
}
/// Applies the steps of `process_type` to `text`, folding replacement steps into the latest
/// variant.
///
/// The list starts with the original text. A replacement step that changes the text
/// overwrites the most recent entry, while a delete step that changes the text appends a
/// new entry.
#[inline(always)]
pub fn reduce_text_process_emit<'a>(process_type: ProcessType, text: &'a str) -> Vec<Cow<'a, str>> {
    // Replacement results overwrite the latest entry instead of being appended.
    let overwrite_replace = true;
    reduce_text_process_inner(process_type, text, overwrite_replace)
}