use std::borrow::Cow;
use std::cell::RefCell;
#[cfg(feature = "runtime_build")]
use std::collections::HashMap;
use std::collections::HashSet;
use std::fmt::Display;
use std::sync::OnceLock;
use bitflags::bitflags;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use crate::process::constants::*;
use crate::process::multi_char_matcher::MultiCharMatcher;
use crate::process::single_char_matcher::{
DeleteFindIter, FanjianFindIter, PinyinFindIter, SingleCharMatch, SingleCharMatcher,
};
// Capacity pre-allocated for each thread's `STRING_POOL` vector when it is created.
const STRING_POOL_INIT_CAP: usize = 16;
// Capacity pre-allocated for each thread's `REDUCE_STATE` scratch vector when it is created.
const REDUCE_STATE_INIT_CAP: usize = 16;
// Capacity pre-allocated for each thread's `MASKS_POOL` vector when it is created.
const MASKS_POOL_INIT_CAP: usize = 4;
// Upper bound on the number of strings kept in `STRING_POOL`; strings returned
// while the pool already holds this many are dropped instead.
const STRING_POOL_MAX: usize = 128;
// Upper bound on the number of vectors kept in `MASKS_POOL`; vectors returned
// while the pool already holds this many are dropped instead.
const MASKS_POOL_MAX: usize = 16;
// Per-thread scratch storage that lets the processing functions reuse
// allocations between calls.
thread_local! {
// Recycled `String` buffers handed out by `get_string_from_pool` and refilled
// by `return_string_to_pool`.
static STRING_POOL: RefCell<Vec<String>> = RefCell::new(Vec::with_capacity(STRING_POOL_INIT_CAP));
// Scratch vector used by `reduce_text_process_with_tree` to remember, for each
// tree node, the index of its text variant.
static REDUCE_STATE: RefCell<Vec<usize>> = RefCell::new(Vec::with_capacity(REDUCE_STATE_INIT_CAP));
// Recycled (emptied) `ProcessedTextMasks` vectors. They are stored with a
// `'static` lifetime parameter, which is only sound because they are drained
// before being pushed here.
static MASKS_POOL: RefCell<Vec<ProcessedTextMasks<'static>>> =
RefCell::new(Vec::with_capacity(MASKS_POOL_INIT_CAP));
}
/// Hands out an empty `String` with room for at least `capacity` bytes.
///
/// A buffer is popped from this thread's `STRING_POOL` when one is available;
/// otherwise a fresh `String` is allocated.
fn get_string_from_pool(capacity: usize) -> String {
    let recycled = STRING_POOL.with(|pool| pool.borrow_mut().pop());
    match recycled {
        Some(mut s) => {
            s.clear();
            // `reserve` counts from the current length (0 after `clear`), not
            // from the current capacity, so the full `capacity` must be
            // requested. It is a no-op when the buffer is already big enough.
            s.reserve(capacity);
            s
        }
        None => String::with_capacity(capacity),
    }
}
/// Gives a `String` buffer back to this thread's `STRING_POOL`.
///
/// Once the pool already holds `STRING_POOL_MAX` buffers the string is simply
/// dropped.
fn return_string_to_pool(s: String) {
    STRING_POOL.with(|cell| {
        let mut slots = cell.borrow_mut();
        if slots.len() >= STRING_POOL_MAX {
            // Pool is full: let `s` be dropped when the closure ends.
            return;
        }
        slots.push(s);
    });
}
pub fn return_processed_string_to_pool(mut processed_text_process_type_masks: ProcessedTextMasks) {
for (cow, _) in processed_text_process_type_masks.drain(..) {
if let Cow::Owned(s) = cow {
return_string_to_pool(s);
}
}
let empty: ProcessedTextMasks<'static> =
unsafe { std::mem::transmute(processed_text_process_type_masks) };
MASKS_POOL.with(|pool| {
let mut pool = pool.borrow_mut();
if pool.len() < MASKS_POOL_MAX {
pool.push(empty);
}
});
}
bitflags! {
/// Set of text-processing steps, stored as bit flags in a `u8`.
///
/// Single-bit flags select one processing step each; `DeleteNormalize` and
/// `FanjianDeleteNormalize` are shorthands that combine several single bits.
#[derive(Hash, PartialEq, Eq, Clone, Copy, Debug, Default)]
pub struct ProcessType: u8 {
/// No transformation: the processing functions in this file leave the text
/// unchanged for this bit.
const None = 0b00000001;
/// Character replacement driven by the `FANJIAN` tables
/// (presumably Traditional-to-Simplified Chinese conversion; confirm against the tables).
const Fanjian = 0b00000010;
/// Character deletion; applied through `ProcessMatcher::delete_all`.
const Delete = 0b00000100;
/// Multi-character replacement driven by the normalization tables.
const Normalize = 0b00001000;
/// Shorthand for `Delete | Normalize`.
const DeleteNormalize = 0b00001100;
/// Shorthand for `Fanjian | Delete | Normalize`.
const FanjianDeleteNormalize = 0b00001110;
/// Character-to-string replacement driven by the `PINYIN` tables.
const PinYin = 0b00010000;
/// Same tables as `PinYin`, but the matcher is constructed with its boolean
/// option set to `true` (appears to correspond to `trim_space`; confirm in
/// `SingleCharMatcher`).
const PinYinChar = 0b00100000;
}
}
/// Serializes a `ProcessType` as its raw `u8` bit pattern.
impl Serialize for ProcessType {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let raw_bits: u8 = self.bits();
        serializer.serialize_u8(raw_bits)
    }
}
/// Deserializes a `ProcessType` from a raw `u8` bit pattern.
///
/// Bits that do not correspond to a declared flag are kept as-is
/// (`from_bits_retain`), so no validation error is produced for them.
impl<'de> Deserialize<'de> for ProcessType {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        u8::deserialize(deserializer).map(ProcessType::from_bits_retain)
    }
}
/// Formats the set flags as their lower-cased names joined by `_`.
impl Display for ProcessType {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut joined = String::new();
        for (position, (name, _)) in self.iter_names().enumerate() {
            if position != 0 {
                joined.push('_');
            }
            joined.push_str(&name.to_lowercase());
        }
        // NOTE(review): `{:?}` prints the joined string with surrounding quotes
        // (and escaping). Kept as-is; confirm that this is the intended output.
        write!(f, "{:?}", joined)
    }
}
// Process-wide cache of lazily built matchers: one slot per bit position of the
// `u8` behind `ProcessType`. `get_process_matcher` indexes it with the number of
// trailing zeros of the requested flag.
static PROCESS_MATCHER_CACHE: [OnceLock<ProcessMatcher>; 8] = [
OnceLock::new(),
OnceLock::new(),
OnceLock::new(),
OnceLock::new(),
OnceLock::new(),
OnceLock::new(),
OnceLock::new(),
OnceLock::new(),
];
/// Text variants paired with a `u64` bitmask.
///
/// The producers in this file set bit `1 << process_type.bits()` in the mask
/// for every `ProcessType` they associate with that text variant.
pub type ProcessedTextMasks<'a> = Vec<(Cow<'a, str>, u64)>;
/// Matcher implementing one processing step; built and cached by
/// `get_process_matcher`.
#[derive(Clone)]
pub enum ProcessMatcher {
/// Multi-character pattern matcher (built for `Normalize`, and as an empty
/// matcher for `None`).
MultiChar(MultiCharMatcher),
/// Single-character table matcher (built for `Fanjian`, `Delete`, `PinYin`
/// and `PinYinChar`).
SingleChar(SingleCharMatcher),
}
impl ProcessMatcher {
    /// Rebuilds `text` with every span reported by `iter` replaced.
    ///
    /// `iter` yields `(start, end, payload)` byte ranges; for each one the
    /// untouched text before it is copied and `push_replacement` appends
    /// whatever should stand in for the span. The ranges are assumed to be
    /// ascending and non-overlapping (the slicing below would panic otherwise).
    ///
    /// Returns `(false, Cow::Borrowed(text))` when `iter` is empty, otherwise
    /// `(true, Cow::Owned(..))` backed by a buffer from the string pool.
    #[inline(always)]
    fn replace_scan<'a, I, M, F>(
        text: &'a str,
        mut iter: I,
        mut push_replacement: F,
    ) -> (bool, Cow<'a, str>)
    where
        I: Iterator<Item = (usize, usize, M)>,
        F: FnMut(&mut String, M),
    {
        // Nothing matched: hand the input back without allocating.
        let Some(first) = iter.next() else {
            return (false, Cow::Borrowed(text));
        };

        let mut rebuilt = get_string_from_pool(text.len());
        let mut cursor = 0;
        for (start, end, m) in std::iter::once(first).chain(iter) {
            rebuilt.push_str(&text[cursor..start]);
            push_replacement(&mut rebuilt, m);
            cursor = end;
        }
        // Copy whatever follows the last match.
        rebuilt.push_str(&text[cursor..]);
        (true, Cow::Owned(rebuilt))
    }

    /// Applies this matcher's replacements to `text`.
    ///
    /// Returns `(changed, text)` as described on `replace_scan`. Calling this
    /// on a `Delete` matcher is a programming error: it trips a debug
    /// assertion and otherwise returns the input unchanged.
    #[inline(always)]
    pub fn replace_all<'a>(&self, text: &'a str) -> (bool, Cow<'a, str>) {
        match self {
            ProcessMatcher::MultiChar(mc) => {
                let replacements = mc.replace_list();
                Self::replace_scan(text, mc.find_iter(text), |out, idx| {
                    out.push_str(replacements[idx])
                })
            }
            ProcessMatcher::SingleChar(SingleCharMatcher::Fanjian { l1, l2 }) => {
                let finder = FanjianFindIter {
                    l1,
                    l2,
                    text,
                    byte_offset: 0,
                };
                Self::replace_scan(text, finder, |out, m| match m {
                    SingleCharMatch::Char(c) => out.push(c),
                    _ => {}
                })
            }
            ProcessMatcher::SingleChar(SingleCharMatcher::Pinyin {
                l1,
                l2,
                strings,
                trim_space,
            }) => {
                let finder = PinyinFindIter {
                    l1,
                    l2,
                    strings,
                    trim_space: *trim_space,
                    text,
                    byte_offset: 0,
                };
                Self::replace_scan(text, finder, |out, m| match m {
                    SingleCharMatch::Str(s) => out.push_str(s),
                    _ => {}
                })
            }
            ProcessMatcher::SingleChar(SingleCharMatcher::Delete { .. }) => {
                debug_assert!(false, "replace_all called on Delete matcher");
                (false, Cow::Borrowed(text))
            }
        }
    }

    /// Removes every span matched by this `Delete` matcher from `text`.
    ///
    /// Returns `(changed, text)` as described on `replace_scan`. Calling this
    /// on any other matcher is a programming error: it trips a debug assertion
    /// and otherwise returns the input unchanged.
    #[inline(always)]
    pub fn delete_all<'a>(&self, text: &'a str) -> (bool, Cow<'a, str>) {
        match self {
            ProcessMatcher::SingleChar(SingleCharMatcher::Delete { bitset, ascii_lut }) => {
                let finder = DeleteFindIter {
                    bitset,
                    ascii_lut: *ascii_lut,
                    text,
                    byte_offset: 0,
                };
                // Deleting means appending nothing in place of each match.
                Self::replace_scan(text, finder, |_, _| {})
            }
            _ => {
                debug_assert!(false, "delete_all called on non-Delete matcher");
                (false, Cow::Borrowed(text))
            }
        }
    }
}
/// Returns the cached matcher for a single processing step, building it on
/// first use.
///
/// `process_type_bit` is expected to have exactly one bit set. The cache slot
/// is selected by its lowest set bit, and the matcher stored in that slot is
/// always built for that bit alone, so an argument with several bits set
/// resolves to the matcher of its lowest bit instead of storing a wrong
/// matcher in another flag's slot.
///
/// # Panics
///
/// Panics if `process_type_bit` is empty (there is no slot for it). Without the
/// `runtime_build` feature it also panics for a bit that has no declared flag.
pub fn get_process_matcher(process_type_bit: ProcessType) -> &'static ProcessMatcher {
    let index = process_type_bit.bits().trailing_zeros() as usize;
    debug_assert!(index < 8, "ProcessType bit index out of bounds");
    PROCESS_MATCHER_CACHE[index].get_or_init(|| {
        // Build from the slot's own bit rather than from the raw argument: the
        // slot is keyed by the lowest set bit only, so using the raw argument
        // would let a multi-bit value cache the wrong matcher here.
        // `index < 8` is guaranteed because the array access above succeeded.
        let process_type_bit = ProcessType::from_bits_retain(1u8 << index);
        #[cfg(feature = "runtime_build")]
        {
            match process_type_bit {
                ProcessType::Fanjian => {
                    // Each table line is `<from>\t<to>`; identity pairs are skipped.
                    let mut map = HashMap::new();
                    for line in FANJIAN.trim().lines() {
                        let mut split = line.split('\t');
                        let k = split.next().unwrap().chars().next().unwrap() as u32;
                        let v = split.next().unwrap().chars().next().unwrap() as u32;
                        if k != v {
                            map.insert(k, v);
                        }
                    }
                    ProcessMatcher::SingleChar(SingleCharMatcher::fanjian_from_map(map))
                }
                ProcessType::PinYin | ProcessType::PinYinChar => {
                    // Each table line is `<char>\t<replacement string>`.
                    let mut map = HashMap::new();
                    for line in PINYIN.trim().lines() {
                        let mut split = line.split('\t');
                        let k = split.next().unwrap().chars().next().unwrap() as u32;
                        let v = split.next().unwrap();
                        map.insert(k, v);
                    }
                    ProcessMatcher::SingleChar(SingleCharMatcher::pinyin_from_map(
                        map,
                        process_type_bit == ProcessType::PinYinChar,
                    ))
                }
                ProcessType::Delete => ProcessMatcher::SingleChar(
                    SingleCharMatcher::delete_from_sources(TEXT_DELETE, WHITE_SPACE),
                ),
                ProcessType::Normalize => {
                    // Merge both normalization tables and drop identity pairs.
                    let mut process_dict: HashMap<&'static str, &'static str> = HashMap::new();
                    for process_map in [NORM, NUM_NORM] {
                        process_dict.extend(process_map.trim().lines().map(|pair_str| {
                            let mut split = pair_str.split('\t');
                            (split.next().unwrap(), split.next().unwrap())
                        }));
                    }
                    process_dict.retain(|&key, &mut value| key != value);
                    ProcessMatcher::MultiChar(MultiCharMatcher::new_from_dict(process_dict))
                }
                _ => ProcessMatcher::MultiChar(MultiCharMatcher::new_empty()),
            }
        }
        #[cfg(not(feature = "runtime_build"))]
        {
            match process_type_bit {
                ProcessType::None => ProcessMatcher::MultiChar(MultiCharMatcher::new_empty()),
                ProcessType::Fanjian => ProcessMatcher::SingleChar(SingleCharMatcher::fanjian(
                    Cow::Borrowed(FANJIAN_L1_BYTES),
                    Cow::Borrowed(FANJIAN_L2_BYTES),
                )),
                ProcessType::Delete => ProcessMatcher::SingleChar(SingleCharMatcher::delete(
                    Cow::Borrowed(DELETE_BITSET_BYTES),
                )),
                ProcessType::Normalize => {
                    #[cfg(feature = "dfa")]
                    {
                        ProcessMatcher::MultiChar(
                            MultiCharMatcher::new(NORMALIZE_PROCESS_LIST_STR.lines())
                                .with_replace_list(
                                    NORMALIZE_PROCESS_REPLACE_LIST_STR.lines().collect(),
                                ),
                        )
                    }
                    #[cfg(not(feature = "dfa"))]
                    {
                        ProcessMatcher::MultiChar(
                            MultiCharMatcher::deserialize_from(NORMALIZE_PROCESS_MATCHER_BYTES)
                                .with_replace_list(
                                    NORMALIZE_PROCESS_REPLACE_LIST_STR.lines().collect(),
                                ),
                        )
                    }
                }
                ProcessType::PinYin => ProcessMatcher::SingleChar(SingleCharMatcher::pinyin(
                    Cow::Borrowed(PINYIN_L1_BYTES),
                    Cow::Borrowed(PINYIN_L2_BYTES),
                    Cow::Borrowed(PINYIN_STR_BYTES),
                    false,
                )),
                ProcessType::PinYinChar => ProcessMatcher::SingleChar(SingleCharMatcher::pinyin(
                    Cow::Borrowed(PINYIN_L1_BYTES),
                    Cow::Borrowed(PINYIN_L2_BYTES),
                    Cow::Borrowed(PINYIN_STR_BYTES),
                    true,
                )),
                _ => unreachable!(),
            }
        }
    })
}
/// Applies every step contained in `process_type_bit` to `text`, one after the
/// other, and returns the final text.
///
/// The result borrows `text` when no step changed anything; otherwise it owns
/// the output of the last step that did.
#[inline(always)]
pub fn text_process<'a>(process_type_bit: ProcessType, text: &'a str) -> Cow<'a, str> {
    let mut current = Cow::Borrowed(text);
    for step in process_type_bit.iter() {
        let matcher = get_process_matcher(step);
        let outcome = match step {
            ProcessType::None => continue,
            ProcessType::Delete => matcher.delete_all(current.as_ref()),
            _ => matcher.replace_all(current.as_ref()),
        };
        // Only a reported change carrying an owned buffer replaces the
        // running text; anything else leaves it as it was.
        if let (true, Cow::Owned(next)) = outcome {
            current = Cow::Owned(next);
        }
    }
    current
}
/// Applies the steps of `process_type` in sequence and records every
/// intermediate result.
///
/// The returned list starts with the borrowed input. Each step that changes the
/// most recent entry appends its output as a new entry; steps that change
/// nothing add nothing.
#[inline(always)]
pub fn reduce_text_process<'a>(process_type: ProcessType, text: &'a str) -> Vec<Cow<'a, str>> {
    let mut variants: Vec<Cow<'a, str>> = Vec::new();
    variants.push(Cow::Borrowed(text));
    for step in process_type.iter() {
        let matcher = get_process_matcher(step);
        let latest = variants
            .last()
            .expect("It should always have at least one element");
        let outcome = match step {
            ProcessType::None => continue,
            ProcessType::Delete => matcher.delete_all(latest.as_ref()),
            _ => matcher.replace_all(latest.as_ref()),
        };
        match outcome {
            (true, Cow::Owned(next)) => variants.push(Cow::Owned(next)),
            (false, _) => {}
            // A reported change always comes with an owned buffer.
            (_, _) => unreachable!(),
        }
    }
    variants
}
/// Variant of `reduce_text_process` in which only `Delete` adds a new entry.
///
/// The returned list starts with the borrowed input. A `Delete` step that
/// changes the most recent entry appends its output; every other step that
/// changes the most recent entry overwrites that entry in place.
#[inline(always)]
pub fn reduce_text_process_emit<'a>(process_type: ProcessType, text: &'a str) -> Vec<Cow<'a, str>> {
    let mut emitted: Vec<Cow<'a, str>> = Vec::new();
    emitted.push(Cow::Borrowed(text));
    for step in process_type.iter() {
        let matcher = get_process_matcher(step);
        let tail = emitted
            .last_mut()
            .expect("It should always have at least one element");
        match step {
            ProcessType::None => {}
            ProcessType::Delete => {
                let (changed, deleted) = matcher.delete_all(tail.as_ref());
                if changed {
                    // A reported change always comes with an owned buffer.
                    let Cow::Owned(owned) = deleted else {
                        unreachable!()
                    };
                    emitted.push(Cow::Owned(owned));
                }
            }
            _ => {
                let (changed, replaced) = matcher.replace_all(tail.as_ref());
                if changed {
                    // A reported change always comes with an owned buffer.
                    let Cow::Owned(owned) = replaced else {
                        unreachable!()
                    };
                    *tail = Cow::Owned(owned);
                }
            }
        }
    }
    emitted
}
/// One node of the tree built by `build_process_type_tree`; nodes refer to each
/// other by index into the containing `Vec`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ProcessTypeBitNode {
// Every requested `ProcessType` whose path of single-bit steps passes through
// (or ends at) this node.
process_type_list: Vec<ProcessType>,
// The single processing step this node applies to its parent's text.
process_type_bit: ProcessType,
// Indices (into the tree vector) of this node's child nodes.
children: Vec<usize>,
}
/// Builds a prefix tree over the single-bit steps of every `ProcessType` in
/// `process_type_set`.
///
/// Index 0 is the root (step `ProcessType::None`, empty `process_type_list`).
/// Each requested `ProcessType` is walked bit by bit from the root; process
/// types that start with the same steps share the corresponding nodes. The
/// process type is recorded in the `process_type_list` of every node on its
/// path. A node is always stored after its parent, so a child's index is
/// greater than its parent's.
///
/// # Panics
///
/// Panics if a value in `process_type_set` contains bits that are not declared
/// `ProcessType` flags.
pub fn build_process_type_tree(process_type_set: &HashSet<u8>) -> Vec<ProcessTypeBitNode> {
    let mut process_type_tree = vec![ProcessTypeBitNode {
        process_type_list: Vec::new(),
        process_type_bit: ProcessType::None,
        children: Vec::new(),
    }];

    for &process_type_bits in process_type_set {
        let process_type = ProcessType::from_bits(process_type_bits)
            .expect("process_type_set contains bits that are not a valid ProcessType");
        let mut current_node_index = 0;

        for process_type_bit in process_type.iter() {
            let current_node = &process_type_tree[current_node_index];
            // The step is already represented by the node we are standing on
            // (this is how the `None` bit is absorbed by the root).
            if current_node.process_type_bit == process_type_bit {
                continue;
            }

            let existing_child = current_node
                .children
                .iter()
                .copied()
                .find(|&child| process_type_tree[child].process_type_bit == process_type_bit);

            match existing_child {
                Some(child) => {
                    // Shared prefix: descend and record this process type too.
                    process_type_tree[child].process_type_list.push(process_type);
                    current_node_index = child;
                }
                None => {
                    let new_node_index = process_type_tree.len();
                    process_type_tree.push(ProcessTypeBitNode {
                        process_type_list: vec![process_type],
                        process_type_bit,
                        children: Vec::new(),
                    });
                    process_type_tree[current_node_index]
                        .children
                        .push(new_node_index);
                    current_node_index = new_node_index;
                }
            }
        }
    }
    process_type_tree
}
/// Resolves the index in `text_masks` of the text produced by a processing step.
///
/// * `changed == None`: the step changed nothing, so `current_index` is returned.
/// * `changed == Some(text)` and an equal text is already stored: the duplicate
///   buffer goes back to the string pool and the existing index is returned.
/// * otherwise the text is appended with an empty mask and its new index is
///   returned.
#[inline(always)]
fn dedup_insert(
    text_masks: &mut ProcessedTextMasks<'_>,
    current_index: usize,
    changed: Option<String>,
) -> usize {
    let Some(candidate) = changed else {
        return current_index;
    };

    let already_stored = text_masks
        .iter()
        .position(|(existing, _)| existing.as_ref() == candidate.as_str());

    match already_stored {
        Some(existing_index) => {
            return_string_to_pool(candidate);
            existing_index
        }
        None => {
            let new_index = text_masks.len();
            text_masks.push((Cow::Owned(candidate), 0u64));
            new_index
        }
    }
}
/// Runs every processing step of a prebuilt tree over `text` and returns the
/// distinct text variants together with their process-type masks.
///
/// Entry 0 of the result is the borrowed input, with the bit for
/// `ProcessType::None` set. For each tree node, its step is applied to its
/// parent's text; equal results are stored only once (see `dedup_insert`), and
/// the resulting entry gets bit `1 << pt.bits()` set for every process type
/// listed on the node.
///
/// `process_type_tree` is assumed to list every parent before its children, as
/// produced by `build_process_type_tree`.
///
/// The result vector may come from a per-thread pool; pass it to
/// `return_processed_string_to_pool` when done to recycle it.
#[inline(always)]
pub fn reduce_text_process_with_tree<'a>(
    process_type_tree: &[ProcessTypeBitNode],
    text: &'a str,
) -> ProcessedTextMasks<'a> {
    REDUCE_STATE.with(|state| {
        // node_indices[n] = index in `text_masks` of the text belonging to node n.
        let mut node_indices = state.borrow_mut();
        node_indices.clear();
        node_indices.resize(process_type_tree.len(), 0);

        let pooled: Option<ProcessedTextMasks<'static>> = MASKS_POOL.with(|p| p.borrow_mut().pop());
        // Shortening `'static` to `'a` is ordinary covariance, so no `unsafe`
        // is needed to reuse a pooled vector here.
        let mut text_masks: ProcessedTextMasks<'a> = pooled.unwrap_or_default();
        text_masks.clear();
        text_masks.push((Cow::Borrowed(text), 1u64 << ProcessType::None.bits()));

        for (current_node_index, current_node) in process_type_tree.iter().enumerate() {
            let current_index = node_indices[current_node_index];
            for &child_node_index in &current_node.children {
                let child_node = &process_type_tree[child_node_index];
                let pm = get_process_matcher(child_node.process_type_bit);
                // `Some(text)` only when the step actually changed the parent's text.
                let changed = match child_node.process_type_bit {
                    ProcessType::None => None,
                    ProcessType::Delete => {
                        let current_text = text_masks[current_index].0.as_ref();
                        match pm.delete_all(current_text) {
                            (true, Cow::Owned(pt)) => Some(pt),
                            _ => None,
                        }
                    }
                    _ => {
                        let current_text = text_masks[current_index].0.as_ref();
                        match pm.replace_all(current_text) {
                            (true, Cow::Owned(pt)) => Some(pt),
                            _ => None,
                        }
                    }
                };
                let child_index = dedup_insert(&mut text_masks, current_index, changed);
                node_indices[child_node_index] = child_index;
                let entry = &mut text_masks[child_index];
                entry.1 |= child_node
                    .process_type_list
                    .iter()
                    .fold(0u64, |mask, pt| mask | (1u64 << pt.bits()));
            }
        }
        text_masks
    })
}
#[inline(always)]
pub fn reduce_text_process_with_set<'a>(
process_type_set: &HashSet<u8>,
text: &'a str,
) -> ProcessedTextMasks<'a> {
let mut process_type_tree = Vec::with_capacity(8);
let root = ProcessTypeBitNode {
process_type_list: Vec::new(),
process_type_bit: ProcessType::None,
children: Vec::new(),
};
process_type_tree.push(root);
let mut node_indices = Vec::with_capacity(8);
node_indices.push(0);
let mut text_masks: ProcessedTextMasks<'a> = Vec::new();
text_masks.push((Cow::Borrowed(text), 1u64 << ProcessType::None.bits()));
for process_type_bits in process_type_set.iter() {
let process_type = ProcessType::from_bits(*process_type_bits).unwrap();
let mut current_node_index = 0;
for process_type_bit in process_type.iter() {
let current_node = &process_type_tree[current_node_index];
if current_node.process_type_bit == process_type_bit {
continue;
}
let mut is_found = false;
for child_node_index in ¤t_node.children {
if process_type_bit == process_type_tree[*child_node_index].process_type_bit {
current_node_index = *child_node_index;
is_found = true;
break;
}
}
if !is_found {
let current_index = node_indices[current_node_index];
let pm = get_process_matcher(process_type_bit);
let changed = match process_type_bit {
ProcessType::None => None,
ProcessType::Delete => {
let current_text = text_masks[current_index].0.as_ref();
match pm.delete_all(current_text) {
(true, Cow::Owned(pt)) => Some(pt),
_ => None,
}
}
_ => {
let current_text = text_masks[current_index].0.as_ref();
match pm.replace_all(current_text) {
(true, Cow::Owned(pt)) => Some(pt),
_ => None,
}
}
};
let child_index = dedup_insert(&mut text_masks, current_index, changed);
let mut child = ProcessTypeBitNode {
process_type_list: Vec::new(),
process_type_bit,
children: Vec::new(),
};
child.process_type_list.push(process_type);
process_type_tree.push(child);
node_indices.push(child_index);
let new_node_index = process_type_tree.len() - 1;
process_type_tree[current_node_index]
.children
.push(new_node_index);
current_node_index = new_node_index;
} else {
process_type_tree[current_node_index]
.process_type_list
.push(process_type);
}
let current_index = node_indices[current_node_index];
let entry = &mut text_masks[current_index];
entry.1 |= 1u64 << process_type.bits();
}
}
text_masks
}