use std::borrow::Cow;
use std::sync::Arc;
use ahash::AHashMap;
use aho_corasick::{
AhoCorasick, AhoCorasickBuilder, AhoCorasickKind::DFA, MatchKind as AhoCorasickMatchKind,
};
#[allow(unused_imports)]
use daachorse::{
CharwiseDoubleArrayAhoCorasick, CharwiseDoubleArrayAhoCorasickBuilder,
MatchKind as DoubleArrayAhoCorasickMatchKind,
};
use lazy_static::lazy_static;
use nohash_hasher::IntMap;
use parking_lot::RwLock;
#[cfg(feature = "prebuilt")]
use crate::process::constants::prebuilt_feature::*;
#[cfg(feature = "runtime_build")]
use crate::process::constants::runtime_build_feature::*;
use crate::SimpleMatchType;
/// Lock-protected map from a [`SimpleMatchType`] to the shared
/// `(replacement list, matcher)` pair produced by `get_process_matcher`.
type ProcessMatcherCache =
RwLock<IntMap<SimpleMatchType, Arc<(Vec<&'static str>, ProcessMatcher)>>>;
lazy_static! {
// Global cache consulted (read lock) and filled (write lock) by
// `get_process_matcher`; it starts out empty.
pub static ref PROCESS_MATCHER_CACHE: ProcessMatcherCache = RwLock::new(IntMap::default());
}
/// Automaton used to find the spans that a text-processing step replaces or
/// deletes.
#[derive(Clone)]
pub enum ProcessMatcher {
/// Character-wise double-array Aho-Corasick automaton (`daachorse`) carrying
/// `u64` match values; `get_process_matcher` uses it for the `Fanjian`,
/// `PinYin` and `PinYinChar` match types.
Chinese(CharwiseDoubleArrayAhoCorasick<u64>),
/// `aho_corasick` automaton; `get_process_matcher` uses it for every other
/// match type.
Others(AhoCorasick),
}
impl ProcessMatcher {
    /// Replaces every match found in `text` with its associated replacement.
    ///
    /// The replacement for a match is `process_replace_list[i]`, where `i` is
    /// the match value for the `Chinese` automaton and the pattern id for the
    /// `Others` automaton.
    ///
    /// Returns `(true, Cow::Owned(rewritten))` when at least one match was
    /// found, otherwise `(false, Cow::Borrowed(text))` without allocating.
    ///
    /// # Panics
    ///
    /// Panics if a match refers to an index outside `process_replace_list`, or
    /// if the automaton reports offsets that are not char boundaries of `text`.
    /// (Earlier revisions used unchecked accesses here, which made both cases
    /// undefined behaviour reachable from safe code.)
    #[inline(always)]
    pub fn replace_all<'a>(
        &self,
        text: &'a str,
        process_replace_list: &[&str],
    ) -> (bool, Cow<'a, str>) {
        match self {
            ProcessMatcher::Chinese(ac) => Self::splice(
                text,
                ac.find_iter(text).map(|mat| {
                    (
                        mat.start(),
                        mat.end(),
                        process_replace_list[mat.value() as usize],
                    )
                }),
            ),
            ProcessMatcher::Others(ac) => Self::splice(
                text,
                ac.find_iter(text).map(|mat| {
                    (
                        mat.start(),
                        mat.end(),
                        process_replace_list[mat.pattern().as_usize()],
                    )
                }),
            ),
        }
    }

    /// Removes every match found in `text`.
    ///
    /// Returns `(true, Cow::Owned(stripped))` when at least one match was
    /// found, otherwise `(false, Cow::Borrowed(text))` without allocating.
    ///
    /// # Panics
    ///
    /// Panics if the automaton reports offsets that are not char boundaries of
    /// `text`.
    #[inline(always)]
    pub fn delete_all<'a>(&self, text: &'a str) -> (bool, Cow<'a, str>) {
        match self {
            ProcessMatcher::Chinese(ac) => Self::splice(
                text,
                ac.find_iter(text).map(|mat| (mat.start(), mat.end(), "")),
            ),
            ProcessMatcher::Others(ac) => Self::splice(
                text,
                ac.find_iter(text).map(|mat| (mat.start(), mat.end(), "")),
            ),
        }
    }

    /// Rebuilds `text` with each `(start, end, replacement)` span substituted.
    ///
    /// `spans` must be ordered and non-overlapping, which is what the
    /// `find_iter` iterators used above yield. The output buffer is allocated
    /// lazily on the first span so that unmatched input costs no allocation.
    #[inline]
    fn splice<'a, 'r, I>(text: &'a str, spans: I) -> (bool, Cow<'a, str>)
    where
        I: Iterator<Item = (usize, usize, &'r str)>,
    {
        let mut rebuilt: Option<String> = None;
        let mut last_end = 0;
        for (start, end, replacement) in spans {
            let buf = rebuilt.get_or_insert_with(|| String::with_capacity(text.len()));
            // Checked slicing: a bad offset panics instead of producing an
            // invalid `str`.
            buf.push_str(&text[last_end..start]);
            buf.push_str(replacement);
            last_end = end;
        }
        match rebuilt {
            Some(mut buf) => {
                // Copy whatever follows the final match.
                buf.push_str(&text[last_end..]);
                (true, Cow::Owned(buf))
            }
            None => (false, Cow::Borrowed(text)),
        }
    }
}
/// Returns the shared `(replacement list, matcher)` pair for
/// `simple_match_type_bit`, building it from the conversion tables on first
/// use and caching it in `PROCESS_MATCHER_CACHE`.
///
/// The replacement list is index-aligned with the patterns given to the
/// automaton, so a match's pattern index (or value) selects its replacement.
/// Match types without a dedicated arm below get an automaton with no
/// patterns and an empty replacement list.
///
/// # Panics
///
/// Panics if a line of a replacement table has no tab-separated second
/// column, or if either automaton builder rejects the pattern set.
#[cfg(feature = "runtime_build")]
pub fn get_process_matcher(
    simple_match_type_bit: SimpleMatchType,
) -> Arc<(Vec<&'static str>, ProcessMatcher)> {
    // Splits one `key<TAB>value` table line into its two columns.
    fn split_pair(line: &'static str) -> (&'static str, &'static str) {
        let mut columns = line.split('\t');
        (
            columns.next().expect("conversion table line has no key column"),
            columns
                .next()
                .expect("conversion table line has no tab-separated value column"),
        )
    }

    // Fast path: hand out the cached pair while holding only the read lock.
    {
        let process_matcher_cache = PROCESS_MATCHER_CACHE.read();
        if let Some(cached_result) = process_matcher_cache.get(&simple_match_type_bit) {
            return Arc::clone(cached_result);
        }
    }

    let mut process_dict: AHashMap<&'static str, &'static str> = AHashMap::default();
    match simple_match_type_bit {
        SimpleMatchType::None => {}
        SimpleMatchType::Fanjian => {
            for str_conv_map in [FANJIAN, UNICODE] {
                process_dict.extend(str_conv_map.trim().lines().map(split_pair));
            }
        }
        SimpleMatchType::WordDelete => {
            process_dict.extend(
                PUNCTUATION_SPECIAL
                    .trim()
                    .lines()
                    .map(|pair_str| (pair_str, "")),
            );
            process_dict.extend(WHITE_SPACE.iter().map(|&c| (c, "")));
        }
        SimpleMatchType::TextDelete => {
            for str_conv_map in [PUNCTUATION_SPECIAL, CN_SPECIAL, EN_SPECIAL] {
                process_dict.extend(str_conv_map.trim().lines().map(|pair_str| (pair_str, "")));
            }
            process_dict.extend(WHITE_SPACE.iter().map(|&c| (c, "")));
        }
        SimpleMatchType::Normalize => {
            for str_conv_map in [UPPER_LOWER, EN_VARIATION, NUM_NORM] {
                process_dict.extend(str_conv_map.trim().lines().map(split_pair));
            }
        }
        SimpleMatchType::PinYin => {
            process_dict.extend(PINYIN.trim().lines().map(split_pair));
        }
        SimpleMatchType::PinYinChar => {
            process_dict.extend(PINYIN_CHAR.trim().lines().map(split_pair));
        }
        _ => {}
    }

    // Drop keys that start with '#' (except the literal "#" itself) and
    // entries whose key equals its value.
    process_dict.retain(|&key, &mut value| (key == "#" || !key.starts_with('#')) && key != value);

    // A single pass over the map keeps pattern `i` aligned with replacement `i`.
    let (process_list, process_replace_list): (Vec<&'static str>, Vec<&'static str>) =
        process_dict.iter().map(|(&key, &val)| (key, val)).unzip();

    // Each automaton must be wrapped in its `ProcessMatcher` variant: the bare
    // automaton types differ between the arms and are not what is cached.
    let process_matcher = match simple_match_type_bit {
        SimpleMatchType::Fanjian | SimpleMatchType::PinYin | SimpleMatchType::PinYinChar => {
            ProcessMatcher::Chinese(
                CharwiseDoubleArrayAhoCorasickBuilder::new()
                    .match_kind(DoubleArrayAhoCorasickMatchKind::Standard)
                    .build(process_list)
                    .expect("failed to build the charwise double-array automaton"),
            )
        }
        _ => ProcessMatcher::Others(
            AhoCorasickBuilder::new()
                .kind(Some(DFA))
                .match_kind(AhoCorasickMatchKind::LeftmostLongest)
                .build(process_list)
                .expect("failed to build the Aho-Corasick DFA"),
        ),
    };

    let uncached_result = Arc::new((process_replace_list, process_matcher));
    // The read lock was released above, so two threads may both build the
    // same entry; the later insert simply replaces the earlier one.
    let mut process_matcher_cache = PROCESS_MATCHER_CACHE.write();
    process_matcher_cache.insert(simple_match_type_bit, Arc::clone(&uncached_result));
    uncached_result
}
/// Returns the shared `(replacement list, matcher)` pair for
/// `simple_match_type_bit`, creating and caching it in
/// `PROCESS_MATCHER_CACHE` on first use.
///
/// The `Fanjian`, `PinYin` and `PinYinChar` automatons are deserialized from
/// embedded bytes; the `Normalize` and delete automatons are built from the
/// embedded pattern tables. The delete match types return an empty
/// replacement list.
///
/// # Panics
///
/// Panics if an `aho_corasick` build fails, and hits `unreachable!()` for any
/// match type that has no dedicated arm below.
#[cfg(feature = "prebuilt")]
pub fn get_process_matcher(
    simple_match_type_bit: SimpleMatchType,
) -> Arc<(Vec<&'static str>, ProcessMatcher)> {
    // Builds the leftmost-longest DFA that matches every token listed in
    // `tables` plus the `WHITE_SPACE` entries.
    fn deletion_matcher(tables: &[&'static str]) -> ProcessMatcher {
        let mut tokens = AHashMap::new();
        for &table in tables {
            tokens.extend(table.trim().lines().map(|line| (line, "")));
        }
        tokens.extend(WHITE_SPACE.iter().map(|&blank| (blank, "")));
        // Keep non-empty tokens; of those starting with '#', only "#" itself.
        tokens.retain(|&key, &mut value| key != value && (key == "#" || !key.starts_with('#')));
        let patterns = tokens.iter().map(|(&key, _)| key).collect::<Vec<&str>>();
        ProcessMatcher::Others(
            AhoCorasickBuilder::new()
                .kind(Some(DFA))
                .match_kind(AhoCorasickMatchKind::LeftmostLongest)
                .build(&patterns)
                .unwrap(),
        )
    }

    // Restores a charwise automaton from its embedded serialized form.
    fn charwise_from_bytes(bytes: &[u8]) -> ProcessMatcher {
        // SAFETY: NOTE(review): relies on `bytes` being an unmodified
        // serialization of a `CharwiseDoubleArrayAhoCorasick<u64>`; presumably
        // guaranteed by the build step that embeds it — confirm.
        let (automaton, _unread) =
            unsafe { CharwiseDoubleArrayAhoCorasick::<u64>::deserialize_unchecked(bytes) };
        ProcessMatcher::Chinese(automaton)
    }

    // Fast path: the read guard is a temporary released at the end of this
    // statement.
    let cached = PROCESS_MATCHER_CACHE
        .read()
        .get(&simple_match_type_bit)
        .cloned();
    if let Some(hit) = cached {
        return hit;
    }

    let entry: (Vec<&'static str>, ProcessMatcher) = match simple_match_type_bit {
        SimpleMatchType::None => (
            Vec::new(),
            ProcessMatcher::Others(AhoCorasick::new(Vec::<&str>::new()).unwrap()),
        ),
        SimpleMatchType::Fanjian => (
            FANJIAN_PROCESS_REPLACE_LIST_STR.lines().collect(),
            charwise_from_bytes(FANJIAN_PROCESS_MATCHER_BYTES),
        ),
        SimpleMatchType::WordDelete => (Vec::new(), deletion_matcher(&[PUNCTUATION_SPECIAL])),
        SimpleMatchType::TextDelete => (
            Vec::new(),
            deletion_matcher(&[PUNCTUATION_SPECIAL, CN_SPECIAL, EN_SPECIAL]),
        ),
        SimpleMatchType::Normalize => (
            NORMALIZE_PROCESS_REPLACE_LIST_STR.lines().collect(),
            ProcessMatcher::Others(
                AhoCorasickBuilder::new()
                    .kind(Some(DFA))
                    .match_kind(AhoCorasickMatchKind::LeftmostLongest)
                    .build(NORMALIZE_PROCESS_LIST_STR.lines())
                    .unwrap(),
            ),
        ),
        SimpleMatchType::PinYin => (
            PINYIN_PROCESS_REPLACE_LIST_STR.lines().collect(),
            charwise_from_bytes(PINYIN_PROCESS_MATCHER_BYTES),
        ),
        SimpleMatchType::PinYinChar => (
            PINYINCHAR_PROCESS_REPLACE_LIST_STR.lines().collect(),
            charwise_from_bytes(PINYINCHAR_PROCESS_MATCHER_BYTES),
        ),
        _ => unreachable!(),
    };

    let shared = Arc::new(entry);
    PROCESS_MATCHER_CACHE
        .write()
        .insert(simple_match_type_bit, Arc::clone(&shared));
    shared
}