use std::collections::{BTreeSet, HashSet};
use daachorse::CharwiseDoubleArrayAhoCorasick;
use crate::header;
/// One occurrence of a pattern-table entry inside a piece of text.
///
/// `find_term_matches` fills this from an automaton match plus the
/// `(source, target)` pair stored in the pattern table for that match.
#[derive(Debug, Clone)]
pub(crate) struct TermHit {
/// Start offset of the match, taken from the automaton match's `start()`.
pub byte_start: usize,
/// End offset of the match, taken from the automaton match's `end()`.
/// Used as an exclusive bound (`byte_start..byte_end`) throughout this file.
pub byte_end: usize,
/// The pattern-table source string that matched.
pub source: String,
/// The pattern-table target string paired with `source`.
/// A hit whose `target` equals its `source` is treated as an identity match.
pub target: String,
}
/// Decodes a serialized pattern table into `(source, target, mask)` triples.
///
/// The layout read here is, with every integer little-endian:
///
/// * a `u32` entry count, then for each entry
/// * one mask byte,
/// * a `u32` byte length followed by that many UTF-8 bytes of source text,
/// * a `u32` byte length followed by that many UTF-8 bytes of target text.
///
/// Any bytes after the last entry are ignored.
///
/// # Panics
///
/// Panics with a descriptive message if `bytes` ends before the data it
/// announces, or if a string field is not valid UTF-8.
pub(crate) fn deserialize_pattern_table(bytes: &[u8]) -> Vec<(String, String, u8)> {
    /// Smallest encodable entry: one mask byte plus two 4-byte length prefixes.
    const MIN_ENTRY_LEN: usize = 1 + 4 + 4;

    /// Returns the next `len` bytes and advances `pos` past them.
    fn take<'a>(bytes: &'a [u8], pos: &mut usize, len: usize) -> &'a [u8] {
        let start = *pos;
        let chunk = start
            .checked_add(len)
            .and_then(|end| bytes.get(start..end))
            .unwrap_or_else(|| {
                panic!(
                    "pattern table truncated: need {} bytes at offset {}, have {}",
                    len,
                    start,
                    bytes.len()
                )
            });
        *pos = start + len;
        chunk
    }

    /// Reads a little-endian `u32` and widens it to `usize`.
    fn take_u32(bytes: &[u8], pos: &mut usize) -> usize {
        let raw = take(bytes, pos, 4);
        u32::from_le_bytes([raw[0], raw[1], raw[2], raw[3]]) as usize
    }

    /// Reads a length-prefixed UTF-8 string.
    fn take_string(bytes: &[u8], pos: &mut usize) -> String {
        let len = take_u32(bytes, pos);
        let raw = take(bytes, pos, len);
        match std::str::from_utf8(raw) {
            Ok(text) => text.to_owned(),
            Err(err) => panic!("pattern table contains invalid UTF-8: {}", err),
        }
    }

    let mut pos = 0;
    let count = take_u32(bytes, &mut pos);

    // The count comes straight from the input; never reserve more entries than
    // the remaining bytes could possibly encode, so a corrupt header cannot
    // trigger an enormous allocation before the truncation is detected.
    let max_entries = (bytes.len() - pos) / MIN_ENTRY_LEN;
    let mut table = Vec::with_capacity(count.min(max_entries));

    for _ in 0..count {
        let mask = take(bytes, &mut pos, 1)[0];
        let src = take_string(bytes, &mut pos);
        let tgt = take_string(bytes, &mut pos);
        table.push((src, tgt, mask));
    }
    table
}
/// Builds a charwise Aho-Corasick automaton over the source side of `patterns`.
///
/// Each source string is registered with its position in `patterns` as the
/// associated value, so a match's value can be used to index back into the
/// same slice. The target side of each pair is not used here.
///
/// # Panics
///
/// Panics if the daachorse builder rejects the pattern set.
pub(crate) fn build_automaton(
    patterns: &[(String, String)],
) -> CharwiseDoubleArrayAhoCorasick<u32> {
    use daachorse::{CharwiseDoubleArrayAhoCorasickBuilder, MatchKind};

    let mut keyed_sources: Vec<(&str, u32)> = Vec::with_capacity(patterns.len());
    for (index, (source, _target)) in patterns.iter().enumerate() {
        // `as` keeps the original narrowing behavior for the pattern index.
        keyed_sources.push((source.as_str(), index as u32));
    }

    let builder = CharwiseDoubleArrayAhoCorasickBuilder::new().match_kind(MatchKind::Standard);
    builder
        .build_with_values(keyed_sources)
        .expect("failed to build daachorse automaton")
}
/// Reconstructs the default automaton from a header-prefixed byte blob.
///
/// The header is checked with `header::verify_header` against
/// `header::SourceMask::CN_HK`; whatever that call returns is handed to
/// daachorse as the serialized automaton. Bytes left over after the automaton
/// are discarded.
pub(crate) fn deserialize_default_automaton(bytes: &[u8]) -> CharwiseDoubleArrayAhoCorasick<u32> {
    let body = header::verify_header(bytes, header::SourceMask::CN_HK);

    // SAFETY: NOTE(review) — `deserialize_unchecked` performs no validation of
    // its input. This presumably relies on `verify_header` only accepting
    // blobs that this project produced by serializing an automaton of the same
    // type; confirm against `header::verify_header` and the build pipeline.
    unsafe { CharwiseDoubleArrayAhoCorasick::<u32>::deserialize_unchecked(body) }.0
}
/// Maps every byte offset of `text` to the index of the `char` containing it.
///
/// The returned vector has `text.len() + 1` entries: entry `b` is the
/// zero-based index of the character whose UTF-8 encoding covers byte `b`,
/// and the final entry (at `text.len()`) is the total number of characters.
pub(crate) fn build_byte_to_cp(text: &str) -> Vec<usize> {
    let mut table = Vec::with_capacity(text.len() + 1);
    let mut char_total = 0usize;

    for (cp_idx, ch) in text.chars().enumerate() {
        // Every byte of this character's encoding maps to the same index.
        table.extend(std::iter::repeat(cp_idx).take(ch.len_utf8()));
        char_total = cp_idx + 1;
    }

    // Sentinel for the one-past-the-end byte offset.
    table.push(char_total);
    table
}
pub(crate) fn find_term_matches(
pma: &CharwiseDoubleArrayAhoCorasick<u32>,
pattern_table: &[(String, String)],
text: &str,
) -> Vec<TermHit> {
let mut all_hits: Vec<TermHit> = pma
.find_overlapping_iter(text)
.map(|m| {
let idx = m.value() as usize;
let (ref src, ref tgt) = pattern_table[idx];
TermHit {
byte_start: m.start(),
byte_end: m.end(),
source: src.clone(),
target: tgt.clone(),
}
})
.collect();
if all_hits.is_empty() {
return Vec::new();
}
all_hits.sort_by(|a, b| {
a.byte_start.cmp(&b.byte_start).then_with(|| {
let len_a = a.byte_end - a.byte_start;
let len_b = b.byte_end - b.byte_start;
len_b.cmp(&len_a) })
});
let mut identity_matches: Vec<&TermHit> = Vec::new();
let mut non_identity_spans: Vec<(usize, usize)> = Vec::new();
for hit in &all_hits {
if hit.source == hit.target {
identity_matches.push(hit);
} else {
non_identity_spans.push((hit.byte_start, hit.byte_end));
}
}
let mut protected: BTreeSet<usize> = BTreeSet::new();
if non_identity_spans.is_empty() {
for hit in &identity_matches {
for b in hit.byte_start..hit.byte_end {
protected.insert(b);
}
}
} else {
non_identity_spans.sort_by_key(|&(s, _)| s);
let mut prefix_max_end: Vec<usize> = Vec::with_capacity(non_identity_spans.len());
let mut running_max = 0usize;
for &(_, end) in &non_identity_spans {
running_max = running_max.max(end);
prefix_max_end.push(running_max);
}
for hit in &identity_matches {
if !is_contained_in_non_identity(
hit.byte_start,
hit.byte_end,
&non_identity_spans,
&prefix_max_end,
) {
for b in hit.byte_start..hit.byte_end {
protected.insert(b);
}
}
}
}
let mut result: Vec<TermHit> = Vec::new();
let mut cursor: usize = 0;
for hit in &all_hits {
if hit.byte_start < cursor {
continue; }
let is_identity = hit.source == hit.target;
if !is_identity {
if (hit.byte_start..hit.byte_end).any(|b| protected.contains(&b)) {
continue; }
}
cursor = hit.byte_end;
if !is_identity {
result.push(hit.clone());
}
}
result
}
/// Returns every offset of `text` that lies inside at least one automaton match.
///
/// All overlapping matches are considered; each contributes the half-open
/// range `start()..end()` to the returned set.
pub(crate) fn get_covered_positions(
    pma: &CharwiseDoubleArrayAhoCorasick<u32>,
    text: &str,
) -> HashSet<usize> {
    pma.find_overlapping_iter(text)
        .flat_map(|m| m.start()..m.end())
        .collect()
}
/// Reports whether `start..end` lies inside the reach of some span in `spans`.
///
/// `spans` must be sorted by start offset, and `prefix_max_end[i]` must hold
/// the largest end offset among `spans[..=i]`. The range is considered
/// contained when a span starting at or before `start` ends at or after `end`.
///
/// Returns `false` when `spans` is empty or every span starts after `start`.
fn is_contained_in_non_identity(
    start: usize,
    end: usize,
    spans: &[(usize, usize)],
    prefix_max_end: &[usize],
) -> bool {
    // Number of spans whose start is <= `start`. Unlike a plain binary search,
    // this always lands after the *last* such span, so spans sharing a start
    // offset are all taken into account regardless of their relative order.
    let eligible = spans.partition_point(|&(span_start, _)| span_start <= start);

    match eligible.checked_sub(1) {
        Some(last) => prefix_max_end[last] >= end,
        None => false,
    }
}
/// Converts `segment` character by character, leaving covered positions alone.
///
/// For each character at byte index `i` of `segment`:
///
/// * if `offset + i` is in `covered`, the character is copied unchanged;
/// * otherwise the `balanced` map (when given) is consulted first, and if it
///   yields a different character that one is used;
/// * otherwise `char_map` is consulted, falling back to the character itself.
pub(crate) fn apply_layers_skipping(
    segment: &str,
    char_map: &phf::Map<char, char>,
    balanced: Option<&phf::Map<char, char>>,
    covered: &HashSet<usize>,
    offset: usize,
) -> String {
    let converted = segment.char_indices().map(|(byte_idx, ch)| {
        if covered.contains(&(offset + byte_idx)) {
            return ch;
        }

        // A balanced entry only wins when it actually changes the character;
        // a self-mapping falls through to `char_map`.
        let balanced_choice = balanced
            .and_then(|table| table.get(&ch))
            .copied()
            .filter(|&mapped| mapped != ch);

        match balanced_choice {
            Some(mapped) => mapped,
            None => char_map.get(&ch).copied().unwrap_or(ch),
        }
    });

    let mut output = String::with_capacity(segment.len());
    output.extend(converted);
    output
}
#[cfg(test)]
mod tests {
    use super::*;

    /// ASCII text: every byte is its own character.
    #[test]
    fn byte_to_cp_ascii() {
        let map = build_byte_to_cp("hello");
        assert_eq!(map.len(), 6);
        assert_eq!(map[0], 0);
        assert_eq!(map[1], 1);
        assert_eq!(map[5], 5);
    }

    /// A three-byte CJK character followed by one ASCII byte.
    #[test]
    fn byte_to_cp_cjk() {
        let map = build_byte_to_cp("中X");
        assert_eq!(map[0], 0);
        // Continuation bytes map to the character they belong to.
        assert_eq!(map[1], 0);
        assert_eq!(map[2], 0);
        assert_eq!(map[3], 1);
        assert_eq!(map[4], 2);
    }

    /// A four-byte supplementary-plane character followed by one ASCII byte.
    #[test]
    fn byte_to_cp_supplementary() {
        let map = build_byte_to_cp("𠮷A");
        assert_eq!(map[0], 0);
        assert_eq!(map[3], 0);
        assert_eq!(map[4], 1);
        assert_eq!(map[5], 2);
    }

    /// Empty text still yields the one-past-the-end entry.
    #[test]
    fn byte_to_cp_empty() {
        assert_eq!(build_byte_to_cp(""), vec![0]);
    }
}