use std::{
cmp::{max, min},
fmt::Debug,
};
pub use self::chewing::ChewingEngine;
pub use self::fuzzy::FuzzyChewingEngine;
pub use self::simple::SimpleEngine;
pub(crate) use self::symbol::{full_width_symbol_input, special_symbol_input};
use crate::{dictionary::Dictionary, zhuyin::Syllable};
mod chewing;
mod fuzzy;
mod simple;
mod symbol;
/// An engine that turns a [`Composition`] into conversion results.
///
/// Implementors must also implement [`Debug`].
pub trait ConversionEngine: Debug {
/// Converts `comp` with the help of `dict` and returns the resulting [`Outcome`]s.
///
/// NOTE(review): how many outcomes are returned and in which order is up to
/// each implementation; it cannot be told from this declaration.
fn convert<'a>(&'a self, dict: &'a dyn Dictionary, comp: &'a Composition) -> Vec<Outcome>;
}
/// One result returned by [`ConversionEngine::convert`].
#[derive(Debug, Default, Clone, PartialEq)]
pub struct Outcome {
/// Intervals that make up this result.
pub(crate) intervals: Vec<Interval>,
/// Score attached to this result; presumably a log-probability, judging by
/// the field name — TODO confirm against the engines that fill it in.
pub(crate) log_prob: f64,
}
/// A span of positions `start..end` together with the text assigned to it.
///
/// The derived ordering and hashing use the fields in declaration order.
#[derive(Default, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)]
pub struct Interval {
/// First position covered by the interval.
pub start: usize,
/// Position one past the last covered one (the span is `start..end`).
pub end: usize,
/// Flag carried along with the interval; presumably marks text that came
/// from a phrase entry — TODO confirm.
pub is_phrase: bool,
/// Text assigned to the span.
pub text: Box<str>,
}
impl Debug for Interval {
    /// Renders the interval compactly as `I(start..end, "text")`.
    ///
    /// The `is_phrase` flag is intentionally left out of the output.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let span = self.start..self.end;
        let mut tuple = f.debug_tuple("I");
        tuple.field(&span);
        tuple.field(&self.text);
        tuple.finish()
    }
}
impl Interval {
    /// Returns `true` if `other` lies entirely within this interval.
    pub fn contains(&self, other: &Interval) -> bool {
        self.contains_range(other.start, other.end)
    }

    /// Returns `true` if the span `start..end` lies entirely within this interval.
    fn contains_range(&self, start: usize, end: usize) -> bool {
        self.start <= start && self.end >= end
    }

    /// Returns `true` if this interval lies entirely within the span `start..end`.
    fn is_contained_by(&self, start: usize, end: usize) -> bool {
        start <= self.start && end >= self.end
    }

    /// Returns `true` if this interval and `other` share at least one position.
    pub fn intersect(&self, other: &Interval) -> bool {
        self.intersect_range(other.start, other.end)
    }

    /// Returns `true` if this interval shares at least one position with `start..end`.
    ///
    /// Spans that merely touch (one ends where the other starts) do not intersect.
    fn intersect_range(&self, start: usize, end: usize) -> bool {
        max(self.start, start) < min(self.end, end)
    }

    /// Returns the number of positions covered by the interval.
    ///
    /// Expects `start <= end`; otherwise the subtraction underflows (a panic
    /// in builds with overflow checks enabled).
    pub fn len(&self) -> usize {
        self.end - self.start
    }

    /// Returns `true` if the interval covers no position.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Splits the interval into one single-position interval per character of `text`.
    ///
    /// The `n`-th character is placed at `start + n .. start + n + 1` and
    /// inherits `is_phrase`. The number of produced intervals follows the
    /// number of characters in `text`, not `len()`.
    ///
    /// The returned iterator borrows `self.text`; the `'_` bound states that
    /// borrow explicitly so the signature does not rely on edition-specific
    /// implicit lifetime capture.
    pub fn sub_intervals(&self) -> impl Iterator<Item = Interval> + '_ {
        // Copy the scalars out so the `move` closure owns everything it needs
        // and only the character iterator keeps borrowing `self`.
        let base = self.start;
        let is_phrase = self.is_phrase;
        self.text.chars().enumerate().map(move |(offset, ch)| {
            let start = base + offset;
            Interval {
                start,
                end: start + 1,
                is_phrase,
                text: ch.to_string().into_boxed_str(),
            }
        })
    }
}
/// Kind of boundary recorded for each symbol position of a [`Composition`].
///
/// The derived ordering follows the declaration order of the variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Gap {
/// Marker kept at index 0 of a composition; `Composition::set_gap` refuses
/// to assign it.
Begin,
/// Boundary that, when assigned through `Composition::set_gap`, drops every
/// selection spanning across that position.
Break,
/// NOTE(review): not interpreted anywhere in this file; presumably asks the
/// engine to keep the neighbouring symbols together — confirm in the engines.
Glue,
/// Gap given to newly inserted symbols.
Normal,
}
/// A single unit stored in a [`Composition`]: either a syllable or a plain character.
///
/// The derived ordering places every `Syllable` before every `Char`.
#[derive(Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Symbol {
/// A [`Syllable`] value.
Syllable(Syllable),
/// A single `char` that is not a syllable.
Char(char),
}
impl Debug for Symbol {
    /// Renders the symbol compactly: `S("<syllable text>")` for syllables and
    /// `C('<char>')` for characters.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match *self {
            Symbol::Char(ch) => {
                let mut tuple = f.debug_tuple("C");
                tuple.field(&ch);
                tuple.finish()
            }
            Symbol::Syllable(syl) => {
                // The syllable is shown through its string form rather than
                // its own `Debug` output.
                let rendered = syl.to_string();
                let mut tuple = f.debug_tuple("S");
                tuple.field(&rendered);
                tuple.finish()
            }
        }
    }
}
impl Symbol {
    /// Returns `true` if this symbol holds a syllable.
    pub fn is_syllable(&self) -> bool {
        self.to_syllable().is_some()
    }

    /// Returns `true` if this symbol holds a plain character.
    pub fn is_char(&self) -> bool {
        self.to_char().is_some()
    }

    /// Returns the wrapped syllable, or `None` for a character symbol.
    pub fn to_syllable(self) -> Option<Syllable> {
        if let Symbol::Syllable(syllable) = self {
            Some(syllable)
        } else {
            None
        }
    }

    /// Returns the wrapped character, or `None` for a syllable symbol.
    pub fn to_char(self) -> Option<char> {
        if let Symbol::Char(c) = self {
            Some(c)
        } else {
            None
        }
    }
}
impl From<Syllable> for Symbol {
fn from(value: Syllable) -> Self {
Symbol::Syllable(value)
}
}
impl From<char> for Symbol {
fn from(value: char) -> Self {
Symbol::Char(value)
}
}
/// The input being composed: a sequence of symbols, one gap marker per symbol,
/// and the selections that have been recorded over spans of those symbols.
#[derive(Debug, Default, Clone)]
pub struct Composition {
// Symbols in input order.
symbols: Vec<Symbol>,
// One gap per symbol; kept the same length as `symbols`, with index 0 set
// to `Gap::Begin` whenever the list is non-empty.
gaps: Vec<Gap>,
// Recorded selections. Removal uses `swap_remove`, so the order of this
// list is not stable.
selections: Vec<Interval>,
}
impl Composition {
    /// Creates an empty composition.
    pub fn new() -> Composition {
        Self::default()
    }

    /// Returns the number of symbols in the composition.
    ///
    /// # Panics
    ///
    /// Panics if the symbol list and the gap list have different lengths.
    pub fn len(&self) -> usize {
        assert_eq!(self.symbols.len(), self.gaps.len());
        self.symbols.len()
    }

    /// Returns `true` if the composition holds no symbol.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns the symbol at `index`, or `None` if `index` is out of range.
    pub fn symbol(&self, index: usize) -> Option<Symbol> {
        if index < self.len() {
            Some(self.symbols[index])
        } else {
            None
        }
    }

    /// Returns all symbols in order.
    pub fn symbols(&self) -> &[Symbol] {
        &self.symbols
    }

    /// Returns the recorded selections.
    ///
    /// The order of the slice is not stable: removing selections reorders
    /// the remaining ones.
    pub fn selections(&self) -> &[Interval] {
        &self.selections
    }

    /// Returns the gap stored at `index`, or `None` if `index` is out of range.
    pub fn gap(&self, index: usize) -> Option<Gap> {
        if index < self.len() {
            Some(self.gaps[index])
        } else {
            None
        }
    }

    /// Returns the gap stored at `index + 1`, or `None` if there is no such position.
    ///
    /// `index == usize::MAX` yields `None` instead of overflowing.
    pub fn gap_after(&self, index: usize) -> Option<Gap> {
        let next = index.checked_add(1)?;
        self.gap(next)
    }

    /// Sets the gap at `index`.
    ///
    /// Index 0 always keeps `Gap::Begin`, so a call with `index == 0` is a
    /// no-op. Setting `Gap::Break` drops every selection that spans across
    /// `index` (that is, `start < index < end`).
    ///
    /// # Panics
    ///
    /// Panics if `index` is out of range or if `gap` is `Gap::Begin`.
    pub fn set_gap(&mut self, index: usize, gap: Gap) {
        assert!(index < self.len());
        assert_ne!(gap, Gap::Begin);
        if index == 0 {
            return;
        }
        if gap == Gap::Break {
            self.prune_selections(|sel| sel.start < index && index < sel.end);
        }
        self.gaps[index] = gap;
    }

    /// Appends `sym` at the end of the composition.
    pub fn push(&mut self, sym: Symbol) {
        self.insert(self.len(), sym);
    }

    /// Inserts `sym` so that it ends up at position `index`.
    ///
    /// Selections spanning across `index` are dropped; selections starting at
    /// or after `index` are shifted right by one. The new symbol and the
    /// symbol that previously sat at `index` both get `Gap::Normal`, and
    /// position 0 is (re)marked as `Gap::Begin`.
    ///
    /// # Panics
    ///
    /// Panics if `index > self.len()`.
    pub fn insert(&mut self, index: usize, sym: Symbol) {
        assert!(index <= self.len());
        self.prune_selections(|sel| {
            if sel.start < index && index < sel.end {
                return true;
            }
            if sel.start >= index {
                sel.start += 1;
                sel.end += 1;
            }
            false
        });
        self.symbols.insert(index, sym);
        // Reset the gap of the symbol being pushed to the right (if any)
        // before inserting the gap of the new symbol in front of it.
        if let Some(displaced) = self.gaps.get_mut(index) {
            *displaced = Gap::Normal;
        }
        self.gaps.insert(index, Gap::Normal);
        self.gaps[0] = Gap::Begin;
    }

    /// Replaces the symbol at `index` with `sym` and resets its gap to
    /// `Gap::Normal` (index 0 keeps `Gap::Begin`). Selections are left as is.
    ///
    /// # Panics
    ///
    /// Panics if `index` is out of range.
    pub fn replace(&mut self, index: usize, sym: Symbol) {
        assert!(index < self.len());
        self.symbols[index] = sym;
        self.set_gap(index, Gap::Normal);
    }

    /// Records `interval` as a selection.
    ///
    /// Existing selections that intersect `interval` are dropped first, and
    /// every gap strictly inside the interval is reset to `Gap::Normal`.
    ///
    /// # Panics
    ///
    /// Panics if `interval.end > self.len()`.
    pub fn push_selection(&mut self, interval: Interval) {
        assert!(interval.end <= self.len());
        self.prune_selections(|sel| sel.intersect(&interval));
        // The gap at `interval.start` is the boundary in front of the
        // selection and is deliberately left untouched.
        for i in (interval.start..interval.end).skip(1) {
            self.gaps[i] = Gap::Normal;
        }
        self.selections.push(interval);
    }

    /// Removes the first `n` symbols.
    ///
    /// Selections starting inside the removed prefix are dropped; the others
    /// are shifted left by `n`. The new first position, if any, is marked as
    /// `Gap::Begin`.
    ///
    /// # Panics
    ///
    /// Panics if `n > self.len()`.
    pub fn remove_front(&mut self, n: usize) {
        assert!(n <= self.len());
        self.prune_selections(|sel| {
            if sel.start < n {
                true
            } else {
                sel.start -= n;
                sel.end -= n;
                false
            }
        });
        self.symbols.drain(0..n);
        self.gaps.drain(0..n);
        if let Some(first) = self.gaps.first_mut() {
            *first = Gap::Begin;
        }
    }

    /// Removes the symbol at `index`.
    ///
    /// Selections covering `index` are dropped, selections starting after
    /// `index` are shifted left by one, and selections ending at or before
    /// `index` are kept unchanged. The first position, if any remains, is
    /// marked as `Gap::Begin`.
    ///
    /// # Panics
    ///
    /// Panics if `index` is out of range.
    pub fn remove(&mut self, index: usize) {
        assert!(index < self.len());
        self.prune_selections(|sel| {
            if sel.start > index {
                sel.start -= 1;
                sel.end -= 1;
                false
            } else {
                index < sel.end
            }
        });
        self.symbols.remove(index);
        self.gaps.remove(index);
        if let Some(first) = self.gaps.first_mut() {
            *first = Gap::Begin;
        }
    }

    /// Removes all symbols, gaps and selections.
    pub fn clear(&mut self) {
        self.symbols.clear();
        self.gaps.clear();
        self.selections.clear();
    }

    /// Visits every selection once; `visit` may adjust the selection in place
    /// and returns `true` when the selection must be dropped.
    fn prune_selections<F>(&mut self, mut visit: F)
    where
        F: FnMut(&mut Interval) -> bool,
    {
        // Walk from the back: `swap_remove` fills the hole with the last
        // element, which at that point has already been visited and kept, so
        // nothing is visited twice or skipped.
        for i in (0..self.selections.len()).rev() {
            if visit(&mut self.selections[i]) {
                self.selections.swap_remove(i);
            }
        }
    }
}