use crate::char_struct::{CharType, KoreanChar};
/// Document-level flags stored on `EncoderState` as its `doc_summary` field.
///
/// The derived `Default` value has every flag set to `false`. Nothing in this
/// file computes the flags; they are filled in elsewhere.
// NOTE(review): the field names suggest these describe how much of the document
// is English text — confirm against the code that populates this struct.
#[derive(Clone, Copy, Debug, Default)]
pub struct DocumentSummary {
    pub has_english_context_for_korean: bool,
    pub is_english_majority: bool,
    pub is_english_dominant: bool,
}
/// The modes an `EncoderState` can hold on its mode stack.
///
/// Each variant can be parsed from a lower-case name through the `FromStr`
/// implementation in this file; the accepted name is noted on each variant.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum EncodingMode {
    /// Parsed from `"korean"`. This is the mode `EncoderState::new` starts in
    /// and the fallback `EncoderState::current_mode` returns for an empty stack.
    Korean,
    /// Parsed from `"english"`.
    English,
    /// Parsed from `"math"`.
    Math,
    /// Parsed from `"number"`.
    Number,
    /// Parsed from `"middle_korean"`.
    MiddleKorean,
    /// Parsed from `"object_symbol"`.
    ObjectSymbol,
    /// Parsed from `"ipa"`.
    Ipa,
}
/// Parses an [`EncodingMode`] from its lower-case, snake_case name.
///
/// Matching is exact: no trimming and no case folding is performed, so
/// `"KOREAN"` and `""` are both rejected.
impl std::str::FromStr for EncodingMode {
    /// Unit error: the only failure is "not a known mode name".
    type Err = ();

    /// Returns the variant named by `s`, or `Err(())` for any other input.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let mode = match s {
            "korean" => Self::Korean,
            "english" => Self::English,
            "math" => Self::Math,
            "number" => Self::Number,
            "middle_korean" => Self::MiddleKorean,
            "object_symbol" => Self::ObjectSymbol,
            "ipa" => Self::Ipa,
            // Anything else, including differently-cased names, is unknown.
            _ => return Err(()),
        };
        Ok(mode)
    }
}
/// Mutable encoder state: a stack of [`EncodingMode`]s plus a set of public
/// flags, stacks and counters.
///
/// Only `mode_stack` is manipulated by the methods in this file. The other
/// fields are initialised by `new` and are otherwise read and written from
/// outside this file.
// NOTE(review): the precise meaning of the individual flags cannot be confirmed
// from this file; consult the rule code that uses them before relying on a name.
#[derive(Clone, Debug)]
pub struct EncoderState {
    /// Mode stack. `new` seeds it with `EncodingMode::Korean`; `current_mode`
    /// reads the top entry and `push_mode` / `pop_mode` grow and shrink it.
    pub mode_stack: Vec<EncodingMode>,
    pub is_english: bool,
    /// Copied from the argument given to `new`.
    pub english_indicator: bool,
    pub triple_big_english: bool,
    pub has_processed_word: bool,
    pub needs_english_continuation: bool,
    pub roman_number_chain: bool,
    /// Starts empty in `new`.
    pub parenthesis_stack: Vec<bool>,
    pub is_number: bool,
    pub is_big_english: bool,
    pub english_dominant_wrap_active: bool,
    pub english_dominant_no_indicator: bool,
    /// Document-level flags; `new` fills this with `DocumentSummary::default()`.
    pub doc_summary: DocumentSummary,
    pub matrix_context_active: bool,
    pub math_mode_active: bool,
    /// Starts at `0` in `new`.
    pub unmatched_open_single_quotes: i32,
}
impl EncoderState {
    /// Creates a fresh state.
    ///
    /// The mode stack holds only `EncodingMode::Korean`, `english_indicator` is
    /// taken from the argument, every other flag is `false`, both stacks other
    /// than the mode stack are empty, the quote counter is `0`, and
    /// `doc_summary` is its `Default` value.
    pub fn new(english_indicator: bool) -> Self {
        Self {
            // The only caller-supplied value.
            english_indicator,
            // Non-trivial initial values.
            mode_stack: vec![EncodingMode::Korean],
            parenthesis_stack: Vec::new(),
            doc_summary: DocumentSummary::default(),
            unmatched_open_single_quotes: 0,
            // Every remaining flag starts cleared.
            is_english: false,
            triple_big_english: false,
            has_processed_word: false,
            needs_english_continuation: false,
            roman_number_chain: false,
            is_number: false,
            is_big_english: false,
            english_dominant_wrap_active: false,
            english_dominant_no_indicator: false,
            matrix_context_active: false,
            math_mode_active: false,
        }
    }

    /// Returns the mode on top of the stack, or `EncodingMode::Korean` if the
    /// stack is empty (possible because `mode_stack` is a public field).
    pub fn current_mode(&self) -> EncodingMode {
        match self.mode_stack.last() {
            Some(&top) => top,
            None => EncodingMode::Korean,
        }
    }

    /// Pushes `mode` so that it becomes the current mode.
    pub fn push_mode(&mut self, mode: EncodingMode) {
        self.mode_stack.push(mode);
    }

    /// Pops and returns the current mode, unless that would remove the last
    /// remaining entry.
    ///
    /// Returns `None` and leaves the stack untouched when it holds one entry
    /// or fewer.
    pub fn pop_mode(&mut self) -> Option<EncodingMode> {
        if self.mode_stack.len() <= 1 {
            return None;
        }
        self.mode_stack.pop()
    }
}
/// Borrowed working set for one position inside a word: the word's characters,
/// a cursor into them, information about neighbouring words, and mutable
/// handles to the encoder state and the output buffer.
///
/// Everything is borrowed for `'a`; the context owns none of its data.
pub struct RuleContext<'a> {
    /// Characters of the word being processed.
    pub word_chars: &'a [char],
    /// Cursor into `word_chars`; `current_char` reads `word_chars[index]`.
    pub index: usize,
    /// Classification of the current character; `as_korean` inspects it.
    pub char_type: &'a CharType,
    pub prev_word: &'a str,
    pub remaining_words: &'a [&'a str],
    pub has_korean_char: bool,
    pub is_all_uppercase: bool,
    pub ascii_starts_at_beginning: bool,
    // NOTE(review): presumably lets the holder ask the caller to skip upcoming
    // characters — not used in this file, confirm against the call sites.
    pub skip_count: &'a mut usize,
    /// Shared encoder state, mutably borrowed.
    pub state: &'a mut EncoderState,
    /// Output byte buffer; `emit` and `emit_slice` append to it.
    pub result: &'a mut Vec<u8>,
}
impl<'a> RuleContext<'a> {
    /// Returns the character at `index` in `word_chars`.
    ///
    /// # Panics
    ///
    /// Panics if `index` is not a valid position in `word_chars`.
    pub fn current_char(&self) -> char {
        self.word_chars[self.index]
    }

    /// Returns the character immediately after `index`, or `None` when there
    /// is no such character.
    pub fn next_char(&self) -> Option<char> {
        // `checked_add` keeps an `index` of `usize::MAX` from overflowing.
        let next = self.index.checked_add(1)?;
        self.word_chars.get(next).copied()
    }

    /// Returns the character immediately before `index`, or `None` when there
    /// is no such character.
    ///
    /// Like [`next_char`](Self::next_char), this never panics: an `index` of
    /// `0`, or one lying more than one position past the end of `word_chars`,
    /// simply yields `None`.
    pub fn prev_char(&self) -> Option<char> {
        let prev = self.index.checked_sub(1)?;
        self.word_chars.get(prev).copied()
    }

    /// Returns the number of characters in the word.
    pub fn word_len(&self) -> usize {
        self.word_chars.len()
    }

    /// Returns the payload of `char_type` when it is `CharType::Korean`, and
    /// `None` for every other classification.
    pub fn as_korean(&self) -> Option<&KoreanChar> {
        match self.char_type {
            CharType::Korean(korean) => Some(korean),
            _ => None,
        }
    }

    /// Appends a single byte to the output buffer.
    pub fn emit(&mut self, byte: u8) {
        self.result.push(byte);
    }

    /// Appends every byte of `bytes` to the output buffer, in order.
    pub fn emit_slice(&mut self, bytes: &[u8]) {
        self.result.extend_from_slice(bytes);
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::str::FromStr;

    /// Every supported mode name parses to its matching variant.
    #[test]
    fn encoding_mode_from_str_all_variants() {
        let cases = [
            ("korean", EncodingMode::Korean),
            ("english", EncodingMode::English),
            ("math", EncodingMode::Math),
            ("number", EncodingMode::Number),
            ("middle_korean", EncodingMode::MiddleKorean),
            ("object_symbol", EncodingMode::ObjectSymbol),
            ("ipa", EncodingMode::Ipa),
        ];
        for &(name, expected) in cases.iter() {
            assert_eq!(EncodingMode::from_str(name), Ok(expected));
        }
    }

    /// Unknown, empty and differently-cased names are all rejected.
    #[test]
    fn encoding_mode_from_str_unknown_returns_err() {
        let rejected = ["unknown", "", "KOREAN"];
        for name in rejected.iter() {
            assert!(EncodingMode::from_str(name).is_err());
        }
    }
}