use std::borrow::Cow;
use crate::rules::context::DocumentSummary;
use crate::rules::token::{Token, WordMeta, WordToken};
use crate::rules::token_rule::{TokenAction, TokenPhase, TokenRule};
use crate::utils::is_korean_char;
// Byte pairs emitted as `Token::PreEncoded` immediately before (`HANGUL_WRAP_START`) and
// after (`HANGUL_WRAP_END`) every Korean segment that `build_wrapped_replacement` wraps.
// NOTE(review): what the numeric values encode is not visible in this file; confirm
// against whatever consumes `Token::PreEncoded`.
const HANGUL_WRAP_START: [u8; 2] = [56, 55]; const HANGUL_WRAP_END: [u8; 2] = [56, 62];
/// Token rule that surrounds Korean segments appearing in English context with the
/// `HANGUL_WRAP_START` / `HANGUL_WRAP_END` pre-encoded markers.
///
/// The rule carries no state of its own; it reads and updates the `EncoderState`
/// passed to `apply`.
pub struct EnglishDominantKoreanWrapRule;
/// Builds an owned `Token::Word` from `text`.
///
/// The token stores an owned copy of `text`, its characters, and the metadata
/// derived from those characters via `WordMeta::from_chars`.
fn build_word_token<'a>(text: &str) -> Token<'a> {
    let chars: Vec<char> = text.chars().collect();
    // Derive the metadata while `chars` is still borrowed so the vector can be
    // moved into the token afterwards instead of being cloned.
    let meta = WordMeta::from_chars(&chars);
    Token::Word(WordToken {
        text: Cow::Owned(text.to_owned()),
        chars,
        meta,
    })
}
/// Returns the closest `Token::Word` before `index`.
///
/// `Token::Space` and `Token::PreEncoded` tokens are skipped; if any other
/// non-word token is met first, or the start of the slice is reached, the
/// result is `None`.
///
/// # Panics
/// Panics if `index > tokens.len()` (slice indexing).
#[cfg(not(tarpaulin_include))]
fn prev_word_token<'a, 'b>(tokens: &'b [Token<'a>], index: usize) -> Option<&'b WordToken<'a>> {
    tokens[..index]
        .iter()
        .rev()
        // The first token that is neither a space nor pre-encoded decides the outcome.
        .find(|token| !matches!(token, Token::Space(_) | Token::PreEncoded(_)))
        .and_then(|token| match token {
            Token::Word(found) => Some(found),
            _ => None,
        })
}
/// Returns the closest `Token::Word` after `index`.
///
/// `Token::Space` and `Token::PreEncoded` tokens are skipped; if any other
/// non-word token is met first, or the end of the slice is reached, the
/// result is `None`.
#[cfg(not(tarpaulin_include))]
fn next_word_token<'a, 'b>(tokens: &'b [Token<'a>], index: usize) -> Option<&'b WordToken<'a>> {
    let mut remaining = tokens
        .iter()
        .skip(index + 1)
        .skip_while(|token| matches!(token, Token::Space(_) | Token::PreEncoded(_)));
    // Whatever follows the skipped run decides the outcome.
    match remaining.next() {
        Some(Token::Word(found)) => Some(found),
        _ => None,
    }
}
/// Returns `true` when the word's metadata reports at least one ASCII letter
/// and no Korean character.
fn word_is_english_only(word: &WordToken<'_>) -> bool {
    let meta = &word.meta;
    meta.has_ascii_alphabetic && !meta.has_korean
}
/// Returns `true` when `chars` contains no ASCII letter, no ASCII digit and no
/// Korean character. An empty slice therefore counts as punctuation-only.
fn is_punct_only(chars: &[char]) -> bool {
    !chars
        .iter()
        .any(|c| c.is_ascii_alphabetic() || c.is_ascii_digit() || is_korean_char(*c))
}
/// Scans `left_chars` from right to left and reports whether the first script
/// character encountered is an ASCII letter.
///
/// Returns `false` when a Korean character is met first or when the slice
/// holds neither kind of character.
fn same_token_left_is_english(left_chars: &[char]) -> bool {
    left_chars
        .iter()
        .rev()
        .copied()
        .find(|ch| ch.is_ascii_alphabetic() || is_korean_char(*ch))
        .is_some_and(|ch| ch.is_ascii_alphabetic())
}
/// Scans `right_chars` from left to right and reports whether the first script
/// character encountered is an ASCII letter.
///
/// Returns `false` when a Korean character is met first or when the slice
/// holds neither kind of character.
fn same_token_right_is_english(right_chars: &[char]) -> bool {
    right_chars
        .iter()
        .find_map(|ch| {
            if ch.is_ascii_alphabetic() {
                Some(true)
            } else if is_korean_char(*ch) {
                Some(false)
            } else {
                None
            }
        })
        .unwrap_or(false)
}
/// Half-open character range `[char_start, char_end)` covering one run of
/// consecutive Korean characters inside a word's `chars`.
#[derive(Debug, Clone, Copy)]
struct KoreanSegment {
// Index of the first Korean character of the run.
char_start: usize,
// Index one past the last Korean character of the run (exclusive bound).
char_end: usize, }
/// Result of scanning the Korean runs of a single word token
/// (see `scan_korean_contexts`). Both flags start as `false`.
#[derive(Default)]
struct KoreanContextScan {
// Some Korean run has non-punctuation text on both sides within the token, and the
// nearest script character on each side is an ASCII letter.
has_same_token_context: bool,
// Some Korean run has only punctuation-only text (per `is_punct_only`) on both sides
// within the token.
has_boundary_segment: bool,
}
/// Document-level flags collected by `scan_english_context_candidates`.
/// Both flags start as `false`.
#[derive(Default)]
struct EnglishContextCandidates {
// At least one word token contains a Korean run flanked by ASCII letters inside the
// same token.
has_same_token_context: bool,
// At least one word with a boundary Korean segment sits directly between two
// English-only words (only spaces / pre-encoded tokens in between).
has_boundary_candidate: bool,
}
/// Collects every maximal run of consecutive Korean characters in `chars`.
///
/// Each run is returned as a half-open index range (`char_start` inclusive,
/// `char_end` exclusive), in left-to-right order.
fn find_korean_segments(chars: &[char]) -> Vec<KoreanSegment> {
    let mut runs = Vec::new();
    let mut cursor = chars.iter().enumerate().peekable();
    while let Some((run_start, ch)) = cursor.next() {
        if !is_korean_char(*ch) {
            continue;
        }
        // Consume the remainder of the run; the first non-Korean character stays
        // in the iterator for the outer loop.
        let mut run_end = run_start + 1;
        while cursor.next_if(|(_, next)| is_korean_char(**next)).is_some() {
            run_end += 1;
        }
        runs.push(KoreanSegment {
            char_start: run_start,
            char_end: run_end,
        });
    }
    runs
}
/// Picks the character used to classify a word's script.
///
/// When the metadata says the word starts with ASCII, or its first character
/// is Korean, that first character is returned. Otherwise the first ASCII
/// letter or Korean character found anywhere in the word is returned, or
/// `None` if there is none.
// NOTE(review): `starts_with_ascii` is taken from `WordMeta` as-is; whether it
// covers non-letter ASCII is not visible in this file.
fn first_script_char(word: &WordToken<'_>) -> Option<char> {
    let head = word.chars.first().copied();
    if word.meta.starts_with_ascii || head.is_some_and(|ch| is_korean_char(ch)) {
        return head;
    }
    word.chars
        .iter()
        .copied()
        .find(|&ch| ch.is_ascii_alphabetic() || is_korean_char(ch))
}
/// Folds one Korean run (`char_start..char_end` within `chars`) into `scan`.
///
/// * Both sides punctuation-only: the run is a boundary segment.
/// * Neither side punctuation-only: the run has same-token context when the
///   nearest script character on each side is an ASCII letter.
/// * Exactly one side punctuation-only: `scan` is left unchanged.
fn update_korean_context_scan(
    chars: &[char],
    char_start: usize,
    char_end: usize,
    scan: &mut KoreanContextScan,
) {
    let before = &chars[..char_start];
    let after = &chars[char_end..];
    let before_is_edge = is_punct_only(before);
    let after_is_edge = is_punct_only(after);

    if before_is_edge && after_is_edge {
        scan.has_boundary_segment = true;
    } else if !before_is_edge
        && !after_is_edge
        && same_token_left_is_english(before)
        && same_token_right_is_english(after)
    {
        scan.has_same_token_context = true;
    }
}
/// Walks every maximal run of Korean characters in `chars` and records, via
/// `update_korean_context_scan`, which kinds of context the runs appear in.
fn scan_korean_contexts(chars: &[char]) -> KoreanContextScan {
    let mut scan = KoreanContextScan::default();
    let mut cursor = 0usize;
    // Jump from one Korean run to the next without materialising the runs.
    while let Some(offset) = chars[cursor..].iter().position(|ch| is_korean_char(*ch)) {
        let run_start = cursor + offset;
        let run_len = chars[run_start..]
            .iter()
            .take_while(|ch| is_korean_char(**ch))
            .count();
        let run_end = run_start + run_len;
        update_korean_context_scan(chars, run_start, run_end, &mut scan);
        cursor = run_end;
    }
    scan
}
/// Scans the token stream for evidence that Korean text appears in English
/// context.
///
/// Two kinds of evidence are tracked:
/// * same-token context: a Korean run flanked by ASCII letters inside one word;
/// * boundary candidate: a word holding a boundary Korean segment that sits
///   directly between two English-only words.
///
/// Spaces and pre-encoded tokens are transparent; any other non-word token
/// breaks the adjacency between words. Scanning stops as soon as both flags
/// are set.
fn scan_english_context_candidates(tokens: &[Token<'_>]) -> EnglishContextCandidates {
    let mut found = EnglishContextCandidates::default();
    // True while the previous word was a boundary Korean word preceded by English.
    let mut awaiting_english_follower = false;
    // Script state of the most recent word; `None` after an adjacency break.
    let mut last_word_english_only: Option<bool> = None;

    for token in tokens {
        match token {
            Token::Word(word) => {
                let english_only = word_is_english_only(word);
                if awaiting_english_follower && english_only {
                    found.has_boundary_candidate = true;
                }
                awaiting_english_follower = false;
                // Korean words are only inspected until same-token context is found.
                if word.meta.has_korean && !found.has_same_token_context {
                    let scan = scan_korean_contexts(&word.chars);
                    if scan.has_same_token_context {
                        found.has_same_token_context = true;
                    }
                    awaiting_english_follower =
                        scan.has_boundary_segment && last_word_english_only == Some(true);
                }
                last_word_english_only = Some(english_only);
            }
            Token::Space(_) | Token::PreEncoded(_) => {}
            _ => {
                awaiting_english_follower = false;
                last_word_english_only = None;
            }
        }
        if found.has_same_token_context && found.has_boundary_candidate {
            break;
        }
    }
    found
}
/// Counts word tokens by script, returning `(english_words, korean_words)`.
///
/// Each word is classified by the character `first_script_char` selects: an
/// ASCII letter counts as English, a Korean character as Korean, and anything
/// else (or no character at all) is not counted.
fn count_script_words(tokens: &[Token<'_>]) -> (usize, usize) {
    tokens
        .iter()
        .filter_map(|token| match token {
            Token::Word(word) => first_script_char(word),
            _ => None,
        })
        .fold((0usize, 0usize), |(english, korean), lead| {
            if lead.is_ascii_alphabetic() {
                (english + 1, korean)
            } else if is_korean_char(lead) {
                (english, korean + 1)
            } else {
                (english, korean)
            }
        })
}
pub fn compute_document_summary(tokens: &[Token<'_>]) -> DocumentSummary {
let candidates = scan_english_context_candidates(tokens);
if !candidates.has_same_token_context && !candidates.has_boundary_candidate {
return DocumentSummary::default();
}
let (english_words, korean_words) = count_script_words(tokens);
let is_english_majority = english_words >= korean_words.max(1);
let is_english_dominant =
english_words >= 10 && english_words >= korean_words.saturating_mul(5);
let has_english_context_for_korean = candidates.has_same_token_context
|| (candidates.has_boundary_candidate && is_english_majority);
DocumentSummary {
has_english_context_for_korean,
is_english_majority,
is_english_dominant,
}
}
/// Decides whether the Korean run `seg` of the word `chars` should be wrapped.
///
/// * Punctuation-only text on both sides: defer to `boundary_segment_wrap`,
///   which looks at the neighbouring word tokens.
/// * Non-punctuation text on both sides: wrap when the nearest script
///   character on each side is an ASCII letter.
/// * Otherwise: do not wrap.
fn segment_in_english_context_with_majority<'a>(
    chars: &[char],
    seg: KoreanSegment,
    tokens: &[Token<'a>],
    token_index: usize,
    is_english_majority: bool,
) -> bool {
    let before = &chars[..seg.char_start];
    let after = &chars[seg.char_end..];
    match (is_punct_only(before), is_punct_only(after)) {
        (true, true) => boundary_segment_wrap(tokens, token_index, is_english_majority),
        (false, false) => same_token_left_is_english(before) && same_token_right_is_english(after),
        _ => false,
    }
}
fn boundary_segment_wrap<'a>(
tokens: &[Token<'a>],
token_index: usize,
is_english_majority: bool,
) -> bool {
let prev_eng = prev_word_token(tokens, token_index).is_some_and(word_is_english_only);
let next_eng = next_word_token(tokens, token_index).is_some_and(word_is_english_only);
prev_eng && next_eng && is_english_majority
}
fn build_wrapped_replacement<'a>(
word: &WordToken<'a>,
tokens: &[Token<'a>],
token_index: usize,
is_english_majority: bool,
) -> Option<Vec<Token<'a>>> {
let segments = find_korean_segments(&word.chars);
if segments.is_empty() {
return None;
}
let chars = &word.chars;
let mut wrap_segments = Vec::new();
for seg in segments {
if segment_in_english_context_with_majority(
chars,
seg,
tokens,
token_index,
is_english_majority,
) {
wrap_segments.push(seg);
}
}
if wrap_segments.is_empty() {
return None;
}
let mut result: Vec<Token<'a>> = Vec::new();
let mut cursor = 0usize;
for seg in wrap_segments {
if seg.char_start > cursor {
let prefix: String = chars[cursor..seg.char_start].iter().collect();
if !prefix.is_empty() {
result.push(build_word_token(&prefix));
}
}
let korean: String = chars[seg.char_start..seg.char_end].iter().collect();
result.push(Token::PreEncoded(HANGUL_WRAP_START.to_vec()));
result.push(build_word_token(&korean));
result.push(Token::PreEncoded(HANGUL_WRAP_END.to_vec()));
cursor = seg.char_end;
}
if cursor < chars.len() {
let suffix: String = chars[cursor..].iter().collect();
if !suffix.is_empty() {
result.push(build_word_token(&suffix));
}
}
Some(result)
}
impl TokenRule for EnglishDominantKoreanWrapRule {
    /// The rule runs in the `PostWord` phase.
    fn phase(&self) -> TokenPhase {
        TokenPhase::PostWord
    }

    /// Priority of this rule within its phase.
    fn priority(&self) -> u16 {
        50
    }

    /// Wraps the Korean runs of the word at `index` when the document summary
    /// reports English context for Korean.
    ///
    /// Non-word tokens, words without Korean, and documents without English
    /// context yield `TokenAction::Noop` and leave `state` untouched. Otherwise
    /// `english_dominant_wrap_active` is set (and `english_dominant_no_indicator`
    /// too when the document is English-dominant) before the replacement is
    /// built, so the flags are set even if no run ends up wrapped.
    fn apply<'a>(
        &self,
        tokens: &[Token<'a>],
        index: usize,
        state: &mut crate::rules::context::EncoderState,
    ) -> Result<TokenAction<'a>, String> {
        let word = match tokens.get(index) {
            Some(Token::Word(word)) => word,
            _ => return Ok(TokenAction::Noop),
        };
        if !(word.meta.has_korean && state.doc_summary.has_english_context_for_korean) {
            return Ok(TokenAction::Noop);
        }

        state.english_dominant_wrap_active = true;
        let is_dominant = state.doc_summary.is_english_dominant;
        state.english_dominant_no_indicator |= is_dominant;

        let is_majority = state.doc_summary.is_english_majority;
        let action = build_wrapped_replacement(word, tokens, index, is_majority)
            .map_or(TokenAction::Noop, TokenAction::ReplaceMany);
        Ok(action)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::rules::token::SpaceKind;

    /// Builds an owned word token for test input, using the same constructor
    /// as the rule itself.
    fn word(text: &str) -> Token<'static> {
        build_word_token(text)
    }

    /// Extracts the inner word of a `Token::Word`, panicking on any other kind.
    fn unwrap_word<'a, 'b>(tok: &'b Token<'a>) -> &'b WordToken<'a> {
        match tok {
            Token::Word(w) => w,
            _ => panic!("expected Word"),
        }
    }

    #[test]
    fn same_token_right_is_english_no_alpha_no_korean() {
        assert!(!same_token_right_is_english(&['1', '2', '3']));
        assert!(!same_token_right_is_english(&['.', ',', '!']));
        assert!(!same_token_right_is_english(&[]));
    }

    #[test]
    fn count_script_words_non_alpha_non_korean_first_char() {
        let tokens = vec![word("english"), word("한국"), word("123"), word("more")];
        let (eng, kor) = count_script_words(&tokens);
        assert_eq!(eng, 2);
        assert_eq!(kor, 1);
    }

    #[test]
    fn segment_both_boundaries_prev_next_english_with_majority() {
        let tokens = vec![
            word("hello"),
            Token::Space(SpaceKind::Regular),
            word("한국"),
            Token::Space(SpaceKind::Regular),
            word("world"),
        ];
        let kor_word = unwrap_word(&tokens[2]);
        let result = build_wrapped_replacement(kor_word, &tokens, 2, true);
        assert!(
            result.is_some(),
            "Korean word between two English words with majority should be wrapped"
        );
    }

    #[test]
    fn segment_within_same_token_english_letters() {
        let tokens = vec![word("www.대통령.kr")];
        let kor_word = unwrap_word(&tokens[0]);
        let replacement = build_wrapped_replacement(kor_word, &tokens, 0, false)
            .expect("Korean segment within same-token English letters should wrap");
        // Expected layout: prefix, start marker, Korean run, end marker, suffix.
        assert_eq!(replacement.len(), 5);
        assert_eq!(unwrap_word(&replacement[0]).text, "www.");
        assert!(matches!(
            &replacement[1],
            Token::PreEncoded(bytes) if bytes[..] == HANGUL_WRAP_START[..]
        ));
        assert_eq!(unwrap_word(&replacement[2]).text, "대통령");
        assert!(matches!(
            &replacement[3],
            Token::PreEncoded(bytes) if bytes[..] == HANGUL_WRAP_END[..]
        ));
        assert_eq!(unwrap_word(&replacement[4]).text, ".kr");
    }

    #[test]
    fn build_wrapped_replacement_no_korean_returns_none() {
        let tokens = vec![word("hello")];
        let eng_word = unwrap_word(&tokens[0]);
        assert!(build_wrapped_replacement(eng_word, &tokens, 0, true).is_none());
    }

    #[test]
    fn segment_both_boundaries_prev_not_english_no_wrap() {
        let tokens = vec![
            word("안녕하세요"),
            Token::Space(SpaceKind::Regular),
            word("한국"),
            Token::Space(SpaceKind::Regular),
            word("world"),
        ];
        let kor_word = unwrap_word(&tokens[2]);
        let result = build_wrapped_replacement(kor_word, &tokens, 2, true);
        assert!(result.is_none());
    }

    #[test]
    fn scan_english_context_candidates_resets_on_non_word_non_space() {
        use crate::rules::token::FractionToken;
        // The fraction between the Korean word and the following English word
        // must break adjacency, so no boundary candidate may be reported.
        let tokens = vec![
            word("english"),
            Token::Space(SpaceKind::Regular),
            word("한국"),
            Token::Fraction(FractionToken {
                whole: None,
                numerator: "1".into(),
                denominator: "2".into(),
            }),
            word("more"),
        ];
        let candidates = scan_english_context_candidates(&tokens);
        assert!(!candidates.has_same_token_context);
        assert!(!candidates.has_boundary_candidate);
    }

    #[test]
    fn scan_english_context_candidates_breaks_when_both_flags_true() {
        // First a boundary candidate (English, Korean, English), then a word with
        // same-token context, so both flags end up set.
        let tokens = vec![
            word("english"),
            Token::Space(SpaceKind::Regular),
            word("한국"),
            Token::Space(SpaceKind::Regular),
            word("english"),
            Token::Space(SpaceKind::Regular),
            word("hello한국english"),
        ];
        let candidates = scan_english_context_candidates(&tokens);
        assert!(candidates.has_boundary_candidate);
        assert!(candidates.has_same_token_context);
    }
}