use crate::text::CharacterInfo;
/// Script category assigned to a code point by `detect_cjk_script`.
///
/// The ranges quoted on each variant are the ones matched by
/// `detect_cjk_script` in this file.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum CJKScript {
    /// Code points `0x4E00..=0x9FFF`.
    Han,
    /// Code points `0x3400..=0x4DBF`.
    HanExtensionA,
    /// Code points `0x20000..=0x2EBEF`.
    HanExtensionBF,
    /// Code points `0x3040..=0x309F`.
    Hiragana,
    /// Code points `0x30A0..=0x30FF`.
    Katakana,
    /// Code points `0xFF61..=0xFF9F`.
    HalfwidthKatakana,
    /// Code points `0xAC00..=0xD7AF`.
    Hangul,
    /// Code points `0x3190..=0x319F`.
    CJKSymbol,
}
/// Document language used to select which script-transition rules apply.
///
/// Produced by `infer_document_language` and consumed by
/// `should_split_on_script_transition`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum DocumentLanguage {
    /// Selects the Japanese transition rules.
    Japanese,
    /// Selects the Korean transition rules.
    Korean,
    /// Selects the Chinese transition rules (also used when no language is given).
    Chinese,
}
/// Classifies a Unicode code point into a [`CJKScript`] category.
///
/// Returns `None` for every code point outside the ranges listed in the body.
pub fn detect_cjk_script(code: u32) -> Option<CJKScript> {
    // The ranges are pairwise disjoint, so arm order does not affect the
    // result; they are listed in ascending code-point order.
    let script = match code {
        0x3040..=0x309F => CJKScript::Hiragana,
        0x30A0..=0x30FF => CJKScript::Katakana,
        // NOTE(review): 0x3190..=0x319F is the Unicode "Kanbun" block; the
        // "CJK Symbols and Punctuation" block (0x3000..=0x303F) is not
        // matched anywhere here — confirm this is intended.
        0x3190..=0x319F => CJKScript::CJKSymbol,
        0x3400..=0x4DBF => CJKScript::HanExtensionA,
        0x4E00..=0x9FFF => CJKScript::Han,
        0xAC00..=0xD7AF => CJKScript::Hangul,
        0xFF61..=0xFF9F => CJKScript::HalfwidthKatakana,
        0x20000..=0x2EBEF => CJKScript::HanExtensionBF,
        _ => return None,
    };
    Some(script)
}
/// Decides whether text should be split between two adjacent characters based
/// on their detected scripts.
///
/// * Both sides CJK: the decision is delegated to the language-specific rules.
/// * Exactly one side CJK: `Some(true)` (split).
/// * Neither side CJK: `None`, i.e. this function expresses no decision.
///
/// `Some(false)` means "do not split".
pub fn should_split_on_script_transition(
    prev_script: Option<CJKScript>,
    curr_script: Option<CJKScript>,
    language: Option<DocumentLanguage>,
) -> Option<bool> {
    if let (Some(prev), Some(curr)) = (prev_script, curr_script) {
        return should_split_cjk_transition(prev, curr, language);
    }

    // At most one side is CJK at this point.
    if prev_script.is_some() || curr_script.is_some() {
        Some(true)
    } else {
        None
    }
}
fn should_split_cjk_transition(
prev: CJKScript,
curr: CJKScript,
language: Option<DocumentLanguage>,
) -> Option<bool> {
if prev == curr {
return None;
}
match language {
Some(DocumentLanguage::Japanese) => handle_japanese_transition(prev, curr),
Some(DocumentLanguage::Korean) => handle_korean_transition(prev, curr),
Some(DocumentLanguage::Chinese) | None => {
handle_chinese_transition(prev, curr)
},
}
}
/// Japanese rules: transitions between Han, hiragana and (full- or halfwidth)
/// katakana never split.
///
/// Returns `Some(false)` when both scripts belong to the Japanese writing
/// system, differ from each other, and are not both Han variants. Every other
/// pair (including Han ↔ Han-extension, Hangul and `CJKSymbol`) yields `None`.
fn handle_japanese_transition(prev: CJKScript, curr: CJKScript) -> Option<bool> {
    use CJKScript::*;

    let is_kanji = |s: CJKScript| matches!(s, Han | HanExtensionA | HanExtensionBF);
    let is_kana = |s: CJKScript| matches!(s, Hiragana | Katakana | HalfwidthKatakana);

    let both_japanese = (is_kanji(prev) || is_kana(prev)) && (is_kanji(curr) || is_kana(curr));
    // Han ↔ Han-extension transitions are deliberately left undecided.
    let both_kanji = is_kanji(prev) && is_kanji(curr);

    if both_japanese && !both_kanji && prev != curr {
        Some(false)
    } else {
        None
    }
}
fn handle_korean_transition(prev: CJKScript, curr: CJKScript) -> Option<bool> {
use CJKScript::*;
match (prev, curr) {
(Hangul, Han | HanExtensionA | HanExtensionBF) => Some(false),
(Han | HanExtensionA | HanExtensionBF, Hangul) => Some(false),
_ => None,
}
}
/// Chinese rules: there are none, so every transition is left undecided.
///
/// Both arguments are ignored; the result is always `None`.
fn handle_chinese_transition(_prev: CJKScript, _curr: CJKScript) -> Option<bool> {
    Option::None
}
/// Infers the document language from the set of scripts that occur in it.
///
/// Only the *presence* of a script in `scripts` matters; the `usize` paired
/// with each script is not consulted. Precedence:
///
/// 1. any hiragana or (full-/halfwidth) katakana → `Japanese`
/// 2. otherwise any Hangul → `Korean`
/// 3. otherwise any Han variant → `Chinese`
/// 4. otherwise (including an empty slice) → `None`
pub fn infer_document_language(scripts: &[(CJKScript, usize)]) -> Option<DocumentLanguage> {
    // Accumulate (kana seen, hangul seen, han seen) in a single pass.
    let (kana, hangul, han) = scripts.iter().fold(
        (false, false, false),
        |(kana, hangul, han), &(script, _count)| match script {
            CJKScript::Hiragana | CJKScript::Katakana | CJKScript::HalfwidthKatakana => {
                (true, hangul, han)
            }
            CJKScript::Hangul => (kana, true, han),
            CJKScript::Han | CJKScript::HanExtensionA | CJKScript::HanExtensionBF => {
                (kana, hangul, true)
            }
            CJKScript::CJKSymbol => (kana, hangul, han),
        },
    );

    if kana {
        Some(DocumentLanguage::Japanese)
    } else if hangul {
        Some(DocumentLanguage::Korean)
    } else if han {
        Some(DocumentLanguage::Chinese)
    } else {
        None
    }
}
/// Returns `true` if `code` is one of the small hiragana letters in the
/// Hiragana block.
///
/// Covers small a/i/u/e/o, small tsu, small ya/yu/yo, small wa, and small
/// ka/ke (U+3095, U+3096). The last two mirror the katakana small ka/ke
/// (U+30F5, U+30F6) that `is_small_katakana` accepts, so both kana sets are
/// treated consistently.
pub fn is_small_hiragana(code: u32) -> bool {
    matches!(
        code,
        0x3041 // ぁ small a
            | 0x3043 // ぃ small i
            | 0x3045 // ぅ small u
            | 0x3047 // ぇ small e
            | 0x3049 // ぉ small o
            | 0x3063 // っ small tsu
            | 0x3083 // ゃ small ya
            | 0x3085 // ゅ small yu
            | 0x3087 // ょ small yo
            | 0x308E // ゎ small wa
            | 0x3095 // ゕ small ka
            | 0x3096 // ゖ small ke
    )
}
/// Returns `true` if `code` is one of the small katakana letters listed below
/// (all within the fullwidth Katakana block).
pub fn is_small_katakana(code: u32) -> bool {
    const SMALL_KATAKANA: [u32; 12] = [
        0x30A1, // ァ small a
        0x30A3, // ィ small i
        0x30A5, // ゥ small u
        0x30A7, // ェ small e
        0x30A9, // ォ small o
        0x30C3, // ッ small tsu
        0x30E3, // ャ small ya
        0x30E5, // ュ small yu
        0x30E7, // ョ small yo
        0x30EE, // ヮ small wa
        0x30F5, // ヵ small ka
        0x30F6, // ヶ small ke
    ];
    SMALL_KATAKANA.contains(&code)
}
/// Returns `true` for the kana voiced / semi-voiced sound marks.
///
/// Accepted code points: U+3099 and U+309A (combining marks) and U+FF9E and
/// U+FF9F (their halfwidth katakana forms).
pub fn is_combining_mark(code: u32) -> bool {
    // Each pair is contiguous, so two inclusive ranges cover all four values.
    matches!(code, 0x3099..=0x309A | 0xFF9E..=0xFF9F)
}
pub fn is_japanese_modifier(code: u32) -> bool {
is_small_hiragana(code) || is_small_katakana(code) || is_combining_mark(code)
}
/// Split decision for Japanese text.
///
/// If the current character is a Japanese modifier (see
/// `is_japanese_modifier`), the result is `Some(false)`: it is never split
/// from the preceding character. Otherwise the decision comes from
/// `should_split_on_script_transition` using the Japanese rules.
///
/// `_prev_char` is accepted but not inspected.
pub fn handle_japanese_text(
    _prev_char: &CharacterInfo,
    curr_char: &CharacterInfo,
    prev_script: Option<CJKScript>,
    curr_script: Option<CJKScript>,
) -> Option<bool> {
    if is_japanese_modifier(curr_char.code) {
        Some(false)
    } else {
        let language = Some(DocumentLanguage::Japanese);
        should_split_on_script_transition(prev_script, curr_script, language)
    }
}
/// Split decision for Korean text.
///
/// Delegates to `should_split_on_script_transition` using the Korean rules.
/// Neither character argument is inspected; only the scripts are used.
pub fn handle_korean_text(
    _prev_char: &CharacterInfo,
    _curr_char: &CharacterInfo,
    prev_script: Option<CJKScript>,
    curr_script: Option<CJKScript>,
) -> Option<bool> {
    let language = Some(DocumentLanguage::Korean);
    should_split_on_script_transition(prev_script, curr_script, language)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every code point in `codes` is classified as `expected`.
    fn assert_detected(codes: &[u32], expected: Option<CJKScript>) {
        for &code in codes {
            assert_eq!(detect_cjk_script(code), expected);
        }
    }

    // --- detect_cjk_script -------------------------------------------------

    #[test]
    fn test_detect_han_main_range() {
        assert_detected(&[0x4E00, 0x6587, 0x9FFF], Some(CJKScript::Han));
    }

    #[test]
    fn test_detect_han_extension_a() {
        assert_detected(&[0x3400, 0x4DBF], Some(CJKScript::HanExtensionA));
    }

    #[test]
    fn test_detect_hiragana() {
        assert_detected(&[0x3042, 0x3093], Some(CJKScript::Hiragana));
    }

    #[test]
    fn test_detect_katakana() {
        assert_detected(&[0x30A2, 0x30F3], Some(CJKScript::Katakana));
    }

    #[test]
    fn test_detect_hangul() {
        assert_detected(&[0xAC00, 0xD7AF], Some(CJKScript::Hangul));
    }

    #[test]
    fn test_detect_non_cjk() {
        assert_detected(&[0x0041, 0x0020], None);
    }

    // --- infer_document_language -------------------------------------------

    #[test]
    fn test_infer_japanese_with_hiragana() {
        let observed = [(CJKScript::Han, 100), (CJKScript::Hiragana, 50)];
        assert_eq!(
            infer_document_language(&observed),
            Some(DocumentLanguage::Japanese)
        );
    }

    #[test]
    fn test_infer_japanese_with_katakana() {
        let observed = [(CJKScript::Han, 100), (CJKScript::Katakana, 30)];
        assert_eq!(
            infer_document_language(&observed),
            Some(DocumentLanguage::Japanese)
        );
    }

    #[test]
    fn test_infer_korean() {
        let observed = [(CJKScript::Hangul, 100), (CJKScript::Han, 20)];
        assert_eq!(
            infer_document_language(&observed),
            Some(DocumentLanguage::Korean)
        );
    }

    #[test]
    fn test_infer_chinese() {
        let observed = [(CJKScript::Han, 100)];
        assert_eq!(
            infer_document_language(&observed),
            Some(DocumentLanguage::Chinese)
        );
    }

    // --- should_split_on_script_transition ---------------------------------

    #[test]
    fn test_japanese_han_hiragana_no_split() {
        let decision = should_split_on_script_transition(
            Some(CJKScript::Han),
            Some(CJKScript::Hiragana),
            Some(DocumentLanguage::Japanese),
        );
        assert_eq!(decision, Some(false));
    }

    #[test]
    fn test_japanese_hiragana_katakana_no_split() {
        let decision = should_split_on_script_transition(
            Some(CJKScript::Hiragana),
            Some(CJKScript::Katakana),
            Some(DocumentLanguage::Japanese),
        );
        assert_eq!(decision, Some(false));
    }

    #[test]
    fn test_korean_hangul_han_no_split() {
        let decision = should_split_on_script_transition(
            Some(CJKScript::Hangul),
            Some(CJKScript::Han),
            Some(DocumentLanguage::Korean),
        );
        assert_eq!(decision, Some(false));
    }

    #[test]
    fn test_cjk_to_latin_split() {
        let decision = should_split_on_script_transition(
            Some(CJKScript::Han),
            None,
            Some(DocumentLanguage::Chinese),
        );
        assert_eq!(decision, Some(true));
    }

    // --- modifier predicates -----------------------------------------------

    #[test]
    fn test_small_hiragana_detection() {
        for &small in &[0x3041u32, 0x3063, 0x3083] {
            assert!(is_small_hiragana(small));
        }
        assert!(!is_small_hiragana(0x3042));
    }

    #[test]
    fn test_small_katakana_detection() {
        for &small in &[0x30A1u32, 0x30C3, 0x30E3] {
            assert!(is_small_katakana(small));
        }
        assert!(!is_small_katakana(0x30A2));
    }

    #[test]
    fn test_combining_marks() {
        for &mark in &[0x3099u32, 0x309A, 0xFF9E] {
            assert!(is_combining_mark(mark));
        }
    }

    #[test]
    fn test_japanese_modifier() {
        for &modifier in &[0x3063u32, 0x30C3, 0x3099] {
            assert!(is_japanese_modifier(modifier));
        }
    }
}