use icu_properties::props::{Alphabetic, GeneralCategory, Script, WhiteSpace};
use icu_properties::{
CodePointMapData, CodePointMapDataBorrowed, CodePointSetData, CodePointSetDataBorrowed,
};
/// Coarse script classification used by this module.
///
/// Each named script variant corresponds to the ICU `Script` value of the same
/// name; every ICU script without a dedicated variant collapses into
/// [`TextScript::Other`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TextScript {
Latin,
Greek,
Cyrillic,
/// Reported as right-to-left by `TextScript::is_rtl`.
Arabic,
/// Reported as right-to-left by `TextScript::is_rtl`.
Hebrew,
Han,
Hiragana,
Katakana,
Hangul,
Thai,
Devanagari,
/// ICU `Script::Common`: characters not tied to a single script
/// (for example ASCII digits, spaces and punctuation).
Common,
/// ICU `Script::Inherited`: characters that take the script of the
/// surrounding text.
Inherited,
/// Any ICU script that has no dedicated variant above.
Other,
}
impl TextScript {
    /// Returns `true` for the right-to-left scripts this enum models
    /// explicitly, i.e. Arabic and Hebrew.
    ///
    /// Every other variant, including [`TextScript::Other`], yields `false`.
    pub fn is_rtl(self) -> bool {
        self == Self::Arabic || self == Self::Hebrew
    }

    /// Converts an ICU script value into the coarse [`TextScript`]
    /// classification; scripts without a dedicated variant become
    /// [`TextScript::Other`].
    fn from_icu(script: Script) -> Self {
        match script {
            Script::Latin => Self::Latin,
            Script::Greek => Self::Greek,
            Script::Cyrillic => Self::Cyrillic,
            Script::Arabic => Self::Arabic,
            Script::Hebrew => Self::Hebrew,
            Script::Han => Self::Han,
            Script::Hiragana => Self::Hiragana,
            Script::Katakana => Self::Katakana,
            Script::Hangul => Self::Hangul,
            Script::Thai => Self::Thai,
            Script::Devanagari => Self::Devanagari,
            Script::Common => Self::Common,
            Script::Inherited => Self::Inherited,
            // Everything ICU knows about that we do not model individually.
            _ => Self::Other,
        }
    }
}
/// A contiguous stretch of text that was assigned a single script by
/// `CharProperties::itemize`.
///
/// `start` and `end` are byte offsets into the itemized string and describe
/// the half-open range `start..end`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ScriptRun {
/// Byte offset of the first character in the run.
pub start: usize,
/// Byte offset one past the last character in the run.
pub end: usize,
/// Script assigned to the whole run.
pub script: TextScript,
}
/// Bundle of Unicode property lookups backed by `'static` ICU data.
///
/// Every field is a borrowed ICU handle, so the type derives `Copy` and can be
/// passed around by value. `Debug` is derived so the type can be embedded in
/// other `Debug` types.
#[derive(Debug, Clone, Copy)]
pub struct CharProperties {
    /// Code point → ICU `Script` map.
    script: CodePointMapDataBorrowed<'static, Script>,
    /// Code point → ICU `GeneralCategory` map.
    general_category: CodePointMapDataBorrowed<'static, GeneralCategory>,
    /// Set of code points with the `Alphabetic` property.
    alphabetic: CodePointSetDataBorrowed<'static>,
    /// Set of code points with the `WhiteSpace` property.
    whitespace: CodePointSetDataBorrowed<'static>,
}
impl CharProperties {
    /// Creates a lookup bundle from the `'static` data handles returned by
    /// the `icu_properties` constructors.
    pub fn new() -> Self {
        Self {
            script: CodePointMapData::<Script>::new(),
            general_category: CodePointMapData::<GeneralCategory>::new(),
            alphabetic: CodePointSetData::new::<Alphabetic>(),
            whitespace: CodePointSetData::new::<WhiteSpace>(),
        }
    }

    /// Returns the coarse script classification of `c`.
    pub fn script(&self, c: char) -> TextScript {
        TextScript::from_icu(self.script.get(c))
    }

    /// Returns `true` if `c` has the `Alphabetic` property.
    pub fn is_alphabetic(&self, c: char) -> bool {
        self.alphabetic.contains(c)
    }

    /// Returns `true` if `c` has the `WhiteSpace` property.
    pub fn is_whitespace(&self, c: char) -> bool {
        self.whitespace.contains(c)
    }

    /// Returns `true` if the general category of `c` is one of the number
    /// categories: `DecimalNumber`, `LetterNumber` or `OtherNumber`.
    pub fn is_numeric(&self, c: char) -> bool {
        matches!(
            self.general_category.get(c),
            GeneralCategory::DecimalNumber
                | GeneralCategory::LetterNumber
                | GeneralCategory::OtherNumber
        )
    }

    /// Returns the ICU general category of `c`.
    pub fn general_category(&self, c: char) -> GeneralCategory {
        self.general_category.get(c)
    }

    /// Splits `text` into maximal runs of a single script.
    ///
    /// `Common` and `Inherited` characters are neutral: they never start or
    /// end a run. Leading neutrals are absorbed into the first run, and
    /// neutrals after a strong character stay with the run that precedes
    /// them. Run boundaries are byte offsets into `text`.
    ///
    /// Returns an empty vector for empty input. Text that contains only
    /// neutral characters produces a single run tagged
    /// [`TextScript::Common`].
    pub fn itemize(&self, text: &str) -> Vec<ScriptRun> {
        let mut runs = Vec::new();
        if text.is_empty() {
            return runs;
        }

        let mut current: Option<TextScript> = None;
        let mut run_start = 0usize;
        for (idx, c) in text.char_indices() {
            let script = self.script(c);
            if matches!(script, TextScript::Common | TextScript::Inherited) {
                // Neutral characters extend whatever run is in progress.
                continue;
            }
            if let Some(prev) = current {
                if prev == script {
                    continue;
                }
                // A different strong script closes the run in progress.
                runs.push(ScriptRun {
                    start: run_start,
                    end: idx,
                    script: prev,
                });
                run_start = idx;
            }
            current = Some(script);
        }

        // The last run always extends to the end of the text.
        runs.push(ScriptRun {
            start: run_start,
            end: text.len(),
            script: current.unwrap_or(TextScript::Common),
        });
        runs
    }

    /// Returns the script that covers the most characters of `text`.
    ///
    /// `Common` and `Inherited` characters are ignored. All scripts without a
    /// dedicated [`TextScript`] variant are counted together as
    /// [`TextScript::Other`]. When two scripts are tied, the one that appears
    /// first in `text` wins, so the result is deterministic. Returns
    /// [`TextScript::Common`] when `text` has no strong characters.
    pub fn dominant_script(&self, text: &str) -> TextScript {
        // Tallies are kept in first-seen order. The number of distinct
        // entries is bounded by the number of `TextScript` variants, so a
        // linear scan is cheap and, unlike hash-map iteration, ordered.
        let mut tallies: Vec<(TextScript, usize)> = Vec::new();
        for c in text.chars() {
            let script = self.script(c);
            if matches!(script, TextScript::Common | TextScript::Inherited) {
                continue;
            }
            match tallies.iter().position(|&(seen, _)| seen == script) {
                Some(i) => tallies[i].1 += 1,
                None => tallies.push((script, 1)),
            }
        }

        // `max_by_key` returns the last maximum, so walk the tallies in
        // reverse to make the earliest-seen script win a tie.
        tallies
            .into_iter()
            .rev()
            .max_by_key(|&(_, count)| count)
            .map(|(script, _)| script)
            .unwrap_or(TextScript::Common)
    }

    /// Returns `true` if any character of `text` belongs to a script that
    /// [`TextScript::is_rtl`] reports as right-to-left.
    ///
    /// Only Arabic and Hebrew are recognised; characters whose script
    /// collapses into [`TextScript::Other`] are never reported as RTL.
    pub fn has_rtl(&self, text: &str) -> bool {
        text.chars().any(|c| self.script(c).is_rtl())
    }
}
impl Default for CharProperties {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn detects_common_scripts() {
        let p = CharProperties::new();
        assert_eq!(p.script('A'), TextScript::Latin);
        assert_eq!(p.script('Ω'), TextScript::Greek);
        assert_eq!(p.script('Я'), TextScript::Cyrillic);
        assert_eq!(p.script('木'), TextScript::Han);
        assert_eq!(p.script('あ'), TextScript::Hiragana);
        assert_eq!(p.script('ア'), TextScript::Katakana);
        assert_eq!(p.script('한'), TextScript::Hangul);
        assert_eq!(p.script('ก'), TextScript::Thai);
    }

    #[test]
    fn detects_devanagari_and_unlisted_scripts() {
        let p = CharProperties::new();
        assert_eq!(p.script('क'), TextScript::Devanagari);
        // Georgian has no dedicated variant, so it collapses into `Other`.
        assert_eq!(p.script('ა'), TextScript::Other);
    }

    #[test]
    fn detects_rtl_scripts() {
        let p = CharProperties::new();
        assert_eq!(p.script('ا'), TextScript::Arabic);
        assert!(p.script('ا').is_rtl());
        assert_eq!(p.script('א'), TextScript::Hebrew);
        assert!(p.script('א').is_rtl());
        assert!(!p.script('A').is_rtl());
    }

    #[test]
    fn common_and_neutral_classification() {
        let p = CharProperties::new();
        assert_eq!(p.script('5'), TextScript::Common);
        assert_eq!(p.script(' '), TextScript::Common);
        assert_eq!(p.script('.'), TextScript::Common);
    }

    #[test]
    fn combining_mark_is_inherited() {
        let p = CharProperties::new();
        // U+0301 COMBINING ACUTE ACCENT.
        assert_eq!(p.script('\u{0301}'), TextScript::Inherited);
    }

    #[test]
    fn property_predicates() {
        let p = CharProperties::new();
        assert!(p.is_alphabetic('A'));
        assert!(p.is_alphabetic('Ä'));
        assert!(!p.is_alphabetic('3'));
        assert!(p.is_numeric('7'));
        assert!(!p.is_numeric('A'));
        assert!(p.is_whitespace(' '));
        assert!(p.is_whitespace('\t'));
        assert!(!p.is_whitespace('x'));
    }

    #[test]
    fn itemize_pure_latin_one_run() {
        let p = CharProperties::new();
        let runs = p.itemize("hello");
        assert_eq!(runs.len(), 1);
        assert_eq!(runs[0].script, TextScript::Latin);
        assert_eq!(runs[0].start, 0);
        assert_eq!(runs[0].end, 5);
    }

    #[test]
    fn itemize_merges_spaces_into_latin() {
        let p = CharProperties::new();
        let runs = p.itemize("hi yo");
        assert_eq!(runs.len(), 1, "space should not break a same-script run");
        assert_eq!(runs[0].script, TextScript::Latin);
    }

    #[test]
    fn itemize_splits_latin_and_han() {
        let p = CharProperties::new();
        let text = "abc木字";
        let runs = p.itemize(text);
        assert_eq!(runs.len(), 2);
        assert_eq!(runs[0].script, TextScript::Latin);
        assert_eq!(runs[0].start, 0);
        assert_eq!(runs[0].end, 3);
        assert_eq!(runs[1].script, TextScript::Han);
        assert_eq!(runs[1].start, 3);
        assert_eq!(runs[1].end, text.len());
    }

    #[test]
    fn itemize_empty_is_empty() {
        let p = CharProperties::new();
        assert!(p.itemize("").is_empty());
    }

    #[test]
    fn itemize_leading_neutral_takes_following_script() {
        let p = CharProperties::new();
        let runs = p.itemize("12ab");
        assert_eq!(runs.len(), 1);
        assert_eq!(runs[0].script, TextScript::Latin);
        assert_eq!(runs[0].start, 0);
        assert_eq!(runs[0].end, 4);
    }

    #[test]
    fn itemize_neutral_only_is_single_common_run() {
        let p = CharProperties::new();
        let runs = p.itemize("123 !?");
        assert_eq!(
            runs,
            vec![ScriptRun {
                start: 0,
                end: 6,
                script: TextScript::Common,
            }]
        );
    }

    #[test]
    fn itemize_attaches_neutrals_to_preceding_run() {
        let p = CharProperties::new();
        // 'e' (1 byte) + U+0301 (2 bytes) + ' ' (1 byte) stay with Latin;
        // '木' (3 bytes) opens the Han run.
        let text = "e\u{0301} 木";
        let runs = p.itemize(text);
        assert_eq!(
            runs,
            vec![
                ScriptRun {
                    start: 0,
                    end: 4,
                    script: TextScript::Latin,
                },
                ScriptRun {
                    start: 4,
                    end: text.len(),
                    script: TextScript::Han,
                },
            ]
        );
    }

    #[test]
    fn dominant_script_picks_majority() {
        let p = CharProperties::new();
        assert_eq!(p.dominant_script("abc木"), TextScript::Latin);
        assert_eq!(p.dominant_script("木字宙abc語"), TextScript::Han);
        assert_eq!(p.dominant_script("123 456"), TextScript::Common);
    }

    #[test]
    fn dominant_script_of_empty_text_is_common() {
        let p = CharProperties::new();
        assert_eq!(p.dominant_script(""), TextScript::Common);
    }

    #[test]
    fn has_rtl_detection() {
        let p = CharProperties::new();
        assert!(p.has_rtl("hello مرحبا"));
        assert!(p.has_rtl("שלום"));
        assert!(!p.has_rtl("hello world"));
        assert!(!p.has_rtl(""));
    }
}