Skip to main content

cranpose_render_common/
text_hyphenation.rs

1use cranpose_ui::text::TextStyle;
2use hyphenation::{Hyphenator, Language, Load, Standard};
3use std::collections::HashMap;
4use std::sync::{OnceLock, RwLock};
5
6const MIN_SEGMENT_CHARS: usize = 2;
7
8fn get_dictionary(language: Language) -> Option<Standard> {
9    static DICTIONARIES: OnceLock<RwLock<HashMap<Language, Standard>>> = OnceLock::new();
10    let cache = DICTIONARIES.get_or_init(|| RwLock::new(HashMap::new()));
11
12    if let Ok(read_guard) = cache.read() {
13        if let Some(dict) = read_guard.get(&language) {
14            return Some(dict.clone());
15        }
16    }
17
18    // Load if not in cache
19    match Standard::from_embedded(language) {
20        Ok(dict) => {
21            if let Ok(mut write_guard) = cache.write() {
22                write_guard.insert(language, dict.clone());
23            }
24            Some(dict)
25        }
26        Err(_) => None,
27    }
28}
29
30pub fn choose_auto_hyphen_break(
31    line: &str,
32    style: &TextStyle,
33    segment_start_char: usize,
34    measured_break_char: usize,
35) -> Option<usize> {
36    if line.is_empty() || measured_break_char <= segment_start_char {
37        return None;
38    }
39
40    let language = resolve_hyphenation_language(style)?;
41
42    let dictionary = get_dictionary(language)?;
43    let boundaries = char_boundaries(line);
44    let char_count = boundaries.len().saturating_sub(1);
45
46    if measured_break_char == 0 || measured_break_char >= char_count {
47        return None;
48    }
49    if !is_break_inside_word(line, &boundaries, measured_break_char) {
50        return None;
51    }
52
53    let (word_start, word_end) = word_bounds(line, &boundaries, measured_break_char);
54    let word = &line[boundaries[word_start]..boundaries[word_end]];
55    if word.is_empty() {
56        return None;
57    }
58
59    let max_local_break = measured_break_char.saturating_sub(word_start);
60    let min_local_break = segment_start_char
61        .saturating_sub(word_start)
62        .saturating_add(MIN_SEGMENT_CHARS);
63
64    if min_local_break > max_local_break {
65        return None;
66    }
67
68    let hyphenated = dictionary.hyphenate(word);
69    for break_byte in hyphenated.breaks.into_iter().rev() {
70        if !word.is_char_boundary(break_byte) {
71            continue;
72        }
73        let local_break_chars = word[..break_byte].chars().count();
74        if local_break_chars < min_local_break || local_break_chars > max_local_break {
75            continue;
76        }
77        return Some(word_start + local_break_chars);
78    }
79
80    None
81}
82
83fn resolve_hyphenation_language(style: &TextStyle) -> Option<Language> {
84    let Some(locale_list) = style.span_style.locale_list.as_ref() else {
85        return Some(Language::EnglishUS);
86    };
87    if locale_list.is_empty() {
88        return Some(Language::EnglishUS);
89    }
90
91    // Check first matching locale
92    let primary_locale = locale_list.locales().first()?;
93    let normalized = primary_locale.trim().replace('_', "-").to_ascii_lowercase();
94
95    if normalized.starts_with("en-gb") {
96        return Some(Language::EnglishGB);
97    }
98    if normalized.starts_with("en") || normalized == "und" {
99        return Some(Language::EnglishUS);
100    }
101    if normalized.starts_with("fr") {
102        return Some(Language::French);
103    }
104    if normalized.starts_with("de") {
105        return Some(Language::German1996);
106    }
107    if normalized.starts_with("es") {
108        return Some(Language::Spanish);
109    }
110    if normalized.starts_with("it") {
111        return Some(Language::Italian);
112    }
113    if normalized.starts_with("ru") {
114        return Some(Language::Russian);
115    }
116    if normalized.starts_with("pt") {
117        return Some(Language::Portuguese);
118    }
119    if normalized.starts_with("nl") {
120        return Some(Language::Dutch);
121    }
122    if normalized.starts_with("pl") {
123        return Some(Language::Polish);
124    }
125    if normalized.starts_with("sv") {
126        return Some(Language::Swedish);
127    }
128    if normalized.starts_with("da") {
129        return Some(Language::Danish);
130    }
131    if normalized.starts_with("cs") {
132        return Some(Language::Czech);
133    }
134    if normalized.starts_with("sk") {
135        return Some(Language::Slovak);
136    }
137    if normalized.starts_with("uk") {
138        return Some(Language::Ukrainian);
139    }
140
141    None
142}
143
/// Returns the byte offset at which each character of `text` starts, followed by
/// `text.len()` as a closing sentinel.
///
/// The result always has `text.chars().count() + 1` entries: entry `i` is the byte
/// offset of character `i`, and the last entry is the end of the string. Empty text
/// therefore yields `[0]` (zero characters), not a duplicated `[0, 0]`.
fn char_boundaries(text: &str) -> Vec<usize> {
    let mut offsets = Vec::with_capacity(text.chars().count() + 1);
    offsets.extend(text.char_indices().map(|(byte_idx, _)| byte_idx));
    offsets.push(text.len());
    offsets
}
155
/// Reports whether a break at character index `break_idx` would split a word, i.e.
/// the characters immediately before and after the break are both non-whitespace.
///
/// `boundaries` holds the byte offset of every character of `line` plus a final
/// end-of-string entry. A break at the very start, at or past the last character
/// boundary, or with an empty `boundaries` slice is never inside a word.
fn is_break_inside_word(line: &str, boundaries: &[usize], break_idx: usize) -> bool {
    // `saturating_sub` keeps an empty slice from underflowing the index arithmetic.
    let last_idx = boundaries.len().saturating_sub(1);
    if break_idx == 0 || break_idx >= last_idx {
        return false;
    }

    // True when the character starting at boundary `from` is whitespace.
    let is_blank = |from: usize| {
        line[boundaries[from]..boundaries[from + 1]]
            .chars()
            .all(char::is_whitespace)
    };
    !is_blank(break_idx - 1) && !is_blank(break_idx)
}
164
/// Finds the whitespace-delimited word surrounding character index `anchor`.
///
/// `boundaries` holds the byte offset of every character of `line` plus a final
/// end-of-string entry. The result is `(start, end)` in character indices: `start`
/// is the first character of the word and `end` is one past its last character
/// (the index of the next whitespace character, or the end of the line).
///
/// An empty `boundaries` slice with `anchor == 0` yields `(0, 0)`.
///
/// # Panics
///
/// Panics if `anchor` is greater than zero and lies beyond the last boundary index.
fn word_bounds(line: &str, boundaries: &[usize], anchor: usize) -> (usize, usize) {
    // `saturating_sub` keeps an empty slice from underflowing the index arithmetic.
    let last_idx = boundaries.len().saturating_sub(1);

    // True when the character starting at boundary `idx` is whitespace.
    let is_blank = |idx: usize| {
        line[boundaries[idx]..boundaries[idx + 1]]
            .chars()
            .all(char::is_whitespace)
    };

    // Walk backwards to the nearest whitespace before the anchor; the word starts
    // right after it (or at the start of the line when there is none).
    let start = (0..anchor)
        .rev()
        .find(|&idx| is_blank(idx))
        .map_or(0, |idx| idx + 1);

    // Walk forwards to the first whitespace at or after the anchor; the word ends
    // there (or at the end of the line when there is none).
    let end = (anchor..last_idx)
        .find(|&idx| is_blank(idx))
        .unwrap_or(last_idx);

    (start, end)
}
185
#[cfg(test)]
mod tests {
    use super::*;
    use cranpose_ui::text::{LocaleList, SpanStyle, TextStyle};

    /// Builds an otherwise-default text style whose locale list is parsed from `tags`.
    fn style_with_locale(tags: &str) -> TextStyle {
        let span_style = SpanStyle {
            locale_list: Some(LocaleList::from_language_tags(tags)),
            ..Default::default()
        };
        TextStyle {
            span_style,
            ..Default::default()
        }
    }

    /// With no locale set, the default dictionary breaks "Transforma-tion".
    #[test]
    fn dictionary_breaks_transformation_like_compose_contract() {
        assert_eq!(
            choose_auto_hyphen_break("Transformation", &TextStyle::default(), 8, 12),
            Some(10)
        );
    }

    /// A French locale selects the French dictionary; offsets are in characters.
    #[test]
    fn locale_gate_uses_french_dictionary() {
        assert_eq!(
            choose_auto_hyphen_break("éléphant", &style_with_locale("fr-FR"), 0, 7),
            Some(3)
        );
    }

    /// A German locale yields some break for a long compound word.
    #[test]
    fn locale_gate_uses_german_dictionary() {
        let style = style_with_locale("de-DE");
        assert!(
            choose_auto_hyphen_break("Geschwindigkeitsbegrenzung", &style, 10, 20).is_some()
        );
    }

    /// A locale without a supported dictionary turns hyphenation off.
    #[test]
    fn unknown_locale_disables_hyphenation() {
        assert_eq!(
            choose_auto_hyphen_break("Transformation", &style_with_locale("ja-JP"), 8, 12),
            None
        );
    }

    /// Underscore-separated tags are accepted as an alias for the hyphenated form.
    #[test]
    fn dictionary_uses_english_locale_alias() {
        assert_eq!(
            choose_auto_hyphen_break("Transformation", &style_with_locale("en_GB"), 8, 12),
            Some(10)
        );
    }

    /// A measured break that sits next to whitespace is not a hyphenation candidate.
    #[test]
    fn ignores_breaks_outside_words() {
        assert_eq!(
            choose_auto_hyphen_break("ab cd", &TextStyle::default(), 0, 2),
            None
        );
    }
}
238    #[test]
239    fn ignores_breaks_outside_words() {
240        let break_idx = choose_auto_hyphen_break("ab cd", &TextStyle::default(), 0, 2);
241        assert_eq!(break_idx, None);
242    }
243}