1#![forbid(unsafe_code)]
2
3use std::collections::HashMap;
25
26use crate::wrap::BreakPenalty;
27
28#[derive(Debug, Clone, PartialEq, Eq)]
38pub struct HyphenationPattern {
39 pub chars: Vec<char>,
41 pub levels: Vec<u8>,
44}
45
46#[derive(Debug, Clone, Copy, PartialEq, Eq)]
48pub struct HyphenBreakPoint {
49 pub offset: usize,
52 pub level: u8,
54}
55
56impl HyphenBreakPoint {
57 #[must_use]
62 pub fn to_penalty(self) -> BreakPenalty {
63 let value = match self.level {
64 1 => 50,
65 3 => 40,
66 _ => 30, };
68 BreakPenalty {
69 value,
70 flagged: true,
71 }
72 }
73}
74
75#[must_use]
88pub fn compile_pattern(pattern: &str) -> Option<HyphenationPattern> {
89 let mut chars = Vec::new();
90 let mut levels = Vec::new();
91 let mut pending_digit: Option<u8> = None;
92
93 for ch in pattern.chars() {
94 if ch.is_ascii_digit() {
95 pending_digit = Some(ch as u8 - b'0');
96 } else if ch.is_alphabetic() || ch == '.' {
97 levels.push(pending_digit.unwrap_or(0));
98 pending_digit = None;
99 chars.push(ch.to_ascii_lowercase());
100 }
101 }
102 levels.push(pending_digit.unwrap_or(0));
104
105 if chars.is_empty() {
106 return None;
107 }
108
109 debug_assert_eq!(levels.len(), chars.len() + 1);
110 Some(HyphenationPattern { chars, levels })
111}
112
113#[derive(Debug, Clone, Default)]
119struct TrieNode {
120 children: HashMap<char, usize>,
121 levels: Option<Vec<u8>>,
124}
125
126#[derive(Debug, Clone)]
131pub struct PatternTrie {
132 nodes: Vec<TrieNode>,
133}
134
135impl PatternTrie {
136 #[must_use]
138 pub fn new(patterns: &[HyphenationPattern]) -> Self {
139 let mut trie = Self {
140 nodes: vec![TrieNode::default()],
141 };
142 for pat in patterns {
143 trie.insert(pat);
144 }
145 trie
146 }
147
148 fn insert(&mut self, pattern: &HyphenationPattern) {
149 let mut node_idx = 0;
150 for &ch in &pattern.chars {
151 let next_idx = if let Some(&idx) = self.nodes[node_idx].children.get(&ch) {
152 idx
153 } else {
154 let idx = self.nodes.len();
155 self.nodes.push(TrieNode::default());
156 self.nodes[node_idx].children.insert(ch, idx);
157 idx
158 };
159 node_idx = next_idx;
160 }
161 self.nodes[node_idx].levels = Some(pattern.levels.clone());
162 }
163
164 fn apply_at(&self, word_chars: &[char], start: usize, out_levels: &mut [u8]) {
167 let mut node_idx = 0;
168 for &ch in &word_chars[start..] {
169 let Some(&next) = self.nodes[node_idx].children.get(&ch) else {
170 break;
171 };
172 node_idx = next;
173 if let Some(ref lvls) = self.nodes[node_idx].levels {
174 for (j, &lv) in lvls.iter().enumerate() {
176 let pos = start + j;
177 if pos < out_levels.len() {
178 out_levels[pos] = out_levels[pos].max(lv);
179 }
180 }
181 }
182 }
183 }
184}
185
186const LEFT_HYPHEN_MIN: usize = 2;
192const RIGHT_HYPHEN_MIN: usize = 3;
194
195#[derive(Debug, Clone)]
200pub struct HyphenationDict {
201 pub language: String,
203 trie: PatternTrie,
205 exceptions: HashMap<String, Vec<usize>>,
208 pub left_min: usize,
210 pub right_min: usize,
212}
213
214impl HyphenationDict {
215 #[must_use]
220 pub fn new(language: &str, patterns: &[&str], exceptions: &[&str]) -> Self {
221 let compiled: Vec<HyphenationPattern> =
222 patterns.iter().filter_map(|p| compile_pattern(p)).collect();
223 let trie = PatternTrie::new(&compiled);
224
225 let mut exc_map = HashMap::new();
226 for &exc in exceptions {
227 let (word, breaks) = parse_exception(exc);
228 exc_map.insert(word, breaks);
229 }
230
231 Self {
232 language: language.to_string(),
233 trie,
234 exceptions: exc_map,
235 left_min: LEFT_HYPHEN_MIN,
236 right_min: RIGHT_HYPHEN_MIN,
237 }
238 }
239
240 #[must_use]
242 pub fn with_margins(mut self, left: usize, right: usize) -> Self {
243 self.left_min = left;
244 self.right_min = right;
245 self
246 }
247
248 #[must_use]
253 pub fn hyphenate(&self, word: &str) -> Vec<HyphenBreakPoint> {
254 let lower = word.to_lowercase();
255
256 if let Some(breaks) = self.exceptions.get(&lower) {
258 return breaks
259 .iter()
260 .filter_map(|&off| {
261 if off >= self.left_min
262 && off <= lower.chars().count().saturating_sub(self.right_min)
263 {
264 Some(HyphenBreakPoint {
265 offset: off,
266 level: 1,
267 })
268 } else {
269 None
270 }
271 })
272 .collect();
273 }
274
275 let word_chars: Vec<char> = lower.chars().collect();
277 let n = word_chars.len();
278 if n < self.left_min + self.right_min {
279 return Vec::new();
280 }
281
282 let mut delimited: Vec<char> = Vec::with_capacity(n + 2);
283 delimited.push('.');
284 delimited.extend_from_slice(&word_chars);
285 delimited.push('.');
286
287 let mut levels = vec![0u8; delimited.len() + 1];
290
291 for start in 0..delimited.len() {
293 self.trie.apply_at(&delimited, start, &mut levels);
294 }
295
296 let mut breaks = Vec::new();
301 for j in self.left_min..=(n.saturating_sub(self.right_min)) {
302 let lv = levels[j + 1]; if lv % 2 == 1 {
304 breaks.push(HyphenBreakPoint {
305 offset: j,
306 level: lv,
307 });
308 }
309 }
310
311 breaks
312 }
313
314 #[must_use]
316 pub fn can_hyphenate(&self, word: &str) -> bool {
317 !self.hyphenate(word).is_empty()
318 }
319}
320
321fn parse_exception(exc: &str) -> (String, Vec<usize>) {
323 let mut word = String::new();
324 let mut breaks = Vec::new();
325 let mut char_count = 0usize;
326
327 for ch in exc.chars() {
328 if ch == '-' {
329 breaks.push(char_count);
330 } else {
331 word.push(ch.to_ascii_lowercase());
332 char_count += 1;
333 }
334 }
335
336 (word, breaks)
337}
338
339pub const ENGLISH_PATTERNS_MINI: &[&str] = &[
349 ".hy3p", ".re1i", ".in1t", ".un1d", ".ex1a", ".dis1c", ".pre1v", ".over3f", ".semi5", ".auto3",
351 "a2l", "an2t", "as3ter", "at5omi", "be5ra", "bl2", "br2", "ca4t", "ch2", "cl2", "co2n",
353 "com5ma", "cr2", "de4moc", "di3vis", "dr2", "en3tic", "er1i", "fl2", "fr2", "gl2", "gr2",
354 "hy3pe", "i1a", "ism3", "ist3", "i2z", "li4ber", "m2p", "n2t", "ph2", "pl2", "pr2", "qu2",
355 "sc2", "sh2", "sk2", "sl2", "sm2", "sn2", "sp2", "st2", "sw2", "th2", "tr2", "tw2", "ty4p",
356 "wh2", "wr2", "ber3", "cial4", "ful3", "gy5n", "ing1", "ment3", "ness3", "tion5", "sion5", "tu4al", "able3",
358 "ible3", "ment1a", "ment1i", "n2kl", "n2gl", "n4gri", "mp3t", "nk3i", "ns2", "nt2", "nc2", "nd2", "ng2", "nf2", "ct2",
360 "pt2", "ps2", "ld2", "lf2", "lk2", "lm2", "lt2", "lv2", "rb2", "rc2", "rd2", "rf2", "rg2",
361 "rk2", "rl2", "rm2", "rn2", "rp2", "rs2", "rt2", "rv2", "rw2", "4ism.", "4ist.", "4ment.", "4ness.", "5tion.", "5sion.", "3ful.", "3less.", "3ous.", "3ive.",
363 "3able.", "3ible.", "3ment.", "3ness.",
364];
365
366pub const ENGLISH_EXCEPTIONS_MINI: &[&str] = &[
368 "as-so-ciate",
369 "as-so-ciates",
370 "dec-li-na-tion",
371 "oblig-a-tory",
372 "phil-an-thropic",
373 "present",
374 "presents",
375 "project",
376 "projects",
377 "reci-procity",
378 "ta-ble",
379];
380
381#[must_use]
383pub fn english_dict_mini() -> HyphenationDict {
384 HyphenationDict::new("en", ENGLISH_PATTERNS_MINI, ENGLISH_EXCEPTIONS_MINI)
385}
386
387#[must_use]
397pub fn break_penalties(breaks: &[HyphenBreakPoint]) -> Vec<(usize, BreakPenalty)> {
398 breaks
399 .iter()
400 .map(|bp| (bp.offset, bp.to_penalty()))
401 .collect()
402}
403
404#[cfg(test)]
409mod tests {
410 use super::*;
411
412 #[test]
415 fn compile_simple_pattern() {
416 let pat = compile_pattern("hy3p").unwrap();
417 assert_eq!(pat.chars, vec!['h', 'y', 'p']);
418 assert_eq!(pat.levels, vec![0, 0, 3, 0]);
419 }
420
421 #[test]
422 fn compile_leading_digit() {
423 let pat = compile_pattern("2ph").unwrap();
424 assert_eq!(pat.chars, vec!['p', 'h']);
425 assert_eq!(pat.levels, vec![2, 0, 0]);
426 }
427
428 #[test]
429 fn compile_trailing_digit() {
430 let pat = compile_pattern("ab4").unwrap();
431 assert_eq!(pat.chars, vec!['a', 'b']);
432 assert_eq!(pat.levels, vec![0, 0, 4]);
433 }
434
435 #[test]
436 fn compile_dot_delimiter() {
437 let pat = compile_pattern(".ex5am").unwrap();
438 assert_eq!(pat.chars, vec!['.', 'e', 'x', 'a', 'm']);
439 assert_eq!(pat.levels, vec![0, 0, 0, 5, 0, 0]);
440 }
441
442 #[test]
443 fn compile_multiple_digits() {
444 let pat = compile_pattern("a1b2c3d").unwrap();
445 assert_eq!(pat.chars, vec!['a', 'b', 'c', 'd']);
446 assert_eq!(pat.levels, vec![0, 1, 2, 3, 0]);
447 }
448
449 #[test]
450 fn compile_empty_returns_none() {
451 assert!(compile_pattern("").is_none());
452 assert!(compile_pattern("123").is_none());
453 }
454
455 #[test]
456 fn compile_all_zeros() {
457 let pat = compile_pattern("abc").unwrap();
458 assert_eq!(pat.levels, vec![0, 0, 0, 0]);
459 }
460
461 #[test]
464 fn trie_single_pattern() {
465 let pat = compile_pattern("hy3p").unwrap();
466 let trie = PatternTrie::new(&[pat]);
467
468 let word: Vec<char> = ".hyp.".chars().collect();
470 let mut levels = vec![0u8; word.len() + 1];
471 for start in 0..word.len() {
472 trie.apply_at(&word, start, &mut levels);
473 }
474 assert_eq!(levels[3], 3);
476 }
477
478 #[test]
479 fn trie_max_level_wins() {
480 let p1 = compile_pattern("ab2c").unwrap();
481 let p2 = compile_pattern("b5c").unwrap();
482 let trie = PatternTrie::new(&[p1, p2]);
483
484 let word: Vec<char> = ".abc.".chars().collect();
485 let mut levels = vec![0u8; word.len() + 1];
486 for start in 0..word.len() {
487 trie.apply_at(&word, start, &mut levels);
488 }
489 assert!(levels.contains(&5));
491 }
492
493 #[test]
496 fn parse_exception_basic() {
497 let (word, breaks) = parse_exception("hy-phen-ation");
498 assert_eq!(word, "hyphenation");
499 assert_eq!(breaks, vec![2, 6]);
500 }
501
502 #[test]
503 fn parse_exception_no_hyphens() {
504 let (word, breaks) = parse_exception("present");
505 assert_eq!(word, "present");
506 assert!(breaks.is_empty());
507 }
508
509 #[test]
510 fn parse_exception_single_hyphen() {
511 let (word, breaks) = parse_exception("ta-ble");
512 assert_eq!(word, "table");
513 assert_eq!(breaks, vec![2]);
514 }
515
516 #[test]
519 fn dict_exception_overrides_patterns() {
520 let dict = HyphenationDict::new(
521 "en",
522 &["a1b", "b1c"],
523 &["ab-c"], );
525 let breaks = dict.hyphenate("abc");
526 assert!(breaks.iter().all(|bp| bp.offset == 2));
528 }
529
530 #[test]
531 fn dict_short_word_no_breaks() {
532 let dict = english_dict_mini();
533 let breaks = dict.hyphenate("cat");
535 assert!(breaks.is_empty());
536 }
537
538 #[test]
539 fn dict_respects_left_min() {
540 let dict = HyphenationDict::new("en", &["1a1b1c1d1e"], &[]);
541 let breaks = dict.hyphenate("abcde");
542 assert!(breaks.iter().all(|bp| bp.offset >= 2));
544 }
545
546 #[test]
547 fn dict_respects_right_min() {
548 let dict = HyphenationDict::new("en", &["1a1b1c1d1e1f1g"], &[]);
549 let breaks = dict.hyphenate("abcdefg");
550 assert!(breaks.iter().all(|bp| bp.offset <= 4));
552 }
553
554 #[test]
555 fn dict_custom_margins() {
556 let dict = HyphenationDict::new("en", &["1a1b1c1d1e1f1g1h"], &[]).with_margins(3, 4);
557 let breaks = dict.hyphenate("abcdefgh");
558 assert!(breaks.iter().all(|bp| bp.offset >= 3 && bp.offset <= 4));
559 }
560
561 #[test]
562 fn dict_case_insensitive() {
563 let dict = HyphenationDict::new("en", &["hy3p"], &[]);
564 let lower = dict.hyphenate("hyper");
565 let upper = dict.hyphenate("HYPER");
566 let mixed = dict.hyphenate("Hyper");
567 assert_eq!(lower, upper);
568 assert_eq!(lower, mixed);
569 }
570
571 #[test]
572 fn dict_can_hyphenate() {
573 let dict = english_dict_mini();
574 assert!(dict.can_hyphenate("table"));
576 }
577
578 #[test]
579 fn dict_empty_word() {
580 let dict = english_dict_mini();
581 assert!(dict.hyphenate("").is_empty());
582 }
583
584 #[test]
587 fn english_mini_loads_without_panic() {
588 let dict = english_dict_mini();
589 assert_eq!(dict.language, "en");
590 }
591
592 #[test]
593 fn english_mini_exception_table() {
594 let dict = english_dict_mini();
595 let breaks = dict.hyphenate("table");
596 assert_eq!(breaks.len(), 1);
598 assert_eq!(breaks[0].offset, 2);
599 }
600
601 #[test]
602 fn english_mini_exception_associate() {
603 let dict = english_dict_mini();
604 let breaks = dict.hyphenate("associate");
605 assert_eq!(breaks.len(), 2);
607 assert_eq!(breaks[0].offset, 2);
608 assert_eq!(breaks[1].offset, 4);
609 }
610
611 #[test]
612 fn english_mini_no_exception_word() {
613 let dict = english_dict_mini();
614 let breaks = dict.hyphenate("computer");
616 assert!(!breaks.is_empty() || breaks.is_empty()); }
621
622 #[test]
625 fn penalty_level_1() {
626 let bp = HyphenBreakPoint {
627 offset: 3,
628 level: 1,
629 };
630 let penalty = bp.to_penalty();
631 assert_eq!(penalty.value, 50);
632 assert!(penalty.flagged);
633 }
634
635 #[test]
636 fn penalty_level_3() {
637 let bp = HyphenBreakPoint {
638 offset: 3,
639 level: 3,
640 };
641 let penalty = bp.to_penalty();
642 assert_eq!(penalty.value, 40);
643 assert!(penalty.flagged);
644 }
645
646 #[test]
647 fn penalty_level_5() {
648 let bp = HyphenBreakPoint {
649 offset: 3,
650 level: 5,
651 };
652 let penalty = bp.to_penalty();
653 assert_eq!(penalty.value, 30);
654 assert!(penalty.flagged);
655 }
656
657 #[test]
658 fn break_penalties_preserves_offsets() {
659 let bps = vec![
660 HyphenBreakPoint {
661 offset: 2,
662 level: 1,
663 },
664 HyphenBreakPoint {
665 offset: 5,
666 level: 3,
667 },
668 ];
669 let penalties = break_penalties(&bps);
670 assert_eq!(penalties.len(), 2);
671 assert_eq!(penalties[0].0, 2);
672 assert_eq!(penalties[1].0, 5);
673 assert!(penalties.iter().all(|(_, p)| p.flagged));
674 }
675
676 #[test]
679 fn deterministic_same_input_same_output() {
680 let dict = english_dict_mini();
681 let word = "hyphenation";
682 let breaks1 = dict.hyphenate(word);
683 let breaks2 = dict.hyphenate(word);
684 assert_eq!(breaks1, breaks2);
685 }
686
687 #[test]
688 fn deterministic_across_dict_rebuilds() {
689 let dict1 = english_dict_mini();
690 let dict2 = english_dict_mini();
691 let word = "associate";
692 assert_eq!(dict1.hyphenate(word), dict2.hyphenate(word));
693 }
694
695 #[test]
698 fn single_char_word() {
699 let dict = english_dict_mini();
700 assert!(dict.hyphenate("a").is_empty());
701 }
702
703 #[test]
704 fn two_char_word() {
705 let dict = english_dict_mini();
706 assert!(dict.hyphenate("an").is_empty());
707 }
708
709 #[test]
710 fn all_same_chars() {
711 let dict = english_dict_mini();
712 let _ = dict.hyphenate("aaaaaaa");
714 }
715
716 #[test]
717 fn unicode_word() {
718 let dict = english_dict_mini();
719 let breaks = dict.hyphenate("über");
721 assert!(breaks.is_empty() || !breaks.is_empty()); }
724
725 #[test]
726 fn only_odd_levels_produce_breaks() {
727 let dict = HyphenationDict::new("test", &["a2b2c2d2e2f"], &[]);
729 let breaks = dict.hyphenate("abcdef");
730 assert!(breaks.is_empty());
732 }
733
734 #[test]
735 fn mixed_odd_even_levels() {
736 let dict = HyphenationDict::new("test", &["a1b2c3d"], &[]).with_margins(1, 1);
739 let breaks = dict.hyphenate("abcd");
740 let offsets: Vec<usize> = breaks.iter().map(|b| b.offset).collect();
741 assert!(offsets.contains(&1)); assert!(!offsets.contains(&2)); assert!(offsets.contains(&3)); }
745}