#![forbid(unsafe_code)]
use std::collections::HashMap;
use crate::wrap::BreakPenalty;
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HyphenationPattern {
pub chars: Vec<char>,
pub levels: Vec<u8>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct HyphenBreakPoint {
pub offset: usize,
pub level: u8,
}
impl HyphenBreakPoint {
#[must_use]
pub fn to_penalty(self) -> BreakPenalty {
let value = match self.level {
1 => 50,
3 => 40,
_ => 30, };
BreakPenalty {
value,
flagged: true,
}
}
}
#[must_use]
pub fn compile_pattern(pattern: &str) -> Option<HyphenationPattern> {
let mut chars = Vec::new();
let mut levels = Vec::new();
let mut pending_digit: Option<u8> = None;
for ch in pattern.chars() {
if ch.is_ascii_digit() {
pending_digit = Some(ch as u8 - b'0');
} else if ch.is_alphabetic() || ch == '.' {
levels.push(pending_digit.unwrap_or(0));
pending_digit = None;
chars.push(ch.to_ascii_lowercase());
}
}
levels.push(pending_digit.unwrap_or(0));
if chars.is_empty() {
return None;
}
debug_assert_eq!(levels.len(), chars.len() + 1);
Some(HyphenationPattern { chars, levels })
}
#[derive(Debug, Clone, Default)]
struct TrieNode {
children: HashMap<char, usize>,
levels: Option<Vec<u8>>,
}
#[derive(Debug, Clone)]
pub struct PatternTrie {
nodes: Vec<TrieNode>,
}
impl PatternTrie {
#[must_use]
pub fn new(patterns: &[HyphenationPattern]) -> Self {
let mut trie = Self {
nodes: vec![TrieNode::default()],
};
for pat in patterns {
trie.insert(pat);
}
trie
}
fn insert(&mut self, pattern: &HyphenationPattern) {
let mut node_idx = 0;
for &ch in &pattern.chars {
let next_idx = if let Some(&idx) = self.nodes[node_idx].children.get(&ch) {
idx
} else {
let idx = self.nodes.len();
self.nodes.push(TrieNode::default());
self.nodes[node_idx].children.insert(ch, idx);
idx
};
node_idx = next_idx;
}
self.nodes[node_idx].levels = Some(pattern.levels.clone());
}
fn apply_at(&self, word_chars: &[char], start: usize, out_levels: &mut [u8]) {
let mut node_idx = 0;
for &ch in &word_chars[start..] {
let Some(&next) = self.nodes[node_idx].children.get(&ch) else {
break;
};
node_idx = next;
if let Some(ref lvls) = self.nodes[node_idx].levels {
for (j, &lv) in lvls.iter().enumerate() {
let pos = start + j;
if pos < out_levels.len() {
out_levels[pos] = out_levels[pos].max(lv);
}
}
}
}
}
}
const LEFT_HYPHEN_MIN: usize = 2;
const RIGHT_HYPHEN_MIN: usize = 3;
#[derive(Debug, Clone)]
pub struct HyphenationDict {
pub language: String,
trie: PatternTrie,
exceptions: HashMap<String, Vec<usize>>,
pub left_min: usize,
pub right_min: usize,
}
impl HyphenationDict {
#[must_use]
pub fn new(language: &str, patterns: &[&str], exceptions: &[&str]) -> Self {
let compiled: Vec<HyphenationPattern> =
patterns.iter().filter_map(|p| compile_pattern(p)).collect();
let trie = PatternTrie::new(&compiled);
let mut exc_map = HashMap::new();
for &exc in exceptions {
let (word, breaks) = parse_exception(exc);
exc_map.insert(word, breaks);
}
Self {
language: language.to_string(),
trie,
exceptions: exc_map,
left_min: LEFT_HYPHEN_MIN,
right_min: RIGHT_HYPHEN_MIN,
}
}
#[must_use]
pub fn with_margins(mut self, left: usize, right: usize) -> Self {
self.left_min = left;
self.right_min = right;
self
}
#[must_use]
pub fn hyphenate(&self, word: &str) -> Vec<HyphenBreakPoint> {
let lower = word.to_lowercase();
if let Some(breaks) = self.exceptions.get(&lower) {
return breaks
.iter()
.filter_map(|&off| {
if off >= self.left_min
&& off <= lower.chars().count().saturating_sub(self.right_min)
{
Some(HyphenBreakPoint {
offset: off,
level: 1,
})
} else {
None
}
})
.collect();
}
let word_chars: Vec<char> = lower.chars().collect();
let n = word_chars.len();
if n < self.left_min + self.right_min {
return Vec::new();
}
let mut delimited: Vec<char> = Vec::with_capacity(n + 2);
delimited.push('.');
delimited.extend_from_slice(&word_chars);
delimited.push('.');
let mut levels = vec![0u8; delimited.len() + 1];
for start in 0..delimited.len() {
self.trie.apply_at(&delimited, start, &mut levels);
}
let mut breaks = Vec::new();
for j in self.left_min..=(n.saturating_sub(self.right_min)) {
let lv = levels[j + 1]; if lv % 2 == 1 {
breaks.push(HyphenBreakPoint {
offset: j,
level: lv,
});
}
}
breaks
}
#[must_use]
pub fn can_hyphenate(&self, word: &str) -> bool {
!self.hyphenate(word).is_empty()
}
}
fn parse_exception(exc: &str) -> (String, Vec<usize>) {
let mut word = String::new();
let mut breaks = Vec::new();
let mut char_count = 0usize;
for ch in exc.chars() {
if ch == '-' {
breaks.push(char_count);
} else {
word.push(ch.to_ascii_lowercase());
char_count += 1;
}
}
(word, breaks)
}
pub const ENGLISH_PATTERNS_MINI: &[&str] = &[
".hy3p", ".re1i", ".in1t", ".un1d", ".ex1a", ".dis1c", ".pre1v", ".over3f", ".semi5", ".auto3",
"a2l", "an2t", "as3ter", "at5omi", "be5ra", "bl2", "br2", "ca4t", "ch2", "cl2", "co2n",
"com5ma", "cr2", "de4moc", "di3vis", "dr2", "en3tic", "er1i", "fl2", "fr2", "gl2", "gr2",
"hy3pe", "i1a", "ism3", "ist3", "i2z", "li4ber", "m2p", "n2t", "ph2", "pl2", "pr2", "qu2",
"sc2", "sh2", "sk2", "sl2", "sm2", "sn2", "sp2", "st2", "sw2", "th2", "tr2", "tw2", "ty4p",
"wh2", "wr2", "ber3", "cial4", "ful3", "gy5n", "ing1", "ment3", "ness3", "tion5", "sion5", "tu4al", "able3",
"ible3", "ment1a", "ment1i", "n2kl", "n2gl", "n4gri", "mp3t", "nk3i", "ns2", "nt2", "nc2", "nd2", "ng2", "nf2", "ct2",
"pt2", "ps2", "ld2", "lf2", "lk2", "lm2", "lt2", "lv2", "rb2", "rc2", "rd2", "rf2", "rg2",
"rk2", "rl2", "rm2", "rn2", "rp2", "rs2", "rt2", "rv2", "rw2", "4ism.", "4ist.", "4ment.", "4ness.", "5tion.", "5sion.", "3ful.", "3less.", "3ous.", "3ive.",
"3able.", "3ible.", "3ment.", "3ness.",
];
pub const ENGLISH_EXCEPTIONS_MINI: &[&str] = &[
"as-so-ciate",
"as-so-ciates",
"dec-li-na-tion",
"oblig-a-tory",
"phil-an-thropic",
"present",
"presents",
"project",
"projects",
"reci-procity",
"ta-ble",
];
#[must_use]
pub fn english_dict_mini() -> HyphenationDict {
HyphenationDict::new("en", ENGLISH_PATTERNS_MINI, ENGLISH_EXCEPTIONS_MINI)
}
#[must_use]
pub fn break_penalties(breaks: &[HyphenBreakPoint]) -> Vec<(usize, BreakPenalty)> {
breaks
.iter()
.map(|bp| (bp.offset, bp.to_penalty()))
.collect()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn compile_simple_pattern() {
let pat = compile_pattern("hy3p").unwrap();
assert_eq!(pat.chars, vec!['h', 'y', 'p']);
assert_eq!(pat.levels, vec![0, 0, 3, 0]);
}
#[test]
fn compile_leading_digit() {
let pat = compile_pattern("2ph").unwrap();
assert_eq!(pat.chars, vec!['p', 'h']);
assert_eq!(pat.levels, vec![2, 0, 0]);
}
#[test]
fn compile_trailing_digit() {
let pat = compile_pattern("ab4").unwrap();
assert_eq!(pat.chars, vec!['a', 'b']);
assert_eq!(pat.levels, vec![0, 0, 4]);
}
#[test]
fn compile_dot_delimiter() {
let pat = compile_pattern(".ex5am").unwrap();
assert_eq!(pat.chars, vec!['.', 'e', 'x', 'a', 'm']);
assert_eq!(pat.levels, vec![0, 0, 0, 5, 0, 0]);
}
#[test]
fn compile_multiple_digits() {
let pat = compile_pattern("a1b2c3d").unwrap();
assert_eq!(pat.chars, vec!['a', 'b', 'c', 'd']);
assert_eq!(pat.levels, vec![0, 1, 2, 3, 0]);
}
#[test]
fn compile_empty_returns_none() {
assert!(compile_pattern("").is_none());
assert!(compile_pattern("123").is_none());
}
#[test]
fn compile_all_zeros() {
let pat = compile_pattern("abc").unwrap();
assert_eq!(pat.levels, vec![0, 0, 0, 0]);
}
#[test]
fn trie_single_pattern() {
let pat = compile_pattern("hy3p").unwrap();
let trie = PatternTrie::new(&[pat]);
let word: Vec<char> = ".hyp.".chars().collect();
let mut levels = vec![0u8; word.len() + 1];
for start in 0..word.len() {
trie.apply_at(&word, start, &mut levels);
}
assert_eq!(levels[3], 3);
}
#[test]
fn trie_max_level_wins() {
let p1 = compile_pattern("ab2c").unwrap();
let p2 = compile_pattern("b5c").unwrap();
let trie = PatternTrie::new(&[p1, p2]);
let word: Vec<char> = ".abc.".chars().collect();
let mut levels = vec![0u8; word.len() + 1];
for start in 0..word.len() {
trie.apply_at(&word, start, &mut levels);
}
assert!(levels.contains(&5));
}
#[test]
fn parse_exception_basic() {
let (word, breaks) = parse_exception("hy-phen-ation");
assert_eq!(word, "hyphenation");
assert_eq!(breaks, vec![2, 6]);
}
#[test]
fn parse_exception_no_hyphens() {
let (word, breaks) = parse_exception("present");
assert_eq!(word, "present");
assert!(breaks.is_empty());
}
#[test]
fn parse_exception_single_hyphen() {
let (word, breaks) = parse_exception("ta-ble");
assert_eq!(word, "table");
assert_eq!(breaks, vec![2]);
}
#[test]
fn dict_exception_overrides_patterns() {
let dict = HyphenationDict::new(
"en",
&["a1b", "b1c"],
&["ab-c"], );
let breaks = dict.hyphenate("abc");
assert!(breaks.iter().all(|bp| bp.offset == 2));
}
#[test]
fn dict_short_word_no_breaks() {
let dict = english_dict_mini();
let breaks = dict.hyphenate("cat");
assert!(breaks.is_empty());
}
#[test]
fn dict_respects_left_min() {
let dict = HyphenationDict::new("en", &["1a1b1c1d1e"], &[]);
let breaks = dict.hyphenate("abcde");
assert!(breaks.iter().all(|bp| bp.offset >= 2));
}
#[test]
fn dict_respects_right_min() {
let dict = HyphenationDict::new("en", &["1a1b1c1d1e1f1g"], &[]);
let breaks = dict.hyphenate("abcdefg");
assert!(breaks.iter().all(|bp| bp.offset <= 4));
}
#[test]
fn dict_custom_margins() {
let dict = HyphenationDict::new("en", &["1a1b1c1d1e1f1g1h"], &[]).with_margins(3, 4);
let breaks = dict.hyphenate("abcdefgh");
assert!(breaks.iter().all(|bp| bp.offset >= 3 && bp.offset <= 4));
}
#[test]
fn dict_case_insensitive() {
let dict = HyphenationDict::new("en", &["hy3p"], &[]);
let lower = dict.hyphenate("hyper");
let upper = dict.hyphenate("HYPER");
let mixed = dict.hyphenate("Hyper");
assert_eq!(lower, upper);
assert_eq!(lower, mixed);
}
#[test]
fn dict_can_hyphenate() {
let dict = english_dict_mini();
assert!(dict.can_hyphenate("table"));
}
#[test]
fn dict_empty_word() {
let dict = english_dict_mini();
assert!(dict.hyphenate("").is_empty());
}
#[test]
fn english_mini_loads_without_panic() {
let dict = english_dict_mini();
assert_eq!(dict.language, "en");
}
#[test]
fn english_mini_exception_table() {
let dict = english_dict_mini();
let breaks = dict.hyphenate("table");
assert_eq!(breaks.len(), 1);
assert_eq!(breaks[0].offset, 2);
}
#[test]
fn english_mini_exception_associate() {
let dict = english_dict_mini();
let breaks = dict.hyphenate("associate");
assert_eq!(breaks.len(), 2);
assert_eq!(breaks[0].offset, 2);
assert_eq!(breaks[1].offset, 4);
}
#[test]
fn english_mini_no_exception_word() {
let dict = english_dict_mini();
let breaks = dict.hyphenate("computer");
assert!(!breaks.is_empty() || breaks.is_empty()); }
#[test]
fn penalty_level_1() {
let bp = HyphenBreakPoint {
offset: 3,
level: 1,
};
let penalty = bp.to_penalty();
assert_eq!(penalty.value, 50);
assert!(penalty.flagged);
}
#[test]
fn penalty_level_3() {
let bp = HyphenBreakPoint {
offset: 3,
level: 3,
};
let penalty = bp.to_penalty();
assert_eq!(penalty.value, 40);
assert!(penalty.flagged);
}
#[test]
fn penalty_level_5() {
let bp = HyphenBreakPoint {
offset: 3,
level: 5,
};
let penalty = bp.to_penalty();
assert_eq!(penalty.value, 30);
assert!(penalty.flagged);
}
#[test]
fn break_penalties_preserves_offsets() {
let bps = vec![
HyphenBreakPoint {
offset: 2,
level: 1,
},
HyphenBreakPoint {
offset: 5,
level: 3,
},
];
let penalties = break_penalties(&bps);
assert_eq!(penalties.len(), 2);
assert_eq!(penalties[0].0, 2);
assert_eq!(penalties[1].0, 5);
assert!(penalties.iter().all(|(_, p)| p.flagged));
}
#[test]
fn deterministic_same_input_same_output() {
let dict = english_dict_mini();
let word = "hyphenation";
let breaks1 = dict.hyphenate(word);
let breaks2 = dict.hyphenate(word);
assert_eq!(breaks1, breaks2);
}
#[test]
fn deterministic_across_dict_rebuilds() {
let dict1 = english_dict_mini();
let dict2 = english_dict_mini();
let word = "associate";
assert_eq!(dict1.hyphenate(word), dict2.hyphenate(word));
}
#[test]
fn single_char_word() {
let dict = english_dict_mini();
assert!(dict.hyphenate("a").is_empty());
}
#[test]
fn two_char_word() {
let dict = english_dict_mini();
assert!(dict.hyphenate("an").is_empty());
}
#[test]
fn all_same_chars() {
let dict = english_dict_mini();
let _ = dict.hyphenate("aaaaaaa");
}
#[test]
fn unicode_word() {
let dict = english_dict_mini();
let breaks = dict.hyphenate("über");
assert!(breaks.is_empty() || !breaks.is_empty()); }
#[test]
fn only_odd_levels_produce_breaks() {
let dict = HyphenationDict::new("test", &["a2b2c2d2e2f"], &[]);
let breaks = dict.hyphenate("abcdef");
assert!(breaks.is_empty());
}
#[test]
fn mixed_odd_even_levels() {
let dict = HyphenationDict::new("test", &["a1b2c3d"], &[]).with_margins(1, 1);
let breaks = dict.hyphenate("abcd");
let offsets: Vec<usize> = breaks.iter().map(|b| b.offset).collect();
assert!(offsets.contains(&1)); assert!(!offsets.contains(&2)); assert!(offsets.contains(&3)); }
}