/// Re-joins words that were split across a line break by a trailing hyphen.
///
/// Both the ASCII hyphen (`-`) and the soft hyphen (U+00AD) are recognised as
/// line-break markers. Instances are configured through the `with_*` builder
/// methods.
#[derive(Debug, Clone)]
pub struct HyphenationHandler {
/// Shortest first word the following line may start with for the two
/// fragments to be joined; shorter continuations leave the line untouched.
min_continuation_length: usize,
/// When `true`, an ASCII hyphen is left in place if the two fragments look
/// like a hyphenated compound. Soft hyphens are not subject to this check.
preserve_compounds: bool,
}
impl Default for HyphenationHandler {
fn default() -> Self {
Self::new()
}
}
impl HyphenationHandler {
    /// Creates a handler with the default settings: a continuation must be at
    /// least 2 characters long, and compound-looking words keep their hyphen.
    pub fn new() -> Self {
        Self {
            min_continuation_length: 2,
            preserve_compounds: true,
        }
    }

    /// Sets the minimum length, in characters, that the first word of the
    /// following line must have before it is treated as a continuation.
    pub fn with_min_continuation_length(mut self, len: usize) -> Self {
        self.min_continuation_length = len;
        self
    }

    /// Chooses whether ASCII-hyphenated fragments that look like a compound
    /// word (see `is_compound_word`) keep their hyphen and line break.
    pub fn with_preserve_compounds(mut self, preserve: bool) -> Self {
        self.preserve_compounds = preserve;
        self
    }

    /// Returns `true` when `text`, ignoring trailing whitespace, ends with a
    /// hyphen (`-` or the soft hyphen U+00AD) that directly follows an
    /// alphabetic character.
    ///
    /// Empty input, a lone hyphen, and hyphens after digits or punctuation
    /// (for example `"123-"`) all yield `false`.
    pub fn is_continuation_hyphen(text: &str) -> bool {
        let trimmed = text.trim_end();
        trimmed
            .strip_suffix('\u{00AD}')
            .or_else(|| trimmed.strip_suffix('-'))
            .and_then(|stem| stem.chars().next_back())
            .is_some_and(char::is_alphabetic)
    }

    /// Heuristically decides whether `first_part-second_part` is a genuine
    /// hyphenated compound (hyphen must stay) rather than a word that was
    /// merely broken across lines (hyphen must go).
    ///
    /// The checks run in this order:
    /// 1. If the concatenation, minus trailing non-alphabetic characters such
    ///    as sentence punctuation, is a known common word, it is *not* a
    ///    compound. This runs first so that a listed word split right after a
    ///    prefix-like fragment ("inter-" + "national") is still re-joined.
    /// 2. A first part that is one of the known compound prefixes is a compound.
    /// 3. A known technical pair ("content" + "type…") is a compound.
    /// 4. Otherwise it is a compound only when the characters on both sides of
    ///    the hyphen are lowercase.
    fn is_compound_word(first_part: &str, second_part: &str) -> bool {
        const COMPOUND_PREFIXES: &[&str] = &[
            "self", "non", "anti", "pre", "post", "re", "co", "ex", "multi", "semi", "sub",
            "super", "ultra", "under", "over", "cross", "inter", "intra", "counter", "mid", "well",
            "ill", "all", "half", "high", "low", "full", "part", "short", "long", "hard", "soft",
        ];
        const TECHNICAL_PAIRS: &[(&str, &str)] = &[
            ("content", "type"),
            ("content", "length"),
            ("content", "encoding"),
            ("content", "coding"),
            ("user", "agent"),
            ("cache", "control"),
            ("product", "version"),
            ("media", "type"),
        ];

        // The continuation is the raw first token of the next line, so it may
        // carry punctuation ("ment.", "ment,") that must not defeat the lookup.
        let combined = format!("{}{}", first_part, second_part);
        let candidate = combined.trim_end_matches(|c: char| !c.is_alphabetic());
        if is_common_word(candidate) {
            return false;
        }

        let first_lower = first_part.to_lowercase();
        if COMPOUND_PREFIXES.contains(&first_lower.as_str()) {
            return true;
        }

        let second_lower = second_part.to_lowercase();
        let is_technical_pair = TECHNICAL_PAIRS
            .iter()
            .any(|&(head, tail)| first_lower == head && second_lower.starts_with(tail));
        if is_technical_pair {
            return true;
        }

        let lowercase_before = first_part
            .chars()
            .next_back()
            .is_some_and(char::is_lowercase);
        let lowercase_after = second_part.chars().next().is_some_and(char::is_lowercase);
        lowercase_before && lowercase_after
    }

    /// Tries to merge `next_line` into `current_line` when the latter ends in
    /// a line-break hyphen.
    ///
    /// Returns `(line, consumed)`. When `consumed` is `false`, `line` is
    /// `current_line` unchanged and `next_line` still has to be emitted by the
    /// caller. When it is `true`, `line` holds the text before the broken
    /// word, the re-joined word, and the remainder of `next_line`, separated
    /// by single spaces.
    ///
    /// No join happens when:
    /// * `current_line` does not end in a continuation hyphen,
    /// * `next_line` has no first word, or that word has fewer characters than
    ///   the configured minimum continuation length,
    /// * the hyphen is an ASCII hyphen, compound preservation is enabled, and
    ///   the two fragments look like a compound word.
    pub fn process_line_pair(&self, current_line: &str, next_line: &str) -> (String, bool) {
        let keep_as_is = || (current_line.to_string(), false);

        let trimmed_current = current_line.trim_end();
        if !Self::is_continuation_hyphen(trimmed_current) {
            return keep_as_is();
        }

        let trimmed_next = next_line.trim_start();
        let next_word = trimmed_next.split_whitespace().next().unwrap_or("");
        // Measure in characters, not bytes, so non-ASCII fragments are not
        // over-counted; a blank next line offers nothing to join with.
        if next_word.is_empty() || next_word.chars().count() < self.min_continuation_length {
            return keep_as_is();
        }

        let (without_hyphen, is_soft_hyphen) = match trimmed_current.strip_suffix('\u{00AD}') {
            Some(stem) => (stem, true),
            None => (
                trimmed_current.strip_suffix('-').unwrap_or(trimmed_current),
                false,
            ),
        };
        let last_word = without_hyphen
            .split_whitespace()
            .next_back()
            .unwrap_or(without_hyphen);

        // A soft hyphen only ever marks a line break, so it is always removed.
        if !is_soft_hyphen
            && self.preserve_compounds
            && Self::is_compound_word(last_word, next_word)
        {
            return keep_as_is();
        }

        let prefix = without_hyphen.strip_suffix(last_word).unwrap_or("");
        let rest_of_next = trimmed_next
            .strip_prefix(next_word)
            .unwrap_or("")
            .trim_start();

        let mut result = String::with_capacity(current_line.len() + next_line.len());
        if !prefix.is_empty() {
            result.push_str(prefix.trim_end());
            result.push(' ');
        }
        result.push_str(last_word);
        result.push_str(next_word);
        if !rest_of_next.is_empty() {
            result.push(' ');
            result.push_str(rest_of_next);
        }
        (result, true)
    }

    /// Removes line-break hyphenation from a whole text.
    ///
    /// The text is split with `str::lines` and re-joined with `'\n'`. Each
    /// line is offered to `process_line_pair` together with the line that
    /// follows it. A merged line is offered again with the next line, so runs
    /// of consecutive hyphenated lines are fully resolved. If the input ends
    /// with a newline, so does the output.
    pub fn process_text(&self, text: &str) -> String {
        let mut lines = text.lines();
        let Some(first) = lines.next() else {
            return String::new();
        };

        let mut merged: Vec<String> = Vec::new();
        let mut current = first.to_string();
        for next in lines {
            let (processed, consumed) = self.process_line_pair(&current, next);
            if consumed {
                // Keep accumulating: the merged line may itself end in a hyphen.
                current = processed;
            } else {
                merged.push(std::mem::replace(&mut current, next.to_string()));
            }
        }
        merged.push(current);

        let mut output = merged.join("\n");
        // `lines` drops exactly one trailing terminator, so restore exactly one
        // even when the last line is blank and the joined text already ends
        // in '\n'.
        if text.ends_with('\n') {
            output.push('\n');
        }
        output
    }
}
/// Reports whether `word`, compared case-insensitively via `to_lowercase`,
/// is one of a fixed list of long, frequently hyphenated words.
///
/// The comparison is an exact match on the whole word, so surrounding
/// whitespace or punctuation makes the lookup fail.
fn is_common_word(word: &str) -> bool {
    const COMMON_WORDS: &[&str] = &[
        "government",
        "department",
        "information",
        "administration",
        "documentation",
        "implementation",
        "communication",
        "organization",
        "representation",
        "transportation",
        "investigation",
        "determination",
        "consideration",
        "recommendation",
        "responsibility",
        "understanding",
        "international",
        "environmental",
        "constitutional",
        "congressional",
        "agricultural",
        "professional",
        "manufacturing",
        "requirements",
        "development",
        "management",
        "performance",
        "maintenance",
        "compliance",
        "procedures",
        "regulations",
        "activities",
        "operations",
        "provisions",
        "conditions",
        "limitations",
        "applications",
        "publications",
    ];

    let needle = word.to_lowercase();
    COMMON_WORDS.iter().any(|&known| known == needle)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for the associated function exercised by the first four tests.
    fn ends_with_break(text: &str) -> bool {
        HyphenationHandler::is_continuation_hyphen(text)
    }

    #[test]
    fn test_is_continuation_hyphen_basic() {
        // Trailing whitespace after the hyphen is ignored.
        for sample in ["Govern-", "content-", "word- "] {
            assert!(ends_with_break(sample));
        }
    }

    #[test]
    fn test_is_continuation_hyphen_negative() {
        // A leading bullet dash, empty input, plain text, and a hyphen after a
        // digit are not word breaks.
        for sample in ["- bullet", "", "no hyphen", "123-"] {
            assert!(!ends_with_break(sample));
        }
    }

    #[test]
    fn test_is_continuation_hyphen_soft_hyphen() {
        for sample in ["Govern\u{00AD}", "busi\u{00AD}", "word\u{00AD} "] {
            assert!(ends_with_break(sample));
        }
    }

    #[test]
    fn test_is_continuation_hyphen_soft_hyphen_negative() {
        for sample in ["123\u{00AD}", "\u{00AD}"] {
            assert!(!ends_with_break(sample));
        }
    }

    #[test]
    fn test_is_compound_word() {
        let compounds = [("self", "regulation"), ("non", "linear"), ("content", "type")];
        for (head, tail) in compounds {
            assert!(HyphenationHandler::is_compound_word(head, tail));
        }
    }

    #[test]
    fn test_process_line_pair_join() {
        let (line, joined) = HyphenationHandler::new().process_line_pair("Govern-", "ment of the");
        assert!(joined);
        assert_eq!(line, "Government of the");
    }

    #[test]
    fn test_process_line_pair_preserve_compound() {
        let (line, joined) = HyphenationHandler::new().process_line_pair("self-", "regulation");
        assert!(!joined);
        assert_eq!(line, "self-");
    }

    #[test]
    fn test_process_line_pair_soft_hyphen() {
        let (line, joined) =
            HyphenationHandler::new().process_line_pair("busi\u{00AD}", "ness today");
        assert!(joined);
        assert_eq!(line, "business today");
    }

    #[test]
    fn test_process_line_pair_soft_hyphen_always_joins() {
        // A soft hyphen bypasses compound preservation.
        let (line, joined) =
            HyphenationHandler::new().process_line_pair("self\u{00AD}", "regulation");
        assert!(joined);
        assert_eq!(line, "selfregulation");
    }

    #[test]
    fn test_process_text_multiple_lines() {
        let input = "The Govern-\nment issued a\nstate-\nment today.";
        let output = HyphenationHandler::new().process_text(input);
        assert!(output.contains("Government"));
    }

    #[test]
    fn test_process_text_no_hyphen() {
        let input = "Normal text\nwith no hyphens\nat line ends.";
        let output = HyphenationHandler::new().process_text(input);
        assert_eq!(output, input);
    }
}