/// Suffix appended by `truncate_chars` and `truncate_words` when they cut text short:
/// a single horizontal-ellipsis character.
const DEFAULT_TRUNCATION: &str = "…";
/// Turns `value` into a lowercase, hyphen-separated slug.
///
/// * With `allow_unicode == false` the input is first passed through `fold_to_ascii`, and only
///   ASCII letters, digits, `_`, `-` and ASCII whitespace survive.
/// * With `allow_unicode == true` any Unicode alphanumeric character, `_`, `-` and any Unicode
///   whitespace survive.
///
/// Every other character is removed. Each run of whitespace and/or hyphens then becomes a
/// single `-`, and leading/trailing `-` and `_` are stripped from the result.
pub fn slugify(value: &str, allow_unicode: bool) -> String {
    let lowered = if allow_unicode {
        value.to_lowercase()
    } else {
        fold_to_ascii(value).to_lowercase()
    };

    let is_space = |c: char| {
        if allow_unicode {
            c.is_whitespace()
        } else {
            c.is_ascii_whitespace()
        }
    };
    let is_word_char = |c: char| {
        c == '_'
            || if allow_unicode {
                c.is_alphanumeric()
            } else {
                c.is_ascii_alphanumeric()
            }
    };

    let mut slug = String::with_capacity(lowered.len());
    let mut in_separator_run = false;
    for c in lowered.chars() {
        if c == '-' || is_space(c) {
            // Only the first separator of a run is emitted.
            if !in_separator_run {
                slug.push('-');
            }
            in_separator_run = true;
        } else if is_word_char(c) {
            slug.push(c);
            in_separator_run = false;
        }
        // Anything else is dropped; it does not interrupt a separator run, so
        // "a - & - b" still collapses to "a-b".
    }

    slug.trim_matches(|c: char| c == '-' || c == '_').to_string()
}
/// Shortens `value` to at most `length` visible characters, ellipsis included.
///
/// The text is first run through `normalize_basic_nfc`. Combining marks are not counted as
/// visible characters. If the normalized text already fits it is returned as is; otherwise it
/// is cut and `DEFAULT_TRUNCATION` is appended. When `length` leaves no room for any text next
/// to the ellipsis, only the ellipsis is returned. A `length` of zero yields an empty string.
pub fn truncate_chars(value: &str, length: usize) -> String {
    if length == 0 {
        return String::new();
    }

    let text = normalize_basic_nfc(value);
    if visible_char_count(&text) <= length {
        return text;
    }

    let ellipsis_width = visible_char_count(DEFAULT_TRUNCATION);
    match length.checked_sub(ellipsis_width) {
        // At least one visible character fits in front of the ellipsis.
        Some(budget) if budget > 0 => {
            let cut = truncate_visible_boundary(&text, budget);
            let mut shortened = String::with_capacity(cut + DEFAULT_TRUNCATION.len());
            shortened.push_str(&text[..cut]);
            shortened.push_str(DEFAULT_TRUNCATION);
            shortened
        }
        _ => DEFAULT_TRUNCATION.to_string(),
    }
}
/// Keeps at most `num_words` whitespace-separated words of `value`.
///
/// Words are re-joined with single spaces, so runs of whitespace are collapsed and
/// leading/trailing whitespace disappears even when nothing is cut. `DEFAULT_TRUNCATION` is
/// appended only when words were actually dropped. `num_words == 0` yields an empty string.
pub fn truncate_words(value: &str, num_words: usize) -> String {
    if num_words == 0 {
        return String::new();
    }

    let mut words = value.split_whitespace();
    let mut kept = String::new();
    for (position, word) in words.by_ref().take(num_words).enumerate() {
        if position > 0 {
            kept.push(' ');
        }
        kept.push_str(word);
    }

    // Anything still left in the iterator is a word that did not make the cut.
    if words.next().is_some() {
        kept.push_str(DEFAULT_TRUNCATION);
    }
    kept
}
/// Word-wraps `text` to `width` characters, treating each `\n`-separated line independently.
///
/// Existing line breaks are preserved; every input line is handed to `wrap_line` and the
/// resulting pieces are joined with `\n`. A `width` of zero disables wrapping and returns the
/// text unchanged.
pub fn wrap(text: &str, width: usize) -> String {
    if width == 0 {
        return text.to_string();
    }

    text.split('\n')
        .flat_map(|line| wrap_line(line, width))
        .collect::<Vec<_>>()
        .join("\n")
}
/// Replaces letters in `phone` with the digits of a telephone keypad.
///
/// The input is lowercased character by character; `a`–`z` map to `2`–`9` and every other
/// character (digits, spaces, punctuation, non-ASCII letters) is passed through in its
/// lowercased form.
pub fn phone2numeric(phone: &str) -> String {
    let mut dialable = String::with_capacity(phone.len());
    for lowered in phone.chars().flat_map(char::to_lowercase) {
        let key = match lowered {
            'a'..='c' => '2',
            'd'..='f' => '3',
            'g'..='i' => '4',
            'j'..='l' => '5',
            'm'..='o' => '6',
            'p'..='s' => '7',
            't'..='v' => '8',
            'w'..='z' => '9',
            _ => lowered,
        };
        dialable.push(key);
    }
    dialable
}
/// Splits a camel-case identifier into lowercase, space-separated words.
///
/// Surrounding whitespace is trimmed first. A space is inserted before an uppercase letter
/// when it follows a lowercase letter (`camelCase` -> `camel case`), or when it follows an
/// uppercase letter and is itself followed by a lowercase one, which marks the end of an
/// acronym (`HTMLParser` -> `html parser`). The whole result is lowercased.
pub fn camel_case_to_spaces(value: &str) -> String {
    let source = value.trim();
    let mut spaced = String::with_capacity(source.len());
    let mut previous: Option<char> = None;
    let mut rest = source.chars().peekable();

    // `while let` rather than `for`: the body needs to peek at the following character.
    while let Some(current) = rest.next() {
        if let Some(before) = previous {
            if current.is_uppercase() {
                let ends_acronym =
                    before.is_uppercase() && rest.peek().is_some_and(|next| next.is_lowercase());
                if before.is_lowercase() || ends_acronym {
                    spaced.push(' ');
                }
            }
        }
        spaced.extend(current.to_lowercase());
        previous = Some(current);
    }

    spaced
}
/// Converts every line ending in `text` to a single `\n`.
///
/// `\r\n` pairs and lone `\r` characters each become one `\n`; existing `\n` characters are
/// left alone.
pub fn normalize_newlines(text: &str) -> String {
    let mut unified = String::with_capacity(text.len());
    let mut after_cr = false;
    for c in text.chars() {
        match c {
            '\r' => {
                unified.push('\n');
                after_cr = true;
            }
            // The line feed of a CRLF pair: its newline was already written for the CR.
            '\n' if after_cr => after_cr = false,
            other => {
                unified.push(other);
                after_cr = false;
            }
        }
    }
    unified
}
/// Joins `items` into a human-readable list such as `a, b and c`.
///
/// All items but the last are separated by `", "`; the last one is attached with
/// `last_word` surrounded by single spaces. A single item is returned unchanged and an empty
/// slice yields an empty string.
pub fn get_text_list(items: &[&str], last_word: &str) -> String {
    let Some((last, leading)) = items.split_last() else {
        return String::new();
    };
    if leading.is_empty() {
        return (*last).to_string();
    }
    format!("{} {last_word} {last}", leading.join(", "))
}
/// Wraps one line of text into pieces of at most `width` characters where possible.
///
/// Intended for text without embedded `\n`; `wrap` splits on `\n` before calling this.
/// Breaks happen only between whitespace runs and words, as produced by `split_segments`.
/// A word longer than `width` is never split; it is emitted on a line of its own.
/// Whitespace at the very start of the line is kept at the start of the first piece;
/// whitespace at a break point and at the end of the line is dropped. Lengths are counted
/// in `char`s, not bytes.
///
/// An empty `line` yields a single empty string so that blank lines survive wrapping.
fn wrap_line(line: &str, width: usize) -> Vec<String> {
if line.is_empty() {
return vec![String::new()];
}
let segments = split_segments(line);
// Finished output lines.
let mut lines = Vec::new();
// The line currently being filled.
let mut current = String::new();
// Whitespace seen after `current`; only committed if the next word still fits.
let mut pending_whitespace = String::new();
for (is_whitespace, segment) in segments {
if is_whitespace {
if current.is_empty() {
// Whitespace with nothing before it on this output line: keep it only when it is
// the leading whitespace of the whole input line, otherwise drop it.
if lines.is_empty() {
current.push_str(&segment);
}
} else {
pending_whitespace.push_str(&segment);
}
continue;
}
// From here on `segment` is a word.
if current.is_empty() {
// An over-long word goes straight to its own line; `current` stays empty.
if segment.chars().count() > width {
lines.push(segment);
} else {
current.push_str(&segment);
}
continue;
}
let candidate_len =
current.chars().count() + pending_whitespace.chars().count() + segment.chars().count();
if candidate_len <= width {
current.push_str(&pending_whitespace);
current.push_str(&segment);
pending_whitespace.clear();
} else {
// The word does not fit: close the current line and discard the whitespace at the break.
lines.push(current);
current = if segment.chars().count() > width {
lines.push(segment);
String::new()
} else {
segment
};
pending_whitespace.clear();
}
}
// Flush the last partially filled line; any `pending_whitespace` left over is trailing
// whitespace and is intentionally not written.
if !current.is_empty() || line.chars().all(char::is_whitespace) {
lines.push(current);
}
// Fallback so the function never returns an empty vector.
if lines.is_empty() {
vec![line.to_string()]
} else {
lines
}
}
/// Splits `line` into maximal runs of whitespace and non-whitespace characters.
///
/// Each entry is `(is_whitespace, run)`; runs appear in input order, adjacent runs always
/// differ in kind, and concatenating them reproduces `line`. An empty input yields an empty
/// vector.
fn split_segments(line: &str) -> Vec<(bool, String)> {
    let mut runs: Vec<(bool, String)> = Vec::new();
    for c in line.chars() {
        let kind = c.is_whitespace();
        match runs.last_mut() {
            // Same kind as the run being built: grow it.
            Some((run_kind, run)) if *run_kind == kind => run.push(c),
            // First character, or the kind flipped: start a new run.
            _ => runs.push((kind, String::from(c))),
        }
    }
    runs
}
/// Returns the byte offset at which `text` must be cut to keep `keep_visible` visible
/// characters.
///
/// Combining marks (see `is_combining_mark`) are not counted, and marks that follow the last
/// kept character stay attached to it. The offset is the start of the first visible character
/// that does not fit, or `text.len()` when the text has no more than `keep_visible` visible
/// characters. It always lies on a `char` boundary.
fn truncate_visible_boundary(text: &str, keep_visible: usize) -> usize {
    text.char_indices()
        .filter(|&(_, c)| !is_combining_mark(c))
        // Zero-based: the `keep_visible`-th visible character is the first one to drop.
        .nth(keep_visible)
        .map_or(text.len(), |(offset, _)| offset)
}
/// Counts the characters of `text` that are not combining marks (see `is_combining_mark`).
fn visible_char_count(text: &str) -> usize {
    text.chars()
        .map(|c| usize::from(!is_combining_mark(c)))
        .sum()
}
/// Reports whether `ch` falls in one of the code-point ranges this module treats as
/// combining marks.
///
/// The ranges correspond to the Unicode blocks Combining Diacritical Marks (U+0300–U+036F),
/// Combining Diacritical Marks Extended (U+1AB0–U+1AFF), Combining Diacritical Marks
/// Supplement (U+1DC0–U+1DFF), Combining Diacritical Marks for Symbols (U+20D0–U+20FF) and
/// Combining Half Marks (U+FE20–U+FE2F). Combining characters outside these ranges are not
/// recognised.
fn is_combining_mark(ch: char) -> bool {
    const MARK_RANGES: [(char, char); 5] = [
        ('\u{0300}', '\u{036F}'),
        ('\u{1AB0}', '\u{1AFF}'),
        ('\u{1DC0}', '\u{1DFF}'),
        ('\u{20D0}', '\u{20FF}'),
        ('\u{FE20}', '\u{FE2F}'),
    ];
    MARK_RANGES
        .iter()
        .any(|&(first, last)| (first..=last).contains(&ch))
}
/// Composes base-letter + combining-mark pairs into precomposed characters where
/// `compose_pair` knows the pair; everything else is passed through unchanged.
///
/// This is a deliberately small approximation of Unicode NFC, not a full implementation:
///
/// * A mark is only composed when it directly follows its base character (or the result of an
///   earlier composition). If another mark that could not be composed sits in between, the
///   whole sequence is left decomposed. Composing across it would require canonical
///   combining-class data to know whether the marks may be reordered.
/// * Marks that appear before any base character are copied through as they are.
///
/// No character of the input is ever dropped: the output is either identical to the input or
/// has some base+mark pairs replaced by their precomposed equivalent.
fn normalize_basic_nfc(value: &str) -> String {
    let mut normalized = String::with_capacity(value.len());
    // Base character of the cluster being assembled (possibly already composed).
    let mut base: Option<char> = None;
    // Marks following `base` that could not be folded into it, in input order.
    let mut pending_marks = String::new();

    for ch in value.chars() {
        if is_combining_mark(ch) {
            match base {
                // Only compose while the mark is adjacent to the base. Once an uncomposable
                // mark is pending, composing a later one would either lose the pending mark
                // or reorder it, so the rest of the cluster is kept verbatim.
                Some(current) if pending_marks.is_empty() => match compose_pair(current, ch) {
                    Some(composed) => base = Some(composed),
                    None => pending_marks.push(ch),
                },
                Some(_) => pending_marks.push(ch),
                // Stray mark before any base character.
                None => normalized.push(ch),
            }
        } else {
            // A new base character closes the previous cluster.
            if let Some(finished) = base.replace(ch) {
                normalized.push(finished);
            }
            normalized.push_str(&pending_marks);
            pending_marks.clear();
        }
    }

    if let Some(finished) = base {
        normalized.push(finished);
    }
    normalized.push_str(&pending_marks);
    normalized
}
/// Looks up the precomposed character for a base letter followed by one combining mark.
///
/// The table covers accented Latin letters from the Latin-1 Supplement range only:
/// grave (U+0300), acute (U+0301), circumflex (U+0302), tilde (U+0303), diaeresis (U+0308),
/// ring above (U+030A) and cedilla (U+0327) on the letters listed below. Returns `None` for
/// every pair that is not listed.
fn compose_pair(base: char, mark: char) -> Option<char> {
Some(match (base, mark) {
// Uppercase letters.
('A', '\u{0300}') => 'À',
('A', '\u{0301}') => 'Á',
('A', '\u{0302}') => 'Â',
('A', '\u{0303}') => 'Ã',
('A', '\u{0308}') => 'Ä',
('A', '\u{030A}') => 'Å',
('C', '\u{0327}') => 'Ç',
('E', '\u{0300}') => 'È',
('E', '\u{0301}') => 'É',
('E', '\u{0302}') => 'Ê',
('E', '\u{0308}') => 'Ë',
('I', '\u{0300}') => 'Ì',
('I', '\u{0301}') => 'Í',
('I', '\u{0302}') => 'Î',
('I', '\u{0308}') => 'Ï',
('N', '\u{0303}') => 'Ñ',
('O', '\u{0300}') => 'Ò',
('O', '\u{0301}') => 'Ó',
('O', '\u{0302}') => 'Ô',
('O', '\u{0303}') => 'Õ',
('O', '\u{0308}') => 'Ö',
('U', '\u{0300}') => 'Ù',
('U', '\u{0301}') => 'Ú',
('U', '\u{0302}') => 'Û',
('U', '\u{0308}') => 'Ü',
('Y', '\u{0301}') => 'Ý',
// Lowercase letters.
('a', '\u{0300}') => 'à',
('a', '\u{0301}') => 'á',
('a', '\u{0302}') => 'â',
('a', '\u{0303}') => 'ã',
('a', '\u{0308}') => 'ä',
('a', '\u{030A}') => 'å',
('c', '\u{0327}') => 'ç',
('e', '\u{0300}') => 'è',
('e', '\u{0301}') => 'é',
('e', '\u{0302}') => 'ê',
('e', '\u{0308}') => 'ë',
('i', '\u{0300}') => 'ì',
('i', '\u{0301}') => 'í',
('i', '\u{0302}') => 'î',
('i', '\u{0308}') => 'ï',
('n', '\u{0303}') => 'ñ',
('o', '\u{0300}') => 'ò',
('o', '\u{0301}') => 'ó',
('o', '\u{0302}') => 'ô',
('o', '\u{0303}') => 'õ',
('o', '\u{0308}') => 'ö',
('u', '\u{0300}') => 'ù',
('u', '\u{0301}') => 'ú',
('u', '\u{0302}') => 'û',
('u', '\u{0308}') => 'ü',
('y', '\u{0301}') => 'ý',
('y', '\u{0308}') => 'ÿ',
// Anything else has no entry in this table.
_ => return None,
})
}
/// Produces an ASCII-only approximation of `value`.
///
/// * ASCII characters are copied unchanged (their case is preserved).
/// * Combining marks (see `is_combining_mark`) are removed.
/// * Accented and special Latin letters listed in the table below are replaced by their
///   closest ASCII spelling. The replacement is always lowercase, whatever the case of the
///   original, and may be two letters long (`Æ` -> `ae`, `Þ` -> `th`).
/// * Every other non-ASCII character is dropped.
fn fold_to_ascii(value: &str) -> String {
    let mut folded = String::with_capacity(value.len());
    for ch in value.chars() {
        if ch.is_ascii() {
            folded.push(ch);
            continue;
        }
        if is_combining_mark(ch) {
            continue;
        }
        let replacement = match ch {
            'À' | 'Á' | 'Â' | 'Ã' | 'Ä' | 'Å' | 'Ā' | 'Ă' | 'Ą' | 'Ǎ' | 'à' | 'á' | 'â' | 'ã'
            | 'ä' | 'å' | 'ā' | 'ă' | 'ą' | 'ǎ' => "a",
            'Æ' | 'Ǽ' | 'æ' | 'ǽ' => "ae",
            'Ç' | 'Ć' | 'Ĉ' | 'Ċ' | 'Č' | 'ç' | 'ć' | 'ĉ' | 'ċ' | 'č' => "c",
            'Ð' | 'Ď' | 'Đ' | 'ð' | 'ď' | 'đ' => "d",
            'È' | 'É' | 'Ê' | 'Ë' | 'Ē' | 'Ĕ' | 'Ė' | 'Ę' | 'Ě' | 'è' | 'é' | 'ê' | 'ë' | 'ē'
            | 'ĕ' | 'ė' | 'ę' | 'ě' => "e",
            'Ĝ' | 'Ğ' | 'Ġ' | 'Ģ' | 'ĝ' | 'ğ' | 'ġ' | 'ģ' => "g",
            'Ĥ' | 'Ħ' | 'ĥ' | 'ħ' => "h",
            // Includes the caron forms Ǐ/ǐ, matching the Ǎ/Ǒ entries for `a` and `o`.
            'Ì' | 'Í' | 'Î' | 'Ï' | 'Ĩ' | 'Ī' | 'Ĭ' | 'Į' | 'İ' | 'Ǐ' | 'ì' | 'í' | 'î' | 'ï'
            | 'ĩ' | 'ī' | 'ĭ' | 'į' | 'ı' | 'ǐ' => "i",
            'Ĵ' | 'ĵ' => "j",
            'Ķ' | 'ķ' | 'ĸ' => "k",
            'Ĺ' | 'Ļ' | 'Ľ' | 'Ŀ' | 'Ł' | 'ĺ' | 'ļ' | 'ľ' | 'ŀ' | 'ł' => "l",
            'Ñ' | 'Ń' | 'Ņ' | 'Ň' | 'ñ' | 'ń' | 'ņ' | 'ň' => "n",
            'Ò' | 'Ó' | 'Ô' | 'Õ' | 'Ö' | 'Ø' | 'Ō' | 'Ŏ' | 'Ő' | 'Ǒ' | 'ò' | 'ó' | 'ô' | 'õ'
            | 'ö' | 'ø' | 'ō' | 'ŏ' | 'ő' | 'ǒ' => "o",
            'Œ' | 'œ' => "oe",
            'Ŕ' | 'Ŗ' | 'Ř' | 'ŕ' | 'ŗ' | 'ř' => "r",
            'Ś' | 'Ŝ' | 'Ş' | 'Š' | 'ś' | 'ŝ' | 'ş' | 'š' | 'ß' => "s",
            'Ţ' | 'Ť' | 'Ŧ' | 'ţ' | 'ť' | 'ŧ' => "t",
            // Includes the caron forms Ǔ/ǔ, matching the Ǎ/Ǒ entries for `a` and `o`.
            'Ù' | 'Ú' | 'Û' | 'Ü' | 'Ũ' | 'Ū' | 'Ŭ' | 'Ů' | 'Ű' | 'Ų' | 'Ǔ' | 'ù' | 'ú' | 'û'
            | 'ü' | 'ũ' | 'ū' | 'ŭ' | 'ů' | 'ű' | 'ų' | 'ǔ' => "u",
            // W with circumflex, the counterpart of the Ŷ/ŷ entry below.
            'Ŵ' | 'ŵ' => "w",
            'Ý' | 'Ÿ' | 'Ŷ' | 'ý' | 'ÿ' | 'ŷ' => "y",
            'Ź' | 'Ż' | 'Ž' | 'ź' | 'ż' | 'ž' => "z",
            'Þ' | 'þ' => "th",
            // No ASCII approximation known: the character is dropped.
            _ => "",
        };
        folded.push_str(replacement);
    }
    folded
}
// Unit tests for the public text helpers in this file. Each test pins the exact output for
// a handful of representative inputs.
#[cfg(test)]
mod tests {
// Re-imported here so that the nested module below can reach the helpers via `super::`.
use super::{
camel_case_to_spaces, get_text_list, normalize_newlines, phone2numeric, slugify,
truncate_chars, truncate_words, wrap,
};
mod test_utils_text {
use super::{
camel_case_to_spaces, get_text_list, normalize_newlines, phone2numeric, slugify,
truncate_chars, truncate_words, wrap,
};
// --- get_text_list ---
#[test]
fn test_get_text_list_empty() {
assert_eq!(get_text_list(&[], "and"), "");
}
#[test]
fn test_get_text_list_single_item() {
assert_eq!(get_text_list(&["a"], "and"), "a");
}
#[test]
fn test_get_text_list_two_items() {
assert_eq!(get_text_list(&["a", "b"], "and"), "a and b");
}
#[test]
fn test_get_text_list_three_items_with_and() {
assert_eq!(get_text_list(&["a", "b", "c"], "and"), "a, b and c");
}
#[test]
fn test_get_text_list_four_items_defaults_to_or_style_joining() {
assert_eq!(get_text_list(&["a", "b", "c", "d"], "or"), "a, b, c or d");
}
// --- truncate_chars ---
#[test]
fn test_truncate_chars_returns_original_when_short_enough() {
assert_eq!(
truncate_chars("The quick brown fox jumped over the lazy dog.", 100),
"The quick brown fox jumped over the lazy dog."
);
}
#[test]
fn test_truncate_chars_basic() {
assert_eq!(
truncate_chars("The quick brown fox jumped over the lazy dog.", 21),
"The quick brown fox …"
);
}
#[test]
fn test_truncate_chars_uses_unicode_ellipsis_when_length_is_tiny() {
assert_eq!(truncate_chars("asdf", 1), "…");
}
#[test]
fn test_truncate_chars_handles_precomposed_combining_text() {
assert_eq!(truncate_chars("oüoüoüoü", 8), "oüoüoüoü");
assert_eq!(truncate_chars("oüoüoüoü", 3), "oü…");
}
// Decomposed input (u + U+0308) is expected to come back in composed form.
#[test]
fn test_truncate_chars_normalizes_decomposed_umlaut_text() {
assert_eq!(
truncate_chars("ou\u{0308}ou\u{0308}ou\u{0308}ou\u{0308}", 8),
"oüoüoüoü"
);
assert_eq!(
truncate_chars("ou\u{0308}ou\u{0308}ou\u{0308}ou\u{0308}", 3),
"oü…"
);
}
// B + ring above has no precomposed form here, so the mark must stay attached to its base
// and must not count towards the length.
#[test]
fn test_truncate_chars_preserves_non_precomposed_combining_sequences() {
assert_eq!(truncate_chars("-B\u{030A}B\u{030A}----8", 3), "-B\u{030A}…");
assert_eq!(
truncate_chars("-B\u{030A}B\u{030A}----8", 5),
"-B\u{030A}B\u{030A}-…"
);
assert_eq!(
truncate_chars("-B\u{030A}B\u{030A}----8", 8),
"-B\u{030A}B\u{030A}----8"
);
}
// --- truncate_words ---
#[test]
fn test_truncate_words_returns_original_when_short_enough() {
assert_eq!(
truncate_words("The quick brown fox jumped over the lazy dog.", 10),
"The quick brown fox jumped over the lazy dog."
);
}
#[test]
fn test_truncate_words_basic() {
assert_eq!(
truncate_words("The quick brown fox jumped over the lazy dog.", 4),
"The quick brown fox…"
);
}
#[test]
fn test_truncate_words_zero_returns_empty_string() {
assert_eq!(
truncate_words("The quick brown fox jumped over the lazy dog.", 0),
""
);
}
// --- wrap ---
#[test]
fn test_wrap_leaves_short_text_unchanged() {
assert_eq!(wrap("1234 67 9", 100), "1234 67 9");
assert_eq!(wrap("1234 67 9", 9), "1234 67 9");
}
#[test]
fn test_wrap_breaks_on_word_boundaries() {
assert_eq!(wrap("1234 67 9", 8), "1234 67\n9");
}
#[test]
fn test_wrap_preserves_existing_line_breaks() {
assert_eq!(wrap("short\na long line", 7), "short\na long\nline");
}
// Words longer than the width are kept whole on their own line.
#[test]
fn test_wrap_does_not_break_long_words() {
assert_eq!(
wrap("do-not-break-long-words please? ok", 8),
"do-not-break-long-words\nplease?\nok"
);
let long_word = format!("l{}ng", "o".repeat(20));
assert_eq!(wrap(&long_word, 20), long_word);
assert_eq!(
wrap(&format!("a {long_word} word"), 10),
format!("a\n{long_word}\nword")
);
}
// --- normalize_newlines ---
#[test]
fn test_normalize_newlines() {
assert_eq!(normalize_newlines("abc\ndef\rghi\r\n"), "abc\ndef\nghi\n");
assert_eq!(normalize_newlines("\n\r\r\n\r"), "\n\n\n\n");
assert_eq!(normalize_newlines("abcdefghi"), "abcdefghi");
assert_eq!(normalize_newlines(""), "");
}
// --- phone2numeric ---
#[test]
fn test_phone2numeric() {
assert_eq!(phone2numeric("0800 flowers"), "0800 3569377");
}
// --- slugify ---
// Table of (input, expected slug, allow_unicode) for the ASCII-only mode.
#[test]
fn test_slugify_ascii() {
let items = [
("Hello, World!", "hello-world", false),
("spam & eggs", "spam-eggs", false),
(
" multiple---dash and space ",
"multiple-dash-and-space",
false,
),
("\t whitespace-in-value \n", "whitespace-in-value", false),
("underscore_in-value", "underscore_in-value", false),
(
"__strip__underscore-value___",
"strip__underscore-value",
false,
),
("--strip-dash-value---", "strip-dash-value", false),
("__strip-mixed-value---", "strip-mixed-value", false),
("_ -strip-mixed-value _-", "strip-mixed-value", false),
];
for (value, output, allow_unicode) in items {
assert_eq!(slugify(value, allow_unicode), output);
}
}
// Table of (input, expected slug) with `allow_unicode` enabled.
#[test]
fn test_slugify_unicode() {
let items = [
("spam & ıçüş", "spam-ıçüş"),
("foo ıç bar", "foo-ıç-bar"),
(" foo ıç bar", "foo-ıç-bar"),
("你好", "你好"),
("İstanbul", "istanbul"),
];
for (value, output) in items {
assert_eq!(slugify(value, true), output);
}
}
// --- camel_case_to_spaces ---
#[test]
fn test_camel_case_to_spaces() {
assert_eq!(camel_case_to_spaces("CamelCaseValue"), "camel case value");
assert_eq!(camel_case_to_spaces("HTMLParser"), "html parser");
assert_eq!(camel_case_to_spaces(" lowerCamelCase "), "lower camel case");
}
}
}