// rjango 0.1.1 - Docs.rs

/// The default suffix appended when text is truncated.
///
/// A single horizontal ellipsis (U+2026). `truncate_chars` counts it toward
/// the requested length; `truncate_words` simply appends it.
const DEFAULT_TRUNCATION: &str = "…";

/// Convert a string into a URL-friendly slug.
///
/// When `allow_unicode` is `false`, non-ASCII letters are folded to ASCII when
/// possible and otherwise removed. When `allow_unicode` is `true`, Unicode
/// letters and digits are preserved.
pub fn slugify(value: &str, allow_unicode: bool) -> String {
    let normalized = if allow_unicode {
        value.to_lowercase()
    } else {
        fold_to_ascii(value).to_lowercase()
    };

    let mut filtered = String::with_capacity(normalized.len());
    for ch in normalized.chars() {
        let keep = if allow_unicode {
            ch.is_alphanumeric() || ch == '_' || ch == '-' || ch.is_whitespace()
        } else {
            ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' || ch.is_ascii_whitespace()
        };

        if keep {
            filtered.push(ch);
        }
    }

    let mut collapsed = String::with_capacity(filtered.len());
    let mut last_was_separator = false;
    for ch in filtered.chars() {
        if ch.is_whitespace() || ch == '-' {
            if !last_was_separator {
                collapsed.push('-');
                last_was_separator = true;
            }
        } else {
            collapsed.push(ch);
            last_was_separator = false;
        }
    }

    collapsed
        .trim_matches(|ch| ch == '-' || ch == '_')
        .to_string()
}

/// Shorten `value` so that it shows at most `length` visible characters,
/// ending the result with `…` whenever something had to be cut.
///
/// The ellipsis itself counts toward `length`. Combining marks belong to the
/// character before them and are not counted. A `length` of zero yields an
/// empty string; text that already fits is returned in its normalized form.
pub fn truncate_chars(value: &str, length: usize) -> String {
    if length == 0 {
        return String::new();
    }

    let text = normalize_basic_nfc(value);
    if visible_char_count(&text) <= length {
        return text;
    }

    // Room left for real content once the suffix has been accounted for.
    let budget = length.saturating_sub(visible_char_count(DEFAULT_TRUNCATION));
    if budget == 0 {
        return DEFAULT_TRUNCATION.to_string();
    }

    let cut = truncate_visible_boundary(&text, budget);
    let mut shortened = String::with_capacity(cut + DEFAULT_TRUNCATION.len());
    shortened.push_str(&text[..cut]);
    shortened.push_str(DEFAULT_TRUNCATION);
    shortened
}

/// Keep at most the first `num_words` whitespace-separated words of `value`,
/// adding `…` when any words were left out.
///
/// Words in the result are always joined by single spaces, so runs of
/// whitespace in the input are collapsed even when nothing is cut. Asking for
/// zero words yields an empty string.
pub fn truncate_words(value: &str, num_words: usize) -> String {
    if num_words == 0 {
        return String::new();
    }

    let mut words = value.split_whitespace();
    let mut shortened = String::new();
    for (position, word) in words.by_ref().take(num_words).enumerate() {
        if position > 0 {
            shortened.push(' ');
        }
        shortened.push_str(word);
    }

    // Anything still left in the iterator means the text was cut short.
    if words.next().is_some() {
        shortened.push_str(DEFAULT_TRUNCATION);
    }
    shortened
}

/// Re-flow `text` so that lines break at whitespace once they would exceed
/// `width` characters, keeping every line break already present.
///
/// Words longer than `width` are left intact, so an output line can be wider
/// than requested. A `width` of zero returns the text unchanged.
pub fn wrap(text: &str, width: usize) -> String {
    if width == 0 {
        return text.to_string();
    }

    // Each input line is wrapped on its own; the pieces are stitched back
    // together with the same separator the input was split on.
    text.split('\n')
        .flat_map(|line| wrap_line(line, width))
        .collect::<Vec<String>>()
        .join("\n")
}

/// Translate the letters of a phone number into the digits found on the same
/// telephone keypad key.
///
/// Every character is lowercased first; `a`–`z` are then replaced by their
/// keypad digit and all other characters are passed through (in lowercased
/// form).
pub fn phone2numeric(phone: &str) -> String {
    let mut digits = String::with_capacity(phone.len());

    for original in phone.chars() {
        // Lowercasing is done per character and may expand to several chars.
        for lowered in original.to_lowercase() {
            let mapped = match lowered {
                'a'..='c' => '2',
                'd'..='f' => '3',
                'g'..='i' => '4',
                'j'..='l' => '5',
                'm'..='o' => '6',
                'p'..='s' => '7',
                't'..='v' => '8',
                'w'..='z' => '9',
                untouched => untouched,
            };
            digits.push(mapped);
        }
    }

    digits
}

/// Split CamelCase text into lowercase, space-separated words.
///
/// Surrounding whitespace is trimmed. A space is inserted before an uppercase
/// letter when it follows a lowercase letter, or when it is the last capital
/// of an acronym that is followed by a lowercase letter (`HTMLParser` becomes
/// `html parser`).
pub fn camel_case_to_spaces(value: &str) -> String {
    let source = value.trim();
    let mut spaced = String::with_capacity(source.len());
    let mut previous: Option<char> = None;
    let mut upcoming = source.chars().peekable();

    while let Some(current) = upcoming.next() {
        if let Some(before) = previous {
            if current.is_uppercase() {
                let starts_word = before.is_lowercase();
                let closes_acronym = before.is_uppercase()
                    && upcoming.peek().is_some_and(|after| after.is_lowercase());
                if starts_word || closes_acronym {
                    spaced.push(' ');
                }
            }
        }

        spaced.extend(current.to_lowercase());
        previous = Some(current);
    }

    spaced
}

/// Rewrite every line ending in `text` as a Unix `\n`.
///
/// Both Windows (`\r\n`) and bare carriage-return (`\r`) endings are
/// converted; existing `\n` characters are left alone.
pub fn normalize_newlines(text: &str) -> String {
    let mut unified = String::with_capacity(text.len());
    let mut chars = text.chars().peekable();

    while let Some(ch) = chars.next() {
        match ch {
            '\r' => {
                // A CRLF pair is a single line ending: swallow its '\n'.
                let _ = chars.next_if_eq(&'\n');
                unified.push('\n');
            }
            other => unified.push(other),
        }
    }

    unified
}

/// Render `items` as a human-readable list: all but the last item are joined
/// with `", "`, and `last_word` (e.g. `"and"` or `"or"`) introduces the final
/// item.
///
/// An empty slice gives an empty string and a single item is returned as-is.
pub fn get_text_list(items: &[&str], last_word: &str) -> String {
    let Some((closing, leading)) = items.split_last() else {
        return String::new();
    };

    if leading.is_empty() {
        return (*closing).to_string();
    }

    format!("{} {} {}", leading.join(", "), last_word, closing)
}

fn wrap_line(line: &str, width: usize) -> Vec<String> {
    if line.is_empty() {
        return vec![String::new()];
    }

    let segments = split_segments(line);
    let mut lines = Vec::new();
    let mut current = String::new();
    let mut pending_whitespace = String::new();

    for (is_whitespace, segment) in segments {
        if is_whitespace {
            if current.is_empty() {
                if lines.is_empty() {
                    current.push_str(&segment);
                }
            } else {
                pending_whitespace.push_str(&segment);
            }
            continue;
        }

        if current.is_empty() {
            if segment.chars().count() > width {
                lines.push(segment);
            } else {
                current.push_str(&segment);
            }
            continue;
        }

        let candidate_len =
            current.chars().count() + pending_whitespace.chars().count() + segment.chars().count();
        if candidate_len <= width {
            current.push_str(&pending_whitespace);
            current.push_str(&segment);
            pending_whitespace.clear();
        } else {
            lines.push(current);
            current = if segment.chars().count() > width {
                lines.push(segment);
                String::new()
            } else {
                segment
            };
            pending_whitespace.clear();
        }
    }

    if !current.is_empty() || line.chars().all(char::is_whitespace) {
        lines.push(current);
    }

    if lines.is_empty() {
        vec![line.to_string()]
    } else {
        lines
    }
}

/// Break `line` into maximal runs of whitespace and non-whitespace.
///
/// Each entry pairs a flag (`true` for a whitespace run) with the run's text.
/// Runs appear in input order and alternate in kind; an empty input produces
/// an empty vector.
fn split_segments(line: &str) -> Vec<(bool, String)> {
    let mut runs: Vec<(bool, String)> = Vec::new();

    for ch in line.chars() {
        let blank = ch.is_whitespace();
        match runs.last_mut() {
            // Same kind as the run in progress: extend it.
            Some((kind, text)) if *kind == blank => text.push(ch),
            // First character, or the kind flipped: start a new run.
            _ => runs.push((blank, String::from(ch))),
        }
    }

    runs
}

/// Return the byte offset in `text` just past the first `keep_visible`
/// visible characters.
///
/// Combining marks are not counted and stay attached to what precedes them,
/// so marks following the last kept character are included in the prefix.
/// When `text` has no further visible character, the whole length is returned.
fn truncate_visible_boundary(text: &str, keep_visible: usize) -> usize {
    let mut remaining = keep_visible;

    for (offset, ch) in text.char_indices() {
        if is_combining_mark(ch) {
            continue;
        }
        if remaining == 0 {
            // This is the first visible character beyond the budget; the
            // prefix ends right before it.
            return offset;
        }
        remaining -= 1;
    }

    text.len()
}

/// Count the characters of `text` that are not combining marks.
fn visible_char_count(text: &str) -> usize {
    text.chars()
        .map(|ch| usize::from(!is_combining_mark(ch)))
        .sum()
}

/// Report whether `ch` lies in one of the Unicode combining-mark blocks this
/// module recognizes.
fn is_combining_mark(ch: char) -> bool {
    // Inclusive code point ranges of the recognized blocks.
    const COMBINING_BLOCKS: [(u32, u32); 5] = [
        (0x0300, 0x036F), // Combining Diacritical Marks
        (0x1AB0, 0x1AFF), // Combining Diacritical Marks Extended
        (0x1DC0, 0x1DFF), // Combining Diacritical Marks Supplement
        (0x20D0, 0x20FF), // Combining Diacritical Marks for Symbols
        (0xFE20, 0xFE2F), // Combining Half Marks
    ];

    let code = u32::from(ch);
    COMBINING_BLOCKS
        .iter()
        .any(|&(first, last)| (first..=last).contains(&code))
}

/// Apply a small, table-driven approximation of Unicode NFC composition.
///
/// A base character followed by combining marks is replaced by a precomposed
/// character whenever `compose_pair` knows one. Composition only happens while
/// each mark directly follows the (possibly already composed) base: once a
/// mark cannot be composed it is emitted as-is and the base stops absorbing
/// further marks. This keeps the function lossless: no character of the input
/// is ever dropped, and marks never change their relative order.
///
/// Marks with no preceding base character are copied through unchanged.
fn normalize_basic_nfc(value: &str) -> String {
    let mut normalized = String::with_capacity(value.len());
    // Base character that may still absorb a directly following mark. It is
    // `None` before the first base and after an uncomposable mark was emitted.
    let mut pending_base: Option<char> = None;

    for ch in value.chars() {
        if !is_combining_mark(ch) {
            // A new base starts a new cluster; flush the previous one.
            if let Some(base) = pending_base.replace(ch) {
                normalized.push(base);
            }
            continue;
        }

        match pending_base.and_then(|base| compose_pair(base, ch)) {
            Some(composed) => pending_base = Some(composed),
            None => {
                // Emit the base (if still held back) followed by the mark.
                // Later marks of this cluster are appended verbatim, because
                // composing across an intervening mark could drop or reorder it.
                if let Some(base) = pending_base.take() {
                    normalized.push(base);
                }
                normalized.push(ch);
            }
        }
    }

    if let Some(base) = pending_base {
        normalized.push(base);
    }
    normalized
}

/// Look up the precomposed character for `base` followed by the combining
/// `mark`, or return `None` when the table has no entry for the pair.
///
/// The table is organized by mark; each inner match lists the base letters
/// that have a precomposed form with that mark.
fn compose_pair(base: char, mark: char) -> Option<char> {
    let composed = match mark {
        // U+0300 combining grave accent
        '\u{0300}' => match base {
            'A' => 'À',
            'E' => 'È',
            'I' => 'Ì',
            'O' => 'Ò',
            'U' => 'Ù',
            'a' => 'à',
            'e' => 'è',
            'i' => 'ì',
            'o' => 'ò',
            'u' => 'ù',
            _ => return None,
        },
        // U+0301 combining acute accent
        '\u{0301}' => match base {
            'A' => 'Á',
            'E' => 'É',
            'I' => 'Í',
            'O' => 'Ó',
            'U' => 'Ú',
            'Y' => 'Ý',
            'a' => 'á',
            'e' => 'é',
            'i' => 'í',
            'o' => 'ó',
            'u' => 'ú',
            'y' => 'ý',
            _ => return None,
        },
        // U+0302 combining circumflex accent
        '\u{0302}' => match base {
            'A' => 'Â',
            'E' => 'Ê',
            'I' => 'Î',
            'O' => 'Ô',
            'U' => 'Û',
            'a' => 'â',
            'e' => 'ê',
            'i' => 'î',
            'o' => 'ô',
            'u' => 'û',
            _ => return None,
        },
        // U+0303 combining tilde
        '\u{0303}' => match base {
            'A' => 'Ã',
            'N' => 'Ñ',
            'O' => 'Õ',
            'a' => 'ã',
            'n' => 'ñ',
            'o' => 'õ',
            _ => return None,
        },
        // U+0308 combining diaeresis
        '\u{0308}' => match base {
            'A' => 'Ä',
            'E' => 'Ë',
            'I' => 'Ï',
            'O' => 'Ö',
            'U' => 'Ü',
            'a' => 'ä',
            'e' => 'ë',
            'i' => 'ï',
            'o' => 'ö',
            'u' => 'ü',
            'y' => 'ÿ',
            _ => return None,
        },
        // U+030A combining ring above
        '\u{030A}' => match base {
            'A' => 'Å',
            'a' => 'å',
            _ => return None,
        },
        // U+0327 combining cedilla
        '\u{0327}' => match base {
            'C' => 'Ç',
            'c' => 'ç',
            _ => return None,
        },
        _ => return None,
    };

    Some(composed)
}

/// Reduce `value` to ASCII by replacing accented and special Latin letters
/// with a plain ASCII approximation.
///
/// * ASCII characters are copied through unchanged (case included).
/// * Combining marks are dropped, so decomposed input such as `e` + U+0301
///   folds to `e`.
/// * Letters listed in the table below are replaced by a *lowercase* ASCII
///   letter or digraph (`Æ` becomes `ae`, `Þ` becomes `th`), regardless of the
///   case of the original.
/// * Every other non-ASCII character is removed.
fn fold_to_ascii(value: &str) -> String {
    let mut folded = String::with_capacity(value.len());
    for ch in value.chars() {
        if ch.is_ascii() {
            folded.push(ch);
            continue;
        }

        if is_combining_mark(ch) {
            continue;
        }

        match ch {
            'À' | 'Á' | 'Â' | 'Ã' | 'Ä' | 'Å' | 'Ā' | 'Ă' | 'Ą' | 'Ǎ' | 'à' | 'á' | 'â' | 'ã'
            | 'ä' | 'å' | 'ā' | 'ă' | 'ą' | 'ǎ' => folded.push('a'),
            'Æ' | 'Ǽ' | 'æ' | 'ǽ' => folded.push_str("ae"),
            'Ç' | 'Ć' | 'Ĉ' | 'Ċ' | 'Č' | 'ç' | 'ć' | 'ĉ' | 'ċ' | 'č' => folded.push('c'),
            'Ð' | 'Ď' | 'Đ' | 'ð' | 'ď' | 'đ' => folded.push('d'),
            'È' | 'É' | 'Ê' | 'Ë' | 'Ē' | 'Ĕ' | 'Ė' | 'Ę' | 'Ě' | 'è' | 'é' | 'ê' | 'ë' | 'ē'
            | 'ĕ' | 'ė' | 'ę' | 'ě' => folded.push('e'),
            'Ĝ' | 'Ğ' | 'Ġ' | 'Ģ' | 'ĝ' | 'ğ' | 'ġ' | 'ģ' => folded.push('g'),
            'Ĥ' | 'Ħ' | 'ĥ' | 'ħ' => folded.push('h'),
            'Ì' | 'Í' | 'Î' | 'Ï' | 'Ĩ' | 'Ī' | 'Ĭ' | 'Į' | 'İ' | 'ì' | 'í' | 'î' | 'ï' | 'ĩ'
            | 'ī' | 'ĭ' | 'į' | 'ı' => folded.push('i'),
            // Latin ligature IJ (previously dropped).
            'Ĳ' | 'ĳ' => folded.push_str("ij"),
            'Ĵ' | 'ĵ' => folded.push('j'),
            'Ķ' | 'ķ' | 'ĸ' => folded.push('k'),
            'Ĺ' | 'Ļ' | 'Ľ' | 'Ŀ' | 'Ł' | 'ĺ' | 'ļ' | 'ľ' | 'ŀ' | 'ł' => folded.push('l'),
            // 'ŉ' (n preceded by apostrophe) was previously dropped.
            'Ñ' | 'Ń' | 'Ņ' | 'Ň' | 'ñ' | 'ń' | 'ņ' | 'ň' | 'ŉ' => folded.push('n'),
            'Ò' | 'Ó' | 'Ô' | 'Õ' | 'Ö' | 'Ø' | 'Ō' | 'Ŏ' | 'Ő' | 'Ǒ' | 'ò' | 'ó' | 'ô' | 'õ'
            | 'ö' | 'ø' | 'ō' | 'ŏ' | 'ő' | 'ǒ' => folded.push('o'),
            'Œ' | 'œ' => folded.push_str("oe"),
            'Ŕ' | 'Ŗ' | 'Ř' | 'ŕ' | 'ŗ' | 'ř' => folded.push('r'),
            // 'ſ' (long s) was previously dropped.
            'Ś' | 'Ŝ' | 'Ş' | 'Š' | 'ś' | 'ŝ' | 'ş' | 'š' | 'ß' | 'ſ' => folded.push('s'),
            'Ţ' | 'Ť' | 'Ŧ' | 'ţ' | 'ť' | 'ŧ' => folded.push('t'),
            'Ù' | 'Ú' | 'Û' | 'Ü' | 'Ũ' | 'Ū' | 'Ŭ' | 'Ů' | 'Ű' | 'Ų' | 'ù' | 'ú' | 'û' | 'ü'
            | 'ũ' | 'ū' | 'ŭ' | 'ů' | 'ű' | 'ų' => folded.push('u'),
            // W with circumflex (used in Welsh) was previously dropped.
            'Ŵ' | 'ŵ' => folded.push('w'),
            'Ý' | 'Ÿ' | 'Ŷ' | 'ý' | 'ÿ' | 'ŷ' => folded.push('y'),
            'Ź' | 'Ż' | 'Ž' | 'ź' | 'ż' | 'ž' => folded.push('z'),
            'Þ' | 'þ' => folded.push_str("th"),
            // No ASCII approximation known: the character is removed.
            _ => {}
        }
    }
    folded
}

// Unit tests for the public text helpers defined in this file.
#[cfg(test)]
mod tests {
    // Imported here so that the nested module below can reach the functions
    // through its own `use super::{...}`.
    use super::{
        camel_case_to_spaces, get_text_list, normalize_newlines, phone2numeric, slugify,
        truncate_chars, truncate_words, wrap,
    };

    mod test_utils_text {
        use super::{
            camel_case_to_spaces, get_text_list, normalize_newlines, phone2numeric, slugify,
            truncate_chars, truncate_words, wrap,
        };

        // --- get_text_list ---

        #[test]
        fn test_get_text_list_empty() {
            assert_eq!(get_text_list(&[], "and"), "");
        }

        #[test]
        fn test_get_text_list_single_item() {
            assert_eq!(get_text_list(&["a"], "and"), "a");
        }

        #[test]
        fn test_get_text_list_two_items() {
            assert_eq!(get_text_list(&["a", "b"], "and"), "a and b");
        }

        #[test]
        fn test_get_text_list_three_items_with_and() {
            assert_eq!(get_text_list(&["a", "b", "c"], "and"), "a, b and c");
        }

        #[test]
        fn test_get_text_list_four_items_defaults_to_or_style_joining() {
            assert_eq!(get_text_list(&["a", "b", "c", "d"], "or"), "a, b, c or d");
        }

        // --- truncate_chars ---

        #[test]
        fn test_truncate_chars_returns_original_when_short_enough() {
            assert_eq!(
                truncate_chars("The quick brown fox jumped over the lazy dog.", 100),
                "The quick brown fox jumped over the lazy dog."
            );
        }

        #[test]
        fn test_truncate_chars_basic() {
            // 20 kept characters plus the ellipsis make up the 21 requested.
            assert_eq!(
                truncate_chars("The quick brown fox jumped over the lazy dog.", 21),
                "The quick brown fox …"
            );
        }

        #[test]
        fn test_truncate_chars_uses_unicode_ellipsis_when_length_is_tiny() {
            assert_eq!(truncate_chars("asdf", 1), "…");
        }

        #[test]
        fn test_truncate_chars_handles_precomposed_combining_text() {
            assert_eq!(truncate_chars("oüoüoüoü", 8), "oüoüoüoü");
            assert_eq!(truncate_chars("oüoüoüoü", 3), "oü…");
        }

        #[test]
        fn test_truncate_chars_normalizes_decomposed_umlaut_text() {
            // `u` + U+0308 is expected to come back as the precomposed `ü`.
            assert_eq!(
                truncate_chars("ou\u{0308}ou\u{0308}ou\u{0308}ou\u{0308}", 8),
                "oüoüoüoü"
            );
            assert_eq!(
                truncate_chars("ou\u{0308}ou\u{0308}ou\u{0308}ou\u{0308}", 3),
                "oü…"
            );
        }

        #[test]
        fn test_truncate_chars_preserves_non_precomposed_combining_sequences() {
            // `B` + U+030A has no entry in the composition table, so the mark
            // stays separate but still does not count as a visible character.
            assert_eq!(truncate_chars("-B\u{030A}B\u{030A}----8", 3), "-B\u{030A}…");
            assert_eq!(
                truncate_chars("-B\u{030A}B\u{030A}----8", 5),
                "-B\u{030A}B\u{030A}-…"
            );
            assert_eq!(
                truncate_chars("-B\u{030A}B\u{030A}----8", 8),
                "-B\u{030A}B\u{030A}----8"
            );
        }

        // --- truncate_words ---

        #[test]
        fn test_truncate_words_returns_original_when_short_enough() {
            assert_eq!(
                truncate_words("The quick brown fox jumped over the lazy dog.", 10),
                "The quick brown fox jumped over the lazy dog."
            );
        }

        #[test]
        fn test_truncate_words_basic() {
            assert_eq!(
                truncate_words("The quick brown fox jumped over the lazy dog.", 4),
                "The quick brown fox…"
            );
        }

        #[test]
        fn test_truncate_words_zero_returns_empty_string() {
            assert_eq!(
                truncate_words("The quick brown fox jumped over the lazy dog.", 0),
                ""
            );
        }

        // --- wrap ---

        #[test]
        fn test_wrap_leaves_short_text_unchanged() {
            assert_eq!(wrap("1234 67 9", 100), "1234 67 9");
            assert_eq!(wrap("1234 67 9", 9), "1234 67 9");
        }

        #[test]
        fn test_wrap_breaks_on_word_boundaries() {
            assert_eq!(wrap("1234 67 9", 8), "1234 67\n9");
        }

        #[test]
        fn test_wrap_preserves_existing_line_breaks() {
            assert_eq!(wrap("short\na long line", 7), "short\na long\nline");
        }

        #[test]
        fn test_wrap_does_not_break_long_words() {
            assert_eq!(
                wrap("do-not-break-long-words please? ok", 8),
                "do-not-break-long-words\nplease?\nok"
            );
            // "l" + 20 × "o" + "ng" is 23 characters, wider than either width used below.
            let long_word = format!("l{}ng", "o".repeat(20));
            assert_eq!(wrap(&long_word, 20), long_word);
            assert_eq!(
                wrap(&format!("a {long_word} word"), 10),
                format!("a\n{long_word}\nword")
            );
        }

        // --- normalize_newlines ---

        #[test]
        fn test_normalize_newlines() {
            assert_eq!(normalize_newlines("abc\ndef\rghi\r\n"), "abc\ndef\nghi\n");
            assert_eq!(normalize_newlines("\n\r\r\n\r"), "\n\n\n\n");
            assert_eq!(normalize_newlines("abcdefghi"), "abcdefghi");
            assert_eq!(normalize_newlines(""), "");
        }

        // --- phone2numeric ---

        #[test]
        fn test_phone2numeric() {
            assert_eq!(phone2numeric("0800 flowers"), "0800 3569377");
        }

        // --- slugify ---

        #[test]
        fn test_slugify_ascii() {
            // (input, expected slug, allow_unicode)
            let items = [
                ("Hello, World!", "hello-world", false),
                ("spam & eggs", "spam-eggs", false),
                (
                    " multiple---dash and  space ",
                    "multiple-dash-and-space",
                    false,
                ),
                ("\t whitespace-in-value \n", "whitespace-in-value", false),
                ("underscore_in-value", "underscore_in-value", false),
                (
                    "__strip__underscore-value___",
                    "strip__underscore-value",
                    false,
                ),
                ("--strip-dash-value---", "strip-dash-value", false),
                ("__strip-mixed-value---", "strip-mixed-value", false),
                ("_ -strip-mixed-value _-", "strip-mixed-value", false),
            ];

            for (value, output, allow_unicode) in items {
                assert_eq!(slugify(value, allow_unicode), output);
            }
        }

        #[test]
        fn test_slugify_unicode() {
            // (input, expected slug) with `allow_unicode = true`
            let items = [
                ("spam & ıçüş", "spam-ıçüş"),
                ("foo ıç bar", "foo-ıç-bar"),
                ("    foo ıç bar", "foo-ıç-bar"),
                ("你好", "你好"),
                ("İstanbul", "istanbul"),
            ];

            for (value, output) in items {
                assert_eq!(slugify(value, true), output);
            }
        }

        // --- camel_case_to_spaces ---

        #[test]
        fn test_camel_case_to_spaces() {
            assert_eq!(camel_case_to_spaces("CamelCaseValue"), "camel case value");
            assert_eq!(camel_case_to_spaces("HTMLParser"), "html parser");
            assert_eq!(camel_case_to_spaces(" lowerCamelCase "), "lower camel case");
        }
    }
}