use unicode_normalization::UnicodeNormalization;
/// Rewrites `input` into a canonical textual form.
///
/// The following steps are applied in order:
///
/// 1. The text is normalized to Unicode NFC.
/// 2. Every `\r\n` pair and every lone `\r` becomes a single `\n`.
/// 3. Trailing spaces and tabs are removed from each line; leading and
///    interior whitespace is left untouched.
/// 4. Empty lines at the end of the text are dropped.
/// 5. If anything remains, the result is terminated by exactly one `\n`.
///
/// Returns an empty `String` when no non-empty line survives steps 3 and 4
/// (for example for `""` or for input made only of blank lines).
pub(crate) fn canonicalize(input: &str) -> String {
    let nfc: String = input.nfc().collect();

    // Fold CRLF and bare CR into LF in a single pass over the characters.
    let mut unified = String::with_capacity(nfc.len());
    let mut chars = nfc.chars().peekable();
    while let Some(c) = chars.next() {
        match c {
            '\r' => {
                // Swallow the LF half of a CRLF pair so it yields one newline, not two.
                chars.next_if_eq(&'\n');
                unified.push('\n');
            }
            other => unified.push(other),
        }
    }

    // Borrow each trimmed line from `unified` instead of copying it into its own `String`.
    let mut lines: Vec<&str> = unified
        .split('\n')
        .map(|line| line.trim_end_matches([' ', '\t']))
        .collect();

    // Lines that are empty after trimming carry no content at the end of the text.
    while matches!(lines.last(), Some(line) if line.is_empty()) {
        lines.pop();
    }
    if lines.is_empty() {
        return String::new();
    }

    let mut result = lines.join("\n");
    result.push('\n');
    result
}
#[cfg(test)]
mod tests {
    use super::canonicalize;

    /// A CRLF pair is folded into a single LF.
    #[test]
    fn crlf_collapses_to_lf() {
        let got = canonicalize("a\r\nb\r\n");
        assert_eq!(got, "a\nb\n");
    }

    /// A CR that is not followed by LF still counts as a line break.
    #[test]
    fn lone_cr_becomes_lf() {
        let got = canonicalize("a\rb");
        assert_eq!(got, "a\nb\n");
    }

    /// Spaces and tabs at the end of each line are removed.
    #[test]
    fn trailing_whitespace_stripped_per_line() {
        let got = canonicalize("hello \nworld\t\n");
        assert_eq!(got, "hello\nworld\n");
    }

    /// Non-empty output always ends with a newline, even if the input did not.
    #[test]
    fn final_newline_enforced() {
        let got = canonicalize("no trailing newline");
        assert_eq!(got, "no trailing newline\n");
    }

    /// A run of empty lines at the end of the input is dropped.
    #[test]
    fn trailing_blank_lines_collapsed() {
        let got = canonicalize("a\n\n\n\n");
        assert_eq!(got, "a\n");
    }

    /// Input with no content produces an empty string, not a lone newline.
    #[test]
    fn empty_input_stays_empty() {
        for blank in ["", "\n\n\n"] {
            assert_eq!(canonicalize(blank), "");
        }
    }

    /// A base letter followed by a combining mark is composed into the
    /// precomposed code point.
    #[test]
    fn nfc_normalizes_combining_marks() {
        let expected = String::from("\u{00E9}\n");
        assert_eq!(canonicalize("e\u{0301}"), expected);
    }

    /// Canonicalizing already-canonical output leaves it unchanged.
    #[test]
    fn idempotent() {
        let first_pass = canonicalize("a\r\nb\r\n \n\n");
        let second_pass = canonicalize(&first_pass);
        assert_eq!(first_pass, second_pass);
    }
}