use unicode_normalization::UnicodeNormalization;
/// Cleans up text extracted from a PDF.
///
/// Each character is handled in one of three ways:
/// * a ligature known to `decompose_ligature` is replaced by its letters,
/// * a character that `is_ignorable` accepts is dropped,
/// * anything else is copied through unchanged.
///
/// The intermediate string is then recomposed to Unicode NFC before being
/// returned.
pub fn normalize_pdf_text(text: &str) -> String {
    let mut expanded = String::with_capacity(text.len());
    for ch in text.chars() {
        match decompose_ligature(ch) {
            Some(letters) => expanded.push_str(letters),
            // Invisible formatting characters are silently discarded.
            None if is_ignorable(ch) => {}
            None => expanded.push(ch),
        }
    }
    expanded.nfc().collect()
}
/// Returns the plain-letter spelling of a ligature or digraph character.
///
/// Yields `None` for every character that is not in the table below.
fn decompose_ligature(ch: char) -> Option<&'static str> {
    let letters = match ch {
        // Latin presentation-form ligatures (U+FB00..U+FB06).
        '\u{FB00}' => "ff",
        '\u{FB01}' => "fi",
        '\u{FB02}' => "fl",
        '\u{FB03}' => "ffi",
        '\u{FB04}' => "ffl",
        '\u{FB05}' => "st",
        '\u{FB06}' => "st",
        // IJ / ij digraphs.
        '\u{0132}' => "IJ",
        '\u{0133}' => "ij",
        // OE / oe.
        '\u{0152}' => "OE",
        '\u{0153}' => "oe",
        // AE / ae.
        '\u{00C6}' => "AE",
        '\u{00E6}' => "ae",
        _ => return None,
    };
    Some(letters)
}
/// Reports whether `ch` is one of the invisible characters that should be
/// removed from extracted text.
fn is_ignorable(ch: char) -> bool {
    const IGNORABLE: [char; 7] = [
        '\u{00AD}', // soft hyphen
        '\u{200B}', // zero width space
        '\u{200C}', // zero width non-joiner
        '\u{200D}', // zero width joiner
        '\u{FEFF}', // zero width no-break space / byte order mark
        '\u{2060}', // word joiner
        '\u{FFFC}', // object replacement character
    ];
    IGNORABLE.contains(&ch)
}
/// Squeezes every run of Unicode whitespace into a single ASCII space and
/// removes leading and trailing whitespace entirely.
///
/// An input that is empty or consists only of whitespace yields an empty
/// string.
pub fn collapse_whitespace(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so the buffer is empty only before the first one.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
/// Removes combining marks from `text`.
///
/// The input is decomposed to NFD, every character that
/// `unicode_normalization::char::is_combining_mark` classifies as a combining
/// mark is discarded, and the remainder is recomposed to NFC.
pub fn strip_diacritics(text: &str) -> String {
    let decomposed = text.nfd();
    let base_chars =
        decomposed.filter(|&ch| !unicode_normalization::char::is_combining_mark(ch));
    base_chars.nfc().collect()
}
/// Replaces typographic punctuation and spacing with plain ASCII equivalents.
///
/// Mappings applied per character:
/// * single quotation marks U+2018..U+201B become `'`
/// * double quotation marks U+201C..U+201F become `"`
/// * hyphen, non-breaking hyphen, figure dash and en dash (U+2010..U+2013)
///   become `-`
/// * em dash U+2014 becomes `--`
/// * horizontal ellipsis U+2026 becomes `...`
/// * no-break space U+00A0, every fixed-width space U+2000..U+200A, narrow
///   no-break space U+202F, medium mathematical space U+205F and ideographic
///   space U+3000 become a regular space
///
/// Every other character, including zero-width characters, is copied
/// unchanged.
pub fn normalize_typography(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '\u{2018}' | '\u{2019}' | '\u{201A}' | '\u{201B}' => result.push('\''),
            '\u{201C}' | '\u{201D}' | '\u{201E}' | '\u{201F}' => result.push('"'),
            // All short hyphen/dash variants fold to the ASCII hyphen-minus.
            '\u{2010}' | '\u{2011}' | '\u{2012}' | '\u{2013}' => result.push('-'),
            '\u{2014}' => result.push_str("--"),
            '\u{2026}' => result.push_str("..."),
            // Cover the whole family of typographic spaces, not just en/em/thin,
            // so the function behaves uniformly for every space width.
            '\u{00A0}' | '\u{2000}'..='\u{200A}' | '\u{202F}' | '\u{205F}' | '\u{3000}' => {
                result.push(' ')
            }
            _ => result.push(ch),
        }
    }
    result
}
/// Runs the complete normalization pipeline on `text`.
///
/// The stages are applied in this order: `normalize_pdf_text`, then
/// `normalize_typography`, then `collapse_whitespace`.
pub fn full_normalize(text: &str) -> String {
    let cleaned = normalize_pdf_text(text);
    let ascii_punctuation = normalize_typography(&cleaned);
    collapse_whitespace(&ascii_punctuation)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Applies `normalize` to every input in `cases` and asserts that it
    /// produces the paired expected output.
    fn check_all(normalize: fn(&str) -> String, cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(normalize(input), expected);
        }
    }

    #[test]
    fn test_ligature_decomposition() {
        check_all(
            normalize_pdf_text,
            &[
                ("e\u{FB03}cient", "efficient"),
                ("\u{FB01}le", "file"),
                ("\u{FB02}oor", "floor"),
                ("\u{FB00}ect", "ffect"),
                ("\u{FB04}e", "ffle"),
            ],
        );
    }

    #[test]
    fn test_soft_hyphen_removal() {
        check_all(
            normalize_pdf_text,
            &[("con\u{00AD}tin\u{00AD}ue", "continue")],
        );
    }

    #[test]
    fn test_zero_width_removal() {
        check_all(
            normalize_pdf_text,
            &[("he\u{200B}llo\u{FEFF} world", "hello world")],
        );
    }

    #[test]
    fn test_nfc_after_decomposition() {
        // A base letter followed by a combining acute must come back composed.
        check_all(normalize_pdf_text, &[("e\u{0301}", "é")]);
    }

    #[test]
    fn test_collapse_whitespace() {
        check_all(
            collapse_whitespace,
            &[
                (" hello world ", "hello world"),
                ("a\n\n b\tc", "a b c"),
                ("", ""),
                (" ", ""),
            ],
        );
    }

    #[test]
    fn test_plain_text_unchanged() {
        check_all(normalize_pdf_text, &[("Hello World", "Hello World")]);
    }

    #[test]
    fn test_strip_diacritics() {
        check_all(
            strip_diacritics,
            &[
                ("café", "cafe"),
                ("über", "uber"),
                ("naïve", "naive"),
                ("résumé", "resume"),
            ],
        );
    }

    #[test]
    fn test_normalize_typography() {
        check_all(
            normalize_typography,
            &[
                ("\u{201C}hello\u{201D}", "\"hello\""),
                ("it\u{2019}s", "it's"),
                ("a\u{2013}b", "a-b"),
                ("wait\u{2026}", "wait..."),
            ],
        );
    }

    #[test]
    fn test_full_normalize() {
        // Exercises ligature expansion, ignorable removal, typography folding
        // and whitespace collapsing in a single pass.
        check_all(
            full_normalize,
            &[(
                "\u{FB01}le\u{00AD}name\u{200B} \u{201C}test\u{201D} \u{2026}",
                "filename \"test\" ...",
            )],
        );
    }
}