Skip to main content

pdfplumber_core/
unicode_norm.rs

1//! Unicode normalization for extracted text.
2//!
3//! Provides [`UnicodeNorm`] enum for selecting normalization form and
4//! [`normalize_chars`] for applying normalization to character text.
5
6use unicode_normalization::UnicodeNormalization;
7
8use crate::text::Char;
9
/// Selects which Unicode normalization form is applied to extracted text.
///
/// PDF producers are free to encode the same visible glyph in more than one
/// way, for instance a precomposed accented letter versus a base letter
/// followed by a combining mark. Choosing a normalization form makes the
/// extracted text uniform no matter which encoding the source document used.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum UnicodeNorm {
    /// Leave the text untouched. This is the default variant.
    #[default]
    None,
    /// NFC: canonical decomposition, then canonical composition.
    Nfc,
    /// NFD: canonical decomposition only.
    Nfd,
    /// NFKC: compatibility decomposition, then canonical composition.
    Nfkc,
    /// NFKD: compatibility decomposition only.
    Nfkd,
}
30
31impl UnicodeNorm {
32    /// Apply this normalization form to the given string.
33    ///
34    /// Returns the input unchanged if normalization is `None`.
35    pub fn normalize(&self, text: &str) -> String {
36        match self {
37            UnicodeNorm::None => text.to_string(),
38            UnicodeNorm::Nfc => text.nfc().collect(),
39            UnicodeNorm::Nfd => text.nfd().collect(),
40            UnicodeNorm::Nfkc => text.nfkc().collect(),
41            UnicodeNorm::Nfkd => text.nfkd().collect(),
42        }
43    }
44}
45
46/// Apply Unicode normalization to the text of each character.
47///
48/// Returns a new `Vec<Char>` with normalized text values. All other
49/// fields (bbox, fontname, size, etc.) are preserved unchanged.
50/// If normalization is `None`, the chars are returned as clones.
51pub fn normalize_chars(chars: &[Char], norm: &UnicodeNorm) -> Vec<Char> {
52    if *norm == UnicodeNorm::None {
53        return chars.to_vec();
54    }
55
56    chars
57        .iter()
58        .map(|ch| {
59            let mut normalized = ch.clone();
60            normalized.text = norm.normalize(&ch.text);
61            normalized
62        })
63        .collect()
64}
65
#[cfg(test)]
mod tests {
    use super::*;
    use crate::geometry::BBox;
    use crate::text::TextDirection;

    /// "café" spelled with a decomposed é: `e` followed by U+0301 COMBINING ACUTE ACCENT.
    const CAFE_DECOMPOSED: &str = "caf\u{0065}\u{0301}";
    /// "café" spelled with the precomposed é (U+00E9).
    const CAFE_COMPOSED: &str = "caf\u{00E9}";
    /// U+FB01 LATIN SMALL LIGATURE FI.
    const FI_LIGATURE: &str = "\u{FB01}";
    /// U+FF21 FULLWIDTH LATIN CAPITAL LETTER A.
    const FULLWIDTH_A: &str = "\u{FF21}";

    /// Build a `Char` fixture whose only varying field is its text.
    fn sample_char(text: &str) -> Char {
        Char {
            text: text.to_owned(),
            bbox: BBox::new(0.0, 0.0, 10.0, 12.0),
            fontname: String::from("TestFont"),
            size: 12.0,
            doctop: 0.0,
            upright: true,
            direction: TextDirection::Ltr,
            stroking_color: None,
            non_stroking_color: None,
            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
            char_code: 0,
            mcid: None,
            tag: None,
        }
    }

    #[test]
    fn unicode_norm_default_is_none() {
        assert_eq!(UnicodeNorm::default(), UnicodeNorm::None);
    }

    #[test]
    fn normalize_none_returns_unchanged() {
        // Decomposed input must come back exactly as it went in.
        let output = UnicodeNorm::None.normalize(CAFE_DECOMPOSED);
        assert_eq!(output, CAFE_DECOMPOSED);
    }

    #[test]
    fn normalize_nfc_composes_characters() {
        // "e" + combining acute accent collapses into the single code point é.
        let output = UnicodeNorm::Nfc.normalize(CAFE_DECOMPOSED);
        assert_eq!(output, CAFE_COMPOSED);
    }

    #[test]
    fn normalize_nfd_decomposes_characters() {
        // Precomposed é splits into "e" + combining acute accent.
        let output = UnicodeNorm::Nfd.normalize(CAFE_COMPOSED);
        assert_eq!(output, CAFE_DECOMPOSED);
    }

    #[test]
    fn normalize_nfkc_decomposes_compatibility_and_composes() {
        // The fi ligature (U+FB01) expands to the two ASCII letters "fi".
        let output = UnicodeNorm::Nfkc.normalize(FI_LIGATURE);
        assert_eq!(output, "fi");
    }

    #[test]
    fn normalize_nfkd_decomposes_compatibility() {
        // Same ligature expansion, this time without the recomposition step.
        let output = UnicodeNorm::Nfkd.normalize(FI_LIGATURE);
        assert_eq!(output, "fi");
    }

    #[test]
    fn normalize_nfkc_fullwidth_to_ascii() {
        // Fullwidth "A" (U+FF21) maps to plain ASCII "A".
        let output = UnicodeNorm::Nfkc.normalize(FULLWIDTH_A);
        assert_eq!(output, "A");
    }

    #[test]
    fn normalize_chars_none_preserves_original() {
        let input = vec![sample_char(CAFE_DECOMPOSED), sample_char("hello")];
        let output = normalize_chars(&input, &UnicodeNorm::None);
        assert_eq!(output[0].text, CAFE_DECOMPOSED);
        assert_eq!(output[1].text, "hello");
    }

    #[test]
    fn normalize_chars_nfc_composes() {
        let input = vec![
            sample_char(CAFE_DECOMPOSED), // decomposed é
            sample_char("hello"),
        ];
        let output = normalize_chars(&input, &UnicodeNorm::Nfc);
        assert_eq!(output[0].text, CAFE_COMPOSED); // now composed
        assert_eq!(output[1].text, "hello"); // plain ASCII is unaffected
    }

    #[test]
    fn normalize_chars_preserves_other_fields() {
        let mut ligature = sample_char(FI_LIGATURE);
        ligature.fontname = String::from("SpecialFont");
        ligature.size = 14.0;
        ligature.bbox = BBox::new(10.0, 20.0, 30.0, 40.0);

        let output = normalize_chars(&[ligature], &UnicodeNorm::Nfkc);
        let only = &output[0];
        // The text is rewritten...
        assert_eq!(only.text, "fi");
        // ...while the non-text fields are carried over untouched.
        assert_eq!(only.fontname, "SpecialFont");
        assert!((only.size - 14.0).abs() < f64::EPSILON);
        assert_eq!(only.bbox, BBox::new(10.0, 20.0, 30.0, 40.0));
    }

    #[test]
    fn normalize_chars_empty_input() {
        let output = normalize_chars(&[], &UnicodeNorm::Nfc);
        assert!(output.is_empty());
    }
}