pdfplumber_core/
unicode_norm.rs1use unicode_normalization::UnicodeNormalization;
7
8use crate::text::Char;
9
/// Unicode normalization form that can be applied to text.
///
/// The default is [`UnicodeNorm::None`], i.e. no normalization.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum UnicodeNorm {
    /// Leave text exactly as it is.
    #[default]
    None,
    /// NFC: canonical decomposition followed by canonical composition.
    Nfc,
    /// NFD: canonical decomposition.
    Nfd,
    /// NFKC: compatibility decomposition followed by canonical composition.
    Nfkc,
    /// NFKD: compatibility decomposition.
    Nfkd,
}
30
31impl UnicodeNorm {
32 pub fn normalize(&self, text: &str) -> String {
36 match self {
37 UnicodeNorm::None => text.to_string(),
38 UnicodeNorm::Nfc => text.nfc().collect(),
39 UnicodeNorm::Nfd => text.nfd().collect(),
40 UnicodeNorm::Nfkc => text.nfkc().collect(),
41 UnicodeNorm::Nfkd => text.nfkd().collect(),
42 }
43 }
44}
45
46pub fn normalize_chars(chars: &[Char], norm: &UnicodeNorm) -> Vec<Char> {
52 if *norm == UnicodeNorm::None {
53 return chars.to_vec();
54 }
55
56 chars
57 .iter()
58 .map(|ch| {
59 let mut normalized = ch.clone();
60 normalized.text = norm.normalize(&ch.text);
61 normalized
62 })
63 .collect()
64}
65
#[cfg(test)]
mod tests {
    use super::*;
    use crate::geometry::BBox;
    use crate::text::TextDirection;

    /// Builds a `Char` with fixed placeholder attributes and the given text.
    fn make_char(text: &str) -> Char {
        Char {
            text: text.to_string(),
            bbox: BBox::new(0.0, 0.0, 10.0, 12.0),
            fontname: "TestFont".to_string(),
            size: 12.0,
            doctop: 0.0,
            upright: true,
            direction: TextDirection::Ltr,
            stroking_color: None,
            non_stroking_color: None,
            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
            char_code: 0,
            mcid: None,
            tag: None,
        }
    }

    #[test]
    fn unicode_norm_default_is_none() {
        assert_eq!(UnicodeNorm::default(), UnicodeNorm::None);
    }

    #[test]
    fn normalize_none_returns_unchanged() {
        // "e" followed by a combining acute accent must stay decomposed.
        let text = "caf\u{0065}\u{0301}";
        let result = UnicodeNorm::None.normalize(text);
        assert_eq!(result, text);
    }

    #[test]
    fn normalize_nfc_composes_characters() {
        let decomposed = "caf\u{0065}\u{0301}";
        let result = UnicodeNorm::Nfc.normalize(decomposed);
        // "e" + combining acute composes to the single code point U+00E9.
        assert_eq!(result, "caf\u{00E9}");
    }

    #[test]
    fn normalize_nfd_decomposes_characters() {
        let composed = "caf\u{00E9}";
        let result = UnicodeNorm::Nfd.normalize(composed);
        assert_eq!(result, "caf\u{0065}\u{0301}");
    }

    #[test]
    fn normalize_nfkc_decomposes_compatibility_and_composes() {
        // Compatibility half: the "fi" ligature expands to two ASCII letters.
        let ligature = "\u{FB01}";
        let result = UnicodeNorm::Nfkc.normalize(ligature);
        assert_eq!(result, "fi");

        // Composition half: this is what distinguishes NFKC from NFKD.
        let decomposed = "caf\u{0065}\u{0301}";
        assert_eq!(UnicodeNorm::Nfkc.normalize(decomposed), "caf\u{00E9}");
    }

    #[test]
    fn normalize_nfkd_decomposes_compatibility() {
        let ligature = "\u{FB01}";
        let result = UnicodeNorm::Nfkd.normalize(ligature);
        assert_eq!(result, "fi");

        // NFKD must leave text decomposed rather than recomposing it.
        let composed = "caf\u{00E9}";
        assert_eq!(UnicodeNorm::Nfkd.normalize(composed), "caf\u{0065}\u{0301}");
    }

    #[test]
    fn normalize_nfkc_fullwidth_to_ascii() {
        // U+FF21 is FULLWIDTH LATIN CAPITAL LETTER A.
        let fullwidth = "\u{FF21}";
        let result = UnicodeNorm::Nfkc.normalize(fullwidth);
        assert_eq!(result, "A");
    }

    #[test]
    fn normalize_chars_none_preserves_original() {
        let chars = vec![make_char("caf\u{0065}\u{0301}"), make_char("hello")];
        let result = normalize_chars(&chars, &UnicodeNorm::None);
        assert_eq!(result[0].text, "caf\u{0065}\u{0301}");
        assert_eq!(result[1].text, "hello");
    }

    #[test]
    fn normalize_chars_nfc_composes() {
        let chars = vec![make_char("caf\u{0065}\u{0301}"), make_char("hello")];
        let result = normalize_chars(&chars, &UnicodeNorm::Nfc);
        assert_eq!(result[0].text, "caf\u{00E9}");
        assert_eq!(result[1].text, "hello");
    }

    #[test]
    fn normalize_chars_preserves_other_fields() {
        let mut ch = make_char("\u{FB01}");
        ch.fontname = "SpecialFont".to_string();
        ch.size = 14.0;
        ch.bbox = BBox::new(10.0, 20.0, 30.0, 40.0);

        let result = normalize_chars(&[ch], &UnicodeNorm::Nfkc);
        assert_eq!(result[0].text, "fi");
        assert_eq!(result[0].fontname, "SpecialFont");
        assert!((result[0].size - 14.0).abs() < f64::EPSILON);
        assert_eq!(result[0].bbox, BBox::new(10.0, 20.0, 30.0, 40.0));
    }

    #[test]
    fn normalize_chars_empty_input() {
        let result = normalize_chars(&[], &UnicodeNorm::Nfc);
        assert!(result.is_empty());
    }
}