oxidize_pdf/text/
encoding.rs

/// Single-byte text encodings that can be selected for PDF text.
///
/// `WinAnsiEncoding` and `MacRomanEncoding` are translated through explicit
/// code-page tables. `StandardEncoding` and `PdfDocEncoding` are currently
/// placeholders that pass the text through as UTF-8 bytes.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TextEncoding {
    /// Placeholder implementation: bytes are the UTF-8 form of the text.
    StandardEncoding,
    /// Mac OS Roman code page (ASCII plus 128 table-mapped high bytes).
    MacRomanEncoding,
    /// Windows-1252 code page (ASCII, Latin-1 0xA0..=0xFF, and 27 specials).
    WinAnsiEncoding,
    /// Placeholder implementation: bytes are the UTF-8 form of the text.
    PdfDocEncoding,
}

impl TextEncoding {
    /// Windows-1252 assignments in 0x80..=0x9F, as `(byte, character)` pairs.
    ///
    /// Bytes missing from this list (0x81, 0x8D, 0x8F, 0x90, 0x9D) are
    /// undefined and decode to `?`.
    #[rustfmt::skip]
    const WIN_ANSI_C1: [(u8, char); 27] = [
        (0x80, '\u{20AC}'), // Euro sign
        (0x82, '\u{201A}'), // Single low quotation mark
        (0x83, '\u{0192}'), // Latin small letter f with hook
        (0x84, '\u{201E}'), // Double low quotation mark
        (0x85, '\u{2026}'), // Horizontal ellipsis
        (0x86, '\u{2020}'), // Dagger
        (0x87, '\u{2021}'), // Double dagger
        (0x88, '\u{02C6}'), // Circumflex accent
        (0x89, '\u{2030}'), // Per mille sign
        (0x8A, '\u{0160}'), // Latin capital letter S with caron
        (0x8B, '\u{2039}'), // Single left angle quotation mark
        (0x8C, '\u{0152}'), // Latin capital ligature OE
        (0x8E, '\u{017D}'), // Latin capital letter Z with caron
        (0x91, '\u{2018}'), // Left single quotation mark
        (0x92, '\u{2019}'), // Right single quotation mark
        (0x93, '\u{201C}'), // Left double quotation mark
        (0x94, '\u{201D}'), // Right double quotation mark
        (0x95, '\u{2022}'), // Bullet
        (0x96, '\u{2013}'), // En dash
        (0x97, '\u{2014}'), // Em dash
        (0x98, '\u{02DC}'), // Small tilde
        (0x99, '\u{2122}'), // Trade mark sign
        (0x9A, '\u{0161}'), // Latin small letter s with caron
        (0x9B, '\u{203A}'), // Single right angle quotation mark
        (0x9C, '\u{0153}'), // Latin small ligature oe
        (0x9E, '\u{017E}'), // Latin small letter z with caron
        (0x9F, '\u{0178}'), // Latin capital letter Y with diaeresis
    ];

    /// Mac Roman characters for bytes 0x80..=0xFF; index `i` is byte `0x80 + i`.
    ///
    /// Both `encode` and `decode` read this one table, so the two directions
    /// always agree.
    #[rustfmt::skip]
    const MAC_ROMAN_HIGH: [char; 128] = [
        // 0x80..=0x87
        '\u{00C4}', '\u{00C5}', '\u{00C7}', '\u{00C9}', '\u{00D1}', '\u{00D6}', '\u{00DC}', '\u{00E1}',
        // 0x88..=0x8F
        '\u{00E0}', '\u{00E2}', '\u{00E4}', '\u{00E3}', '\u{00E5}', '\u{00E7}', '\u{00E9}', '\u{00E8}',
        // 0x90..=0x97
        '\u{00EA}', '\u{00EB}', '\u{00ED}', '\u{00EC}', '\u{00EE}', '\u{00EF}', '\u{00F1}', '\u{00F3}',
        // 0x98..=0x9F
        '\u{00F2}', '\u{00F4}', '\u{00F6}', '\u{00F5}', '\u{00FA}', '\u{00F9}', '\u{00FB}', '\u{00FC}',
        // 0xA0..=0xA7
        '\u{2020}', '\u{00B0}', '\u{00A2}', '\u{00A3}', '\u{00A7}', '\u{2022}', '\u{00B6}', '\u{00DF}',
        // 0xA8..=0xAF
        '\u{00AE}', '\u{00A9}', '\u{2122}', '\u{00B4}', '\u{00A8}', '\u{2260}', '\u{00C6}', '\u{00D8}',
        // 0xB0..=0xB7
        '\u{221E}', '\u{00B1}', '\u{2264}', '\u{2265}', '\u{00A5}', '\u{00B5}', '\u{2202}', '\u{2211}',
        // 0xB8..=0xBF
        '\u{220F}', '\u{03C0}', '\u{222B}', '\u{00AA}', '\u{00BA}', '\u{03A9}', '\u{00E6}', '\u{00F8}',
        // 0xC0..=0xC7
        '\u{00BF}', '\u{00A1}', '\u{00AC}', '\u{221A}', '\u{0192}', '\u{2248}', '\u{2206}', '\u{00AB}',
        // 0xC8..=0xCF
        '\u{00BB}', '\u{2026}', '\u{00A0}', '\u{00C0}', '\u{00C3}', '\u{00D5}', '\u{0152}', '\u{0153}',
        // 0xD0..=0xD7
        '\u{2013}', '\u{2014}', '\u{201C}', '\u{201D}', '\u{2018}', '\u{2019}', '\u{00F7}', '\u{25CA}',
        // 0xD8..=0xDF
        '\u{00FF}', '\u{0178}', '\u{2044}', '\u{20AC}', '\u{2039}', '\u{203A}', '\u{FB01}', '\u{FB02}',
        // 0xE0..=0xE7
        '\u{2021}', '\u{00B7}', '\u{201A}', '\u{201E}', '\u{2030}', '\u{00C2}', '\u{00CA}', '\u{00C1}',
        // 0xE8..=0xEF
        '\u{00CB}', '\u{00C8}', '\u{00CD}', '\u{00CE}', '\u{00CF}', '\u{00CC}', '\u{00D3}', '\u{00D4}',
        // 0xF0..=0xF7 (0xF0 is the private-use Apple logo)
        '\u{F8FF}', '\u{00D2}', '\u{00DA}', '\u{00DB}', '\u{00D9}', '\u{0131}', '\u{02C6}', '\u{02DC}',
        // 0xF8..=0xFF
        '\u{00AF}', '\u{02D8}', '\u{02D9}', '\u{02DA}', '\u{00B8}', '\u{02DD}', '\u{02DB}', '\u{02C7}',
    ];

    /// Converts `text` into the byte representation of this encoding.
    ///
    /// For `WinAnsiEncoding` and `MacRomanEncoding` every character becomes
    /// exactly one byte; characters the code page cannot represent are
    /// replaced with `b'?'`. `StandardEncoding` and `PdfDocEncoding` currently
    /// return the UTF-8 bytes of `text` unchanged.
    pub fn encode(&self, text: &str) -> Vec<u8> {
        match self {
            // For now, use UTF-8 encoding.
            Self::StandardEncoding | Self::PdfDocEncoding => text.as_bytes().to_vec(),
            Self::WinAnsiEncoding => text.chars().map(Self::win_ansi_byte).collect(),
            Self::MacRomanEncoding => text.chars().map(Self::mac_roman_byte).collect(),
        }
    }

    /// Converts bytes in this encoding back into a `String`.
    ///
    /// For `WinAnsiEncoding`, bytes with no Windows-1252 assignment decode to
    /// `'?'`. `MacRomanEncoding` has a character for every byte.
    /// `StandardEncoding` and `PdfDocEncoding` currently treat `data` as UTF-8
    /// and substitute U+FFFD for invalid sequences.
    pub fn decode(&self, data: &[u8]) -> String {
        match self {
            // For now, assume UTF-8.
            Self::StandardEncoding | Self::PdfDocEncoding => {
                String::from_utf8_lossy(data).into_owned()
            }
            Self::WinAnsiEncoding => data.iter().map(|&byte| Self::win_ansi_char(byte)).collect(),
            Self::MacRomanEncoding => data.iter().map(|&byte| Self::mac_roman_char(byte)).collect(),
        }
    }

    /// Maps one character to its Windows-1252 byte, or `b'?'` if it has none.
    fn win_ansi_byte(ch: char) -> u8 {
        match ch {
            // ASCII and the Latin-1 supplement share their code point value;
            // the pattern guarantees the cast cannot truncate.
            '\0'..='\x7F' | '\u{A0}'..='\u{FF}' => ch as u8,
            _ => Self::WIN_ANSI_C1
                .iter()
                .find(|&&(_, mapped)| mapped == ch)
                .map_or(b'?', |&(byte, _)| byte),
        }
    }

    /// Maps one Windows-1252 byte to its character, or `'?'` if undefined.
    fn win_ansi_char(byte: u8) -> char {
        match byte {
            0x80..=0x9F => Self::WIN_ANSI_C1
                .iter()
                .find(|&&(code, _)| code == byte)
                .map_or('?', |&(_, ch)| ch),
            // ASCII and Latin-1 bytes equal their Unicode code point.
            _ => char::from(byte),
        }
    }

    /// Maps one character to its Mac Roman byte, or `b'?'` if it has none.
    fn mac_roman_byte(ch: char) -> u8 {
        if ch.is_ascii() {
            return ch as u8;
        }
        Self::MAC_ROMAN_HIGH
            .iter()
            .position(|&mapped| mapped == ch)
            // The index is below 128, so the sum stays within 0x80..=0xFF.
            .map_or(b'?', |index| 0x80 + index as u8)
    }

    /// Maps one Mac Roman byte to its character; every byte is defined.
    fn mac_roman_char(byte: u8) -> char {
        match byte.checked_sub(0x80) {
            Some(index) => Self::MAC_ROMAN_HIGH[usize::from(index)],
            None => char::from(byte),
        }
    }
}
318
319#[cfg(test)]
320mod tests {
321    use super::*;
322
323    #[test]
324    fn test_text_encoding_variants() {
325        let encodings = [
326            TextEncoding::StandardEncoding,
327            TextEncoding::MacRomanEncoding,
328            TextEncoding::WinAnsiEncoding,
329            TextEncoding::PdfDocEncoding,
330        ];
331
332        for encoding in &encodings {
333            assert_eq!(*encoding, *encoding);
334        }
335
336        assert_ne!(
337            TextEncoding::StandardEncoding,
338            TextEncoding::WinAnsiEncoding
339        );
340    }
341
342    #[test]
343    fn test_standard_encoding_basic_ascii() {
344        let encoding = TextEncoding::StandardEncoding;
345        let text = "Hello World!";
346
347        let encoded = encoding.encode(text);
348        let decoded = encoding.decode(&encoded);
349        assert_eq!(decoded, text);
350    }
351
352    #[test]
353    fn test_win_ansi_encoding_special_chars() {
354        let encoding = TextEncoding::WinAnsiEncoding;
355
356        // Test Euro sign
357        let text = "€100";
358        let encoded = encoding.encode(text);
359        assert_eq!(encoded[0], 0x80);
360        let decoded = encoding.decode(&encoded);
361        assert_eq!(decoded, text);
362
363        // Test other special characters
364        let text2 = "Hello—World"; // Em dash
365        let encoded2 = encoding.encode(text2);
366        let decoded2 = encoding.decode(&encoded2);
367        assert_eq!(decoded2, text2);
368    }
369
370    #[test]
371    fn test_mac_roman_encoding_special_chars() {
372        let encoding = TextEncoding::MacRomanEncoding;
373
374        // Test accented characters
375        let text = "café";
376        let encoded = encoding.encode(text);
377        assert_eq!(encoded[3], 0x8E); // é
378        let decoded = encoding.decode(&encoded);
379        assert_eq!(decoded, text);
380
381        // Test Apple logo (special Mac character)
382        let apple_bytes = vec![0xF0];
383        let decoded_apple = encoding.decode(&apple_bytes);
384        assert_eq!(decoded_apple, "\u{F8FF}");
385
386        // Test various accented characters
387        let text2 = "Zürich";
388        let encoded2 = encoding.encode(text2);
389        assert_eq!(encoded2[1], 0x9F); // ü
390        let decoded2 = encoding.decode(&encoded2);
391        assert_eq!(decoded2, text2);
392    }
393
394    #[test]
395    fn test_pdf_doc_encoding() {
396        let encoding = TextEncoding::PdfDocEncoding;
397        let text = "PDF Document";
398
399        let encoded = encoding.encode(text);
400        let decoded = encoding.decode(&encoded);
401
402        assert_eq!(text, decoded);
403    }
404
405    #[test]
406    fn test_pdf_doc_encoding_basic_ascii() {
407        let encoding = TextEncoding::PdfDocEncoding;
408        let text = "Hello World!";
409
410        let encoded = encoding.encode(text);
411        let decoded = encoding.decode(&encoded);
412
413        assert_eq!(text, decoded);
414    }
415
416    #[test]
417    fn test_mac_roman_encoding_basic_ascii() {
418        let encoding = TextEncoding::MacRomanEncoding;
419        let text = "Hello World!";
420
421        let encoded = encoding.encode(text);
422        let decoded = encoding.decode(&encoded);
423
424        assert_eq!(text, decoded);
425    }
426
427    #[test]
428    fn test_win_ansi_encoding_basic_ascii() {
429        let encoding = TextEncoding::WinAnsiEncoding;
430        let text = "Hello World!";
431
432        let encoded = encoding.encode(text);
433        let decoded = encoding.decode(&encoded);
434
435        assert_eq!(text, decoded);
436    }
437
438    #[test]
439    fn test_win_ansi_encoding_special_characters() {
440        let encoding = TextEncoding::WinAnsiEncoding;
441
442        // Test Euro sign
443        let euro_text = "€";
444        let encoded = encoding.encode(euro_text);
445        assert_eq!(encoded, vec![0x80]);
446        let decoded = encoding.decode(&encoded);
447        assert_eq!(decoded, euro_text);
448
449        // Test em dash
450        let dash_text = "—";
451        let encoded = encoding.encode(dash_text);
452        assert_eq!(encoded, vec![0x97]);
453        let decoded = encoding.decode(&encoded);
454        assert_eq!(decoded, dash_text);
455
456        // Test single low quotation mark
457        let quote_text = "‚";
458        let encoded = encoding.encode(quote_text);
459        assert_eq!(encoded, vec![0x82]);
460        let decoded = encoding.decode(&encoded);
461        assert_eq!(decoded, quote_text);
462    }
463
464    #[test]
465    fn test_win_ansi_encoding_latin_supplement() {
466        let encoding = TextEncoding::WinAnsiEncoding;
467        let text = "café";
468
469        let encoded = encoding.encode(text);
470        let decoded = encoding.decode(&encoded);
471
472        assert_eq!(text, decoded);
473    }
474
475    #[test]
476    fn test_win_ansi_encoding_unmapped_character() {
477        let encoding = TextEncoding::WinAnsiEncoding;
478
479        // Use a character that's not in Windows-1252
480        let text = "❤"; // Heart emoji
481        let encoded = encoding.encode(text);
482        assert_eq!(encoded, vec![b'?']); // Should be replaced with ?
483
484        let decoded = encoding.decode(&encoded);
485        assert_eq!(decoded, "?");
486    }
487
488    #[test]
489    fn test_win_ansi_encoding_round_trip_special_chars() {
490        let encoding = TextEncoding::WinAnsiEncoding;
491
492        let special_chars = [
493            ("€", 0x80),        // Euro sign
494            ("‚", 0x82),        // Single low quotation mark
495            ("ƒ", 0x83),        // Latin small letter f with hook
496            ("„", 0x84),        // Double low quotation mark
497            ("…", 0x85),        // Horizontal ellipsis
498            ("†", 0x86),        // Dagger
499            ("‡", 0x87),        // Double dagger
500            ("‰", 0x89),        // Per mille sign
501            ("\u{2018}", 0x91), // Left single quotation mark
502            ("\u{2019}", 0x92), // Right single quotation mark
503            ("\u{201C}", 0x93), // Left double quotation mark
504            ("\u{201D}", 0x94), // Right double quotation mark
505            ("•", 0x95),        // Bullet
506            ("–", 0x96),        // En dash
507            ("—", 0x97),        // Em dash
508            ("™", 0x99),        // Trade mark sign
509        ];
510
511        for (text, expected_byte) in &special_chars {
512            let encoded = encoding.encode(text);
513            assert_eq!(encoded, vec![*expected_byte], "Failed for character {text}");
514
515            let decoded = encoding.decode(&encoded);
516            assert_eq!(decoded, *text, "Round trip failed for character {text}");
517        }
518    }
519
520    #[test]
521    fn test_encoding_equality() {
522        assert_eq!(
523            TextEncoding::StandardEncoding,
524            TextEncoding::StandardEncoding
525        );
526        assert_eq!(TextEncoding::WinAnsiEncoding, TextEncoding::WinAnsiEncoding);
527
528        assert_ne!(
529            TextEncoding::StandardEncoding,
530            TextEncoding::WinAnsiEncoding
531        );
532        assert_ne!(TextEncoding::MacRomanEncoding, TextEncoding::PdfDocEncoding);
533    }
534
535    #[test]
536    fn test_encoding_debug() {
537        let encoding = TextEncoding::WinAnsiEncoding;
538        let debug_str = format!("{encoding:?}");
539        assert_eq!(debug_str, "WinAnsiEncoding");
540    }
541
542    #[test]
543    fn test_encoding_clone() {
544        let encoding1 = TextEncoding::PdfDocEncoding;
545        let encoding2 = encoding1;
546        assert_eq!(encoding1, encoding2);
547    }
548
549    #[test]
550    fn test_encoding_copy() {
551        let encoding1 = TextEncoding::StandardEncoding;
552        let encoding2 = encoding1; // Copy semantics
553        assert_eq!(encoding1, encoding2);
554
555        // Both variables should still be usable
556        assert_eq!(encoding1, TextEncoding::StandardEncoding);
557        assert_eq!(encoding2, TextEncoding::StandardEncoding);
558    }
559
560    #[test]
561    fn test_empty_string_encoding() {
562        for encoding in &[
563            TextEncoding::StandardEncoding,
564            TextEncoding::MacRomanEncoding,
565            TextEncoding::WinAnsiEncoding,
566            TextEncoding::PdfDocEncoding,
567        ] {
568            let encoded = encoding.encode("");
569            assert!(encoded.is_empty());
570
571            let decoded = encoding.decode(&[]);
572            assert!(decoded.is_empty());
573        }
574    }
575
576    #[test]
577    fn test_win_ansi_decode_undefined_bytes() {
578        let encoding = TextEncoding::WinAnsiEncoding;
579
580        // Test some undefined bytes in Windows-1252 (0x81, 0x8D, 0x8F, 0x90, 0x9D)
581        let undefined_bytes = [0x81, 0x8D, 0x8F, 0x90, 0x9D];
582
583        for &byte in &undefined_bytes {
584            let decoded = encoding.decode(&[byte]);
585            assert_eq!(
586                decoded, "?",
587                "Undefined byte 0x{byte:02X} should decode to '?'"
588            );
589        }
590    }
591
592    #[test]
593    fn test_win_ansi_ascii_range() {
594        let encoding = TextEncoding::WinAnsiEncoding;
595
596        // Test ASCII range (0x00-0x7F)
597        for byte in 0x20..=0x7E {
598            // Printable ASCII
599            let text = char::from(byte).to_string();
600            let encoded = encoding.encode(&text);
601            assert_eq!(encoded, vec![byte]);
602
603            let decoded = encoding.decode(&encoded);
604            assert_eq!(decoded, text);
605        }
606    }
607
608    #[test]
609    fn test_win_ansi_latin1_overlap() {
610        let encoding = TextEncoding::WinAnsiEncoding;
611
612        // Test Latin-1 range that overlaps with Windows-1252 (0xA0-0xFF)
613        let test_chars = "¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ";
614
615        let encoded = encoding.encode(test_chars);
616        let decoded = encoding.decode(&encoded);
617
618        assert_eq!(decoded, test_chars);
619    }
620
621    #[test]
622    fn test_mac_roman_encode_special_characters() {
623        let encoding = TextEncoding::MacRomanEncoding;
624
625        // Test Mac Roman specific mappings
626        let test_cases = [
627            ("Ä", 0x80), // A with diaeresis
628            ("Å", 0x81), // A with ring
629            ("Ç", 0x82), // C with cedilla
630            ("É", 0x83), // E with acute
631            ("Ñ", 0x84), // N with tilde
632            ("Ö", 0x85), // O with diaeresis
633            ("Ü", 0x86), // U with diaeresis
634            ("á", 0x87), // a with acute
635            ("à", 0x88), // a with grave
636            ("â", 0x89), // a with circumflex
637            ("ä", 0x8A), // a with diaeresis
638            ("ã", 0x8B), // a with tilde
639            ("å", 0x8C), // a with ring
640            ("ç", 0x8D), // c with cedilla
641            ("é", 0x8E), // e with acute
642            ("è", 0x8F), // e with grave
643            ("ê", 0x90), // e with circumflex
644            ("ë", 0x91), // e with diaeresis
645            ("í", 0x92), // i with acute
646            ("ì", 0x93), // i with grave
647            ("î", 0x94), // i with circumflex
648            ("ï", 0x95), // i with diaeresis
649            ("ñ", 0x96), // n with tilde
650            ("ó", 0x97), // o with acute
651            ("ò", 0x98), // o with grave
652            ("ô", 0x99), // o with circumflex
653            ("ö", 0x9A), // o with diaeresis
654            ("õ", 0x9B), // o with tilde
655            ("ú", 0x9C), // u with acute
656            ("ù", 0x9D), // u with grave
657            ("û", 0x9E), // u with circumflex
658            ("ü", 0x9F), // u with diaeresis
659        ];
660
661        for (text, expected_byte) in &test_cases {
662            let encoded = encoding.encode(text);
663            assert_eq!(
664                encoded,
665                vec![*expected_byte],
666                "Failed encoding {text} (U+{:04X})",
667                text.chars().next().unwrap() as u32
668            );
669        }
670    }
671
672    #[test]
673    fn test_mac_roman_encode_symbols() {
674        let encoding = TextEncoding::MacRomanEncoding;
675
676        let test_cases = [
677            ("†", 0xA0), // Dagger
678            ("°", 0xA1), // Degree sign
679            ("¢", 0xA2), // Cent sign
680            ("£", 0xA3), // Pound sign
681            ("§", 0xA4), // Section sign
682            ("•", 0xA5), // Bullet
683            ("¶", 0xA6), // Pilcrow sign
684            ("ß", 0xA7), // Sharp s
685            ("®", 0xA8), // Registered sign
686            ("©", 0xA9), // Copyright sign
687            ("™", 0xAA), // Trade mark sign
688            ("´", 0xAB), // Acute accent
689            ("¨", 0xAC), // Diaeresis
690            ("≠", 0xAD), // Not equal to
691            ("Æ", 0xAE), // AE ligature
692            ("Ø", 0xAF), // O with stroke
693        ];
694
695        for (text, expected_byte) in &test_cases {
696            let encoded = encoding.encode(text);
697            assert_eq!(encoded, vec![*expected_byte], "Failed encoding {text}");
698        }
699    }
700
701    #[test]
702    fn test_mac_roman_decode_extended_range() {
703        let encoding = TextEncoding::MacRomanEncoding;
704
705        // Test extended range (0xB0-0xFF)
706        let test_cases: Vec<(u8, char)> = vec![
707            (0xB0, '∞'),        // Infinity
708            (0xB1, '±'),        // Plus-minus
709            (0xB2, '≤'),        // Less-than or equal
710            (0xB3, '≥'),        // Greater-than or equal
711            (0xB4, '¥'),        // Yen sign
712            (0xB5, 'µ'),        // Micro sign
713            (0xB6, '∂'),        // Partial differential
714            (0xB7, '∑'),        // Summation
715            (0xB8, '∏'),        // Product
716            (0xB9, 'π'),        // Pi
717            (0xBA, '∫'),        // Integral
718            (0xBB, 'ª'),        // Feminine ordinal
719            (0xBC, 'º'),        // Masculine ordinal
720            (0xBD, 'Ω'),        // Omega
721            (0xBE, 'æ'),        // ae ligature
722            (0xBF, 'ø'),        // o with stroke
723            (0xC0, '¿'),        // Inverted question mark
724            (0xC1, '¡'),        // Inverted exclamation
725            (0xC2, '¬'),        // Not sign
726            (0xC3, '√'),        // Square root
727            (0xC4, 'ƒ'),        // f with hook
728            (0xC5, '≈'),        // Almost equal
729            (0xC6, '∆'),        // Increment
730            (0xC7, '«'),        // Left double angle quote
731            (0xC8, '»'),        // Right double angle quote
732            (0xC9, '…'),        // Horizontal ellipsis
733            (0xCA, '\u{00A0}'), // No-break space
734            (0xCB, 'À'),        // A with grave
735            (0xCC, 'Ã'),        // A with tilde
736            (0xCD, 'Õ'),        // O with tilde
737            (0xCE, 'Œ'),        // OE ligature
738            (0xCF, 'œ'),        // oe ligature
739        ];
740
741        for (byte, expected_char) in test_cases {
742            let decoded = encoding.decode(&[byte]);
743            assert_eq!(
744                decoded.chars().next().unwrap(),
745                expected_char,
746                "Failed decoding byte 0x{byte:02X}"
747            );
748        }
749    }
750
751    #[test]
752    fn test_mac_roman_decode_high_range() {
753        let encoding = TextEncoding::MacRomanEncoding;
754
755        let test_cases: Vec<(u8, char)> = vec![
756            (0xD0, '\u{2013}'), // En dash
757            (0xD1, '\u{2014}'), // Em dash
758            (0xD2, '\u{201C}'), // Left double quote
759            (0xD3, '\u{201D}'), // Right double quote
760            (0xD4, '\u{2018}'), // Left single quote
761            (0xD5, '\u{2019}'), // Right single quote
762            (0xD6, '\u{00F7}'), // Division sign
763            (0xD7, '\u{25CA}'), // Lozenge
764            (0xD8, '\u{00FF}'), // y with diaeresis
765            (0xD9, '\u{0178}'), // Y with diaeresis
766            (0xDA, '\u{2044}'), // Fraction slash
767            (0xDB, '\u{20AC}'), // Euro sign
768            (0xDC, '\u{2039}'), // Single left angle quote
769            (0xDD, '\u{203A}'), // Single right angle quote
770            (0xDE, '\u{FB01}'), // fi ligature
771            (0xDF, '\u{FB02}'), // fl ligature
772            (0xE0, '\u{2021}'), // Double dagger
773            (0xE1, '\u{00B7}'), // Middle dot
774            (0xE2, '\u{201A}'), // Single low quote
775            (0xE3, '\u{201E}'), // Double low quote
776            (0xE4, '\u{2030}'), // Per mille sign
777            (0xE5, '\u{00C2}'), // A with circumflex
778            (0xE6, '\u{00CA}'), // E with circumflex
779            (0xE7, '\u{00C1}'), // A with acute
780            (0xE8, '\u{00CB}'), // E with diaeresis
781            (0xE9, '\u{00C8}'), // E with grave
782            (0xEA, '\u{00CD}'), // I with acute
783            (0xEB, '\u{00CE}'), // I with circumflex
784            (0xEC, '\u{00CF}'), // I with diaeresis
785            (0xED, '\u{00CC}'), // I with grave
786            (0xEE, '\u{00D3}'), // O with acute
787            (0xEF, '\u{00D4}'), // O with circumflex
788        ];
789
790        for (byte, expected_char) in test_cases {
791            let decoded = encoding.decode(&[byte]);
792            assert_eq!(
793                decoded.chars().next().unwrap(),
794                expected_char,
795                "Failed decoding byte 0x{byte:02X}"
796            );
797        }
798    }
799
800    #[test]
801    fn test_mac_roman_decode_final_range() {
802        let encoding = TextEncoding::MacRomanEncoding;
803
804        let test_cases: Vec<(u8, char)> = vec![
805            (0xF0, '\u{F8FF}'), // Apple logo
806            (0xF1, 'Ò'),        // O with grave
807            (0xF2, 'Ú'),        // U with acute
808            (0xF3, 'Û'),        // U with circumflex
809            (0xF4, 'Ù'),        // U with grave
810            (0xF5, 'ı'),        // Dotless i
811            (0xF6, 'ˆ'),        // Circumflex modifier
812            (0xF7, '˜'),        // Small tilde
813            (0xF8, '¯'),        // Macron
814            (0xF9, '˘'),        // Breve
815            (0xFA, '˙'),        // Dot above
816            (0xFB, '˚'),        // Ring above
817            (0xFC, '¸'),        // Cedilla
818            (0xFD, '˝'),        // Double acute
819            (0xFE, '˛'),        // Ogonek
820            (0xFF, 'ˇ'),        // Caron
821        ];
822
823        for (byte, expected_char) in test_cases {
824            let decoded = encoding.decode(&[byte]);
825            assert_eq!(
826                decoded.chars().next().unwrap(),
827                expected_char,
828                "Failed decoding byte 0x{byte:02X}"
829            );
830        }
831    }
832
833    #[test]
834    fn test_mac_roman_unmapped_character() {
835        let encoding = TextEncoding::MacRomanEncoding;
836
837        // Use a character that's not in Mac Roman
838        let text = "❤"; // Heart emoji
839        let encoded = encoding.encode(text);
840        assert_eq!(encoded, vec![b'?']);
841    }
842
843    #[test]
844    fn test_win_ansi_encode_all_special_mappings() {
845        let encoding = TextEncoding::WinAnsiEncoding;
846
847        let test_cases = [
848            ("\u{0160}", 0x8A), // S with caron
849            ("\u{0152}", 0x8C), // OE ligature
850            ("\u{017D}", 0x8E), // Z with caron
851            ("\u{0161}", 0x9A), // s with caron
852            ("\u{0153}", 0x9C), // oe ligature
853            ("\u{017E}", 0x9E), // z with caron
854            ("\u{0178}", 0x9F), // Y with diaeresis
855            ("\u{02C6}", 0x88), // Circumflex
856            ("\u{02DC}", 0x98), // Small tilde
857            ("\u{2039}", 0x8B), // Single left angle quote
858            ("\u{203A}", 0x9B), // Single right angle quote
859        ];
860
861        for (text, expected_byte) in &test_cases {
862            let encoded = encoding.encode(text);
863            assert_eq!(
864                encoded,
865                vec![*expected_byte],
866                "Failed encoding {text} (U+{:04X})",
867                text.chars().next().unwrap() as u32
868            );
869        }
870    }
871
872    #[test]
873    fn test_long_text_encoding_roundtrip() {
874        let encodings = [
875            TextEncoding::StandardEncoding,
876            TextEncoding::WinAnsiEncoding,
877            TextEncoding::MacRomanEncoding,
878            TextEncoding::PdfDocEncoding,
879        ];
880
881        let long_text = "The quick brown fox jumps over the lazy dog. 0123456789!@#$%^&*()";
882
883        for encoding in &encodings {
884            let encoded = encoding.encode(long_text);
885            let decoded = encoding.decode(&encoded);
886            assert_eq!(decoded, long_text, "Failed for {encoding:?}");
887        }
888    }
889
890    #[test]
891    fn test_win_ansi_decode_all_special_bytes() {
892        let encoding = TextEncoding::WinAnsiEncoding;
893
894        // Test all defined special bytes
895        let test_cases: Vec<(u8, char)> = vec![
896            (0x80, '\u{20AC}'), // Euro sign
897            (0x82, '\u{201A}'), // Single low quotation mark
898            (0x83, '\u{0192}'), // f with hook
899            (0x84, '\u{201E}'), // Double low quotation mark
900            (0x85, '\u{2026}'), // Horizontal ellipsis
901            (0x86, '\u{2020}'), // Dagger
902            (0x87, '\u{2021}'), // Double dagger
903            (0x88, '\u{02C6}'), // Circumflex accent
904            (0x89, '\u{2030}'), // Per mille sign
905            (0x8A, '\u{0160}'), // S with caron
906            (0x8B, '\u{2039}'), // Single left angle quote
907            (0x8C, '\u{0152}'), // OE ligature
908            (0x8E, '\u{017D}'), // Z with caron
909            (0x91, '\u{2018}'), // Left single quotation mark
910            (0x92, '\u{2019}'), // Right single quotation mark
911            (0x93, '\u{201C}'), // Left double quotation mark
912            (0x94, '\u{201D}'), // Right double quotation mark
913            (0x95, '\u{2022}'), // Bullet
914            (0x96, '\u{2013}'), // En dash
915            (0x97, '\u{2014}'), // Em dash
916            (0x98, '\u{02DC}'), // Small tilde
917            (0x99, '\u{2122}'), // Trade mark sign
918            (0x9A, '\u{0161}'), // s with caron
919            (0x9B, '\u{203A}'), // Single right angle quote
920            (0x9C, '\u{0153}'), // oe ligature
921            (0x9E, '\u{017E}'), // z with caron
922            (0x9F, '\u{0178}'), // Y with diaeresis
923        ];
924
925        for (byte, expected_char) in test_cases {
926            let decoded = encoding.decode(&[byte]);
927            assert_eq!(
928                decoded.chars().next().unwrap(),
929                expected_char,
930                "Failed decoding byte 0x{byte:02X}"
931            );
932        }
933    }
934}
oxidize_pdf/text/encoding.rs

oxidize_pdf/text/
encoding.rs