// oxidize_pdf/text/encoding.rs
/// Text encodings that can be selected for PDF text strings.
///
/// `WinAnsiEncoding` and `MacRomanEncoding` are implemented as genuine
/// single-byte code pages. `StandardEncoding` and `PdfDocEncoding` currently
/// pass text through as raw UTF-8 bytes (see [`TextEncoding::encode`]).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TextEncoding {
    /// Currently handled as a UTF-8 pass-through.
    StandardEncoding,
    /// Mac Roman single-byte code page.
    MacRomanEncoding,
    /// Windows-1252 single-byte code page.
    WinAnsiEncoding,
    /// Currently handled as a UTF-8 pass-through.
    PdfDocEncoding,
}

impl TextEncoding {
    /// Byte written by `encode` for a character the target code page lacks.
    const UNMAPPED_BYTE: u8 = b'?';

    /// Character produced by `decode` for a byte the code page leaves undefined.
    const UNMAPPED_CHAR: char = '?';

    /// Windows-1252 assignments for bytes `0x80..=0x9F`, indexed by `byte - 0x80`.
    ///
    /// `None` marks the bytes that are treated as undefined
    /// (0x81, 0x8D, 0x8F, 0x90 and 0x9D).
    const WIN_ANSI_C1: [Option<char>; 32] = [
        Some('\u{20AC}'), None, Some('\u{201A}'), Some('\u{0192}'), // 0x80
        Some('\u{201E}'), Some('\u{2026}'), Some('\u{2020}'), Some('\u{2021}'), // 0x84
        Some('\u{02C6}'), Some('\u{2030}'), Some('\u{0160}'), Some('\u{2039}'), // 0x88
        Some('\u{0152}'), None, Some('\u{017D}'), None, // 0x8C
        None, Some('\u{2018}'), Some('\u{2019}'), Some('\u{201C}'), // 0x90
        Some('\u{201D}'), Some('\u{2022}'), Some('\u{2013}'), Some('\u{2014}'), // 0x94
        Some('\u{02DC}'), Some('\u{2122}'), Some('\u{0161}'), Some('\u{203A}'), // 0x98
        Some('\u{0153}'), None, Some('\u{017E}'), Some('\u{0178}'), // 0x9C
    ];

    /// Mac Roman assignments for bytes `0x80..=0xFF`, indexed by `byte - 0x80`.
    ///
    /// This single table drives both directions, so every character that
    /// `decode` can produce is also accepted by `encode`.
    const MAC_ROMAN_HIGH: [char; 128] = [
        '\u{00C4}', '\u{00C5}', '\u{00C7}', '\u{00C9}', // 0x80
        '\u{00D1}', '\u{00D6}', '\u{00DC}', '\u{00E1}', // 0x84
        '\u{00E0}', '\u{00E2}', '\u{00E4}', '\u{00E3}', // 0x88
        '\u{00E5}', '\u{00E7}', '\u{00E9}', '\u{00E8}', // 0x8C
        '\u{00EA}', '\u{00EB}', '\u{00ED}', '\u{00EC}', // 0x90
        '\u{00EE}', '\u{00EF}', '\u{00F1}', '\u{00F3}', // 0x94
        '\u{00F2}', '\u{00F4}', '\u{00F6}', '\u{00F5}', // 0x98
        '\u{00FA}', '\u{00F9}', '\u{00FB}', '\u{00FC}', // 0x9C
        '\u{2020}', '\u{00B0}', '\u{00A2}', '\u{00A3}', // 0xA0
        '\u{00A7}', '\u{2022}', '\u{00B6}', '\u{00DF}', // 0xA4
        '\u{00AE}', '\u{00A9}', '\u{2122}', '\u{00B4}', // 0xA8
        '\u{00A8}', '\u{2260}', '\u{00C6}', '\u{00D8}', // 0xAC
        '\u{221E}', '\u{00B1}', '\u{2264}', '\u{2265}', // 0xB0
        '\u{00A5}', '\u{00B5}', '\u{2202}', '\u{2211}', // 0xB4
        '\u{220F}', '\u{03C0}', '\u{222B}', '\u{00AA}', // 0xB8
        '\u{00BA}', '\u{03A9}', '\u{00E6}', '\u{00F8}', // 0xBC
        '\u{00BF}', '\u{00A1}', '\u{00AC}', '\u{221A}', // 0xC0
        '\u{0192}', '\u{2248}', '\u{2206}', '\u{00AB}', // 0xC4
        '\u{00BB}', '\u{2026}', '\u{00A0}', '\u{00C0}', // 0xC8
        '\u{00C3}', '\u{00D5}', '\u{0152}', '\u{0153}', // 0xCC
        '\u{2013}', '\u{2014}', '\u{201C}', '\u{201D}', // 0xD0
        '\u{2018}', '\u{2019}', '\u{00F7}', '\u{25CA}', // 0xD4
        '\u{00FF}', '\u{0178}', '\u{2044}', '\u{20AC}', // 0xD8
        '\u{2039}', '\u{203A}', '\u{FB01}', '\u{FB02}', // 0xDC
        '\u{2021}', '\u{00B7}', '\u{201A}', '\u{201E}', // 0xE0
        '\u{2030}', '\u{00C2}', '\u{00CA}', '\u{00C1}', // 0xE4
        '\u{00CB}', '\u{00C8}', '\u{00CD}', '\u{00CE}', // 0xE8
        '\u{00CF}', '\u{00CC}', '\u{00D3}', '\u{00D4}', // 0xEC
        '\u{F8FF}', '\u{00D2}', '\u{00DA}', '\u{00DB}', // 0xF0
        '\u{00D9}', '\u{0131}', '\u{02C6}', '\u{02DC}', // 0xF4
        '\u{00AF}', '\u{02D8}', '\u{02D9}', '\u{02DA}', // 0xF8
        '\u{00B8}', '\u{02DD}', '\u{02DB}', '\u{02C7}', // 0xFC
    ];

    /// Converts `text` into the bytes of this encoding.
    ///
    /// * `StandardEncoding` / `PdfDocEncoding`: the UTF-8 bytes of `text` are
    ///   returned unchanged (no code-page conversion is performed yet).
    /// * `WinAnsiEncoding` / `MacRomanEncoding`: one byte is produced per
    ///   character; a character with no slot in the code page becomes `b'?'`.
    pub fn encode(&self, text: &str) -> Vec<u8> {
        match self {
            TextEncoding::StandardEncoding | TextEncoding::PdfDocEncoding => {
                text.as_bytes().to_vec()
            }
            TextEncoding::WinAnsiEncoding => text
                .chars()
                .map(|ch| Self::win_ansi_byte(ch).unwrap_or(Self::UNMAPPED_BYTE))
                .collect(),
            TextEncoding::MacRomanEncoding => text
                .chars()
                .map(|ch| Self::mac_roman_byte(ch).unwrap_or(Self::UNMAPPED_BYTE))
                .collect(),
        }
    }

    /// Converts bytes of this encoding back into a `String`.
    ///
    /// * `StandardEncoding` / `PdfDocEncoding`: `data` is interpreted as UTF-8;
    ///   invalid sequences are replaced with U+FFFD.
    /// * `WinAnsiEncoding`: bytes the code page leaves undefined decode to `'?'`.
    /// * `MacRomanEncoding`: every byte value has a mapping.
    pub fn decode(&self, data: &[u8]) -> String {
        match self {
            TextEncoding::StandardEncoding | TextEncoding::PdfDocEncoding => {
                String::from_utf8_lossy(data).into_owned()
            }
            TextEncoding::WinAnsiEncoding => {
                data.iter().map(|&byte| Self::win_ansi_char(byte)).collect()
            }
            TextEncoding::MacRomanEncoding => {
                data.iter().map(|&byte| Self::mac_roman_char(byte)).collect()
            }
        }
    }

    /// Returns the Windows-1252 byte for `ch`, or `None` if it has no slot.
    fn win_ansi_byte(ch: char) -> Option<u8> {
        match u32::from(ch) {
            // ASCII and U+00A0..=U+00FF use their code point as the byte
            // value; both ranges fit in a `u8`, so the cast is lossless.
            0x00..=0x7F | 0xA0..=0xFF => Some(ch as u8),
            // Everything else must be one of the 0x80..=0x9F assignments.
            // The table has 32 entries, so `slot` always fits in a `u8`.
            _ => Self::WIN_ANSI_C1
                .iter()
                .position(|&mapped| mapped == Some(ch))
                .map(|slot| 0x80 + slot as u8),
        }
    }

    /// Returns the character for a Windows-1252 byte (`'?'` when undefined).
    fn win_ansi_char(byte: u8) -> char {
        match byte {
            0x80..=0x9F => {
                Self::WIN_ANSI_C1[usize::from(byte - 0x80)].unwrap_or(Self::UNMAPPED_CHAR)
            }
            // ASCII and 0xA0..=0xFF coincide with the first 256 code points.
            _ => char::from(byte),
        }
    }

    /// Returns the Mac Roman byte for `ch`, or `None` if it has no slot.
    fn mac_roman_byte(ch: char) -> Option<u8> {
        if ch.is_ascii() {
            // Guarded by `is_ascii`, so the cast cannot truncate.
            return Some(ch as u8);
        }
        // Linear scan of 128 entries; `slot` is below 128, so
        // `0x80 + slot` never exceeds 0xFF.
        Self::MAC_ROMAN_HIGH
            .iter()
            .position(|&mapped| mapped == ch)
            .map(|slot| 0x80 + slot as u8)
    }

    /// Returns the character for a Mac Roman byte.
    fn mac_roman_char(byte: u8) -> char {
        match byte.checked_sub(0x80) {
            Some(slot) => Self::MAC_ROMAN_HIGH[usize::from(slot)],
            // Bytes below 0x80 are plain ASCII.
            None => char::from(byte),
        }
    }
}
318
#[cfg(test)]
mod tests {
    // Unit tests for `TextEncoding::encode` / `TextEncoding::decode`.
    //
    // Non-ASCII fixtures are spelled as `\u{…}` escapes (or generated from
    // code points) so the expectations stay exact even if the file's own
    // encoding is damaged in transit.
    use super::*;

    /// Every variant, for checks that must hold regardless of encoding.
    const ALL_ENCODINGS: [TextEncoding; 4] = [
        TextEncoding::StandardEncoding,
        TextEncoding::MacRomanEncoding,
        TextEncoding::WinAnsiEncoding,
        TextEncoding::PdfDocEncoding,
    ];

    /// Asserts that `text` is unchanged after `encode` followed by `decode`.
    fn assert_round_trip(encoding: TextEncoding, text: &str) {
        let encoded = encoding.encode(text);
        let decoded = encoding.decode(&encoded);
        assert_eq!(decoded, text, "Round trip failed for {encoding:?}");
    }

    /// Asserts that each string encodes to exactly the one expected byte.
    fn assert_encodes_to(encoding: TextEncoding, cases: &[(&str, u8)]) {
        for &(text, expected_byte) in cases {
            assert_eq!(
                encoding.encode(text),
                vec![expected_byte],
                "Failed encoding {text} (U+{:04X})",
                text.chars().next().map_or(0, u32::from)
            );
        }
    }

    /// Asserts that each byte decodes to a string starting with the expected character.
    fn assert_decodes_to(encoding: TextEncoding, cases: &[(u8, char)]) {
        for &(byte, expected_char) in cases {
            let decoded = encoding.decode(&[byte]);
            assert_eq!(
                decoded.chars().next(),
                Some(expected_char),
                "Failed decoding byte 0x{byte:02X}"
            );
        }
    }

    /// Asserts both directions: `text` encodes to one byte and decodes back to `text`.
    fn assert_single_byte_round_trip(encoding: TextEncoding, cases: &[(&str, u8)]) {
        for &(text, expected_byte) in cases {
            let encoded = encoding.encode(text);
            assert_eq!(encoded, vec![expected_byte], "Failed for character {text}");

            let decoded = encoding.decode(&encoded);
            assert_eq!(decoded, text, "Round trip failed for character {text}");
        }
    }

    #[test]
    fn test_text_encoding_variants() {
        for encoding in ALL_ENCODINGS.iter() {
            let copy = *encoding;
            assert_eq!(*encoding, copy);
        }

        assert_ne!(
            TextEncoding::StandardEncoding,
            TextEncoding::WinAnsiEncoding
        );
    }

    #[test]
    fn test_standard_encoding_basic_ascii() {
        assert_round_trip(TextEncoding::StandardEncoding, "Hello World!");
    }

    #[test]
    fn test_win_ansi_encoding_special_chars() {
        let encoding = TextEncoding::WinAnsiEncoding;

        // Euro sign followed by ASCII digits.
        let text = "\u{20AC}100";
        let encoded = encoding.encode(text);
        assert_eq!(encoded[0], 0x80);
        assert_eq!(encoding.decode(&encoded), text);

        // Em dash embedded in ASCII text.
        assert_round_trip(encoding, "Hello\u{2014}World");
    }

    #[test]
    fn test_mac_roman_encoding_special_chars() {
        let encoding = TextEncoding::MacRomanEncoding;

        // "café": e with acute is byte 0x8E.
        let text = "caf\u{00E9}";
        let encoded = encoding.encode(text);
        assert_eq!(encoded[3], 0x8E);
        assert_eq!(encoding.decode(&encoded), text);

        // Apple logo (private-use code point, Mac-specific).
        assert_eq!(encoding.decode(&[0xF0]), "\u{F8FF}");

        // "Zürich": u with diaeresis is byte 0x9F.
        let text2 = "Z\u{00FC}rich";
        let encoded2 = encoding.encode(text2);
        assert_eq!(encoded2[1], 0x9F);
        assert_eq!(encoding.decode(&encoded2), text2);
    }

    #[test]
    fn test_pdf_doc_encoding() {
        assert_round_trip(TextEncoding::PdfDocEncoding, "PDF Document");
    }

    #[test]
    fn test_pdf_doc_encoding_basic_ascii() {
        assert_round_trip(TextEncoding::PdfDocEncoding, "Hello World!");
    }

    #[test]
    fn test_mac_roman_encoding_basic_ascii() {
        assert_round_trip(TextEncoding::MacRomanEncoding, "Hello World!");
    }

    #[test]
    fn test_win_ansi_encoding_basic_ascii() {
        assert_round_trip(TextEncoding::WinAnsiEncoding, "Hello World!");
    }

    #[test]
    fn test_win_ansi_encoding_special_characters() {
        assert_single_byte_round_trip(
            TextEncoding::WinAnsiEncoding,
            &[
                ("\u{20AC}", 0x80), // Euro sign
                ("\u{2014}", 0x97), // Em dash
                ("\u{201A}", 0x82), // Single low quotation mark
            ],
        );
    }

    #[test]
    fn test_win_ansi_encoding_latin_supplement() {
        // "café"
        assert_round_trip(TextEncoding::WinAnsiEncoding, "caf\u{00E9}");
    }

    #[test]
    fn test_win_ansi_encoding_unmapped_character() {
        let encoding = TextEncoding::WinAnsiEncoding;

        // U+2764 (heavy black heart) is not in Windows-1252.
        let encoded = encoding.encode("\u{2764}");
        assert_eq!(encoded, vec![b'?']); // Should be replaced with ?

        let decoded = encoding.decode(&encoded);
        assert_eq!(decoded, "?");
    }

    #[test]
    fn test_win_ansi_encoding_round_trip_special_chars() {
        assert_single_byte_round_trip(
            TextEncoding::WinAnsiEncoding,
            &[
                ("\u{20AC}", 0x80), // Euro sign
                ("\u{201A}", 0x82), // Single low quotation mark
                ("\u{0192}", 0x83), // Latin small letter f with hook
                ("\u{201E}", 0x84), // Double low quotation mark
                ("\u{2026}", 0x85), // Horizontal ellipsis
                ("\u{2020}", 0x86), // Dagger
                ("\u{2021}", 0x87), // Double dagger
                ("\u{2030}", 0x89), // Per mille sign
                ("\u{2018}", 0x91), // Left single quotation mark
                ("\u{2019}", 0x92), // Right single quotation mark
                ("\u{201C}", 0x93), // Left double quotation mark
                ("\u{201D}", 0x94), // Right double quotation mark
                ("\u{2022}", 0x95), // Bullet
                ("\u{2013}", 0x96), // En dash
                ("\u{2014}", 0x97), // Em dash
                ("\u{2122}", 0x99), // Trade mark sign
            ],
        );
    }

    #[test]
    fn test_encoding_equality() {
        assert_eq!(
            TextEncoding::StandardEncoding,
            TextEncoding::StandardEncoding
        );
        assert_eq!(TextEncoding::WinAnsiEncoding, TextEncoding::WinAnsiEncoding);

        assert_ne!(
            TextEncoding::StandardEncoding,
            TextEncoding::WinAnsiEncoding
        );
        assert_ne!(TextEncoding::MacRomanEncoding, TextEncoding::PdfDocEncoding);
    }

    #[test]
    fn test_encoding_debug() {
        let encoding = TextEncoding::WinAnsiEncoding;
        let debug_str = format!("{encoding:?}");
        assert_eq!(debug_str, "WinAnsiEncoding");
    }

    #[test]
    fn test_encoding_clone() {
        let encoding1 = TextEncoding::PdfDocEncoding;
        let encoding2 = encoding1;
        assert_eq!(encoding1, encoding2);
    }

    #[test]
    fn test_encoding_copy() {
        let encoding1 = TextEncoding::StandardEncoding;
        let encoding2 = encoding1; // Copy semantics
        assert_eq!(encoding1, encoding2);

        // Both variables should still be usable
        assert_eq!(encoding1, TextEncoding::StandardEncoding);
        assert_eq!(encoding2, TextEncoding::StandardEncoding);
    }

    #[test]
    fn test_empty_string_encoding() {
        for encoding in ALL_ENCODINGS.iter() {
            assert!(encoding.encode("").is_empty());
            assert!(encoding.decode(&[]).is_empty());
        }
    }

    #[test]
    fn test_win_ansi_decode_undefined_bytes() {
        let encoding = TextEncoding::WinAnsiEncoding;

        // Bytes that have no Windows-1252 assignment.
        for &byte in &[0x81u8, 0x8D, 0x8F, 0x90, 0x9D] {
            let decoded = encoding.decode(&[byte]);
            assert_eq!(
                decoded, "?",
                "Undefined byte 0x{byte:02X} should decode to '?'"
            );
        }
    }

    #[test]
    fn test_win_ansi_ascii_range() {
        let encoding = TextEncoding::WinAnsiEncoding;

        // Printable ASCII (0x20-0x7E) must map byte-for-byte.
        for byte in 0x20u8..=0x7E {
            let text = char::from(byte).to_string();
            let encoded = encoding.encode(&text);
            assert_eq!(encoded, vec![byte]);

            let decoded = encoding.decode(&encoded);
            assert_eq!(decoded, text);
        }
    }

    #[test]
    fn test_win_ansi_latin1_overlap() {
        let encoding = TextEncoding::WinAnsiEncoding;

        // Latin-1 range that overlaps with Windows-1252 (0xA0-0xFF).
        // The input is built from code points rather than a literal: the
        // previous literal contained a Hebrew letter where U+00D7/U+00D8
        // belonged, which cannot round-trip through Windows-1252.
        let expected_bytes: Vec<u8> = (0xA0..=0xFF).collect();
        let test_chars: String = expected_bytes.iter().map(|&b| char::from(b)).collect();

        let encoded = encoding.encode(&test_chars);
        assert_eq!(encoded, expected_bytes);

        let decoded = encoding.decode(&encoded);
        assert_eq!(decoded, test_chars);
    }

    #[test]
    fn test_mac_roman_encode_special_characters() {
        // Accented letters occupying Mac Roman bytes 0x80-0x9F.
        assert_encodes_to(
            TextEncoding::MacRomanEncoding,
            &[
                ("\u{00C4}", 0x80), // A with diaeresis
                ("\u{00C5}", 0x81), // A with ring
                ("\u{00C7}", 0x82), // C with cedilla
                ("\u{00C9}", 0x83), // E with acute
                ("\u{00D1}", 0x84), // N with tilde
                ("\u{00D6}", 0x85), // O with diaeresis
                ("\u{00DC}", 0x86), // U with diaeresis
                ("\u{00E1}", 0x87), // a with acute
                ("\u{00E0}", 0x88), // a with grave
                ("\u{00E2}", 0x89), // a with circumflex
                ("\u{00E4}", 0x8A), // a with diaeresis
                ("\u{00E3}", 0x8B), // a with tilde
                ("\u{00E5}", 0x8C), // a with ring
                ("\u{00E7}", 0x8D), // c with cedilla
                ("\u{00E9}", 0x8E), // e with acute
                ("\u{00E8}", 0x8F), // e with grave
                ("\u{00EA}", 0x90), // e with circumflex
                ("\u{00EB}", 0x91), // e with diaeresis
                ("\u{00ED}", 0x92), // i with acute
                ("\u{00EC}", 0x93), // i with grave
                ("\u{00EE}", 0x94), // i with circumflex
                ("\u{00EF}", 0x95), // i with diaeresis
                ("\u{00F1}", 0x96), // n with tilde
                ("\u{00F3}", 0x97), // o with acute
                ("\u{00F2}", 0x98), // o with grave
                ("\u{00F4}", 0x99), // o with circumflex
                ("\u{00F6}", 0x9A), // o with diaeresis
                ("\u{00F5}", 0x9B), // o with tilde
                ("\u{00FA}", 0x9C), // u with acute
                ("\u{00F9}", 0x9D), // u with grave
                ("\u{00FB}", 0x9E), // u with circumflex
                ("\u{00FC}", 0x9F), // u with diaeresis
            ],
        );
    }

    #[test]
    fn test_mac_roman_encode_symbols() {
        // Symbols occupying Mac Roman bytes 0xA0-0xAF.
        assert_encodes_to(
            TextEncoding::MacRomanEncoding,
            &[
                ("\u{2020}", 0xA0), // Dagger
                ("\u{00B0}", 0xA1), // Degree sign
                ("\u{00A2}", 0xA2), // Cent sign
                ("\u{00A3}", 0xA3), // Pound sign
                ("\u{00A7}", 0xA4), // Section sign
                ("\u{2022}", 0xA5), // Bullet
                ("\u{00B6}", 0xA6), // Pilcrow sign
                ("\u{00DF}", 0xA7), // Sharp s
                ("\u{00AE}", 0xA8), // Registered sign
                ("\u{00A9}", 0xA9), // Copyright sign
                ("\u{2122}", 0xAA), // Trade mark sign
                ("\u{00B4}", 0xAB), // Acute accent
                ("\u{00A8}", 0xAC), // Diaeresis
                ("\u{2260}", 0xAD), // Not equal to
                ("\u{00C6}", 0xAE), // AE ligature
                ("\u{00D8}", 0xAF), // O with stroke
            ],
        );
    }

    #[test]
    fn test_mac_roman_decode_extended_range() {
        // Extended range, bytes 0xB0-0xCF.
        assert_decodes_to(
            TextEncoding::MacRomanEncoding,
            &[
                (0xB0, '\u{221E}'), // Infinity
                (0xB1, '\u{00B1}'), // Plus-minus
                (0xB2, '\u{2264}'), // Less-than or equal
                (0xB3, '\u{2265}'), // Greater-than or equal
                (0xB4, '\u{00A5}'), // Yen sign
                (0xB5, '\u{00B5}'), // Micro sign
                (0xB6, '\u{2202}'), // Partial differential
                (0xB7, '\u{2211}'), // Summation
                (0xB8, '\u{220F}'), // Product
                (0xB9, '\u{03C0}'), // Pi
                (0xBA, '\u{222B}'), // Integral
                (0xBB, '\u{00AA}'), // Feminine ordinal
                (0xBC, '\u{00BA}'), // Masculine ordinal
                (0xBD, '\u{03A9}'), // Omega
                (0xBE, '\u{00E6}'), // ae ligature
                (0xBF, '\u{00F8}'), // o with stroke
                (0xC0, '\u{00BF}'), // Inverted question mark
                (0xC1, '\u{00A1}'), // Inverted exclamation
                (0xC2, '\u{00AC}'), // Not sign
                (0xC3, '\u{221A}'), // Square root
                (0xC4, '\u{0192}'), // f with hook
                (0xC5, '\u{2248}'), // Almost equal
                (0xC6, '\u{2206}'), // Increment
                (0xC7, '\u{00AB}'), // Left double angle quote
                (0xC8, '\u{00BB}'), // Right double angle quote
                (0xC9, '\u{2026}'), // Horizontal ellipsis
                (0xCA, '\u{00A0}'), // No-break space
                (0xCB, '\u{00C0}'), // A with grave
                (0xCC, '\u{00C3}'), // A with tilde
                (0xCD, '\u{00D5}'), // O with tilde
                (0xCE, '\u{0152}'), // OE ligature
                (0xCF, '\u{0153}'), // oe ligature
            ],
        );
    }

    #[test]
    fn test_mac_roman_decode_high_range() {
        // Bytes 0xD0-0xEF.
        assert_decodes_to(
            TextEncoding::MacRomanEncoding,
            &[
                (0xD0, '\u{2013}'), // En dash
                (0xD1, '\u{2014}'), // Em dash
                (0xD2, '\u{201C}'), // Left double quote
                (0xD3, '\u{201D}'), // Right double quote
                (0xD4, '\u{2018}'), // Left single quote
                (0xD5, '\u{2019}'), // Right single quote
                (0xD6, '\u{00F7}'), // Division sign
                (0xD7, '\u{25CA}'), // Lozenge
                (0xD8, '\u{00FF}'), // y with diaeresis
                (0xD9, '\u{0178}'), // Y with diaeresis
                (0xDA, '\u{2044}'), // Fraction slash
                (0xDB, '\u{20AC}'), // Euro sign
                (0xDC, '\u{2039}'), // Single left angle quote
                (0xDD, '\u{203A}'), // Single right angle quote
                (0xDE, '\u{FB01}'), // fi ligature
                (0xDF, '\u{FB02}'), // fl ligature
                (0xE0, '\u{2021}'), // Double dagger
                (0xE1, '\u{00B7}'), // Middle dot
                (0xE2, '\u{201A}'), // Single low quote
                (0xE3, '\u{201E}'), // Double low quote
                (0xE4, '\u{2030}'), // Per mille sign
                (0xE5, '\u{00C2}'), // A with circumflex
                (0xE6, '\u{00CA}'), // E with circumflex
                (0xE7, '\u{00C1}'), // A with acute
                (0xE8, '\u{00CB}'), // E with diaeresis
                (0xE9, '\u{00C8}'), // E with grave
                (0xEA, '\u{00CD}'), // I with acute
                (0xEB, '\u{00CE}'), // I with circumflex
                (0xEC, '\u{00CF}'), // I with diaeresis
                (0xED, '\u{00CC}'), // I with grave
                (0xEE, '\u{00D3}'), // O with acute
                (0xEF, '\u{00D4}'), // O with circumflex
            ],
        );
    }

    #[test]
    fn test_mac_roman_decode_final_range() {
        // Bytes 0xF0-0xFF.
        assert_decodes_to(
            TextEncoding::MacRomanEncoding,
            &[
                (0xF0, '\u{F8FF}'), // Apple logo
                (0xF1, '\u{00D2}'), // O with grave
                (0xF2, '\u{00DA}'), // U with acute
                (0xF3, '\u{00DB}'), // U with circumflex
                (0xF4, '\u{00D9}'), // U with grave
                (0xF5, '\u{0131}'), // Dotless i
                (0xF6, '\u{02C6}'), // Circumflex modifier
                (0xF7, '\u{02DC}'), // Small tilde
                (0xF8, '\u{00AF}'), // Macron
                (0xF9, '\u{02D8}'), // Breve
                (0xFA, '\u{02D9}'), // Dot above
                (0xFB, '\u{02DA}'), // Ring above
                (0xFC, '\u{00B8}'), // Cedilla
                (0xFD, '\u{02DD}'), // Double acute
                (0xFE, '\u{02DB}'), // Ogonek
                (0xFF, '\u{02C7}'), // Caron
            ],
        );
    }

    #[test]
    fn test_mac_roman_unmapped_character() {
        // U+2764 (heavy black heart) is not in Mac Roman.
        let encoded = TextEncoding::MacRomanEncoding.encode("\u{2764}");
        assert_eq!(encoded, vec![b'?']);
    }

    #[test]
    fn test_win_ansi_encode_all_special_mappings() {
        assert_encodes_to(
            TextEncoding::WinAnsiEncoding,
            &[
                ("\u{0160}", 0x8A), // S with caron
                ("\u{0152}", 0x8C), // OE ligature
                ("\u{017D}", 0x8E), // Z with caron
                ("\u{0161}", 0x9A), // s with caron
                ("\u{0153}", 0x9C), // oe ligature
                ("\u{017E}", 0x9E), // z with caron
                ("\u{0178}", 0x9F), // Y with diaeresis
                ("\u{02C6}", 0x88), // Circumflex
                ("\u{02DC}", 0x98), // Small tilde
                ("\u{2039}", 0x8B), // Single left angle quote
                ("\u{203A}", 0x9B), // Single right angle quote
            ],
        );
    }

    #[test]
    fn test_long_text_encoding_roundtrip() {
        let long_text = "The quick brown fox jumps over the lazy dog. 0123456789!@#$%^&*()";

        for encoding in ALL_ENCODINGS.iter() {
            let encoded = encoding.encode(long_text);
            let decoded = encoding.decode(&encoded);
            assert_eq!(decoded, long_text, "Failed for {encoding:?}");
        }
    }

    #[test]
    fn test_win_ansi_decode_all_special_bytes() {
        // Every defined byte in the 0x80-0x9F block.
        assert_decodes_to(
            TextEncoding::WinAnsiEncoding,
            &[
                (0x80, '\u{20AC}'), // Euro sign
                (0x82, '\u{201A}'), // Single low quotation mark
                (0x83, '\u{0192}'), // f with hook
                (0x84, '\u{201E}'), // Double low quotation mark
                (0x85, '\u{2026}'), // Horizontal ellipsis
                (0x86, '\u{2020}'), // Dagger
                (0x87, '\u{2021}'), // Double dagger
                (0x88, '\u{02C6}'), // Circumflex accent
                (0x89, '\u{2030}'), // Per mille sign
                (0x8A, '\u{0160}'), // S with caron
                (0x8B, '\u{2039}'), // Single left angle quote
                (0x8C, '\u{0152}'), // OE ligature
                (0x8E, '\u{017D}'), // Z with caron
                (0x91, '\u{2018}'), // Left single quotation mark
                (0x92, '\u{2019}'), // Right single quotation mark
                (0x93, '\u{201C}'), // Left double quotation mark
                (0x94, '\u{201D}'), // Right double quotation mark
                (0x95, '\u{2022}'), // Bullet
                (0x96, '\u{2013}'), // En dash
                (0x97, '\u{2014}'), // Em dash
                (0x98, '\u{02DC}'), // Small tilde
                (0x99, '\u{2122}'), // Trade mark sign
                (0x9A, '\u{0161}'), // s with caron
                (0x9B, '\u{203A}'), // Single right angle quote
                (0x9C, '\u{0153}'), // oe ligature
                (0x9E, '\u{017E}'), // z with caron
                (0x9F, '\u{0178}'), // Y with diaeresis
            ],
        );
    }
}