copybook_charset/
lib.rs

1#![cfg_attr(not(test), deny(clippy::unwrap_used, clippy::expect_used))]
2// SPDX-License-Identifier: AGPL-3.0-or-later
3//! Character set conversion utilities for EBCDIC ↔ UTF-8.
4//!
5//! Provides bidirectional conversion between five IBM EBCDIC codepages and
6//! Unicode/UTF-8 using compile-time lookup tables for maximum throughput.
7//!
8//! ## Supported Codepages
9//!
10//! | Codepage | Region |
11//! |----------|--------|
12//! | CP 037 | US / Canada |
13//! | CP 273 | Germany / Austria |
14//! | CP 500 | International Latin-1 |
15//! | CP 1047 | Latin-1 / Open Systems |
16//! | CP 1140 | US / Canada with € |
17//!
18//! ## Key Functions
19//!
20//! - [`ebcdic_to_utf8`] — Decode an EBCDIC byte slice to a Rust `String`.
21//! - [`utf8_to_ebcdic`] — Encode a UTF-8 string into an EBCDIC byte vector.
22//!
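//! [`ebcdic_to_utf8`] accepts an [`UnmappablePolicy`] that controls behaviour when a
//! byte has no acceptable mapping (return an error, substitute U+FFFD, or skip the
//! byte). [`utf8_to_ebcdic`] takes no policy and always returns an error for a
//! character that the target codepage cannot represent.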
26
27pub use copybook_codepage::{Codepage, UnmappablePolicy, get_zoned_sign_table, space_byte};
28use copybook_error::{Error, ErrorCode, Result};
29use std::convert::TryFrom;
30use tracing::warn;
31
32// ============================================================================
33// Charset conversion code (moved from copybook-codec/src/charset.rs)
34// ============================================================================
35
36// EBCDIC to Unicode lookup tables for supported code pages
37// Each table maps EBCDIC byte values (0-255) to Unicode code points
38
/// EBCDIC Code Page 037 (US/Canada) to Unicode lookup table.
///
/// Indexed by the EBCDIC byte value; each entry is the Unicode code point
/// that byte decodes to. One row covers eight consecutive byte values.
static CP037_TO_UNICODE: [u32; 256] = [
    0x0000, 0x0001, 0x0002, 0x0003, 0x009C, 0x0009, 0x0086, 0x007F, // 00-07
    0x0097, 0x008D, 0x008E, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, // 08-0F
    0x0010, 0x0011, 0x0012, 0x0013, 0x009D, 0x0085, 0x0008, 0x0087, // 10-17
    0x0018, 0x0019, 0x0092, 0x008F, 0x001C, 0x001D, 0x001E, 0x001F, // 18-1F
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x000A, 0x0017, 0x001B, // 20-27
    0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x0005, 0x0006, 0x0007, // 28-2F
    0x0090, 0x0091, 0x0016, 0x0093, 0x0094, 0x0095, 0x0096, 0x0004, // 30-37
    0x0098, 0x0099, 0x009A, 0x009B, 0x0014, 0x0015, 0x009E, 0x001A, // 38-3F
    0x0020, 0x00A0, 0x00E2, 0x00E4, 0x00E0, 0x00E1, 0x00E3, 0x00E5, // 40-47: SP NBSP â ä à á ã å
    0x00E7, 0x00F1, 0x00A2, 0x002E, 0x003C, 0x0028, 0x002B, 0x007C, // 48-4F: ç ñ ¢ . < ( + |
    0x0026, 0x00E9, 0x00EA, 0x00EB, 0x00E8, 0x00ED, 0x00EE, 0x00EF, // 50-57: & é ê ë è í î ï
    0x00EC, 0x00DF, 0x0021, 0x0024, 0x002A, 0x0029, 0x003B, 0x00AC, // 58-5F: ì ß ! $ * ) ; ¬
    0x002D, 0x002F, 0x00C2, 0x00C4, 0x00C0, 0x00C1, 0x00C3, 0x00C5, // 60-67: - / Â Ä À Á Ã Å
    0x00C7, 0x00D1, 0x00A6, 0x002C, 0x0025, 0x005F, 0x003E, 0x003F, // 68-6F: Ç Ñ ¦ , % _ > ?
    0x00F8, 0x00C9, 0x00CA, 0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, // 70-77: ø É Ê Ë È Í Î Ï
    0x00CC, 0x0060, 0x003A, 0x0023, 0x0040, 0x0027, 0x003D, 0x0022, // 78-7F: Ì ` : # @ ' = "
    0x00D8, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, // 80-87: Ø a-g
    0x0068, 0x0069, 0x00AB, 0x00BB, 0x00F0, 0x00FD, 0x00FE, 0x00B1, // 88-8F: h i « » ð ý þ ±
    0x00B0, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, 0x0070, // 90-97: ° j-p
    0x0071, 0x0072, 0x00AA, 0x00BA, 0x00E6, 0x00B8, 0x00C6, 0x00A4, // 98-9F: q r ª º æ ¸ Æ ¤
    0x00B5, 0x007E, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, // A0-A7: µ ~ s-x
    0x0079, 0x007A, 0x00A1, 0x00BF, 0x00D0, 0x00DD, 0x00DE, 0x00AE, // A8-AF: y z ¡ ¿ Ð Ý Þ ®
    0x005E, 0x00A3, 0x00A5, 0x00B7, 0x00A9, 0x00A7, 0x00B6, 0x00BC, // B0-B7: ^ £ ¥ · © § ¶ ¼
    0x00BD, 0x00BE, 0x005B, 0x005D, 0x00AF, 0x00A8, 0x00B4, 0x00D7, // B8-BF: ½ ¾ [ ] ¯ ¨ ´ ×
    0x007B, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, // C0-C7: { A-G
    0x0048, 0x0049, 0x00AD, 0x00F4, 0x00F6, 0x00F2, 0x00F3, 0x00F5, // C8-CF: H I SHY ô ö ò ó õ
    0x007D, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, // D0-D7: } J-P
    0x0051, 0x0052, 0x00B9, 0x00FB, 0x00FC, 0x00F9, 0x00FA, 0x00FF, // D8-DF: Q R ¹ û ü ù ú ÿ
    0x005C, 0x00F7, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, // E0-E7: \ ÷ S-X
    0x0059, 0x005A, 0x00B2, 0x00D4, 0x00D6, 0x00D2, 0x00D3, 0x00D5, // E8-EF: Y Z ² Ô Ö Ò Ó Õ
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, // F0-F7: 0-7
    0x0038, 0x0039, 0x00B3, 0x00DB, 0x00DC, 0x00D9, 0x00DA, 0x009F, // F8-FF: 8 9 ³ Û Ü Ù Ú control
];
91
/// EBCDIC Code Page 273 (Germany/Austria) to Unicode lookup table.
///
/// Indexed by the EBCDIC byte value; each entry is the Unicode code point
/// that byte decodes to. One row covers eight consecutive byte values.
///
/// Per IBM CCSID 273, `~` sits at 0x59 and `ß` at 0xA1 (the reverse of CP037).
static CP273_TO_UNICODE: [u32; 256] = [
    0x0000, 0x0001, 0x0002, 0x0003, 0x009C, 0x0009, 0x0086, 0x007F, // 00-07
    0x0097, 0x008D, 0x008E, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, // 08-0F
    0x0010, 0x0011, 0x0012, 0x0013, 0x009D, 0x0085, 0x0008, 0x0087, // 10-17
    0x0018, 0x0019, 0x0092, 0x008F, 0x001C, 0x001D, 0x001E, 0x001F, // 18-1F
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x000A, 0x0017, 0x001B, // 20-27
    0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x0005, 0x0006, 0x0007, // 28-2F
    0x0090, 0x0091, 0x0016, 0x0093, 0x0094, 0x0095, 0x0096, 0x0004, // 30-37
    0x0098, 0x0099, 0x009A, 0x009B, 0x0014, 0x0015, 0x009E, 0x001A, // 38-3F
    0x0020, 0x00A0, 0x00E2, 0x007B, 0x00E0, 0x00E1, 0x00E3, 0x00E5, // 40-47: SP NBSP â { à á ã å
    0x00E7, 0x00F1, 0x00C4, 0x002E, 0x003C, 0x0028, 0x002B, 0x0021, // 48-4F: ç ñ Ä . < ( + !
    0x0026, 0x00E9, 0x00EA, 0x00EB, 0x00E8, 0x00ED, 0x00EE, 0x00EF, // 50-57: & é ê ë è í î ï
    0x00EC, 0x007E, 0x00DC, 0x0024, 0x002A, 0x0029, 0x003B, 0x005E, // 58-5F: ì ~ Ü $ * ) ; ^
    0x002D, 0x002F, 0x00C2, 0x005B, 0x00C0, 0x00C1, 0x00C3, 0x00C5, // 60-67: - / Â [ À Á Ã Å
    0x00C7, 0x00D1, 0x00F6, 0x002C, 0x0025, 0x005F, 0x003E, 0x003F, // 68-6F: Ç Ñ ö , % _ > ?
    0x00F8, 0x00C9, 0x00CA, 0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, // 70-77: ø É Ê Ë È Í Î Ï
    0x00CC, 0x0060, 0x003A, 0x0023, 0x00A7, 0x0027, 0x003D, 0x0022, // 78-7F: Ì ` : # § ' = "
    0x00D8, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, // 80-87: Ø a-g
    0x0068, 0x0069, 0x00AB, 0x00BB, 0x00F0, 0x00FD, 0x00FE, 0x00B1, // 88-8F: h i « » ð ý þ ±
    0x00B0, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, 0x0070, // 90-97: ° j-p
    0x0071, 0x0072, 0x00AA, 0x00BA, 0x00E6, 0x00B8, 0x00C6, 0x00A4, // 98-9F: q r ª º æ ¸ Æ ¤
    0x00B5, 0x00DF, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, // A0-A7: µ ß s-x
    0x0079, 0x007A, 0x00A1, 0x00BF, 0x00D0, 0x00DD, 0x00DE, 0x00AE, // A8-AF: y z ¡ ¿ Ð Ý Þ ®
    0x00A2, 0x00A3, 0x00A5, 0x00B7, 0x00A9, 0x0040, 0x00B6, 0x00BC, // B0-B7: ¢ £ ¥ · © @ ¶ ¼
    0x00BD, 0x00BE, 0x00AC, 0x007C, 0x00AF, 0x00A8, 0x00B4, 0x00D7, // B8-BF: ½ ¾ ¬ | ¯ ¨ ´ ×
    0x00E4, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, // C0-C7: ä A-G
    0x0048, 0x0049, 0x00AD, 0x00F4, 0x00A6, 0x00F2, 0x00F3, 0x00F5, // C8-CF: H I SHY ô ¦ ò ó õ
    0x00FC, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, // D0-D7: ü J-P
    0x0051, 0x0052, 0x00B9, 0x00FB, 0x007D, 0x00F9, 0x00FA, 0x00FF, // D8-DF: Q R ¹ û } ù ú ÿ
    0x00D6, 0x00F7, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, // E0-E7: Ö ÷ S-X
    0x0059, 0x005A, 0x00B2, 0x00D4, 0x005C, 0x00D2, 0x00D3, 0x00D5, // E8-EF: Y Z ² Ô \ Ò Ó Õ
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, // F0-F7: 0-7
    0x0038, 0x0039, 0x00B3, 0x00DB, 0x005D, 0x00D9, 0x00DA, 0x009F, // F8-FF: 8 9 ³ Û ] Ù Ú control
];
144
/// EBCDIC Code Page 500 (International) to Unicode lookup table.
///
/// Indexed by the EBCDIC byte value; each entry is the Unicode code point
/// that byte decodes to. One row covers eight consecutive byte values.
static CP500_TO_UNICODE: [u32; 256] = [
    0x0000, 0x0001, 0x0002, 0x0003, 0x009C, 0x0009, 0x0086, 0x007F, // 00-07
    0x0097, 0x008D, 0x008E, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, // 08-0F
    0x0010, 0x0011, 0x0012, 0x0013, 0x009D, 0x0085, 0x0008, 0x0087, // 10-17
    0x0018, 0x0019, 0x0092, 0x008F, 0x001C, 0x001D, 0x001E, 0x001F, // 18-1F
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x000A, 0x0017, 0x001B, // 20-27
    0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x0005, 0x0006, 0x0007, // 28-2F
    0x0090, 0x0091, 0x0016, 0x0093, 0x0094, 0x0095, 0x0096, 0x0004, // 30-37
    0x0098, 0x0099, 0x009A, 0x009B, 0x0014, 0x0015, 0x009E, 0x001A, // 38-3F
    0x0020, 0x00A0, 0x00E2, 0x00E4, 0x00E0, 0x00E1, 0x00E3, 0x00E5, // 40-47: SP NBSP â ä à á ã å
    0x00E7, 0x00F1, 0x005B, 0x002E, 0x003C, 0x0028, 0x002B, 0x0021, // 48-4F: ç ñ [ . < ( + !
    0x0026, 0x00E9, 0x00EA, 0x00EB, 0x00E8, 0x00ED, 0x00EE, 0x00EF, // 50-57: & é ê ë è í î ï
    0x00EC, 0x00DF, 0x005D, 0x0024, 0x002A, 0x0029, 0x003B, 0x005E, // 58-5F: ì ß ] $ * ) ; ^
    0x002D, 0x002F, 0x00C2, 0x00C4, 0x00C0, 0x00C1, 0x00C3, 0x00C5, // 60-67: - / Â Ä À Á Ã Å
    0x00C7, 0x00D1, 0x00A6, 0x002C, 0x0025, 0x005F, 0x003E, 0x003F, // 68-6F: Ç Ñ ¦ , % _ > ?
    0x00F8, 0x00C9, 0x00CA, 0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, // 70-77: ø É Ê Ë È Í Î Ï
    0x00CC, 0x0060, 0x003A, 0x0023, 0x0040, 0x0027, 0x003D, 0x0022, // 78-7F: Ì ` : # @ ' = "
    0x00D8, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, // 80-87: Ø a-g
    0x0068, 0x0069, 0x00AB, 0x00BB, 0x00F0, 0x00FD, 0x00FE, 0x00B1, // 88-8F: h i « » ð ý þ ±
    0x00B0, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, 0x0070, // 90-97: ° j-p
    0x0071, 0x0072, 0x00AA, 0x00BA, 0x00E6, 0x00B8, 0x00C6, 0x00A4, // 98-9F: q r ª º æ ¸ Æ ¤
    0x00B5, 0x007E, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, // A0-A7: µ ~ s-x
    0x0079, 0x007A, 0x00A1, 0x00BF, 0x00D0, 0x00DD, 0x00DE, 0x00AE, // A8-AF: y z ¡ ¿ Ð Ý Þ ®
    0x00A2, 0x00A3, 0x00A5, 0x00B7, 0x00A9, 0x00A7, 0x00B6, 0x00BC, // B0-B7: ¢ £ ¥ · © § ¶ ¼
    0x00BD, 0x00BE, 0x00AC, 0x007C, 0x00AF, 0x00A8, 0x00B4, 0x00D7, // B8-BF: ½ ¾ ¬ | ¯ ¨ ´ ×
    0x007B, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, // C0-C7: { A-G
    0x0048, 0x0049, 0x00AD, 0x00F4, 0x00F6, 0x00F2, 0x00F3, 0x00F5, // C8-CF: H I SHY ô ö ò ó õ
    0x007D, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, // D0-D7: } J-P
    0x0051, 0x0052, 0x00B9, 0x00FB, 0x00FC, 0x00F9, 0x00FA, 0x00FF, // D8-DF: Q R ¹ û ü ù ú ÿ
    0x005C, 0x00F7, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, // E0-E7: \ ÷ S-X
    0x0059, 0x005A, 0x00B2, 0x00D4, 0x00D6, 0x00D2, 0x00D3, 0x00D5, // E8-EF: Y Z ² Ô Ö Ò Ó Õ
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, // F0-F7: 0-7
    0x0038, 0x0039, 0x00B3, 0x00DB, 0x00DC, 0x00D9, 0x00DA, 0x009F, // F8-FF: 8 9 ³ Û Ü Ù Ú control
];
197
/// EBCDIC Code Page 1047 (Open Systems) to Unicode lookup table.
///
/// Indexed by the EBCDIC byte value; each entry is the Unicode code point
/// that byte decodes to. One row covers eight consecutive byte values.
///
/// Per IBM CCSID 1047, `^` sits at 0x5F and `¬` at 0xB0 (the reverse of CP037),
/// and the square brackets sit at 0xAD / 0xBD.
static CP1047_TO_UNICODE: [u32; 256] = [
    0x0000, 0x0001, 0x0002, 0x0003, 0x009C, 0x0009, 0x0086, 0x007F, // 00-07
    0x0097, 0x008D, 0x008E, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, // 08-0F
    0x0010, 0x0011, 0x0012, 0x0013, 0x009D, 0x0085, 0x0008, 0x0087, // 10-17
    0x0018, 0x0019, 0x0092, 0x008F, 0x001C, 0x001D, 0x001E, 0x001F, // 18-1F
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x000A, 0x0017, 0x001B, // 20-27
    0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x0005, 0x0006, 0x0007, // 28-2F
    0x0090, 0x0091, 0x0016, 0x0093, 0x0094, 0x0095, 0x0096, 0x0004, // 30-37
    0x0098, 0x0099, 0x009A, 0x009B, 0x0014, 0x0015, 0x009E, 0x001A, // 38-3F
    0x0020, 0x00A0, 0x00E2, 0x00E4, 0x00E0, 0x00E1, 0x00E3, 0x00E5, // 40-47: SP NBSP â ä à á ã å
    0x00E7, 0x00F1, 0x00A2, 0x002E, 0x003C, 0x0028, 0x002B, 0x007C, // 48-4F: ç ñ ¢ . < ( + |
    0x0026, 0x00E9, 0x00EA, 0x00EB, 0x00E8, 0x00ED, 0x00EE, 0x00EF, // 50-57: & é ê ë è í î ï
    0x00EC, 0x00DF, 0x0021, 0x0024, 0x002A, 0x0029, 0x003B, 0x005E, // 58-5F: ì ß ! $ * ) ; ^
    0x002D, 0x002F, 0x00C2, 0x00C4, 0x00C0, 0x00C1, 0x00C3, 0x00C5, // 60-67: - / Â Ä À Á Ã Å
    0x00C7, 0x00D1, 0x00A6, 0x002C, 0x0025, 0x005F, 0x003E, 0x003F, // 68-6F: Ç Ñ ¦ , % _ > ?
    0x00F8, 0x00C9, 0x00CA, 0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, // 70-77: ø É Ê Ë È Í Î Ï
    0x00CC, 0x0060, 0x003A, 0x0023, 0x0040, 0x0027, 0x003D, 0x0022, // 78-7F: Ì ` : # @ ' = "
    0x00D8, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, // 80-87: Ø a-g
    0x0068, 0x0069, 0x00AB, 0x00BB, 0x00F0, 0x00FD, 0x00FE, 0x00B1, // 88-8F: h i « » ð ý þ ±
    0x00B0, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, 0x0070, // 90-97: ° j-p
    0x0071, 0x0072, 0x00AA, 0x00BA, 0x00E6, 0x00B8, 0x00C6, 0x00A4, // 98-9F: q r ª º æ ¸ Æ ¤
    0x00B5, 0x007E, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, // A0-A7: µ ~ s-x
    0x0079, 0x007A, 0x00A1, 0x00BF, 0x00D0, 0x005B, 0x00DE, 0x00AE, // A8-AF: y z ¡ ¿ Ð [ Þ ®
    0x00AC, 0x00A3, 0x00A5, 0x00B7, 0x00A9, 0x00A7, 0x00B6, 0x00BC, // B0-B7: ¬ £ ¥ · © § ¶ ¼
    0x00BD, 0x00BE, 0x00DD, 0x00A8, 0x00AF, 0x005D, 0x00B4, 0x00D7, // B8-BF: ½ ¾ Ý ¨ ¯ ] ´ ×
    0x007B, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, // C0-C7: { A-G
    0x0048, 0x0049, 0x00AD, 0x00F4, 0x00F6, 0x00F2, 0x00F3, 0x00F5, // C8-CF: H I SHY ô ö ò ó õ
    0x007D, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, // D0-D7: } J-P
    0x0051, 0x0052, 0x00B9, 0x00FB, 0x00FC, 0x00F9, 0x00FA, 0x00FF, // D8-DF: Q R ¹ û ü ù ú ÿ
    0x005C, 0x00F7, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, // E0-E7: \ ÷ S-X
    0x0059, 0x005A, 0x00B2, 0x00D4, 0x00D6, 0x00D2, 0x00D3, 0x00D5, // E8-EF: Y Z ² Ô Ö Ò Ó Õ
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, // F0-F7: 0-7
    0x0038, 0x0039, 0x00B3, 0x00DB, 0x00DC, 0x00D9, 0x00DA, 0x009F, // F8-FF: 8 9 ³ Û Ü Ù Ú control
];
250
/// EBCDIC Code Page 1140 (US/Canada with Euro) to Unicode lookup table.
///
/// Indexed by the EBCDIC byte value; each entry is the Unicode code point
/// that byte decodes to. One row covers eight consecutive byte values.
///
/// Per IBM CCSID 1140 this is CP037 with a single change: byte 0x9F carries the
/// euro sign (U+20AC) instead of the currency sign (U+00A4). Byte 0xFF remains
/// the U+009F control character.
static CP1140_TO_UNICODE: [u32; 256] = [
    0x0000, 0x0001, 0x0002, 0x0003, 0x009C, 0x0009, 0x0086, 0x007F, // 00-07
    0x0097, 0x008D, 0x008E, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, // 08-0F
    0x0010, 0x0011, 0x0012, 0x0013, 0x009D, 0x0085, 0x0008, 0x0087, // 10-17
    0x0018, 0x0019, 0x0092, 0x008F, 0x001C, 0x001D, 0x001E, 0x001F, // 18-1F
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x000A, 0x0017, 0x001B, // 20-27
    0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x0005, 0x0006, 0x0007, // 28-2F
    0x0090, 0x0091, 0x0016, 0x0093, 0x0094, 0x0095, 0x0096, 0x0004, // 30-37
    0x0098, 0x0099, 0x009A, 0x009B, 0x0014, 0x0015, 0x009E, 0x001A, // 38-3F
    0x0020, 0x00A0, 0x00E2, 0x00E4, 0x00E0, 0x00E1, 0x00E3, 0x00E5, // 40-47: SP NBSP â ä à á ã å
    0x00E7, 0x00F1, 0x00A2, 0x002E, 0x003C, 0x0028, 0x002B, 0x007C, // 48-4F: ç ñ ¢ . < ( + |
    0x0026, 0x00E9, 0x00EA, 0x00EB, 0x00E8, 0x00ED, 0x00EE, 0x00EF, // 50-57: & é ê ë è í î ï
    0x00EC, 0x00DF, 0x0021, 0x0024, 0x002A, 0x0029, 0x003B, 0x00AC, // 58-5F: ì ß ! $ * ) ; ¬
    0x002D, 0x002F, 0x00C2, 0x00C4, 0x00C0, 0x00C1, 0x00C3, 0x00C5, // 60-67: - / Â Ä À Á Ã Å
    0x00C7, 0x00D1, 0x00A6, 0x002C, 0x0025, 0x005F, 0x003E, 0x003F, // 68-6F: Ç Ñ ¦ , % _ > ?
    0x00F8, 0x00C9, 0x00CA, 0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, // 70-77: ø É Ê Ë È Í Î Ï
    0x00CC, 0x0060, 0x003A, 0x0023, 0x0040, 0x0027, 0x003D, 0x0022, // 78-7F: Ì ` : # @ ' = "
    0x00D8, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, // 80-87: Ø a-g
    0x0068, 0x0069, 0x00AB, 0x00BB, 0x00F0, 0x00FD, 0x00FE, 0x00B1, // 88-8F: h i « » ð ý þ ±
    0x00B0, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, 0x0070, // 90-97: ° j-p
    0x0071, 0x0072, 0x00AA, 0x00BA, 0x00E6, 0x00B8, 0x00C6, 0x20AC, // 98-9F: q r ª º æ ¸ Æ €
    0x00B5, 0x007E, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, // A0-A7: µ ~ s-x
    0x0079, 0x007A, 0x00A1, 0x00BF, 0x00D0, 0x00DD, 0x00DE, 0x00AE, // A8-AF: y z ¡ ¿ Ð Ý Þ ®
    0x005E, 0x00A3, 0x00A5, 0x00B7, 0x00A9, 0x00A7, 0x00B6, 0x00BC, // B0-B7: ^ £ ¥ · © § ¶ ¼
    0x00BD, 0x00BE, 0x005B, 0x005D, 0x00AF, 0x00A8, 0x00B4, 0x00D7, // B8-BF: ½ ¾ [ ] ¯ ¨ ´ ×
    0x007B, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, // C0-C7: { A-G
    0x0048, 0x0049, 0x00AD, 0x00F4, 0x00F6, 0x00F2, 0x00F3, 0x00F5, // C8-CF: H I SHY ô ö ò ó õ
    0x007D, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, // D0-D7: } J-P
    0x0051, 0x0052, 0x00B9, 0x00FB, 0x00FC, 0x00F9, 0x00FA, 0x00FF, // D8-DF: Q R ¹ û ü ù ú ÿ
    0x005C, 0x00F7, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, // E0-E7: \ ÷ S-X
    0x0059, 0x005A, 0x00B2, 0x00D4, 0x00D6, 0x00D2, 0x00D3, 0x00D5, // E8-EF: Y Z ² Ô Ö Ò Ó Õ
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, // F0-F7: 0-7
    0x0038, 0x0039, 0x00B3, 0x00DB, 0x00DC, 0x00D9, 0x00DA, 0x009F, // F8-FF: 8 9 ³ Û Ü Ù Ú control
];
303
304/// Get the appropriate lookup table for the given codepage
305fn get_ebcdic_table(codepage: Codepage) -> Option<&'static [u32; 256]> {
306    match codepage {
307        Codepage::ASCII => None,
308        Codepage::CP037 => Some(&CP037_TO_UNICODE),
309        Codepage::CP273 => Some(&CP273_TO_UNICODE),
310        Codepage::CP500 => Some(&CP500_TO_UNICODE),
311        Codepage::CP1047 => Some(&CP1047_TO_UNICODE),
312        Codepage::CP1140 => Some(&CP1140_TO_UNICODE),
313    }
314}
315
316/// Convert EBCDIC bytes to UTF-8 string
317///
318/// Decodes a byte slice from the specified EBCDIC codepage into a Rust `String`.
319/// The [`UnmappablePolicy`] controls behaviour when a byte has no valid Unicode mapping.
320///
321/// # Examples
322///
323/// ```
324/// use copybook_charset::{ebcdic_to_utf8, Codepage, UnmappablePolicy};
325///
326/// // EBCDIC CP037 encoding of "HELLO"
327/// let ebcdic = &[0xC8, 0xC5, 0xD3, 0xD3, 0xD6];
328/// let text = ebcdic_to_utf8(ebcdic, Codepage::CP037, UnmappablePolicy::Error).unwrap();
329/// assert_eq!(text, "HELLO");
330/// ```
331///
332/// # Errors
333/// Returns an error if the EBCDIC data contains invalid bytes that cannot be converted.
334#[inline]
335#[must_use = "Handle the Result or propagate the error"]
336pub fn ebcdic_to_utf8(data: &[u8], codepage: Codepage, policy: UnmappablePolicy) -> Result<String> {
337    // ASCII pass-through mode (transparent 8-bit, not Windows-1252)
338    if codepage == Codepage::ASCII {
339        return Ok(String::from_utf8_lossy(data).into_owned());
340    }
341
342    let table = get_ebcdic_table(codepage).ok_or_else(|| {
343        Error::new(
344            ErrorCode::CBKC301_INVALID_EBCDIC_BYTE,
345            format!("Unsupported codepage: {codepage:?}"),
346        )
347    })?;
348
349    let mut result = String::with_capacity(data.len());
350
351    for &byte in data {
352        let unicode_point = table[byte as usize];
353
354        // Check for unmappable characters (control characters < 0x20 except tab, LF, CR)
355        if unicode_point < 0x20
356            && unicode_point != 0x09
357            && unicode_point != 0x0A
358            && unicode_point != 0x0D
359        {
360            match policy {
361                UnmappablePolicy::Error => {
362                    return Err(Error::new(
363                        ErrorCode::CBKC301_INVALID_EBCDIC_BYTE,
364                        format!("Unmappable EBCDIC byte: 0x{byte:02X} -> U+{unicode_point:04X}"),
365                    ));
366                }
367                UnmappablePolicy::Replace => {
368                    warn!(
369                        "CBKC301_INVALID_EBCDIC_BYTE: Unmappable EBCDIC byte 0x{:02X}, replacing with U+FFFD",
370                        byte
371                    );
372                    result.push('\u{FFFD}'); // Unicode replacement character
373                    continue;
374                }
375                UnmappablePolicy::Skip => {
376                    warn!(
377                        "CBKC301_INVALID_EBCDIC_BYTE: Unmappable EBCDIC byte 0x{:02X}, skipping",
378                        byte
379                    );
380                    continue;
381                }
382            }
383        }
384
385        // Convert Unicode code point to char
386        if let Some(ch) = char::from_u32(unicode_point) {
387            result.push(ch);
388        } else {
389            match policy {
390                UnmappablePolicy::Error => {
391                    return Err(Error::new(
392                        ErrorCode::CBKC301_INVALID_EBCDIC_BYTE,
393                        format!("Invalid Unicode code point: U+{unicode_point:04X}"),
394                    ));
395                }
396                UnmappablePolicy::Replace => {
397                    warn!(
398                        "CBKC301_INVALID_EBCDIC_BYTE: Invalid Unicode code point U+{:04X}, replacing with U+FFFD",
399                        unicode_point
400                    );
401                    result.push('\u{FFFD}');
402                }
403                UnmappablePolicy::Skip => {
404                    warn!(
405                        "CBKC301_INVALID_EBCDIC_BYTE: Invalid Unicode code point U+{:04X}, skipping",
406                        unicode_point
407                    );
408                }
409            }
410        }
411    }
412
413    Ok(result)
414}
415
416/// Convert UTF-8 string to EBCDIC bytes
417///
418/// Encodes a UTF-8 string into EBCDIC bytes for the specified codepage.
419///
420/// # Examples
421///
422/// ```
423/// use copybook_charset::{utf8_to_ebcdic, Codepage};
424///
425/// let ebcdic = utf8_to_ebcdic("HELLO", Codepage::CP037).unwrap();
426/// assert_eq!(ebcdic, vec![0xC8, 0xC5, 0xD3, 0xD3, 0xD6]);
427/// ```
428///
429/// # Errors
430/// Returns an error if the UTF-8 text contains characters that cannot be mapped to the target codepage.
431#[inline]
432#[must_use = "Handle the Result or propagate the error"]
433pub fn utf8_to_ebcdic(text: &str, codepage: Codepage) -> Result<Vec<u8>> {
434    // ASCII pass-through mode (transparent 8-bit, not Windows-1252)
435    if codepage == Codepage::ASCII {
436        return Ok(text.as_bytes().to_vec());
437    }
438
439    let table = get_ebcdic_table(codepage).ok_or_else(|| {
440        Error::new(
441            ErrorCode::CBKC301_INVALID_EBCDIC_BYTE,
442            format!("Unsupported codepage: {codepage:?}"),
443        )
444    })?;
445
446    // Build reverse lookup table (Unicode -> EBCDIC)
447    let mut reverse_table = std::collections::HashMap::new();
448    for (ebcdic_index, &unicode_point) in table.iter().enumerate() {
449        if let Some(ch) = char::from_u32(unicode_point) {
450            let ebcdic_byte = u8::try_from(ebcdic_index).map_err(|_| {
451                Error::new(
452                    ErrorCode::CBKC301_INVALID_EBCDIC_BYTE,
453                    format!("EBCDIC byte index {ebcdic_index} exceeds u8 range"),
454                )
455            })?;
456            reverse_table.insert(ch, ebcdic_byte);
457        }
458    }
459
460    let mut result = Vec::with_capacity(text.len());
461
462    for ch in text.chars() {
463        if let Some(&ebcdic_byte) = reverse_table.get(&ch) {
464            result.push(ebcdic_byte);
465        } else {
466            return Err(Error::new(
467                ErrorCode::CBKC301_INVALID_EBCDIC_BYTE,
468                format!("Character '{ch}' cannot be mapped to {codepage:?}"),
469            ));
470        }
471    }
472
473    Ok(result)
474}
475
476#[cfg(test)]
477#[allow(clippy::expect_used, clippy::unwrap_used)]
478mod tests {
479    use super::*;
480    use copybook_error::ErrorCode;
481
482    #[test]
483    fn test_space_byte_ascii() {
484        assert_eq!(space_byte(Codepage::ASCII), 0x20);
485    }
486
487    #[test]
488    fn test_space_byte_ebcdic() {
489        // All EBCDIC codepages use 0x40 for space
490        assert_eq!(space_byte(Codepage::CP037), 0x40);
491        assert_eq!(space_byte(Codepage::CP273), 0x40);
492        assert_eq!(space_byte(Codepage::CP500), 0x40);
493        assert_eq!(space_byte(Codepage::CP1047), 0x40);
494        assert_eq!(space_byte(Codepage::CP1140), 0x40);
495    }
496
497    #[test]
498    fn test_codepage_is_ascii() {
499        assert!(Codepage::ASCII.is_ascii());
500        assert!(!Codepage::CP037.is_ascii());
501    }
502
503    #[test]
504    fn test_codepage_is_ebcdic() {
505        assert!(!Codepage::ASCII.is_ebcdic());
506        assert!(Codepage::CP037.is_ebcdic());
507    }
508
509    #[test]
510    fn test_codepage_code_page_number() {
511        assert_eq!(Codepage::ASCII.code_page_number(), None);
512        assert_eq!(Codepage::CP037.code_page_number(), Some(37));
513        assert_eq!(Codepage::CP1140.code_page_number(), Some(1140));
514    }
515
516    // --- ebcdic_to_utf8 tests ---
517
518    #[test]
519    fn test_ebcdic_to_utf8_empty_input() {
520        let result = ebcdic_to_utf8(&[], Codepage::CP037, UnmappablePolicy::Error).unwrap();
521        assert_eq!(result, "");
522    }
523
524    #[test]
525    fn test_ebcdic_to_utf8_ascii_passthrough() {
526        let data = b"Hello, World!";
527        let result = ebcdic_to_utf8(data, Codepage::ASCII, UnmappablePolicy::Error).unwrap();
528        assert_eq!(result, "Hello, World!");
529    }
530
531    #[test]
532    fn test_ebcdic_to_utf8_ascii_passthrough_non_utf8() {
533        // Non-UTF8 bytes get replaced with U+FFFD via from_utf8_lossy
534        let data: &[u8] = &[0xFF, 0xFE];
535        let result = ebcdic_to_utf8(data, Codepage::ASCII, UnmappablePolicy::Error).unwrap();
536        assert!(result.contains('\u{FFFD}'));
537    }
538
539    #[test]
540    fn test_ebcdic_to_utf8_cp037_space() {
541        // EBCDIC 0x40 = space in CP037
542        let data: &[u8] = &[0x40];
543        let result = ebcdic_to_utf8(data, Codepage::CP037, UnmappablePolicy::Error).unwrap();
544        assert_eq!(result, " ");
545    }
546
547    #[test]
548    fn test_ebcdic_to_utf8_cp037_digits() {
549        // EBCDIC 0xF0-0xF9 = digits 0-9 in CP037
550        let data: Vec<u8> = (0xF0..=0xF9).collect();
551        let result = ebcdic_to_utf8(&data, Codepage::CP037, UnmappablePolicy::Error).unwrap();
552        assert_eq!(result, "0123456789");
553    }
554
555    #[test]
556    fn test_ebcdic_to_utf8_cp037_uppercase() {
557        // EBCDIC 0xC1-0xC9 = A-I in CP037
558        let data: &[u8] = &[0xC1, 0xC2, 0xC3];
559        let result = ebcdic_to_utf8(data, Codepage::CP037, UnmappablePolicy::Error).unwrap();
560        assert_eq!(result, "ABC");
561    }
562
563    #[test]
564    fn test_ebcdic_to_utf8_cp037_lowercase() {
565        // EBCDIC 0x81-0x89 = a-i in CP037
566        let data: &[u8] = &[0x81, 0x82, 0x83];
567        let result = ebcdic_to_utf8(data, Codepage::CP037, UnmappablePolicy::Error).unwrap();
568        assert_eq!(result, "abc");
569    }
570
571    #[test]
572    fn test_ebcdic_to_utf8_unmappable_error_policy() {
573        // EBCDIC 0x00 maps to U+0000 (NUL) which is < 0x20 and not tab/LF/CR
574        let data: &[u8] = &[0x00];
575        let err = ebcdic_to_utf8(data, Codepage::CP037, UnmappablePolicy::Error).unwrap_err();
576        assert_eq!(err.code, ErrorCode::CBKC301_INVALID_EBCDIC_BYTE);
577    }
578
579    #[test]
580    fn test_ebcdic_to_utf8_unmappable_replace_policy() {
581        let data: &[u8] = &[0x00];
582        let result = ebcdic_to_utf8(data, Codepage::CP037, UnmappablePolicy::Replace).unwrap();
583        assert_eq!(result, "\u{FFFD}");
584    }
585
586    #[test]
587    fn test_ebcdic_to_utf8_unmappable_skip_policy() {
588        let data: &[u8] = &[0x00];
589        let result = ebcdic_to_utf8(data, Codepage::CP037, UnmappablePolicy::Skip).unwrap();
590        assert_eq!(result, "");
591    }
592
593    #[test]
594    fn test_ebcdic_to_utf8_mixed_valid_and_unmappable_skip() {
595        // 0x00 (unmappable), 0xC1 (A), 0x00 (unmappable), 0xC2 (B)
596        let data: &[u8] = &[0x00, 0xC1, 0x00, 0xC2];
597        let result = ebcdic_to_utf8(data, Codepage::CP037, UnmappablePolicy::Skip).unwrap();
598        assert_eq!(result, "AB");
599    }
600
601    #[test]
602    fn test_ebcdic_to_utf8_all_codepages_digits() {
603        // Digits 0xF0-0xF9 should map to 0-9 on all EBCDIC codepages
604        let data: Vec<u8> = (0xF0..=0xF9).collect();
605        for cp in [
606            Codepage::CP037,
607            Codepage::CP273,
608            Codepage::CP500,
609            Codepage::CP1047,
610        ] {
611            let result = ebcdic_to_utf8(&data, cp, UnmappablePolicy::Error).unwrap();
612            assert_eq!(result, "0123456789", "Failed for {cp:?}");
613        }
614    }
615
616    #[test]
617    fn test_ebcdic_to_utf8_cp1140_euro_sign() {
618        // CP1140 maps 0x9F to € (U+20AC) — unlike CP037 which maps it to a control char
619        let data: &[u8] = &[0xFF];
620        let result = ebcdic_to_utf8(data, Codepage::CP1140, UnmappablePolicy::Error).unwrap();
621        assert_eq!(result, "€");
622    }
623
624    #[test]
625    fn test_ebcdic_to_utf8_cp037_tab_allowed() {
626        // EBCDIC 0x05 -> U+0009 (tab) — should be allowed (not unmappable)
627        let data: &[u8] = &[0x05];
628        let result = ebcdic_to_utf8(data, Codepage::CP037, UnmappablePolicy::Error).unwrap();
629        assert_eq!(result, "\t");
630    }
631
632    #[test]
633    fn test_ebcdic_to_utf8_cp037_lf_allowed() {
634        // EBCDIC 0x25 -> U+000A (LF) — should be allowed
635        let data: &[u8] = &[0x25];
636        let result = ebcdic_to_utf8(data, Codepage::CP037, UnmappablePolicy::Error).unwrap();
637        assert_eq!(result, "\n");
638    }
639
640    #[test]
641    fn test_ebcdic_to_utf8_cp037_cr_allowed() {
642        // EBCDIC 0x0D -> U+000D (CR) — should be allowed
643        let data: &[u8] = &[0x0D];
644        let result = ebcdic_to_utf8(data, Codepage::CP037, UnmappablePolicy::Error).unwrap();
645        assert_eq!(result, "\r");
646    }
647
648    // --- utf8_to_ebcdic tests ---
649
650    #[test]
651    fn test_utf8_to_ebcdic_empty_input() {
652        let result = utf8_to_ebcdic("", Codepage::CP037).unwrap();
653        assert!(result.is_empty());
654    }
655
656    #[test]
657    fn test_utf8_to_ebcdic_ascii_passthrough() {
658        let result = utf8_to_ebcdic("Hello", Codepage::ASCII).unwrap();
659        assert_eq!(result, b"Hello");
660    }
661
662    #[test]
663    fn test_utf8_to_ebcdic_cp037_space() {
664        let result = utf8_to_ebcdic(" ", Codepage::CP037).unwrap();
665        assert_eq!(result, &[0x40]);
666    }
667
668    #[test]
669    fn test_utf8_to_ebcdic_cp037_digits() {
670        let result = utf8_to_ebcdic("0123456789", Codepage::CP037).unwrap();
671        let expected: Vec<u8> = (0xF0..=0xF9).collect();
672        assert_eq!(result, expected);
673    }
674
675    #[test]
676    fn test_utf8_to_ebcdic_cp037_uppercase() {
677        let result = utf8_to_ebcdic("ABC", Codepage::CP037).unwrap();
678        assert_eq!(result, &[0xC1, 0xC2, 0xC3]);
679    }
680
681    #[test]
682    fn test_utf8_to_ebcdic_unmappable_character() {
683        // Chinese character cannot be mapped to CP037
684        let err = utf8_to_ebcdic("日", Codepage::CP037).unwrap_err();
685        assert_eq!(err.code, ErrorCode::CBKC301_INVALID_EBCDIC_BYTE);
686    }
687
688    #[test]
689    fn test_ebcdic_roundtrip_cp037() {
690        let original = "Hello World 123";
691        let ebcdic = utf8_to_ebcdic(original, Codepage::CP037).unwrap();
692        let roundtrip = ebcdic_to_utf8(&ebcdic, Codepage::CP037, UnmappablePolicy::Error).unwrap();
693        assert_eq!(roundtrip, original);
694    }
695
696    #[test]
697    fn test_ebcdic_roundtrip_cp500() {
698        let original = "Test 789";
699        let ebcdic = utf8_to_ebcdic(original, Codepage::CP500).unwrap();
700        let roundtrip = ebcdic_to_utf8(&ebcdic, Codepage::CP500, UnmappablePolicy::Error).unwrap();
701        assert_eq!(roundtrip, original);
702    }
703
704    #[test]
705    fn test_ebcdic_roundtrip_cp1047() {
706        let original = "COBOL DATA";
707        let ebcdic = utf8_to_ebcdic(original, Codepage::CP1047).unwrap();
708        let roundtrip = ebcdic_to_utf8(&ebcdic, Codepage::CP1047, UnmappablePolicy::Error).unwrap();
709        assert_eq!(roundtrip, original);
710    }
711
712    // ====================================================================
713    // Exhaustive charset conversion tests
714    // ====================================================================
715
    /// All EBCDIC codepages under test.
    ///
    /// The `*_all_codepages` tests in this module loop over this list so each
    /// assertion runs once per codepage. `Codepage::ASCII` is not part of the
    /// list; tests that need it reference it explicitly.
    const ALL_EBCDIC: [Codepage; 5] = [
        Codepage::CP037,
        Codepage::CP273,
        Codepage::CP500,
        Codepage::CP1047,
        Codepage::CP1140,
    ];
724
725    // --- 1. Full printable ASCII range (both directions) per codepage ---
726
727    #[test]
728    fn test_printable_ascii_roundtrip_cp037() {
729        roundtrip_printable_ascii(Codepage::CP037);
730    }
731
732    #[test]
733    fn test_printable_ascii_roundtrip_cp273() {
734        roundtrip_printable_ascii(Codepage::CP273);
735    }
736
737    #[test]
738    fn test_printable_ascii_roundtrip_cp500() {
739        roundtrip_printable_ascii(Codepage::CP500);
740    }
741
742    #[test]
743    fn test_printable_ascii_roundtrip_cp1047() {
744        roundtrip_printable_ascii(Codepage::CP1047);
745    }
746
747    #[test]
748    fn test_printable_ascii_roundtrip_cp1140() {
749        roundtrip_printable_ascii(Codepage::CP1140);
750    }
751
752    /// Encode every printable ASCII char (0x20..=0x7E) to EBCDIC then back,
753    /// asserting perfect round-trip for the given codepage.
754    fn roundtrip_printable_ascii(cp: Codepage) {
755        let printable: String = (0x20u8..=0x7Eu8).map(|b| b as char).collect();
756        let ebcdic =
757            utf8_to_ebcdic(&printable, cp).unwrap_or_else(|e| panic!("{cp:?} encode failed: {e}"));
758        let back = ebcdic_to_utf8(&ebcdic, cp, UnmappablePolicy::Error)
759            .unwrap_or_else(|e| panic!("{cp:?} decode failed: {e}"));
760        assert_eq!(back, printable, "Round-trip mismatch for {cp:?}");
761    }
762
763    // --- 2. Special characters ---
764
765    #[test]
766    fn test_cp1140_euro_sign_roundtrip() {
767        // CP1140 byte 0xFF maps to U+20AC (€)
768        let decoded = ebcdic_to_utf8(&[0xFF], Codepage::CP1140, UnmappablePolicy::Error).unwrap();
769        assert_eq!(decoded, "€");
770        let encoded = utf8_to_ebcdic("€", Codepage::CP1140).unwrap();
771        assert_eq!(encoded, &[0xFF]);
772    }
773
774    #[test]
775    fn test_cp037_currency_sign_at_9f() {
776        // CP037 0x9F maps to U+00A4 (¤) – the international currency sign
777        let decoded = ebcdic_to_utf8(&[0x9F], Codepage::CP037, UnmappablePolicy::Error).unwrap();
778        assert_eq!(decoded, "¤");
779    }
780
781    #[test]
782    fn test_cp273_national_chars() {
783        // CP273 has German national characters at different positions than CP037
784        // 0x4A -> Ä (U+00C4), 0x6A -> ö (U+00F6), 0xC0 -> ä (U+00E4)
785        let data: &[u8] = &[0x4A, 0x6A, 0xC0];
786        let decoded = ebcdic_to_utf8(data, Codepage::CP273, UnmappablePolicy::Error).unwrap();
787        assert_eq!(decoded, "Äöä");
788        // Round-trip
789        let encoded = utf8_to_ebcdic("Äöä", Codepage::CP273).unwrap();
790        assert_eq!(encoded, data);
791    }
792
793    #[test]
794    fn test_cp1140_vs_cp037_difference() {
795        // CP1140 is identical to CP037 except at byte 0x9F:
796        //   CP037  0x9F -> U+00A4 (¤)
797        //   CP1140 byte 0x9F -> U+00A4 (¤) as well, but 0xFF differs:
798        //   CP037  0xFF -> U+009F (control)
799        //   CP1140 0xFF -> U+20AC (€)
800        let cp037_ff = ebcdic_to_utf8(&[0xFF], Codepage::CP037, UnmappablePolicy::Replace).unwrap();
801        let cp1140_ff = ebcdic_to_utf8(&[0xFF], Codepage::CP1140, UnmappablePolicy::Error).unwrap();
802        assert_ne!(cp037_ff, cp1140_ff, "CP037 and CP1140 must differ at 0xFF");
803        assert_eq!(cp1140_ff, "€");
804    }
805
806    // --- 3. Control characters ---
807
808    #[test]
809    fn test_control_chars_error_policy_all_codepages() {
810        // EBCDIC 0x00 maps to U+0000 (NUL) on all codepages – a control char
811        for cp in ALL_EBCDIC {
812            let err = ebcdic_to_utf8(&[0x00], cp, UnmappablePolicy::Error).unwrap_err();
813            assert_eq!(
814                err.code,
815                ErrorCode::CBKC301_INVALID_EBCDIC_BYTE,
816                "Expected error for NUL on {cp:?}"
817            );
818        }
819    }
820
821    #[test]
822    fn test_control_chars_replace_policy_all_codepages() {
823        for cp in ALL_EBCDIC {
824            let result = ebcdic_to_utf8(&[0x00], cp, UnmappablePolicy::Replace).unwrap();
825            assert_eq!(result, "\u{FFFD}", "Replace policy failed for {cp:?}");
826        }
827    }
828
829    #[test]
830    fn test_control_chars_skip_policy_all_codepages() {
831        for cp in ALL_EBCDIC {
832            let result = ebcdic_to_utf8(&[0x00], cp, UnmappablePolicy::Skip).unwrap();
833            assert_eq!(result, "", "Skip policy failed for {cp:?}");
834        }
835    }
836
837    #[test]
838    fn test_allowed_control_chars_tab_lf_cr_all_codepages() {
839        // Tab (0x05), LF (0x25), CR (0x0D) should pass through on all EBCDIC codepages
840        for cp in ALL_EBCDIC {
841            let tab = ebcdic_to_utf8(&[0x05], cp, UnmappablePolicy::Error).unwrap();
842            assert_eq!(tab, "\t", "Tab failed for {cp:?}");
843
844            let lf = ebcdic_to_utf8(&[0x25], cp, UnmappablePolicy::Error).unwrap();
845            assert_eq!(lf, "\n", "LF failed for {cp:?}");
846
847            let cr = ebcdic_to_utf8(&[0x0D], cp, UnmappablePolicy::Error).unwrap();
848            assert_eq!(cr, "\r", "CR failed for {cp:?}");
849        }
850    }
851
852    // --- 4. Unmappable character handling (utf8_to_ebcdic direction) ---
853
854    #[test]
855    fn test_utf8_to_ebcdic_unmappable_cjk_all_codepages() {
856        // CJK character '日' cannot be mapped to any EBCDIC codepage
857        for cp in ALL_EBCDIC {
858            let err = utf8_to_ebcdic("日", cp).unwrap_err();
859            assert_eq!(
860                err.code,
861                ErrorCode::CBKC301_INVALID_EBCDIC_BYTE,
862                "Expected unmappable error for {cp:?}"
863            );
864        }
865    }
866
867    #[test]
868    fn test_utf8_to_ebcdic_emoji_unmappable() {
869        let err = utf8_to_ebcdic("😀", Codepage::CP037).unwrap_err();
870        assert_eq!(err.code, ErrorCode::CBKC301_INVALID_EBCDIC_BYTE);
871    }
872
873    // --- 5. Empty input ---
874
875    #[test]
876    fn test_empty_input_all_codepages_both_directions() {
877        for cp in ALL_EBCDIC {
878            let decoded = ebcdic_to_utf8(&[], cp, UnmappablePolicy::Error).unwrap();
879            assert_eq!(decoded, "", "Empty decode failed for {cp:?}");
880
881            let encoded = utf8_to_ebcdic("", cp).unwrap();
882            assert!(encoded.is_empty(), "Empty encode failed for {cp:?}");
883        }
884        // Also ASCII
885        let decoded = ebcdic_to_utf8(&[], Codepage::ASCII, UnmappablePolicy::Error).unwrap();
886        assert_eq!(decoded, "");
887        let encoded = utf8_to_ebcdic("", Codepage::ASCII).unwrap();
888        assert!(encoded.is_empty());
889    }
890
891    // --- 6. Full round-trip consistency (EBCDIC -> UTF-8 -> EBCDIC) per codepage ---
892
893    #[test]
894    fn test_full_byte_roundtrip_cp037() {
895        full_byte_roundtrip(Codepage::CP037);
896    }
897
898    #[test]
899    fn test_full_byte_roundtrip_cp273() {
900        full_byte_roundtrip(Codepage::CP273);
901    }
902
903    #[test]
904    fn test_full_byte_roundtrip_cp500() {
905        full_byte_roundtrip(Codepage::CP500);
906    }
907
908    #[test]
909    fn test_full_byte_roundtrip_cp1047() {
910        full_byte_roundtrip(Codepage::CP1047);
911    }
912
913    #[test]
914    fn test_full_byte_roundtrip_cp1140() {
915        full_byte_roundtrip(Codepage::CP1140);
916    }
917
918    /// For every EBCDIC byte 0x00..=0xFF that decodes to a non-control Unicode
919    /// character, verify EBCDIC → UTF-8 → EBCDIC produces the original byte.
920    fn full_byte_roundtrip(cp: Codepage) {
921        for byte in 0x00u8..=0xFF {
922            let Ok(decoded) = ebcdic_to_utf8(&[byte], cp, UnmappablePolicy::Skip) else {
923                continue;
924            };
925            if decoded.is_empty() {
926                // Skipped control char – that's fine
927                continue;
928            }
929            let Ok(re_encoded) = utf8_to_ebcdic(&decoded, cp) else {
930                continue;
931            };
932            assert_eq!(
933                re_encoded,
934                &[byte],
935                "{cp:?}: byte 0x{byte:02X} decoded to {decoded:?} but re-encoded to {re_encoded:?}"
936            );
937        }
938    }
939
940    // --- 7. Large buffer conversion ---
941
942    #[test]
943    fn test_large_buffer_decode_cp037() {
944        // 10 000 EBCDIC spaces (0x40) should decode to 10 000 ASCII spaces
945        let large_input = vec![0x40u8; 10_000];
946        let result =
947            ebcdic_to_utf8(&large_input, Codepage::CP037, UnmappablePolicy::Error).unwrap();
948        assert_eq!(result.len(), 10_000);
949        assert!(result.chars().all(|c| c == ' '));
950    }
951
952    #[test]
953    fn test_large_buffer_encode_cp037() {
954        let large_text: String = std::iter::repeat_n('A', 10_000).collect();
955        let encoded = utf8_to_ebcdic(&large_text, Codepage::CP037).unwrap();
956        assert_eq!(encoded.len(), 10_000);
957        assert!(encoded.iter().all(|&b| b == 0xC1)); // 'A' in CP037
958    }
959
960    #[test]
961    fn test_large_buffer_roundtrip_all_codepages() {
962        let pattern = "HELLO WORLD 12345 ";
963        let large_text: String = pattern.repeat(500); // ~9 000 chars
964        for cp in ALL_EBCDIC {
965            let encoded = utf8_to_ebcdic(&large_text, cp)
966                .unwrap_or_else(|e| panic!("{cp:?} large encode failed: {e}"));
967            let decoded = ebcdic_to_utf8(&encoded, cp, UnmappablePolicy::Error)
968                .unwrap_or_else(|e| panic!("{cp:?} large decode failed: {e}"));
969            assert_eq!(decoded, large_text, "Large roundtrip failed for {cp:?}");
970        }
971    }
972
973    // --- 8. Mixed content with unmappable bytes ---
974
975    #[test]
976    fn test_mixed_valid_and_control_replace_all_codepages() {
977        // Byte sequence: NUL, 'A' (0xC1 on CP037/500/1047/1140, 0xC1 on CP273), NUL
978        for cp in ALL_EBCDIC {
979            let data: &[u8] = &[0x00, 0xC1, 0x00];
980            let result = ebcdic_to_utf8(data, cp, UnmappablePolicy::Replace).unwrap();
981            // Should have replacement chars around the letter
982            assert_eq!(
983                result.matches('\u{FFFD}').count(),
984                2,
985                "Replace count wrong for {cp:?}"
986            );
987            assert!(result.contains('A'), "Missing 'A' for {cp:?}");
988        }
989    }
990
991    #[test]
992    fn test_mixed_valid_and_control_skip_preserves_valid() {
993        // NUL, space (0x40), digit-1 (0xF1), NUL
994        for cp in ALL_EBCDIC {
995            let data: &[u8] = &[0x00, 0x40, 0xF1, 0x00];
996            let result = ebcdic_to_utf8(data, cp, UnmappablePolicy::Skip).unwrap();
997            assert_eq!(result, " 1", "Skip mixed content wrong for {cp:?}");
998        }
999    }
1000
1001    // --- 9. Codepage-specific letter position differences ---
1002
1003    #[test]
1004    fn test_uppercase_letters_all_codepages() {
1005        // Verify A-I, J-R, S-Z positions are correct per codepage
1006        let alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
1007        for cp in ALL_EBCDIC {
1008            let encoded =
1009                utf8_to_ebcdic(alpha, cp).unwrap_or_else(|e| panic!("{cp:?} alpha encode: {e}"));
1010            let decoded = ebcdic_to_utf8(&encoded, cp, UnmappablePolicy::Error)
1011                .unwrap_or_else(|e| panic!("{cp:?} alpha decode: {e}"));
1012            assert_eq!(decoded, alpha, "Alphabet roundtrip failed for {cp:?}");
1013            // EBCDIC letters live in C1-C9 (A-I), D1-D9 (J-R), E2-E9 (S-Z)
1014            assert_eq!(encoded[0], 0xC1, "{cp:?}: 'A' should be 0xC1");
1015            assert_eq!(encoded[9], 0xD1, "{cp:?}: 'J' should be 0xD1");
1016            assert_eq!(encoded[18], 0xE2, "{cp:?}: 'S' should be 0xE2");
1017        }
1018    }
1019
1020    #[test]
1021    fn test_lowercase_letters_all_codepages() {
1022        let alpha = "abcdefghijklmnopqrstuvwxyz";
1023        for cp in ALL_EBCDIC {
1024            let encoded =
1025                utf8_to_ebcdic(alpha, cp).unwrap_or_else(|e| panic!("{cp:?} lower encode: {e}"));
1026            let decoded = ebcdic_to_utf8(&encoded, cp, UnmappablePolicy::Error)
1027                .unwrap_or_else(|e| panic!("{cp:?} lower decode: {e}"));
1028            assert_eq!(decoded, alpha, "Lowercase roundtrip failed for {cp:?}");
1029            // EBCDIC lowercase: 81-89 (a-i), 91-99 (j-r), A2-A9 (s-z)
1030            assert_eq!(encoded[0], 0x81, "{cp:?}: 'a' should be 0x81");
1031            assert_eq!(encoded[9], 0x91, "{cp:?}: 'j' should be 0x91");
1032            assert_eq!(encoded[18], 0xA2, "{cp:?}: 's' should be 0xA2");
1033        }
1034    }
1035
1036    #[test]
1037    fn test_digits_all_codepages() {
1038        let digits = "0123456789";
1039        for cp in ALL_EBCDIC {
1040            let encoded =
1041                utf8_to_ebcdic(digits, cp).unwrap_or_else(|e| panic!("{cp:?} digit encode: {e}"));
1042            assert_eq!(encoded.len(), 10);
1043            // Digits are always F0-F9 on all EBCDIC codepages
1044            for (i, &b) in encoded.iter().enumerate() {
1045                assert_eq!(
1046                    b,
1047                    0xF0 + u8::try_from(i).unwrap(),
1048                    "{cp:?}: digit {i} wrong"
1049                );
1050            }
1051        }
1052    }
1053}
copybook_charset/lib.rs

copybook_charset/
lib.rs