1mod ebcdic;
16mod iso_8859;
17mod shift_jis;
18mod ucs4;
19mod us_ascii;
20mod utf16;
21mod utf8;
22
23use std::{
24 borrow::Cow,
25 collections::BTreeMap,
26 sync::{LazyLock, RwLock},
27};
28
29pub use ebcdic::*;
30pub use iso_8859::*;
31pub use shift_jis::*;
32pub use ucs4::*;
33pub use us_ascii::*;
34pub use utf8::*;
35pub use utf16::*;
36
/// Errors an [`Encoder`] can report from [`Encoder::encode`].
///
/// NOTE(review): the code that constructs these variants lives in the encoder
/// modules, not in this file; the per-variant descriptions below are read off
/// the variant and field names and should be confirmed against those modules.
#[derive(Debug, Clone)]
pub enum EncodeError {
    /// Presumably returned when the source text handed to the encoder is empty.
    InputIsEmpty,
    /// Presumably returned when the destination buffer has no room for more output.
    OutputTooShort,
    /// A character that cannot be represented in the target encoding.
    ///
    /// NOTE(review): `read` / `write` look like the amounts consumed from the
    /// source and produced into the destination when the error was raised, and
    /// `c` the offending character - confirm the exact units with the encoders.
    Unmappable { read: usize, write: usize, c: char },
    /// Any other failure, described by a free-form message.
    Other { msg: Cow<'static, str> },
}
53
54impl std::fmt::Display for EncodeError {
55 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
56 write!(f, "{self:?}")
57 }
58}
59
// `Debug` is derived and `Display` is implemented for `EncodeError`, so the
// default `Error` methods are used as-is (no `source` is provided).
impl std::error::Error for EncodeError {}
61
/// A text encoder: turns `str` input into bytes written to a caller-supplied buffer.
pub trait Encoder {
    /// Returns the static name associated with this encoder.
    fn name(&self) -> &'static str;
    /// Encodes text from `src` into `dst`.
    ///
    /// Takes `&mut self`, so an implementation may keep state between calls.
    ///
    /// NOTE(review): the returned pair is presumably `(read, write)` - how much
    /// of `src` was consumed and how much of `dst` was filled - mirroring the
    /// `read` / `write` fields of [`EncodeError::Unmappable`]; `finish`
    /// presumably marks the last chunk of input. Confirm both against the
    /// implementations.
    fn encode(
        &mut self,
        src: &str,
        dst: &mut [u8],
        finish: bool,
    ) -> Result<(usize, usize), EncodeError>;
}
72
/// Errors a [`Decoder`] can report from [`Decoder::decode`].
///
/// NOTE(review): the code that constructs these variants lives in the decoder
/// modules, not in this file; the per-variant descriptions below are read off
/// the variant and field names and should be confirmed against those modules.
#[derive(Debug, Clone)]
pub enum DecodeError {
    /// Presumably returned when the source bytes handed to the decoder are empty.
    InputIsEmpty,
    /// Presumably returned when the destination cannot take more output.
    OutputTooShort,
    /// The input contained a byte sequence that is not valid in the encoding.
    ///
    /// NOTE(review): `read` / `write` look like the amounts consumed and
    /// produced when the error was raised; `length` and `offset` presumably
    /// locate the malformed sequence relative to that position - confirm the
    /// exact meaning with the decoders.
    Malformed {
        read: usize,
        write: usize,
        length: usize,
        offset: usize,
    },
    /// Any other failure, described by a free-form message.
    Other { msg: Cow<'static, str> },
}
93
94impl std::fmt::Display for DecodeError {
95 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
96 write!(f, "{self:?}")
97 }
98}
99
// `Debug` is derived and `Display` is implemented for `DecodeError`, so the
// default `Error` methods are used as-is (no `source` is provided).
impl std::error::Error for DecodeError {}
101
/// A text decoder: turns bytes into text appended to a caller-supplied `String`.
pub trait Decoder {
    /// Returns the static name associated with this decoder.
    fn name(&self) -> &'static str;
    /// Decodes bytes from `src` into `dst`.
    ///
    /// Takes `&mut self`, so an implementation may keep state between calls.
    ///
    /// NOTE(review): the returned pair is presumably `(read, write)` - how much
    /// of `src` was consumed and how much was added to `dst` - mirroring the
    /// `read` / `write` fields of [`DecodeError::Malformed`]; `finish`
    /// presumably marks the last chunk of input. Confirm both against the
    /// implementations.
    fn decode(
        &mut self,
        src: &[u8],
        dst: &mut String,
        finish: bool,
    ) -> Result<(usize, usize), DecodeError>;
}
112
113pub const DEFAULT_SUPPORTED_ENCODINGS: &[&str] = {
117 const NAMES: &[&str] = &[
118 IBM037,
119 IBM1026,
120 IBM273,
121 IBM274,
122 IBM275,
123 IBM277,
124 IBM278,
125 IBM280,
126 IBM284,
127 IBM285,
128 IBM290,
129 IBM297,
130 IBM420,
131 IBM423,
132 IBM424,
133 IBM437,
134 IBM500,
135 IBM850,
136 IBM851,
137 IBM852,
138 IBM855,
139 IBM857,
140 IBM860,
141 IBM861,
142 IBM862,
143 IBM863,
144 IBM864,
145 IBM865,
146 IBM868,
147 IBM869,
148 IBM870,
149 IBM871,
150 IBM880,
151 IBM891,
152 IBM903,
153 IBM904,
154 IBM905,
155 IBM918,
156 ISO_8859_10_NAME,
157 ISO_8859_13_NAME,
158 ISO_8859_14_NAME,
159 ISO_8859_15_NAME,
160 ISO_8859_16_NAME,
161 ISO_8859_1_NAME,
162 ISO_8859_2_NAME,
163 ISO_8859_3_NAME,
164 ISO_8859_4_NAME,
165 ISO_8859_5_NAME,
166 ISO_8859_6_NAME,
167 ISO_8859_7_NAME,
168 ISO_8859_8_NAME,
169 ISO_8859_9_NAME,
170 SHIFT_JIS_NAME,
171 ISO_8859_11_NAME,
172 US_ASCII_NAME,
173 UTF16_NAME,
174 UTF16BE_NAME,
175 UTF16LE_NAME,
176 UTF32_NAME,
177 UTF32BE_NAME,
178 UTF32LE_NAME,
179 UTF8_NAME,
180 ];
181 let len = NAMES.len();
182 let mut i = 0;
183 while i + 1 < len {
184 let x = NAMES[i].as_bytes();
185 let y = NAMES[i + 1].as_bytes();
186 let mut j = 0;
187 while j < x.len() {
188 assert!(x[j] <= y[j]);
189 if x[j] < y[j] {
190 break;
191 }
192 j += 1;
193 if j == x.len() {
194 break;
195 }
196 assert!(j < y.len());
197 }
198 i += 1;
199 }
200 NAMES
201};
202pub static ENCODING_ALIASES: LazyLock<RwLock<BTreeMap<Cow<'static, str>, &'static str>>> =
204 LazyLock::new(|| {
205 RwLock::new(BTreeMap::from([
207 ("UTF8".into(), UTF8_NAME),
208 ("UTF16".into(), UTF16_NAME),
209 ("UTF16BE".into(), UTF16BE_NAME),
210 ("UTF16LE".into(), UTF16LE_NAME),
211 ("ISO-IR-100".into(), ISO_8859_1_NAME),
212 ("ISO_8859-1".into(), ISO_8859_1_NAME),
213 ("ISO-8859-1".into(), ISO_8859_1_NAME),
214 ("LATIN1".into(), ISO_8859_1_NAME),
215 ("L1".into(), ISO_8859_1_NAME),
216 ("IBM819".into(), ISO_8859_1_NAME),
217 ("CP819".into(), ISO_8859_1_NAME),
218 ("ISOLATIN1".into(), ISO_8859_1_NAME),
219 ("ISO-IR-101".into(), ISO_8859_2_NAME),
220 ("ISO_8859-2".into(), ISO_8859_2_NAME),
221 ("ISO-8859-2".into(), ISO_8859_2_NAME),
222 ("LATIN2".into(), ISO_8859_2_NAME),
223 ("L2".into(), ISO_8859_2_NAME),
224 ("ISOLATIN2".into(), ISO_8859_2_NAME),
225 ("ISO-IR-109".into(), ISO_8859_3_NAME),
226 ("ISO_8859-3".into(), ISO_8859_3_NAME),
227 ("ISO-8859-3".into(), ISO_8859_3_NAME),
228 ("LATIN3".into(), ISO_8859_3_NAME),
229 ("L3".into(), ISO_8859_3_NAME),
230 ("ISOLATIN3".into(), ISO_8859_3_NAME),
231 ("ISO-IR-110".into(), ISO_8859_4_NAME),
232 ("ISO_8859-4".into(), ISO_8859_4_NAME),
233 ("ISO-8859-4".into(), ISO_8859_4_NAME),
234 ("LATIN4".into(), ISO_8859_4_NAME),
235 ("L4".into(), ISO_8859_4_NAME),
236 ("ISOLATIN4".into(), ISO_8859_4_NAME),
237 ("ISO-IR-144".into(), ISO_8859_5_NAME),
238 ("ISO_8859-5".into(), ISO_8859_5_NAME),
239 ("ISO-8859-5".into(), ISO_8859_5_NAME),
240 ("CYRILLIC".into(), ISO_8859_5_NAME),
241 ("ISOLATINCYRILLIC".into(), ISO_8859_5_NAME),
242 ("ISO-IR-127".into(), ISO_8859_6_NAME),
243 ("ISO_8859-6".into(), ISO_8859_6_NAME),
244 ("ISO-8859-6".into(), ISO_8859_6_NAME),
245 ("ECMA-114".into(), ISO_8859_6_NAME),
246 ("ASMO-708".into(), ISO_8859_6_NAME),
247 ("ARABIC".into(), ISO_8859_6_NAME),
248 ("ISOLATINARABIC".into(), ISO_8859_6_NAME),
249 ("ISO-IR-126".into(), ISO_8859_7_NAME),
250 ("ISO_8859-7".into(), ISO_8859_7_NAME),
251 ("ISO-8859-7".into(), ISO_8859_7_NAME),
252 ("ELOT_928".into(), ISO_8859_7_NAME),
253 ("ECMA-118".into(), ISO_8859_7_NAME),
254 ("GREEK".into(), ISO_8859_7_NAME),
255 ("GREEK8".into(), ISO_8859_7_NAME),
256 ("ISOLATINGREEK".into(), ISO_8859_7_NAME),
257 ("ISO-IR-138".into(), ISO_8859_8_NAME),
258 ("ISO_8859-8".into(), ISO_8859_8_NAME),
259 ("ISO-8859-8".into(), ISO_8859_8_NAME),
260 ("HEBREW".into(), ISO_8859_8_NAME),
261 ("ISOLATINHEBREW".into(), ISO_8859_8_NAME),
262 ("ISO-IR-148".into(), ISO_8859_9_NAME),
263 ("ISO_8859-9".into(), ISO_8859_9_NAME),
264 ("ISO-8859-9".into(), ISO_8859_9_NAME),
265 ("LATIN5".into(), ISO_8859_9_NAME),
266 ("L5".into(), ISO_8859_9_NAME),
267 ("ISOLATIN5".into(), ISO_8859_9_NAME),
268 ("ISO-IR-157".into(), ISO_8859_10_NAME),
269 ("L6".into(), ISO_8859_10_NAME),
270 ("ISO_8859-10:1992".into(), ISO_8859_10_NAME),
271 ("ISOLATIN6".into(), ISO_8859_10_NAME),
272 ("LATIN6".into(), ISO_8859_10_NAME),
273 ("TIS620".into(), ISO_8859_11_NAME),
274 ("ISO-8859-11".into(), ISO_8859_11_NAME),
275 ("ISO885913".into(), ISO_8859_13_NAME),
276 ("ISO-IR-199".into(), ISO_8859_14_NAME),
277 ("ISO_8859-14:1998".into(), ISO_8859_14_NAME),
278 ("ISO_8859-14".into(), ISO_8859_14_NAME),
279 ("LATIN8".into(), ISO_8859_14_NAME),
280 ("ISO-CELTIC".into(), ISO_8859_14_NAME),
281 ("L8".into(), ISO_8859_14_NAME),
282 ("ISO885914".into(), ISO_8859_14_NAME),
283 ("ISO_8859-15".into(), ISO_8859_15_NAME),
284 ("LATIN-9".into(), ISO_8859_15_NAME),
285 ("ISO885915".into(), ISO_8859_15_NAME),
286 ("ISO-IR-226".into(), ISO_8859_16_NAME),
287 ("ISO_8859-16:2001".into(), ISO_8859_16_NAME),
288 ("ISO_8859-16".into(), ISO_8859_16_NAME),
289 ("LATIN10".into(), ISO_8859_16_NAME),
290 ("L10".into(), ISO_8859_16_NAME),
291 ("ISO885916".into(), ISO_8859_16_NAME),
292 ("UTF32".into(), UTF32_NAME),
293 ("UTF32BE".into(), UTF32BE_NAME),
294 ("UTF32LE".into(), UTF32LE_NAME),
295 ("MS_KANJI".into(), SHIFT_JIS_NAME),
296 ("SHIFTJIS".into(), SHIFT_JIS_NAME),
297 ("ISO-IR-6".into(), US_ASCII_NAME),
298 ("ANSI_X3.4-1968".into(), US_ASCII_NAME),
299 ("ANSI_X3.4-1986".into(), US_ASCII_NAME),
300 ("ISO_646.IRV:1991".into(), US_ASCII_NAME),
301 ("ISO646-US".into(), US_ASCII_NAME),
302 ("US-ASCII".into(), US_ASCII_NAME),
303 ("US".into(), US_ASCII_NAME),
304 ("IBM367".into(), US_ASCII_NAME),
305 ("CP367".into(), US_ASCII_NAME),
306 ("ASCII".into(), US_ASCII_NAME),
307 ("CP037".into(), IBM037),
308 ("EBCDIC-CP-US".into(), IBM037),
309 ("EBCDIC-CP-CA".into(), IBM037),
310 ("EBCDIC-CP-WT".into(), IBM037),
311 ("EBCDIC-CP-NL".into(), IBM037),
312 ("CP273".into(), IBM273),
313 ("EBCDIC-BE".into(), IBM274),
314 ("CP274".into(), IBM274),
315 ("EBCDIC-BR".into(), IBM275),
316 ("CP275".into(), IBM275),
317 ("EBCDIC-CP-DK".into(), IBM277),
318 ("EBCDIC-CP-NO".into(), IBM277),
319 ("CP278".into(), IBM278),
320 ("EBCDIC-CP-FI".into(), IBM278),
321 ("EBCDIC-CP-SE".into(), IBM278),
322 ("CP280".into(), IBM280),
323 ("EBCDIC-CP-IT".into(), IBM280),
324 ("CP284".into(), IBM284),
325 ("EBCDIC-CP-ES".into(), IBM284),
326 ("CP285".into(), IBM285),
327 ("EBCDIC-CP-GB".into(), IBM285),
328 ("CP290".into(), IBM290),
329 ("EBCDIC-JP-KANA".into(), IBM290),
330 ("CP297".into(), IBM297),
331 ("EBCDIC-CP-FR".into(), IBM297),
332 ("CP420".into(), IBM420),
333 ("EBCDIC-CP-AR1".into(), IBM420),
334 ("CP423".into(), IBM423),
335 ("EBCDIC-CP-GR".into(), IBM423),
336 ("CP424".into(), IBM424),
337 ("EBCDIC-CP-HE".into(), IBM424),
338 ("CP437".into(), IBM437),
339 ("437".into(), IBM437),
340 ("PC8CODEPAGE437".into(), IBM437),
341 ("CP500".into(), IBM500),
342 ("EBCDIC-CP-BE".into(), IBM500),
343 ("EBCDIC-CP-CH".into(), IBM500),
344 ("CP851".into(), IBM851),
345 ("851".into(), IBM851),
346 ("CP852".into(), IBM852),
347 ("852".into(), IBM852),
348 ("PCP852".into(), IBM852),
349 ("CP855".into(), IBM855),
350 ("855".into(), IBM855),
351 ("CP857".into(), IBM857),
352 ("857".into(), IBM857),
353 ("CP860".into(), IBM860),
354 ("860".into(), IBM860),
355 ("CP861".into(), IBM861),
356 ("861".into(), IBM861),
357 ("CP-IS".into(), IBM861),
358 ("CP863".into(), IBM863),
359 ("863".into(), IBM863),
360 ("CP864".into(), IBM864),
361 ("CP865".into(), IBM865),
362 ("865".into(), IBM865),
363 ("CP868".into(), IBM868),
364 ("CP-AR".into(), IBM868),
365 ("CP869".into(), IBM869),
366 ("869".into(), IBM869),
367 ("CP-GR".into(), IBM869),
368 ("CP870".into(), IBM870),
369 ("EBCDIC-CP-ROECE".into(), IBM870),
370 ("EBCDIC-CP-YU".into(), IBM870),
371 ("CP871".into(), IBM871),
372 ("EBCDIC-CP-IS".into(), IBM871),
373 ("CP880".into(), IBM880),
374 ("EBCDIC-CYRILLIC".into(), IBM880),
375 ("CP891".into(), IBM891),
376 ("CP903".into(), IBM903),
377 ("CP904".into(), IBM904),
378 ("904".into(), IBM904),
379 ("IBBM904".into(), IBM904),
382 ("CP905".into(), IBM905),
383 ("EBCDIC-CP-TR".into(), IBM905),
384 ("CP918".into(), IBM918),
385 ("EBCDIC-CP-AR2".into(), IBM918),
386 ("CP1026".into(), IBM1026),
387 ]))
388 });
389pub fn register_encoding_alias(alias: &'static str, real: &'static str) -> Option<&'static str> {
402 let mut table = ENCODING_ALIASES.write().unwrap();
403 if alias.chars().all(|c| c.is_ascii_uppercase()) {
404 table.insert(alias.into(), real)
405 } else {
406 table.insert(alias.to_ascii_uppercase().into(), real)
407 }
408}
409pub fn unregister_encoding_alias(alias: &'static str) -> Option<&'static str> {
412 ENCODING_ALIASES
413 .write()
414 .unwrap()
415 .remove(alias.to_ascii_uppercase().as_str())
416}
417pub fn resolve_encoding_alias(alias: &str) -> Option<&'static str> {
422 let aliases = ENCODING_ALIASES.read().unwrap();
423 aliases
424 .get(alias)
425 .or_else(|| aliases.get(alias.to_ascii_uppercase().as_str()))
426 .copied()
427}
428
/// Function pointer that builds a new boxed [`Encoder`] each time it is called.
pub type EncoderFactory = fn() -> Box<dyn Encoder>;
430pub static ENCODER_TABLE: LazyLock<RwLock<BTreeMap<&'static str, EncoderFactory>>> =
431 LazyLock::new(|| {
432 let mut map = BTreeMap::<&'static str, EncoderFactory>::new();
433 map.insert(UTF8_NAME, || Box::new(UTF8Encoder));
434 map.insert(UTF16_NAME, || Box::new(UTF16Encoder::default()));
435 map.insert(UTF16BE_NAME, || Box::new(UTF16BEEncoder));
436 map.insert(UTF16LE_NAME, || Box::new(UTF16LEEncoder));
437 map.insert(ISO_8859_1_NAME, || Box::new(ISO8859_1Encoder));
438 map.insert(ISO_8859_2_NAME, || Box::new(ISO8859_2Encoder));
439 map.insert(ISO_8859_3_NAME, || Box::new(ISO8859_3Encoder));
440 map.insert(ISO_8859_4_NAME, || Box::new(ISO8859_4Encoder));
441 map.insert(ISO_8859_5_NAME, || Box::new(ISO8859_5Encoder));
442 map.insert(ISO_8859_6_NAME, || Box::new(ISO8859_6Encoder));
443 map.insert(ISO_8859_7_NAME, || Box::new(ISO8859_7Encoder));
444 map.insert(ISO_8859_8_NAME, || Box::new(ISO8859_8Encoder));
445 map.insert(ISO_8859_9_NAME, || Box::new(ISO8859_9Encoder));
446 map.insert(ISO_8859_10_NAME, || Box::new(ISO8859_10Encoder));
447 map.insert(ISO_8859_11_NAME, || Box::new(ISO8859_11Encoder));
448 map.insert(ISO_8859_13_NAME, || Box::new(ISO8859_13Encoder));
449 map.insert(ISO_8859_14_NAME, || Box::new(ISO8859_14Encoder));
450 map.insert(ISO_8859_15_NAME, || Box::new(ISO8859_15Encoder));
451 map.insert(ISO_8859_16_NAME, || Box::new(ISO8859_16Encoder));
452 map.insert(UTF32_NAME, || Box::new(UTF32Encoder::default()));
453 map.insert(UTF32BE_NAME, || Box::new(UTF32BEEncoder));
454 map.insert(UTF32LE_NAME, || Box::new(UTF32LEEncoder));
455 map.insert(SHIFT_JIS_NAME, || Box::new(ShiftJISEncoder));
456 map.insert(US_ASCII_NAME, || Box::new(USASCIIEncoder));
457 RwLock::new(map)
458 });
459pub fn find_encoder(encoding_name: &str) -> Option<Box<dyn Encoder>> {
460 let table = ENCODER_TABLE.read().unwrap();
461 if let Some(factory) = table.get(encoding_name) {
462 return Some(factory());
463 }
464 if let Some(factory) = table.get(encoding_name.to_ascii_uppercase().as_str()) {
465 return Some(factory());
466 }
467
468 let alias = resolve_encoding_alias(encoding_name)?;
469 table.get(alias).map(|f| f())
470}
471pub fn register_encoder(
472 encoding_name: &'static str,
473 factory: EncoderFactory,
474) -> Option<EncoderFactory> {
475 ENCODER_TABLE
476 .write()
477 .unwrap()
478 .insert(encoding_name, factory)
479}
480pub fn unregister_encoder(encoding_name: &str) -> Option<EncoderFactory> {
481 ENCODER_TABLE.write().unwrap().remove(encoding_name)
482}
483
/// Function pointer that builds a new boxed [`Decoder`] each time it is called.
pub type DecoderFactory = fn() -> Box<dyn Decoder>;
485pub static DECODER_TABLE: LazyLock<RwLock<BTreeMap<&'static str, DecoderFactory>>> =
486 LazyLock::new(|| {
487 let mut map = BTreeMap::<&'static str, DecoderFactory>::new();
488 map.insert(UTF8_NAME, || Box::new(UTF8Decoder));
489 map.insert(UTF16_NAME, || Box::new(UTF16Decoder::default()));
490 map.insert(UTF16BE_NAME, || Box::new(UTF16BEDecoder));
491 map.insert(UTF16LE_NAME, || Box::new(UTF16LEDecoder));
492 map.insert(ISO_8859_1_NAME, || Box::new(ISO8859_1Decoder));
493 map.insert(ISO_8859_2_NAME, || Box::new(ISO8859_2Decoder));
494 map.insert(ISO_8859_3_NAME, || Box::new(ISO8859_3Decoder));
495 map.insert(ISO_8859_4_NAME, || Box::new(ISO8859_4Decoder));
496 map.insert(ISO_8859_5_NAME, || Box::new(ISO8859_5Decoder));
497 map.insert(ISO_8859_6_NAME, || Box::new(ISO8859_6Decoder));
498 map.insert(ISO_8859_7_NAME, || Box::new(ISO8859_7Decoder));
499 map.insert(ISO_8859_8_NAME, || Box::new(ISO8859_8Decoder));
500 map.insert(ISO_8859_9_NAME, || Box::new(ISO8859_9Decoder));
501 map.insert(ISO_8859_10_NAME, || Box::new(ISO8859_10Decoder));
502 map.insert(ISO_8859_11_NAME, || Box::new(ISO8859_11Decoder));
503 map.insert(ISO_8859_13_NAME, || Box::new(ISO8859_13Decoder));
504 map.insert(ISO_8859_14_NAME, || Box::new(ISO8859_14Decoder));
505 map.insert(ISO_8859_15_NAME, || Box::new(ISO8859_15Decoder));
506 map.insert(ISO_8859_16_NAME, || Box::new(ISO8859_16Decoder));
507 map.insert(UTF32_NAME, || Box::new(UTF32Decoder::default()));
508 map.insert(UTF32BE_NAME, || Box::new(UTF32BEDecoder));
509 map.insert(UTF32LE_NAME, || Box::new(UTF32LEDecoder));
510 map.insert(SHIFT_JIS_NAME, || Box::new(ShiftJISDecoder));
511 map.insert(US_ASCII_NAME, || Box::new(USASCIIDecoder));
512 RwLock::new(map)
513 });
514pub fn find_decoder(encoding_name: &str) -> Option<Box<dyn Decoder>> {
515 let table = DECODER_TABLE.read().unwrap();
516 if let Some(factory) = table.get(encoding_name) {
517 return Some(factory());
518 }
519 if let Some(factory) = table.get(encoding_name.to_ascii_uppercase().as_str()) {
520 return Some(factory());
521 }
522
523 let alias = resolve_encoding_alias(encoding_name)?;
524 table.get(alias).map(|f| f())
525}
526pub fn register_decoder(
527 encoding_name: &'static str,
528 factory: DecoderFactory,
529) -> Option<DecoderFactory> {
530 DECODER_TABLE
531 .write()
532 .unwrap()
533 .insert(encoding_name, factory)
534}
535pub fn unregister_decoder(encoding_name: &str) -> Option<DecoderFactory> {
536 DECODER_TABLE.write().unwrap().remove(encoding_name)
537}