1use std::borrow::Cow;
15
16#[must_use]
20pub fn decode_dvb_string(bytes: &[u8]) -> String {
21 if bytes.is_empty() {
22 return String::new();
23 }
24
25 let (charset, body) = split_charset(bytes);
26 let decoded = match charset {
27 Charset::Iso6937 => decode_iso_6937(body),
28 Charset::Iso8859(n) => decode_iso_8859(n, body),
29 Charset::Utf8 => String::from_utf8_lossy(body).into_owned(),
30 Charset::Ucs2Be => decode_ucs2_be(body),
31 Charset::Unsupported(_indicator) => body.iter().map(|_| '\u{FFFD}').collect(),
32 };
33
34 decoded
38 .chars()
39 .filter_map(|c| match c as u32 {
40 0x86 | 0x87 => None,
41 0x8A => Some(' '),
42 0x0A => Some(' '),
43 code if code < 0x20 => None,
44 code if (0x80..0xA0).contains(&code) => None,
45 _ => Some(c),
46 })
47 .collect()
48}
49
50#[must_use]
53pub fn decode(bytes: &[u8]) -> Cow<'_, str> {
54 if bytes.iter().all(|&b| b.is_ascii() && b >= 0x20) {
55 return Cow::Borrowed(std::str::from_utf8(bytes).unwrap_or(""));
56 }
57 Cow::Owned(decode_dvb_string(bytes))
58}
59
/// Character table selected by the leading byte(s) of a DVB text field, as
/// produced by `split_charset`.
#[derive(Debug)]
enum Charset {
    /// Default table, decoded by `decode_iso_6937`. Chosen when the first byte
    /// is 0x20 or above (no selector present) or is 0x00.
    Iso6937,
    /// ISO/IEC 8859 table; the payload is the part number passed to
    /// `decode_iso_8859`.
    Iso8859(u8),
    /// UTF-8 body (selector 0x15).
    Utf8,
    /// Body of big-endian 16-bit code units (selector 0x11).
    Ucs2Be,
    /// Selector with no decoder here; the payload is the selector byte.
    Unsupported(u8),
}
68
69fn split_charset(bytes: &[u8]) -> (Charset, &[u8]) {
70 match bytes[0] {
71 b if b >= 0x20 => (Charset::Iso6937, bytes),
72 0x00 => (Charset::Iso6937, &bytes[1..]),
73 0x08 => (Charset::Unsupported(0x08), &bytes[1..]),
76 0x01..=0x0B => (Charset::Iso8859(bytes[0] + 4), &bytes[1..]),
77 0x10 if bytes.len() >= 3 && bytes[1] == 0x00 => {
78 (Charset::Iso8859(bytes[2]), &bytes[3..])
79 }
80 0x11 => (Charset::Ucs2Be, &bytes[1..]),
81 0x15 => (Charset::Utf8, &bytes[1..]),
82 other => (Charset::Unsupported(other), &bytes[1..]),
83 }
84}
85
86fn decode_iso_6937(bytes: &[u8]) -> String {
87 let mut out = String::with_capacity(bytes.len());
88 let mut i = 0;
89 while i < bytes.len() {
90 let b = bytes[i];
91 if (0xC0..=0xCF).contains(&b) {
93 match combining_mark(b) {
94 Some(mark) if i + 1 < bytes.len() => {
95 let base = bytes[i + 1];
96 if let Some(c) = combine(b, base) {
97 out.push(c);
98 } else {
99 out.push(iso_6937_single(base));
102 out.push(mark);
103 }
104 i += 2;
105 }
106 _ => {
108 out.push('\u{FFFD}');
109 i += 1;
110 }
111 }
112 continue;
113 }
114 out.push(iso_6937_single(b));
115 i += 1;
116 }
117 out
118}
119
/// Maps one stand-alone byte of the default (ISO 6937 based) table to a `char`.
///
/// * 0x00..=0x7F map to the same code point.
/// * 0x86, 0x87 and 0x8A are passed through as U+0086, U+0087 and U+008A;
///   every other byte in 0x80..=0x9F becomes U+FFFD.
/// * 0xA0..=0xFF are looked up in the table below; unassigned positions and
///   the diacritic column 0xC0..=0xCF yield U+FFFD.
fn iso_6937_single(b: u8) -> char {
    const UNDEFINED: char = '\u{FFFD}';

    // One row per high nibble, indexed by `b - 0xA0`.
    #[rustfmt::skip]
    const GR_AREA: [char; 0x60] = [
        // 0xA0..=0xAF
        '\u{00A0}', '¡', '¢', '£', '\u{20AC}', '¥', UNDEFINED, '§',
        '\u{00A4}', '\u{2018}', '\u{201C}', '«', '\u{2190}', '\u{2191}', '\u{2192}', '\u{2193}',
        // 0xB0..=0xBF
        '°', '±', '²', '³', '\u{00D7}', 'µ', '¶', '·',
        '\u{00F7}', '\u{2019}', '\u{201D}', '»', '¼', '½', '¾', '¿',
        // 0xC0..=0xCF (non-spacing diacritics, never valid on their own)
        UNDEFINED, UNDEFINED, UNDEFINED, UNDEFINED, UNDEFINED, UNDEFINED, UNDEFINED, UNDEFINED,
        UNDEFINED, UNDEFINED, UNDEFINED, UNDEFINED, UNDEFINED, UNDEFINED, UNDEFINED, UNDEFINED,
        // 0xD0..=0xDF
        '\u{2015}', '¹', '®', '©', '\u{2122}', '\u{266A}', '¬', '\u{00A6}',
        UNDEFINED, UNDEFINED, UNDEFINED, UNDEFINED, '\u{215B}', '\u{215C}', '\u{215D}', '\u{215E}',
        // 0xE0..=0xEF
        '\u{2126}', 'Æ', '\u{0110}', 'ª', '\u{0126}', UNDEFINED, '\u{0132}', '\u{013F}',
        '\u{0141}', 'Ø', '\u{0152}', 'º', 'Þ', '\u{0166}', '\u{014A}', '\u{0149}',
        // 0xF0..=0xFF
        '\u{0138}', 'æ', '\u{0111}', 'ð', '\u{0127}', '\u{0131}', '\u{0133}', '\u{0140}',
        '\u{0142}', 'ø', '\u{0153}', 'ß', '\u{00FE}', '\u{0167}', '\u{014B}', '\u{00AD}',
    ];

    match b {
        0x00..=0x7F | 0x86 | 0x87 | 0x8A => char::from(b),
        0x80..=0x9F => UNDEFINED,
        _ => GR_AREA[usize::from(b - 0xA0)],
    }
}
214
/// Returns the Unicode combining mark for an ISO 6937 diacritic byte.
///
/// Only bytes in 0xC0..=0xCF can have a mark; 0xC0, 0xC9 and 0xCC are
/// unassigned. Every unassigned or out-of-range byte returns `None`.
fn combining_mark(prefix: u8) -> Option<char> {
    // Indexed by `prefix - 0xC0`.
    const MARKS: [Option<char>; 16] = [
        None,             // 0xC0
        Some('\u{0300}'), // 0xC1
        Some('\u{0301}'), // 0xC2
        Some('\u{0302}'), // 0xC3
        Some('\u{0303}'), // 0xC4
        Some('\u{0304}'), // 0xC5
        Some('\u{0306}'), // 0xC6
        Some('\u{0307}'), // 0xC7
        Some('\u{0308}'), // 0xC8
        None,             // 0xC9
        Some('\u{030A}'), // 0xCA
        Some('\u{0327}'), // 0xCB
        None,             // 0xCC
        Some('\u{030B}'), // 0xCD
        Some('\u{0328}'), // 0xCE
        Some('\u{030C}'), // 0xCF
    ];

    let offset = prefix.checked_sub(0xC0)?;
    MARKS.get(usize::from(offset)).copied().flatten()
}
235
/// Returns the precomposed character for an ISO 6937 diacritic byte applied to
/// an ASCII base letter, or `None` when this table has no entry for the pair.
#[rustfmt::skip]
fn combine(prefix: u8, base: u8) -> Option<char> {
    let precomposed = match prefix {
        // Grave
        0xC1 => match base {
            b'A' => 'À', b'E' => 'È', b'I' => 'Ì', b'O' => 'Ò', b'U' => 'Ù',
            b'a' => 'à', b'e' => 'è', b'i' => 'ì', b'o' => 'ò', b'u' => 'ù',
            _ => return None,
        },
        // Acute
        0xC2 => match base {
            b'A' => 'Á', b'E' => 'É', b'I' => 'Í', b'O' => 'Ó', b'U' => 'Ú', b'Y' => 'Ý',
            b'a' => 'á', b'e' => 'é', b'i' => 'í', b'o' => 'ó', b'u' => 'ú', b'y' => 'ý',
            b'C' => 'Ć', b'c' => 'ć', b'L' => 'Ĺ', b'l' => 'ĺ', b'N' => 'Ń', b'n' => 'ń',
            b'R' => 'Ŕ', b'r' => 'ŕ', b'S' => 'Ś', b's' => 'ś', b'Z' => 'Ź', b'z' => 'ź',
            _ => return None,
        },
        // Circumflex
        0xC3 => match base {
            b'A' => 'Â', b'E' => 'Ê', b'I' => 'Î', b'O' => 'Ô', b'U' => 'Û',
            b'a' => 'â', b'e' => 'ê', b'i' => 'î', b'o' => 'ô', b'u' => 'û',
            _ => return None,
        },
        // Tilde
        0xC4 => match base {
            b'A' => 'Ã', b'N' => 'Ñ', b'O' => 'Õ', b'I' => 'Ĩ', b'U' => 'Ũ',
            b'a' => 'ã', b'n' => 'ñ', b'o' => 'õ', b'i' => 'ĩ', b'u' => 'ũ',
            _ => return None,
        },
        // Macron
        0xC5 => match base {
            b'A' => 'Ā', b'E' => 'Ē', b'I' => 'Ī', b'O' => 'Ō', b'U' => 'Ū',
            b'a' => 'ā', b'e' => 'ē', b'i' => 'ī', b'o' => 'ō', b'u' => 'ū',
            _ => return None,
        },
        // Breve
        0xC6 => match base {
            b'A' => 'Ă', b'G' => 'Ğ', b'U' => 'Ŭ',
            b'a' => 'ă', b'g' => 'ğ', b'u' => 'ŭ',
            _ => return None,
        },
        // Dot above
        0xC7 => match base {
            b'C' => 'Ċ', b'E' => 'Ė', b'G' => 'Ġ', b'I' => 'İ', b'Z' => 'Ż',
            b'c' => 'ċ', b'e' => 'ė', b'g' => 'ġ', b'z' => 'ż',
            _ => return None,
        },
        // Diaeresis
        0xC8 => match base {
            b'A' => 'Ä', b'E' => 'Ë', b'I' => 'Ï', b'O' => 'Ö', b'U' => 'Ü', b'Y' => 'Ÿ',
            b'a' => 'ä', b'e' => 'ë', b'i' => 'ï', b'o' => 'ö', b'u' => 'ü', b'y' => 'ÿ',
            _ => return None,
        },
        // Ring above
        0xCA => match base {
            b'A' => 'Å', b'U' => 'Ů',
            b'a' => 'å', b'u' => 'ů',
            _ => return None,
        },
        // Cedilla
        0xCB => match base {
            b'C' => 'Ç', b'G' => 'Ģ', b'K' => 'Ķ', b'L' => 'Ļ',
            b'N' => 'Ņ', b'R' => 'Ŗ', b'S' => 'Ş', b'T' => 'Ţ',
            b'c' => 'ç', b'g' => 'ģ', b'k' => 'ķ', b'l' => 'ļ',
            b'n' => 'ņ', b'r' => 'ŗ', b's' => 'ş', b't' => 'ţ',
            _ => return None,
        },
        // Double acute
        0xCD => match base {
            b'O' => 'Ő', b'U' => 'Ű',
            b'o' => 'ő', b'u' => 'ű',
            _ => return None,
        },
        // Ogonek
        0xCE => match base {
            b'A' => 'Ą', b'E' => 'Ę', b'I' => 'Į', b'U' => 'Ų',
            b'a' => 'ą', b'e' => 'ę', b'i' => 'į', b'u' => 'ų',
            _ => return None,
        },
        // Caron
        0xCF => match base {
            b'C' => 'Č', b'D' => 'Ď', b'E' => 'Ě', b'L' => 'Ľ', b'N' => 'Ň',
            b'R' => 'Ř', b'S' => 'Š', b'T' => 'Ť', b'Z' => 'Ž',
            b'c' => 'č', b'd' => 'ď', b'e' => 'ě', b'l' => 'ľ', b'n' => 'ň',
            b'r' => 'ř', b's' => 'š', b't' => 'ť', b'z' => 'ž',
            _ => return None,
        },
        _ => return None,
    };
    Some(precomposed)
}
300
301fn decode_iso_8859(n: u8, bytes: &[u8]) -> String {
302 use encoding_rs::*;
303 let encoding: &'static Encoding = match n {
304 2 => ISO_8859_2,
305 3 => ISO_8859_3,
306 4 => ISO_8859_4,
307 5 => ISO_8859_5,
308 6 => ISO_8859_6,
309 7 => ISO_8859_7,
310 8 => ISO_8859_8,
311 9 => WINDOWS_1254,
312 10 => ISO_8859_10,
313 11 => WINDOWS_874,
314 13 => ISO_8859_13,
315 14 => ISO_8859_14,
316 15 => ISO_8859_15,
317 _ => return bytes.iter().map(|&b| b as char).collect(),
318 };
319 let (cow, _, _) = encoding.decode(bytes);
320 cow.into_owned()
321}
322
/// Decodes a body of big-endian 16-bit code units.
///
/// Bytes are consumed two at a time; a trailing odd byte is ignored. The code
/// units are then converted with `String::from_utf16_lossy`, so an unpaired
/// surrogate becomes U+FFFD.
fn decode_ucs2_be(bytes: &[u8]) -> String {
    let mut units: Vec<u16> = Vec::with_capacity(bytes.len() / 2);
    for pair in bytes.chunks_exact(2) {
        let high = u16::from(pair[0]);
        let low = u16::from(pair[1]);
        units.push((high << 8) | low);
    }
    String::from_utf16_lossy(&units)
}
330
#[cfg(test)]
mod tests {
    use super::*;

    /// Decodes `payload` through the default table by prefixing the 0x00 selector.
    fn decode_default_table(payload: &[u8]) -> String {
        let mut field = vec![0x00];
        field.extend_from_slice(payload);
        decode_dvb_string(&field)
    }

    #[test]
    fn decode_empty_input_returns_empty_string() {
        let decoded = decode_dvb_string(b"");
        assert_eq!(decoded, "");
    }

    #[test]
    fn decode_plain_ascii_is_borrowed() {
        let decoded = decode(b"HELLO");
        let is_borrowed = matches!(decoded, Cow::Borrowed(_));
        assert!(is_borrowed);
        assert_eq!(decoded, "HELLO");
    }

    #[test]
    fn decode_iso6937_latin_accent_chars() {
        assert_eq!(decode_default_table(&[0xC2, b'A']), "Á");
        assert_eq!(decode_default_table(&[0xC1, b'e']), "è");
        assert_eq!(decode_default_table(&[0xC8, b'o']), "ö");
    }

    #[test]
    fn decode_selector_0x01_yields_iso8859_5_cyrillic() {
        let field = [0x01, 0xB0, 0xB1];
        let s = decode_dvb_string(&field);
        assert!(!s.contains('\u{FFFD}'), "got: {s:?}");
        assert!(!s.is_empty());
    }

    #[test]
    fn decode_selector_0x10_extended_yields_iso8859_nn() {
        // Three-byte selector 0x10 0x00 0x09 followed by plain ASCII.
        let field = [0x10, 0x00, 0x09, b'A', b'B'];
        assert_eq!(decode_dvb_string(&field), "AB");
    }

    #[test]
    fn decode_selector_0x11_ucs2_be() {
        let field = [0x11, 0x00, 0x41, 0x00, 0x42];
        assert_eq!(decode_dvb_string(&field), "AB");
    }

    #[test]
    fn decode_selector_0x15_utf8_passthrough() {
        let field = [0x15, 0xC3, 0xA9, 0xC3, 0xA9];
        assert_eq!(decode_dvb_string(&field), "éé");
    }

    #[test]
    fn decode_control_chars_stripped_linefeed_becomes_space() {
        let decoded = decode_dvb_string(b"A\x01B\nC");
        assert_eq!(decoded, "AB C");
    }

    #[test]
    fn emphasis_on_off_markers_stripped_per_annex_a2() {
        let decoded = decode_default_table(&[b'A', 0x86, b'B', 0x87, b'C']);
        assert_eq!(decoded, "ABC");
    }

    #[test]
    fn decode_annex_a2_crlf_0x8a_becomes_space() {
        let decoded = decode_default_table(&[b'A', 0x8A, b'B']);
        assert_eq!(decoded, "A B");
    }

    #[test]
    fn reserved_selector_0x08_is_unsupported() {
        // Two body bytes, so exactly two replacement characters.
        let decoded = decode_dvb_string(&[0x08, 0x41, 0x42]);
        assert_eq!(decoded, "\u{FFFD}\u{FFFD}");
    }

    #[test]
    fn unknown_selector_returns_replacement_characters() {
        // Three body bytes, so exactly three replacement characters.
        let decoded = decode_dvb_string(&[0x1F, 0xAA, 0xBB, 0xCC]);
        assert_eq!(decoded, "\u{FFFD}".repeat(3));
    }

    #[test]
    fn figure_a1_gr_area_single_byte_mappings() {
        #[rustfmt::skip]
        const PINS: &[(u8, char)] = &[
            (0xA0, '\u{00A0}'), (0xA1, '¡'), (0xA2, '¢'), (0xA3, '£'),
            (0xA4, '\u{20AC}'), (0xA5, '¥'), (0xA7, '§'),
            (0xA8, '\u{00A4}'), (0xA9, '\u{2018}'), (0xAA, '\u{201C}'), (0xAB, '«'),
            (0xAC, '\u{2190}'), (0xAD, '\u{2191}'), (0xAE, '\u{2192}'), (0xAF, '\u{2193}'),
            (0xB0, '°'), (0xB1, '±'), (0xB2, '²'), (0xB3, '³'),
            (0xB4, '\u{00D7}'), (0xB5, 'µ'), (0xB6, '¶'), (0xB7, '·'),
            (0xB8, '\u{00F7}'), (0xB9, '\u{2019}'), (0xBA, '\u{201D}'), (0xBB, '»'),
            (0xBC, '¼'), (0xBD, '½'), (0xBE, '¾'), (0xBF, '¿'),
            (0xD0, '\u{2015}'), (0xD1, '¹'), (0xD2, '®'), (0xD3, '©'),
            (0xD4, '\u{2122}'), (0xD5, '\u{266A}'), (0xD6, '¬'), (0xD7, '\u{00A6}'),
            (0xDC, '\u{215B}'), (0xDD, '\u{215C}'), (0xDE, '\u{215D}'), (0xDF, '\u{215E}'),
            (0xE0, '\u{2126}'), (0xE1, 'Æ'), (0xE2, '\u{0110}'), (0xE3, 'ª'),
            (0xE4, '\u{0126}'), (0xE6, '\u{0132}'), (0xE7, '\u{013F}'),
            (0xE8, '\u{0141}'), (0xE9, 'Ø'), (0xEA, '\u{0152}'), (0xEB, 'º'),
            (0xEC, 'Þ'), (0xED, '\u{0166}'), (0xEE, '\u{014A}'), (0xEF, '\u{0149}'),
            (0xF0, '\u{0138}'), (0xF1, 'æ'), (0xF2, '\u{0111}'), (0xF3, 'ð'),
            (0xF4, '\u{0127}'), (0xF5, '\u{0131}'), (0xF6, '\u{0133}'), (0xF7, '\u{0140}'),
            (0xF8, '\u{0142}'), (0xF9, 'ø'), (0xFA, '\u{0153}'), (0xFB, 'ß'),
            (0xFC, '\u{00FE}'), (0xFD, '\u{0167}'), (0xFE, '\u{014B}'), (0xFF, '\u{00AD}'),
        ];

        for &(byte, want) in PINS.iter() {
            let got = decode_default_table(&[byte]);
            assert_eq!(
                got,
                String::from(want),
                "byte {byte:#04x}: want {want:?} (U+{:04X}), got {got:?}",
                want as u32
            );
        }
    }

    #[test]
    fn figure_a1_undefined_positions_are_replacement() {
        const UNDEFINED: [u8; 6] = [0xA6, 0xD8, 0xD9, 0xDA, 0xDB, 0xE5];
        for &byte in UNDEFINED.iter() {
            let got = decode_default_table(&[byte]);
            assert_eq!(got, "\u{FFFD}", "byte {byte:#04x} should be U+FFFD");
        }
    }

    #[test]
    fn figure_a1_combining_precomposed() {
        // (diacritic byte, base letter, expected precomposed text)
        let cases: &[(u8, u8, &str)] = &[
            (0xCA, b'a', "å"),
            (0xCA, b'A', "Å"),
            (0xCF, b's', "š"),
            (0xCF, b'Z', "Ž"),
            (0xCE, b'e', "ę"),
            (0xCD, b'o', "ő"),
            (0xC7, b'z', "ż"),
            (0xC5, b'a', "ā"),
            (0xC6, b'g', "ğ"),
        ];
        for &(diacritic, base, expected) in cases {
            assert_eq!(decode_default_table(&[diacritic, base]), expected);
        }
    }

    #[test]
    fn figure_a1_combining_fallback_emits_base_plus_mark() {
        // No precomposed "x with macron" in the table: base first, then the mark.
        assert_eq!(decode_default_table(&[0xC5, b'x']), "x\u{0304}");
    }

    #[test]
    fn figure_a1_combining_undefined_or_dangling_prefix() {
        // Unassigned diacritic positions do not consume the following byte.
        for &unassigned in [0xC0u8, 0xC9, 0xCC].iter() {
            assert_eq!(decode_default_table(&[unassigned, b'a']), "\u{FFFD}a");
        }
        // A valid diacritic with nothing after it.
        assert_eq!(decode_default_table(&[0xC2]), "\u{FFFD}");
    }
}