dvb_si/text/
mod.rs

1//! DVB-SI text decoding — ETSI EN 300 468 Annex A.
2//!
3//! Covers the full Annex A Table A.3 selector set: the default Latin table
4//! (Figure A.1, an ISO 6937 superset — see `iso_6937_single`), ISO 8859-n
5//! (single-byte 0x01–0x0B and extended 0x10 forms), UCS-2 BE (0x11),
6//! KS X 1001 Korean (0x12, decoded as EUC-KR), GB-2312 Simplified Chinese
7//! (0x13, decoded via GBK which is a GB-2312 superset), Big5 Traditional
8//! Chinese (0x14), UTF-8 (0x15), and the 0x1F `encoding_type_id` escape
9//! (no ids are registered for broadcast use — yields U+FFFD). Reserved
10//! selectors (0x08, 0x0C–0x0F, 0x16–0x1E) yield U+FFFD per byte.
11//!
12//! Glyph mappings are pinned to EN 300 468 V1.19.1 (2025-02) Figure A.1
13//! "Character code table 00 - Latin alphabet with Unicode equivalents"
14//! (PDF p. 159, vendored at `specs/etsi_en_300_468_v01.19.01_dvb_si.pdf`;
15//! transcription in `dvb-si/docs/en_300_468.md`).
16
17use std::borrow::Cow;
18
19/// Decode a DVB text payload (e.g. short_event_descriptor event_name_char)
20/// into an owned UTF-8 `String`. The first byte may be a charset indicator
21/// per ETSI EN 300 468 Annex A Table A.3.
22#[must_use]
23pub fn decode_dvb_string(bytes: &[u8]) -> String {
24    if bytes.is_empty() {
25        return String::new();
26    }
27
28    let (charset, body) = split_charset(bytes);
29    let decoded = match charset {
30        Charset::Iso6937 => decode_iso_6937(body),
31        Charset::Iso8859(n) => decode_iso_8859(n, body),
32        Charset::Utf8 => String::from_utf8_lossy(body).into_owned(),
33        Charset::Ucs2Be => decode_ucs2_be(body),
34        Charset::Ksx1001 => decode_with(encoding_rs::EUC_KR, body),
35        Charset::Gb2312 => decode_with(encoding_rs::GBK, body),
36        Charset::Big5 => decode_with(encoding_rs::BIG5, body),
37        Charset::Unsupported(_indicator) => body.iter().map(|_| '\u{FFFD}').collect(),
38    };
39
40    // Annex A.1 control codes:
41    //   single-byte tables: 0x86 emphasis on, 0x87 emphasis off, 0x8A CR/LF
42    //   -> space; other C0/C1 controls are stripped.
43    //   two-byte tables (Table A.2): the same functions live at U+E086 /
44    //   U+E087 / U+E08A inside the ISO 10646 PUA; the rest of
45    //   U+E080..U+E09F is reserved for control functions and stripped.
46    decoded
47        .chars()
48        .filter_map(|c| match c as u32 {
49            0x86 | 0x87 | 0xE086 | 0xE087 => None,
50            0x8A | 0xE08A => Some(' '),
51            0x0A => Some(' '),
52            code if code < 0x20 => None,
53            code if (0x80..0xA0).contains(&code) => None,
54            code if (0xE080..0xE0A0).contains(&code) => None,
55            _ => Some(c),
56        })
57        .collect()
58}
59
60/// Convenience wrapper returning `Cow::Borrowed` for pure-ASCII input,
61/// `Cow::Owned` otherwise.
62#[must_use]
63pub fn decode(bytes: &[u8]) -> Cow<'_, str> {
64    if bytes.iter().all(|&b| b.is_ascii() && b >= 0x20) {
65        return Cow::Borrowed(std::str::from_utf8(bytes).unwrap_or(""));
66    }
67    Cow::Owned(decode_dvb_string(bytes))
68}
69
/// Character table selected by the leading byte(s) of a DVB text field, as
/// classified by `split_charset`.
#[derive(Debug)]
enum Charset {
    /// Default Latin table (Figure A.1); used when there is no selector
    /// (first byte >= 0x20) or the selector is 0x00.
    Iso6937,
    /// ISO/IEC 8859 part `n`. Comes either from a single-byte selector
    /// 0x01..=0x0B (part = selector + 4) or from the third byte of the
    /// extended `0x10 0x00 n` form.
    Iso8859(u8),
    /// UTF-8 (selector 0x15).
    Utf8,
    /// UCS-2 big-endian (selector 0x11).
    Ucs2Be,
    /// KS X 1001 (selector 0x12), decoded as EUC-KR.
    Ksx1001,
    /// GB-2312 (selector 0x13), decoded via GBK (a GB-2312 superset).
    Gb2312,
    /// Big5 (selector 0x14).
    Big5,
    /// Reserved or otherwise undecodable selector; carries the selector byte.
    /// The payload is rendered as one U+FFFD per byte.
    Unsupported(u8),
}
84
85fn split_charset(bytes: &[u8]) -> (Charset, &[u8]) {
86    match bytes[0] {
87        b if b >= 0x20 => (Charset::Iso6937, bytes),
88        0x00 => (Charset::Iso6937, &bytes[1..]),
89        // Table A.3: 0x01..=0x0B map to ISO 8859-5..-15, EXCEPT 0x08 which is
90        // "reserved for future use" (there is no ISO 8859-12).
91        0x08 => (Charset::Unsupported(0x08), &bytes[1..]),
92        0x01..=0x0B => (Charset::Iso8859(bytes[0] + 4), &bytes[1..]),
93        0x10 if bytes.len() >= 3 && bytes[1] == 0x00 => (Charset::Iso8859(bytes[2]), &bytes[3..]),
94        0x11 => (Charset::Ucs2Be, &bytes[1..]),
95        0x12 => (Charset::Ksx1001, &bytes[1..]),
96        0x13 => (Charset::Gb2312, &bytes[1..]),
97        0x14 => (Charset::Big5, &bytes[1..]),
98        0x15 => (Charset::Utf8, &bytes[1..]),
99        // 0x1F: an 8-bit encoding_type_id follows (Table A.4 area); no ids are
100        // registered for broadcast text — treat the body as undecodable.
101        0x1F if bytes.len() >= 2 => (Charset::Unsupported(0x1F), &bytes[2..]),
102        other => (Charset::Unsupported(other), &bytes[1..]),
103    }
104}
105
106fn decode_iso_6937(bytes: &[u8]) -> String {
107    let mut out = String::with_capacity(bytes.len());
108    let mut i = 0;
109    while i < bytes.len() {
110        let b = bytes[i];
111        // 0xC0..=0xCF is the Figure A.1 non-spacing (combining-prefix) row.
112        if (0xC0..=0xCF).contains(&b) {
113            match combining_mark(b) {
114                Some(mark) if i + 1 < bytes.len() => {
115                    let base = bytes[i + 1];
116                    if let Some(c) = combine(b, base) {
117                        out.push(c);
118                    } else {
119                        // No precomposed form — emit base + Unicode combining
120                        // mark, which is canonically equivalent.
121                        out.push(iso_6937_single(base));
122                        out.push(mark);
123                    }
124                    i += 2;
125                }
126                // Undefined prefix (0xC0/0xC9/0xCC) or dangling prefix at end.
127                _ => {
128                    out.push('\u{FFFD}');
129                    i += 1;
130                }
131            }
132            continue;
133        }
134        out.push(iso_6937_single(b));
135        i += 1;
136    }
137    out
138}
139
/// Decode a single (non-combining) byte of the default Latin table.
///
/// Source: ETSI EN 300 468 V1.19.1 (2025-02) Figure A.1 — "Character code
/// table 00 - Latin alphabet with Unicode equivalents" (PDF p. 159). Per the
/// note under the figure, the table is a superset of ISO/IEC 6937 with the
/// Euro symbol (U+20AC) added at position 0xA4. Grey (undefined) graphic
/// positions decode to U+FFFD.
///
/// Bytes 0x00..=0x9F are returned as the code point of the same value. For
/// the whole C1 range (0x80..=0x9F) this keeps the Annex A.1 control codes
/// intact so that the control-code filter in `decode_dvb_string` can act on
/// them: 0x86/0x87 (emphasis) and 0x8A (CR/LF) get their special handling,
/// and the reserved / user-defined positions are stripped exactly as they
/// are for the ISO 8859-n tables instead of surfacing as U+FFFD.
fn iso_6937_single(b: u8) -> char {
    match b {
        // ASCII plus C1 controls: identical code point (see doc comment).
        0x00..=0x9F => char::from(b),
        0xA0 => '\u{00A0}', // NBSP
        0xA1 => '¡',
        0xA2 => '¢',
        0xA3 => '£',
        0xA4 => '\u{20AC}', // € — DVB addition (note under Figure A.1)
        0xA5 => '¥',
        0xA6 => '\u{FFFD}', // undefined
        0xA7 => '§',
        0xA8 => '\u{00A4}', // ¤ general currency sign
        0xA9 => '\u{2018}', // ‘ left single quotation mark
        0xAA => '\u{201C}', // “ left double quotation mark
        0xAB => '«',
        0xAC => '\u{2190}', // ←
        0xAD => '\u{2191}', // ↑
        0xAE => '\u{2192}', // →
        0xAF => '\u{2193}', // ↓
        0xB0 => '°',
        0xB1 => '±',
        0xB2 => '²',
        0xB3 => '³',
        0xB4 => '\u{00D7}', // ×
        0xB5 => 'µ',
        0xB6 => '¶',
        0xB7 => '·',
        0xB8 => '\u{00F7}', // ÷
        0xB9 => '\u{2019}', // ’ right single quotation mark
        0xBA => '\u{201D}', // ” right double quotation mark
        0xBB => '»',
        0xBC => '¼',
        0xBD => '½',
        0xBE => '¾',
        0xBF => '¿',
        // Combining-prefix row; reached only for a dangling/undefined prefix.
        0xC0..=0xCF => '\u{FFFD}',
        0xD0 => '\u{2015}', // ― horizontal bar
        0xD1 => '¹',
        0xD2 => '®',
        0xD3 => '©',
        0xD4 => '\u{2122}', // ™
        0xD5 => '\u{266A}', // ♪ eighth note
        0xD6 => '¬',
        0xD7 => '\u{00A6}',        // ¦ broken bar
        0xD8..=0xDB => '\u{FFFD}', // undefined
        0xDC => '\u{215B}',        // ⅛
        0xDD => '\u{215C}',        // ⅜
        0xDE => '\u{215D}',        // ⅝
        0xDF => '\u{215E}',        // ⅞
        0xE0 => '\u{2126}',        // Ω OHM SIGN
        0xE1 => 'Æ',
        0xE2 => '\u{0110}', // Đ
        0xE3 => 'ª',
        0xE4 => '\u{0126}', // Ħ
        0xE5 => '\u{FFFD}', // undefined
        0xE6 => '\u{0132}', // Ĳ
        0xE7 => '\u{013F}', // Ŀ
        0xE8 => '\u{0141}', // Ł
        0xE9 => 'Ø',
        0xEA => '\u{0152}', // Œ
        0xEB => 'º',
        0xEC => 'Þ',
        0xED => '\u{0166}', // Ŧ
        0xEE => '\u{014A}', // Ŋ
        0xEF => '\u{0149}', // ŉ
        0xF0 => '\u{0138}', // ĸ
        0xF1 => 'æ',
        0xF2 => '\u{0111}', // đ
        0xF3 => 'ð',
        0xF4 => '\u{0127}', // ħ
        0xF5 => '\u{0131}', // ı dotless i
        0xF6 => '\u{0133}', // ĳ
        0xF7 => '\u{0140}', // ŀ
        0xF8 => '\u{0142}', // ł
        0xF9 => 'ø',
        0xFA => '\u{0153}', // œ
        0xFB => 'ß',
        0xFC => '\u{00FE}', // þ
        0xFD => '\u{0167}', // ŧ
        0xFE => '\u{014B}', // ŋ
        0xFF => '\u{00AD}', // SHY soft hyphen
    }
}
234
/// Returns the Unicode combining mark that corresponds to a Figure A.1
/// non-spacing prefix byte (row 0xC0..=0xCF).
///
/// Yields `None` for the undefined positions 0xC0, 0xC9 and 0xCC, and for
/// any byte outside the prefix row.
fn combining_mark(prefix: u8) -> Option<char> {
    // Indexed by `prefix - 0xC0`.
    const MARKS: [Option<char>; 16] = [
        None,             // 0xC0 undefined
        Some('\u{0300}'), // 0xC1 grave
        Some('\u{0301}'), // 0xC2 acute
        Some('\u{0302}'), // 0xC3 circumflex
        Some('\u{0303}'), // 0xC4 tilde
        Some('\u{0304}'), // 0xC5 macron
        Some('\u{0306}'), // 0xC6 breve
        Some('\u{0307}'), // 0xC7 dot above
        Some('\u{0308}'), // 0xC8 diaeresis
        None,             // 0xC9 undefined
        Some('\u{030A}'), // 0xCA ring above
        Some('\u{0327}'), // 0xCB cedilla
        None,             // 0xCC undefined
        Some('\u{030B}'), // 0xCD double acute
        Some('\u{0328}'), // 0xCE ogonek
        Some('\u{030C}'), // 0xCF caron
    ];

    let slot = prefix.checked_sub(0xC0)?;
    MARKS.get(usize::from(slot)).copied().flatten()
}
255
/// Returns the precomposed character for a Figure A.1 non-spacing `prefix`
/// byte (0xC1..=0xCF) applied to the ASCII letter `base`.
///
/// The table covers the accented letters of the ISO/IEC 6937 repertoire, so
/// those always decode to a single precomposed code point. `None` means no
/// entry exists; the caller then falls back to base + combining mark.
fn combine(prefix: u8, base: u8) -> Option<char> {
    Some(match (prefix, base) {
        // grave
        (0xC1, b'A') => 'À',
        (0xC1, b'E') => 'È',
        (0xC1, b'I') => 'Ì',
        (0xC1, b'O') => 'Ò',
        (0xC1, b'U') => 'Ù',
        (0xC1, b'a') => 'à',
        (0xC1, b'e') => 'è',
        (0xC1, b'i') => 'ì',
        (0xC1, b'o') => 'ò',
        (0xC1, b'u') => 'ù',
        // acute
        (0xC2, b'A') => 'Á',
        (0xC2, b'E') => 'É',
        (0xC2, b'I') => 'Í',
        (0xC2, b'O') => 'Ó',
        (0xC2, b'U') => 'Ú',
        (0xC2, b'Y') => 'Ý',
        (0xC2, b'a') => 'á',
        (0xC2, b'e') => 'é',
        (0xC2, b'i') => 'í',
        (0xC2, b'o') => 'ó',
        (0xC2, b'u') => 'ú',
        (0xC2, b'y') => 'ý',
        (0xC2, b'C') => 'Ć',
        (0xC2, b'c') => 'ć',
        (0xC2, b'g') => '\u{01F5}', // ǵ
        (0xC2, b'L') => 'Ĺ',
        (0xC2, b'l') => 'ĺ',
        (0xC2, b'N') => 'Ń',
        (0xC2, b'n') => 'ń',
        (0xC2, b'R') => 'Ŕ',
        (0xC2, b'r') => 'ŕ',
        (0xC2, b'S') => 'Ś',
        (0xC2, b's') => 'ś',
        (0xC2, b'Z') => 'Ź',
        (0xC2, b'z') => 'ź',
        // circumflex
        (0xC3, b'A') => 'Â',
        (0xC3, b'E') => 'Ê',
        (0xC3, b'I') => 'Î',
        (0xC3, b'O') => 'Ô',
        (0xC3, b'U') => 'Û',
        (0xC3, b'a') => 'â',
        (0xC3, b'e') => 'ê',
        (0xC3, b'i') => 'î',
        (0xC3, b'o') => 'ô',
        (0xC3, b'u') => 'û',
        (0xC3, b'C') => '\u{0108}', // Ĉ
        (0xC3, b'c') => '\u{0109}', // ĉ
        (0xC3, b'G') => '\u{011C}', // Ĝ
        (0xC3, b'g') => '\u{011D}', // ĝ
        (0xC3, b'H') => '\u{0124}', // Ĥ
        (0xC3, b'h') => '\u{0125}', // ĥ
        (0xC3, b'J') => '\u{0134}', // Ĵ
        (0xC3, b'j') => '\u{0135}', // ĵ
        (0xC3, b'S') => '\u{015C}', // Ŝ
        (0xC3, b's') => '\u{015D}', // ŝ
        (0xC3, b'W') => '\u{0174}', // Ŵ
        (0xC3, b'w') => '\u{0175}', // ŵ
        (0xC3, b'Y') => '\u{0176}', // Ŷ
        (0xC3, b'y') => '\u{0177}', // ŷ
        // tilde
        (0xC4, b'A') => 'Ã',
        (0xC4, b'N') => 'Ñ',
        (0xC4, b'O') => 'Õ',
        (0xC4, b'a') => 'ã',
        (0xC4, b'n') => 'ñ',
        (0xC4, b'o') => 'õ',
        (0xC4, b'I') => 'Ĩ',
        (0xC4, b'i') => 'ĩ',
        (0xC4, b'U') => 'Ũ',
        (0xC4, b'u') => 'ũ',
        // macron
        (0xC5, b'A') => 'Ā',
        (0xC5, b'a') => 'ā',
        (0xC5, b'E') => 'Ē',
        (0xC5, b'e') => 'ē',
        (0xC5, b'I') => 'Ī',
        (0xC5, b'i') => 'ī',
        (0xC5, b'O') => 'Ō',
        (0xC5, b'o') => 'ō',
        (0xC5, b'U') => 'Ū',
        (0xC5, b'u') => 'ū',
        // breve
        (0xC6, b'A') => 'Ă',
        (0xC6, b'a') => 'ă',
        (0xC6, b'G') => 'Ğ',
        (0xC6, b'g') => 'ğ',
        (0xC6, b'U') => 'Ŭ',
        (0xC6, b'u') => 'ŭ',
        // dot above
        (0xC7, b'C') => 'Ċ',
        (0xC7, b'c') => 'ċ',
        (0xC7, b'E') => 'Ė',
        (0xC7, b'e') => 'ė',
        (0xC7, b'G') => 'Ġ',
        (0xC7, b'g') => 'ġ',
        (0xC7, b'I') => 'İ',
        (0xC7, b'Z') => 'Ż',
        (0xC7, b'z') => 'ż',
        // diaeresis
        (0xC8, b'A') => 'Ä',
        (0xC8, b'E') => 'Ë',
        (0xC8, b'I') => 'Ï',
        (0xC8, b'O') => 'Ö',
        (0xC8, b'U') => 'Ü',
        (0xC8, b'Y') => 'Ÿ',
        (0xC8, b'a') => 'ä',
        (0xC8, b'e') => 'ë',
        (0xC8, b'i') => 'ï',
        (0xC8, b'o') => 'ö',
        (0xC8, b'u') => 'ü',
        (0xC8, b'y') => 'ÿ',
        // ring above
        (0xCA, b'A') => 'Å',
        (0xCA, b'a') => 'å',
        (0xCA, b'U') => 'Ů',
        (0xCA, b'u') => 'ů',
        // cedilla
        (0xCB, b'C') => 'Ç',
        (0xCB, b'c') => 'ç',
        (0xCB, b'G') => 'Ģ',
        (0xCB, b'g') => 'ģ',
        (0xCB, b'K') => 'Ķ',
        (0xCB, b'k') => 'ķ',
        (0xCB, b'L') => 'Ļ',
        (0xCB, b'l') => 'ļ',
        (0xCB, b'N') => 'Ņ',
        (0xCB, b'n') => 'ņ',
        (0xCB, b'R') => 'Ŗ',
        (0xCB, b'r') => 'ŗ',
        (0xCB, b'S') => 'Ş',
        (0xCB, b's') => 'ş',
        (0xCB, b'T') => 'Ţ',
        (0xCB, b't') => 'ţ',
        // double acute
        (0xCD, b'O') => 'Ő',
        (0xCD, b'o') => 'ő',
        (0xCD, b'U') => 'Ű',
        (0xCD, b'u') => 'ű',
        // ogonek
        (0xCE, b'A') => 'Ą',
        (0xCE, b'a') => 'ą',
        (0xCE, b'E') => 'Ę',
        (0xCE, b'e') => 'ę',
        (0xCE, b'I') => 'Į',
        (0xCE, b'i') => 'į',
        (0xCE, b'U') => 'Ų',
        (0xCE, b'u') => 'ų',
        // caron
        (0xCF, b'C') => 'Č',
        (0xCF, b'c') => 'č',
        (0xCF, b'D') => 'Ď',
        (0xCF, b'd') => 'ď',
        (0xCF, b'E') => 'Ě',
        (0xCF, b'e') => 'ě',
        (0xCF, b'L') => 'Ľ',
        (0xCF, b'l') => 'ľ',
        (0xCF, b'N') => 'Ň',
        (0xCF, b'n') => 'ň',
        (0xCF, b'R') => 'Ř',
        (0xCF, b'r') => 'ř',
        (0xCF, b'S') => 'Š',
        (0xCF, b's') => 'š',
        (0xCF, b'T') => 'Ť',
        (0xCF, b't') => 'ť',
        (0xCF, b'Z') => 'Ž',
        (0xCF, b'z') => 'ž',
        _ => return None,
    })
}
409
410fn decode_iso_8859(n: u8, bytes: &[u8]) -> String {
411    use encoding_rs::*;
412    let encoding: &'static Encoding = match n {
413        2 => ISO_8859_2,
414        3 => ISO_8859_3,
415        4 => ISO_8859_4,
416        5 => ISO_8859_5,
417        6 => ISO_8859_6,
418        7 => ISO_8859_7,
419        8 => ISO_8859_8,
420        9 => WINDOWS_1254,
421        10 => ISO_8859_10,
422        11 => WINDOWS_874,
423        13 => ISO_8859_13,
424        14 => ISO_8859_14,
425        15 => ISO_8859_15,
426        _ => return bytes.iter().map(|&b| b as char).collect(),
427    };
428    let (cow, _, _) = encoding.decode(bytes);
429    cow.into_owned()
430}
431
432fn decode_with(encoding: &'static encoding_rs::Encoding, bytes: &[u8]) -> String {
433    let (cow, _, _) = encoding.decode(bytes);
434    cow.into_owned()
435}
436
/// Decodes big-endian 16-bit code units into a `String`.
///
/// The units are interpreted as UTF-16, so a valid surrogate pair yields the
/// supplementary character and an unpaired surrogate yields U+FFFD. A
/// trailing odd byte (truncated code unit) also yields U+FFFD rather than
/// being dropped silently, matching how the other lossy decoders in this
/// module report truncated input.
fn decode_ucs2_be(bytes: &[u8]) -> String {
    let pairs = bytes.chunks_exact(2);
    let truncated = !pairs.remainder().is_empty();

    let code_units = pairs.map(|pair| u16::from_be_bytes([pair[0], pair[1]]));
    let mut out: String = std::char::decode_utf16(code_units)
        .map(|unit| unit.unwrap_or('\u{FFFD}'))
        .collect();

    if truncated {
        out.push('\u{FFFD}');
    }
    out
}
444
/// Unit tests for the DVB text decoder: selector handling, control-code
/// filtering and the Figure A.1 default-table mappings.
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input decodes to an empty string.
    #[test]
    fn decode_empty_input_returns_empty_string() {
        assert_eq!(decode_dvb_string(&[]), "");
    }

    /// Printable ASCII takes the borrowing fast path of `decode`.
    #[test]
    fn decode_plain_ascii_is_borrowed() {
        let cow = decode(b"HELLO");
        assert!(matches!(cow, Cow::Borrowed(_)));
        assert_eq!(cow, "HELLO");
    }

    /// Selector 0x00 picks the default table; prefix + base letter pairs
    /// decode to precomposed accented letters.
    #[test]
    fn decode_iso6937_latin_accent_chars() {
        assert_eq!(decode_dvb_string(&[0x00, 0xC2, b'A']), "Á");
        assert_eq!(decode_dvb_string(&[0x00, 0xC1, b'e']), "è");
        assert_eq!(decode_dvb_string(&[0x00, 0xC8, b'o']), "ö");
    }

    /// Selector 0x01 (ISO 8859-5) decodes GR bytes without replacement
    /// characters.
    #[test]
    fn decode_selector_0x01_yields_iso8859_5_cyrillic() {
        let s = decode_dvb_string(&[0x01, 0xB0, 0xB1]);
        assert!(s.chars().all(|c| c != '\u{FFFD}'), "got: {s:?}");
        assert!(!s.is_empty());
    }

    /// The extended `0x10 0x00 n` form consumes three selector bytes.
    #[test]
    fn decode_selector_0x10_extended_yields_iso8859_nn() {
        let s = decode_dvb_string(&[0x10, 0x00, 0x09, b'A', b'B']);
        assert_eq!(s, "AB");
    }

    /// Selector 0x11: big-endian 16-bit code units.
    #[test]
    fn decode_selector_0x11_ucs2_be() {
        let s = decode_dvb_string(&[0x11, 0x00, 0x41, 0x00, 0x42]);
        assert_eq!(s, "AB");
    }

    /// Selector 0x15: the payload is already UTF-8.
    #[test]
    fn decode_selector_0x15_utf8_passthrough() {
        let s = decode_dvb_string(&[0x15, 0xC3, 0xA9, 0xC3, 0xA9]);
        assert_eq!(s, "éé");
    }

    /// C0 controls are dropped, except line feed which becomes a space.
    #[test]
    fn decode_control_chars_stripped_linefeed_becomes_space() {
        let s = decode_dvb_string(b"A\x01B\nC");
        assert_eq!(s, "AB C");
    }

    /// Emphasis on/off control codes never reach the output.
    #[test]
    fn emphasis_on_off_markers_stripped_per_annex_a2() {
        // 0x86 and 0x87 are the emphasis on/off control codes of Annex A.1 —
        // not representable in plain text, strip silently.
        let s = decode_dvb_string(&[0x00, b'A', 0x86, b'B', 0x87, b'C']);
        assert_eq!(s, "ABC");
    }

    /// The single-byte CR/LF control code is rendered as a space.
    #[test]
    fn decode_annex_a2_crlf_0x8a_becomes_space() {
        // 0x8A is the CR/LF control code of Annex A.1 — render as space.
        let s = decode_dvb_string(&[0x00, b'A', 0x8A, b'B']);
        assert_eq!(s, "A B");
    }

    /// Selector 0x12: KS X 1001 decoded as EUC-KR.
    #[test]
    fn decode_selector_0x12_ksx1001_euc_kr() {
        // EUC-KR 0xB0A1 = '가' (HANGUL SYLLABLE GA).
        assert_eq!(decode_dvb_string(&[0x12, 0xB0, 0xA1]), "가");
    }

    /// Selector 0x13: GB-2312 decoded via GBK.
    #[test]
    fn decode_selector_0x13_gb2312() {
        // GB-2312/GBK 0xC4E3 = '你'.
        assert_eq!(decode_dvb_string(&[0x13, 0xC4, 0xE3]), "你");
    }

    /// Selector 0x14: Big5.
    #[test]
    fn decode_selector_0x14_big5() {
        // Big5 0xA4A4 = '中'.
        assert_eq!(decode_dvb_string(&[0x14, 0xA4, 0xA4]), "中");
    }

    /// A multi-byte trail byte in 0x80–0x9F must survive: the C1 control
    /// filter operates on decoded code points, never on raw trail bytes.
    /// GBK 0x8180 = '亐' (U+4E90, trail byte in the C1 range).
    #[test]
    fn decode_selector_0x13_gbk_trail_byte_in_c1_range() {
        assert_eq!(decode_dvb_string(&[0x13, 0x81, 0x80]), "亐");
    }

    /// Annex A.1 two-byte control codes live at U+E080–U+E09F in the PUA
    /// (Table A.2): U+E08A is CR/LF → space; the reserved rest is stripped.
    /// GBK 0xABCD decodes to U+E08A; GBK 0xABC3 decodes to U+E080.
    #[test]
    fn two_byte_control_codes_filtered() {
        assert_eq!(decode_dvb_string(&[0x13, 0xAB, 0xCD]), " ");
        assert_eq!(decode_dvb_string(&[0x13, 0xAB, 0xC3]), "");
    }

    /// 0x1F consumes its 8-bit encoding_type_id; the body is undecodable
    /// (no registered broadcast ids) and yields U+FFFD per byte.
    #[test]
    fn decode_selector_0x1f_encoding_type_id() {
        let s = decode_dvb_string(&[0x1F, 0x01, 0x41, 0x42]);
        assert_eq!(s.chars().count(), 2);
        assert!(s.chars().all(|c| c == '\u{FFFD}'));
    }

    /// Table A.3 marks single-byte selector 0x08 reserved (no ISO 8859-12).
    #[test]
    fn reserved_selector_0x08_is_unsupported() {
        let s = decode_dvb_string(&[0x08, 0x41, 0x42]);
        assert!(s.chars().all(|c| c == '\u{FFFD}'));
        assert_eq!(s.chars().count(), 2);
    }

    /// A reserved selector consumes one byte; the rest is undecodable.
    #[test]
    fn unknown_selector_returns_replacement_characters() {
        // Selector 0x16 is reserved for future use — each byte becomes U+FFFD.
        let s = decode_dvb_string(&[0x16, 0xAA, 0xBB, 0xCC]);
        assert_eq!(s.chars().count(), 3);
        assert!(s.chars().all(|c| c == '\u{FFFD}'));
    }

    /// Pins the GR-area single-byte mappings to ETSI EN 300 468 V1.19.1
    /// (2025-02) Figure A.1 — "Character code table 00 - Latin alphabet with
    /// Unicode equivalents" (PDF p. 159; vendored at
    /// `specs/etsi_en_300_468_v01.19.01_dvb_si.pdf`).
    #[test]
    fn figure_a1_gr_area_single_byte_mappings() {
        let pins: &[(u8, char)] = &[
            (0xA0, '\u{00A0}'), // NBSP
            (0xA1, '¡'),
            (0xA2, '¢'),
            (0xA3, '£'),
            (0xA4, '\u{20AC}'), // € — DVB addition (note under Figure A.1)
            (0xA5, '¥'),
            (0xA7, '§'),
            (0xA8, '\u{00A4}'), // ¤ general currency sign
            (0xA9, '\u{2018}'), // ‘
            (0xAA, '\u{201C}'), // “
            (0xAB, '«'),
            (0xAC, '\u{2190}'), // ←
            (0xAD, '\u{2191}'), // ↑
            (0xAE, '\u{2192}'), // →
            (0xAF, '\u{2193}'), // ↓
            (0xB0, '°'),
            (0xB1, '±'),
            (0xB2, '²'),
            (0xB3, '³'),
            (0xB4, '\u{00D7}'), // ×
            (0xB5, 'µ'),
            (0xB6, '¶'),
            (0xB7, '·'),
            (0xB8, '\u{00F7}'), // ÷
            (0xB9, '\u{2019}'), // ’
            (0xBA, '\u{201D}'), // ”
            (0xBB, '»'),
            (0xBC, '¼'),
            (0xBD, '½'),
            (0xBE, '¾'),
            (0xBF, '¿'),
            (0xD0, '\u{2015}'), // ―
            (0xD1, '¹'),
            (0xD2, '®'),
            (0xD3, '©'),
            (0xD4, '\u{2122}'), // ™
            (0xD5, '\u{266A}'), // ♪
            (0xD6, '¬'),
            (0xD7, '\u{00A6}'), // ¦
            (0xDC, '\u{215B}'), // ⅛
            (0xDD, '\u{215C}'), // ⅜
            (0xDE, '\u{215D}'), // ⅝
            (0xDF, '\u{215E}'), // ⅞
            (0xE0, '\u{2126}'), // Ω OHM SIGN
            (0xE1, 'Æ'),
            (0xE2, '\u{0110}'), // Đ
            (0xE3, 'ª'),
            (0xE4, '\u{0126}'), // Ħ
            (0xE6, '\u{0132}'), // Ĳ
            (0xE7, '\u{013F}'), // Ŀ
            (0xE8, '\u{0141}'), // Ł
            (0xE9, 'Ø'),
            (0xEA, '\u{0152}'), // Œ
            (0xEB, 'º'),
            (0xEC, 'Þ'),
            (0xED, '\u{0166}'), // Ŧ
            (0xEE, '\u{014A}'), // Ŋ
            (0xEF, '\u{0149}'), // ŉ
            (0xF0, '\u{0138}'), // ĸ
            (0xF1, 'æ'),
            (0xF2, '\u{0111}'), // đ
            (0xF3, 'ð'),
            (0xF4, '\u{0127}'), // ħ
            (0xF5, '\u{0131}'), // ı
            (0xF6, '\u{0133}'), // ĳ
            (0xF7, '\u{0140}'), // ŀ
            (0xF8, '\u{0142}'), // ł
            (0xF9, 'ø'),
            (0xFA, '\u{0153}'), // œ
            (0xFB, 'ß'),
            (0xFC, '\u{00FE}'), // þ
            (0xFD, '\u{0167}'), // ŧ
            (0xFE, '\u{014B}'), // ŋ
            (0xFF, '\u{00AD}'), // SHY soft hyphen
        ];
        // Decode each byte on its own behind the 0x00 (default table) selector.
        for &(byte, want) in pins {
            let got = decode_dvb_string(&[0x00, byte]);
            assert_eq!(
                got,
                want.to_string(),
                "byte {byte:#04x}: want {want:?} (U+{:04X}), got {got:?}",
                want as u32
            );
        }
    }

    /// Bytes undefined (grey) in Figure A.1 decode to U+FFFD.
    #[test]
    fn figure_a1_undefined_positions_are_replacement() {
        for byte in [0xA6u8, 0xD8, 0xD9, 0xDA, 0xDB, 0xE5] {
            let got = decode_dvb_string(&[0x00, byte]);
            assert_eq!(got, "\u{FFFD}", "byte {byte:#04x} should be U+FFFD");
        }
    }

    /// C-row prefixes with precomposed entries (Figure A.1 non-spacing row).
    #[test]
    fn figure_a1_combining_precomposed() {
        assert_eq!(decode_dvb_string(&[0x00, 0xCA, b'a']), "å"); // ring U+030A
        assert_eq!(decode_dvb_string(&[0x00, 0xCA, b'A']), "Å");
        assert_eq!(decode_dvb_string(&[0x00, 0xCF, b's']), "š"); // caron U+030C
        assert_eq!(decode_dvb_string(&[0x00, 0xCF, b'Z']), "Ž");
        assert_eq!(decode_dvb_string(&[0x00, 0xCE, b'e']), "ę"); // ogonek U+0328
        assert_eq!(decode_dvb_string(&[0x00, 0xCD, b'o']), "ő"); // double acute U+030B
        assert_eq!(decode_dvb_string(&[0x00, 0xC7, b'z']), "ż"); // dot above U+0307
        assert_eq!(decode_dvb_string(&[0x00, 0xC5, b'a']), "ā"); // macron U+0304
        assert_eq!(decode_dvb_string(&[0x00, 0xC6, b'g']), "ğ"); // breve U+0306
    }

    /// A defined prefix with no precomposed form falls back to
    /// base + Unicode combining mark (canonically equivalent).
    #[test]
    fn figure_a1_combining_fallback_emits_base_plus_mark() {
        assert_eq!(decode_dvb_string(&[0x00, 0xC5, b'x']), "x\u{0304}");
    }

    /// Undefined C-row prefixes (0xC0, 0xC9, 0xCC) and a dangling prefix at
    /// end of input decode to U+FFFD.
    #[test]
    fn figure_a1_combining_undefined_or_dangling_prefix() {
        assert_eq!(decode_dvb_string(&[0x00, 0xC0, b'a']), "\u{FFFD}a");
        assert_eq!(decode_dvb_string(&[0x00, 0xC9, b'a']), "\u{FFFD}a");
        assert_eq!(decode_dvb_string(&[0x00, 0xCC, b'a']), "\u{FFFD}a");
        assert_eq!(decode_dvb_string(&[0x00, 0xC2]), "\u{FFFD}");
    }
}
dvb_si/text/mod.rs

dvb_si/text/
mod.rs