dvb_si/text/
mod.rs

1//! DVB-SI text decoding — ETSI EN 300 468 Annex A.
2//!
3//! Covers the common charsets: the default Latin table (Figure A.1, an ISO 6937
4//! superset — see `iso_6937_single`), ISO 8859-n, UTF-8 (selector 0x15) and
5//! UCS-2 BE (selector 0x11). Remaining Annex A coverage (Cyrillic/Arabic/Greek/
6//! Hebrew figures via selectors already routed to ISO 8859-n, CJK selectors
7//! 0x12–0x14, emphasis pairs) is future work.
8//!
9//! Glyph mappings are pinned to EN 300 468 V1.19.1 (2025-02) Figure A.1
10//! "Character code table 00 - Latin alphabet with Unicode equivalents"
11//! (PDF p. 159, vendored at `specs/etsi_en_300_468_v01.19.01_dvb_si.pdf`;
12//! transcription in `dvb-si/docs/en_300_468.md`).
13
14use std::borrow::Cow;
15
16/// Decode a DVB text payload (e.g. short_event_descriptor event_name_char)
17/// into an owned UTF-8 `String`. The first byte may be a charset indicator
18/// per ETSI EN 300 468 Annex A Table A.3.
19#[must_use]
20pub fn decode_dvb_string(bytes: &[u8]) -> String {
21    if bytes.is_empty() {
22        return String::new();
23    }
24
25    let (charset, body) = split_charset(bytes);
26    let decoded = match charset {
27        Charset::Iso6937 => decode_iso_6937(body),
28        Charset::Iso8859(n) => decode_iso_8859(n, body),
29        Charset::Utf8 => String::from_utf8_lossy(body).into_owned(),
30        Charset::Ucs2Be => decode_ucs2_be(body),
31        Charset::Unsupported(_indicator) => body.iter().map(|_| '\u{FFFD}').collect(),
32    };
33
34    // Annex A.2 control codes:
35    //   0x86 emphasis on, 0x87 emphasis off, 0x8A CR/LF -> space.
36    //   Other C0/C1 controls are stripped.
37    decoded
38        .chars()
39        .filter_map(|c| match c as u32 {
40            0x86 | 0x87 => None,
41            0x8A => Some(' '),
42            0x0A => Some(' '),
43            code if code < 0x20 => None,
44            code if (0x80..0xA0).contains(&code) => None,
45            _ => Some(c),
46        })
47        .collect()
48}
49
50/// Convenience wrapper returning `Cow::Borrowed` for pure-ASCII input,
51/// `Cow::Owned` otherwise.
52#[must_use]
53pub fn decode(bytes: &[u8]) -> Cow<'_, str> {
54    if bytes.iter().all(|&b| b.is_ascii() && b >= 0x20) {
55        return Cow::Borrowed(std::str::from_utf8(bytes).unwrap_or(""));
56    }
57    Cow::Owned(decode_dvb_string(bytes))
58}
59
/// Character table selected by the leading byte(s) of a DVB text payload.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Charset {
    /// Default Latin table (Figure A.1, an ISO 6937 superset).
    Iso6937,
    /// ISO 8859 part `n`; the payload is the part number.
    Iso8859(u8),
    /// UTF-8 (selector 0x15).
    Utf8,
    /// UCS-2 big-endian (selector 0x11).
    Ucs2Be,
    /// Reserved or not-yet-implemented selector; the payload is the first
    /// selector byte.
    Unsupported(u8),
}
68
69fn split_charset(bytes: &[u8]) -> (Charset, &[u8]) {
70    match bytes[0] {
71        b if b >= 0x20 => (Charset::Iso6937, bytes),
72        0x00 => (Charset::Iso6937, &bytes[1..]),
73        // Table A.3: 0x01..=0x0B map to ISO 8859-5..-15, EXCEPT 0x08 which is
74        // "reserved for future use" (there is no ISO 8859-12).
75        0x08 => (Charset::Unsupported(0x08), &bytes[1..]),
76        0x01..=0x0B => (Charset::Iso8859(bytes[0] + 4), &bytes[1..]),
77        0x10 if bytes.len() >= 3 && bytes[1] == 0x00 => {
78            (Charset::Iso8859(bytes[2]), &bytes[3..])
79        }
80        0x11 => (Charset::Ucs2Be, &bytes[1..]),
81        0x15 => (Charset::Utf8, &bytes[1..]),
82        other => (Charset::Unsupported(other), &bytes[1..]),
83    }
84}
85
86fn decode_iso_6937(bytes: &[u8]) -> String {
87    let mut out = String::with_capacity(bytes.len());
88    let mut i = 0;
89    while i < bytes.len() {
90        let b = bytes[i];
91        // 0xC0..=0xCF is the Figure A.1 non-spacing (combining-prefix) row.
92        if (0xC0..=0xCF).contains(&b) {
93            match combining_mark(b) {
94                Some(mark) if i + 1 < bytes.len() => {
95                    let base = bytes[i + 1];
96                    if let Some(c) = combine(b, base) {
97                        out.push(c);
98                    } else {
99                        // No precomposed form — emit base + Unicode combining
100                        // mark, which is canonically equivalent.
101                        out.push(iso_6937_single(base));
102                        out.push(mark);
103                    }
104                    i += 2;
105                }
106                // Undefined prefix (0xC0/0xC9/0xCC) or dangling prefix at end.
107                _ => {
108                    out.push('\u{FFFD}');
109                    i += 1;
110                }
111            }
112            continue;
113        }
114        out.push(iso_6937_single(b));
115        i += 1;
116    }
117    out
118}
119
/// Map one stand-alone byte of the default Latin table to its character.
///
/// Source: ETSI EN 300 468 V1.19.1 (2025-02) Figure A.1 — "Character code
/// table 00 - Latin alphabet with Unicode equivalents" (PDF p. 159). The
/// table is an ISO/IEC 6937 superset with the Euro sign (U+20AC) at 0xA4.
///
/// * 0x00..=0x7F map to the same code point.
/// * 0x86, 0x87 and 0x8A are passed through as U+0086/U+0087/U+008A so the
///   caller's control-code filter can act on them; the rest of 0x80..=0x9F
///   is U+FFFD.
/// * 0xC0..=0xCF (the non-spacing prefix row) is U+FFFD here; it is reached
///   only for a prefix that could not be combined.
/// * Grey (undefined) positions of the figure are U+FFFD.
fn iso_6937_single(b: u8) -> char {
    const UNDEF: char = '\u{FFFD}';

    // Positions 0xA0..=0xBF, row-major.
    const ROWS_A_B: [char; 32] = [
        // 0xA0: NBSP ¡ ¢ £ € ¥ (undefined) §
        '\u{00A0}', '¡', '¢', '£', '\u{20AC}', '¥', UNDEF, '§',
        // 0xA8: ¤, left single/double quotation marks, «, arrows ← ↑ → ↓
        '\u{00A4}', '\u{2018}', '\u{201C}', '«',
        '\u{2190}', '\u{2191}', '\u{2192}', '\u{2193}',
        // 0xB0: ° ± ² ³ × µ ¶ ·
        '°', '±', '²', '³', '\u{00D7}', 'µ', '¶', '·',
        // 0xB8: ÷, right single/double quotation marks, » ¼ ½ ¾ ¿
        '\u{00F7}', '\u{2019}', '\u{201D}', '»', '¼', '½', '¾', '¿',
    ];

    // Positions 0xD0..=0xFF, row-major.
    const ROWS_D_F: [char; 48] = [
        // 0xD0: horizontal bar, ¹ ® © ™ ♪ ¬ ¦
        '\u{2015}', '¹', '®', '©', '\u{2122}', '\u{266A}', '¬', '\u{00A6}',
        // 0xD8: four undefined positions, then ⅛ ⅜ ⅝ ⅞
        UNDEF, UNDEF, UNDEF, UNDEF,
        '\u{215B}', '\u{215C}', '\u{215D}', '\u{215E}',
        // 0xE0: Ω Æ Đ ª Ħ (undefined) Ĳ Ŀ
        '\u{2126}', 'Æ', '\u{0110}', 'ª', '\u{0126}', UNDEF, '\u{0132}', '\u{013F}',
        // 0xE8: Ł Ø Œ º Þ Ŧ Ŋ ŉ
        '\u{0141}', 'Ø', '\u{0152}', 'º', 'Þ', '\u{0166}', '\u{014A}', '\u{0149}',
        // 0xF0: ĸ æ đ ð ħ ı ĳ ŀ
        '\u{0138}', 'æ', '\u{0111}', 'ð', '\u{0127}', '\u{0131}', '\u{0133}', '\u{0140}',
        // 0xF8: ł ø œ ß þ ŧ ŋ SHY
        '\u{0142}', 'ø', '\u{0153}', 'ß', '\u{00FE}', '\u{0167}', '\u{014B}', '\u{00AD}',
    ];

    match b {
        // ASCII, plus the three control codes the caller post-processes.
        0x00..=0x7F | 0x86 | 0x87 | 0x8A => char::from(b),
        // Remaining control positions and the non-spacing prefix row.
        0x80..=0x9F | 0xC0..=0xCF => UNDEF,
        0xA0..=0xBF => ROWS_A_B[usize::from(b - 0xA0)],
        0xD0..=0xFF => ROWS_D_F[usize::from(b - 0xD0)],
    }
}
214
/// Look up the Unicode combining mark for a Figure A.1 non-spacing prefix.
///
/// Returns `None` for bytes outside 0xC0..=0xCF and for the undefined
/// positions 0xC0, 0xC9 and 0xCC of that row.
fn combining_mark(prefix: u8) -> Option<char> {
    // Indexed by `prefix - 0xC0`.
    const MARKS: [Option<char>; 16] = [
        None,             // 0xC0 undefined
        Some('\u{0300}'), // 0xC1 grave
        Some('\u{0301}'), // 0xC2 acute
        Some('\u{0302}'), // 0xC3 circumflex
        Some('\u{0303}'), // 0xC4 tilde
        Some('\u{0304}'), // 0xC5 macron
        Some('\u{0306}'), // 0xC6 breve
        Some('\u{0307}'), // 0xC7 dot above
        Some('\u{0308}'), // 0xC8 diaeresis
        None,             // 0xC9 undefined
        Some('\u{030A}'), // 0xCA ring above
        Some('\u{0327}'), // 0xCB cedilla
        None,             // 0xCC undefined
        Some('\u{030B}'), // 0xCD double acute
        Some('\u{0328}'), // 0xCE ogonek
        Some('\u{030C}'), // 0xCF caron
    ];

    prefix
        .checked_sub(0xC0)
        .and_then(|offset| MARKS.get(usize::from(offset)).copied().flatten())
}
235
/// Precomposed Unicode character for a Figure A.1 non-spacing `prefix` byte
/// followed by the ASCII letter `base`.
///
/// Returns `None` when the table has no entry for the pair, including every
/// undefined prefix and every non-letter base.
fn combine(prefix: u8, base: u8) -> Option<char> {
    Some(match (prefix, base) {
        // grave
        (0xC1, b'A') => 'À', (0xC1, b'E') => 'È', (0xC1, b'I') => 'Ì',
        (0xC1, b'O') => 'Ò', (0xC1, b'U') => 'Ù',
        (0xC1, b'a') => 'à', (0xC1, b'e') => 'è', (0xC1, b'i') => 'ì',
        (0xC1, b'o') => 'ò', (0xC1, b'u') => 'ù',
        // acute
        (0xC2, b'A') => 'Á', (0xC2, b'E') => 'É', (0xC2, b'I') => 'Í',
        (0xC2, b'O') => 'Ó', (0xC2, b'U') => 'Ú', (0xC2, b'Y') => 'Ý',
        (0xC2, b'a') => 'á', (0xC2, b'e') => 'é', (0xC2, b'i') => 'í',
        (0xC2, b'o') => 'ó', (0xC2, b'u') => 'ú', (0xC2, b'y') => 'ý',
        (0xC2, b'C') => 'Ć', (0xC2, b'c') => 'ć', (0xC2, b'L') => 'Ĺ',
        (0xC2, b'l') => 'ĺ', (0xC2, b'N') => 'Ń', (0xC2, b'n') => 'ń',
        (0xC2, b'R') => 'Ŕ', (0xC2, b'r') => 'ŕ', (0xC2, b'S') => 'Ś',
        (0xC2, b's') => 'ś', (0xC2, b'Z') => 'Ź', (0xC2, b'z') => 'ź',
        (0xC2, b'g') => '\u{01F5}', // ǵ
        // circumflex
        (0xC3, b'A') => 'Â', (0xC3, b'E') => 'Ê', (0xC3, b'I') => 'Î',
        (0xC3, b'O') => 'Ô', (0xC3, b'U') => 'Û',
        (0xC3, b'a') => 'â', (0xC3, b'e') => 'ê', (0xC3, b'i') => 'î',
        (0xC3, b'o') => 'ô', (0xC3, b'u') => 'û',
        (0xC3, b'C') => '\u{0108}', (0xC3, b'c') => '\u{0109}', // Ĉ ĉ
        (0xC3, b'G') => '\u{011C}', (0xC3, b'g') => '\u{011D}', // Ĝ ĝ
        (0xC3, b'H') => '\u{0124}', (0xC3, b'h') => '\u{0125}', // Ĥ ĥ
        (0xC3, b'J') => '\u{0134}', (0xC3, b'j') => '\u{0135}', // Ĵ ĵ
        (0xC3, b'S') => '\u{015C}', (0xC3, b's') => '\u{015D}', // Ŝ ŝ
        (0xC3, b'W') => '\u{0174}', (0xC3, b'w') => '\u{0175}', // Ŵ ŵ
        (0xC3, b'Y') => '\u{0176}', (0xC3, b'y') => '\u{0177}', // Ŷ ŷ
        // tilde
        (0xC4, b'A') => 'Ã', (0xC4, b'N') => 'Ñ', (0xC4, b'O') => 'Õ',
        (0xC4, b'a') => 'ã', (0xC4, b'n') => 'ñ', (0xC4, b'o') => 'õ',
        (0xC4, b'I') => 'Ĩ', (0xC4, b'i') => 'ĩ', (0xC4, b'U') => 'Ũ',
        (0xC4, b'u') => 'ũ',
        // macron
        (0xC5, b'A') => 'Ā', (0xC5, b'a') => 'ā', (0xC5, b'E') => 'Ē',
        (0xC5, b'e') => 'ē', (0xC5, b'I') => 'Ī', (0xC5, b'i') => 'ī',
        (0xC5, b'O') => 'Ō', (0xC5, b'o') => 'ō', (0xC5, b'U') => 'Ū',
        (0xC5, b'u') => 'ū',
        // breve
        (0xC6, b'A') => 'Ă', (0xC6, b'a') => 'ă', (0xC6, b'G') => 'Ğ',
        (0xC6, b'g') => 'ğ', (0xC6, b'U') => 'Ŭ', (0xC6, b'u') => 'ŭ',
        // dot above
        (0xC7, b'C') => 'Ċ', (0xC7, b'c') => 'ċ', (0xC7, b'E') => 'Ė',
        (0xC7, b'e') => 'ė', (0xC7, b'G') => 'Ġ', (0xC7, b'g') => 'ġ',
        (0xC7, b'I') => 'İ', (0xC7, b'Z') => 'Ż', (0xC7, b'z') => 'ż',
        // diaeresis
        (0xC8, b'A') => 'Ä', (0xC8, b'E') => 'Ë', (0xC8, b'I') => 'Ï',
        (0xC8, b'O') => 'Ö', (0xC8, b'U') => 'Ü', (0xC8, b'Y') => 'Ÿ',
        (0xC8, b'a') => 'ä', (0xC8, b'e') => 'ë', (0xC8, b'i') => 'ï',
        (0xC8, b'o') => 'ö', (0xC8, b'u') => 'ü', (0xC8, b'y') => 'ÿ',
        // ring above
        (0xCA, b'A') => 'Å', (0xCA, b'a') => 'å', (0xCA, b'U') => 'Ů',
        (0xCA, b'u') => 'ů',
        // cedilla
        (0xCB, b'C') => 'Ç', (0xCB, b'c') => 'ç', (0xCB, b'G') => 'Ģ',
        (0xCB, b'g') => 'ģ', (0xCB, b'K') => 'Ķ', (0xCB, b'k') => 'ķ',
        (0xCB, b'L') => 'Ļ', (0xCB, b'l') => 'ļ', (0xCB, b'N') => 'Ņ',
        (0xCB, b'n') => 'ņ', (0xCB, b'R') => 'Ŗ', (0xCB, b'r') => 'ŗ',
        (0xCB, b'S') => 'Ş', (0xCB, b's') => 'ş', (0xCB, b'T') => 'Ţ',
        (0xCB, b't') => 'ţ',
        // double acute
        (0xCD, b'O') => 'Ő', (0xCD, b'o') => 'ő', (0xCD, b'U') => 'Ű',
        (0xCD, b'u') => 'ű',
        // ogonek
        (0xCE, b'A') => 'Ą', (0xCE, b'a') => 'ą', (0xCE, b'E') => 'Ę',
        (0xCE, b'e') => 'ę', (0xCE, b'I') => 'Į', (0xCE, b'i') => 'į',
        (0xCE, b'U') => 'Ų', (0xCE, b'u') => 'ų',
        // caron
        (0xCF, b'C') => 'Č', (0xCF, b'c') => 'č', (0xCF, b'D') => 'Ď',
        (0xCF, b'd') => 'ď', (0xCF, b'E') => 'Ě', (0xCF, b'e') => 'ě',
        (0xCF, b'L') => 'Ľ', (0xCF, b'l') => 'ľ', (0xCF, b'N') => 'Ň',
        (0xCF, b'n') => 'ň', (0xCF, b'R') => 'Ř', (0xCF, b'r') => 'ř',
        (0xCF, b'S') => 'Š', (0xCF, b's') => 'š', (0xCF, b'T') => 'Ť',
        (0xCF, b't') => 'ť', (0xCF, b'Z') => 'Ž', (0xCF, b'z') => 'ž',
        _ => return None,
    })
}
300
301fn decode_iso_8859(n: u8, bytes: &[u8]) -> String {
302    use encoding_rs::*;
303    let encoding: &'static Encoding = match n {
304        2 => ISO_8859_2,
305        3 => ISO_8859_3,
306        4 => ISO_8859_4,
307        5 => ISO_8859_5,
308        6 => ISO_8859_6,
309        7 => ISO_8859_7,
310        8 => ISO_8859_8,
311        9 => WINDOWS_1254,
312        10 => ISO_8859_10,
313        11 => WINDOWS_874,
314        13 => ISO_8859_13,
315        14 => ISO_8859_14,
316        15 => ISO_8859_15,
317        _ => return bytes.iter().map(|&b| b as char).collect(),
318    };
319    let (cow, _, _) = encoding.decode(bytes);
320    cow.into_owned()
321}
322
/// Decode big-endian UCS-2 text (selector 0x11).
///
/// Bytes are read as big-endian 16-bit units. A unit that is an unpaired
/// surrogate decodes to U+FFFD (a valid surrogate pair is accepted and
/// combined). A trailing odd byte cannot form a unit; it is reported as one
/// U+FFFD rather than being dropped silently.
fn decode_ucs2_be(bytes: &[u8]) -> String {
    let pairs = bytes.chunks_exact(2);
    let has_dangling_byte = !pairs.remainder().is_empty();

    let units = pairs.map(|pair| u16::from_be_bytes([pair[0], pair[1]]));
    let mut text: String = std::char::decode_utf16(units)
        .map(|unit| unit.unwrap_or('\u{FFFD}'))
        .collect();

    if has_dangling_byte {
        text.push('\u{FFFD}');
    }
    text
}
330
/// Unit tests pinning selector handling, control-code filtering and the
/// Figure A.1 glyph table.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn decode_empty_input_returns_empty_string() {
        assert_eq!(decode_dvb_string(&[]), "");
    }

    #[test]
    fn decode_plain_ascii_is_borrowed() {
        let cow = decode(b"HELLO");
        assert!(matches!(cow, Cow::Borrowed(_)));
        assert_eq!(cow, "HELLO");
    }

    #[test]
    fn decode_iso6937_latin_accent_chars() {
        assert_eq!(decode_dvb_string(&[0x00, 0xC2, b'A']), "Á");
        assert_eq!(decode_dvb_string(&[0x00, 0xC1, b'e']), "è");
        assert_eq!(decode_dvb_string(&[0x00, 0xC8, b'o']), "ö");
    }

    #[test]
    fn decode_selector_0x01_yields_iso8859_5_cyrillic() {
        // ISO 8859-5: 0xB0 = U+0410 (А), 0xB1 = U+0411 (Б). Pin the exact
        // text so a wrong table cannot pass by merely avoiding U+FFFD.
        let s = decode_dvb_string(&[0x01, 0xB0, 0xB1]);
        assert_eq!(s, "\u{0410}\u{0411}", "got: {s:?}");
    }

    #[test]
    fn decode_selector_0x10_extended_yields_iso8859_nn() {
        let s = decode_dvb_string(&[0x10, 0x00, 0x09, b'A', b'B']);
        assert_eq!(s, "AB");
        // ASCII alone cannot tell the tables apart: 0xFD is U+0131 (dotless
        // i) in ISO 8859-9 but U+00FD in ISO 8859-1.
        let s = decode_dvb_string(&[0x10, 0x00, 0x09, b'A', 0xFD]);
        assert_eq!(s, "A\u{0131}", "got: {s:?}");
    }

    #[test]
    fn decode_selector_0x11_ucs2_be() {
        let s = decode_dvb_string(&[0x11, 0x00, 0x41, 0x00, 0x42]);
        assert_eq!(s, "AB");
    }

    #[test]
    fn decode_selector_0x15_utf8_passthrough() {
        let s = decode_dvb_string(&[0x15, 0xC3, 0xA9, 0xC3, 0xA9]);
        assert_eq!(s, "éé");
    }

    #[test]
    fn decode_control_chars_stripped_linefeed_becomes_space() {
        let s = decode_dvb_string(b"A\x01B\nC");
        assert_eq!(s, "AB C");
    }

    #[test]
    fn emphasis_on_off_markers_stripped_per_annex_a2() {
        // 0x86 and 0x87 are emphasis on/off markers per ETSI Annex A.2 — not
        // representable in plain text, strip silently.
        let s = decode_dvb_string(&[0x00, b'A', 0x86, b'B', 0x87, b'C']);
        assert_eq!(s, "ABC");
    }

    #[test]
    fn decode_annex_a2_crlf_0x8a_becomes_space() {
        // 0x8A in DVB text maps to CR/LF per Annex A.2 — render as space.
        let s = decode_dvb_string(&[0x00, b'A', 0x8A, b'B']);
        assert_eq!(s, "A B");
    }

    /// Table A.3 marks single-byte selector 0x08 reserved (no ISO 8859-12).
    #[test]
    fn reserved_selector_0x08_is_unsupported() {
        let s = decode_dvb_string(&[0x08, 0x41, 0x42]);
        assert!(s.chars().all(|c| c == '\u{FFFD}'));
        assert_eq!(s.chars().count(), 2);
    }

    #[test]
    fn unknown_selector_returns_replacement_characters() {
        // Selector 0x1F is reserved/unsupported — each byte becomes U+FFFD.
        let s = decode_dvb_string(&[0x1F, 0xAA, 0xBB, 0xCC]);
        assert_eq!(s.chars().count(), 3);
        assert!(s.chars().all(|c| c == '\u{FFFD}'));
    }

    /// Pins the GR-area single-byte mappings to ETSI EN 300 468 V1.19.1
    /// (2025-02) Figure A.1 — "Character code table 00 - Latin alphabet with
    /// Unicode equivalents" (PDF p. 159; vendored at
    /// `specs/etsi_en_300_468_v01.19.01_dvb_si.pdf`).
    #[test]
    fn figure_a1_gr_area_single_byte_mappings() {
        let pins: &[(u8, char)] = &[
            (0xA0, '\u{00A0}'), // NBSP
            (0xA1, '¡'),
            (0xA2, '¢'),
            (0xA3, '£'),
            (0xA4, '\u{20AC}'), // € — DVB addition (note under Figure A.1)
            (0xA5, '¥'),
            (0xA7, '§'),
            (0xA8, '\u{00A4}'), // ¤ general currency sign
            (0xA9, '\u{2018}'), // left single quotation mark
            (0xAA, '\u{201C}'), // left double quotation mark
            (0xAB, '«'),
            (0xAC, '\u{2190}'), // ←
            (0xAD, '\u{2191}'), // ↑
            (0xAE, '\u{2192}'), // →
            (0xAF, '\u{2193}'), // ↓
            (0xB0, '°'),
            (0xB1, '±'),
            (0xB2, '²'),
            (0xB3, '³'),
            (0xB4, '\u{00D7}'), // ×
            (0xB5, 'µ'),
            (0xB6, '¶'),
            (0xB7, '·'),
            (0xB8, '\u{00F7}'), // ÷
            (0xB9, '\u{2019}'), // right single quotation mark
            (0xBA, '\u{201D}'), // right double quotation mark
            (0xBB, '»'),
            (0xBC, '¼'),
            (0xBD, '½'),
            (0xBE, '¾'),
            (0xBF, '¿'),
            (0xD0, '\u{2015}'), // ―
            (0xD1, '¹'),
            (0xD2, '®'),
            (0xD3, '©'),
            (0xD4, '\u{2122}'), // ™
            (0xD5, '\u{266A}'), // ♪
            (0xD6, '¬'),
            (0xD7, '\u{00A6}'), // ¦
            (0xDC, '\u{215B}'), // ⅛
            (0xDD, '\u{215C}'), // ⅜
            (0xDE, '\u{215D}'), // ⅝
            (0xDF, '\u{215E}'), // ⅞
            (0xE0, '\u{2126}'), // Ω OHM SIGN
            (0xE1, 'Æ'),
            (0xE2, '\u{0110}'), // Đ
            (0xE3, 'ª'),
            (0xE4, '\u{0126}'), // Ħ
            (0xE6, '\u{0132}'), // Ĳ
            (0xE7, '\u{013F}'), // Ŀ
            (0xE8, '\u{0141}'), // Ł
            (0xE9, 'Ø'),
            (0xEA, '\u{0152}'), // Œ
            (0xEB, 'º'),
            (0xEC, 'Þ'),
            (0xED, '\u{0166}'), // Ŧ
            (0xEE, '\u{014A}'), // Ŋ
            (0xEF, '\u{0149}'), // ŉ
            (0xF0, '\u{0138}'), // ĸ
            (0xF1, 'æ'),
            (0xF2, '\u{0111}'), // đ
            (0xF3, 'ð'),
            (0xF4, '\u{0127}'), // ħ
            (0xF5, '\u{0131}'), // ı
            (0xF6, '\u{0133}'), // ĳ
            (0xF7, '\u{0140}'), // ŀ
            (0xF8, '\u{0142}'), // ł
            (0xF9, 'ø'),
            (0xFA, '\u{0153}'), // œ
            (0xFB, 'ß'),
            (0xFC, '\u{00FE}'), // þ
            (0xFD, '\u{0167}'), // ŧ
            (0xFE, '\u{014B}'), // ŋ
            (0xFF, '\u{00AD}'), // SHY soft hyphen
        ];
        for &(byte, want) in pins {
            let got = decode_dvb_string(&[0x00, byte]);
            assert_eq!(
                got,
                want.to_string(),
                "byte {byte:#04x}: want {want:?} (U+{:04X}), got {got:?}",
                want as u32
            );
        }
    }

    /// Bytes undefined (grey) in Figure A.1 decode to U+FFFD.
    #[test]
    fn figure_a1_undefined_positions_are_replacement() {
        for byte in [0xA6u8, 0xD8, 0xD9, 0xDA, 0xDB, 0xE5] {
            let got = decode_dvb_string(&[0x00, byte]);
            assert_eq!(got, "\u{FFFD}", "byte {byte:#04x} should be U+FFFD");
        }
    }

    /// C-row prefixes with precomposed entries (Figure A.1 non-spacing row).
    #[test]
    fn figure_a1_combining_precomposed() {
        assert_eq!(decode_dvb_string(&[0x00, 0xCA, b'a']), "å"); // ring U+030A
        assert_eq!(decode_dvb_string(&[0x00, 0xCA, b'A']), "Å");
        assert_eq!(decode_dvb_string(&[0x00, 0xCF, b's']), "š"); // caron U+030C
        assert_eq!(decode_dvb_string(&[0x00, 0xCF, b'Z']), "Ž");
        assert_eq!(decode_dvb_string(&[0x00, 0xCE, b'e']), "ę"); // ogonek U+0328
        assert_eq!(decode_dvb_string(&[0x00, 0xCD, b'o']), "ő"); // double acute U+030B
        assert_eq!(decode_dvb_string(&[0x00, 0xC7, b'z']), "ż"); // dot above U+0307
        assert_eq!(decode_dvb_string(&[0x00, 0xC5, b'a']), "ā"); // macron U+0304
        assert_eq!(decode_dvb_string(&[0x00, 0xC6, b'g']), "ğ"); // breve U+0306
    }

    /// A defined prefix with no precomposed form falls back to
    /// base + Unicode combining mark (canonically equivalent).
    #[test]
    fn figure_a1_combining_fallback_emits_base_plus_mark() {
        assert_eq!(decode_dvb_string(&[0x00, 0xC5, b'x']), "x\u{0304}");
    }

    /// Undefined C-row prefixes (0xC0, 0xC9, 0xCC) and a dangling prefix at
    /// end of input decode to U+FFFD.
    #[test]
    fn figure_a1_combining_undefined_or_dangling_prefix() {
        assert_eq!(decode_dvb_string(&[0x00, 0xC0, b'a']), "\u{FFFD}a");
        assert_eq!(decode_dvb_string(&[0x00, 0xC9, b'a']), "\u{FFFD}a");
        assert_eq!(decode_dvb_string(&[0x00, 0xCC, b'a']), "\u{FFFD}a");
        assert_eq!(decode_dvb_string(&[0x00, 0xC2]), "\u{FFFD}");
    }
}
dvb_si/text/mod.rs

dvb_si/text/
mod.rs