dvb_si/text/
mod.rs

1//! DVB-SI text decoding — ETSI EN 300 468 Annex A.
2//!
3//! Covers the full Annex A Table A.3 selector set: the default Latin table
4//! (Figure A.1, an ISO 6937 superset — see `iso_6937_single`), ISO 8859-n
5//! (single-byte 0x01–0x0B and extended 0x10 forms), UCS-2 BE (0x11),
6//! KS X 1001 Korean (0x12, decoded as EUC-KR), GB-2312 Simplified Chinese
7//! (0x13, decoded via GBK which is a GB-2312 superset), Big5 Traditional
8//! Chinese (0x14), UTF-8 (0x15), and the 0x1F `encoding_type_id` escape
9//! (no ids are registered for broadcast use — yields U+FFFD). Reserved
10//! selectors (0x08, 0x0C–0x0F, 0x16–0x1E) yield U+FFFD per byte.
11//!
12//! Glyph mappings are pinned to EN 300 468 V1.19.1 (2025-02) Figure A.1
13//! "Character code table 00 - Latin alphabet with Unicode equivalents"
14//! (PDF p. 159, vendored at `specs/etsi_en_300_468_v01.19.01_dvb_si.pdf`;
15//! transcription in `dvb-si/docs/en_300_468.md`).
16//!
17//! [`DvbText`] wraps the raw wire bytes and decodes only on demand — parsing
18//! stays zero-copy; decoding happens when you call [`DvbText::decode`], `Display`,
19//! or serde:
20//!
21//! ```
22//! use dvb_si::text::{DvbText, LangCode};
23//!
24//! // Leading 0x15 is the Annex A UTF-8 selector; "café" follows.
25//! let name = DvbText::new(&[0x15, b'c', b'a', b'f', 0xC3, 0xA9]);
26//! assert_eq!(name.decode(), "café");
27//! assert_eq!(name.raw(), &[0x15, b'c', b'a', b'f', 0xC3, 0xA9]); // selector kept
28//!
29//! // A selector-less default-Latin (ISO 6937) sequence: combining acute + e → é.
30//! assert_eq!(DvbText::new(&[0xC2, b'e']).decode(), "é");
31//!
32//! // LangCode is 3 raw bytes (ISO 639-2 / ISO 3166) decoded lossily on demand.
33//! assert_eq!(LangCode(*b"fre").as_str(), "fre");
34//! ```
35
36use alloc::borrow::Cow;
37use alloc::string::String;
38use alloc::vec::Vec;
39
40/// Decode a DVB text payload (e.g. short_event_descriptor event_name_char)
41/// into an owned UTF-8 `String`. The first byte may be a charset indicator
42/// per ETSI EN 300 468 Annex A Table A.3.
43#[must_use]
44pub fn decode_dvb_string(bytes: &[u8]) -> String {
45    if bytes.is_empty() {
46        return String::new();
47    }
48
49    let (charset, body) = split_charset(bytes);
50    let decoded = match charset {
51        Charset::Iso6937 => decode_iso_6937(body),
52        Charset::Iso8859(n) => decode_iso_8859(n, body),
53        Charset::Utf8 => String::from_utf8_lossy(body).into_owned(),
54        Charset::Ucs2Be => decode_ucs2_be(body),
55        #[cfg(feature = "std")]
56        Charset::Ksx1001 => decode_with(encoding_rs::EUC_KR, body),
57        #[cfg(feature = "std")]
58        Charset::Gb2312 => decode_with(encoding_rs::GBK, body),
59        #[cfg(feature = "std")]
60        Charset::Big5 => decode_with(encoding_rs::BIG5, body),
61        // The CJK codec tables come from `encoding_rs`, which is std-only; under
62        // `no_std` these decode lossily (replacement chars), like an unsupported
63        // charset. The raw bytes remain available via `DvbText`.
64        #[cfg(not(feature = "std"))]
65        Charset::Ksx1001 | Charset::Gb2312 | Charset::Big5 => {
66            body.iter().map(|_| '\u{FFFD}').collect()
67        }
68        Charset::Unsupported(_indicator) => body.iter().map(|_| '\u{FFFD}').collect(),
69    };
70
71    // Annex A.1 control codes:
72    //   single-byte tables: 0x86 emphasis on, 0x87 emphasis off, 0x8A CR/LF
73    //   -> space; other C0/C1 controls are stripped.
74    //   two-byte tables (Table A.2): the same functions live at U+E086 /
75    //   U+E087 / U+E08A inside the ISO 10646 PUA; the rest of
76    //   U+E080..U+E09F is reserved for control functions and stripped.
77    decoded
78        .chars()
79        .filter_map(|c| match c as u32 {
80            0x86 | 0x87 | 0xE086 | 0xE087 => None,
81            0x8A | 0xE08A => Some(' '),
82            0x0A => Some(' '),
83            code if code < 0x20 => None,
84            code if (0x80..0xA0).contains(&code) => None,
85            code if (0xE080..0xE0A0).contains(&code) => None,
86            _ => Some(c),
87        })
88        .collect()
89}
90
91/// Convenience wrapper returning `Cow::Borrowed` for pure-ASCII input,
92/// `Cow::Owned` otherwise.
93#[must_use]
94pub fn decode(bytes: &[u8]) -> Cow<'_, str> {
95    if bytes.iter().all(|&b| b.is_ascii() && b >= 0x20) {
96        return Cow::Borrowed(core::str::from_utf8(bytes).unwrap_or(""));
97    }
98    Cow::Owned(decode_dvb_string(bytes))
99}
100
101/// Borrowed DVB-encoded text (EN 300 468 Annex A). Wraps the raw selector +
102/// body bytes; decoding happens only on [`DvbText::decode`] / `Display` /
103/// serde — never in the parse hot path.
104#[derive(Clone, Copy, PartialEq, Eq, Hash)]
105#[cfg_attr(feature = "yoke", derive(yoke::Yokeable))]
106pub struct DvbText<'a>(&'a [u8]);
107
108impl<'a> DvbText<'a> {
109    /// Wrap raw Annex A bytes (charset selector included, if any).
110    #[must_use]
111    pub const fn new(raw: &'a [u8]) -> Self {
112        Self(raw)
113    }
114    /// The raw wire bytes, selector included.
115    #[must_use]
116    pub const fn raw(&self) -> &'a [u8] {
117        self.0
118    }
119    /// Decode per Annex A (Table A.3 selector + control codes). Borrows only
120    /// for selector-less printable-ASCII input; any charset selector byte
121    /// forces an owned decode.
122    #[must_use]
123    pub fn decode(&self) -> Cow<'a, str> {
124        decode(self.0)
125    }
126}
127
128impl core::ops::Deref for DvbText<'_> {
129    /// Derefs to the raw wire bytes (selector included) — `len()`/indexing are
130    /// byte counts for serialization, not decoded character counts.
131    type Target = [u8];
132    fn deref(&self) -> &[u8] {
133        self.0
134    }
135}
136
137impl core::fmt::Display for DvbText<'_> {
138    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
139        f.write_str(&self.decode())
140    }
141}
142
143impl core::fmt::Debug for DvbText<'_> {
144    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
145        write!(f, "DvbText({:?})", self.decode())
146    }
147}
148
149impl<'a> From<&'a [u8]> for DvbText<'a> {
150    fn from(raw: &'a [u8]) -> Self {
151        Self(raw)
152    }
153}
154
155#[cfg(feature = "serde")]
156impl serde::Serialize for DvbText<'_> {
157    fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
158        s.serialize_str(&self.decode())
159    }
160}
161// Serialize-only: re-encoding decoded text into DVB charset bytes is lossy.
162// Structs holding DvbText derive Serialize only; re-parse from wire bytes.
163
/// ISO 639-2 language code or ISO 3166 country code, stored as the 3 raw
/// bytes it was given.
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
pub struct LangCode(pub [u8; 3]);

impl LangCode {
    /// The code as text. Bytes that are not valid UTF-8 are replaced with
    /// U+FFFD, so this never fails; valid input is borrowed.
    #[must_use]
    pub fn as_str(&self) -> Cow<'_, str> {
        String::from_utf8_lossy(&self.0)
    }
}

impl core::ops::Deref for LangCode {
    /// The three raw bytes.
    type Target = [u8; 3];

    fn deref(&self) -> &[u8; 3] {
        &self.0
    }
}

impl core::fmt::Display for LangCode {
    /// Writes the lossily decoded code. `pad` is used instead of `write_str`
    /// so that width, alignment and precision flags (`{:<5}`) are honoured;
    /// plain `{}` output is unchanged.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        f.pad(&self.as_str())
    }
}

impl core::fmt::Debug for LangCode {
    /// Renders as `LangCode(xxx)` with the lossily decoded code, unquoted.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        write!(f, "LangCode({})", self.as_str())
    }
}

#[cfg(feature = "serde")]
impl serde::Serialize for LangCode {
    /// Serializes as the lossily decoded string.
    fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
        s.serialize_str(&self.as_str())
    }
}
201
/// Character table selected by the leading byte(s) of a DVB text field
/// (EN 300 468 Annex A Table A.3).
#[derive(Debug)]
enum Charset {
    /// Default Latin table (no selector, or selector 0x00).
    Iso6937,
    /// ISO/IEC 8859 part `n`; the part number is not validated here.
    Iso8859(u8),
    /// UTF-8 (selector 0x15).
    Utf8,
    /// Big-endian 16-bit code units (selector 0x11).
    Ucs2Be,
    /// KS X 1001 (selector 0x12), decoded as EUC-KR.
    Ksx1001,
    /// GB-2312 (selector 0x13), decoded via GBK (a GB-2312 superset).
    Gb2312,
    /// Big5 (selector 0x14).
    Big5,
    /// Reserved, malformed or unregistered selector; carries the first
    /// selector byte.
    Unsupported(u8),
}

/// Splits a text field into its character table and the body bytes that
/// follow the selector.
///
/// * Empty input, or a first byte >= 0x20, means "default table, no
///   selector": the whole slice is returned as the body.
/// * The 0x10 selector is always three bytes long. When it is truncated or
///   its first table-id byte is not 0x00, the field is `Unsupported(0x10)`
///   and the table-id bytes are still consumed, so they are not mistaken
///   for text.
/// * The 0x1F selector consumes its one-byte `encoding_type_id` when present.
fn split_charset(bytes: &[u8]) -> (Charset, &[u8]) {
    let (selector, rest) = match bytes.split_first() {
        Some((&selector, rest)) => (selector, rest),
        None => return (Charset::Iso6937, bytes),
    };

    match selector {
        0x20..=0xFF => (Charset::Iso6937, bytes),
        0x00 => (Charset::Iso6937, rest),
        // Table A.3: 0x01..=0x0B select ISO 8859-5..-15, EXCEPT 0x08 which is
        // "reserved for future use" (there is no ISO 8859-12).
        0x08 => (Charset::Unsupported(0x08), rest),
        0x01..=0x0B => (Charset::Iso8859(selector + 4), rest),
        0x10 => match rest {
            [0x00, part, body @ ..] => (Charset::Iso8859(*part), body),
            // Malformed or truncated table id: skip whatever is there of it.
            _ => (Charset::Unsupported(0x10), rest.get(2..).unwrap_or_default()),
        },
        0x11 => (Charset::Ucs2Be, rest),
        0x12 => (Charset::Ksx1001, rest),
        0x13 => (Charset::Gb2312, rest),
        0x14 => (Charset::Big5, rest),
        0x15 => (Charset::Utf8, rest),
        // 0x1F: an 8-bit encoding_type_id follows; no ids are registered for
        // broadcast text, so the body is treated as undecodable.
        0x1F => (Charset::Unsupported(0x1F), rest.get(1..).unwrap_or_default()),
        reserved => (Charset::Unsupported(reserved), rest),
    }
}
237
238fn decode_iso_6937(bytes: &[u8]) -> String {
239    let mut out = String::with_capacity(bytes.len());
240    let mut i = 0;
241    while i < bytes.len() {
242        let b = bytes[i];
243        // 0xC0..=0xCF is the Figure A.1 non-spacing (combining-prefix) row.
244        if (0xC0..=0xCF).contains(&b) {
245            match combining_mark(b) {
246                Some(mark) if i + 1 < bytes.len() => {
247                    let base = bytes[i + 1];
248                    if let Some(c) = combine(b, base) {
249                        out.push(c);
250                    } else {
251                        // No precomposed form — emit base + Unicode combining
252                        // mark, which is canonically equivalent.
253                        out.push(iso_6937_single(base));
254                        out.push(mark);
255                    }
256                    i += 2;
257                }
258                // Undefined prefix (0xC0/0xC9/0xCC) or dangling prefix at end.
259                _ => {
260                    out.push('\u{FFFD}');
261                    i += 1;
262                }
263            }
264            continue;
265        }
266        out.push(iso_6937_single(b));
267        i += 1;
268    }
269    out
270}
271
/// Maps one stand-alone (non-combining) byte of the default Latin table to
/// its Unicode character.
///
/// Table source: ETSI EN 300 468 V1.19.1 (2025-02) Figure A.1 — "Character
/// code table 00 - Latin alphabet with Unicode equivalents" (PDF p. 159).
/// Per the note under the figure, the table is an ISO/IEC 6937 superset with
/// the Euro sign (U+20AC) at 0xA4.
///
/// * 0x00..=0x7F map to the same code point.
/// * 0x86, 0x87 and 0x8A are passed through as U+0086/U+0087/U+008A so the
///   caller's control-code filter can act on them; the rest of 0x80..=0x9F
///   is U+FFFD.
/// * 0xA0..=0xFF come from the table below; undefined (grey) positions and
///   the whole combining-prefix row 0xC0..=0xCF are U+FFFD.
fn iso_6937_single(b: u8) -> char {
    /// Placeholder for positions with no assigned character.
    const U: char = '\u{FFFD}';
    /// Characters for bytes 0xA0..=0xFF, one row per high nibble.
    #[rustfmt::skip]
    const GR_AREA: [char; 96] = [
        // 0xA0: NBSP ¡ ¢ £ € ¥ - § ¤ ‘ “ « ← ↑ → ↓
        '\u{00A0}', '¡', '¢', '£', '\u{20AC}', '¥', U, '§',
        '\u{00A4}', '\u{2018}', '\u{201C}', '«', '\u{2190}', '\u{2191}', '\u{2192}', '\u{2193}',
        // 0xB0: ° ± ² ³ × µ ¶ · ÷ ’ ” » ¼ ½ ¾ ¿
        '°', '±', '²', '³', '\u{00D7}', 'µ', '¶', '·',
        '\u{00F7}', '\u{2019}', '\u{201D}', '»', '¼', '½', '¾', '¿',
        // 0xC0: combining-prefix row; only reached for a stray prefix byte.
        U, U, U, U, U, U, U, U,
        U, U, U, U, U, U, U, U,
        // 0xD0: ― ¹ ® © ™ ♪ ¬ ¦ - - - - ⅛ ⅜ ⅝ ⅞
        '\u{2015}', '¹', '®', '©', '\u{2122}', '\u{266A}', '¬', '\u{00A6}',
        U, U, U, U, '\u{215B}', '\u{215C}', '\u{215D}', '\u{215E}',
        // 0xE0: Ω Æ Đ ª Ħ - Ĳ Ŀ Ł Ø Œ º Þ Ŧ Ŋ ŉ
        '\u{2126}', 'Æ', '\u{0110}', 'ª', '\u{0126}', U, '\u{0132}', '\u{013F}',
        '\u{0141}', 'Ø', '\u{0152}', 'º', 'Þ', '\u{0166}', '\u{014A}', '\u{0149}',
        // 0xF0: ĸ æ đ ð ħ ı ĳ ŀ ł ø œ ß þ ŧ ŋ SHY
        '\u{0138}', 'æ', '\u{0111}', 'ð', '\u{0127}', '\u{0131}', '\u{0133}', '\u{0140}',
        '\u{0142}', 'ø', '\u{0153}', 'ß', '\u{00FE}', '\u{0167}', '\u{014B}', '\u{00AD}',
    ];

    if b < 0xA0 {
        let passes_through = b.is_ascii() || matches!(b, 0x86 | 0x87 | 0x8A);
        return if passes_through { char::from(b) } else { U };
    }
    GR_AREA[usize::from(b - 0xA0)]
}
366
/// Looks up the Unicode combining mark that corresponds to a Figure A.1
/// non-spacing prefix byte (row 0xC0..=0xCF).
///
/// Returns `None` for the undefined positions 0xC0, 0xC9 and 0xCC, and for
/// any byte outside the row.
fn combining_mark(prefix: u8) -> Option<char> {
    /// Indexed by the low nibble of the prefix byte.
    const MARKS: [Option<char>; 16] = [
        None,             // 0xC0 undefined
        Some('\u{0300}'), // 0xC1 grave
        Some('\u{0301}'), // 0xC2 acute
        Some('\u{0302}'), // 0xC3 circumflex
        Some('\u{0303}'), // 0xC4 tilde
        Some('\u{0304}'), // 0xC5 macron
        Some('\u{0306}'), // 0xC6 breve
        Some('\u{0307}'), // 0xC7 dot above
        Some('\u{0308}'), // 0xC8 diaeresis
        None,             // 0xC9 undefined
        Some('\u{030A}'), // 0xCA ring above
        Some('\u{0327}'), // 0xCB cedilla
        None,             // 0xCC undefined
        Some('\u{030B}'), // 0xCD double acute
        Some('\u{0328}'), // 0xCE ogonek
        Some('\u{030C}'), // 0xCF caron
    ];

    let slot = prefix.checked_sub(0xC0)?;
    MARKS.get(usize::from(slot)).copied().flatten()
}
387
/// Returns the precomposed Unicode character for a Figure A.1 non-spacing
/// prefix byte followed by an ASCII base letter, or `None` when this table
/// has no entry for the pair.
///
/// The circumflex group includes the consonant and semivowel forms of the
/// ISO 6937 repertoire (Ĉ Ĝ Ĥ Ĵ Ŝ Ŵ Ŷ and lower case), so Esperanto and Welsh
/// text is emitted with single precomposed code points rather than as base
/// letter + combining mark.
fn combine(prefix: u8, base: u8) -> Option<char> {
    Some(match (prefix, base) {
        // grave
        (0xC1, b'A') => 'À',
        (0xC1, b'E') => 'È',
        (0xC1, b'I') => 'Ì',
        (0xC1, b'O') => 'Ò',
        (0xC1, b'U') => 'Ù',
        (0xC1, b'a') => 'à',
        (0xC1, b'e') => 'è',
        (0xC1, b'i') => 'ì',
        (0xC1, b'o') => 'ò',
        (0xC1, b'u') => 'ù',
        // acute
        (0xC2, b'A') => 'Á',
        (0xC2, b'E') => 'É',
        (0xC2, b'I') => 'Í',
        (0xC2, b'O') => 'Ó',
        (0xC2, b'U') => 'Ú',
        (0xC2, b'Y') => 'Ý',
        (0xC2, b'a') => 'á',
        (0xC2, b'e') => 'é',
        (0xC2, b'i') => 'í',
        (0xC2, b'o') => 'ó',
        (0xC2, b'u') => 'ú',
        (0xC2, b'y') => 'ý',
        (0xC2, b'C') => 'Ć',
        (0xC2, b'c') => 'ć',
        (0xC2, b'L') => 'Ĺ',
        (0xC2, b'l') => 'ĺ',
        (0xC2, b'N') => 'Ń',
        (0xC2, b'n') => 'ń',
        (0xC2, b'R') => 'Ŕ',
        (0xC2, b'r') => 'ŕ',
        (0xC2, b'S') => 'Ś',
        (0xC2, b's') => 'ś',
        (0xC2, b'Z') => 'Ź',
        (0xC2, b'z') => 'ź',
        // circumflex
        (0xC3, b'A') => 'Â',
        (0xC3, b'E') => 'Ê',
        (0xC3, b'I') => 'Î',
        (0xC3, b'O') => 'Ô',
        (0xC3, b'U') => 'Û',
        (0xC3, b'a') => 'â',
        (0xC3, b'e') => 'ê',
        (0xC3, b'i') => 'î',
        (0xC3, b'o') => 'ô',
        (0xC3, b'u') => 'û',
        (0xC3, b'C') => '\u{0108}', // Ĉ
        (0xC3, b'c') => '\u{0109}', // ĉ
        (0xC3, b'G') => '\u{011C}', // Ĝ
        (0xC3, b'g') => '\u{011D}', // ĝ
        (0xC3, b'H') => '\u{0124}', // Ĥ
        (0xC3, b'h') => '\u{0125}', // ĥ
        (0xC3, b'J') => '\u{0134}', // Ĵ
        (0xC3, b'j') => '\u{0135}', // ĵ
        (0xC3, b'S') => '\u{015C}', // Ŝ
        (0xC3, b's') => '\u{015D}', // ŝ
        (0xC3, b'W') => '\u{0174}', // Ŵ
        (0xC3, b'w') => '\u{0175}', // ŵ
        (0xC3, b'Y') => '\u{0176}', // Ŷ
        (0xC3, b'y') => '\u{0177}', // ŷ
        // tilde
        (0xC4, b'A') => 'Ã',
        (0xC4, b'N') => 'Ñ',
        (0xC4, b'O') => 'Õ',
        (0xC4, b'a') => 'ã',
        (0xC4, b'n') => 'ñ',
        (0xC4, b'o') => 'õ',
        (0xC4, b'I') => 'Ĩ',
        (0xC4, b'i') => 'ĩ',
        (0xC4, b'U') => 'Ũ',
        (0xC4, b'u') => 'ũ',
        // macron
        (0xC5, b'A') => 'Ā',
        (0xC5, b'a') => 'ā',
        (0xC5, b'E') => 'Ē',
        (0xC5, b'e') => 'ē',
        (0xC5, b'I') => 'Ī',
        (0xC5, b'i') => 'ī',
        (0xC5, b'O') => 'Ō',
        (0xC5, b'o') => 'ō',
        (0xC5, b'U') => 'Ū',
        (0xC5, b'u') => 'ū',
        // breve
        (0xC6, b'A') => 'Ă',
        (0xC6, b'a') => 'ă',
        (0xC6, b'G') => 'Ğ',
        (0xC6, b'g') => 'ğ',
        (0xC6, b'U') => 'Ŭ',
        (0xC6, b'u') => 'ŭ',
        // dot above
        (0xC7, b'C') => 'Ċ',
        (0xC7, b'c') => 'ċ',
        (0xC7, b'E') => 'Ė',
        (0xC7, b'e') => 'ė',
        (0xC7, b'G') => 'Ġ',
        (0xC7, b'g') => 'ġ',
        (0xC7, b'I') => 'İ',
        (0xC7, b'Z') => 'Ż',
        (0xC7, b'z') => 'ż',
        // diaeresis
        (0xC8, b'A') => 'Ä',
        (0xC8, b'E') => 'Ë',
        (0xC8, b'I') => 'Ï',
        (0xC8, b'O') => 'Ö',
        (0xC8, b'U') => 'Ü',
        (0xC8, b'Y') => 'Ÿ',
        (0xC8, b'a') => 'ä',
        (0xC8, b'e') => 'ë',
        (0xC8, b'i') => 'ï',
        (0xC8, b'o') => 'ö',
        (0xC8, b'u') => 'ü',
        (0xC8, b'y') => 'ÿ',
        // ring above
        (0xCA, b'A') => 'Å',
        (0xCA, b'a') => 'å',
        (0xCA, b'U') => 'Ů',
        (0xCA, b'u') => 'ů',
        // cedilla
        (0xCB, b'C') => 'Ç',
        (0xCB, b'c') => 'ç',
        (0xCB, b'G') => 'Ģ',
        (0xCB, b'g') => 'ģ',
        (0xCB, b'K') => 'Ķ',
        (0xCB, b'k') => 'ķ',
        (0xCB, b'L') => 'Ļ',
        (0xCB, b'l') => 'ļ',
        (0xCB, b'N') => 'Ņ',
        (0xCB, b'n') => 'ņ',
        (0xCB, b'R') => 'Ŗ',
        (0xCB, b'r') => 'ŗ',
        (0xCB, b'S') => 'Ş',
        (0xCB, b's') => 'ş',
        (0xCB, b'T') => 'Ţ',
        (0xCB, b't') => 'ţ',
        // double acute
        (0xCD, b'O') => 'Ő',
        (0xCD, b'o') => 'ő',
        (0xCD, b'U') => 'Ű',
        (0xCD, b'u') => 'ű',
        // ogonek
        (0xCE, b'A') => 'Ą',
        (0xCE, b'a') => 'ą',
        (0xCE, b'E') => 'Ę',
        (0xCE, b'e') => 'ę',
        (0xCE, b'I') => 'Į',
        (0xCE, b'i') => 'į',
        (0xCE, b'U') => 'Ų',
        (0xCE, b'u') => 'ų',
        // caron
        (0xCF, b'C') => 'Č',
        (0xCF, b'c') => 'č',
        (0xCF, b'D') => 'Ď',
        (0xCF, b'd') => 'ď',
        (0xCF, b'E') => 'Ě',
        (0xCF, b'e') => 'ě',
        (0xCF, b'L') => 'Ľ',
        (0xCF, b'l') => 'ľ',
        (0xCF, b'N') => 'Ň',
        (0xCF, b'n') => 'ň',
        (0xCF, b'R') => 'Ř',
        (0xCF, b'r') => 'ř',
        (0xCF, b'S') => 'Š',
        (0xCF, b's') => 'š',
        (0xCF, b'T') => 'Ť',
        (0xCF, b't') => 'ť',
        (0xCF, b'Z') => 'Ž',
        (0xCF, b'z') => 'ž',
        _ => return None,
    })
}
541
/// Decodes a body encoded in ISO/IEC 8859 part `n`.
///
/// * Part 1 is decoded by a direct byte-to-code-point cast and works without
///   `std`.
/// * Parts 2–11 and 13–15 use `encoding_rs` tables and therefore need the
///   `std` feature; without it every byte becomes U+FFFD.
/// * Any other part number yields one U+FFFD per byte.
///
/// Bytes 0x80..=0x9F always come out as U+0080..=U+009F so the caller's
/// Annex A control-code filter can recognise 0x86 / 0x87 / 0x8A.
fn decode_iso_8859(n: u8, bytes: &[u8]) -> String {
    // ISO/IEC 8859-1 (Latin-1) is the first 256 Unicode code points exactly,
    // so no codec table is needed. (encoding_rs has no pure 8859-1;
    // WINDOWS_1252 differs in 0x80–0x9F, so it must not be used here.)
    if n == 1 {
        return bytes.iter().copied().map(char::from).collect();
    }
    #[cfg(feature = "std")]
    {
        use encoding_rs::*;
        // `true` marks tables that are Windows supersets: encoding_rs has no
        // pure 8859-9 / 8859-11, and windows-1254 / windows-874 put printable
        // glyphs (†, ‡, Š, €, …) at 0x80–0x9F where DVB expects controls.
        let (encoding, c1_is_printable): (&'static Encoding, bool) = match n {
            2 => (ISO_8859_2, false),
            3 => (ISO_8859_3, false),
            4 => (ISO_8859_4, false),
            5 => (ISO_8859_5, false),
            6 => (ISO_8859_6, false),
            7 => (ISO_8859_7, false),
            8 => (ISO_8859_8, false),
            9 => (WINDOWS_1254, true),
            10 => (ISO_8859_10, false),
            11 => (WINDOWS_874, true),
            13 => (ISO_8859_13, false),
            14 => (ISO_8859_14, false),
            15 => (ISO_8859_15, false),
            _ => return bytes.iter().map(|_| '\u{FFFD}').collect(),
        };
        // No BOM sniffing: a body that happens to start with FF FE, FE FF or
        // EF BB BF is ordinary text in the selected table, not a signature.
        let (text, _had_errors) = encoding.decode_without_bom_handling(bytes);
        if !c1_is_printable {
            return text.into_owned();
        }
        // Single-byte decoders emit exactly one char per input byte, so the
        // two sequences can be walked in lock-step to restore C1 controls.
        text.chars()
            .zip(bytes)
            .map(|(decoded, &raw)| {
                if (0x80..=0x9F).contains(&raw) {
                    char::from(raw)
                } else {
                    decoded
                }
            })
            .collect()
    }
    #[cfg(not(feature = "std"))]
    {
        let _ = n;
        bytes.iter().map(|_| '\u{FFFD}').collect()
    }
}
581
/// Decodes `bytes` with the given `encoding_rs` table, replacing malformed
/// sequences with U+FFFD.
///
/// BOM sniffing is deliberately off: the charset was already chosen by the
/// Annex A selector, so a body that starts with bytes resembling a UTF-8 or
/// UTF-16 byte-order mark must still be decoded with `encoding`.
#[cfg(feature = "std")]
fn decode_with(encoding: &'static encoding_rs::Encoding, bytes: &[u8]) -> String {
    let (text, _had_errors) = encoding.decode_without_bom_handling(bytes);
    text.into_owned()
}
587
/// Decodes big-endian 16-bit code units into a `String`.
///
/// Unpaired surrogates become U+FFFD (via `String::from_utf16_lossy`). A
/// trailing odd byte cannot form a code unit; it is reported as one final
/// U+FFFD instead of being dropped silently, matching how truncated input is
/// signalled by the other decoders in this module.
fn decode_ucs2_be(bytes: &[u8]) -> String {
    let pairs = bytes.chunks_exact(2);
    let has_dangling_byte = !pairs.remainder().is_empty();

    let code_units: Vec<u16> = pairs
        .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
        .collect();

    let mut text = String::from_utf16_lossy(&code_units);
    if has_dangling_byte {
        text.push('\u{FFFD}');
    }
    text
}
595
/// Unit tests for the Annex A decoder.
///
/// Tests that rely on the `encoding_rs` codec tables are gated on the `std`
/// feature, mirroring the gating of the code they exercise; without `std`
/// those selectors decode to U+FFFD by design.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn decode_empty_input_returns_empty_string() {
        assert_eq!(decode_dvb_string(&[]), "");
    }

    #[test]
    fn decode_plain_ascii_is_borrowed() {
        let cow = decode(b"HELLO");
        assert!(matches!(cow, Cow::Borrowed(_)));
        assert_eq!(cow, "HELLO");
    }

    #[test]
    fn decode_iso6937_latin_accent_chars() {
        assert_eq!(decode_dvb_string(&[0x00, 0xC2, b'A']), "Á");
        assert_eq!(decode_dvb_string(&[0x00, 0xC1, b'e']), "è");
        assert_eq!(decode_dvb_string(&[0x00, 0xC8, b'o']), "ö");
    }

    #[cfg(feature = "std")]
    #[test]
    fn decode_selector_0x01_yields_iso8859_5_cyrillic() {
        let s = decode_dvb_string(&[0x01, 0xB0, 0xB1]);
        assert!(s.chars().all(|c| c != '\u{FFFD}'), "got: {s:?}");
        assert!(!s.is_empty());
    }

    #[cfg(feature = "std")]
    #[test]
    fn decode_selector_0x10_extended_yields_iso8859_nn() {
        let s = decode_dvb_string(&[0x10, 0x00, 0x09, b'A', b'B']);
        assert_eq!(s, "AB");
    }

    #[test]
    fn decode_selector_0x11_ucs2_be() {
        let s = decode_dvb_string(&[0x11, 0x00, 0x41, 0x00, 0x42]);
        assert_eq!(s, "AB");
    }

    #[test]
    fn decode_selector_0x15_utf8_passthrough() {
        let s = decode_dvb_string(&[0x15, 0xC3, 0xA9, 0xC3, 0xA9]);
        assert_eq!(s, "éé");
    }

    #[test]
    fn decode_control_chars_stripped_linefeed_becomes_space() {
        let s = decode_dvb_string(b"A\x01B\nC");
        assert_eq!(s, "AB C");
    }

    #[test]
    fn emphasis_on_off_markers_stripped_per_annex_a2() {
        // 0x86 and 0x87 are emphasis on/off markers per ETSI Annex A.2 — not
        // representable in plain text, strip silently.
        let s = decode_dvb_string(&[0x00, b'A', 0x86, b'B', 0x87, b'C']);
        assert_eq!(s, "ABC");
    }

    #[test]
    fn decode_annex_a2_crlf_0x8a_becomes_space() {
        // 0x8A in DVB text maps to CR/LF per Annex A.2 — render as space.
        let s = decode_dvb_string(&[0x00, b'A', 0x8A, b'B']);
        assert_eq!(s, "A B");
    }

    #[cfg(feature = "std")]
    #[test]
    fn decode_selector_0x12_ksx1001_euc_kr() {
        // EUC-KR 0xB0A1 = '가' (HANGUL SYLLABLE GA).
        assert_eq!(decode_dvb_string(&[0x12, 0xB0, 0xA1]), "가");
    }

    #[cfg(feature = "std")]
    #[test]
    fn decode_selector_0x13_gb2312() {
        // GB-2312/GBK 0xC4E3 = '你'.
        assert_eq!(decode_dvb_string(&[0x13, 0xC4, 0xE3]), "你");
    }

    #[cfg(feature = "std")]
    #[test]
    fn decode_selector_0x14_big5() {
        // Big5 0xA4A4 = '中'.
        assert_eq!(decode_dvb_string(&[0x14, 0xA4, 0xA4]), "中");
    }

    /// A multi-byte trail byte in 0x80–0x9F must survive: the C1 control
    /// filter operates on decoded code points, never on raw trail bytes.
    /// GBK 0x8180 = '亐' (U+4E90, trail byte in the C1 range).
    #[cfg(feature = "std")]
    #[test]
    fn decode_selector_0x13_gbk_trail_byte_in_c1_range() {
        assert_eq!(decode_dvb_string(&[0x13, 0x81, 0x80]), "亐");
    }

    /// Annex A.1 two-byte control codes live at U+E080–U+E09F in the PUA
    /// (Table A.2): U+E08A is CR/LF → space; the reserved rest is stripped.
    /// GBK 0xABCD decodes to U+E08A; GBK 0xABC3 decodes to U+E080.
    #[cfg(feature = "std")]
    #[test]
    fn two_byte_control_codes_filtered() {
        assert_eq!(decode_dvb_string(&[0x13, 0xAB, 0xCD]), " ");
        assert_eq!(decode_dvb_string(&[0x13, 0xAB, 0xC3]), "");
    }

    /// 0x1F consumes its 8-bit encoding_type_id; the body is undecodable
    /// (no registered broadcast ids) and yields U+FFFD per byte.
    #[test]
    fn decode_selector_0x1f_encoding_type_id() {
        let s = decode_dvb_string(&[0x1F, 0x01, 0x41, 0x42]);
        assert_eq!(s.chars().count(), 2);
        assert!(s.chars().all(|c| c == '\u{FFFD}'));
    }

    /// Table A.3 marks single-byte selector 0x08 reserved (no ISO 8859-12).
    #[test]
    fn reserved_selector_0x08_is_unsupported() {
        let s = decode_dvb_string(&[0x08, 0x41, 0x42]);
        assert!(s.chars().all(|c| c == '\u{FFFD}'));
        assert_eq!(s.chars().count(), 2);
    }

    #[test]
    fn unknown_selector_returns_replacement_characters() {
        // Selector 0x16 is reserved for future use — each byte becomes U+FFFD.
        let s = decode_dvb_string(&[0x16, 0xAA, 0xBB, 0xCC]);
        assert_eq!(s.chars().count(), 3);
        assert!(s.chars().all(|c| c == '\u{FFFD}'));
    }

    /// ISO/IEC 8859-1 selected through the 0x10 extended selector is a valid
    /// charset and decodes as Latin-1, with or without the `std` feature.
    #[test]
    fn selector_0x10_iso_8859_1_decodes_latin1() {
        // 0x10 0x00 0x01 → ISO/IEC 8859-1 (Latin-1): bytes are the first 256
        // Unicode code points 1:1, so 0xE9 → 'é'. (A valid charset; must NOT be
        // treated as unsupported / U+FFFD.)
        let s = decode_dvb_string(&[0x10, 0x00, 0x01, 0x41, 0xE9]);
        assert_eq!(s, "Aé");
    }

    /// An unsupported ISO 8859 part number (via 0x10 extended selector) yields
    /// U+FFFD per byte rather than Latin-1 passthrough.
    #[test]
    fn unsupported_iso_8859_12_yields_replacement() {
        // 0x10 0x00 0x0C → ISO 8859-12 does not exist (reserved); unsupported
        // parts decode to U+FFFD, not fabricated text.
        let s = decode_dvb_string(&[0x10, 0x00, 0x0C, 0x41, 0x42]);
        assert!(s.chars().all(|c| c == '\u{FFFD}'), "got: {s:?}");
    }

    /// Pins the GR-area single-byte mappings to ETSI EN 300 468 V1.19.1
    /// (2025-02) Figure A.1 — "Character code table 00 - Latin alphabet with
    /// Unicode equivalents" (PDF p. 159; vendored at
    /// `specs/etsi_en_300_468_v01.19.01_dvb_si.pdf`).
    #[test]
    fn figure_a1_gr_area_single_byte_mappings() {
        let pins: &[(u8, char)] = &[
            (0xA0, '\u{00A0}'), // NBSP
            (0xA1, '¡'),
            (0xA2, '¢'),
            (0xA3, '£'),
            (0xA4, '\u{20AC}'), // € — DVB addition (note under Figure A.1)
            (0xA5, '¥'),
            (0xA7, '§'),
            (0xA8, '\u{00A4}'), // ¤ general currency sign
            (0xA9, '\u{2018}'), // left single quotation mark
            (0xAA, '\u{201C}'), // left double quotation mark
            (0xAB, '«'),
            (0xAC, '\u{2190}'), // ←
            (0xAD, '\u{2191}'), // ↑
            (0xAE, '\u{2192}'), // →
            (0xAF, '\u{2193}'), // ↓
            (0xB0, '°'),
            (0xB1, '±'),
            (0xB2, '²'),
            (0xB3, '³'),
            (0xB4, '\u{00D7}'), // ×
            (0xB5, 'µ'),
            (0xB6, '¶'),
            (0xB7, '·'),
            (0xB8, '\u{00F7}'), // ÷
            (0xB9, '\u{2019}'), // right single quotation mark
            (0xBA, '\u{201D}'), // right double quotation mark
            (0xBB, '»'),
            (0xBC, '¼'),
            (0xBD, '½'),
            (0xBE, '¾'),
            (0xBF, '¿'),
            (0xD0, '\u{2015}'), // ―
            (0xD1, '¹'),
            (0xD2, '®'),
            (0xD3, '©'),
            (0xD4, '\u{2122}'), // ™
            (0xD5, '\u{266A}'), // ♪
            (0xD6, '¬'),
            (0xD7, '\u{00A6}'), // ¦
            (0xDC, '\u{215B}'), // ⅛
            (0xDD, '\u{215C}'), // ⅜
            (0xDE, '\u{215D}'), // ⅝
            (0xDF, '\u{215E}'), // ⅞
            (0xE0, '\u{2126}'), // Ω OHM SIGN
            (0xE1, 'Æ'),
            (0xE2, '\u{0110}'), // Đ
            (0xE3, 'ª'),
            (0xE4, '\u{0126}'), // Ħ
            (0xE6, '\u{0132}'), // Ĳ
            (0xE7, '\u{013F}'), // Ŀ
            (0xE8, '\u{0141}'), // Ł
            (0xE9, 'Ø'),
            (0xEA, '\u{0152}'), // Œ
            (0xEB, 'º'),
            (0xEC, 'Þ'),
            (0xED, '\u{0166}'), // Ŧ
            (0xEE, '\u{014A}'), // Ŋ
            (0xEF, '\u{0149}'), // ŉ
            (0xF0, '\u{0138}'), // ĸ
            (0xF1, 'æ'),
            (0xF2, '\u{0111}'), // đ
            (0xF3, 'ð'),
            (0xF4, '\u{0127}'), // ħ
            (0xF5, '\u{0131}'), // ı
            (0xF6, '\u{0133}'), // ĳ
            (0xF7, '\u{0140}'), // ŀ
            (0xF8, '\u{0142}'), // ł
            (0xF9, 'ø'),
            (0xFA, '\u{0153}'), // œ
            (0xFB, 'ß'),
            (0xFC, '\u{00FE}'), // þ
            (0xFD, '\u{0167}'), // ŧ
            (0xFE, '\u{014B}'), // ŋ
            (0xFF, '\u{00AD}'), // SHY soft hyphen
        ];
        for &(byte, want) in pins {
            let got = decode_dvb_string(&[0x00, byte]);
            assert_eq!(
                got,
                want.to_string(),
                "byte {byte:#04x}: want {want:?} (U+{:04X}), got {got:?}",
                want as u32
            );
        }
    }

    /// Bytes undefined (grey) in Figure A.1 decode to U+FFFD.
    #[test]
    fn figure_a1_undefined_positions_are_replacement() {
        for byte in [0xA6u8, 0xD8, 0xD9, 0xDA, 0xDB, 0xE5] {
            let got = decode_dvb_string(&[0x00, byte]);
            assert_eq!(got, "\u{FFFD}", "byte {byte:#04x} should be U+FFFD");
        }
    }

    /// C-row prefixes with precomposed entries (Figure A.1 non-spacing row).
    #[test]
    fn figure_a1_combining_precomposed() {
        assert_eq!(decode_dvb_string(&[0x00, 0xCA, b'a']), "å"); // ring U+030A
        assert_eq!(decode_dvb_string(&[0x00, 0xCA, b'A']), "Å");
        assert_eq!(decode_dvb_string(&[0x00, 0xCF, b's']), "š"); // caron U+030C
        assert_eq!(decode_dvb_string(&[0x00, 0xCF, b'Z']), "Ž");
        assert_eq!(decode_dvb_string(&[0x00, 0xCE, b'e']), "ę"); // ogonek U+0328
        assert_eq!(decode_dvb_string(&[0x00, 0xCD, b'o']), "ő"); // double acute U+030B
        assert_eq!(decode_dvb_string(&[0x00, 0xC7, b'z']), "ż"); // dot above U+0307
        assert_eq!(decode_dvb_string(&[0x00, 0xC5, b'a']), "ā"); // macron U+0304
        assert_eq!(decode_dvb_string(&[0x00, 0xC6, b'g']), "ğ"); // breve U+0306
    }

    /// A defined prefix with no precomposed form falls back to
    /// base + Unicode combining mark (canonically equivalent).
    #[test]
    fn figure_a1_combining_fallback_emits_base_plus_mark() {
        assert_eq!(decode_dvb_string(&[0x00, 0xC5, b'x']), "x\u{0304}");
    }

    /// Undefined C-row prefixes (0xC0, 0xC9, 0xCC) and a dangling prefix at
    /// end of input decode to U+FFFD.
    #[test]
    fn figure_a1_combining_undefined_or_dangling_prefix() {
        assert_eq!(decode_dvb_string(&[0x00, 0xC0, b'a']), "\u{FFFD}a");
        assert_eq!(decode_dvb_string(&[0x00, 0xC9, b'a']), "\u{FFFD}a");
        assert_eq!(decode_dvb_string(&[0x00, 0xCC, b'a']), "\u{FFFD}a");
        assert_eq!(decode_dvb_string(&[0x00, 0xC2]), "\u{FFFD}");
    }

    #[test]
    fn dvb_text_decodes_with_charset_selector() {
        let t = DvbText::new(&[0x15, 0xC3, 0xA9]); // UTF-8 selector + é
        assert_eq!(t.decode(), "é");
        assert_eq!(t.raw(), &[0x15, 0xC3, 0xA9]);
        assert_eq!(&t[..], &[0x15, 0xC3, 0xA9]); // Deref
        assert_eq!(format!("{t}"), "é");
    }

    #[test]
    fn lang_code_as_str() {
        assert_eq!(LangCode(*b"fre").as_str(), "fre");
        assert_eq!(LangCode([0xFF, b'r', b'e']).as_str(), "\u{FFFD}re"); // lossy, no panic
    }

    #[cfg(feature = "serde")]
    #[test]
    fn dvb_text_serializes_decoded() {
        let t = DvbText::new(&[0x15, 0xC3, 0xA9]);
        assert_eq!(serde_json::to_string(&t).unwrap(), "\"é\"");
    }

    #[cfg(feature = "serde")]
    #[test]
    fn lang_code_serializes_as_string() {
        // Serialize-only: LangCode renders as its decoded string. Parsing FROM
        // JSON is deliberately unsupported (re-parse from wire bytes instead).
        let lc = LangCode(*b"FRA");
        assert_eq!(serde_json::to_string(&lc).unwrap(), "\"FRA\"");
    }
}
dvb_si/text/mod.rs

dvb_si/text/
mod.rs