Skip to main content

dicom_toolkit_core/
charset.rs

1//! Character set handling for DICOM's Specific Character Set (0008,0005).
2//!
3//! Wraps the `encoding_rs` crate to provide DICOM-aware character encoding
4//! and decoding, replacing DCMTK's `oficonv` module.
5//!
6//! Supports:
7//! - Single-byte character sets (Latin-1 through Latin-9, Cyrillic, Arabic,
8//!   Greek, Hebrew, Thai)
9//! - Multi-byte character sets (Japanese, Korean, Simplified Chinese)
10//! - ISO 2022 code extensions with escape sequence switching
11//! - UTF-8 (ISO_IR 192)
12
13use crate::error::{DcmError, DcmResult};
14use encoding_rs::Encoding;
15
16/// Maps a DICOM Specific Character Set defined term to an `encoding_rs` encoding.
17///
18/// Reference: DICOM PS3.3 C.12.1.1.2, Table C.12-2 and C.12-3.
19pub fn encoding_for_term(term: &str) -> DcmResult<&'static Encoding> {
20    let encoding = match term.trim() {
21        // Default repertoire — ASCII (we use WINDOWS_1252 as the nearest superset;
22        // encoding_rs does not have a pure ISO 646/ASCII codec)
23        "" | "ISO_IR 6" | "ISO 2022 IR 6" => encoding_rs::WINDOWS_1252,
24
25        // Latin alphabet No. 1 — ISO 8859-1
26        "ISO_IR 100" | "ISO 2022 IR 100" => encoding_rs::WINDOWS_1252,
27        // Latin alphabet No. 2 — ISO 8859-2
28        "ISO_IR 101" | "ISO 2022 IR 101" => encoding_rs::ISO_8859_2,
29        // Latin alphabet No. 3 — ISO 8859-3
30        "ISO_IR 109" | "ISO 2022 IR 109" => encoding_rs::ISO_8859_3,
31        // Latin alphabet No. 4 — ISO 8859-4
32        "ISO_IR 110" | "ISO 2022 IR 110" => encoding_rs::ISO_8859_4,
33        // Cyrillic — ISO 8859-5
34        "ISO_IR 144" | "ISO 2022 IR 144" => encoding_rs::ISO_8859_5,
35        // Arabic — ISO 8859-6
36        "ISO_IR 127" | "ISO 2022 IR 127" => encoding_rs::ISO_8859_6,
37        // Greek — ISO 8859-7
38        "ISO_IR 126" | "ISO 2022 IR 126" => encoding_rs::ISO_8859_7,
39        // Hebrew — ISO 8859-8
40        "ISO_IR 138" | "ISO 2022 IR 138" => encoding_rs::ISO_8859_8,
41        // Latin alphabet No. 5 — ISO 8859-9 (encoding_rs maps WINDOWS_1254 ≈ 8859-9)
42        "ISO_IR 148" | "ISO 2022 IR 148" => encoding_rs::WINDOWS_1254,
43        // Latin alphabet No. 9 — ISO 8859-15 (encoding_rs maps ISO_8859_15)
44        "ISO_IR 203" | "ISO 2022 IR 203" => encoding_rs::ISO_8859_15,
45
46        // Thai — TIS 620-2533 (WINDOWS_874 is the superset)
47        "ISO_IR 166" | "ISO 2022 IR 166" => encoding_rs::WINDOWS_874,
48
49        // Japanese — JIS X 0201 (Shift_JIS covers both Romaji and Katakana halves)
50        "ISO_IR 13" | "ISO 2022 IR 13" => encoding_rs::SHIFT_JIS,
51        // Japanese — JIS X 0208 (Kanji) via ISO-2022-JP
52        "ISO 2022 IR 87" => encoding_rs::ISO_2022_JP,
53        // Japanese — JIS X 0212 (Supplementary Kanji)
54        "ISO 2022 IR 159" => encoding_rs::ISO_2022_JP,
55
56        // Korean — KS X 1001
57        "ISO 2022 IR 149" => encoding_rs::EUC_KR,
58        // Simplified Chinese — GB 2312
59        "ISO 2022 IR 58" => encoding_rs::GB18030,
60
61        // Unicode
62        "ISO_IR 192" => encoding_rs::UTF_8,
63
64        // GBK / GB18030 (extensions used by some Chinese implementations)
65        "GBK" => encoding_rs::GBK,
66        "GB18030" => encoding_rs::GB18030,
67
68        _ => {
69            return Err(DcmError::CharsetError {
70                reason: format!("unknown DICOM character set term: '{term}'"),
71            });
72        }
73    };
74    Ok(encoding)
75}
76
77/// Decodes a byte slice using the specified DICOM character set term.
78pub fn decode_string(bytes: &[u8], term: &str) -> DcmResult<String> {
79    let encoding = encoding_for_term(term)?;
80    let (decoded, _, had_errors) = encoding.decode(bytes);
81    if had_errors {
82        return Err(DcmError::CharsetError {
83            reason: format!("decoding error using charset '{term}'"),
84        });
85    }
86    Ok(decoded.into_owned())
87}
88
89/// Encodes a string using the specified DICOM character set term.
90pub fn encode_string(s: &str, term: &str) -> DcmResult<Vec<u8>> {
91    let encoding = encoding_for_term(term)?;
92    let (encoded, _, had_errors) = encoding.encode(s);
93    if had_errors {
94        return Err(DcmError::CharsetError {
95            reason: format!("encoding error using charset '{term}'"),
96        });
97    }
98    Ok(encoded.into_owned())
99}
100
101/// Handles DICOM's multi-valued Specific Character Set with ISO 2022
102/// code extension support.
103///
104/// When (0008,0005) contains multiple values (e.g., `ISO_IR 100\ISO 2022 IR 87`),
105/// different segments of a string may use different encodings, separated by
106/// ISO 2022 escape sequences. This decoder handles the segment splitting and
107/// per-segment decoding, matching DCMTK's `DcmSpecificCharacterSet`.
108pub struct DicomCharsetDecoder {
109    /// Default encoding (first term, or ASCII if empty).
110    default_encoding: &'static Encoding,
111    /// Default defined term (used to restore ISO 2022 state).
112    default_term: String,
113    /// Scan mode for the default term.
114    default_scan_mode: ScanMode,
115    /// Map from ISO 2022 defined term → encoding, for code extension switching.
116    extensions: Vec<(String, &'static Encoding)>,
117    /// True if we have multiple charsets (ISO 2022 code extensions).
118    has_extensions: bool,
119}
120
/// How the code-extension scanner steps over the bytes of the currently
/// active character set while looking for ESC and delimiter bytes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ScanMode {
    /// Every byte is examined individually; nothing is skipped.
    SingleByte,
    /// Every character occupies the given number of bytes, so the trailing
    /// bytes of each character are skipped.
    FixedWidth(usize),
    /// A byte with the high bit set starts a character of the given width;
    /// bytes without the high bit are examined individually.
    HighBitLead(usize),
}
127
/// Translates the bytes that follow an ESC (0x1B) into the DICOM defined term
/// they designate.
///
/// `esc` must not include the ESC byte itself. Only the leading two or three
/// bytes are inspected; anything after them is ignored. Returns `None` for
/// sequences shorter than two bytes and for designations this module does
/// not know.
///
/// Ported from DCMTK dcspchrs.cc `convertStringWithCodeExtensions()`.
fn escape_to_term(esc: &[u8]) -> Option<&'static str> {
    let term = match esc {
        // Single-byte sets designated to G0.
        [0x28, 0x42, ..] => "ISO 2022 IR 6",  // ASCII
        [0x28, 0x4A, ..] => "ISO 2022 IR 13", // Japanese Romaji

        // Single-byte sets designated to G1.
        [0x29, 0x49, ..] => "ISO 2022 IR 13",  // Japanese Katakana
        [0x2D, 0x41, ..] => "ISO 2022 IR 100", // Latin-1
        [0x2D, 0x42, ..] => "ISO 2022 IR 101", // Latin-2
        [0x2D, 0x43, ..] => "ISO 2022 IR 109", // Latin-3
        [0x2D, 0x44, ..] => "ISO 2022 IR 110", // Latin-4
        [0x2D, 0x46, ..] => "ISO 2022 IR 126", // Greek
        [0x2D, 0x47, ..] => "ISO 2022 IR 127", // Arabic
        [0x2D, 0x48, ..] => "ISO 2022 IR 138", // Hebrew
        [0x2D, 0x4C, ..] => "ISO 2022 IR 144", // Cyrillic
        [0x2D, 0x4D, ..] => "ISO 2022 IR 148", // Latin-5
        [0x2D, 0x54, ..] => "ISO 2022 IR 166", // Thai
        [0x2D, 0x62, ..] => "ISO 2022 IR 203", // Latin-9

        // Multi-byte sets.
        [0x24, 0x42, ..] => "ISO 2022 IR 87",        // Japanese Kanji (JIS X0208)
        [0x24, 0x28, 0x44, ..] => "ISO 2022 IR 159", // Japanese Supplementary Kanji (JIS X0212)
        [0x24, 0x29, 0x43, ..] => "ISO 2022 IR 149", // Korean
        [0x24, 0x29, 0x41, ..] => "ISO 2022 IR 58",  // Simplified Chinese

        _ => return None,
    };
    Some(term)
}
162
/// Reports how many bytes of `data` belong to an ISO 2022 escape sequence,
/// not counting the ESC byte that precedes `data`.
///
/// Yields 0 when fewer than two bytes are available, 3 when the sequence
/// starts with `$ (` or `$ )`, and 2 for every other sequence. The result is
/// derived from the first two bytes only, so it may exceed `data.len()`.
fn escape_seq_len(data: &[u8]) -> usize {
    match data {
        // ESC $ ( F  /  ESC $ ) F — four bytes including the ESC.
        [0x24, 0x28 | 0x29, ..] => 3,
        // Any other sequence — three bytes including the ESC.
        [_, _, ..] => 2,
        // Too short to classify.
        _ => 0,
    }
}
173
174impl DicomCharsetDecoder {
175    /// Creates a new decoder from a DICOM Specific Character Set value.
176    ///
177    /// The value may contain multiple backslash-separated terms.
178    pub fn new(specific_charset: &str) -> DcmResult<Self> {
179        let terms: Vec<&str> = specific_charset.split('\\').collect();
180        let default_term = terms.first().copied().unwrap_or("").trim().to_string();
181        let default_encoding = encoding_for_term(&default_term)?;
182        let default_scan_mode = scan_mode_for_term(&default_term);
183
184        let mut extensions = Vec::new();
185        let mut has_extensions = false;
186        for term in terms.iter().skip(1) {
187            let trimmed = term.trim();
188            if !trimmed.is_empty() {
189                let enc = encoding_for_term(trimmed)?;
190                extensions.push((trimmed.to_string(), enc));
191                has_extensions = true;
192            }
193        }
194
195        // Also register the default encoding under its "ISO 2022 IR" name,
196        // and always include ASCII as a fallback.
197        if has_extensions {
198            let first_term = terms.first().copied().unwrap_or("").trim();
199            if !first_term.is_empty() {
200                extensions.push((first_term.to_string(), default_encoding));
201            }
202            // ASCII is always available
203            extensions.push(("ISO 2022 IR 6".to_string(), encoding_rs::WINDOWS_1252));
204        }
205
206        Ok(Self {
207            default_encoding,
208            default_term,
209            default_scan_mode,
210            extensions,
211            has_extensions,
212        })
213    }
214
215    /// Create a decoder for a single (non-ISO 2022) charset.
216    pub fn single(encoding: &'static Encoding) -> Self {
217        Self {
218            default_encoding: encoding,
219            default_term: String::new(),
220            default_scan_mode: ScanMode::SingleByte,
221            extensions: Vec::new(),
222            has_extensions: false,
223        }
224    }
225
226    /// Default decoder (ASCII / WINDOWS-1252).
227    pub fn default_ascii() -> Self {
228        Self {
229            default_encoding: encoding_rs::WINDOWS_1252,
230            default_term: String::new(),
231            default_scan_mode: ScanMode::SingleByte,
232            extensions: Vec::new(),
233            has_extensions: false,
234        }
235    }
236
237    /// Return the default encoding.
238    pub fn default_encoding(&self) -> &'static Encoding {
239        self.default_encoding
240    }
241
242    /// Decodes a byte string using the configured character sets.
243    ///
244    /// For single-charset configs, decodes the whole buffer with the default encoding.
245    /// For multi-charset configs (ISO 2022 code extensions), splits on ESC sequences
246    /// and decodes each segment with the appropriate encoding.
247    pub fn decode(&self, bytes: &[u8]) -> DcmResult<String> {
248        if bytes.is_empty() {
249            return Ok(String::new());
250        }
251
252        // Fast path: UTF-8 input — avoid re-encoding
253        if self.default_encoding == encoding_rs::UTF_8 && !self.has_extensions {
254            return match std::str::from_utf8(bytes) {
255                Ok(s) => Ok(s.to_string()),
256                Err(_) => Ok(String::from_utf8_lossy(bytes).into_owned()),
257            };
258        }
259
260        // No code extensions: simple single-encoding decode
261        if !self.has_extensions {
262            return self.decode_with(bytes, self.default_encoding);
263        }
264
265        // ISO 2022 code-extension mode: scan for ESC (0x1B) and delimiter
266        // characters, decode each segment with the active charset.
267        self.decode_with_extensions(bytes)
268    }
269
270    /// Encode a string back to bytes using the default encoding.
271    pub fn encode(&self, s: &str) -> DcmResult<Vec<u8>> {
272        if self.default_encoding == encoding_rs::UTF_8 {
273            return Ok(s.as_bytes().to_vec());
274        }
275        let (encoded, _, had_errors) = self.default_encoding.encode(s);
276        if had_errors {
277            return Err(DcmError::CharsetError {
278                reason: "character encoding error".into(),
279            });
280        }
281        Ok(encoded.into_owned())
282    }
283
284    // ── Internal ──────────────────────────────────────────────────────────────
285
286    fn decode_with(&self, bytes: &[u8], encoding: &'static Encoding) -> DcmResult<String> {
287        let (decoded, _, had_errors) = encoding.decode(bytes);
288        if had_errors {
289            // Fall back to lossy decode rather than hard error — many real-world
290            // DICOM files have minor charset issues.
291            let (lossy, _, _) = encoding.decode(bytes);
292            return Ok(lossy.into_owned());
293        }
294        Ok(decoded.into_owned())
295    }
296
297    fn decode_with_extensions(&self, bytes: &[u8]) -> DcmResult<String> {
298        let mut result = String::new();
299        let mut current_term = self.default_term.as_str();
300        let mut current_encoding = self.default_encoding;
301        let mut current_scan_mode = self.default_scan_mode;
302        let mut segment_start = 0;
303        let mut pos = 0;
304
305        while pos < bytes.len() {
306            let b = bytes[pos];
307
308            // Check for ESC (0x1B) — charset switch
309            if b == 0x1B {
310                // Decode segment before the ESC
311                if pos > segment_start {
312                    let segment = &bytes[segment_start..pos];
313                    result.push_str(&self.decode_segment(
314                        segment,
315                        current_term,
316                        current_encoding,
317                    )?);
318                }
319
320                // Parse escape sequence
321                let remaining = &bytes[pos + 1..];
322                let esc_len = escape_seq_len(remaining);
323
324                if esc_len > 0 && remaining.len() >= esc_len {
325                    if let Some(term) = escape_to_term(&remaining[..esc_len]) {
326                        // Look up the encoding for this term
327                        current_term = term;
328                        current_encoding = self.find_encoding(term);
329                        current_scan_mode = scan_mode_for_term(term);
330                    }
331                    pos += 1 + esc_len; // skip ESC + sequence bytes
332                } else {
333                    // Unknown escape sequence — skip ESC and emit it
334                    pos += 1;
335                }
336                segment_start = pos;
337                continue;
338            }
339
340            // Delimiters (CR, LF, FF, HT) reset to default encoding
341            // per ISO 2022 / DICOM PS3.5 6.1.2.5.3
342            if b == 0x0D || b == 0x0A || b == 0x0C || b == 0x09 {
343                // Decode segment before delimiter
344                if pos > segment_start {
345                    let segment = &bytes[segment_start..pos];
346                    result.push_str(&self.decode_segment(
347                        segment,
348                        current_term,
349                        current_encoding,
350                    )?);
351                }
352                result.push(b as char);
353                current_term = self.default_term.as_str();
354                current_encoding = self.default_encoding;
355                current_scan_mode = self.default_scan_mode;
356                pos += 1;
357                segment_start = pos;
358                continue;
359            }
360
361            if let Some(skip) = current_scan_mode.skip_bytes(b, pos, bytes.len()) {
362                pos += skip;
363            }
364            pos += 1;
365        }
366
367        // Decode final segment
368        if segment_start < bytes.len() {
369            let segment = &bytes[segment_start..];
370            result.push_str(&self.decode_segment(segment, current_term, current_encoding)?);
371        }
372
373        Ok(result)
374    }
375
376    fn decode_segment(
377        &self,
378        bytes: &[u8],
379        term: &str,
380        encoding: &'static Encoding,
381    ) -> DcmResult<String> {
382        if bytes.is_empty() {
383            return Ok(String::new());
384        }
385        let wrapped;
386        let bytes = if let Some(segment) = wrap_iso2022_segment(term, bytes) {
387            wrapped = segment;
388            wrapped.as_slice()
389        } else {
390            bytes
391        };
392        let (decoded, _, had_errors) = encoding.decode(bytes);
393        if had_errors && matches!(term, "ISO 2022 IR 87" | "ISO 2022 IR 159") {
394            return Err(DcmError::CharsetError {
395                reason: format!("decoding error using charset '{term}'"),
396            });
397        }
398        Ok(decoded.into_owned())
399    }
400
401    fn find_encoding(&self, term: &str) -> &'static Encoding {
402        for (t, enc) in &self.extensions {
403            if t == term {
404                return enc;
405            }
406        }
407        // Fallback: try the global mapping
408        encoding_for_term(term).unwrap_or(self.default_encoding)
409    }
410}
411
412impl ScanMode {
413    fn skip_bytes(self, first_byte: u8, pos: usize, len: usize) -> Option<usize> {
414        match self {
415            ScanMode::SingleByte => None,
416            ScanMode::FixedWidth(width) if width > 1 && pos + width - 1 < len => Some(width - 1),
417            ScanMode::HighBitLead(width)
418                if width > 1 && (first_byte & 0x80) != 0 && pos + width - 1 < len =>
419            {
420                Some(width - 1)
421            }
422            _ => None,
423        }
424    }
425}
426
427fn scan_mode_for_term(term: &str) -> ScanMode {
428    match term {
429        "ISO 2022 IR 87" | "ISO 2022 IR 159" | "ISO 2022 IR 58" => ScanMode::FixedWidth(2),
430        "ISO 2022 IR 149" => ScanMode::HighBitLead(2),
431        _ => ScanMode::SingleByte,
432    }
433}
434
/// Re-wraps a bare Japanese multi-byte segment in ISO 2022 escape sequences.
///
/// For `ISO 2022 IR 87` and `ISO 2022 IR 159` the result is the matching
/// designation sequence, then `bytes`, then `ESC ( B` to return to ASCII.
/// Every other term yields `None`, meaning the segment needs no wrapping.
///
/// NOTE(review): confirm that the ISO-2022-JP decoder in `encoding_rs`
/// accepts the JIS X 0212 designation (`ESC $ ( D`) used for `ISO 2022 IR 159`.
fn wrap_iso2022_segment(term: &str, bytes: &[u8]) -> Option<Vec<u8>> {
    const RETURN_TO_ASCII: [u8; 3] = [0x1B, 0x28, 0x42];

    let designator: &[u8] = match term {
        "ISO 2022 IR 87" => &[0x1B, 0x24, 0x42],
        "ISO 2022 IR 159" => &[0x1B, 0x24, 0x28, 0x44],
        _ => return None,
    };

    Some([designator, bytes, &RETURN_TO_ASCII[..]].concat())
}
448
#[cfg(test)]
mod tests {
    use super::*;

    /// "Müller" in ISO-8859-1: ü = 0xFC.
    const MUELLER_LATIN1: [u8; 6] = [b'M', 0xFC, b'l', b'l', b'e', b'r'];

    // ── Term lookup ───────────────────────────────────────────────────────────

    #[test]
    fn default_charset() {
        for term in ["", "ISO_IR 6", "ISO 2022 IR 6"] {
            assert!(encoding_for_term(term).is_ok());
        }
    }

    #[test]
    fn utf8_charset() {
        assert_eq!(encoding_for_term("ISO_IR 192").unwrap(), encoding_rs::UTF_8);
    }

    #[test]
    fn latin1_maps_to_windows1252() {
        // ISO_IR 100 must map to WINDOWS_1252 (superset of ISO-8859-1)
        assert_eq!(
            encoding_for_term("ISO_IR 100").unwrap(),
            encoding_rs::WINDOWS_1252
        );
    }

    #[test]
    fn latin9_supported() {
        assert_eq!(
            encoding_for_term("ISO_IR 203").unwrap(),
            encoding_rs::ISO_8859_15
        );
    }

    #[test]
    fn unknown_charset() {
        assert!(encoding_for_term("UNKNOWN_CHARSET").is_err());
    }

    // ── Free-function decode / encode ─────────────────────────────────────────

    #[test]
    fn decode_ascii() {
        assert_eq!(decode_string(b"Hello", "").unwrap(), "Hello");
    }

    #[test]
    fn decode_utf8() {
        let text = decode_string("日本語".as_bytes(), "ISO_IR 192").unwrap();
        assert_eq!(text, "日本語");
    }

    #[test]
    fn decode_latin1_umlaut() {
        let text = decode_string(&MUELLER_LATIN1, "ISO_IR 100").unwrap();
        assert_eq!(text, "Müller");
    }

    #[test]
    fn decode_latin2() {
        // Polish "Łódź" in ISO-8859-2: Ł=0xA3, ó=0xF3, ź=0xBC
        let raw = [0xA3, 0xF3, b'd', 0xBC];
        assert_eq!(decode_string(&raw, "ISO_IR 101").unwrap(), "Łódź");
    }

    #[test]
    fn decode_cyrillic() {
        // "Иванов" in ISO-8859-5
        let raw = [0xB8, 0xD2, 0xD0, 0xDD, 0xDE, 0xD2];
        assert_eq!(decode_string(&raw, "ISO_IR 144").unwrap(), "Иванов");
    }

    #[test]
    fn encode_roundtrip_latin1() {
        let name = "Müller^Hans";
        let raw = encode_string(name, "ISO_IR 100").unwrap();
        assert_eq!(decode_string(&raw, "ISO_IR 100").unwrap(), name);
    }

    // ── DicomCharsetDecoder ───────────────────────────────────────────────────

    #[test]
    fn multi_charset_decoder_single() {
        let latin1 = DicomCharsetDecoder::new("ISO_IR 100").unwrap();
        assert_eq!(latin1.decode(&MUELLER_LATIN1).unwrap(), "Müller");
    }

    #[test]
    fn multi_charset_decoder_utf8() {
        let utf8 = DicomCharsetDecoder::new("ISO_IR 192").unwrap();
        assert_eq!(utf8.decode("田中太郎".as_bytes()).unwrap(), "田中太郎");
    }

    #[test]
    fn escape_to_term_known_sequences() {
        let cases: [(&[u8], &str); 6] = [
            (&[0x28, 0x42], "ISO 2022 IR 6"),
            (&[0x2D, 0x41], "ISO 2022 IR 100"),
            (&[0x24, 0x42], "ISO 2022 IR 87"),
            (&[0x24, 0x28, 0x44], "ISO 2022 IR 159"),
            (&[0x24, 0x29, 0x43], "ISO 2022 IR 149"),
            (&[0x24, 0x29, 0x41], "ISO 2022 IR 58"),
        ];
        for (sequence, expected) in cases {
            assert_eq!(escape_to_term(sequence), Some(expected));
        }
    }

    #[test]
    fn escape_to_term_unknown() {
        assert_eq!(escape_to_term(&[0x99, 0x99]), None);
    }

    #[test]
    fn decoder_with_iso2022_japanese() {
        // ASCII text, then ESC $ B (JIS X0208), two Kanji, then ESC ( B back
        // to ASCII: "Yamada^日本"
        let japanese = DicomCharsetDecoder::new("\\ISO 2022 IR 87").unwrap();
        let mut raw = b"Yamada^".to_vec();
        raw.extend_from_slice(&[0x1B, 0x24, 0x42]); // switch to JIS X0208
        raw.extend_from_slice(&[0x46, 0x7C, 0x4B, 0x5C]); // 日本
        raw.extend_from_slice(&[0x1B, 0x28, 0x42]); // back to ASCII
        assert_eq!(japanese.decode(&raw).unwrap(), "Yamada^日本");
    }

    #[test]
    fn decoder_encode_roundtrip() {
        let latin1 = DicomCharsetDecoder::new("ISO_IR 100").unwrap();
        let greeting = "Schöne Grüße";
        let raw = latin1.encode(greeting).unwrap();
        assert_eq!(latin1.decode(&raw).unwrap(), greeting);
    }
}
579}