1use crate::error::{DcmError, DcmResult};
14use encoding_rs::Encoding;
15
16pub fn encoding_for_term(term: &str) -> DcmResult<&'static Encoding> {
20 let encoding = match term.trim() {
21 "" | "ISO_IR 6" | "ISO 2022 IR 6" => encoding_rs::WINDOWS_1252,
24
25 "ISO_IR 100" | "ISO 2022 IR 100" => encoding_rs::WINDOWS_1252,
27 "ISO_IR 101" | "ISO 2022 IR 101" => encoding_rs::ISO_8859_2,
29 "ISO_IR 109" | "ISO 2022 IR 109" => encoding_rs::ISO_8859_3,
31 "ISO_IR 110" | "ISO 2022 IR 110" => encoding_rs::ISO_8859_4,
33 "ISO_IR 144" | "ISO 2022 IR 144" => encoding_rs::ISO_8859_5,
35 "ISO_IR 127" | "ISO 2022 IR 127" => encoding_rs::ISO_8859_6,
37 "ISO_IR 126" | "ISO 2022 IR 126" => encoding_rs::ISO_8859_7,
39 "ISO_IR 138" | "ISO 2022 IR 138" => encoding_rs::ISO_8859_8,
41 "ISO_IR 148" | "ISO 2022 IR 148" => encoding_rs::WINDOWS_1254,
43 "ISO_IR 203" | "ISO 2022 IR 203" => encoding_rs::ISO_8859_15,
45
46 "ISO_IR 166" | "ISO 2022 IR 166" => encoding_rs::WINDOWS_874,
48
49 "ISO_IR 13" | "ISO 2022 IR 13" => encoding_rs::SHIFT_JIS,
51 "ISO 2022 IR 87" => encoding_rs::ISO_2022_JP,
53 "ISO 2022 IR 159" => encoding_rs::ISO_2022_JP,
55
56 "ISO 2022 IR 149" => encoding_rs::EUC_KR,
58 "ISO 2022 IR 58" => encoding_rs::GB18030,
60
61 "ISO_IR 192" => encoding_rs::UTF_8,
63
64 "GBK" => encoding_rs::GBK,
66 "GB18030" => encoding_rs::GB18030,
67
68 _ => {
69 return Err(DcmError::CharsetError {
70 reason: format!("unknown DICOM character set term: '{term}'"),
71 });
72 }
73 };
74 Ok(encoding)
75}
76
77pub fn decode_string(bytes: &[u8], term: &str) -> DcmResult<String> {
79 let encoding = encoding_for_term(term)?;
80 let (decoded, _, had_errors) = encoding.decode(bytes);
81 if had_errors {
82 return Err(DcmError::CharsetError {
83 reason: format!("decoding error using charset '{term}'"),
84 });
85 }
86 Ok(decoded.into_owned())
87}
88
89pub fn encode_string(s: &str, term: &str) -> DcmResult<Vec<u8>> {
91 let encoding = encoding_for_term(term)?;
92 let (encoded, _, had_errors) = encoding.encode(s);
93 if had_errors {
94 return Err(DcmError::CharsetError {
95 reason: format!("encoding error using charset '{term}'"),
96 });
97 }
98 Ok(encoded.into_owned())
99}
100
101pub struct DicomCharsetDecoder {
109 default_encoding: &'static Encoding,
111 default_term: String,
113 default_scan_mode: ScanMode,
115 extensions: Vec<(String, &'static Encoding)>,
117 has_extensions: bool,
119}
120
/// How the escape-sequence scanner steps over the bytes of the active character set.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ScanMode {
    /// Every byte is examined on its own; `skip_bytes` never skips anything.
    SingleByte,
    /// Every character occupies the given number of bytes, whatever its first byte is.
    FixedWidth(usize),
    /// Only a byte with the high bit (`0x80`) set starts a character of the
    /// given width; all other bytes are stepped over one at a time.
    HighBitLead(usize),
}
127
/// Maps the bytes that follow an ESC (`0x1B`) to the matching `ISO 2022 IR n` term.
///
/// `esc` must not include the ESC byte itself. Only the leading two bytes (three
/// for the `$ (` and `$ )` designations) are inspected; trailing bytes are
/// ignored. Returns `None` for sequences that are too short or not in the table.
fn escape_to_term(esc: &[u8]) -> Option<&'static str> {
    let term = match esc {
        // Designations introduced by `(` or `)`.
        [0x28, 0x42, ..] => "ISO 2022 IR 6",
        [0x29, 0x49, ..] | [0x28, 0x4A, ..] => "ISO 2022 IR 13",

        // Designations introduced by `-`.
        [0x2D, final_byte, ..] => match *final_byte {
            0x41 => "ISO 2022 IR 100",
            0x42 => "ISO 2022 IR 101",
            0x43 => "ISO 2022 IR 109",
            0x44 => "ISO 2022 IR 110",
            0x4C => "ISO 2022 IR 144",
            0x47 => "ISO 2022 IR 127",
            0x46 => "ISO 2022 IR 126",
            0x48 => "ISO 2022 IR 138",
            0x4D => "ISO 2022 IR 148",
            0x62 => "ISO 2022 IR 203",
            0x54 => "ISO 2022 IR 166",
            _ => return None,
        },

        // Designations introduced by `$`.
        [0x24, 0x42, ..] => "ISO 2022 IR 87",
        [0x24, 0x28, 0x44, ..] => "ISO 2022 IR 159",
        [0x24, 0x29, 0x43, ..] => "ISO 2022 IR 149",
        [0x24, 0x29, 0x41, ..] => "ISO 2022 IR 58",

        _ => return None,
    };
    Some(term)
}
162
/// Returns how many bytes after the ESC byte make up the escape sequence that
/// starts `data`.
///
/// Yields `0` when fewer than two bytes are available, `3` for sequences that
/// begin with `$ (` or `$ )`, and `2` for everything else. The result may
/// exceed `data.len()`; callers are expected to check that themselves.
fn escape_seq_len(data: &[u8]) -> usize {
    match data {
        [] | [_] => 0,
        [0x24, 0x28 | 0x29, ..] => 3,
        _ => 2,
    }
}
173
174impl DicomCharsetDecoder {
175 pub fn new(specific_charset: &str) -> DcmResult<Self> {
179 let terms: Vec<&str> = specific_charset.split('\\').collect();
180 let default_term = terms.first().copied().unwrap_or("").trim().to_string();
181 let default_encoding = encoding_for_term(&default_term)?;
182 let default_scan_mode = scan_mode_for_term(&default_term);
183
184 let mut extensions = Vec::new();
185 let mut has_extensions = false;
186 for term in terms.iter().skip(1) {
187 let trimmed = term.trim();
188 if !trimmed.is_empty() {
189 let enc = encoding_for_term(trimmed)?;
190 extensions.push((trimmed.to_string(), enc));
191 has_extensions = true;
192 }
193 }
194
195 if has_extensions {
198 let first_term = terms.first().copied().unwrap_or("").trim();
199 if !first_term.is_empty() {
200 extensions.push((first_term.to_string(), default_encoding));
201 }
202 extensions.push(("ISO 2022 IR 6".to_string(), encoding_rs::WINDOWS_1252));
204 }
205
206 Ok(Self {
207 default_encoding,
208 default_term,
209 default_scan_mode,
210 extensions,
211 has_extensions,
212 })
213 }
214
215 pub fn single(encoding: &'static Encoding) -> Self {
217 Self {
218 default_encoding: encoding,
219 default_term: String::new(),
220 default_scan_mode: ScanMode::SingleByte,
221 extensions: Vec::new(),
222 has_extensions: false,
223 }
224 }
225
226 pub fn default_ascii() -> Self {
228 Self {
229 default_encoding: encoding_rs::WINDOWS_1252,
230 default_term: String::new(),
231 default_scan_mode: ScanMode::SingleByte,
232 extensions: Vec::new(),
233 has_extensions: false,
234 }
235 }
236
237 pub fn default_encoding(&self) -> &'static Encoding {
239 self.default_encoding
240 }
241
242 pub fn decode(&self, bytes: &[u8]) -> DcmResult<String> {
248 if bytes.is_empty() {
249 return Ok(String::new());
250 }
251
252 if self.default_encoding == encoding_rs::UTF_8 && !self.has_extensions {
254 return match std::str::from_utf8(bytes) {
255 Ok(s) => Ok(s.to_string()),
256 Err(_) => Ok(String::from_utf8_lossy(bytes).into_owned()),
257 };
258 }
259
260 if !self.has_extensions {
262 return self.decode_with(bytes, self.default_encoding);
263 }
264
265 self.decode_with_extensions(bytes)
268 }
269
270 pub fn encode(&self, s: &str) -> DcmResult<Vec<u8>> {
272 if self.default_encoding == encoding_rs::UTF_8 {
273 return Ok(s.as_bytes().to_vec());
274 }
275 let (encoded, _, had_errors) = self.default_encoding.encode(s);
276 if had_errors {
277 return Err(DcmError::CharsetError {
278 reason: "character encoding error".into(),
279 });
280 }
281 Ok(encoded.into_owned())
282 }
283
284 fn decode_with(&self, bytes: &[u8], encoding: &'static Encoding) -> DcmResult<String> {
287 let (decoded, _, had_errors) = encoding.decode(bytes);
288 if had_errors {
289 let (lossy, _, _) = encoding.decode(bytes);
292 return Ok(lossy.into_owned());
293 }
294 Ok(decoded.into_owned())
295 }
296
297 fn decode_with_extensions(&self, bytes: &[u8]) -> DcmResult<String> {
298 let mut result = String::new();
299 let mut current_term = self.default_term.as_str();
300 let mut current_encoding = self.default_encoding;
301 let mut current_scan_mode = self.default_scan_mode;
302 let mut segment_start = 0;
303 let mut pos = 0;
304
305 while pos < bytes.len() {
306 let b = bytes[pos];
307
308 if b == 0x1B {
310 if pos > segment_start {
312 let segment = &bytes[segment_start..pos];
313 result.push_str(&self.decode_segment(
314 segment,
315 current_term,
316 current_encoding,
317 )?);
318 }
319
320 let remaining = &bytes[pos + 1..];
322 let esc_len = escape_seq_len(remaining);
323
324 if esc_len > 0 && remaining.len() >= esc_len {
325 if let Some(term) = escape_to_term(&remaining[..esc_len]) {
326 current_term = term;
328 current_encoding = self.find_encoding(term);
329 current_scan_mode = scan_mode_for_term(term);
330 }
331 pos += 1 + esc_len; } else {
333 pos += 1;
335 }
336 segment_start = pos;
337 continue;
338 }
339
340 if b == 0x0D || b == 0x0A || b == 0x0C || b == 0x09 {
343 if pos > segment_start {
345 let segment = &bytes[segment_start..pos];
346 result.push_str(&self.decode_segment(
347 segment,
348 current_term,
349 current_encoding,
350 )?);
351 }
352 result.push(b as char);
353 current_term = self.default_term.as_str();
354 current_encoding = self.default_encoding;
355 current_scan_mode = self.default_scan_mode;
356 pos += 1;
357 segment_start = pos;
358 continue;
359 }
360
361 if let Some(skip) = current_scan_mode.skip_bytes(b, pos, bytes.len()) {
362 pos += skip;
363 }
364 pos += 1;
365 }
366
367 if segment_start < bytes.len() {
369 let segment = &bytes[segment_start..];
370 result.push_str(&self.decode_segment(segment, current_term, current_encoding)?);
371 }
372
373 Ok(result)
374 }
375
376 fn decode_segment(
377 &self,
378 bytes: &[u8],
379 term: &str,
380 encoding: &'static Encoding,
381 ) -> DcmResult<String> {
382 if bytes.is_empty() {
383 return Ok(String::new());
384 }
385 let wrapped;
386 let bytes = if let Some(segment) = wrap_iso2022_segment(term, bytes) {
387 wrapped = segment;
388 wrapped.as_slice()
389 } else {
390 bytes
391 };
392 let (decoded, _, had_errors) = encoding.decode(bytes);
393 if had_errors && matches!(term, "ISO 2022 IR 87" | "ISO 2022 IR 159") {
394 return Err(DcmError::CharsetError {
395 reason: format!("decoding error using charset '{term}'"),
396 });
397 }
398 Ok(decoded.into_owned())
399 }
400
401 fn find_encoding(&self, term: &str) -> &'static Encoding {
402 for (t, enc) in &self.extensions {
403 if t == term {
404 return enc;
405 }
406 }
407 encoding_for_term(term).unwrap_or(self.default_encoding)
409 }
410}
411
412impl ScanMode {
413 fn skip_bytes(self, first_byte: u8, pos: usize, len: usize) -> Option<usize> {
414 match self {
415 ScanMode::SingleByte => None,
416 ScanMode::FixedWidth(width) if width > 1 && pos + width - 1 < len => Some(width - 1),
417 ScanMode::HighBitLead(width)
418 if width > 1 && (first_byte & 0x80) != 0 && pos + width - 1 < len =>
419 {
420 Some(width - 1)
421 }
422 _ => None,
423 }
424 }
425}
426
427fn scan_mode_for_term(term: &str) -> ScanMode {
428 match term {
429 "ISO 2022 IR 87" | "ISO 2022 IR 159" | "ISO 2022 IR 58" => ScanMode::FixedWidth(2),
430 "ISO 2022 IR 149" => ScanMode::HighBitLead(2),
431 _ => ScanMode::SingleByte,
432 }
433}
434
/// Surrounds a bare segment with the escape sequences for its character set.
///
/// For `ISO 2022 IR 87` the segment is prefixed with `ESC $ B`, and for
/// `ISO 2022 IR 159` with `ESC $ ( D`. In both cases `ESC ( B` is appended.
/// Any other term yields `None`, meaning the segment needs no wrapping.
fn wrap_iso2022_segment(term: &str, bytes: &[u8]) -> Option<Vec<u8>> {
    const TRAILER: [u8; 3] = [0x1B, 0x28, 0x42];

    let designation: &[u8] = match term {
        "ISO 2022 IR 87" => &[0x1B, 0x24, 0x42],
        "ISO 2022 IR 159" => &[0x1B, 0x24, 0x28, 0x44],
        _ => return None,
    };

    Some([designation, bytes, &TRAILER[..]].concat())
}
448
#[cfg(test)]
mod tests {
    use super::*;

    /// The empty term and both spellings of IR 6 are accepted.
    #[test]
    fn default_charset() {
        for term in ["", "ISO_IR 6", "ISO 2022 IR 6"] {
            assert!(encoding_for_term(term).is_ok());
        }
    }

    #[test]
    fn utf8_charset() {
        assert_eq!(
            encoding_for_term("ISO_IR 192").unwrap(),
            encoding_rs::UTF_8
        );
    }

    #[test]
    fn latin1_maps_to_windows1252() {
        assert_eq!(
            encoding_for_term("ISO_IR 100").unwrap(),
            encoding_rs::WINDOWS_1252
        );
    }

    #[test]
    fn latin9_supported() {
        assert_eq!(
            encoding_for_term("ISO_IR 203").unwrap(),
            encoding_rs::ISO_8859_15
        );
    }

    #[test]
    fn unknown_charset() {
        assert!(encoding_for_term("UNKNOWN_CHARSET").is_err());
    }

    #[test]
    fn decode_ascii() {
        assert_eq!(decode_string(b"Hello", "").unwrap(), "Hello");
    }

    #[test]
    fn decode_utf8() {
        let text = "日本語";
        assert_eq!(decode_string(text.as_bytes(), "ISO_IR 192").unwrap(), text);
    }

    #[test]
    fn decode_latin1_umlaut() {
        let decoded = decode_string(b"M\xFCller", "ISO_IR 100").unwrap();
        assert_eq!(decoded, "Müller");
    }

    #[test]
    fn decode_latin2() {
        let decoded = decode_string(b"\xA3\xF3d\xBC", "ISO_IR 101").unwrap();
        assert_eq!(decoded, "Łódź");
    }

    #[test]
    fn decode_cyrillic() {
        let decoded = decode_string(b"\xB8\xD2\xD0\xDD\xDE\xD2", "ISO_IR 144").unwrap();
        assert_eq!(decoded, "Иванов");
    }

    #[test]
    fn encode_roundtrip_latin1() {
        let name = "Müller^Hans";
        let wire = encode_string(name, "ISO_IR 100").unwrap();
        assert_eq!(decode_string(&wire, "ISO_IR 100").unwrap(), name);
    }

    #[test]
    fn multi_charset_decoder_single() {
        let decoder = DicomCharsetDecoder::new("ISO_IR 100").unwrap();
        assert_eq!(decoder.decode(b"M\xFCller").unwrap(), "Müller");
    }

    #[test]
    fn multi_charset_decoder_utf8() {
        let decoder = DicomCharsetDecoder::new("ISO_IR 192").unwrap();
        let name = "田中太郎";
        assert_eq!(decoder.decode(name.as_bytes()).unwrap(), name);
    }

    #[test]
    fn escape_to_term_known_sequences() {
        let cases: [(&[u8], &str); 6] = [
            (&[0x28, 0x42], "ISO 2022 IR 6"),
            (&[0x2D, 0x41], "ISO 2022 IR 100"),
            (&[0x24, 0x42], "ISO 2022 IR 87"),
            (&[0x24, 0x28, 0x44], "ISO 2022 IR 159"),
            (&[0x24, 0x29, 0x43], "ISO 2022 IR 149"),
            (&[0x24, 0x29, 0x41], "ISO 2022 IR 58"),
        ];
        for (sequence, expected) in cases {
            assert_eq!(escape_to_term(sequence), Some(expected));
        }
    }

    #[test]
    fn escape_to_term_unknown() {
        assert_eq!(escape_to_term(&[0x99, 0x99]), None);
    }

    /// ASCII text, then an `ESC $ B` segment, then `ESC ( B` at the end.
    #[test]
    fn decoder_with_iso2022_japanese() {
        let decoder = DicomCharsetDecoder::new("\\ISO 2022 IR 87").unwrap();
        let wire = b"Yamada^\x1B\x24\x42\x46\x7C\x4B\x5C\x1B\x28\x42";
        assert_eq!(decoder.decode(wire).unwrap(), "Yamada^日本");
    }

    #[test]
    fn decoder_encode_roundtrip() {
        let decoder = DicomCharsetDecoder::new("ISO_IR 100").unwrap();
        let text = "Schöne Grüße";
        let wire = decoder.encode(text).unwrap();
        assert_eq!(decoder.decode(&wire).unwrap(), text);
    }
}
579}