marc_record/
marc8.rs

1//! MARC-8 support for MARC records
2
3use core::str;
4
5use unicode_normalization::char::compose;
6
7use crate::{DecodeError, TextDecoder};
8
/// A MARC-8 decoder for Latin-script text.
///
/// Only the basic (ASCII) set, the extended Latin set and the combining
/// diacritics are handled; the decoder itself carries no state.
///
/// Information about decoding MARC-8 is scarce, but these sources were used:
///
/// - <https://en.wikipedia.org/wiki/MARC-8>
/// - <https://en.wikipedia.org/wiki/ANSEL>
#[derive(Debug, Clone, Copy, Default)]
pub struct Marc8Decoder {}
17
18impl TextDecoder for Marc8Decoder {
19    /// Tries to decode and transform text into valid UTF-8
20    ///
21    /// If the text is entirely ASCII, the result will simply be a reference to the original string
22    fn decode<'a>(&self, text: &'a [u8]) -> Result<std::borrow::Cow<'a, str>, DecodeError> {
23        // Check if all we received in practice is ASCII, which means that we can just the array as is.
24        // This could be optimized using SIMD
25        if text.iter().all(|ch| *ch <= 127) {
26            return Ok(std::borrow::Cow::Borrowed(
27                str::from_utf8(text).map_err(|e| DecodeError::Utf(e))?,
28            ));
29        }
30
31        let mut out = String::with_capacity(text.len());
32        let mut it = text.into_iter();
33
34        let mut combining_buffer: Vec<char> = Vec::new();
35        while let Some(ch) = it.next() {
36            // The basic set is the same as ASCII and thus uses the same unicode code points
37            if *ch <= 127 {
38                if combining_buffer.is_empty() {
39                    out.push(char::from_u32(*ch as u32).unwrap());
40                    continue;
41                }
42
43                let mut base =
44                    char::from_u32(*ch as u32).expect("we already checked it was below 128");
45                // we had one or more combining characters stacked, consume them and try to form a valid unicode sequence
46
47                let mut combining_it = combining_buffer.iter();
48                while let Some(combining) = combining_it.next() {
49                    if let Some(new) = compose(base, *combining) {
50                        base = new;
51                    } else {
52                        return Err(DecodeError::InvalidPair(base, *combining));
53                    }
54                }
55
56                out.push(base);
57                combining_buffer.clear();
58                continue;
59            }
60
61            if *ch >= 0xA1 && *ch <= 0xC8 {
62                match ch {
63                    0xA1 => out.push('\u{0141}'),
64                    0xA2 => out.push('\u{00D8}'),
65                    0xA3 => out.push('\u{0110}'),
66                    0xA4 => out.push('\u{00DE}'),
67                    0xA5 => out.push('\u{00C6}'),
68                    0xA6 => out.push('\u{0152}'),
69                    0xA7 => out.push('\u{02B9}'),
70                    0xA8 => out.push('\u{00B7}'),
71                    0xA9 => out.push('\u{266D}'),
72                    0xAA => out.push('\u{00AE}'),
73                    0xAB => out.push('\u{00B1}'),
74                    0xAC => out.push('\u{01A0}'),
75                    0xAD => out.push('\u{01AF}'),
76                    0xAE => out.push('\u{02BC}'),
77                    0xB0 => out.push('\u{02BB}'),
78                    0xB1 => out.push('\u{0142}'),
79                    0xB2 => out.push('\u{00F8}'),
80                    0xB3 => out.push('\u{0111}'),
81                    0xB4 => out.push('\u{00FE}'),
82                    0xB5 => out.push('\u{00E6}'),
83                    0xB6 => out.push('\u{0153}'),
84                    0xB7 => out.push('\u{02BA}'),
85                    0xB8 => out.push('\u{0131}'),
86                    0xB9 => out.push('\u{00A3}'),
87                    0xBA => out.push('\u{00F0}'),
88                    0xBC => out.push('\u{01A1}'),
89                    0xBD => out.push('\u{01B0}'),
90                    0xC0 => out.push('\u{00B0}'),
91                    0xC1 => out.push('\u{2113}'),
92                    0xC2 => out.push('\u{2117}'),
93                    0xC3 => out.push('\u{00A9}'),
94                    0xC4 => out.push('\u{266F}'),
95                    0xC5 => out.push('\u{00BF}'),
96                    0xC6 => out.push('\u{00A1}'),
97                    0xC7 => out.push('\u{00DF}'), // Not clear if it's supped to be lowercase of uppercase, I put lower
98                    0xC8 => out.push('\u{20AC}'), // Euro sign
99                    _ => return Err(DecodeError::Unknown(*ch)),
100                };
101                continue;
102            }
103
104            // Combinining characters
105            if *ch >= 0xE0 && *ch <= 0xFE {
106                // let Some((_, n)) = it.next() else {
107                //     return Err(DecodeError::Unknown(*ch));
108                // };
109
110                // let Some(base_char) = char::from_u32(*n as u32) else {
111                //     return Err(DecodeError::Unknown(*ch));
112                // };
113
114                let combining = match ch {
115                    0xE0 => '\u{0309}',
116                    0xE1 => '\u{0300}',
117                    0xE2 => '\u{0301}',
118                    0xE3 => '\u{0302}',
119                    0xE4 => '\u{0303}',
120                    0xE5 => '\u{0304}',
121                    0xE6 => '\u{0306}',
122                    0xE7 => '\u{0307}',
123                    0xE8 => '\u{0308}',
124                    0xE9 => '\u{030C}',
125                    0xEA => '\u{030A}',
126                    0xEB => '\u{FE20}',
127                    0xEC => '\u{FE21}',
128                    0xED => '\u{0315}',
129                    0xEE => '\u{030B}',
130                    0xEF => '\u{0310}',
131                    0xF0 => '\u{0327}',
132                    0xF1 => '\u{0328}',
133                    0xF2 => '\u{0323}',
134                    0xF3 => '\u{0324}',
135                    0xF4 => '\u{0325}',
136                    0xF5 => '\u{0333}',
137                    0xF6 => '\u{0332}',
138                    0xF7 => '\u{0326}',
139                    0xF8 => '\u{031C}',
140                    0xF9 => '\u{032E}',
141                    0xFA => '\u{FE22}',
142                    0xFB => '\u{FE23}',
143                    0xFE => '\u{0313}',
144                    _ => return Err(DecodeError::Unknown(*ch)),
145                };
146
147                combining_buffer.push(combining);
148
149                continue;
150            }
151
152            return Err(DecodeError::Unknown(*ch));
153        }
154
155        // If we're at the end of the string and we were working on a combining sequence, something went wrong
156        if !combining_buffer.is_empty() {
157            return Err(DecodeError::InvalidSequence);
158        }
159
160        return Ok(std::borrow::Cow::Owned(out));
161    }
162}