1use encoding_rs::Encoding as InnerEncoding;
2use lazy_static::lazy_static;
3use std::collections::HashMap;
4
5use unicode_normalization::UnicodeNormalization;
6
7#[derive(Clone, Copy, Debug)]
8pub enum Encoding {
9 Utf8,
10 Marc8,
11 Iso5426,
12 Other(&'static InnerEncoding),
13}
14
15impl Encoding {
16 pub fn decode<'a>(&self, bytes: &'a [u8]) -> Result<std::borrow::Cow<'a, str>, crate::MarcError> {
17 match self {
18 Encoding::Utf8 => std::str::from_utf8(bytes).map(std::borrow::Cow::Borrowed).map_err(|_| crate::MarcError::Encoding),
19 Encoding::Marc8 => {
20 let (cow, _, had_errors) = encoding_rs::WINDOWS_1252.decode(bytes);
22 if had_errors {
23 Err(crate::MarcError::Encoding)
24 } else {
25 Ok(cow)
26 }
27 }
28 Encoding::Iso5426 => decode_iso5426(bytes).map(std::borrow::Cow::Owned).map_err(|_| crate::MarcError::Encoding),
29
30 Encoding::Other(enc) => {
31 let (cow, _, had_errors) = enc.decode(bytes);
32 if had_errors {
33 Err(crate::MarcError::Encoding)
34 } else {
35 Ok(cow)
36 }
37 }
38 }
39 }
40
41 pub fn encode(&self, text: &str) -> Result<Vec<u8>, crate::MarcError> {
42 match self {
43 Encoding::Utf8 => Ok(text.as_bytes().to_vec()),
44 Encoding::Marc8 => {
45 let (cow, _, had_errors) = encoding_rs::WINDOWS_1252.encode(text);
47 if had_errors {
48 Err(crate::MarcError::Encoding)
49 } else {
50 Ok(cow.into_owned())
51 }
52 }
53 Encoding::Iso5426 => encode_iso5426(text).map_err(|_| crate::MarcError::Encoding),
54 Encoding::Other(enc) => {
55 let (cow, _, had_errors) = enc.encode(text);
56 if had_errors {
57 Err(crate::MarcError::Encoding)
58 } else {
59 Ok(cow.into_owned())
60 }
61 }
62 }
63 }
64}
65
66lazy_static! {
67 pub static ref ISO5426_CORRECT: HashMap<u32, u32> = {
70 let mut m = HashMap::new();
71
72 m.insert(0xA1, 0x0141); m.insert(0xA2, 0x00D8); m.insert(0xA3, 0x0110); m.insert(0xA4, 0x00DE); m.insert(0xA5, 0x00C6); m.insert(0xA6, 0x0152); m.insert(0xA8, 0x00B7); m.insert(0xB1, 0x0142); m.insert(0xB2, 0x00F8); m.insert(0xB3, 0x0111); m.insert(0xB4, 0x00FE); m.insert(0xB5, 0x00E6); m.insert(0xB6, 0x0153); m.insert(0xB8, 0x0131); m.insert(0xB9, 0x00A3); m.insert(0xBA, 0x00F0); m.insert(0xC1, 0x0300); m.insert(0xC2, 0x0301); m.insert(0xC3, 0x0302); m.insert(0xC4, 0x0303); m.insert(0xC5, 0x0304); m.insert(0xC6, 0x0306); m.insert(0xC7, 0x0307); m.insert(0xC8, 0x0308); m.insert(0xC9, 0x030C); m.insert(0xCA, 0x030A); m.insert(0xCB, 0x0327); m.insert(0xCC, 0x0328); m.insert(0xCD, 0x0323); m.insert(0xCE, 0x0324); m.insert(0xCF, 0x0313); m.insert(0xD0, 0x030B); m.insert(0xD1, 0x0332); m.insert(0xD2, 0x0325); m.insert(0xD6, 0x0326); m.insert(0xE1, 0x0361); m.insert(0xE2, 0x0360); m
119 };
120
121 static ref UNICODE_TO_ISO5426: HashMap<u32, u8> = {
122 let mut m = HashMap::new();
123 for (iso_byte, unicode_cp) in ISO5426_CORRECT.iter() {
126 m.insert(*unicode_cp, *iso_byte as u8);
127 }
128 m
129 };
130}
131
132fn decode_iso5426(data: &[u8]) -> Result<String, String> {
133 let mut out_codes: Vec<u32> = Vec::with_capacity(data.len());
134 let mut i = 0;
135 while i < data.len() {
136 let b = data[i];
137
138 match b {
139 0x20..=0x7E => {
141 out_codes.push(b as u32);
142 i += 1;
143 }
144 0xC1..=0xCF | 0xD0..=0xDF | 0xE1..=0xE8 => {
147 if let Some(&accent_unicode) = ISO5426_CORRECT.get(&(b as u32)) {
148 if i + 1 < data.len() {
149 let next_byte = data[i + 1];
150
151 let base_char = if next_byte > 0x7F {
153 *ISO5426_CORRECT.get(&(next_byte as u32)).unwrap_or(&(next_byte as u32))
154 } else {
155 next_byte as u32
156 };
157 out_codes.push(base_char);
158
159 out_codes.push(accent_unicode);
161 i += 2; } else {
163 out_codes.push(accent_unicode);
165 i += 1;
166 }
167 } else {
168 out_codes.push(b as u32);
169 i += 1;
170 }
171 }
172 0xA1..=0xBF => {
174 let cp = *ISO5426_CORRECT.get(&(b as u32)).unwrap_or(&(b as u32));
175 out_codes.push(cp);
176 i += 1;
177 }
178 _ => {
180 if b == 0x09 || b == 0x0A || b == 0x0D {
181 out_codes.push(b as u32);
182 }
183 i += 1;
184 }
185 }
186 }
187
188 let raw_string: String = out_codes.into_iter().filter_map(std::char::from_u32).collect();
190
191 Ok(raw_string.nfc().collect())
192}
193
194fn encode_iso5426(text: &str) -> Result<Vec<u8>, String> {
197 let mut result = Vec::with_capacity(text.len());
198
199 let nfd_text: Vec<char> = text.nfd().collect();
201 let mut i = 0;
202
203 while i < nfd_text.len() {
204 let ch = nfd_text[i];
205 let cp = ch as u32;
206
207 if i + 1 < nfd_text.len() {
209 let next_ch = nfd_text[i + 1];
210 let next_cp = next_ch as u32;
211
212 if (0x0300..=0x036F).contains(&next_cp) {
214 if let Some(&accent_byte) = UNICODE_TO_ISO5426.get(&next_cp) {
215 result.push(accent_byte);
217
218 if cp <= 0x7E {
220 result.push(cp as u8);
221 } else if let Some(&base_byte) = UNICODE_TO_ISO5426.get(&cp) {
222 result.push(base_byte);
223 } else {
224 return Err(format!("Base character not supported: {}", ch));
225 }
226
227 i += 2; continue;
229 }
230 }
231 }
232
233 if cp <= 0x7E {
235 result.push(cp as u8);
236 } else if let Some(&byte) = UNICODE_TO_ISO5426.get(&cp) {
237 result.push(byte);
238 } else {
239 return Err(format!("Character not supported in ISO-5426: {}", ch));
241 }
242
243 i += 1;
244 }
245
246 Ok(result)
247}