marc_rs/
encoding.rs

1use encoding_rs::Encoding as InnerEncoding;
2use lazy_static::lazy_static;
3use std::collections::HashMap;
4
5use unicode_normalization::UnicodeNormalization;
6
7#[derive(Clone, Copy, Debug)]
8pub enum Encoding {
9    Utf8,
10    Marc8,
11    Iso5426,
12    Other(&'static InnerEncoding),
13}
14
15impl Encoding {
16    pub fn decode<'a>(&self, bytes: &'a [u8]) -> Result<std::borrow::Cow<'a, str>, crate::MarcError> {
17        match self {
18            Encoding::Utf8 => std::str::from_utf8(bytes).map(std::borrow::Cow::Borrowed).map_err(|_| crate::MarcError::Encoding),
19            Encoding::Marc8 => {
20                // TODO: proper MARC-8 support; for now treat as ISO-8859-1 fallback.
21                let (cow, _, had_errors) = encoding_rs::WINDOWS_1252.decode(bytes);
22                if had_errors {
23                    Err(crate::MarcError::Encoding)
24                } else {
25                    Ok(cow)
26                }
27            }
28            Encoding::Iso5426 => decode_iso5426(bytes).map(std::borrow::Cow::Owned).map_err(|_| crate::MarcError::Encoding),
29
30            Encoding::Other(enc) => {
31                let (cow, _, had_errors) = enc.decode(bytes);
32                if had_errors {
33                    Err(crate::MarcError::Encoding)
34                } else {
35                    Ok(cow)
36                }
37            }
38        }
39    }
40
41    pub fn encode(&self, text: &str) -> Result<Vec<u8>, crate::MarcError> {
42        match self {
43            Encoding::Utf8 => Ok(text.as_bytes().to_vec()),
44            Encoding::Marc8 => {
45                // TODO: proper MARC-8 support; for now treat as Windows-1252.
46                let (cow, _, had_errors) = encoding_rs::WINDOWS_1252.encode(text);
47                if had_errors {
48                    Err(crate::MarcError::Encoding)
49                } else {
50                    Ok(cow.into_owned())
51                }
52            }
53            Encoding::Iso5426 => encode_iso5426(text).map_err(|_| crate::MarcError::Encoding),
54            Encoding::Other(enc) => {
55                let (cow, _, had_errors) = enc.encode(text);
56                if had_errors {
57                    Err(crate::MarcError::Encoding)
58                } else {
59                    Ok(cow.into_owned())
60                }
61            }
62        }
63    }
64}
65
66lazy_static! {
67    /// Mapping ISO 5426 (bytes > 0x7F) to Unicode code points.
68    /// Keys are raw bytes, values are Unicode codes.
69    pub static ref ISO5426_CORRECT: HashMap<u32, u32> = {
70        let mut m = HashMap::new();
71
72        // --- SPECIAL CHARACTERS (0xA1 - 0xBF) ---
73        m.insert(0xA1, 0x0141); // Ł
74        m.insert(0xA2, 0x00D8); // Ø
75        m.insert(0xA3, 0x0110); // Đ
76        m.insert(0xA4, 0x00DE); // Þ
77        m.insert(0xA5, 0x00C6); // Æ
78        m.insert(0xA6, 0x0152); // Œ
79        m.insert(0xA8, 0x00B7); // · (middle dot)
80        m.insert(0xB1, 0x0142); // ł
81        m.insert(0xB2, 0x00F8); // ø
82        m.insert(0xB3, 0x0111); // đ
83        m.insert(0xB4, 0x00FE); // þ
84        m.insert(0xB5, 0x00E6); // æ
85        m.insert(0xB6, 0x0153); // œ
86        m.insert(0xB8, 0x0131); // ı (dotless i)
87        m.insert(0xB9, 0x00A3); // £
88        m.insert(0xBA, 0x00F0); // ð
89
90        // --- SIMPLE COMBINING DIACRITICS (0xC1 - 0xCF) ---
91        // Note: In Unicode they must be placed AFTER the base letter
92        m.insert(0xC1, 0x0300); // Grave `
93        m.insert(0xC2, 0x0301); // Acute ´
94        m.insert(0xC3, 0x0302); // Circumflex ^
95        m.insert(0xC4, 0x0303); // Tilde ~
96        m.insert(0xC5, 0x0304); // Macron ¯
97        m.insert(0xC6, 0x0306); // Breve ˘
98        m.insert(0xC7, 0x0307); // Dot above ˙
99        m.insert(0xC8, 0x0308); // Diaeresis ¨
100        m.insert(0xC9, 0x030C); // Caron ˇ
101        m.insert(0xCA, 0x030A); // Ring above ˚
102        m.insert(0xCB, 0x0327); // Cedilla ¸
103        m.insert(0xCC, 0x0328); // Ogonek ̨
104        m.insert(0xCD, 0x0323); // Dot below ̣
105        m.insert(0xCE, 0x0324); // Diaeresis below ̤
106        m.insert(0xCF, 0x0313); // Comma above
107
108        // --- ADDITIONAL DIACRITICS (0xD0 - 0xDF) ---
109        m.insert(0xD0, 0x030B); // Double acute
110        m.insert(0xD1, 0x0332); // Low line (macron below)
111        m.insert(0xD2, 0x0325); // Ring below
112        m.insert(0xD6, 0x0326); // Comma below
113
114        // --- DOUBLE DIACRITICS (span two letters) ---
115        m.insert(0xE1, 0x0361); // Double inverted breve (t͡s)
116        m.insert(0xE2, 0x0360); // Double tilde
117
118        m
119    };
120
121    static ref UNICODE_TO_ISO5426: HashMap<u32, u8> = {
122        let mut m = HashMap::new();
123        // Inverse of ISO5426_CORRECT table
124        // Note: Only bytes > 0x7F are mapped here
125        for (iso_byte, unicode_cp) in ISO5426_CORRECT.iter() {
126            m.insert(*unicode_cp, *iso_byte as u8);
127        }
128        m
129    };
130}
131
132fn decode_iso5426(data: &[u8]) -> Result<String, String> {
133    let mut out_codes: Vec<u32> = Vec::with_capacity(data.len());
134    let mut i = 0;
135    while i < data.len() {
136        let b = data[i];
137
138        match b {
139            // ASCII standard (0x20-0x7E)
140            0x20..=0x7E => {
141                out_codes.push(b as u32);
142                i += 1;
143            }
144            // DETECT COMBINING ACCENTS (ISO-5426 combining range)
145            // In ISO-5426, the accent comes BEFORE the letter.
146            0xC1..=0xCF | 0xD0..=0xDF | 0xE1..=0xE8 => {
147                if let Some(&accent_unicode) = ISO5426_CORRECT.get(&(b as u32)) {
148                    if i + 1 < data.len() {
149                        let next_byte = data[i + 1];
150
151                        // 1. Push base letter first (Unicode order)
152                        let base_char = if next_byte > 0x7F {
153                            *ISO5426_CORRECT.get(&(next_byte as u32)).unwrap_or(&(next_byte as u32))
154                        } else {
155                            next_byte as u32
156                        };
157                        out_codes.push(base_char);
158
159                        // 2. Push accent after
160                        out_codes.push(accent_unicode);
161                        i += 2; // Consumed accent and letter
162                    } else {
163                        // Lone accent at end of string
164                        out_codes.push(accent_unicode);
165                        i += 1;
166                    }
167                } else {
168                    out_codes.push(b as u32);
169                    i += 1;
170                }
171            }
172            // G1 special characters (Æ, Œ, ł, etc.) non-combining
173            0xA1..=0xBF => {
174                let cp = *ISO5426_CORRECT.get(&(b as u32)).unwrap_or(&(b as u32));
175                out_codes.push(cp);
176                i += 1;
177            }
178            // Control characters (0x00-0x1F) and other
179            _ => {
180                if b == 0x09 || b == 0x0A || b == 0x0D {
181                    out_codes.push(b as u32);
182                }
183                i += 1;
184            }
185        }
186    }
187
188    // Convert to String and NFC normalization (crucial to merge base + accent)
189    let raw_string: String = out_codes.into_iter().filter_map(std::char::from_u32).collect();
190
191    Ok(raw_string.nfc().collect())
192}
193
194/// Encode UTF-8 string to ISO-5426 bytes
195
196fn encode_iso5426(text: &str) -> Result<Vec<u8>, String> {
197    let mut result = Vec::with_capacity(text.len());
198
199    // 1. Use NFD to separate accents from base letters (e.g. 'é' -> 'e' + '\u0301')
200    let nfd_text: Vec<char> = text.nfd().collect();
201    let mut i = 0;
202
203    while i < nfd_text.len() {
204        let ch = nfd_text[i];
205        let cp = ch as u32;
206
207        // 2. Check if next character is a combining diacritic
208        if i + 1 < nfd_text.len() {
209            let next_ch = nfd_text[i + 1];
210            let next_cp = next_ch as u32;
211
212            // If next is an accent handled by ISO-5426
213            if (0x0300..=0x036F).contains(&next_cp) {
214                if let Some(&accent_byte) = UNICODE_TO_ISO5426.get(&next_cp) {
215                    // ISO-5426 rule: write ACCENT first
216                    result.push(accent_byte);
217
218                    // Then write base letter
219                    if cp <= 0x7E {
220                        result.push(cp as u8);
221                    } else if let Some(&base_byte) = UNICODE_TO_ISO5426.get(&cp) {
222                        result.push(base_byte);
223                    } else {
224                        return Err(format!("Base character not supported: {}", ch));
225                    }
226
227                    i += 2; // Consumed letter and its accent
228                    continue;
229                }
230            }
231        }
232
233        // 3. Standalone characters (ASCII or special like Æ, Œ)
234        if cp <= 0x7E {
235            result.push(cp as u8);
236        } else if let Some(&byte) = UNICODE_TO_ISO5426.get(&cp) {
237            result.push(byte);
238        } else {
239            // Optional: replace with '?' or space instead of error
240            return Err(format!("Character not supported in ISO-5426: {}", ch));
241        }
242
243        i += 1;
244    }
245
246    Ok(result)
247}
marc_rs/encoding.rs

marc_rs/
encoding.rs