marc_rs/
encoding.rs

1use crate::format::Encoding as MarcEncoding;
2use encoding_rs::Encoding;
3
4/// Convert bytes from a specific encoding to UTF-8
5pub fn convert_to_utf8(data: &[u8], encoding: MarcEncoding) -> Result<String, String> {
6    if encoding == MarcEncoding::Iso5426 {
7        return decode_iso5426(data);
8    }
9
10    let enc = get_encoding(encoding);
11    let (cow, _, had_errors) = enc.decode(data);
12
13    if had_errors {
14        return Err("Encoding conversion had errors".to_string());
15    }
16
17    Ok(cow.to_string())
18}
19
20/// Convert UTF-8 string to a specific encoding
21pub fn convert_from_encoding(text: &str, encoding: MarcEncoding) -> Result<Vec<u8>, String> {
22    if encoding == MarcEncoding::Iso5426 {
23        return encode_iso5426(text);
24    }
25
26    let enc = get_encoding(encoding);
27    let (cow, _, had_errors) = enc.encode(text);
28
29    if had_errors {
30        return Err("Encoding conversion had errors".to_string());
31    }
32
33    Ok(cow.to_vec())
34}
35
36/// Get encoding_rs::Encoding for our Encoding enum
37fn get_encoding(encoding: MarcEncoding) -> &'static Encoding {
38    match encoding {
39        MarcEncoding::Utf8 => Encoding::for_label(b"utf-8").unwrap_or(encoding_rs::UTF_8),
40        MarcEncoding::Marc8 => {
41            // MARC-8 is a variant, use ISO-8859-1 as fallback
42            // In a full implementation, you'd need a MARC-8 specific decoder
43            Encoding::for_label(b"iso-8859-1").unwrap_or(encoding_rs::WINDOWS_1252)
44        }
45        MarcEncoding::Iso8859_1 => Encoding::for_label(b"iso-8859-1").unwrap_or(encoding_rs::WINDOWS_1252),
46        MarcEncoding::Iso8859_2 => Encoding::for_label(b"iso-8859-2").unwrap(),
47        MarcEncoding::Iso8859_5 => Encoding::for_label(b"iso-8859-5").unwrap(),
48        MarcEncoding::Iso8859_7 => Encoding::for_label(b"iso-8859-7").unwrap(),
49        MarcEncoding::Iso8859_15 => Encoding::for_label(b"iso-8859-15").unwrap(),
50        MarcEncoding::Iso5426 => {
51            // ISO-5426 is handled by custom functions decode_iso5426/encode_iso5426
52            // This should never be called, but kept for consistency
53            Encoding::for_label(b"iso-8859-1").unwrap_or(encoding_rs::WINDOWS_1252)
54        }
55    }
56}
57
58/// Decode ISO-5426 bytes to UTF-8 string
59/// ISO-5426 is compatible with ISO-8859-1 for most characters (0x20-0x7E, 0xA0-0xFF)
60/// Some special characters in the 0x80-0x9F range need special handling
61fn decode_iso5426(data: &[u8]) -> Result<String, String> {
62    let mut result = String::with_capacity(data.len());
63
64    for &byte in data {
65        match byte {
66            // ASCII printable characters (0x20-0x7E) - same as ISO-8859-1
67            0x20..=0x7E => {
68                result.push(byte as char);
69            }
70            // Control characters (0x00-0x1F) - keep as is or skip
71            0x00..=0x1F => {
72                // Skip control characters or convert to space
73                if byte == 0x09 || byte == 0x0A || byte == 0x0D {
74                    result.push(byte as char);
75                }
76            }
77            // DEL character (0x7F)
78            0x7F => {
79                // Skip or replace with space
80            }
81            // ISO-5426 special range (0x80-0x9F) - map to Unicode equivalents
82            0x80..=0x9F => {
83                if let Some(ch) = map_iso5426_special(byte) {
84                    result.push(ch);
85                } else {
86                    // Fallback: use replacement character
87                    result.push('\u{FFFD}');
88                }
89            }
90            // High range (0xA0-0xFF) - same as ISO-8859-1
91            0xA0..=0xFF => {
92                // Use ISO-8859-1 mapping for this range
93                let iso8859_1_enc = Encoding::for_label(b"iso-8859-1").unwrap();
94                let byte_array = [byte];
95                let (cow, _, _) = iso8859_1_enc.decode(&byte_array);
96                let decoded_str = cow.to_string();
97                result.push_str(&decoded_str);
98            }
99        }
100    }
101
102    Ok(result)
103}
104
105/// Encode UTF-8 string to ISO-5426 bytes
106fn encode_iso5426(text: &str) -> Result<Vec<u8>, String> {
107    let mut result = Vec::with_capacity(text.len());
108
109    for ch in text.chars() {
110        let code_point = ch as u32;
111
112        match code_point {
113            // ASCII printable (0x20-0x7E)
114            0x20..=0x7E => {
115                result.push(code_point as u8);
116            }
117            // Control characters
118            0x00..=0x1F => {
119                if code_point == 0x09 || code_point == 0x0A || code_point == 0x0D {
120                    result.push(code_point as u8);
121                }
122            }
123            // Try to map to ISO-5426 special range first
124            _ => {
125                if let Some(byte) = map_unicode_to_iso5426(ch) {
126                    result.push(byte);
127                } else {
128                    // Fallback: use ISO-8859-1 encoding
129                    let iso8859_1_enc = Encoding::for_label(b"iso-8859-1").unwrap();
130                    let ch_str = ch.to_string();
131                    let (cow, _, had_errors) = iso8859_1_enc.encode(&ch_str);
132                    let encoded_bytes = cow.to_vec();
133                    if had_errors || encoded_bytes.is_empty() {
134                        return Err(format!("Cannot encode character '{}' to ISO-5426", ch));
135                    }
136                    result.extend_from_slice(&encoded_bytes);
137                }
138            }
139        }
140    }
141
142    Ok(result)
143}
144
145/// Map ISO-5426 special characters (0x80-0x9F) to Unicode
146/// This is a partial mapping - a full implementation would include all 76 characters
147fn map_iso5426_special(byte: u8) -> Option<char> {
148    match byte {
149        // Common ISO-5426 characters mapped to Unicode
150        // This is a simplified mapping - extend as needed
151        0x80..=0x9F => {
152            // For now, use ISO-8859-1 as fallback for most characters
153            // A full implementation would have a complete mapping table
154            let iso8859_1_enc = Encoding::for_label(b"iso-8859-1").unwrap();
155            let byte_array = [byte];
156            let (cow, _, _) = iso8859_1_enc.decode(&byte_array);
157            let decoded_str = cow.to_string();
158            decoded_str.chars().next()
159        }
160        _ => None,
161    }
162}
163
164/// Map Unicode character to ISO-5426 byte
165fn map_unicode_to_iso5426(ch: char) -> Option<u8> {
166    // Simplified mapping - extend with full ISO-5426 table as needed
167    // For now, try ISO-8859-1 encoding first
168    let iso8859_1_enc = Encoding::for_label(b"iso-8859-1").unwrap();
169    let ch_str = ch.to_string();
170    let (cow, _, had_errors) = iso8859_1_enc.encode(&ch_str);
171    let encoded_bytes = cow.to_vec();
172    if !had_errors && encoded_bytes.len() == 1 {
173        Some(encoded_bytes[0])
174    } else {
175        None
176    }
177}