xim_ctext/
lib.rs

1//! A parser for the compound text encoding used by the X Input Method protocol.
2//!
3//! This is intended to be used as a building block for higher level libraries. See the [`xim`] crate for an example.
4//!
//! [`xim`]: https://crates.io/crates/xim
6
7#![no_std]
8#![allow(clippy::uninlined_format_args)]
9#![forbid(unsafe_code, future_incompatible)]
10
11extern crate alloc;
12
13#[cfg(feature = "std")]
14extern crate std;
15
16use alloc::string::String;
17use alloc::vec::Vec;
18use core::fmt;
19
20#[cfg(feature = "std")]
21use std::io::{self, Write};
22
/// Escape sequence `ESC % G` that opens a UTF-8 segment.
const UTF8_START: &[u8] = b"\x1b%G";
/// Escape sequence `ESC % @` that closes a UTF-8 segment.
const UTF8_END: &[u8] = b"\x1b%@";
25
26/// Wrapper for reduce allocation
27#[derive(Clone, Copy)]
28#[repr(transparent)]
29pub struct CText<'s> {
30    utf8: &'s str,
31}
32
33impl<'s> fmt::Debug for CText<'s> {
34    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
35        f.write_str(self.utf8)
36    }
37}
38
39impl<'s> fmt::Display for CText<'s> {
40    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
41        f.write_str(self.utf8)
42    }
43}
44
45impl<'s> CText<'s> {
46    pub const fn new(utf8: &'s str) -> Self {
47        Self { utf8 }
48    }
49
50    pub const fn len(self) -> usize {
51        self.utf8.len() + UTF8_START.len() + UTF8_END.len()
52    }
53
54    pub const fn is_empty(self) -> bool {
55        self.utf8.is_empty()
56    }
57
58    #[cfg(feature = "std")]
59    pub fn write(self, mut out: impl Write) -> io::Result<usize> {
60        let mut writed = 0;
61        writed += out.write(UTF8_START)?;
62        writed += out.write(self.utf8.as_bytes())?;
63        writed += out.write(UTF8_END)?;
64        Ok(writed)
65    }
66}
67
68/// Encoding utf8 to COMPOUND_TEXT with utf8 escape
69pub fn utf8_to_compound_text(text: &str) -> Vec<u8> {
70    let mut ret = Vec::with_capacity(text.len() + 6);
71    ret.extend_from_slice(UTF8_START);
72    ret.extend_from_slice(text.as_bytes());
73    ret.extend_from_slice(UTF8_END);
74    ret
75}
76
/// Failure modes of [`compound_text_to_utf8`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DecodeError {
    /// A multi-byte designation (`ESC $ (`) was not followed by a final byte
    /// the decoder knows.
    InvalidEncoding,
    /// The input uses an encoding that is not supported.
    ///
    /// The decoder in this file does not currently produce this variant.
    UnsupportedEncoding,
    /// A UTF-8 segment contained bytes that are not valid UTF-8.
    Utf8Error(alloc::string::FromUtf8Error),
}

impl From<alloc::string::FromUtf8Error> for DecodeError {
    fn from(err: alloc::string::FromUtf8Error) -> Self {
        Self::Utf8Error(err)
    }
}

impl fmt::Display for DecodeError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::InvalidEncoding => f.write_str("Invalid compound text"),
            Self::UnsupportedEncoding => f.write_str("This encoding is not supported yet"),
            Self::Utf8Error(e) => write!(f, "Not a valid utf8 {}", e),
        }
    }
}

// Lets callers box the error or use it with `?` in functions returning
// `Box<dyn Error>`. The inner UTF-8 error is already part of the `Display`
// output, so it is deliberately not repeated through `source()`.
#[cfg(feature = "std")]
impl std::error::Error for DecodeError {}
99
/// Feeds `$bytes` through `$decoder`, appending the decoded text to `$out`.
///
/// `$out` is passed straight to `decode_to_string`, and `$last` is forwarded as
/// that call's final argument. Whenever the decoder reports that the output
/// is full, `$out` is grown and decoding resumes at the first byte that has
/// not been consumed yet.
macro_rules! decode {
    ($decoder:expr, $out:expr, $bytes:expr, $last:expr) => {
        let mut _pending: &[u8] = $bytes;
        loop {
            let (status, consumed, _) = $decoder.decode_to_string(_pending, $out, $last);

            if let encoding_rs::CoderResult::InputEmpty = status {
                break;
            }

            // Output full: reserve room sized for the whole input, then
            // continue with whatever the decoder has not read yet.
            $out.reserve(
                $decoder
                    .max_utf8_buffer_length($bytes.len())
                    .unwrap_or_default(),
            );
            _pending = &_pending[consumed..];
        }
    };
}
120
121pub fn compound_text_to_utf8(bytes: &[u8]) -> Result<String, DecodeError> {
122    let split = bytes.split(|&b| b == 0x1b);
123
124    let mut result = String::new();
125
126    for chunk in split {
127        let mut iter = chunk.iter();
128        match (iter.next(), iter.next()) {
129            // UTF-8
130            (Some(0x25), Some(0x47)) => {
131                let left = iter.as_slice().to_vec();
132                match String::from_utf8(left) {
133                    Ok(out) => result.push_str(&out),
134                    Err(e) => return Err(DecodeError::from(e)),
135                };
136            }
137            // UTF-8 End
138            (Some(0x25), Some(0x40)) => {}
139            // 94N
140            (Some(0x24), Some(0x28)) => match iter.next() {
141                // JP
142                Some(0x42) => {
143                    let left = iter.as_slice();
144                    let mut decoder = encoding_rs::ISO_2022_JP.new_decoder_without_bom_handling();
145                    let mut out = String::new();
146                    decode!(decoder, &mut out, &[0x1B, 0x24, 0x42], false);
147                    decode!(decoder, &mut out, &left, true);
148
149                    result.push_str(&out);
150                }
151
152                // CN (GB2312)
153                Some(0x41) => {
154                    let left: Vec<u8> = iter.map(|&b| b + 0x80).collect();
155                    let (out, _) = encoding_rs::GBK.decode_without_bom_handling(&left);
156                    result.push_str(&out);
157                }
158
159                // KR (KS C 5601)
160                Some(0x43) => {
161                    let left: Vec<u8> = iter.map(|&b| b + 0x80).collect();
162                    let (out, _) = encoding_rs::EUC_KR.decode_with_bom_removal(&left);
163                    result.push_str(&out);
164                }
165                // Invalid encode
166                _ => return Err(DecodeError::InvalidEncoding),
167            },
168            // ISO-8859-1
169            (Some(0x2d), Some(0x41)) => {
170                let left = iter.as_slice();
171                let out = encoding_rs::mem::decode_latin1(left);
172                result.push_str(&out);
173            }
174            // ISO-8859-2
175            (Some(0x2d), Some(0x42)) => {
176                let left = iter.as_slice();
177                let (out, _) = encoding_rs::ISO_8859_2.decode_without_bom_handling(left);
178                result.push_str(&out);
179            }
180            // ISO-8859-3
181            (Some(0x2d), Some(0x43)) => {
182                let left = iter.as_slice();
183                let (out, _) = encoding_rs::ISO_8859_3.decode_without_bom_handling(left);
184                result.push_str(&out);
185            }
186            // ISO-8859-4
187            (Some(0x2d), Some(0x44)) => {
188                let left = iter.as_slice();
189                let (out, _) = encoding_rs::ISO_8859_4.decode_without_bom_handling(left);
190                result.push_str(&out);
191            }
192            // ISO-8859-7
193            (Some(0x2d), Some(0x46)) => {
194                let left = iter.as_slice();
195                let (out, _) = encoding_rs::ISO_8859_7.decode_without_bom_handling(left);
196                result.push_str(&out);
197            }
198            // ISO-8859-6
199            (Some(0x2d), Some(0x47)) => {
200                let left = iter.as_slice();
201                let (out, _) = encoding_rs::ISO_8859_6.decode_without_bom_handling(left);
202                result.push_str(&out);
203            }
204            // ISO-8859-8
205            (Some(0x2d), Some(0x48)) => {
206                let left = iter.as_slice();
207                let (out, _) = encoding_rs::ISO_8859_8.decode_without_bom_handling(left);
208                result.push_str(&out);
209            }
210            // ISO-8859-5
211            (Some(0x2d), Some(0x4c)) => {
212                let left = iter.as_slice();
213                let (out, _) = encoding_rs::ISO_8859_5.decode_without_bom_handling(left);
214                result.push_str(&out);
215            }
216            // ISO-8859-9
217            (Some(0x2d), Some(0x4d)) => {
218                let left = iter.as_slice();
219                let (out, _) = encoding_rs::WINDOWS_1254.decode_without_bom_handling(left);
220                result.push_str(&out);
221            }
222            // ISO-8859-10
223            (Some(0x2d), Some(0x56)) => {
224                let left = iter.as_slice();
225                let (out, _) = encoding_rs::ISO_8859_10.decode_without_bom_handling(left);
226                result.push_str(&out);
227            }
228            // ISO-8859-13
229            (Some(0x2d), Some(0x59)) => {
230                let left = iter.as_slice();
231                let (out, _) = encoding_rs::ISO_8859_13.decode_without_bom_handling(left);
232                result.push_str(&out);
233            }
234            // ISO-8859-14
235            (Some(0x2d), Some(0x5f)) => {
236                let left = iter.as_slice();
237                let (out, _) = encoding_rs::ISO_8859_14.decode_without_bom_handling(left);
238                result.push_str(&out);
239            }
240            // ISO-8859-15
241            (Some(0x2d), Some(0x62)) => {
242                let left = iter.as_slice();
243                let (out, _) = encoding_rs::ISO_8859_15.decode_without_bom_handling(left);
244                result.push_str(&out);
245            }
246            // ISO-8859-16
247            (Some(0x2d), Some(0x66)) => {
248                let left = iter.as_slice();
249                let (out, _) = encoding_rs::ISO_8859_16.decode_without_bom_handling(left);
250                result.push_str(&out);
251            }
252            // defaults to ISO-8859-1
253            _ => {
254                let out = encoding_rs::mem::decode_latin1(chunk);
255                result.push_str(&out);
256            }
257        };
258    }
259    Ok(result)
260}
261
#[cfg(test)]
mod tests {
    /// Asserts that decoding `encoded` succeeds and yields exactly `expected`.
    #[track_caller]
    fn assert_decodes(encoded: &[u8], expected: &str) {
        assert_eq!(crate::compound_text_to_utf8(encoded).unwrap(), expected);
    }

    /// UTF-8 segment: checked in both directions.
    #[test]
    fn korean() {
        let text = "가나다";
        let encoded: &[u8] = &[
            27, 37, 71, 234, 176, 128, 235, 130, 152, 235, 139, 164, 27, 37, 64,
        ];
        assert_eq!(crate::utf8_to_compound_text(text), encoded);
        assert_decodes(encoded, text);
    }

    /// `ESC $ ( B` segment.
    #[test]
    fn iso_2022_jp() {
        assert_decodes(&[27, 36, 40, 66, 69, 108, 53, 126], "東京");
    }

    /// `ESC $ ( B` segment followed by `ESC ( B`; the expected text ends in
    /// `(B` because the decoder passes that second sequence through as text.
    #[test]
    fn iso_2022_jp_long() {
        assert_decodes(
            &[
                27, 36, 40, 66, 67, 78, 36, 67, 36, 70, 36, 107, 36, 68, 36, 98, 36, 106, 27, 40,
                66,
            ],
            "知ってるつもり(B",
        );
    }

    /// `ESC $ ( A` segment.
    #[test]
    fn gb2312_cn() {
        assert_decodes(
            &[
                0x1b, 0x24, 0x28, 0x41, 0x3a, 0x5c, 0x38, 0x5f, 0x50, 0x4b, 0x48, 0x4f, 0x4a,
                0x36, 0x44, 0x63,
            ],
            "很高兴认识你",
        );
    }

    /// An `ESC $ ( B` segment followed by an `ESC $ ( A` segment.
    #[test]
    fn gb2312_cn_mixed() {
        assert_decodes(
            &[
                0x1b, 0x24, 0x28, 0x42, 0x5f, 0x5a, 0x53, 0x28, 0x1b, 0x24, 0x28, 0x41, 0x44,
                0x63,
            ],
            "炸哦你",
        );
    }

    /// `ESC $ ( C` segment.
    #[test]
    fn ks_c_5601() {
        assert_decodes(
            &[
                0x1b, 0x24, 0x28, 0x43, 0x33, 0x4d, 0x43, 0x56, 0x30, 0x6d, 0x3e, 0x5f,
            ],
            "넌최고야",
        );
    }

    /// `ESC - A` segment.
    #[test]
    fn iso_8859_1() {
        assert_decodes(
            &[0x1b, 0x2d, 0x41, 0xa1, 0xb8, 0xc0, 0xd1, 0xe2, 0xf3],
            "¡¸ÀÑâó",
        );
    }

    /// `ESC - B` segment.
    #[test]
    fn iso_8859_2() {
        assert_decodes(
            &[0x1b, 0x2d, 0x42, 0xa1, 0xa3, 0xa5, 0xa6, 0xa9, 0xab],
            "ĄŁĽŚŠŤ",
        );
    }
}