unicode_converter/
cesu_8.rs

/// The CESU-8 module handles the __Compatibility Encoding Scheme for
/// UTF-16: 8-Bit__.
3
4use crate::unicode_encoding::UnicodeEncodingError::*;
5use crate::unicode_encoding::UnicodeEncodingError;
6use crate::unicode_encoding::UnicodeEncoding;
7use crate::utf_32::Utf32;
8use crate::utf_16::*;
9use crate::utf_8::Utf8;
10
11/// A wrapper for CESU-8 encoded bytes
12pub struct Cesu8 {
13    /// As CESU-8 is made of UTF-8 data, it makes sense to reuse the UTF-8 type
14    /// here.
15    pub data: Utf8,
16}
17
18impl UnicodeEncoding for Cesu8 {
19    /// Convert UTF-32 data to CESU-8.
20    fn from_utf_32(data_utf32: &Utf32)-> Self {
21        let mut data: Vec<u8> = Vec::new();
22        for glyph in &data_utf32.data {
23            let new_bytes = utf_32_glyph_to_cesu_8(*glyph);
24            for byte in new_bytes {
25                data.push(byte);
26            }
27        }
28        return Cesu8{data: Utf8::from_bytes_no_check(&data, false).unwrap()};
29    }
30
31    /// Convert CESU-8 to UTF-32.
32    fn to_utf_32(&self) -> Utf32 {
33        let tmp_utf32 = self.data.to_utf_32();
34        let mut ret: Vec<u32> = Vec::new();
35        let mut loop_pointer = 0;
36        while loop_pointer < tmp_utf32.data.len() {
37            if loop_pointer < tmp_utf32.data.len() - 1 {
38                match compatible_codepoints(tmp_utf32.data[loop_pointer], tmp_utf32.data[loop_pointer+1]) {
39                    NoError => {
40                        ret.push(tmp_utf32.data[loop_pointer]);
41                        loop_pointer = loop_pointer + 1;
42                    },
43                    AmbiguousUnpairedSurrogates => {
44                        let surrogate_pair = vec![tmp_utf32.data[loop_pointer] as u16, tmp_utf32.data[loop_pointer+1] as u16];
45                        let utf16_bit = Utf16{data: surrogate_pair};
46                        ret.push(utf16_bit.to_utf_32().data[0]);
47                        loop_pointer = loop_pointer + 2;
48                    },
49                    x => {
50                        eprintln!("[UNICODE ENCODING ERROR] {:?}.", x);
51                        panic!("This should not have happen if the source was safely generated with from_string or from_bytes. This could happen if from_string_no_check was used. This need to be corrected from the library's user side.");
52                    }
53                }
54            } else {
55                ret.push(tmp_utf32.data[loop_pointer]);
56                loop_pointer = loop_pointer + 1;
57            }
58        }
59        return Utf32{data: ret};
60    }
61
62    /// Convert the instance of `Cesu8` to a vector of bytes, all the heavy
63    /// lifting in made in the UTF-8 module.
64    fn to_bytes(&self, big_endian: bool) -> Vec<u8> {
65        return self.data.to_bytes(big_endian);
66    }
67
68    /// Convert a stream of bytes encoded as CESU-8 into an instance of the
69    /// `Cesu8` type. All the heavy lifting in made in the UTF-8 module.
70    fn from_bytes_no_check(bytes: &[u8], big_endian: bool) -> Result<Self, UnicodeEncodingError> {
71        match Utf8::from_bytes_no_check(bytes, big_endian) {
72            Ok(x) => Ok(Cesu8{data: x}),
73            Err(y) => Err(y),
74        }
75    }
76
77
78
79}
80
81/* ---------------------------- Helper functions ---------------------------- */
82
83const SMALL_DATA_LIMIT: u32 = 0xFFFF;
84
85fn utf_32_glyph_to_cesu_8(glyph: u32) -> Vec<u8> {
86    let glyph_in_vec = Utf32{data: vec![glyph]};
87    if glyph <= SMALL_DATA_LIMIT {
88        return glyph_in_vec.convert_to::<Utf8>().to_bytes(false);
89    } else {
90        let utf16 = glyph_in_vec.convert_to::<Utf16>();
91        let mut ret: Vec<u8> = Vec::new();
92        for surrogate in utf16.data {
93            let surrogate_in_vec = Utf32{data: vec![surrogate as u32]};
94            for byte in surrogate_in_vec.convert_to::<Utf8>().to_bytes(false) {
95                ret.push(byte);
96            }
97        }
98        return ret;
99    }
100}
101
102/* --------------------------------- Testing -------------------------------- */
103
/// Check the CESU-8 bytes produced for a one-byte, a two-byte and a
/// supplementary-plane code point.
#[test]
fn test_utf_32_glyph_to_cesu_8() {
    // (code point, expected CESU-8 bytes)
    let cases: [(u32, Vec<u8>); 3] = [
        (0x0045, vec![0x45]),
        (0x0205, vec![0xC8, 0x85]),
        (0x10400, vec![0xED, 0xA0, 0x81, 0xED, 0xB0, 0x80]),
    ];

    for (glyph, expected) in cases.iter() {
        assert_eq!(*expected, utf_32_glyph_to_cesu_8(*glyph));
    }
}
118
/// Encode single code points to CESU-8, decode them again and check that the
/// first decoded value equals the input.
#[test]
fn test_utf32_to_cesu_8_and_back() {
    for &glyph in &[0x0045u32, 0x0205, 0x10400] {
        let encoded = utf_32_glyph_to_cesu_8(glyph);
        let decoded = Cesu8::from_bytes_no_check(&encoded, false)
            .unwrap()
            .to_utf_32();
        assert_eq!(decoded.data[0], glyph);
    }
}
133