sj/parser/
unicode_char.rs

1/*
2==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--
3
4SJ
5
6Copyright (C) 2019-2025  Anonymous
7
8There are several releases over multiple years,
9they are listed as ranges, such as: "2019-2025".
10
11This program is free software: you can redistribute it and/or modify
12it under the terms of the GNU Lesser General Public License as published by
13the Free Software Foundation, either version 3 of the License, or
14(at your option) any later version.
15
16This program is distributed in the hope that it will be useful,
17but WITHOUT ANY WARRANTY; without even the implied warranty of
18MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19GNU Lesser General Public License for more details.
20
21You should have received a copy of the GNU Lesser General Public License
22along with this program.  If not, see <https://www.gnu.org/licenses/>.
23
24::--::--::--::--::--::--::--::--::--::--::--::--::--::--::--::--
25*/
26
27//! # Unicode character
28//!
29//! ## References
30//!
31//! - <https://en.wikipedia.org/wiki/UTF-8>
32
33use {
34    alloc::vec::Vec,
35    crate::Result,
36};
37
/// # Unicode character
///
/// Collects hexadecimal digits one at a time (see `add_hex`) and, once every slot has been
/// filled, can be turned into UTF-8 bytes (see `encode_as_utf8_bytes`).
#[derive(Debug)]
pub (super) struct UnicodeChar {
    /// One slot per hexadecimal digit. Slots at even indexes hold the digit value multiplied by
    /// 16 (the high half of a byte), slots at odd indexes hold the digit value as is (the low
    /// half), so adding a pair of neighbouring slots yields one complete byte.
    bytes: [u8; 4],
    /// Index of the next free slot in `bytes`; equals the number of digits stored so far.
    idx: usize,
}
44
45impl UnicodeChar {
46
47    /// # Makes new instance
48    #[inline(always)]
49    pub fn new() -> Self {
50        Self {
51            bytes: [0; 4],
52            idx: 0,
53        }
54    }
55
56    /// # Adds new byte written as a hexadecimal
57    #[inline(always)]
58    pub fn add_hex(&mut self, hex: &u8) -> Result<()> {
59        match self.bytes.get_mut(self.idx) {
60            Some(item) => {
61                let byte = match hex {
62                    b'0'..=b'9' => hex - b'0',
63                    b'a'..=b'f' => hex - b'a' + 10,
64                    b'A'..=b'F' => hex - b'A' + 10,
65                    _ => return Err(err!("Not a hexadecimal: {:?}", char::from(*hex))),
66                };
67                match self.idx % 2 {
68                    0 => *item = byte * 16,
69                    _ => *item = byte,
70                };
71                self.idx += 1;
72                Ok(())
73            },
74            None => return Err(e!("Bytes are full")),
75        }
76    }
77
78    /// # Checks if data is full
79    #[inline(always)]
80    pub fn is_full(&self) -> bool {
81        self.idx >= self.bytes.len()
82    }
83
84    /// # Encodes as UTF-8 bytes
85    #[inline(always)]
86    pub fn encode_as_utf8_bytes(self, out: &mut Vec<u8>) -> Result<()> {
87        const MARKER: u8 = 0b_1000_0000;
88
89        if self.is_full() {
90            let first = self.bytes[0] + self.bytes[1];
91            let last = self.bytes[2] + self.bytes[3];
92            let byte_count = match first {
93                0x00 => match last {
94                    0x00..=0x7f => ByteCount::One,
95                    _ => ByteCount::Two,
96                },
97                0x01..=0x07 => ByteCount::Two,
98                _ => ByteCount::Three,
99            };
100            match byte_count {
101                ByteCount::One => out.push(last),
102                ByteCount::Two => {
103                    const HEADER_OF_TWO: u8 = 0b_1100_0000;
104                    out.push(HEADER_OF_TWO | (first << 5 >> 3) | (last >> 6));
105                    out.push(MARKER | (last << 2 >> 2));
106                },
107                ByteCount::Three => {
108                    const HEADER_OF_THREE: u8 = 0b_1110_0000;
109                    out.push(HEADER_OF_THREE | (first >> 4));
110                    out.push(MARKER | (first << 4 >> 2) | (last >> 6));
111                    out.push(MARKER | (last << 2 >> 2));
112                },
113            };
114            return Ok(());
115        }
116
117        Err(e!("Bytes are not full"))
118    }
119
120}
121
122/// # Byte count
123#[derive(Debug)]
124enum ByteCount {
125    One,
126    Two,
127    Three,
128}
129
#[test]
fn tests() -> Result<()> {
    // Pairs of (hexadecimal digits in mixed case, character they must decode to).
    const CASES: &[(&str, char)] = &[
        // One byte in UTF-8
        ("0024", '\u{0024}'),
        // Two bytes in UTF-8
        ("00A2", '\u{00a2}'),
        ("07fF", '\u{07ff}'),
        // Three bytes in UTF-8
        ("0939", '\u{0939}'),
        ("20AC", '\u{20AC}'),
        ("d55c", '\u{d55c}'),
        ("1d2D", '\u{1d2d}'),
        ("0800", '\u{0800}'),
        ("fFFf", '\u{FFff}'),
    ];

    for &(digits, expected) in CASES {
        // Feed the digits one by one, then encode
        let mut uc = UnicodeChar::new();
        digits.bytes().try_for_each(|digit| uc.add_hex(&digit))?;

        let mut actual = Vec::with_capacity(4);
        uc.encode_as_utf8_bytes(&mut actual)?;

        // Reference encoding comes from the language itself
        let mut scratch = [0; 4];
        let wanted = expected.encode_utf8(&mut scratch).as_bytes();
        assert_eq!(actual, wanted);
    }

    Ok(())
}