const_utf16/
lib.rs

1//! Const evaluated utf8 to utf16 conversion functions.
2//!
3//! # Use
4//!
5//! ```
6//! # #[macro_use]
7//! # extern crate const_utf16;
8//! # fn main() {}
9//! const HELLO_WORLD_UTF16: &[u16] = const_utf16::encode!("Hello, world!");
10//! ```
11#![no_std]
12#![deny(missing_docs)]
13
/// Encode a `&str` as a UTF-16 buffer, entirely at compile time.
///
/// The argument must be a constant expression of type `&'static str`. The
/// macro evaluates to a `&'static [u16; N]`, where `N` is the number of
/// UTF-16 code units produced for the string.
///
/// The two-argument form `encode!(s, null_terminated)` additionally rejects
/// strings containing a NUL code point and leaves one trailing `0` code unit
/// at the end of the buffer; `encode!(s, non_null_terminated)` is the same as
/// the one-argument form. The `@@` arms are internal helpers.
#[macro_export]
macro_rules! encode {
    ($s:expr) => {{
        $crate::encode!($s, non_null_terminated)
    }};
    ($s:expr, $null_terminated:ident) => {{
        const __INPUT: &'static str = $s;
        // 1 when a terminator slot is requested, 0 otherwise.
        const __TERMINATOR_LEN: usize = $crate::encode!(@@ $null_terminated);
        // Upper bound on the output size: UTF-8 byte count plus the terminator slot.
        const __CAPACITY: usize = __INPUT.len() + __TERMINATOR_LEN;
        // The over-sized buffer together with the number of slots actually in use.
        const __ENCODED: (&[u16; __CAPACITY], usize) = {
            let mut units = [0; __CAPACITY];
            let mut written = 0;

            let mut cursor = $crate::CodePointIterator::new(__INPUT.as_bytes());
            while let Some((rest, code_point)) = cursor.next() {
                cursor = rest;
                if __TERMINATOR_LEN == 1 && code_point == 0 {
                    // Out-of-bounds indexing aborts const evaluation, turning an
                    // embedded NUL into a compile-time error.
                    #[allow(unconditional_panic)]
                    let _ =
                        ["Found a null byte in string which should have no null bytes"][usize::MAX];
                }
                if code_point <= 0xFFFF {
                    // Fits in a single UTF-16 code unit.
                    units[written] = code_point as u16;
                    written += 1;
                } else {
                    // Supplementary planes are split into a surrogate pair.
                    let offset = code_point - 0x1_0000;
                    units[written] = 0xD800 | ((offset >> 10) as u16);
                    units[written + 1] = 0xDC00 | ((offset as u16) & 0x3FF);
                    written += 2;
                }
            }
            // The terminator slot, when present, keeps its initial value of 0.
            (&{ units }, written + __TERMINATOR_LEN)
        };
        // SAFETY: source and target are both references to a reference to a
        // `u16` array, so their layouts agree. For valid UTF-8 input the used
        // length `__ENCODED.1` never exceeds `__CAPACITY`, so the shorter array
        // type only covers elements that exist in the original buffer.
        const __TRIMMED: &[u16; __ENCODED.1] = unsafe {
            ::core::mem::transmute::<
                &'static &[u16; __CAPACITY],
                &'static &[u16; __ENCODED.1],
            >(&__ENCODED.0)
        };
        __TRIMMED
    }};
    (@@ null_terminated) => {
        1
    };
    (@@ non_null_terminated) => {
        0
    };
}
64
/// Encode a `&str` as a UTF-16 buffer followed by a terminating `0` code unit.
///
/// This is shorthand for `encode!(s, null_terminated)`.
///
/// # Panics
///
/// The underlying const evaluation panics, so compilation fails, when the
/// string contains any null bytes.
#[macro_export]
macro_rules! encode_null_terminated {
    ($text:expr) => {{
        $crate::encode!($text, null_terminated)
    }};
}
76
77#[doc(hidden)]
78pub struct CodePointIterator<'a> {
79    buffer: &'a [u8],
80    offset: usize,
81}
82
83impl<'a> CodePointIterator<'a> {
84    #[doc(hidden)]
85    pub const fn new(buffer: &'a [u8]) -> Self {
86        Self::new_with_offset(buffer, 0)
87    }
88
89    #[doc(hidden)]
90    pub const fn new_with_offset(buffer: &'a [u8], offset: usize) -> Self {
91        Self { buffer, offset }
92    }
93
94    #[doc(hidden)]
95    pub const fn next(self) -> Option<(Self, u32)> {
96        if let Some((codepont, num_utf8_bytes)) = next_code_point(self.buffer, self.offset) {
97            Some((
98                Self::new_with_offset(self.buffer, self.offset + num_utf8_bytes),
99                codepont,
100            ))
101        } else {
102            None
103        }
104    }
105}
106
107/// Largely adapted from [Rust core](https://github.com/rust-lang/rust/blob/7e2032390cf34f3ffa726b7bd890141e2684ba63/library/core/src/str/validations.rs#L40-L68).
108const fn next_code_point(bytes: &[u8], start: usize) -> Option<(u32, usize)> {
109    if bytes.len() == start {
110        return None;
111    }
112    let mut num_bytes = 1;
113    let x = bytes[start + 0];
114    if x < 128 {
115        return Some((x as u32, num_bytes));
116    }
117    // Multibyte case follows
118    // Decode from a byte combination out of: [[[x y] z] w]
119    // NOTE: Performance is sensitive to the exact formulation here
120    let init = utf8_first_byte(x, 2);
121    let y = unwrap_or_0(bytes, start + 1);
122    if y != 0 {
123        num_bytes += 1;
124    }
125    let mut ch = utf8_acc_cont_byte(init, y);
126    if x >= 0xE0 {
127        // [[x y z] w] case
128        // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
129        let z = unwrap_or_0(bytes, start + 2);
130        if z != 0 {
131            num_bytes += 1;
132        }
133        let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
134        ch = init << 12 | y_z;
135        if x >= 0xF0 {
136            // [x y z w] case
137            // use only the lower 3 bits of `init`
138            let w = unwrap_or_0(bytes, start + 3);
139            if w != 0 {
140                num_bytes += 1;
141            }
142            ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
143        }
144    }
145
146    Some((ch, num_bytes))
147}
148
/// Extracts the payload bits of a leading UTF-8 byte as the starting value of
/// the code point accumulator.
///
/// `width` selects how many bits are kept: the low 5 bits for width 2, the
/// low 4 bits for width 3 and the low 3 bits for width 4.
const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
    let payload_mask: u8 = 0x7F >> width;
    (byte & payload_mask) as u32
}
155
/// Returns `slice[index]`, or `0` when `index` is out of bounds.
const fn unwrap_or_0(slice: &[u8], index: usize) -> u8 {
    if index >= slice.len() {
        return 0;
    }
    slice[index]
}
163
164const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
165    (ch << 6) | (byte & CONT_MASK) as u32
166}
167
/// Selects the value bits (the low six bits) of a UTF-8 continuation byte.
const CONT_MASK: u8 = 0x3F;
170
#[cfg(test)]
mod tests {
    use super::*;
    use core::iter;

    /// `encode!` must agree with `str::encode_utf16`, embedded NUL included.
    #[test]
    fn encode_utf16_works() {
        const INPUT: &str = "Hello \0ä日本 語";
        const ENCODED: &[u16] = encode!(INPUT);

        let reference = INPUT.encode_utf16();
        assert!(ENCODED.iter().copied().eq(reference));
    }

    /// `encode_null_terminated!` must match `str::encode_utf16` followed by a
    /// single trailing zero.
    #[test]
    fn encode_utf16_with_null_byte_works() {
        const INPUT: &str = "Hello ä日本 語";
        const ENCODED: &[u16] = encode_null_terminated!(INPUT);

        let reference = INPUT.encode_utf16().chain(iter::once(0));
        assert!(ENCODED.iter().copied().eq(reference));
    }
}