irox_bits/
utf.rs

1// SPDX-License-Identifier: MIT
2// Copyright 2025 IROX Contributors
3//
4
5//!
6//! UTF-8 Encoding & Decoding
7//!
8//! Using info from: [here](https://simonsapin.github.io/wtf-8/#generalized-utf8)
9//!
10
11#![allow(clippy::unusual_byte_groupings)]
12#![allow(clippy::indexing_slicing)]
13
14use crate::{Bits, BitsErrorKind, Error, MutBits};
15
/// Exclusive upper bound of the single-byte range: a first byte below this
/// value is a complete character on its own.
const MAX_ONE_BYTE: u8 = 1 << 7;

/// Lead-byte tag of a two-byte sequence (`110x_xxxx`).
const TAG_TWO_BYTE: u8 = 0b1100_0000;
/// Selects the three tag bits of a two-byte lead byte.
const TAG_TWO_BYTE_MASK: u8 = 0b1110_0000;
/// Exclusive upper bound of the code points that fit in two bytes
/// (5 + 6 = 11 payload bits).
const MAX_TWO_BYTE: u32 = 1 << 11;

/// Lead-byte tag of a three-byte sequence (`1110_xxxx`).
const TAG_THREE_BYTE: u8 = 0b1110_0000;
/// Selects the four tag bits of a three-byte lead byte.
const TAG_THREE_BYTE_MASK: u8 = 0b1111_0000;
/// Exclusive upper bound of the code points that fit in three bytes
/// (4 + 6 + 6 = 16 payload bits).
const MAX_THREE_BYTE: u32 = 1 << 16;

/// Lead-byte tag of a four-byte sequence (`1111_0xxx`).
const TAG_FOUR_BYTE: u8 = 0b1111_0000;
/// Selects the five tag bits of a four-byte lead byte.
const TAG_FOUR_BYTE_MASK: u8 = 0b1111_1000;

/// Tag OR-ed onto the six payload bits of every continuation byte (`10xx_xxxx`).
const TAG_CONTINUE: u8 = 0b1000_0000;
30
31/// Returns the number of required bytes to store the specified character in UTF-8.
32pub fn required_utf8_bytes(val: char) -> usize {
33    let val = val as u32;
34    if val < MAX_ONE_BYTE as u32 {
35        1
36    } else if val < MAX_TWO_BYTE {
37        2
38    } else if val < MAX_THREE_BYTE {
39        3
40    } else {
41        4
42    }
43}
44/// Encodes the character into the provided buffer using UTF-8.  A subslice
45/// of the provided buffer is returned providing the amount of buffer actually
46/// used.
47pub fn encode_be_utf8_char(val: char, buf: &mut [u8; 4]) -> Result<&[u8], Error> {
48    let len = required_utf8_bytes(val);
49    let val = val as u32;
50    match (len, &mut buf[..]) {
51        (1, [a, ..]) => {
52            *a = val as u8;
53        }
54        (2, [a, b, ..]) => {
55            *a = ((val >> 6) & 0x1F) as u8 | TAG_TWO_BYTE;
56            *b = (val & 0x3F) as u8 | TAG_CONTINUE;
57        }
58        (3, [a, b, c, ..]) => {
59            *a = ((val >> 12) & 0x0F) as u8 | TAG_THREE_BYTE;
60            *b = ((val >> 6) & 0x3F) as u8 | TAG_CONTINUE;
61            *c = (val & 0x3F) as u8 | TAG_CONTINUE;
62        }
63        (4, [a, b, c, d]) => {
64            *a = ((val >> 18) & 0x07) as u8 | TAG_FOUR_BYTE;
65            *b = ((val >> 12) & 0x3F) as u8 | TAG_CONTINUE;
66            *c = ((val >> 6) & 0x3F) as u8 | TAG_CONTINUE;
67            *d = (val & 0x3F) as u8 | TAG_CONTINUE;
68        }
69        _ => return Err(BitsErrorKind::FormatError.into()),
70    }
71    Ok(&buf[..len])
72}
73
74/// Writes 1, 2, 3, or 4 bytes representing the unicode UTF-8 format character to the
75/// specified output.  Upon success, returns the number of bytes written.
76pub fn write_be_utf8_char<T: MutBits + ?Sized>(val: char, out: &mut T) -> Result<usize, Error> {
77    let mut buf = [0u8; 4];
78    let val = encode_be_utf8_char(val, &mut buf)?;
79    out.write_all_bytes(val)?;
80    Ok(val.len())
81}
82
83/// Reads 1, 2, 3, or 4 bytes representing the unicode UTF-8 formatted character from the
84/// specified input.  Returns the character read and the number of bytes consumed.
85pub fn read_be_utf8_char<T: Bits + ?Sized>(src: &mut T) -> Result<(char, usize), Error> {
86    let a = src.read_u8()?;
87    if a < MAX_ONE_BYTE {
88        return Ok((a as char, 1));
89    }
90    let (val, len) = if (a & TAG_TWO_BYTE_MASK) == TAG_TWO_BYTE {
91        let b = (src.read_u8()? & 0x3F) as u32;
92        let a = ((a & 0x1F) as u32) << 6;
93        (a | b, 2)
94    } else if (a & TAG_THREE_BYTE_MASK) == TAG_THREE_BYTE {
95        let b = ((src.read_u8()? & 0x3F) as u32) << 6;
96        let c = (src.read_u8()? & 0x3F) as u32;
97        let a = ((a & 0xF) as u32) << 12;
98        (a | b | c, 3)
99    } else if (a & TAG_FOUR_BYTE_MASK) == TAG_FOUR_BYTE {
100        let b = ((src.read_u8()? & 0x3F) as u32) << 12;
101        let c = ((src.read_u8()? & 0x3F) as u32) << 6;
102        let d = (src.read_u8()? & 0x3F) as u32;
103        let a = ((a & 0x7) as u32) << 24;
104        (a | b | c | d, 4)
105    } else {
106        return Err(BitsErrorKind::FormatError.into());
107    };
108    let Some(val) = char::from_u32(val) else {
109        return Err(BitsErrorKind::InvalidInput.into());
110    };
111    Ok((val, len))
112}