file_content/
encoding.rs

1use std::fmt::Display;
2
3use crate::constants::{UTF16BE_BOM, UTF16LE_BOM, UTF16_BUFFER_SIZE, UTF8_BOM};
4
/// Represents the supported encodings.
///
/// The enum is a plain field-less value type, so it derives [Eq] and [Hash] in
/// addition to [PartialEq]; this lets it be used as a key in hash-based
/// collections and in APIs that require total equality.
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
pub enum Encoding {
    /// UTF-8, displayed as `UTF-8`; presumably without a byte-order mark,
    /// given the separate [Encoding::Utf8Bom] variant.
    Utf8,
    /// UTF-8 with a leading byte-order mark, displayed as `UTF-8-BOM`.
    Utf8Bom,
    /// Big-endian UTF-16, displayed as `UTF-16-BE`.
    Utf16Be,
    /// Little-endian UTF-16, displayed as `UTF-16-LE`.
    Utf16Le,
}
13
14impl From<Encoding> for String {
15    fn from(encoding: Encoding) -> Self {
16        encoding.to_string()
17    }
18}
19
20impl Display for Encoding {
21    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
22        match self {
23            Encoding::Utf8 => write!(f, "UTF-8"),
24            Encoding::Utf8Bom => write!(f, "UTF-8-BOM"),
25            Encoding::Utf16Be => write!(f, "UTF-16-BE"),
26            Encoding::Utf16Le => write!(f, "UTF-16-LE"),
27        }
28    }
29}
30
31/// Encodes a [String] into bytes using [Encoding::Utf8]
32pub fn to_utf8_bom(s: &String) -> Vec<u8> {
33    [UTF8_BOM, s.as_bytes()].concat()
34}
35
36/// Encodes a [String] into bytes using [Encoding::Utf16Be]
37pub fn to_utf16_be(s: &str) -> Vec<u8> {
38    let mut bytes = UTF16BE_BOM.to_vec();
39    let mut buffer = [0u16; UTF16_BUFFER_SIZE];
40    for c in s.chars() {
41        for u16_unit in c.encode_utf16(&mut buffer) {
42            bytes.extend_from_slice(u16_unit.to_be_bytes().as_slice())
43        }
44    }
45
46    bytes
47}
48
49/// Encodes a [String] into bytes using [Encoding::Utf16Le]
50pub fn to_utf16_le(s: &str) -> Vec<u8> {
51    let mut bytes = UTF16LE_BOM.to_vec();
52    let mut buffer = [0u16; UTF16_BUFFER_SIZE];
53    for c in s.chars() {
54        for u16_unit in c.encode_utf16(&mut buffer) {
55            bytes.extend_from_slice(u16_unit.to_le_bytes().as_slice())
56        }
57    }
58
59    bytes
60}
61
/// Table-driven tests for the encoders: every case pairs an input string with
/// the exact bytes (BOM included) that the encoder must produce.
#[cfg(test)]
mod tests {
    use test_case::test_case;

    use super::{to_utf16_be, to_utf16_le, to_utf8_bom};

    // UTF-8 with BOM: the expected bytes always start with EF BB BF.
    #[test_case("", b"\xEF\xBB\xBF"; "no chars")] // BOM is always added
    #[test_case("Hello!", b"\xEF\xBB\xBF\x48\x65\x6C\x6C\x6F\x21"; "ascii chars (8-bit chars)")]
    #[test_case("éüñç", b"\xEF\xBB\xBF\xC3\xA9\xC3\xBC\xC3\xB1\xC3\xA7"; "latin-1 chars (16-bit chars)")]
    #[test_case("你好", b"\xEF\xBB\xBF\xE4\xBD\xA0\xE5\xA5\xBD"; "mandarin chars (24-bit chars)")]
    #[test_case("🌍🚀", b"\xEF\xBB\xBF\xF0\x9F\x8C\x8D\xF0\x9F\x9A\x80"; "Supplementary Multilingual Plane chars (32-bit chars)")]
    fn test_to_utf8_bom(input: &str, expected_bytes: &[u8]) {
        // Build the `String` explicitly instead of relying on `into()` being
        // inferred from the parameter type of `to_utf8_bom`.
        let bytes = to_utf8_bom(&String::from(input));
        assert_eq!(bytes, expected_bytes);
    }

    // UTF-16 big-endian: the expected bytes always start with FE FF.
    #[test_case("", b"\xFE\xFF"; "no chars")]
    #[test_case("Hello!", b"\xFE\xFF\x00\x48\x00\x65\x00\x6C\x00\x6C\x00\x6F\x00\x21"; "16-bit chars")]
    #[test_case("🌍🚀", b"\xFE\xFF\xD8\x3C\xDF\x0D\xD8\x3D\xDE\x80"; "32-bit chars with BE BOM")]
    #[test_case("Hello! 😊", b"\xFE\xFF\x00\x48\x00\x65\x00\x6C\x00\x6C\x00\x6F\x00\x21\x00\x20\xD8\x3D\xDE\x0A"; "mixed-length chars with BE BOM")]
    fn test_to_utf16_be(input: &str, expected_bytes: &[u8]) {
        let bytes = to_utf16_be(input);
        assert_eq!(bytes, expected_bytes);
    }

    // UTF-16 little-endian: the expected bytes always start with FF FE.
    #[test_case("", b"\xFF\xFE"; "no chars")]
    #[test_case("Hello!", b"\xFF\xFE\x48\x00\x65\x00\x6C\x00\x6C\x00\x6F\x00\x21\x00"; "16-bit chars")]
    #[test_case("🌍🚀", b"\xFF\xFE\x3C\xD8\x0D\xDF\x3D\xD8\x80\xDE"; "32-bit chars with LE BOM")]
    #[test_case("Hello! 😊", b"\xFF\xFE\x48\x00\x65\x00\x6C\x00\x6C\x00\x6F\x00\x21\x00\x20\x00\x3D\xD8\x0A\xDE"; "mixed-length chars with LE BOM")]
    fn test_to_utf16_le(input: &str, expected_bytes: &[u8]) {
        let bytes = to_utf16_le(input);
        assert_eq!(bytes, expected_bytes);
    }
}