ende/
ucs2.rs

1/*!
2UCS-2 encoding and decoding.
3
4# Encoding
In UCS-2, every Unicode code point is represented using exactly [two bytes](#two-bytes); the size is always fixed.
6
7# Decoding
A UCS-2 code point is decoded into a Unicode code point using the first [two bytes](#two-bytes).
9
10## Representation
11
12**Note**:
13
14* UCS-2 is a subset of UTF-16.
* UCS-2 is capable of encoding 65,536 code points. These are the same as the first 65,536 code points that UTF-16 can encode.
16
17### Two bytes
18
**Encoding**: If the Unicode code point is not greater than 0xFFFF, it is represented in UCS-2 using only its 16 least significant bits. Larger code points cannot be represented in UCS-2.
20
**Decoding**: A UCS-2 code point outside the surrogate range (0xD800 to 0xDFFF) maps directly to the Unicode code point with the same value, i.e. the 16 bits are zero-extended. Surrogate values have no meaning in UCS-2.
22
23* Unicode code point: `nnnnnnnn|nnnnnnnn|xxxxxxxx|xxxxxxxx`
* UCS-2 code point: `xxxxxxxx|xxxxxxxx`
25*/
26
/// Pretty print UCS-2 code points in hexadecimal, optionally in binary, and in decimal.
///
/// The output is framed by a title line that shows the text the code points spell
/// and by a closing rule of dashes of the same width as the title line.
///
/// # Parameters
/// * `ucs2_cp`: [`Vec<u16>`] - A vector of UCS-2 code points.
/// * `binary_flag`: [`bool`] - When `true`, an extra `Bin:` row with the 16-bit binary
///   representation of every code point is printed.
///
/// # Note
/// The values printed in hexadecimal are code points in UCS-2.
/// Values that do not form valid text (for example unpaired surrogates) are shown as
/// U+FFFD in the title line instead of aborting the print.
fn print_ucs2_vec<T: AsRef<Vec<u16>>>(ucs2_cp: T, binary_flag: bool) {
    // Borrow the caller's data; nothing here needs an owned copy.
    let units = ucs2_cp.as_ref().as_slice();

    // Lossy conversion: a diagnostic printer should not panic on odd input.
    let text = String::from_utf16_lossy(units);
    let title = format!("--------------- UCS-2 of \"{}\" ---------------", text);

    println!();
    println!("{}", title);
    println!("Hex: {:x?}", units);
    if binary_flag {
        // Every code point is 16 bits wide, so pad to 16 digits.
        let bits: Vec<String> = units.iter().map(|unit| format!("{:016b}", unit)).collect();
        println!("Bin: {:?}", bits);
    }
    println!("Dec: {:?}", units);
    // Derive the rule width from the title so the two can never drift apart.
    println!("{}", "-".repeat(title.chars().count()));
    println!();
}
56
57// ============================================================================
58// ================================ Public API ================================
59// ============================================================================
60/// Pretty print the UCS-2 encoding in hexadecimal and decimal of a vector of UCS-2 code points.
61///
62/// # Parameters
63/// * `ucs2_cp`: [`Vec<u16>`] - A vector of UCS-2 code points.
64///
65/// # Note
66/// The bytes printed in hexadecimal are code points in UCS-2.
67///
68/// # Example
69/// ```rust
70/// use ende::prelude::*;
71/// let v: Vec<u16> = vec![0xFFEE];
72/// print_ucs2(&v);
73/// ```
74/// **Output**
75/// ```text
76/// --------------- UTF-16 encoding of "𐀁" ---------------
77/// Hex: [0xFFEE]
78/// Dec: [65518]
79/// ------------------------------------------------------
80pub fn print_ucs2<T: AsRef<Vec<u16>>>(utf2_cp: T) {
81    print_ucs2_vec(utf2_cp, false);
82}
83
84/// Pretty print the UCS-2 encoding in hexadecimal and decimal of a vector of UCS-2 code points.
85///
86/// # Parameters
87/// * `ucs2_cp`: [`Vec<u16>`] - A vector of UCS-2 code points.
88///
89/// # Note
90/// The bytes printed in hexadecimal are code points in UCS-2.
91///
92/// # Example
93/// ```rust
94/// use ende::prelude::*;
95/// let v: Vec<u16> = vec![0xFFEE];
96/// print_ucs2_b(&v);
97/// ```
98/// **Output**
99/// ```text
100/// --------------- UTF-16 encoding of "𐀀" ---------------
101/// Hex: [0xFFEE]
102/// Bin: ["1111111111101110"]
103/// Dec: [65518]
104/// ------------------------------------------------------
105pub fn print_ucs2_b<T: AsRef<Vec<u16>>>(ucs2_cp: T) {
106    print_ucs2_vec(ucs2_cp, true);
107}
/// Encode a vector of unicode code points into a vector of UCS-2 code points.
///
/// Each input value is narrowed to 16 bits; the output has the same length and
/// order as the input.
///
/// # Parameters
/// * `unicode_cp`: [`Vec<u32>`] - A vector of unicode code points.
///
/// # Returns
/// A [`Vec<u16>`] containing the UCS-2 code points.
///
/// # Panics
/// * If any value in `unicode_cp` is greater than `0xFFFF`, because such a code point
///   does not fit in the two bytes UCS-2 provides.
///
/// # Example
/// ```rust
/// use ende::prelude::*;
/// let v: Vec<u32> = vec![0xFFEE]; // Array of code points in unicode
/// let enc: Vec<u16> = encode_in_ucs2(&v);
/// assert_eq!(enc, vec![0xFFEE]);
/// ```
pub fn encode_in_ucs2<T: AsRef<Vec<u32>>>(unicode_cp: T) -> Vec<u16> {
    unicode_cp
        .as_ref()
        .iter()
        .map(|&scalar| match scalar {
            // Fits in 16 bits, so the narrowing cast cannot lose information.
            0..=0xFFFF => scalar as u16,
            _ => panic!("Invalid UCS-2 sequence"),
        })
        .collect()
}
138
/// Decode a vector of UCS-2 code points into a vector of unicode code points.
///
/// Each 16-bit value is widened to 32 bits; the output has the same length and
/// order as the input.
///
/// # Parameters
/// * `ucs2_cp`: [`Vec<u16>`] - A vector of UCS-2 code points.
///
/// # Returns
/// A [`Vec<u32>`] containing the unicode code points.
///
/// # Panics
/// * If `ucs2_cp` contains a surrogate value (`0xD800..=0xDFFF`). Both the high and the
///   low surrogate halves are rejected, since neither denotes a character in UCS-2.
///
/// # Example
/// ```rust
/// use ende::prelude::*;
/// let v: Vec<u16> = vec![0xFFEE]; // Array of code points in UCS-2
/// let dec: Vec<u32> = decode_from_ucs2(&v);
/// assert_eq!(dec, vec![0xFFEE]);
/// ```
pub fn decode_from_ucs2<T: AsRef<Vec<u16>>>(ucs2_cp: T) -> Vec<u32> {
    ucs2_cp
        .as_ref()
        .iter()
        .map(|&unit| {
            // The whole surrogate block is reserved, not just its first half.
            if (0xD800..=0xDFFF).contains(&unit) {
                panic!("Invalid UCS-2 sequence");
            }
            u32::from(unit)
        })
        .collect()
}