ende/ucs2.rs
1/*!
2UCS-2 encoding and decoding.
3
4# Encoding
A unicode code point is always represented using exactly [two bytes](#two-bytes) in UCS-2; the size is fixed.
6
7# Decoding
A UCS-2 code point is decoded into a unicode code point using its [two bytes](#two-bytes).
9
10## Representation
11
12**Note**:
13
14* UCS-2 is a subset of UTF-16.
* UCS-2 is capable of encoding 65,536 code points. These are the same as the first 65,536 unicode code points (0x0000 through 0xFFFF).
16
17### Two bytes
18
**Encoding**: If the unicode code point is less than or equal to 0xFFFF, it is represented in UCS-2 using only its 16 least significant bits.
20
**Decoding**: If the UCS-2 code point is less than 0xD800, or greater than 0xDBFF and at most 0xFFFF, the unicode code point is its 16-bit value, zero-extended to 32 bits.
22
23* Unicode code point: `nnnnnnnn|nnnnnnnn|xxxxxxxx|xxxxxxxx`
* UCS-2 code point: `xxxxxxxx|xxxxxxxx`
25*/
26
/// Pretty print the UCS-2 code points in hexadecimal, (optionally) binary, and decimal.
///
/// The output is a banner containing the text rendering of the input, followed by a
/// `Hex:` line, an optional `Bin:` line, a `Dec:` line and a closing rule, with one
/// blank line before and after.
///
/// # Parameters
/// * `ucs2_cp`: [`Vec<u16>`] - A vector of UCS-2 code points.
/// * `binary_flag`: [`bool`] - When `true`, also print every code point as a 16-digit binary string.
///
/// # Note
/// * The values printed in hexadecimal are whole 16-bit UCS-2 code points, not single bytes.
/// * The banner text is built with [`String::from_utf16_lossy`], so an unpaired surrogate
///   is shown as U+FFFD instead of aborting the print with a panic.
fn print_ucs2_vec<T: AsRef<Vec<u16>>>(ucs2_cp: T, binary_flag: bool) {
    // Borrow the caller's data; printing never needs an owned copy.
    let v: &Vec<u16> = ucs2_cp.as_ref();
    // Lossy on purpose: a diagnostic printer must not panic on odd input.
    let string_repr: String = String::from_utf16_lossy(v);
    println!();
    println!(
        "--------------- UCS-2 of \"{}\" ---------------",
        string_repr
    );
    println!("Hex: {:x?}", v);
    if binary_flag {
        // UCS-2 code points are 16 bits wide, so pad to 16 binary digits.
        // Built only when requested to avoid useless allocations.
        let binary_repr: Vec<String> = v.iter().map(|x| format!("{:016b}", x)).collect();
        println!("Bin: {:?}", binary_repr);
    }
    println!("Dec: {:?}", v);
    // Closing rule grows with the number of characters shown in the banner.
    println!(
        "{}{}",
        "-".repeat(44),
        "-".repeat(string_repr.chars().count())
    );
    println!();
}
56
57// ============================================================================
58// ================================ Public API ================================
59// ============================================================================
60/// Pretty print the UCS-2 encoding in hexadecimal and decimal of a vector of UCS-2 code points.
61///
62/// # Parameters
63/// * `ucs2_cp`: [`Vec<u16>`] - A vector of UCS-2 code points.
64///
65/// # Note
66/// The bytes printed in hexadecimal are code points in UCS-2.
67///
68/// # Example
69/// ```rust
70/// use ende::prelude::*;
71/// let v: Vec<u16> = vec![0xFFEE];
72/// print_ucs2(&v);
73/// ```
74/// **Output**
75/// ```text
76/// --------------- UTF-16 encoding of "𐀁" ---------------
77/// Hex: [0xFFEE]
78/// Dec: [65518]
79/// ------------------------------------------------------
80pub fn print_ucs2<T: AsRef<Vec<u16>>>(utf2_cp: T) {
81 print_ucs2_vec(utf2_cp, false);
82}
83
84/// Pretty print the UCS-2 encoding in hexadecimal and decimal of a vector of UCS-2 code points.
85///
86/// # Parameters
87/// * `ucs2_cp`: [`Vec<u16>`] - A vector of UCS-2 code points.
88///
89/// # Note
90/// The bytes printed in hexadecimal are code points in UCS-2.
91///
92/// # Example
93/// ```rust
94/// use ende::prelude::*;
95/// let v: Vec<u16> = vec![0xFFEE];
96/// print_ucs2_b(&v);
97/// ```
98/// **Output**
99/// ```text
100/// --------------- UTF-16 encoding of "𐀀" ---------------
101/// Hex: [0xFFEE]
102/// Bin: ["1111111111101110"]
103/// Dec: [65518]
104/// ------------------------------------------------------
105pub fn print_ucs2_b<T: AsRef<Vec<u16>>>(ucs2_cp: T) {
106 print_ucs2_vec(ucs2_cp, true);
107}
/// Encode a vector of unicode code points into a vector of UCS-2 code points.
///
/// Every input value is narrowed to 16 bits; the output has one element per input
/// element, in the same order.
///
/// # Parameters
/// * `unicode_cp`: [`Vec<u32>`] - A vector of unicode code points.
///
/// # Returns
/// A [`Vec<u16>`] containing the UCS-2 code points.
///
/// # Panics
/// * If any value in `unicode_cp` is greater than `0xFFFF`, i.e. does not fit in
///   the two bytes UCS-2 provides.
///
/// # Note
/// The upper bound is the only check performed: values in the surrogate range
/// (`0xD800..=0xDFFF`) are passed through unchanged.
///
/// # Example
/// ```rust
/// use ende::prelude::*;
/// let v: Vec<u32> = vec![0xFFEE]; // Array of code points in unicode
/// let enc: Vec<u16> = encode_in_ucs2(&v);
/// assert_eq!(enc, vec![0xFFEE]);
/// ```
pub fn encode_in_ucs2<T: AsRef<Vec<u32>>>(unicode_cp: T) -> Vec<u16> {
    unicode_cp
        .as_ref()
        .iter()
        .map(|&scalar| {
            // Anything above the 16-bit range cannot be expressed in UCS-2.
            if scalar > 0xFFFF {
                panic!("Invalid UCS-2 sequence");
            }
            // Lossless: the guard above proved the value fits in 16 bits.
            scalar as u16
        })
        .collect()
}
138
/// Decode a vector of UCS-2 code points into a vector of unicode code points.
///
/// Every accepted 16-bit value is widened to 32 bits unchanged; the output has one
/// element per input element, in the same order.
///
/// # Parameters
/// * `ucs2_cp`: [`Vec<u16>`] - A vector of UCS-2 code points.
///
/// # Returns
/// A [`Vec<u32>`] containing the unicode code points.
///
/// # Panics
/// * If any value in `ucs2_cp` lies in `0xD800..=0xDBFF`.
///
/// # Note
/// NOTE(review): only `0xD800..=0xDBFF` is rejected; values in `0xDC00..=0xDFFF`
/// are passed through unchanged. Confirm whether that is intended.
///
/// # Example
/// ```rust
/// use ende::prelude::*;
/// let v: Vec<u16> = vec![0xFFEE]; // Array of code points in UCS-2
/// let dec: Vec<u32> = decode_from_ucs2(&v);
/// assert_eq!(dec, vec![0xFFEE]);
/// ```
pub fn decode_from_ucs2<T: AsRef<Vec<u16>>>(ucs2_cp: T) -> Vec<u32> {
    let units: &Vec<u16> = ucs2_cp.as_ref();
    units
        .iter()
        .map(|&unit| match unit {
            // Values in this range are refused outright.
            0xD800..=0xDBFF => panic!("Invalid UCS-2 sequence"),
            // Everything else maps to the code point with the same numeric value.
            _ => u32::from(unit),
        })
        .collect()
}