1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
use std::char;
use std::str;
use regex::Regex;
// Lazily compiled patterns used by the hex-parsing helpers in this file.
lazy_static! {
// Token separator for code-point lists: any single character that is neither a
// regex word character (`\w`) nor a literal `+`, so `U+XXXX` tokens stay intact.
static ref CODEPOINT_SEPARATORS: Regex = Regex::new(r#"[^\w+]"#).unwrap();
// Optional two-character `U+` / `u+` prefix at the start of a code-point token.
static ref CODEPOINT_PREFIX: Regex = Regex::new(r#"^[Uu][+]"#).unwrap();
// Token separator for hex code-unit lists: any single character that is not a
// regex word character (`\w`).
static ref HEX_SEPARATORS: Regex = Regex::new(r#"[^\w]"#).unwrap();
// Optional two-character `0x` / `0X` prefix at the start of a hex token.
static ref HEX_PREFIX: Regex = Regex::new(r#"^0[xX]"#).unwrap();
}
/// Builds a `String` from a list of Unicode code points written in hexadecimal.
///
/// `string` is split on every character matched by `CODEPOINT_SEPARATORS`. Each
/// remaining token is read as a hexadecimal number, optionally preceded by a
/// `U+` / `u+` prefix (`CODEPOINT_PREFIX`), and converted to a `char`.
///
/// Empty tokens (produced by consecutive, leading or trailing separators, or by
/// an empty input) are skipped, so `"U+0041, U+0042"` yields `"AB"` and `""`
/// yields an empty string.
///
/// # Panics
///
/// Panics if a non-empty token cannot be parsed as a hexadecimal `u32`, or if the
/// parsed value is not a Unicode scalar value (`char::from_u32` rejects it).
pub fn codepoints(string: &str) -> String {
    CODEPOINT_SEPARATORS
        .split(string)
        // Adjacent separators such as ", " make the regex split yield empty
        // tokens; they carry no code point and must not reach the parser.
        .filter(|token| !token.is_empty())
        .map(|token| {
            // The prefix pattern matches exactly two ASCII characters, so slicing
            // at byte offset 2 always lands on a character boundary.
            let digits = if CODEPOINT_PREFIX.is_match(token) {
                &token[2..]
            } else {
                token
            };
            let codepoint = u32::from_str_radix(digits, 16)
                .unwrap_or_else(|_| panic!("Cannot parse token as hex number: {}", digits));
            char::from_u32(codepoint)
                .unwrap_or_else(|| panic!("Invalid Unicode Scalar Value code-point: {}", codepoint))
        })
        .collect()
}
/// Builds a `String` from a list of UTF-8 bytes written in hexadecimal.
///
/// `string` is split on every character matched by `HEX_SEPARATORS`. Each
/// remaining token is read as a hexadecimal byte, optionally preceded by a
/// `0x` / `0X` prefix (`HEX_PREFIX`), and the resulting byte sequence is decoded
/// as UTF-8.
///
/// Empty tokens (produced by consecutive, leading or trailing separators, or by
/// an empty input) are skipped, so `"0x41, 0x42"` yields `"AB"` and `""` yields
/// an empty string.
///
/// # Panics
///
/// Panics if a non-empty token cannot be parsed as a hexadecimal `u8`, or if the
/// collected bytes are not valid UTF-8.
pub fn utf8_hex(string: &str) -> String {
    let bytes: Vec<u8> = HEX_SEPARATORS
        .split(string)
        // Adjacent separators such as ", " make the regex split yield empty
        // tokens; they carry no byte and must not reach the parser.
        .filter(|token| !token.is_empty())
        .map(|token| {
            // The prefix pattern matches exactly two ASCII characters, so slicing
            // at byte offset 2 always lands on a character boundary.
            let digits = if HEX_PREFIX.is_match(token) {
                &token[2..]
            } else {
                token
            };
            u8::from_str_radix(digits, 16)
                .unwrap_or_else(|_| panic!("Cannot parse token as hex byte value: {}", digits))
        })
        .collect();
    String::from_utf8(bytes).expect("Invalid UTF-8 sequence")
}
/// Builds a `String` from a list of UTF-16 code units written in hexadecimal.
///
/// `string` is split on every character matched by `HEX_SEPARATORS`. Each
/// remaining token is read as a hexadecimal `u16`, optionally preceded by a
/// `0x` / `0X` prefix (`HEX_PREFIX`), and the resulting code-unit sequence is
/// decoded as UTF-16.
///
/// Empty tokens (produced by consecutive, leading or trailing separators, or by
/// an empty input) are skipped, so `"0x0041, 0x0042"` yields `"AB"` and `""`
/// yields an empty string.
///
/// # Panics
///
/// Panics if a non-empty token cannot be parsed as a hexadecimal `u16`, or if
/// `char::decode_utf16` reports an error for the collected code units.
pub fn utf16_hex(string: &str) -> String {
    let units = HEX_SEPARATORS
        .split(string)
        // Adjacent separators such as ", " make the regex split yield empty
        // tokens; they carry no code unit and must not reach the parser.
        .filter(|token| !token.is_empty())
        .map(|token| {
            // The prefix pattern matches exactly two ASCII characters, so slicing
            // at byte offset 2 always lands on a character boundary.
            let digits = if HEX_PREFIX.is_match(token) {
                &token[2..]
            } else {
                token
            };
            // Tokens here are 16-bit code units, not bytes; the message says so.
            u16::from_str_radix(digits, 16).unwrap_or_else(|_| {
                panic!("Cannot parse token as hex UTF-16 code unit: {}", digits)
            })
        });
    char::decode_utf16(units)
        .map(|unit| unit.expect("Invalid UTF-16 sequence"))
        .collect()
}