unic_cli/
parsers.rs

1// Copyright 2017 The UNIC Project Developers.
2//
3// See the COPYRIGHT file at the top-level directory of this distribution.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use std::char;
12use std::str;
13
14use regex::Regex;
15
16lazy_static! {
17    // Anything not alphanumeric or `+`
18    static ref CODEPOINT_SEPARATORS: Regex = Regex::new(r#"[^\w+]"#).unwrap();
19
20    // Unicode codepoint prefix: `U+`
21    static ref CODEPOINT_PREFIX: Regex = Regex::new(r#"^[Uu][+]"#).unwrap();
22
23    // Anything not alphanumeric
24    static ref HEX_SEPARATORS: Regex = Regex::new(r#"[^\w]"#).unwrap();
25
26    // Hex prefix: `0x`
27    static ref HEX_PREFIX: Regex = Regex::new(r#"^0[xX]"#).unwrap();
28}
29
30pub fn codepoints(string: &str) -> String {
31    CODEPOINT_SEPARATORS
32        .split(&string)
33        .map(|token| {
34            let mut token = token;
35            if CODEPOINT_PREFIX.is_match(token) {
36                token = &token[2..];
37            }
38            let codepoint = u32::from_str_radix(token, 16)
39                .unwrap_or_else(|_| panic!("Cannot parse token as hex number: {}", token));
40            char::from_u32(codepoint)
41                .unwrap_or_else(|| panic!("Invalid Unicode Scalar Value code-point: {}", codepoint))
42        })
43        .collect::<String>()
44}
45
46pub fn utf8_hex(string: &str) -> String {
47    let utf8 = HEX_SEPARATORS.split(&string).map(|token| {
48        let mut token = token;
49        if HEX_PREFIX.is_match(token) {
50            token = &token[2..];
51        }
52        u8::from_str_radix(token, 16)
53            .unwrap_or_else(|_| panic!("Cannot parse token as hex byte value: {}", token))
54    });
55
56    String::from_utf8(utf8.collect()).expect("Invalid UTF-8 sequence")
57}
58
59pub fn utf16_hex(string: &str) -> String {
60    let utf16 = HEX_SEPARATORS.split(&string).map(|token| {
61        let mut token = token;
62        if HEX_PREFIX.is_match(token) {
63            token = &token[2..];
64        }
65        u16::from_str_radix(token, 16)
66            .unwrap_or_else(|_| panic!("Cannot parse token as hex byte value: {}", token))
67    });
68
69    char::decode_utf16(utf16)
70        .map(|r| r.expect("Invalid UTF-16 sequence"))
71        .collect()
72}