1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186
/*******************************************************************************
* Copyright (c) 2019 Association Cénotélie (cenotelie.fr)
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General
* Public License along with this program.
* If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
//! Unicode support
mod blocks;
mod categories;
use std::collections::HashMap;
use lazy_static::lazy_static;
lazy_static! {
/// Contains the supported Unicode blocks
///
/// The map is produced by `blocks::get_blocks()` the first time this static
/// is dereferenced (`lazy_static!` defers the initialization until then).
/// Keys are static strings — presumably the block names; confirm against
/// `blocks::get_blocks`.
pub static ref BLOCKS: HashMap<&'static str, Block> = blocks::get_blocks();
}
lazy_static! {
/// Contains the supported Unicode categories
///
/// The map is produced by `categories::get_categories()` the first time this
/// static is dereferenced (`lazy_static!` defers the initialization until then).
/// Keys are static strings — presumably the category names; confirm against
/// `categories::get_categories`.
pub static ref CATEGORIES: HashMap<&'static str, Category> = categories::get_categories();
}
/// A single Unicode code point, stored as its numeric value
///
/// Values built through [`CodePoint::new`] are never surrogates and never
/// exceed `0x10FFFF`.
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub struct CodePoint(u32);

impl CodePoint {
    /// Builds a code point from its numeric value
    ///
    /// # Panics
    ///
    /// Panics when `value` lies in the surrogate range `0xD800..=0xDFFF`
    /// or is greater than `0x10FFFF`.
    #[must_use]
    pub fn new(value: u32) -> CodePoint {
        let is_surrogate = (0xD800..=0xDFFF).contains(&value);
        let is_beyond_last_plane = value > 0x0010_FFFF;
        assert!(
            !is_surrogate && !is_beyond_last_plane,
            "The value is not a valid Unicode character code point"
        );
        CodePoint(value)
    }

    /// Returns the numeric value of this code point
    #[must_use]
    pub fn value(self) -> u32 {
        let CodePoint(raw) = self;
        raw
    }

    /// Tells whether this code point belongs to Unicode plane 0,
    /// i.e. whether its value fits in 16 bits
    #[must_use]
    pub fn is_plane0(self) -> bool {
        self.0 < 0x0001_0000
    }

    /// Encodes this code point as UTF-16
    ///
    /// A plane 0 code point yields `[value, 0]`: a single code unit followed
    /// by a zero filler. Any other code point yields its surrogate pair as
    /// `[lead, trail]`.
    #[must_use]
    pub fn get_utf16(self) -> [u16; 2] {
        match self.0 {
            // plane 0: the value is its own (single) code unit
            bmp @ 0..=0xFFFF => [bmp as u16, 0],
            // other planes: split the 20-bit offset into two 10-bit halves
            astral => {
                let offset = astral - 0x0001_0000;
                let high = 0xD800 + (offset >> 10);
                let low = 0xDC00 + (offset & 0x03FF);
                [high as u16, low as u16]
            }
        }
    }
}
/// Represents a range of Unicode characters
///
/// Both bounds are inclusive. A span whose `end` lies before its `begin`
/// contains no character and is reported as empty by [`Span::is_empty`].
#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
pub struct Span {
    /// Beginning of the range (included)
    pub begin: CodePoint,
    /// End of the range (included)
    pub end: CodePoint,
}

impl Span {
    /// Initializes this character span from the values of its two bounds
    ///
    /// No ordering is enforced between `begin` and `end`.
    ///
    /// # Panics
    ///
    /// Panics when `begin` or `end` is rejected by `CodePoint::new`.
    #[must_use]
    pub fn new(begin: u32, end: u32) -> Span {
        Span {
            begin: CodePoint::new(begin),
            end: CodePoint::new(end),
        }
    }

    /// Gets the range's length in number of characters
    ///
    /// Returns 0 when the span is empty (`end` before `begin`).
    #[must_use]
    pub fn len(self) -> u32 {
        if self.is_empty() {
            // `end - begin` would underflow for an inverted span
            return 0;
        }
        self.end.0 - self.begin.0 + 1
    }

    /// Gets whether the range is empty, i.e. whether `end` lies before `begin`
    #[must_use]
    pub fn is_empty(self) -> bool {
        self.end.0 < self.begin.0
    }

    /// Gets a value indicating whether this span is entirely in Unicode plane 0,
    /// i.e. whether both of its bounds are
    #[must_use]
    pub fn is_plane0(self) -> bool {
        self.begin.is_plane0() && self.end.is_plane0()
    }
}
/// A Unicode block: a name together with the span of characters it covers
#[derive(Debug, Clone)]
pub struct Block {
    /// Name of the block
    pub name: String,
    /// Span of characters covered by the block
    pub span: Span,
}

impl Block {
    /// Creates a block from a borrowed name and the bounds of its span
    ///
    /// The name is copied into a new `String`; `begin` and `end` are handed
    /// to `Span::new`.
    #[must_use]
    pub fn new(name: &str, begin: u32, end: u32) -> Block {
        Block::new_owned(String::from(name), begin, end)
    }

    /// Creates a block from an already owned name and the bounds of its span
    ///
    /// Same as [`Block::new`], except that the name is moved into the block
    /// instead of being copied.
    #[must_use]
    pub fn new_owned(name: String, begin: u32, end: u32) -> Block {
        let span = Span::new(begin, end);
        Block { name, span }
    }
}
/// Represents a Unicode category
///
/// A category is a name together with the list of character spans
/// that were added to it.
#[derive(Debug, Clone)]
pub struct Category {
    /// The category's name
    pub name: String,
    /// The list of character spans contained in this category
    pub spans: Vec<Span>,
}

impl Category {
    /// Initializes an empty category (no span) with the given name
    ///
    /// The name is copied, so any borrowed string is accepted,
    /// not only `'static` ones.
    #[must_use]
    pub fn new(name: &str) -> Category {
        Category::new_owned(name.to_string())
    }

    /// Initializes an empty category (no span), taking ownership of the name
    #[must_use]
    pub fn new_owned(name: String) -> Category {
        Category {
            name,
            spans: Vec::new(),
        }
    }

    /// Adds a span to this category
    ///
    /// The span is appended after the existing ones; no sorting, merging or
    /// de-duplication is performed.
    ///
    /// # Panics
    ///
    /// Panics when `Span::new` rejects `begin` or `end`.
    pub fn add_span(&mut self, begin: u32, end: u32) {
        self.spans.push(Span::new(begin, end));
    }

    /// Aggregates the specified category into this one
    ///
    /// All the spans of `category` are appended, in order, after the spans
    /// already present. `category` itself is left untouched and the name of
    /// this category does not change.
    pub fn aggregate(&mut self, category: &Category) {
        self.spans.extend_from_slice(&category.spans);
    }
}