ptero/method/trailing_unicode/character_sets.rs
/// Common interface for reading the Unicode character set used by the method.
///
/// Implementors only need to provide `get_set`, which exposes the characters of the set;
/// the value <-> character mapping is supplied by the default methods.
pub trait GetCharacterSet {
    /// Returns the slice of Unicode characters that the method may use.
    ///
    /// Value `0` never maps to a character (see `get_character`), so a set holding
    /// `2^n - 1` characters covers every non-zero `n`-bit value. The pre-defined sets
    /// in this file hold 3, 7, 15 or 31 characters.
    fn get_set(&self) -> &[char];

    /// Returns the number of characters in the set.
    fn size(&self) -> usize {
        self.get_set().len()
    }

    /// Maps a value to the character of the set that represents it.
    ///
    /// Values are 1-based: value `1` is the first character of the set, value `size()` the
    /// last one. Value `0` is not represented by any character.
    ///
    /// # Arguments
    ///
    /// * `index` - the value which will be mapped (i.e. 1-based position of the character in the set)
    ///
    /// # Returns
    ///
    /// `None` for value `0`, otherwise a reference to the matching character.
    ///
    /// # Examples
    /// ## Gets character which is in the set
    /// ```
    /// use ptero::method::trailing_unicode::character_sets::{CharacterSetType, GetCharacterSet};
    ///
    /// let set = CharacterSetType::FullUnicodeSet;
    ///
    /// assert_eq!(set.get_character(1), Some(&'\u{0020}'));
    /// assert_eq!(set.get_character(2), Some(&'\u{2000}'));
    /// assert_eq!(set.get_character(31), Some(&'\u{FEFF}'));
    /// ```
    /// ## Returns None if value cannot be mapped
    /// ```
    /// use ptero::method::trailing_unicode::character_sets::{CharacterSetType, GetCharacterSet};
    ///
    /// let set = CharacterSetType::FullUnicodeSet;
    ///
    /// assert_eq!(set.get_character(0), None);
    /// ```
    /// # Panics
    /// The method panics if the provided value is larger than the set size.
    /// ## Panics if index exceeds the size of the set
    /// ```should_panic
    /// use ptero::method::trailing_unicode::character_sets::{CharacterSetType, GetCharacterSet};
    ///
    /// let set = CharacterSetType::FullUnicodeSet;
    ///
    /// set.get_character(100);
    /// ```
    fn get_character(&self, index: u32) -> Option<&char> {
        match index as usize {
            // Zero is reserved: it has no character representation.
            0 => None,
            value if value > self.size() => {
                panic!("Too large number for given unicode set - cannot encode this amount of bits")
            }
            // Values are 1-based while the slice is 0-based.
            value => self.get_set().get(value - 1),
        }
    }

    /// Returns the value represented by the given character.
    ///
    /// The value is the 1-based position of the character within the set, i.e. the inverse of
    /// `get_character`. Characters that are not part of the set yield `0`.
    ///
    /// # Arguments
    ///
    /// * `chr` - character which will be converted
    ///
    /// # Examples
    /// ## Converts recognized character
    /// ```
    /// use ptero::method::trailing_unicode::character_sets::{CharacterSetType, GetCharacterSet};
    ///
    /// let set = CharacterSetType::FullUnicodeSet;
    /// let value = set.character_to_bits(&'\u{200A}');
    ///
    /// assert_eq!(value, 11);
    /// ```
    /// ## Converts unrecognized character to 0
    /// ```
    /// use ptero::method::trailing_unicode::character_sets::{CharacterSetType, GetCharacterSet};
    ///
    /// let set = CharacterSetType::FullUnicodeSet;
    /// let value = set.character_to_bits(&'A');
    ///
    /// assert_eq!(value, 0);
    /// ```
    fn character_to_bits(&self, chr: &char) -> u32 {
        self.get_set()
            .iter()
            .position(|candidate| candidate == chr)
            // Shift the 0-based slice offset to the 1-based value; 0 means "not in the set".
            .map_or(0, |offset| (offset + 1) as u32)
    }
}
96
/// Complete set of Unicode whitespace and invisible special characters used by the method,
/// ranging from spaces of different widths to formatting characters and zero-width characters.
///
/// The order matters: the position of a character (counted from 1) is the value it encodes.
pub const FULL_UNICODE_CHARACTER_SET: [char; 31] = [
    '\u{0020}', // SPACE
    '\u{2000}', // EN QUAD
    '\u{2001}', // EM QUAD
    '\u{2002}', // EN SPACE
    '\u{2003}', // EM SPACE
    '\u{2004}', // THREE-PER-EM SPACE
    '\u{2005}', // FOUR-PER-EM SPACE
    '\u{2006}', // SIX-PER-EM SPACE
    '\u{2007}', // FIGURE SPACE
    '\u{2009}', // THIN SPACE
    '\u{200A}', // HAIR SPACE
    '\u{200B}', // ZERO WIDTH SPACE
    '\u{200C}', // ZERO WIDTH NON-JOINER
    '\u{200D}', // ZERO WIDTH JOINER
    '\u{200E}', // LEFT-TO-RIGHT MARK
    '\u{2028}', // LINE SEPARATOR
    '\u{202A}', // LEFT-TO-RIGHT EMBEDDING
    '\u{202C}', // POP DIRECTIONAL FORMATTING
    '\u{202D}', // LEFT-TO-RIGHT OVERRIDE
    '\u{202F}', // NARROW NO-BREAK SPACE
    '\u{205F}', // MEDIUM MATHEMATICAL SPACE
    '\u{2060}', // WORD JOINER
    '\u{2061}', // FUNCTION APPLICATION
    '\u{2062}', // INVISIBLE TIMES
    '\u{2063}', // INVISIBLE SEPARATOR
    '\u{2064}', // INVISIBLE PLUS
    '\u{2066}', // LEFT-TO-RIGHT ISOLATE
    '\u{2068}', // FIRST STRONG ISOLATE
    '\u{2069}', // POP DIRECTIONAL ISOLATE
    '\u{3000}', // IDEOGRAPHIC SPACE
    '\u{FEFF}', // ZERO WIDTH NO-BREAK SPACE
];
105
/// Characters used to encode messages on Twitter.
///
/// Contains 15 characters; it differs from [FOUR_BIT_CHARACTER_SET] only in its last entry
/// (`U+FEFF` instead of `U+200E`).
pub const TWITTER_UNICODE_CHARACTER_SET: [char; 15] = [
    '\u{0020}', // SPACE
    '\u{2000}', // EN QUAD
    '\u{2001}', // EM QUAD
    '\u{2002}', // EN SPACE
    '\u{2003}', // EM SPACE
    '\u{2004}', // THREE-PER-EM SPACE
    '\u{2005}', // FOUR-PER-EM SPACE
    '\u{2006}', // SIX-PER-EM SPACE
    '\u{2007}', // FIGURE SPACE
    '\u{2009}', // THIN SPACE
    '\u{200A}', // HAIR SPACE
    '\u{200B}', // ZERO WIDTH SPACE
    '\u{200C}', // ZERO WIDTH NON-JOINER
    '\u{200D}', // ZERO WIDTH JOINER
    '\u{FEFF}', // ZERO WIDTH NO-BREAK SPACE
];
111
/// Pre-defined characters giving a 4-bit encoding capacity.
///
/// Contains 15 characters, one for each non-zero 4-bit value.
pub const FOUR_BIT_CHARACTER_SET: [char; 15] = [
    '\u{0020}', // SPACE
    '\u{2000}', // EN QUAD
    '\u{2001}', // EM QUAD
    '\u{2002}', // EN SPACE
    '\u{2003}', // EM SPACE
    '\u{2004}', // THREE-PER-EM SPACE
    '\u{2005}', // FOUR-PER-EM SPACE
    '\u{2006}', // SIX-PER-EM SPACE
    '\u{2007}', // FIGURE SPACE
    '\u{2009}', // THIN SPACE
    '\u{200A}', // HAIR SPACE
    '\u{200B}', // ZERO WIDTH SPACE
    '\u{200C}', // ZERO WIDTH NON-JOINER
    '\u{200D}', // ZERO WIDTH JOINER
    '\u{200E}', // LEFT-TO-RIGHT MARK
];
117
/// Pre-defined characters giving a 3-bit encoding capacity.
///
/// Contains 7 characters, one for each non-zero 3-bit value.
pub const THREE_BIT_CHARACTER_SET: [char; 7] = [
    '\u{0020}', // SPACE
    '\u{2000}', // EN QUAD
    '\u{2001}', // EM QUAD
    '\u{2002}', // EN SPACE
    '\u{2003}', // EM SPACE
    '\u{2004}', // THREE-PER-EM SPACE
    '\u{2005}', // FOUR-PER-EM SPACE
];
122
/// Pre-defined characters giving a 2-bit encoding capacity.
///
/// Contains 3 characters, one for each non-zero 2-bit value.
pub const TWO_BIT_CHARACTER_SET: [char; 3] = [
    '\u{0020}', // SPACE
    '\u{2000}', // EN QUAD
    '\u{2001}', // EM QUAD
];
125
/// Selects one of the pre-defined character sets, e.g. [FULL_UNICODE_CHARACTER_SET].
///
/// The enum is a plain, field-less selector, so besides being `Copy` and comparable it also
/// derives `Eq` and `Hash`, which allows it to be used as a `HashMap`/`HashSet` key and in
/// APIs bounded on `Eq`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CharacterSetType {
    /// Selects [FULL_UNICODE_CHARACTER_SET].
    FullUnicodeSet,
    /// Selects [FOUR_BIT_CHARACTER_SET].
    FourBitUnicodeSet,
    /// Selects [THREE_BIT_CHARACTER_SET].
    ThreeBitUnicodeSet,
    /// Selects [TWO_BIT_CHARACTER_SET].
    TwoBitUnicodeSet,
    /// Selects [TWITTER_UNICODE_CHARACTER_SET].
    TwitterUnicodeSet,
}
135
136impl GetCharacterSet for CharacterSetType {
137 /// Returns pre-defined character sets based on enum value.
138 ///
139 /// # Examples
140 /// ## Get every character set
141 /// ```
142 /// use ptero::method::trailing_unicode::character_sets::{
143 /// CharacterSetType, GetCharacterSet, FULL_UNICODE_CHARACTER_SET,
144 /// THREE_BIT_CHARACTER_SET, TWO_BIT_CHARACTER_SET, FOUR_BIT_CHARACTER_SET,
145 /// TWITTER_UNICODE_CHARACTER_SET
146 /// };
147 ///
148 ///
149 /// assert_eq!(CharacterSetType::FullUnicodeSet.get_set(), &FULL_UNICODE_CHARACTER_SET);
150 /// assert_eq!(CharacterSetType::FourBitUnicodeSet.get_set(), &FOUR_BIT_CHARACTER_SET);
151 /// assert_eq!(CharacterSetType::ThreeBitUnicodeSet.get_set(), &THREE_BIT_CHARACTER_SET);
152 /// assert_eq!(CharacterSetType::TwoBitUnicodeSet.get_set(), &TWO_BIT_CHARACTER_SET);
153 /// assert_eq!(CharacterSetType::TwitterUnicodeSet.get_set(), &TWITTER_UNICODE_CHARACTER_SET);
154 /// ```
155 fn get_set(&self) -> &[char] {
156 match *self {
157 CharacterSetType::FullUnicodeSet => &FULL_UNICODE_CHARACTER_SET,
158 CharacterSetType::FourBitUnicodeSet => &FOUR_BIT_CHARACTER_SET,
159 CharacterSetType::ThreeBitUnicodeSet => &THREE_BIT_CHARACTER_SET,
160 CharacterSetType::TwoBitUnicodeSet => &TWO_BIT_CHARACTER_SET,
161 CharacterSetType::TwitterUnicodeSet => &TWITTER_UNICODE_CHARACTER_SET,
162 }
163 }
164}