ptero/method/trailing_unicode/
character_sets.rs

/// Read access to a table of Unicode characters and the value <-> character
/// conversions built on top of it.
///
/// New sets only have to implement `get_set`, which provides the slice with
/// the Unicode characters used by the method; every other method has a default body.
pub trait GetCharacterSet {
    /// Returns the slice of Unicode characters that should be used by the method.
    ///
    /// Values are 1-based: value `0` stands for "no character" (see `get_character`), and the
    /// character stored at position `i` stands for value `i + 1`. To encode a whole number of
    /// bits the slice should therefore hold `2^n - 1` characters, so that together with `0`
    /// every `n`-bit value is covered.
    fn get_set(&self) -> &[char];

    /// Returns the number of characters in the set.
    fn size(&self) -> usize {
        self.get_set().len()
    }

    /// Maps a value to the character of the set that represents it.
    ///
    /// # Arguments
    ///
    /// * `index` - the value which will be mapped (1-based position of the character in the set)
    ///
    /// # Returns
    ///
    /// `None` for value `0`, otherwise a reference to the character stored at `index - 1`.
    ///
    /// # Examples
    /// ## Gets character which is in the set
    /// ```
    /// use ptero::method::trailing_unicode::character_sets::{CharacterSetType, GetCharacterSet};
    ///
    /// let set = CharacterSetType::FullUnicodeSet;
    ///
    /// assert_eq!(set.get_character(1), Some(&'\u{0020}'));
    /// assert_eq!(set.get_character(2), Some(&'\u{2000}'));
    /// assert_eq!(set.get_character(31), Some(&'\u{FEFF}'));
    /// ```
    /// ## Returns None if value cannot be mapped
    /// ```
    /// use ptero::method::trailing_unicode::character_sets::{CharacterSetType, GetCharacterSet};
    ///
    /// let set = CharacterSetType::FullUnicodeSet;
    ///
    /// assert_eq!(set.get_character(0), None);
    /// ```
    /// # Panics
    /// The method panics if the provided value is larger than the set size.
    /// ## Panics if index exceeds the size of the set
    /// ```should_panic
    /// use ptero::method::trailing_unicode::character_sets::{CharacterSetType, GetCharacterSet};
    ///
    /// let set = CharacterSetType::FullUnicodeSet;
    ///
    /// set.get_character(100);
    /// ```
    fn get_character(&self, index: u32) -> Option<&char> {
        match index as usize {
            // Value 0 is reserved for "nothing to emit".
            0 => None,
            oversized if oversized > self.size() => {
                panic!("Too large number for given unicode set - cannot encode this amount of bits")
            }
            // Values are 1-based, slice positions are 0-based.
            value => self.get_set().get(value - 1),
        }
    }

    /// Returns the number represented by the character.
    ///
    /// The number is the 1-based position of the character in the set, i.e. the inverse of
    /// `get_character`. A character that is not part of the set yields `0`.
    ///
    /// # Arguments
    ///
    /// * `chr` - character which will be converted
    ///
    /// # Examples
    /// ## Converts recognized character
    /// ```
    /// use ptero::method::trailing_unicode::character_sets::{CharacterSetType, GetCharacterSet};
    ///
    /// let set = CharacterSetType::FullUnicodeSet;
    /// let value = set.character_to_bits(&'\u{200A}');
    ///
    /// assert_eq!(value, 11);
    /// ```
    /// ## Converts unrecognized character to 0
    /// ```
    /// use ptero::method::trailing_unicode::character_sets::{CharacterSetType, GetCharacterSet};
    ///
    /// let set = CharacterSetType::FullUnicodeSet;
    /// let value = set.character_to_bits(&'A');
    ///
    /// assert_eq!(value, 0);
    /// ```
    fn character_to_bits(&self, chr: &char) -> u32 {
        self.get_set()
            .iter()
            .position(|candidate| candidate == chr)
            .map_or(0, |slot| (slot + 1) as u32)
    }
}
96
/// Largest pre-defined table: 31 Unicode whitespace and invisible special characters, ranging
/// from spaces of different widths to formatting characters and zero-width spaces.
///
/// The order matters: the position of a character in the array determines the value it encodes.
pub const FULL_UNICODE_CHARACTER_SET: [char; 31] = [
    // U+0020 followed by the U+2000..=U+200A range (U+2008 is not part of the set).
    '\u{0020}', '\u{2000}', '\u{2001}', '\u{2002}', '\u{2003}', '\u{2004}',
    '\u{2005}', '\u{2006}', '\u{2007}', '\u{2009}', '\u{200A}',
    // U+200B..=U+200E
    '\u{200B}', '\u{200C}', '\u{200D}', '\u{200E}',
    // Selection from U+2028..=U+202F
    '\u{2028}', '\u{202A}', '\u{202C}', '\u{202D}', '\u{202F}',
    // Selection from U+205F..=U+2069
    '\u{205F}', '\u{2060}', '\u{2061}', '\u{2062}', '\u{2063}', '\u{2064}',
    '\u{2066}', '\u{2068}', '\u{2069}',
    // U+3000 and U+FEFF
    '\u{3000}', '\u{FEFF}',
];
105
/// Table of 15 characters used to encode messages on Twitter.
///
/// It matches [`FOUR_BIT_CHARACTER_SET`] except for the last entry, which is `U+FEFF` here.
pub const TWITTER_UNICODE_CHARACTER_SET: [char; 15] = [
    '\u{0020}',
    '\u{2000}', '\u{2001}', '\u{2002}', '\u{2003}',
    '\u{2004}', '\u{2005}', '\u{2006}', '\u{2007}',
    '\u{2009}', '\u{200A}', '\u{200B}', '\u{200C}', '\u{200D}',
    '\u{FEFF}',
];
111
/// Pre-defined table of 15 characters giving a 4-bit encoding capacity.
pub const FOUR_BIT_CHARACTER_SET: [char; 15] = [
    '\u{0020}',
    '\u{2000}', '\u{2001}', '\u{2002}', '\u{2003}',
    '\u{2004}', '\u{2005}', '\u{2006}', '\u{2007}',
    '\u{2009}', '\u{200A}', '\u{200B}', '\u{200C}', '\u{200D}',
    '\u{200E}',
];
117
/// Pre-defined table of 7 characters giving a 3-bit encoding capacity.
pub const THREE_BIT_CHARACTER_SET: [char; 7] = [
    '\u{0020}',
    '\u{2000}', '\u{2001}', '\u{2002}',
    '\u{2003}', '\u{2004}', '\u{2005}',
];
122
/// Pre-defined table of 3 characters giving a 2-bit encoding capacity.
pub const TWO_BIT_CHARACTER_SET: [char; 3] = [
    '\u{0020}',
    '\u{2000}',
    '\u{2001}',
];
125
/// Enum representing the possible pre-defined character sets, e.g.
/// [`FULL_UNICODE_CHARACTER_SET`].
///
/// The type is a plain field-less enum, so besides `PartialEq` it also derives `Eq` and `Hash`,
/// which lets it be used as a key in hash-based collections and in `T: Eq` bounds.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CharacterSetType {
    /// Selects the 31-character [`FULL_UNICODE_CHARACTER_SET`].
    FullUnicodeSet,
    /// Selects the 15-character [`FOUR_BIT_CHARACTER_SET`].
    FourBitUnicodeSet,
    /// Selects the 7-character [`THREE_BIT_CHARACTER_SET`].
    ThreeBitUnicodeSet,
    /// Selects the 3-character [`TWO_BIT_CHARACTER_SET`].
    TwoBitUnicodeSet,
    /// Selects the 15-character [`TWITTER_UNICODE_CHARACTER_SET`].
    TwitterUnicodeSet,
}
135
136impl GetCharacterSet for CharacterSetType {
137    /// Returns pre-defined character sets based on enum value.
138    ///
139    /// # Examples
140    /// ## Get every character set
141    /// ```
142    /// use ptero::method::trailing_unicode::character_sets::{
143    ///    CharacterSetType, GetCharacterSet, FULL_UNICODE_CHARACTER_SET,
144    ///    THREE_BIT_CHARACTER_SET, TWO_BIT_CHARACTER_SET, FOUR_BIT_CHARACTER_SET,
145    ///    TWITTER_UNICODE_CHARACTER_SET
146    /// };
147    ///
148    ///
149    /// assert_eq!(CharacterSetType::FullUnicodeSet.get_set(), &FULL_UNICODE_CHARACTER_SET);
150    /// assert_eq!(CharacterSetType::FourBitUnicodeSet.get_set(), &FOUR_BIT_CHARACTER_SET);
151    /// assert_eq!(CharacterSetType::ThreeBitUnicodeSet.get_set(), &THREE_BIT_CHARACTER_SET);
152    /// assert_eq!(CharacterSetType::TwoBitUnicodeSet.get_set(), &TWO_BIT_CHARACTER_SET);
153    /// assert_eq!(CharacterSetType::TwitterUnicodeSet.get_set(), &TWITTER_UNICODE_CHARACTER_SET);
154    /// ```
155    fn get_set(&self) -> &[char] {
156        match *self {
157            CharacterSetType::FullUnicodeSet => &FULL_UNICODE_CHARACTER_SET,
158            CharacterSetType::FourBitUnicodeSet => &FOUR_BIT_CHARACTER_SET,
159            CharacterSetType::ThreeBitUnicodeSet => &THREE_BIT_CHARACTER_SET,
160            CharacterSetType::TwoBitUnicodeSet => &TWO_BIT_CHARACTER_SET,
161            CharacterSetType::TwitterUnicodeSet => &TWITTER_UNICODE_CHARACTER_SET,
162        }
163    }
164}