1extern crate unicode_segmentation;
2use unicode_segmentation::UnicodeSegmentation;
3use std::hash;
4
5#[derive(Debug)]
6#[derive(Copy, Clone, Eq, Hash)]
7pub enum GraphemeCluster {
8 B1(GCBytes<1>),
9 B2(GCBytes<2>),
10 B3(GCBytes<3>),
11 B4(GCBytes<4>),
12 B5(GCBytes<5>),
13 B6(GCBytes<6>),
14}
15
/// Fixed-size, inline storage for exactly `N` bytes.
///
/// Equality is a byte-wise comparison of the stored array, which is exactly
/// what the derived `PartialEq`/`Eq` implementations provide.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub struct GCBytes<const N: usize> {
    bytes: [u8; N],
}

impl<const N: usize> GCBytes<N> {
    /// Copies the contents of `bytes` into a new `GCBytes<N>`.
    ///
    /// # Panics
    ///
    /// Panics if `bytes.len()` is not exactly `N`.
    pub fn new(bytes: Vec<u8>) -> GCBytes<N> {
        // Check the length up front so a mismatch reports the expected and
        // actual sizes instead of a generic slice-copy failure.
        assert!(
            bytes.len() == N,
            "GCBytes<{}> requires exactly {} bytes, got {}",
            N,
            N,
            bytes.len()
        );

        let mut buffer = [0u8; N];
        buffer.copy_from_slice(&bytes);

        Self { bytes: buffer }
    }
}
40
41impl PartialEq<GraphemeCluster> for GraphemeCluster {
42 fn eq(&self, other: &GraphemeCluster) -> bool {
43 self.as_bytes() == other.as_bytes()
44 }
45}
46
47impl<const N: usize> hash::Hash for GCBytes<N> {
48 fn hash<H: hash::Hasher>(&self, s: &mut H) {
49 self.bytes.hash(s)
50 }
51}
52
53impl GraphemeCluster {
54 pub fn new(input: &str) -> Self {
55 let bytes = GraphemeCluster::to_vec(input);
56 let len = bytes.len();
57
58 let gc: GraphemeCluster = match len {
59 1 => GraphemeCluster::B1(GCBytes::<1>::new(bytes)),
60 2 => GraphemeCluster::B2(GCBytes::<2>::new(bytes)),
61 3 => GraphemeCluster::B3(GCBytes::<3>::new(bytes)),
62 4 => GraphemeCluster::B4(GCBytes::<4>::new(bytes)),
63 5 => GraphemeCluster::B5(GCBytes::<5>::new(bytes)),
64 6 => GraphemeCluster::B6(GCBytes::<6>::new(bytes)),
65 _ => panic!("length is too long for grapheme {}", len)
66 };
67
68 gc
69 }
70
71 pub fn as_bytes(&self) -> &[u8] {
72 match self {
73 GraphemeCluster::B1(gc_bytes) => &gc_bytes.bytes,
74 GraphemeCluster::B2(gc_bytes) => &gc_bytes.bytes,
75 GraphemeCluster::B3(gc_bytes) => &gc_bytes.bytes,
76 GraphemeCluster::B4(gc_bytes) => &gc_bytes.bytes,
77 GraphemeCluster::B5(gc_bytes) => &gc_bytes.bytes,
78 GraphemeCluster::B6(gc_bytes) => &gc_bytes.bytes,
79 }
80 }
81
82 fn to_vec(input: &str) -> Vec<u8> {
83 let gcs = UnicodeSegmentation::graphemes(input, true).collect::<Vec<&str>>();
84 let first_gc = gcs[0];
85 let bytes: Vec<u8> = first_gc.bytes().collect();
86
87 bytes
88 }
89
90 pub fn graphemes(input: &str) -> Vec<GraphemeCluster> {
91 UnicodeSegmentation::graphemes(input, true)
92 .map(|c| GraphemeCluster::new(c))
93 .collect::<Vec<GraphemeCluster>>()
94 }
95
96 pub fn to_string_lossy(self) -> String {
97 String::from_utf8_lossy(self.as_bytes()).to_string()
98 }
99}
100
101use std::fmt;
102impl fmt::Display for GraphemeCluster {
103 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
104 write!(f, "{}", self.to_string_lossy())
105 }
106}
107
/// Segments a mixed Latin/Devanagari string and checks every cluster, first
/// as text and then as raw bytes.
#[test]
fn example() {
    let clusters = GraphemeCluster::graphemes("AȜनमस्ते");
    assert!(clusters.len() == 6, "length");

    // Expected textual form of each cluster, in input order.
    let expected_text = ["A", "Ȝ", "न", "म", "स्", "ते"];
    for (cluster, text) in clusters.iter().zip(expected_text.iter()) {
        assert_eq!(cluster.to_string_lossy(), *text);
    }

    // Expected byte content of each cluster, in input order.
    let expected_bytes: [&[u8]; 6] = [
        &[65],
        &[200, 156],
        &[224, 164, 168],
        &[224, 164, 174],
        &[224, 164, 184, 224, 165, 141],
        &[224, 164, 164, 224, 165, 135],
    ];
    for (cluster, bytes) in clusters.iter().zip(expected_bytes.iter()) {
        assert_eq!(cluster.as_bytes(), *bytes);
    }
}
130
/// End-to-end smoke test: byte extraction, construction, debug printing and
/// use of `GraphemeCluster` as a `HashMap` key.
#[test]
fn it_works() {
    let ascii = GraphemeCluster::to_vec("A");
    assert_eq!([65], ascii[..], "{:?}", ascii);

    let wide = GraphemeCluster::to_vec("Ȝ");
    assert_eq!([200, 156], wide[..], "{:?}", wide);

    // `copy_from_slice` panics on a length mismatch, so these copies also
    // confirm the vectors are exactly one and two bytes long.
    let mut one_byte = [0u8; 1];
    one_byte.copy_from_slice(&ascii);

    let mut two_bytes = [0u8; 2];
    two_bytes.copy_from_slice(&wide);

    for sample in ["A", "Ȝ", "ते"].iter() {
        println!("{:?}", GraphemeCluster::new(sample));
    }
    let probe = GraphemeCluster::new("ते");
    println!("{:?}", probe.as_bytes());

    use std::collections::HashMap;
    let mut table: HashMap<GraphemeCluster, char> = HashMap::new();
    // The second insert must not overwrite the value stored by the first.
    table.entry(probe).or_insert('a');
    table.entry(probe).or_insert('b');

    assert_eq!(table.get(&probe), Some(&'a'), "find inserted item first value");

    let same_text = GraphemeCluster::new("ते");
    assert_eq!(table.get(&same_text), Some(&'a'), "find existing item with duplicate key");

    let absent = GraphemeCluster::new("Ȝ");
    println!("{:?}", absent.as_bytes());
    assert_eq!(table.get(&absent), None, "don't find non-existing item");
}
171
/// A one-byte cluster yields exactly its single byte.
#[test]
fn bytes_1() {
    let expected: [u8; 1] = [65];
    let actual = GraphemeCluster::to_vec("A");

    assert_eq!(expected, actual[..], "{:?}", actual);
}
180
/// A two-byte cluster yields both of its bytes.
#[test]
fn bytes_2() {
    let expected: [u8; 2] = [200, 156];
    let actual = GraphemeCluster::to_vec("Ȝ");

    assert_eq!(expected, actual[..], "{:?}", actual);
}
189
/// A three-byte cluster yields all three of its bytes.
#[test]
fn bytes_3() {
    let expected: [u8; 3] = [224, 164, 168];
    let actual = GraphemeCluster::to_vec("न");

    assert_eq!(expected, actual[..], "{:?}", actual);
}
198
/// A four-byte cluster yields all four of its bytes.
#[test]
fn bytes_4() {
    let expected: [u8; 4] = [240, 144, 140, 176];
    let actual = GraphemeCluster::to_vec("𐌰");

    assert_eq!(expected, actual[..], "{:?}", actual);
}
207
/// A six-byte cluster made of two three-byte code points yields all six bytes.
#[test]
fn bytes_6() {
    let expected: [u8; 6] = [224, 164, 184, 224, 165, 141];
    let actual = GraphemeCluster::to_vec("स्");

    assert_eq!(expected, actual[..], "{:?}", actual);
}
217
/// Clusters built independently from the same text compare equal.
#[test]
fn unicode_compare() {
    let cases = [
        ("\"", "compare unicode chars"),
        ("Ȝ", "compare unicode chars wide"),
    ];

    for (text, label) in cases.iter() {
        let left = GraphemeCluster::new(text);
        let right = GraphemeCluster::new(text);
        assert_eq!(left, right, "{}", label);
    }
}