unicode_clusters/
lib.rs

1extern crate unicode_segmentation;
2use unicode_segmentation::UnicodeSegmentation;
3use std::hash;
4
/// A single Unicode grapheme cluster stored inline as its UTF-8 bytes.
///
/// Each variant wraps a [`GCBytes`] whose const length equals the number of
/// bytes held, so the value is `Copy` and needs no heap allocation. Only
/// clusters of 1 to 6 bytes can be represented.
///
/// `PartialEq`, `Eq` and `Hash` are all derived so they are guaranteed to
/// agree with each other: two values are equal exactly when they are the same
/// variant holding the same bytes.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum GraphemeCluster {
	/// Cluster stored in one byte.
	B1(GCBytes<1>),
	/// Cluster stored in two bytes.
	B2(GCBytes<2>),
	/// Cluster stored in three bytes.
	B3(GCBytes<3>),
	/// Cluster stored in four bytes.
	B4(GCBytes<4>),
	/// Cluster stored in five bytes.
	B5(GCBytes<5>),
	/// Cluster stored in six bytes.
	B6(GCBytes<6>),
}

/// Fixed-size buffer holding exactly `N` bytes.
///
/// Equality is the derived field-wise comparison of the underlying array.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub struct GCBytes<const N: usize> {
	bytes: [u8; N],
}

impl<const N: usize> GCBytes<N> {
	/// Builds a buffer by copying `bytes` into an `N`-byte array.
	///
	/// # Panics
	///
	/// Panics if `bytes.len() != N`; the panic is raised by
	/// `copy_from_slice`, which requires both slices to have equal length.
	pub fn new(bytes: Vec<u8>) -> GCBytes<N> {
		let mut buffer = [0u8; N];
		buffer.copy_from_slice(&bytes);

		Self { bytes: buffer }
	}
}

impl<const N: usize> hash::Hash for GCBytes<N> {
	/// Feeds the byte array into the hasher, so equal buffers hash equally.
	fn hash<H: hash::Hasher>(&self, s: &mut H) {
		self.bytes.hash(s)
	}
}
52
53impl GraphemeCluster {
54	pub fn new(input: &str) -> Self {
55		let bytes = GraphemeCluster::to_vec(input);
56		let len = bytes.len();
57
58		let gc: GraphemeCluster = match len {
59			1 => GraphemeCluster::B1(GCBytes::<1>::new(bytes)),
60			2 => GraphemeCluster::B2(GCBytes::<2>::new(bytes)),
61			3 => GraphemeCluster::B3(GCBytes::<3>::new(bytes)),
62			4 => GraphemeCluster::B4(GCBytes::<4>::new(bytes)),
63			5 => GraphemeCluster::B5(GCBytes::<5>::new(bytes)),
64			6 => GraphemeCluster::B6(GCBytes::<6>::new(bytes)),
65			_ => panic!("length is too long for grapheme {}", len)
66		};
67
68		gc
69	}
70
71	pub fn as_bytes(&self) -> &[u8] {
72		match self {
73			GraphemeCluster::B1(gc_bytes) => &gc_bytes.bytes,
74			GraphemeCluster::B2(gc_bytes) => &gc_bytes.bytes,
75			GraphemeCluster::B3(gc_bytes) => &gc_bytes.bytes,
76			GraphemeCluster::B4(gc_bytes) => &gc_bytes.bytes,
77			GraphemeCluster::B5(gc_bytes) => &gc_bytes.bytes,
78			GraphemeCluster::B6(gc_bytes) => &gc_bytes.bytes,
79		}
80	}
81
82	fn to_vec(input: &str) -> Vec<u8> {
83		let gcs = UnicodeSegmentation::graphemes(input, true).collect::<Vec<&str>>();
84		let first_gc = gcs[0];
85		let bytes: Vec<u8> = first_gc.bytes().collect();
86
87		bytes
88	}
89
90	pub fn graphemes(input: &str) -> Vec<GraphemeCluster> {
91		UnicodeSegmentation::graphemes(input, true)
92			.map(|c| GraphemeCluster::new(c))
93			.collect::<Vec<GraphemeCluster>>()
94	}
95
96	pub fn to_string_lossy(self) -> String {
97		String::from_utf8_lossy(self.as_bytes()).to_string()
98	}
99}
100
101use std::fmt;
102impl fmt::Display for GraphemeCluster {
103	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
104		write!(f, "{}", self.to_string_lossy())
105	}
106}
107
#[test]
fn example() {
	// Sample mixing a 1-byte letter, a 2-byte letter and Devanagari text.
	let clusters = GraphemeCluster::graphemes("AȜनमस्ते");
	assert!(clusters.len() == 6, "length");

	// Textual form of every cluster, in order.
	let expected_text = ["A", "Ȝ", "न", "म", "स्", "ते"];
	for (cluster, text) in clusters.iter().zip(expected_text.iter()) {
		assert_eq!(cluster.to_string_lossy(), *text);
	}

	// Raw UTF-8 bytes of every cluster, in order.
	let expected_bytes: [&[u8]; 6] = [
		&[65],
		&[200, 156],
		&[224, 164, 168],
		&[224, 164, 174],
		&[224, 164, 184, 224, 165, 141],
		&[224, 164, 164, 224, 165, 135],
	];
	for (cluster, bytes) in clusters.iter().zip(expected_bytes.iter()) {
		assert_eq!(cluster.as_bytes(), *bytes);
	}
}
130
#[test]
fn it_works() {
	use std::collections::HashMap;

	// Byte extraction for a 1-byte and a 2-byte character.
	let one_byte = GraphemeCluster::to_vec("A");
	let expected_one: [u8; 1] = [65];
	assert_eq!(expected_one, one_byte[..], "{:?}", one_byte);

	let two_bytes = GraphemeCluster::to_vec("Ȝ");
	let expected_two: [u8; 2] = [200, 156];
	assert_eq!(expected_two, two_bytes[..], "{:?}", two_bytes);

	// `copy_from_slice` panics on a length mismatch, so these copies double
	// as checks that the vectors have exactly 1 and 2 bytes.
	let mut buf_one = [0u8; 1];
	buf_one.copy_from_slice(&one_byte);

	let mut buf_two = [0u8; 2];
	buf_two.copy_from_slice(&two_bytes);

	for sample in ["A", "Ȝ", "ते"].iter() {
		println!("{:?}", GraphemeCluster::new(sample));
	}
	let key = GraphemeCluster::new("ते");
	println!("{:?}", key.as_bytes());

	// The second `or_insert` must not overwrite the value stored by the first.
	let mut nodes: HashMap<GraphemeCluster, char> = HashMap::new();
	nodes.entry(key).or_insert('a');
	nodes.entry(key).or_insert('b');

	let found = nodes.get(&key);
	assert_eq!(found, Some(&'a'), "find inserted item first value");

	// A separately constructed but identical cluster must hit the same entry.
	let same_key = GraphemeCluster::new("ते");
	let found_again = nodes.get(&same_key);
	assert_eq!(found_again, Some(&'a'), "find existing item with duplicate key");

	// A different cluster must miss.
	let other_key = GraphemeCluster::new("Ȝ");
	println!("{:?}", other_key.as_bytes());
	let missing = nodes.get(&other_key);
	assert_eq!(missing, None, "don't find non-existing item");
}
171
#[test]
fn bytes_1() {
	// "A" must come back as a single byte.
	let encoded = GraphemeCluster::to_vec("A");
	let expected: [u8; 1] = [65];

	assert_eq!(expected, encoded[..], "{:?}", encoded);
}
180
#[test]
fn bytes_2() {
	// "Ȝ" must come back as two bytes.
	let encoded = GraphemeCluster::to_vec("Ȝ");
	let expected: [u8; 2] = [200, 156];

	assert_eq!(expected, encoded[..], "{:?}", encoded);
}
189
#[test]
fn bytes_3() {
	// "न" must come back as three bytes.
	let encoded = GraphemeCluster::to_vec("न");
	let expected: [u8; 3] = [224, 164, 168];

	assert_eq!(expected, encoded[..], "{:?}", encoded);
}
198
#[test]
fn bytes_4() {
	// "𐌰" must come back as four bytes.
	let encoded = GraphemeCluster::to_vec("𐌰");
	let expected: [u8; 4] = [240, 144, 140, 176];

	assert_eq!(expected, encoded[..], "{:?}", encoded);
}
207
#[test]
fn bytes_6() {
	// "स्" is two 3-byte code points and must come back as six bytes.
	let encoded = GraphemeCluster::to_vec("स्");
	let expected: [u8; 6] = [224, 164, 184, 224, 165, 141];

	assert_eq!(expected, encoded[..], "{:?}", encoded);
}
217
#[test]
fn unicode_compare() {
	// Clusters built independently from the same text must compare equal,
	// first for a 1-byte character and then for a 2-byte one.
	let (quote_a, quote_b) = (GraphemeCluster::new("\""), GraphemeCluster::new("\""));
	assert_eq!(quote_a, quote_b, "compare unicode chars");

	let (wide_a, wide_b) = (GraphemeCluster::new("Ȝ"), GraphemeCluster::new("Ȝ"));
	assert_eq!(wide_a, wide_b, "compare unicode chars wide");
}
225	let char2 = GraphemeCluster::new("Ȝ");
226	assert_eq!(char1, char2, "compare unicode chars wide");
227}