ripeg/charset/mod.rs
1// Copyright (C) 2022 Laurent Wandrebeck
2//
3// This file is part of ripeg.
4//
5// ripeg is free software: you can redistribute it and/or modify
6// it under the terms of the GNU General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// ripeg is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU General Public License for more details.
14//
15// You should have received a copy of the GNU General Public License
16// along with ripeg. If not, see <http://www.gnu.org/licenses/>.
17
18//! charset module provides data types and methods for managing sets of characters.
19
20use std::ops::Not;
21
22use bitvec::prelude::*;
23
24/// [`NormalSet`] structure represents a set of chars
25///
26/// 256 bits, one for each possible character value. Little endian (Lsb0)
27pub struct NormalSet {
28 /// 256 bits vector
29 pub bits: BitVec<u8, Lsb0>,
30}
31
32/// [`SmallSet`] structure represents only the ASCII set of chars
33///
34/// 128 bits, one for each possible ASCII character value. Little endian (Lsb0)
35pub struct SmallSet {
36 /// 128 bits vector
37 pub bits: BitVec<u8, Lsb0>,
38}
39
/// [`Set`] trait is the interface shared by [`NormalSet`] and [`SmallSet`].
pub trait Set {
    /// Instantiates a [`Set`] holding the characters listed in `chars`.
    fn new(chars: &[u8]) -> Self;
    /// Tells whether the [`Set`] contains the character `r`.
    fn has(&self, r: u8) -> bool;
    /// Returns the number of characters matched by the [`Set`], that is the number of bits
    /// with value 1 in its bit vector.
    fn size(&self) -> usize;
}
49
50/// Common methods between [`SmallSet`] and [`NormalSet`]
51impl Set for SmallSet {
52 /// Checks if a [`SmallSet`] contains a character
53 ///
54 /// # Examples
55 /// ```
56 /// use crate::ripeg::charset::Set;
57 /// use crate::ripeg::charset::SmallSet;
58 /// use bitvec::prelude::*;
59 /// let s = SmallSet::new(&[67u8, 68, 69]); // C, D, E ASCII decimal value
60 /// assert_eq!(s.has(66), false);
61 /// assert_eq!(s.has(67), true);
62 /// assert_eq!(s.has(250), false);
63 /// ```
64 fn has(&self, r: u8) -> bool {
65 if (r as usize) < self.bits.len() {
66 self.bits[r as usize]
67 } else {
68 false
69 }
70 }
71
72 /// Instanciate a [`SmallSet`] with a given charset
73 ///
74 /// # Examples
75 /// ```
76 /// use crate::ripeg::charset::Set;
77 /// use crate::ripeg::charset::SmallSet;
78 /// use bitvec::prelude::*;
79 /// let s = SmallSet::new(&[67u8, 68, 69]); // C, D, E ASCII decimal value
80 /// assert_eq!(s.has(66), false);
81 /// assert_eq!(s.has(67), true);
82 /// ```
83 fn new(chars: &[u8]) -> Self {
84 let mut s = Self {
85 bits: bitvec![u8, Lsb0; 0; 128],
86 };
87 for i in chars {
88 s.bits.set(*i as usize, true);
89 }
90 s
91 }
92
93 /// Count number of bits with value 1 in the bit vector which is the number of characters matched by a [`SmallSet`]
94 ///
95 /// # Examples
96 /// ```
97 /// use crate::ripeg::charset::Set;
98 /// use crate::ripeg::charset::SmallSet;
99 /// use bitvec::prelude::*;
100 /// let s = SmallSet::new(&[67u8, 68, 69]); // C, D, E ASCII decimal value
101 /// assert_eq!(s.size(), 3);
102 /// ```
103 fn size(&self) -> usize {
104 self.bits.count_ones()
105 }
106}
107
108/// Common methods between [`SmallSet`] and [`NormalSet`]
109impl Set for NormalSet {
110 /// Checks if a [`NormalSet`] contains a character
111 ///
112 /// # Examples
113 /// ```
114 /// use crate::ripeg::charset::Set;
115 /// use crate::ripeg::charset::NormalSet;
116 /// use bitvec::prelude::*;
117 /// let s = NormalSet::new(&[67u8, 68, 69, 247]); // C, D, E, ÷ ASCII decimal value
118 /// assert_eq!(s.has(66), false);
119 /// assert_eq!(s.has(67), true);
120 /// assert_eq!(s.has(246), false);
121 /// assert_eq!(s.has(247), true);
122 /// ```
123 fn has(&self, r: u8) -> bool {
124 self.bits[r as usize]
125 }
126
127 /// Instanciate [`NormalSet`] with a given charset
128 ///
129 /// # Examples
130 /// ```
131 /// use crate::ripeg::charset::Set;
132 /// use crate::ripeg::charset::NormalSet;
133 /// use bitvec::prelude::*;
134 /// let s = NormalSet::new(&[67u8, 68, 69, 247]); // C, D, E, ÷ ASCII decimal value
135 /// assert_eq!(s.has(66), false);
136 /// assert_eq!(s.has(67), true);
137 /// assert_eq!(s.has(246), false);
138 /// assert_eq!(s.has(247), true);
139 /// ```
140 fn new(chars: &[u8]) -> Self {
141 let mut s = Self {
142 bits: bitvec![u8, Lsb0; 0; 256],
143 };
144 for i in chars {
145 s.bits.set(*i as usize, true);
146 }
147 s
148 }
149
150 /// Count number of bits with value 1 in the bits vector which is the number of characters matched by [`NormalSet`]
151 ///
152 /// # Examples
153 /// ```
154 /// use crate::ripeg::charset::Set;
155 /// use crate::ripeg::charset::NormalSet;
156 /// use bitvec::prelude::*;
157 /// let s = NormalSet::new(&[67u8, 68, 69, 247]); // C, D, E, ÷ ASCII decimal value
158 /// assert_eq!(s.size(), 4);
159 /// ```
160 fn size(&self) -> usize {
161 self.bits.count_ones()
162 }
163}
164
165/// Implementations of functions reserved to [`NormalSet`]
166impl NormalSet {
167 /// Adds a [`NormalSet`] to the existing one (binary OR operation)
168 ///
169 /// # Examples
170 /// ```
171 /// use crate::ripeg::charset::Set;
172 /// use crate::ripeg::charset::NormalSet;
173 /// use bitvec::prelude::*;
174 /// let mut s = NormalSet::new(&[67u8, 68, 69, 247]); // C, D, E, ÷ ASCII decimal value
175 /// let s2 = NormalSet::new(&[65u8, 66, 247]); // A, B, ÷ ASCII decimal value
176 /// s.add(s2);
177 /// assert_eq!(s.size(), 6); // [65u8, 66, 67, 68, 69, 247]
178 /// ```
179 pub fn add(&mut self, s2: NormalSet) {
180 self.bits |= s2.bits;
181 }
182
183 /// Returns all non-matched characters of a [`NormalSet`]
184 ///
185 /// # Examples
186 /// ```
187 /// use crate::ripeg::charset::Set;
188 /// use crate::ripeg::charset::NormalSet;
189 /// use bitvec::prelude::*;
190 /// let s = NormalSet::new(&[67u8, 68, 69, 247]); // C, D, E, ÷ ASCII decimal value
191 /// let s2 = s.complement();
192 /// assert_eq!(s2.size(), 252); // 256-4
193 /// ```
194 pub fn complement(&self) -> NormalSet {
195 let mut s = Self {
196 bits: bitvec![u8, Lsb0; 0; 256],
197 };
198 //s.bits = !self.bits;
199 for i in 0usize..256 {
200 s.bits.set(i, self.bits.get(i).unwrap().not());
201 }
202 s
203 }
204
205 /// Returns true if [`NormalSet`] can be converted to a [`SmallSet`]
206 ///
207 /// # Examples
208 /// ```
209 /// use crate::ripeg::charset::Set;
210 /// use crate::ripeg::charset::NormalSet;
211 /// use bitvec::prelude::*;
212 /// let s = NormalSet::new(&[67u8, 68, 69, 247]); // C, D, E, ÷ ASCII decimal value
213 /// let s2 = NormalSet::new(&[65u8, 127]);
214 /// assert_eq!(s.is_small(), false);
215 /// assert_eq!(s2.is_small(), true);
216 /// ```
217 pub fn is_small(&self) -> bool {
218 self.bits[128..256].count_ones() == 0
219 }
220
221 /// Returns a [`NormalSet`] matching all characters between low and high inclusive
222 ///
223 /// # Examples
224 /// ```
225 /// use crate::ripeg::charset::Set;
226 /// use crate::ripeg::charset::NormalSet;
227 /// use bitvec::prelude::*;
228 /// let s = NormalSet::range(48, 57); // 0 to 9 in ASCII decimal value
229 /// assert_eq!(s.size(), 10);
230 /// assert_eq!(s.is_small(), true);
231 /// assert_eq!(s.has(47), false);
232 /// assert_eq!(s.has(48), true);
233 /// assert_eq!(s.has(52), true);
234 /// assert_eq!(s.has(57), true);
235 /// assert_eq!(s.has(58), false);
236 /// ```
237 pub fn range(low: u8, high: u8) -> NormalSet {
238 let mut s = Self {
239 bits: bitvec![u8, Lsb0; 0; 256],
240 };
241 for i in low..=high {
242 s.bits.set(i as usize, true);
243 }
244 s
245 }
246
247 /// [NormalSet::smallset()] method converts a [`NormalSet`] into a [`SmallSet`]
248 ///
249 /// # Examples
250 /// ```
251 /// use crate::ripeg::charset::Set;
252 /// use crate::ripeg::charset::{NormalSet, SmallSet};
253 /// use bitvec::prelude::*;
254 /// let s = NormalSet::range(120, 130);
255 /// let s2 = s.smallset();
256 /// assert_eq!(s.size(), 11);
257 /// assert_eq!(s.is_small(), false);
258 /// assert_eq!(s.has(119), false);
259 /// assert_eq!(s.has(120), true);
260 /// assert_eq!(s.has(130), true);
261 /// assert_eq!(s.has(131), false);
262 /// assert_eq!(s2.size(), 8); // 120..128
263 /// assert_eq!(s2.has(119), false);
264 /// assert_eq!(s2.has(120), true);
265 /// assert_eq!(s2.has(127), true);
266 /// assert_eq!(s2.has(128), false);
267 /// assert_eq!(s2.has(131), false);
268 /// ```
269 pub fn smallset(&self) -> SmallSet {
270 // 0..128
271 SmallSet {
272 bits: self.bits[0..128].to_bitvec(),
273 }
274 }
275
276 /// [NormalSet::string()] returns the string represention of the charset
277 ///
278 /// # Examples
279 /// ```
280 /// use crate::ripeg::charset::Set;
281 /// use crate::ripeg::charset::{NormalSet, SmallSet};
282 /// use bitvec::prelude::*;
283 /// let mut s = NormalSet::range(120, 130);
284 /// let s2 = NormalSet::range(110, 115);
285 /// s.add(s2);
286 /// let output = s.string();
287 /// assert_eq!(output, "{110..116,120..131}");
288 /// ```
289 pub fn string(&self) -> String {
290 let mut s = String::new();
291 let mut inrange = false;
292 for b in 0..=255 {
293 if self.has(b) && b == 255 {
294 s += "\u{00ff}"; // C1 command. Unicode internal stuff.
295 } else if self.has(b) && !inrange {
296 inrange = true;
297 if self.has(b + 1) {
298 s += &b.to_string();
299 s += "..";
300 }
301 } else if !self.has(b) && inrange {
302 inrange = false;
303 s += &b.to_string();
304 s += ",";
305 }
306 }
307 if !s.is_empty() && s.ends_with(',') {
308 s.pop();
309 }
310 s = "{".to_owned() + &s + "}";
311 s
312 }
313
314 /// [`NormalSet::sub()`] method substracts a [`NormalSet`] to the existing one (not operation)
315 ///
316 /// # Examples
317 /// ```
318 /// use crate::ripeg::charset::Set;
319 /// use crate::ripeg::charset::{NormalSet, SmallSet};
320 /// use bitvec::prelude::*;
321 /// let s = NormalSet::range(120, 130); // size 11
322 /// let s2 = NormalSet::range(125, 127); // size 3
323 /// let s3 = s.sub(s2); // 120..125, 128..131
324 /// assert_eq!(s3.size(), 11-3);
325 /// assert_eq!(s3.is_small(), false);
326 /// assert_eq!(s3.has(119), false);
327 /// assert_eq!(s3.has(120), true);
328 /// assert_eq!(s3.has(124), true);
329 /// assert_eq!(s3.has(125), false);
330 /// assert_eq!(s3.has(127), false);
331 /// assert_eq!(s3.has(128), true);
332 /// assert_eq!(s3.has(130), true);
333 /// assert_eq!(s3.has(131), false);
334 /// let s4 = NormalSet::range(110, 119);
335 /// let s5 = s.sub(s4);
336 /// assert_eq!(s5.size(), 11); // s4 contains characters not in s so nothing should change
337 /// assert_eq!(s5.is_small(), false);
338 /// assert_eq!(s5.has(109), false);
339 /// assert_eq!(s5.has(110), false);
340 /// assert_eq!(s5.has(119), false);
341 /// assert_eq!(s5.has(120), true);
342 /// ```
343 pub fn sub(&self, s2: NormalSet) -> NormalSet {
344 let mut s = Self {
345 bits: bitvec![u8, Lsb0; 0; 256],
346 };
347 s.bits = !s2.bits;
348 s.bits &= self.bits.to_bitvec();
349 s
350 }
351}