ripeg/charset/
mod.rs

1// Copyright (C) 2022 Laurent Wandrebeck
2//
3// This file is part of ripeg.
4//
5// ripeg is free software: you can redistribute it and/or modify
6// it under the terms of the GNU General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// ripeg is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13// GNU General Public License for more details.
14//
15// You should have received a copy of the GNU General Public License
16// along with ripeg.  If not, see <http://www.gnu.org/licenses/>.
17
18//! charset module provides data types and methods for managing sets of characters.
19
20use std::ops::Not;
21
22use bitvec::prelude::*;
23
24/// [`NormalSet`] structure represents a set of chars
25///
26/// 256 bits, one for each possible character value. Little endian (Lsb0)
27pub struct NormalSet {
28    /// 256 bits vector
29    pub bits: BitVec<u8, Lsb0>,
30}
31
32/// [`SmallSet`] structure represents only the ASCII set of chars
33///
34/// 128 bits, one for each possible ASCII character value. Little endian (Lsb0)
35pub struct SmallSet {
36    /// 128 bits vector
37    pub bits: BitVec<u8, Lsb0>,
38}
39
/// [`Set`] trait defines common methods available for [`NormalSet`] and [`SmallSet`]
///
/// The constructor is bound by `Self: Sized`, so the query methods (`has` and
/// `size`) remain callable through a `dyn Set` trait object.
pub trait Set {
    /// Checks if a [`Set`] contains a character
    fn has(&self, r: u8) -> bool;
    /// Instantiates a [`Set`] with a given charset
    ///
    /// Only available on concrete (sized) types: a constructor returning `Self`
    /// cannot be called through a trait object.
    fn new(chars: &[u8]) -> Self
    where
        Self: Sized;
    /// Counts number of bits with value 1 in the bit vector which is the number of characters matched by [`Set`]
    fn size(&self) -> usize;
}
49
50/// Common methods between [`SmallSet`] and [`NormalSet`]
51impl Set for SmallSet {
52    /// Checks if a [`SmallSet`] contains a character
53    ///
54    /// # Examples
55    /// ```
56    /// use crate::ripeg::charset::Set;
57    /// use crate::ripeg::charset::SmallSet;
58    /// use bitvec::prelude::*;
59    /// let s = SmallSet::new(&[67u8, 68, 69]); // C, D, E ASCII decimal value
60    /// assert_eq!(s.has(66), false);
61    /// assert_eq!(s.has(67), true);
62    /// assert_eq!(s.has(250), false);
63    /// ```
64    fn has(&self, r: u8) -> bool {
65        if (r as usize) < self.bits.len() {
66            self.bits[r as usize]
67        } else {
68            false
69        }
70    }
71
72    /// Instanciate a [`SmallSet`] with a given charset
73    ///
74    /// # Examples
75    /// ```
76    /// use crate::ripeg::charset::Set;
77    /// use crate::ripeg::charset::SmallSet;
78    /// use bitvec::prelude::*;
79    /// let s = SmallSet::new(&[67u8, 68, 69]); // C, D, E ASCII decimal value
80    /// assert_eq!(s.has(66), false);
81    /// assert_eq!(s.has(67), true);
82    /// ```
83    fn new(chars: &[u8]) -> Self {
84        let mut s = Self {
85            bits: bitvec![u8, Lsb0; 0; 128],
86        };
87        for i in chars {
88            s.bits.set(*i as usize, true);
89        }
90        s
91    }
92
93    /// Count number of bits with value 1 in the bit vector which is the number of characters matched by a [`SmallSet`]
94    ///
95    /// # Examples
96    /// ```
97    /// use crate::ripeg::charset::Set;
98    /// use crate::ripeg::charset::SmallSet;
99    /// use bitvec::prelude::*;
100    /// let s = SmallSet::new(&[67u8, 68, 69]); // C, D, E ASCII decimal value
101    /// assert_eq!(s.size(), 3);
102    /// ```
103    fn size(&self) -> usize {
104        self.bits.count_ones()
105    }
106}
107
108/// Common methods between [`SmallSet`] and [`NormalSet`]
109impl Set for NormalSet {
110    /// Checks if a [`NormalSet`] contains a character
111    ///
112    /// # Examples
113    /// ```
114    /// use crate::ripeg::charset::Set;
115    /// use crate::ripeg::charset::NormalSet;
116    /// use bitvec::prelude::*;
117    /// let s = NormalSet::new(&[67u8, 68, 69, 247]); // C, D, E, ÷ ASCII decimal value
118    /// assert_eq!(s.has(66), false);
119    /// assert_eq!(s.has(67), true);
120    /// assert_eq!(s.has(246), false);
121    /// assert_eq!(s.has(247), true);
122    /// ```
123    fn has(&self, r: u8) -> bool {
124        self.bits[r as usize]
125    }
126
127    /// Instanciate [`NormalSet`] with a given charset
128    ///
129    /// # Examples
130    /// ```
131    /// use crate::ripeg::charset::Set;
132    /// use crate::ripeg::charset::NormalSet;
133    /// use bitvec::prelude::*;
134    /// let s = NormalSet::new(&[67u8, 68, 69, 247]); // C, D, E, ÷ ASCII decimal value
135    /// assert_eq!(s.has(66), false);
136    /// assert_eq!(s.has(67), true);
137    /// assert_eq!(s.has(246), false);
138    /// assert_eq!(s.has(247), true);
139    /// ```
140    fn new(chars: &[u8]) -> Self {
141        let mut s = Self {
142            bits: bitvec![u8, Lsb0; 0; 256],
143        };
144        for i in chars {
145            s.bits.set(*i as usize, true);
146        }
147        s
148    }
149
150    /// Count number of bits with value 1 in the bits vector which is the number of characters matched by [`NormalSet`]
151    ///
152    /// # Examples
153    /// ```
154    /// use crate::ripeg::charset::Set;
155    /// use crate::ripeg::charset::NormalSet;
156    /// use bitvec::prelude::*;
157    /// let s = NormalSet::new(&[67u8, 68, 69, 247]); // C, D, E, ÷ ASCII decimal value
158    /// assert_eq!(s.size(), 4);
159    /// ```
160    fn size(&self) -> usize {
161        self.bits.count_ones()
162    }
163}
164
165/// Implementations of functions reserved to [`NormalSet`]
166impl NormalSet {
167    /// Adds a [`NormalSet`] to the existing one (binary OR operation)
168    ///
169    /// # Examples
170    /// ```
171    /// use crate::ripeg::charset::Set;
172    /// use crate::ripeg::charset::NormalSet;
173    /// use bitvec::prelude::*;
174    /// let mut s = NormalSet::new(&[67u8, 68, 69, 247]); // C, D, E, ÷ ASCII decimal value
175    /// let s2 = NormalSet::new(&[65u8, 66, 247]); // A, B, ÷ ASCII decimal value
176    /// s.add(s2);
177    /// assert_eq!(s.size(), 6); // [65u8, 66, 67, 68, 69, 247]
178    /// ```
179    pub fn add(&mut self, s2: NormalSet) {
180        self.bits |= s2.bits;
181    }
182
183    /// Returns all non-matched characters of a [`NormalSet`]
184    ///
185    /// # Examples
186    /// ```
187    /// use crate::ripeg::charset::Set;
188    /// use crate::ripeg::charset::NormalSet;
189    /// use bitvec::prelude::*;
190    /// let s = NormalSet::new(&[67u8, 68, 69, 247]); // C, D, E, ÷ ASCII decimal value
191    /// let s2 = s.complement();
192    /// assert_eq!(s2.size(), 252); // 256-4
193    /// ```
194    pub fn complement(&self) -> NormalSet {
195        let mut s = Self {
196            bits: bitvec![u8, Lsb0; 0; 256],
197        };
198        //s.bits = !self.bits;
199        for i in 0usize..256 {
200            s.bits.set(i, self.bits.get(i).unwrap().not());
201        }
202        s
203    }
204
205    /// Returns true if [`NormalSet`] can be converted to a [`SmallSet`]
206    ///
207    /// # Examples
208    /// ```
209    /// use crate::ripeg::charset::Set;
210    /// use crate::ripeg::charset::NormalSet;
211    /// use bitvec::prelude::*;
212    /// let s = NormalSet::new(&[67u8, 68, 69, 247]); // C, D, E, ÷ ASCII decimal value
213    /// let s2 = NormalSet::new(&[65u8, 127]);
214    /// assert_eq!(s.is_small(), false);
215    /// assert_eq!(s2.is_small(), true);
216    /// ```
217    pub fn is_small(&self) -> bool {
218        self.bits[128..256].count_ones() == 0
219    }
220
221    /// Returns a [`NormalSet`] matching all characters between low and high inclusive
222    ///
223    /// # Examples
224    /// ```
225    /// use crate::ripeg::charset::Set;
226    /// use crate::ripeg::charset::NormalSet;
227    /// use bitvec::prelude::*;
228    /// let s = NormalSet::range(48, 57); // 0 to 9 in ASCII decimal value
229    /// assert_eq!(s.size(), 10);
230    /// assert_eq!(s.is_small(), true);
231    /// assert_eq!(s.has(47), false);
232    /// assert_eq!(s.has(48), true);
233    /// assert_eq!(s.has(52), true);
234    /// assert_eq!(s.has(57), true);
235    /// assert_eq!(s.has(58), false);
236    /// ```
237    pub fn range(low: u8, high: u8) -> NormalSet {
238        let mut s = Self {
239            bits: bitvec![u8, Lsb0; 0; 256],
240        };
241        for i in low..=high {
242            s.bits.set(i as usize, true);
243        }
244        s
245    }
246
247    /// [NormalSet::smallset()] method converts a [`NormalSet`] into a [`SmallSet`]
248    ///
249    /// # Examples
250    /// ```
251    /// use crate::ripeg::charset::Set;
252    /// use crate::ripeg::charset::{NormalSet, SmallSet};
253    /// use bitvec::prelude::*;
254    /// let s = NormalSet::range(120, 130);
255    /// let s2 = s.smallset();
256    /// assert_eq!(s.size(), 11);
257    /// assert_eq!(s.is_small(), false);
258    /// assert_eq!(s.has(119), false);
259    /// assert_eq!(s.has(120), true);
260    /// assert_eq!(s.has(130), true);
261    /// assert_eq!(s.has(131), false);
262    /// assert_eq!(s2.size(), 8); // 120..128
263    /// assert_eq!(s2.has(119), false);
264    /// assert_eq!(s2.has(120), true);
265    /// assert_eq!(s2.has(127), true);
266    /// assert_eq!(s2.has(128), false);
267    /// assert_eq!(s2.has(131), false);
268    /// ```
269    pub fn smallset(&self) -> SmallSet {
270        // 0..128
271        SmallSet {
272            bits: self.bits[0..128].to_bitvec(),
273        }
274    }
275
276    /// [NormalSet::string()] returns the string represention of the charset
277    ///
278    /// # Examples
279    /// ```
280    /// use crate::ripeg::charset::Set;
281    /// use crate::ripeg::charset::{NormalSet, SmallSet};
282    /// use bitvec::prelude::*;
283    /// let mut s = NormalSet::range(120, 130);
284    /// let s2 = NormalSet::range(110, 115);
285    /// s.add(s2);
286    /// let output = s.string();
287    /// assert_eq!(output, "{110..116,120..131}");
288    /// ```
289    pub fn string(&self) -> String {
290        let mut s = String::new();
291        let mut inrange = false;
292        for b in 0..=255 {
293            if self.has(b) && b == 255 {
294                s += "\u{00ff}"; // C1 command. Unicode internal stuff.
295            } else if self.has(b) && !inrange {
296                inrange = true;
297                if self.has(b + 1) {
298                    s += &b.to_string();
299                    s += "..";
300                }
301            } else if !self.has(b) && inrange {
302                inrange = false;
303                s += &b.to_string();
304                s += ",";
305            }
306        }
307        if !s.is_empty() && s.ends_with(',') {
308            s.pop();
309        }
310        s = "{".to_owned() + &s + "}";
311        s
312    }
313
314    /// [`NormalSet::sub()`] method substracts a [`NormalSet`] to the existing one (not operation)
315    ///
316    /// # Examples
317    /// ```
318    /// use crate::ripeg::charset::Set;
319    /// use crate::ripeg::charset::{NormalSet, SmallSet};
320    /// use bitvec::prelude::*;
321    /// let s = NormalSet::range(120, 130); // size 11
322    /// let s2 = NormalSet::range(125, 127); // size 3
323    /// let s3 = s.sub(s2); // 120..125, 128..131
324    /// assert_eq!(s3.size(), 11-3);
325    /// assert_eq!(s3.is_small(), false);
326    /// assert_eq!(s3.has(119), false);
327    /// assert_eq!(s3.has(120), true);
328    /// assert_eq!(s3.has(124), true);
329    /// assert_eq!(s3.has(125), false);
330    /// assert_eq!(s3.has(127), false);
331    /// assert_eq!(s3.has(128), true);
332    /// assert_eq!(s3.has(130), true);
333    /// assert_eq!(s3.has(131), false);
334    /// let s4 = NormalSet::range(110, 119);
335    /// let s5 = s.sub(s4);
336    /// assert_eq!(s5.size(), 11); // s4 contains characters not in s so nothing should change
337    /// assert_eq!(s5.is_small(), false);
338    /// assert_eq!(s5.has(109), false);
339    /// assert_eq!(s5.has(110), false);
340    /// assert_eq!(s5.has(119), false);
341    /// assert_eq!(s5.has(120), true);
342    /// ```
343    pub fn sub(&self, s2: NormalSet) -> NormalSet {
344        let mut s = Self {
345            bits: bitvec![u8, Lsb0; 0; 256],
346        };
347        s.bits = !s2.bits;
348        s.bits &= self.bits.to_bitvec();
349        s
350    }
351}