regex_charclass/
lib.rs

1pub mod char;
2mod tokens;
3use std::ops::{Bound, RangeBounds};
4
5use char::{Char, INVALID_MIN, INVALID_SIZE};
6use irange::{integer::Bounded, RangeSet};
7use tokens::identify_character;
8
9pub use irange;
10
/// Character-class operations for a `RangeSet<Char>`, i.e. a set of `char` ranges.
///
/// # Example:
///
/// ```
/// use regex_charclass::{irange::{RangeSet, range::AnyRange}, char::Char, CharacterClass};
///
/// let range1 = RangeSet::new_from_range_char('a'..='z');
/// assert_eq!(26, range1.get_cardinality());
/// assert_eq!("[a-z]", range1.to_regex());
///
/// let range2 = RangeSet::new_from_ranges(&[
///     AnyRange::from(Char::new('0')..=Char::new('9')),
///     AnyRange::from(Char::new('A')..=Char::new('F')),
///     AnyRange::from(Char::new('a')..=Char::new('f')),
/// ]);
/// assert_eq!("\\p{ASCII_Hex_Digit}", range2.to_regex());
///
/// let range2_complement = range2.complement();
/// assert_eq!("\\P{ASCII_Hex_Digit}", range2_complement.to_regex());
///
/// assert_eq!(".", range2.union(&range2_complement).to_regex());
/// assert_eq!("[]", range2.intersection(&range2_complement).to_regex());
///
/// assert_eq!("[g-z]", range1.difference(&range2).to_regex());
/// ```
pub trait CharacterClass: Sized {
    /// Builds an instance from a range of `u32` character codes.
    ///
    /// Returns `None` if the `char` codes are invalid.
    fn new_from_range_u32<R>(range: R) -> Option<Self>
    where
        R: RangeBounds<u32>;

    /// Builds an instance from a range of `char`.
    fn new_from_range_char<R>(range: R) -> Self
    where
        R: RangeBounds<char>;

    /// Returns the number of possible `char` contained.
    fn get_cardinality(&self) -> u32;

    /// Returns a regular expression character class describing the set.
    fn to_regex(&self) -> String;
}
47
48impl CharacterClass for RangeSet<Char> {
49    /// Create a new instance from the given range of `u32`, return `None` if the `char` codes are invalid.
50    ///
51    /// # Example:
52    ///
53    /// ```
54    /// use regex_charclass::{irange::RangeSet, CharacterClass};
55    ///  
56    /// let range = RangeSet::new_from_range_u32(97..=122);
57    /// ```
58    #[inline]
59    fn new_from_range_u32<R: RangeBounds<u32>>(range: R) -> Option<Self> {
60        let min = to_lowerbound_u32(range.start_bound())?;
61        let max = to_upperbound_u32(range.end_bound())?;
62
63        Some(RangeSet::new_from_range(min..=max))
64    }
65
66    /// Create a new instance from the given range of `char`.
67    ///
68    /// # Example:
69    ///
70    /// ```
71    /// use regex_charclass::{irange::RangeSet, CharacterClass};
72    ///  
73    /// let range = RangeSet::new_from_range_char('a'..='z');
74    /// ```
75    #[inline]
76    fn new_from_range_char<R: RangeBounds<char>>(range: R) -> Self {
77        let min = to_lowerbound_char(range.start_bound());
78        let max = to_upperbound_char(range.end_bound());
79
80        RangeSet::new_from_range(min..=max)
81    }
82
83    /// Return the number of possible `char` contained.
84    ///
85    /// # Example:
86    ///
87    /// ```
88    /// use regex_charclass::{char::Char, irange::RangeSet, CharacterClass};
89    ///  
90    /// let range = RangeSet::new_from_range_char('a'..='z');
91    /// assert_eq!(26, range.get_cardinality());
92    /// ```
93    #[inline]
94    fn get_cardinality(&self) -> u32 {
95        let mut cardinality = 0;
96        for r in (0..self.0.len()).step_by(2) {
97            let mut minuhend = self.0[r + 1].to_u32();
98            if minuhend >= INVALID_MIN {
99                minuhend -= INVALID_SIZE;
100            }
101            let mut subtrahend = self.0[r].to_u32();
102            if subtrahend >= INVALID_MIN {
103                subtrahend -= INVALID_SIZE;
104            }
105            cardinality += minuhend - subtrahend + 1;
106        }
107        cardinality
108    }
109
110    /// Return a valid regular expression character class.
111    ///
112    /// # Example:
113    ///
114    /// ```
115    /// use regex_charclass::{irange::{RangeSet, range::AnyRange}, char::Char, CharacterClass};
116    ///  
117    /// let range = RangeSet::new_from_range_char('a'..='z');
118    /// assert_eq!("[a-z]", range.to_regex());
119    ///
120    /// let range = RangeSet::<Char>::new_from_ranges(&[
121    ///     AnyRange::from(Char::new('0')..=Char::new('9')),
122    ///     AnyRange::from(Char::new('A')..=Char::new('F')),
123    ///     AnyRange::from(Char::new('a')..=Char::new('f')),
124    /// ]);
125    /// assert_eq!("\\p{ASCII_Hex_Digit}", range.to_regex());
126    /// ```
127    #[inline]
128    fn to_regex(&self) -> String {
129        let range = self.clone();
130        if self.is_empty() {
131            String::from("[]")
132        } else if range.is_total() {
133            String::from(".")
134        } else if let Some(token) = tokens::identify_class(self) {
135            token.to_owned()
136        } else {
137            convert_to_regex(&range)
138        }
139    }
140}
141
142fn to_lowerbound_u32(bound: Bound<&u32>) -> Option<Char> {
143    match bound {
144        Bound::Included(t) => Char::from_u32(*t),
145        Bound::Excluded(t) => {
146            char::from_u32(*t)?;
147
148            if let Some(c) = Char::from_u32(*t + 1) {
149                Some(c)
150            } else {
151                Some(Char::new('\u{E000}'))
152            }
153        }
154        Bound::Unbounded => Some(Char::min_value()),
155    }
156}
157
158fn to_upperbound_u32(bound: Bound<&u32>) -> Option<Char> {
159    match bound {
160        Bound::Included(t) => Char::from_u32(*t),
161        Bound::Excluded(t) => {
162            char::from_u32(*t)?;
163
164            if let Some(c) = Char::from_u32(*t - 1) {
165                Some(c)
166            } else {
167                Some(Char::new('\u{D7FF}'))
168            }
169        }
170        Bound::Unbounded => Some(Char::min_value()),
171    }
172}
173
174fn to_lowerbound_char(bound: Bound<&char>) -> Char {
175    match bound {
176        Bound::Included(t) => Char::new(*t),
177        Bound::Excluded(t) => {
178            if let Some(c) = Char::from_u32(*t as u32 + 1) {
179                c
180            } else {
181                Char::new('\u{E000}')
182            }
183        }
184        Bound::Unbounded => Char::min_value(),
185    }
186}
187
188fn to_upperbound_char(bound: Bound<&char>) -> Char {
189    match bound {
190        Bound::Included(t) => Char::new(*t),
191        Bound::Excluded(t) => {
192            if let Some(c) = Char::from_u32(*t as u32 - 1) {
193                c
194            } else {
195                Char::new('\u{D7FF}')
196            }
197        }
198        Bound::Unbounded => Char::min_value(),
199    }
200}
201
202fn convert_to_regex(range: &RangeSet<Char>) -> String {
203    let mut sb = String::new();
204
205    let is_complement;
206    let range_to_use;
207    let complement = range.complement();
208    if complement.0.len() < range.0.len() {
209        range_to_use = &complement;
210        is_complement = true;
211    } else {
212        range_to_use = range;
213        is_complement = false;
214    }
215
216    for r in (0..range_to_use.0.len()).step_by(2) {
217        let (min, max) = (range_to_use.0[r], range_to_use.0[r + 1]);
218        if min == max {
219            sb.push_str(get_printable_char(min.to_char()).as_str());
220        } else if min + Char::one() == max {
221            sb.push_str(
222                format!(
223                    "{}{}",
224                    get_printable_char(min.to_char()),
225                    get_printable_char(max.to_char())
226                )
227                .as_str(),
228            );
229        } else {
230            sb.push_str(
231                format!(
232                    "{}-{}",
233                    get_printable_char(min.to_char()),
234                    get_printable_char(max.to_char())
235                )
236                .as_str(),
237            );
238        }
239    }
240
241    if is_complement || range_to_use.0.len() > 2 || range_to_use.0[0] != range_to_use.0[1] {
242        if is_complement {
243            return format!("[^{}]", sb);
244        } else {
245            return format!("[{}]", sb);
246        }
247    }
248
249    sb
250}
251
252fn get_printable_char(character: char) -> String {
253    if ('\u{20}'..'\u{7E}').contains(&character) {
254        if character == '*'
255            || character == '+'
256            || character == '?'
257            || character == '('
258            || character == ')'
259            || character == '['
260            || character == ']'
261            || character == '{'
262            || character == '}'
263            || character == '|'
264            || character == '\\'
265            || character == '-'
266            || character == '^'
267            || character == '.'
268        {
269            format!("\\{}", character)
270        } else {
271            format!("{}", character)
272        }
273    } else if let Some(c) = identify_character(character) {
274        c.to_owned()
275    } else {
276        format!("\\u{{{:04x}}}", character as u32)
277    }
278}
279
#[cfg(test)]
mod tests {
    use irange::range::AnyRange;

    use super::*;

    /// The empty and the total set have dedicated renderings and known cardinalities.
    #[test]
    fn test_empty_and_total() {
        let empty = RangeSet::<Char>::empty();
        assert!(empty.is_empty());
        assert_eq!("[]", empty.to_regex());
        assert_eq!(0, empty.get_cardinality());

        let total = RangeSet::<Char>::total();
        assert!(total.is_total());
        assert_eq!(".", total.to_regex());
        assert_eq!(1_112_064, total.get_cardinality());
    }

    /// Rendering, iteration/membership and intersection agree with each other.
    #[test]
    fn test_operations() {
        let lowercase = RangeSet::new_from_range_char('a'..='z');
        assert_eq!("[a-z]", lowercase.to_regex());

        for c in lowercase.iter() {
            assert!(lowercase.contains(c))
        }

        let mixed = RangeSet::<Char>::new_from_ranges(&[
            AnyRange::from(Char::new('0')..Char::new('2')),
            AnyRange::from(Char::new('A')..=Char::new('F')),
            AnyRange::from(Char::new('a')..=Char::new('f')),
        ]);
        assert_eq!("[01A-Fa-f]", mixed.to_regex());

        for c in mixed.iter() {
            assert!(mixed.contains(c))
        }

        let common = lowercase.intersection(&mixed);
        assert_eq!("[a-f]", common.to_regex());

        for c in common.iter() {
            assert!(common.contains(c))
        }
    }

    /// A lone metacharacter is escaped; a known class is rendered as its token.
    #[test]
    fn test_to_regex() {
        let dot = RangeSet::<Char>::new_from_range_char('.'..='.');
        assert_eq!("\\.", dot.to_regex());

        let hex_digits = RangeSet::<Char>::new_from_ranges(&[
            AnyRange::from(Char::new('0')..=Char::new('9')),
            AnyRange::from(Char::new('A')..=Char::new('F')),
            AnyRange::from(Char::new('a')..=Char::new('f')),
        ]);
        assert_eq!("\\p{ASCII_Hex_Digit}", hex_digits.to_regex());
    }

    /// Serializing to JSON and back yields an equal set.
    #[test]
    #[cfg(feature = "serde")]
    fn test_serde() {
        let samples = [
            RangeSet::<Char>::empty(),
            RangeSet::<Char>::total(),
            RangeSet::new_from_ranges(&[
                AnyRange::from(Char::new('3')..=Char::new('4')),
                AnyRange::from(Char::new('7')..Char::new('9')),
            ]),
        ];

        for range in &samples {
            let serialized = serde_json::to_string(range).unwrap();
            let unserialized: RangeSet<Char> = serde_json::from_str(&serialized).unwrap();
            assert_eq!(*range, unserialized);
        }
    }
}