1pub mod char;
2mod tokens;
3use std::ops::{Bound, RangeBounds};
4
5use char::{Char, INVALID_MIN, INVALID_SIZE};
6use irange::{integer::Bounded, RangeSet};
7use tokens::identify_character;
8
9pub use irange;
10
/// A set of characters that can be built from ranges, counted, and rendered
/// as a regular-expression fragment.
pub trait CharacterClass: Sized {
    /// Builds a class from a range of raw `u32` code points.
    ///
    /// Returns `None` when the implementation cannot turn the supplied bounds
    /// into characters.
    fn new_from_range_u32<R>(range: R) -> Option<Self>
    where
        R: RangeBounds<u32>;

    /// Builds a class from a range of `char`s.
    fn new_from_range_char<R>(range: R) -> Self
    where
        R: RangeBounds<char>;

    /// Returns the number of characters contained in the class.
    fn get_cardinality(&self) -> u32;

    /// Renders the class as a regular-expression string.
    fn to_regex(&self) -> String;
}
47
48impl CharacterClass for RangeSet<Char> {
49 #[inline]
59 fn new_from_range_u32<R: RangeBounds<u32>>(range: R) -> Option<Self> {
60 let min = to_lowerbound_u32(range.start_bound())?;
61 let max = to_upperbound_u32(range.end_bound())?;
62
63 Some(RangeSet::new_from_range(min..=max))
64 }
65
66 #[inline]
76 fn new_from_range_char<R: RangeBounds<char>>(range: R) -> Self {
77 let min = to_lowerbound_char(range.start_bound());
78 let max = to_upperbound_char(range.end_bound());
79
80 RangeSet::new_from_range(min..=max)
81 }
82
83 #[inline]
94 fn get_cardinality(&self) -> u32 {
95 let mut cardinality = 0;
96 for r in (0..self.0.len()).step_by(2) {
97 let mut minuhend = self.0[r + 1].to_u32();
98 if minuhend >= INVALID_MIN {
99 minuhend -= INVALID_SIZE;
100 }
101 let mut subtrahend = self.0[r].to_u32();
102 if subtrahend >= INVALID_MIN {
103 subtrahend -= INVALID_SIZE;
104 }
105 cardinality += minuhend - subtrahend + 1;
106 }
107 cardinality
108 }
109
110 #[inline]
128 fn to_regex(&self) -> String {
129 let range = self.clone();
130 if self.is_empty() {
131 String::from("[]")
132 } else if range.is_total() {
133 String::from(".")
134 } else if let Some(token) = tokens::identify_class(self) {
135 token.to_owned()
136 } else {
137 convert_to_regex(&range)
138 }
139 }
140}
141
142fn to_lowerbound_u32(bound: Bound<&u32>) -> Option<Char> {
143 match bound {
144 Bound::Included(t) => Char::from_u32(*t),
145 Bound::Excluded(t) => {
146 char::from_u32(*t)?;
147
148 if let Some(c) = Char::from_u32(*t + 1) {
149 Some(c)
150 } else {
151 Some(Char::new('\u{E000}'))
152 }
153 }
154 Bound::Unbounded => Some(Char::min_value()),
155 }
156}
157
158fn to_upperbound_u32(bound: Bound<&u32>) -> Option<Char> {
159 match bound {
160 Bound::Included(t) => Char::from_u32(*t),
161 Bound::Excluded(t) => {
162 char::from_u32(*t)?;
163
164 if let Some(c) = Char::from_u32(*t - 1) {
165 Some(c)
166 } else {
167 Some(Char::new('\u{D7FF}'))
168 }
169 }
170 Bound::Unbounded => Some(Char::min_value()),
171 }
172}
173
174fn to_lowerbound_char(bound: Bound<&char>) -> Char {
175 match bound {
176 Bound::Included(t) => Char::new(*t),
177 Bound::Excluded(t) => {
178 if let Some(c) = Char::from_u32(*t as u32 + 1) {
179 c
180 } else {
181 Char::new('\u{E000}')
182 }
183 }
184 Bound::Unbounded => Char::min_value(),
185 }
186}
187
188fn to_upperbound_char(bound: Bound<&char>) -> Char {
189 match bound {
190 Bound::Included(t) => Char::new(*t),
191 Bound::Excluded(t) => {
192 if let Some(c) = Char::from_u32(*t as u32 - 1) {
193 c
194 } else {
195 Char::new('\u{D7FF}')
196 }
197 }
198 Bound::Unbounded => Char::min_value(),
199 }
200}
201
202fn convert_to_regex(range: &RangeSet<Char>) -> String {
203 let mut sb = String::new();
204
205 let is_complement;
206 let range_to_use;
207 let complement = range.complement();
208 if complement.0.len() < range.0.len() {
209 range_to_use = ∁
210 is_complement = true;
211 } else {
212 range_to_use = range;
213 is_complement = false;
214 }
215
216 for r in (0..range_to_use.0.len()).step_by(2) {
217 let (min, max) = (range_to_use.0[r], range_to_use.0[r + 1]);
218 if min == max {
219 sb.push_str(get_printable_char(min.to_char()).as_str());
220 } else if min + Char::one() == max {
221 sb.push_str(
222 format!(
223 "{}{}",
224 get_printable_char(min.to_char()),
225 get_printable_char(max.to_char())
226 )
227 .as_str(),
228 );
229 } else {
230 sb.push_str(
231 format!(
232 "{}-{}",
233 get_printable_char(min.to_char()),
234 get_printable_char(max.to_char())
235 )
236 .as_str(),
237 );
238 }
239 }
240
241 if is_complement || range_to_use.0.len() > 2 || range_to_use.0[0] != range_to_use.0[1] {
242 if is_complement {
243 return format!("[^{}]", sb);
244 } else {
245 return format!("[{}]", sb);
246 }
247 }
248
249 sb
250}
251
252fn get_printable_char(character: char) -> String {
253 if ('\u{20}'..'\u{7E}').contains(&character) {
254 if character == '*'
255 || character == '+'
256 || character == '?'
257 || character == '('
258 || character == ')'
259 || character == '['
260 || character == ']'
261 || character == '{'
262 || character == '}'
263 || character == '|'
264 || character == '\\'
265 || character == '-'
266 || character == '^'
267 || character == '.'
268 {
269 format!("\\{}", character)
270 } else {
271 format!("{}", character)
272 }
273 } else if let Some(c) = identify_character(character) {
274 c.to_owned()
275 } else {
276 format!("\\u{{{:04x}}}", character as u32)
277 }
278}
279
#[cfg(test)]
mod tests {
    use irange::range::AnyRange;

    use super::*;

    /// The empty and the total class have fixed renderings and sizes.
    #[test]
    fn test_empty_and_total() -> Result<(), String> {
        let nothing = RangeSet::<Char>::empty();
        assert!(nothing.is_empty());
        assert_eq!("[]", nothing.to_regex());
        assert_eq!(0, nothing.get_cardinality());

        let everything = RangeSet::<Char>::total();
        assert!(everything.is_total());
        assert_eq!(".", everything.to_regex());
        assert_eq!(1_112_064, everything.get_cardinality());

        Ok(())
    }

    /// Construction, iteration, membership and intersection agree with each
    /// other.
    #[test]
    fn test_operations() -> Result<(), String> {
        let lowercase = RangeSet::new_from_range_char('a'..='z');
        assert_eq!("[a-z]", lowercase.to_regex());
        for c in lowercase.iter() {
            assert!(lowercase.contains(c));
        }

        // The half-open '0'..'2' keeps only '0' and '1'.
        let pieces = [
            AnyRange::from(Char::new('0')..Char::new('2')),
            AnyRange::from(Char::new('A')..=Char::new('F')),
            AnyRange::from(Char::new('a')..=Char::new('f')),
        ];
        let mixed = RangeSet::<Char>::new_from_ranges(&pieces);
        assert_eq!("[01A-Fa-f]", mixed.to_regex());
        for c in mixed.iter() {
            assert!(mixed.contains(c));
        }

        let common = lowercase.intersection(&mixed);
        assert_eq!("[a-f]", common.to_regex());
        for c in common.iter() {
            assert!(common.contains(c));
        }

        Ok(())
    }

    /// A single metacharacter is escaped, and a known class collapses to its
    /// token.
    #[test]
    fn test_to_regex() -> Result<(), String> {
        let dot = RangeSet::<Char>::new_from_range_char('.'..='.');
        assert_eq!("\\.", dot.to_regex());

        let pieces = [
            AnyRange::from(Char::new('0')..=Char::new('9')),
            AnyRange::from(Char::new('A')..=Char::new('F')),
            AnyRange::from(Char::new('a')..=Char::new('f')),
        ];
        let hex_digits = RangeSet::<Char>::new_from_ranges(&pieces);
        assert_eq!("\\p{ASCII_Hex_Digit}", hex_digits.to_regex());

        Ok(())
    }

    /// Serialising and deserialising a set gives back an equal set.
    #[test]
    #[cfg(feature = "serde")]
    fn test_serde() -> Result<(), String> {
        let samples = [
            RangeSet::<Char>::empty(),
            RangeSet::<Char>::total(),
            RangeSet::new_from_ranges(&[
                AnyRange::from(Char::new('3')..=Char::new('4')),
                AnyRange::from(Char::new('7')..Char::new('9')),
            ]),
        ];

        for range in &samples {
            let serialized = serde_json::to_string(range).unwrap();
            let unserialized: RangeSet<Char> = serde_json::from_str(&serialized).unwrap();
            assert_eq!(*range, unserialized);
        }

        Ok(())
    }
}