1#[cfg(feature = "no_std")]
5use std::prelude::*;
6
7use std::borrow::Cow;
8use std::collections::HashMap;
9use std::fmt;
10use {ctype, PosixRegex};
11use tree::*;
12
13#[derive(Clone, Copy, PartialEq, Eq)]
15pub struct Range(pub u32, pub Option<u32>);
16impl fmt::Debug for Range {
17 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
18 match self {
19 Range(start, None) => write!(f, "{}..", start),
20 Range(start, Some(end)) => write!(f, "{}..{}", start, end),
21 }
22 }
23}
24
25#[derive(Clone, PartialEq, Eq)]
27pub enum Collation {
28 Char(u8),
29 Class(fn(u8) -> bool)
30}
31impl Collation {
32 pub fn matches(&self, other: u8, insensitive: bool) -> bool {
34 match *self {
35 Collation::Char(me) if insensitive => if ctype::is_alpha(me) && ctype::is_alpha(other) {
36 me | 32 == other | 32
37 } else {
38 me == other
39 },
40 Collation::Char(me) => me == other,
41 Collation::Class(f) => f(other)
42 }
43 }
44}
45impl fmt::Debug for Collation {
46 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
47 match *self {
48 Collation::Char(c) => write!(f, "{:?}", c as char),
49 Collation::Class(c) => write!(f, "{:p}", c),
50 }
51 }
52}
53
54#[derive(Clone, PartialEq, Eq)]
56pub enum Token {
57 InternalStart,
59
60 Alternative,
61 Any,
62 BackRef(u32),
63 Char(u8),
64 End,
65 Group(usize),
66 OneOf {
67 invert: bool,
68 list: Vec<Collation>
69 },
70 Root,
71 Start,
72 WordEnd,
73 WordStart
74}
75impl fmt::Debug for Token {
76 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
77 match *self {
78 Token::InternalStart => write!(f, "<START>"),
79
80 Token::Alternative => write!(f, "Alternative"),
81 Token::Any => write!(f, "."),
82 Token::BackRef(id) => write!(f, "\\{}", id),
83 Token::Char(c) => write!(f, "{:?}", c as char),
84 Token::End => write!(f, "$"),
85 Token::Group(id) => write!(f, "Group({})", id),
86 Token::OneOf { invert, ref list } => write!(f, "{{invert: {}, {:?}}}", invert, list),
87 Token::Root => write!(f, "Root"),
88 Token::Start => write!(f, "^"),
89 Token::WordEnd => write!(f, ">"),
90 Token::WordStart => write!(f, "<")
91 }
92 }
93}
94#[derive(Clone, Debug, PartialEq, Eq)]
96pub enum Error {
97 EOF,
98 EmptyRepetition,
99 Expected(u8, Option<u8>),
100 IllegalRange,
101 IntegerOverflow,
102 InvalidBackRef(u32),
103 LeadingRepetition,
104 UnclosedRepetition,
105 UnexpectedToken(u8),
106 UnknownClass(Vec<u8>),
107 UnknownCollation
108}
109
110pub struct PosixRegexBuilder<'a> {
112 input: &'a [u8],
113 classes: HashMap<&'a [u8], fn(u8) -> bool>,
114 group_id: usize,
115 builder: TreeBuilder
116}
117impl<'a> PosixRegexBuilder<'a> {
118 pub fn new(input: &'a [u8]) -> Self {
120 Self {
121 input,
122 classes: HashMap::new(),
123 group_id: 1,
124 builder: TreeBuilder::default()
125 }
126 }
127 pub fn with_class(mut self, name: &'a [u8], callback: fn(u8) -> bool) -> Self {
129 self.classes.insert(name, callback);
130 self
131 }
132 pub fn with_default_classes(mut self) -> Self {
134 #[cfg(not(feature = "no_std"))]
135 self.classes.reserve(12);
136
137 self.classes.insert(b"alnum", ctype::is_alnum);
138 self.classes.insert(b"alpha", ctype::is_alpha);
139 self.classes.insert(b"blank", ctype::is_blank);
140 self.classes.insert(b"cntrl", ctype::is_cntrl);
141 self.classes.insert(b"digit", ctype::is_digit);
142 self.classes.insert(b"graph", ctype::is_graph);
143 self.classes.insert(b"lower", ctype::is_lower);
144 self.classes.insert(b"print", ctype::is_print);
145 self.classes.insert(b"punct", ctype::is_punct);
146 self.classes.insert(b"space", ctype::is_space);
147 self.classes.insert(b"upper", ctype::is_upper);
148 self.classes.insert(b"xdigit", ctype::is_xdigit);
149
150 self
151 }
152 pub fn compile(self) -> Result<PosixRegex<'static>, Error> {
154 let tree = self.compile_tokens()?;
155 Ok(PosixRegex::new(Cow::Owned(tree)))
156 }
157 pub fn compile_tokens(mut self) -> Result<Tree, Error> {
158 self.builder.start_internal(Token::Root, Range(1, Some(1)));
159 self.parse()?;
160 self.builder.finish_internal();
161 Ok(self.builder.finish())
162 }
163
164 fn consume(&mut self, amount: usize) {
165 self.input = &self.input[amount..];
166 }
167 fn take_int(&mut self) -> Result<Option<u32>, Error> {
168 let mut out: Option<u32> = None;
169 while let Some(&c @ b'0'..=b'9') = self.input.first() {
170 self.consume(1);
171 out = Some(out.unwrap_or(0)
172 .checked_mul(10)
173 .and_then(|out| out.checked_add((c - b'0') as u32))
174 .ok_or(Error::IntegerOverflow)?);
175 }
176 Ok(out)
177 }
178 fn next(&mut self) -> Result<u8, Error> {
179 self.input.first()
180 .map(|&c| { self.consume(1); c })
181 .ok_or(Error::EOF)
182 }
183 fn expect(&mut self, c: u8) -> Result<(), Error> {
184 if self.input.first() != Some(&c) {
185 return Err(Error::Expected(c, self.input.first().cloned()));
186 }
187 self.consume(1);
188 Ok(())
189 }
190 fn parse_range(&mut self) -> Result<Range, Error> {
191 let mut range = Range(1, Some(1));
192 if let Some(&c) = self.input.first() {
193 let new = match c {
194 b'*' => Some((1, Range(0, None))),
195 b'\\' => match self.input.get(1) {
196 Some(b'?') => Some((2, Range(0, Some(1)))),
197 Some(b'+') => Some((2, Range(1, None))),
198 Some(b'{') => {
199 self.consume(2);
200 let first = self.take_int()?.ok_or(Error::EmptyRepetition)?;
201 let mut second = Some(first);
202 if let Some(b',') = self.input.first() {
203 self.consume(1);
204 second = self.take_int()?;
205 }
206 if self.input.first() == Some(&b'}') {
207 self.consume(1);
208 } else if self.input.starts_with(br"\}") {
209 self.consume(2);
210 } else {
211 return Err(Error::UnclosedRepetition);
212 }
213 if second.map(|second| first > second).unwrap_or(false) {
214 return Err(Error::IllegalRange);
215 }
216 range = Range(first, second);
217 None
218 },
219 _ => None
220 },
221 _ => None
222 };
223 if let Some((consume, new)) = new {
224 range = new;
225 self.consume(consume);
226 }
227 }
228 Ok(range)
229 }
230 fn parse(&mut self) -> Result<(), Error> {
231 self.builder.start_internal(Token::Alternative, Range(1, Some(1)));
232 while let Ok(c) = self.next() {
233 let token = match c {
234 b'^' => Token::Start,
235 b'$' => Token::End,
236 b'.' => Token::Any,
237 b'[' => {
238 let mut list = Vec::new();
239 let invert = self.input.first() == Some(&b'^');
240
241 if invert {
242 self.consume(1);
243 }
244
245 loop {
246 let mut c = self.next()?;
247
248 let mut push = true;
249
250 if c == b'[' {
251 match self.next()? {
255 b'.' => {
256 c = self.next()?;
257 self.expect(b'.')?;
258 self.expect(b']')?;
259 },
260 b'=' => {
261 c = self.next()?;
262 self.expect(b'=')?;
263 self.expect(b']')?;
264 },
265 b':' => {
266 let end = self.input.iter().position(|&c| c == b':').ok_or(Error::EOF)?;
267 let key = &self.input[..end];
268 let class = *self.classes.get(key).ok_or_else(|| Error::UnknownClass(key.to_vec()))?;
269 self.consume(end + 1);
270 self.expect(b']')?;
271
272 list.push(Collation::Class(class));
273 push = false;
274 },
275 _ => return Err(Error::UnknownCollation)
276 }
277 }
278
279 if push {
280 list.push(Collation::Char(c));
281
282 if self.input.first() == Some(&b'-') && self.input.get(1) != Some(&b']') {
283 self.consume(1);
284 let dest = self.next()?;
285 for c in (c+1)..=dest {
286 list.push(Collation::Char(c));
287 }
288 }
289 }
290
291 if self.input.first() == Some(&b']') {
292 self.consume(1);
293 break;
294 }
295 }
296
297 Token::OneOf {
298 invert,
299 list
300 }
301 },
302 b'\\' if self.input.first().map(|&c| (c as char).is_digit(10)).unwrap_or(false) => {
303 let id = self.take_int()?.unwrap();
304 if (id as usize) >= self.group_id {
305 return Err(Error::InvalidBackRef(id));
306 }
307 Token::BackRef(id)
308 }
309 b'\\' => match self.next()? {
310 b'(' => {
311 let id = self.group_id;
312 self.group_id += 1;
313 let checkpoint = self.builder.checkpoint();
314 self.parse()?;
315 let range = self.parse_range()?;
316 self.builder.start_internal_at(checkpoint, Token::Group(id), range);
317 self.builder.finish_internal();
318 continue;
319 },
320 b')' => break,
321 b'|' => {
322 self.builder.finish_internal();
323 self.builder.start_internal(Token::Alternative, Range(1, Some(1)));
324 continue;
325 },
326 b'<' => Token::WordStart,
327 b'>' => Token::WordEnd,
328 b'a' => Token::OneOf { invert: false, list: vec![Collation::Class(ctype::is_alnum)] },
329 b'd' => Token::OneOf { invert: false, list: vec![Collation::Class(ctype::is_digit)] },
330 b's' => Token::OneOf { invert: false, list: vec![Collation::Class(ctype::is_space)] },
331 b'S' => Token::OneOf { invert: true, list: vec![Collation::Class(ctype::is_space)] },
332 b'n' => Token::Char(b'\n'),
333 b'r' => Token::Char(b'\r'),
334 b't' => Token::Char(b'\t'),
335 c => Token::Char(c)
336 },
337 c => Token::Char(c)
338 };
339 let range = self.parse_range()?;
340 self.builder.leaf(token, range);
341 }
342 self.builder.finish_internal();
343 Ok(())
344 }
345}
346
347#[cfg(test)]
348mod tests {
349 use super::*;
350
351 fn compile(input: &[u8]) -> String {
352 format!(
353 "{:?}",
354 PosixRegexBuilder::new(input)
355 .with_default_classes()
356 .compile_tokens()
357 .expect("error compiling regex")
358 )
359 }
360
361 #[test]
362 fn basic() {
363 assert_eq!(
364 compile(b"abc"),
365 "\
366Root 1..1
367 Alternative 1..1
368 'a' 1..1
369 'b' 1..1
370 'c' 1..1
371"
372 );
373 }
374 #[test]
375 fn groups() {
376 assert_eq!(
377 compile(br"\(abc\|bcd\|cde\)"),
378 "\
379Root 1..1
380 Alternative 1..1
381 Group(1) 1..1
382 Alternative 1..1
383 'a' 1..1
384 'b' 1..1
385 'c' 1..1
386 Alternative 1..1
387 'b' 1..1
388 'c' 1..1
389 'd' 1..1
390 Alternative 1..1
391 'c' 1..1
392 'd' 1..1
393 'e' 1..1
394"
395 );
396 assert_eq!(
397 compile(br"\(abc\|\(bcd\|cde\)\)"),
398 "\
399Root 1..1
400 Alternative 1..1
401 Group(1) 1..1
402 Alternative 1..1
403 'a' 1..1
404 'b' 1..1
405 'c' 1..1
406 Alternative 1..1
407 Group(2) 1..1
408 Alternative 1..1
409 'b' 1..1
410 'c' 1..1
411 'd' 1..1
412 Alternative 1..1
413 'c' 1..1
414 'd' 1..1
415 'e' 1..1
416"
417 );
418 }
419 #[test]
420 fn words() {
421 assert_eq!(
422 compile(br"\<word\>"),
423 "\
424Root 1..1
425 Alternative 1..1
426 < 1..1
427 'w' 1..1
428 'o' 1..1
429 'r' 1..1
430 'd' 1..1
431 > 1..1
432"
433 );
434 }
435 #[test]
436 fn repetitions() {
437 assert_eq!(
438 compile(br"yeee*"),
439 "\
440Root 1..1
441 Alternative 1..1
442 'y' 1..1
443 'e' 1..1
444 'e' 1..1
445 'e' 0..
446"
447 );
448 assert_eq!(
449 compile(br"yee\?"),
450 "\
451Root 1..1
452 Alternative 1..1
453 'y' 1..1
454 'e' 1..1
455 'e' 0..1
456"
457 );
458 assert_eq!(
459 compile(br"yee\+"),
460 "\
461Root 1..1
462 Alternative 1..1
463 'y' 1..1
464 'e' 1..1
465 'e' 1..
466"
467 );
468 assert_eq!(
469 compile(br"ye\{2}"),
470 "\
471Root 1..1
472 Alternative 1..1
473 'y' 1..1
474 'e' 2..2
475"
476 );
477 assert_eq!(
478 compile(br"ye\{2,}"),
479 "\
480Root 1..1
481 Alternative 1..1
482 'y' 1..1
483 'e' 2..
484"
485 );
486 assert_eq!(
487 compile(br"ye\{2,3}"),
488 "\
489Root 1..1
490 Alternative 1..1
491 'y' 1..1
492 'e' 2..3
493"
494 );
495 }
496 #[test]
497 fn bracket() {
498 assert_eq!(
499 compile(b"[abc]"),
500 "\
501Root 1..1
502 Alternative 1..1
503 {invert: false, ['a', 'b', 'c']} 1..1
504"
505 );
506 assert_eq!(
507 compile(b"[^abc]"),
508 "\
509Root 1..1
510 Alternative 1..1
511 {invert: true, ['a', 'b', 'c']} 1..1
512"
513 );
514 assert_eq!(
515 compile(b"[]] [^]]"),
516 "\
517Root 1..1
518 Alternative 1..1
519 {invert: false, [']']} 1..1
520 ' ' 1..1
521 {invert: true, [']']} 1..1
522"
523 );
524 assert_eq!(
525 compile(b"[0-3] [a-c] [-1] [1-]"),
526 "\
527Root 1..1
528 Alternative 1..1
529 {invert: false, ['0', '1', '2', '3']} 1..1
530 ' ' 1..1
531 {invert: false, ['a', 'b', 'c']} 1..1
532 ' ' 1..1
533 {invert: false, ['-', '1']} 1..1
534 ' ' 1..1
535 {invert: false, ['1', '-']} 1..1
536"
537 );
538 assert_eq!(
539 compile(b"[[.-.]-/]"),
540 "\
541Root 1..1
542 Alternative 1..1
543 {invert: false, ['-', '.', '/']} 1..1
544"
545 );
546 assert_eq!(
547 compile(b"[[:digit:][:upper:]]"),
548 format!("\
549Root 1..1
550 Alternative 1..1
551 {{invert: false, [{:p}, {:p}]}} 1..1
552", ctype::is_digit as fn(u8) -> bool, ctype::is_upper as fn(u8) -> bool)
553 );
554 }
555 #[test]
556 fn newline() {
557 assert_eq!(
558 compile(br"\r\n"),
559 "\
560Root 1..1
561 Alternative 1..1
562 '\\r' 1..1
563 '\\n' 1..1
564"
565 );
566 }
567 #[test]
568 fn backref() {
569 assert_eq!(
570 compile(br"\([abc]\)\1"),
571 "\
572Root 1..1
573 Alternative 1..1
574 Group(1) 1..1
575 Alternative 1..1
576 {invert: false, ['a', 'b', 'c']} 1..1
577 \\1 1..1
578"
579 )
580 }
581}