1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
use std::{fmt, marker::PhantomData};

use crate::ParseError;

/// A lexed token.
///
/// Field `0` is the index of the pattern that produced the token and field `1`
/// is the slice of the input that the pattern matched.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct Token<'input>(pub usize, pub &'input str);

impl fmt::Display for Token<'_> {
    /// Writes only the matched text, delegating to `str`'s `Display` so that
    /// width, fill and precision flags on the formatter are honored.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Token(_, text) = self;
        <str as fmt::Display>::fmt(text, f)
    }
}

/// One compiled pattern together with its skip flag.
struct RegexEntry {
    /// The individually compiled pattern; used to find where a match ends.
    regex: regex::Regex,
    /// When `true`, text matched by this entry is consumed without yielding a token.
    skip: bool,
}

/// Holds the compiled patterns from which [`Matcher`]s are created.
pub struct MatcherBuilder {
    /// All patterns compiled together as a single set.
    regex_set: regex::RegexSet,
    /// The same patterns compiled one by one, in the same order as in `regex_set`,
    /// each paired with its skip flag.
    regex_vec: Vec<RegexEntry>,
}

impl MatcherBuilder {
    /// Compiles a sequence of `(pattern, skip)` pairs into a builder.
    ///
    /// Every pattern is compiled twice: once on its own (kept alongside its
    /// `skip` flag) and once as a member of a combined `RegexSet`. Both
    /// collections keep the order in which the pairs were supplied.
    ///
    /// # Errors
    ///
    /// Returns the error of the first pattern that fails to compile on its
    /// own; patterns after it are not pulled from `exprs`. If every pattern
    /// compiles individually, any error from building the set is returned.
    pub fn new<S>(
        exprs: impl IntoIterator<Item = (S, bool)>,
    ) -> Result<MatcherBuilder, regex::Error>
    where
        S: AsRef<str>,
    {
        let pairs = exprs.into_iter();
        let mut entries = Vec::with_capacity(pairs.size_hint().0);
        let mut failure = None;

        // The patterns are streamed into the set while the individual regexes
        // are compiled on the side; the stream is cut at the first bad pattern.
        let patterns = pairs.map_while(|(pattern, skip)| {
            match regex::Regex::new(pattern.as_ref()) {
                Ok(regex) => {
                    entries.push(RegexEntry { regex, skip });
                    Some(pattern)
                }
                Err(err) => {
                    failure = Some(err);
                    None
                }
            }
        });
        let set_result = regex::RegexSet::new(patterns);

        // An individual compile failure takes precedence over whatever the
        // (truncated) set construction produced.
        match failure {
            Some(err) => Err(err),
            None => Ok(MatcherBuilder {
                regex_set: set_result?,
                regex_vec: entries,
            }),
        }
    }

    /// Creates a [`Matcher`] that tokenizes `s` from its beginning, borrowing
    /// this builder's compiled patterns.
    pub fn matcher<'input, 'builder, E>(
        &'builder self,
        s: &'input str,
    ) -> Matcher<'input, 'builder, E> {
        let MatcherBuilder {
            regex_set,
            regex_vec,
        } = self;
        Matcher {
            text: s,
            consumed: 0,
            regex_set,
            regex_vec,
            _marker: PhantomData,
        }
    }
}

/// Iterator that splits an input string into tokens using the patterns of a
/// [`MatcherBuilder`].
pub struct Matcher<'input, 'builder, E> {
    /// The part of the input that has not been consumed yet.
    text: &'input str,
    /// Number of bytes consumed so far; reported as the start offset of the next token.
    consumed: usize,
    /// Combined pattern set, used to find out which patterns match `text`.
    regex_set: &'builder regex::RegexSet,
    /// Individually compiled patterns, indexed by their position in `regex_set`.
    regex_vec: &'builder Vec<RegexEntry>,
    /// `E` only appears in the error type of the yielded items; no value of it is stored.
    _marker: PhantomData<fn() -> E>,
}

impl<'input, 'builder, E> Iterator for Matcher<'input, 'builder, E> {
    type Item = Result<(usize, Token<'input>, usize), ParseError<usize, Token<'input>, E>>;

    /// Produces the next `(start, token, end)` triple, with offsets counted in
    /// bytes from the start of the original input.
    ///
    /// Returns `None` once the input is exhausted, and
    /// `ParseError::InvalidToken` at the current offset when no pattern
    /// matches the remaining text or when a skip pattern matches zero bytes.
    /// The position is not advanced on error.
    fn next(&mut self) -> Option<Self::Item> {
        let entries = self.regex_vec;

        while !self.text.is_empty() {
            let input = self.text;
            let start = self.consumed;

            let candidates = self.regex_set.matches(input);
            if !candidates.matched_any() {
                return Some(Err(ParseError::InvalidToken { location: start }));
            }

            // Keep the candidate whose match ends furthest into `input`; on a
            // tie the pattern with the higher index wins because of `>=`.
            // NOTE(review): the match end is used as the token length, which
            // assumes every pattern is anchored at the start of the input;
            // confirm against whatever generates the patterns.
            let (len, index, skip) = candidates.iter().fold((0, 0, false), |best, i| {
                let entry = &entries[i];
                // The set reported pattern `i` as matching, so the individual
                // regex is expected to find a match as well.
                let end = entry.regex.find(input).unwrap().end();
                if end >= best.0 {
                    (end, i, entry.skip)
                } else {
                    best
                }
            });

            let lexeme = &input[..len];
            let stop = start + len;
            self.text = &input[len..];
            self.consumed = stop;

            if !skip {
                return Some(Ok((start, Token(index, lexeme), stop)));
            }

            // A skip entry matched: drop the text and look for the next token.
            // A zero-length skip match would never advance, so it is reported
            // as an invalid token instead.
            if len == 0 {
                return Some(Err(ParseError::InvalidToken { location: start }));
            }
        }

        None
    }
}