1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133

//! # RSLexer #
//! 
//! Simple lexer written in Rust!

#![deny(missing_docs,
    missing_debug_implementations, missing_copy_implementations,
    trivial_casts, trivial_numeric_casts,
    unsafe_code, unstable_features,
    unused_import_braces, unused_qualifications)]

use regex::Regex;

/// Use the `rules!` macro to create Rules for the lexer!
/// 
/// Simple type alias around a `Vec<Rule<T>>`; this is what
/// `rules!` evaluates to and what `lex` consumes.
pub type Rules<T> = Vec<Rule<T>>;

/// Don't create a Rule yourself, use the 
/// `rules!` macro to create Rules for the lexer!
/// 
/// Internal struct that pairs a `Regex` pattern with a
/// `Fn(&str, usize, usize) -> Option<T>` callback that turns
/// a match into an optional token.
pub struct Rule<T> {
    /// The compiled `Regex` this rule matches against.
    pub r: Regex,
    /// Callback invoked as `(matched_text, line, column)`;
    /// returning `None` discards the match (e.g. for whitespace).
    pub f: Box<dyn Fn(&str, usize, usize) -> Option<T>>,
}

use std::fmt;
impl<T> fmt::Debug for Rule<T> {
    // Render as `Rule(<regex debug>)`; the boxed callback has no
    // useful `Debug` form, so only the pattern is shown.
    fn fmt(&self, out: &mut fmt::Formatter) -> fmt::Result {
        write!(out, "Rule({regex:?})", regex = self.r)
    }
}

/// `rules!` macro to create Rules for the lexer!
/// 
/// `rules!` expects a Type (Token) and a list of rules:
/// &str (regex) => |string, line, character| Option<Token>
/// 
/// A trailing comma after the last rule is allowed. The macro
/// expands to a `Rules<Token>` (i.e. `Vec<Rule<Token>>`).
/// 
/// # Panics
/// 
/// The expanded code panics at runtime if any pattern is not a
/// valid regex, reporting the offending pattern.
/// 
/// Example:
/// ```rs
/// enum Token {
///     Word(String, usize, usize),
/// }
/// 
/// rules!(Token;
///     r"\s+" => |_,_,_| None,
///     r"[^\s]+" => |s, l, c| Some(Token::Word(s.to_string(), l, c)),
/// )
/// 
/// ```
#[macro_export]
macro_rules! rules {
    // Trailing-comma form: forward to the base arm.
    ($t:ty; $($e:expr => $f:expr,)+) => { $crate::rules!($t; $($e => $f),+) };
    ($t:ty; $($e:expr => $f:expr),*) => {{
        use regex::Regex;
        // Use `$crate::Rule` so callers don't need to import `Rule`
        // themselves for the expansion to compile.
        let mut s: Vec<$crate::Rule<$t>> = Vec::new();
        $( 
            s.push($crate::Rule {
                r: Regex::new($e)
                    .unwrap_or_else(|err| panic!("invalid regex {:?}: {}", $e, err)),
                f: Box::new($f)
            });
        )*

        s
    }};
}

/// The main function of the lexer
/// 
/// `lex` expects the content you want to lex as a &str
/// and a Vec of Rules (use the `rules!` macro for that)!
/// 
/// Rules are tried in order at the current position; the first rule
/// whose regex matches at the start of the remaining input wins.
/// The callback receives the matched text plus the 1-based line and
/// column where the match starts; returning `None` discards the
/// match (useful for whitespace).
/// 
/// # Errors
/// 
/// Returns `Err` with a description of the remaining input when no
/// rule matches (with a non-empty match) at the current position.
/// 
/// Example:
/// ```rs
/// 
/// enum Token {
///     Word(String, usize, usize),
/// }
/// 
/// lex("test string \n new line!",
///     rules!(Token;
///         r"\s+" => |_,_,_| None,
///         r"[^\s]+" => |s, l, c| Some(Token::Word(s.to_string(), l, c)),
///     )
/// )
/// 
/// ```
pub fn lex<T>(content: &str, rules: Rules<T>) -> Result<Vec<T>, String> {

    // Byte offset into `content`; always lands on a char boundary
    // because regex match ends fall on char boundaries.
    let mut pos: usize = 0;
    // 1-based line/column of the next unconsumed character.
    let mut line: usize = 1;
    let mut character: usize = 1;

    let mut ts: Vec<T> = Vec::new();

    while let Some(c) = content.get(pos..) {
        if c.is_empty() { break; }

        let mut changed = false;
        for rule in &rules {

            if let Some(m) = rule.r.find(c) {
                // Only accept matches anchored at the current position.
                if m.start() != 0 { continue; }
                // A zero-length match would never advance `pos` and the
                // outer loop would spin forever; try the next rule.
                if m.end() == 0 { continue; }

                let mut s = m.as_str();
                if let Some(t) = (rule.f)(s, line, character) {
                    ts.push(t);
                }

                // Count characters (not bytes) so columns stay correct
                // for multi-byte UTF-8 input.
                character += s.chars().count();

                // Each newline inside the match bumps the line counter;
                // the column restarts just past the trailing segment.
                while let Some(i) = s.find('\n') {
                    line += 1;
                    s = s.get(i + 1..).unwrap();
                    character = s.chars().count() + 1;
                }

                pos += m.end();
                changed = true;
                break;
            }
        }
        if !changed {
            return Err(format!("No match for content: {:?}", c));
        }
    }

    Ok(ts)
}