1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
//
// lexer.rs
// The PHiLe Compiler
//
// Created by Arpad Goretity (H2CO3)
// on 07/04/2017
//

//! This module contains type definitions and functions for
//! breaking up unstructured source text into lexemes and tokens.
//! It also provides types for mapping tokens to their original
//! location within the source code in a human-readable manner.

use std::fmt::{ self, Display, Formatter };
use regex::Regex;
use error::{ Error, Result };
use util::{ grapheme_count, grapheme_count_by };


/// Represents the location of a single extended grapheme cluster
/// in the sources fed to the lexer.
///
/// The derived `PartialOrd`/`Ord` compare fields in declaration
/// order: first by source index, then by line, then by column.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Location {
    /// 0-based index of the source that this location points into.
    pub src_idx: usize,
    /// 1-based line index within the aforementioned source.
    pub line: usize,
    /// 1-based character index within the line.
    pub column: usize,
}

/// A half-open range representing a source span: `start` is
/// included in the span, `end` is one past its last grapheme.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Range {
    /// Location at the beginning of the source range.
    pub start: Location,
    /// Location one past the end of the source range.
    pub end: Location,
}

/// This trait is to be implemented by entities that correspond
/// to some range in the source. This is used for generating
/// location information in user-visible error messages.
/// Implemented in this module by `Range` itself and by `Token`.
pub trait Ranged {
    /// Returns the range `self` was generated from.
    fn range(&self) -> Range;
}

/// Describes the type of a single token or lexeme.
///
/// The lexer tries the patterns for these kinds in a fixed
/// order; see the regex table constructed in `Lexer::new()`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum TokenKind {
    /// Horizontal and vertical (e.g. newline) whitespace. Unicode-aware.
    Whitespace,
    /// Currently, a line comment, beginning with '#' and ending with vertical whitespace or end-of-source.
    Comment,
    /// An identifier or a keyword.
    Word,
    /// Operators and other punctuation characters, e.g. semicolons.
    Punctuation,
    /// String literal.
    StringLiteral,
    /// Integer or floating-point literal.
    NumericLiteral,
}

/// Represents a lexeme and its associated type and location information as an abstract token.
///
/// The `'a` lifetime ties the token to the source string it was
/// sliced out of; no text is copied during lexing.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Token<'a> {
    /// The kind associated with the recognized lexeme.
    pub kind: TokenKind,
    /// A pointer into the source where the underlying lexeme was found.
    pub value: &'a str,
    /// Human-readable range information for the underlying lexeme.
    pub range: Range,
}

/// Internal state of the tokenizer.
#[derive(Debug)]
struct Lexer<'a> {
    /// The not-yet-tokenized tail of the source currently being lexed.
    source: &'a str,
    /// Human-readable position corresponding to the start of `source`.
    location: Location,
    /// Tokens accumulated so far, across all sources.
    tokens: Vec<Token<'a>>,
    /// `(kind, pattern)` pairs, tried in order when matching the next token.
    regexes: [(TokenKind, Regex); 6],
}


/// Tokenizes an array of source strings, producing a single flat
/// list of tokens, or an error if any source contains a syntactic
/// (more precisely, lexical) error.
///
/// # Arguments
///
/// * `sources`: a slice of `str`-convertible source strings.
///
/// # Return value
///
/// * `Ok(Vec<Token>)` if the source files were lexically correct.
/// * `Err(Error::Syntax)` if there was a lexical error in the input.
pub fn lex<'a, S: AsRef<str>>(sources: &'a [S]) -> Result<Vec<Token<'a>>> {
    let lexer = Lexer::new();
    lexer.lex(sources)
}

impl Location {
    /// Computes the location one past the end of `lexeme`,
    /// given that `self` is the location of its first grapheme.
    fn advance_by(&self, lexeme: &str) -> Location {
        // TODO(H2CO3): Keep this list in sync with the regexes in Lexer::new()
        let line_breaks: &[char] = &['\n', '\x0b', '\x0c', '\r', '\u{0085}', '\u{2028}', '\u{2029}'];

        let (line, column) = match lexeme.rfind(line_breaks) {
            // The lexeme contains a line break: bump the line by the
            // number of break graphemes (so "\r\n" counts once), and
            // restart the column after the last break.
            // -1 because the \n itself doesn't count,
            // +1 because humans start counting at 1.
            Some(index) => (
                self.line + grapheme_count_by(lexeme, |g| g.contains(line_breaks)),
                grapheme_count(&lexeme[index..]) - 1 + 1,
            ),
            // No line break: stay on the same line, just move right.
            None => (
                self.line,
                self.column + grapheme_count(lexeme),
            ),
        };

        Location { src_idx: self.src_idx, line, column }
    }
}

impl Display for Location {
    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
        write!(f, "line {}, char {}", self.line, self.column)
    }
}

impl Display for Range {
    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
        write!(f, "{}...{}", self.start, self.end)
    }
}

impl Ranged for Range {
    fn range(&self) -> Range {
        *self
    }
}

impl<'a> Ranged for Token<'a> {
    fn range(&self) -> Range {
        self.range
    }
}

impl<'a> Lexer<'a> {
    fn new() -> Lexer<'a> {
        Lexer {
            source: "",
            location: Default::default(),
            tokens: Vec::new(),
            regexes: [
                (TokenKind::Whitespace,     Regex::new(r#"^\s+"#).unwrap()),
                (TokenKind::Comment,        Regex::new(r#"^#[^\n\v\f\r\x{0085}\x{2028}\x{2029}]*(\r\n|[\n\v\f\r\x{0085}\x{2028}\x{2029}])?"#).unwrap()),
                (TokenKind::Word,           Regex::new(r#"^[_\p{XID_Start}]\p{XID_Continue}*"#).unwrap()),
                (TokenKind::NumericLiteral, Regex::new(r#"^((0[bB][0-1]+)|(0[oO][0-7]+)|(0[xX][[:xdigit:]]+)|([0-9]+(\.[0-9]+([eE][\+\-]?[0-9]+)?)?))"#).unwrap()),
                (TokenKind::Punctuation,    Regex::new(r#"^([!\?\*\+]?<\->[!\?\*\+]?|\.{1,3}|[<>]=?|[=!]=|\->|=>?|::?|[\(\)\[\]\{\}\+\-\*/%&\|~\?,;])"#).unwrap()),
                (TokenKind::StringLiteral,  Regex::new(r#"^"([^\\"]|\\[\\"nrt]|\\x[[:xdigit:]]{2}|\\u\{[[:xdigit:]]+\})*""#).unwrap()),
            ],
        }
    }

    fn lex<S: AsRef<str>>(mut self, sources: &'a [S]) -> Result<Vec<Token<'a>>> {
        self.tokens.reserve(sources.iter().map(|s| s.as_ref().len()).sum());

        for source in sources {
            self.source = source.as_ref();
            self.location.line = 1;
            self.location.column = 1;
            self.lex_single_source()?;
            self.location.src_idx += 1;
        }

        Ok(self.tokens)
    }

    fn lex_single_source(&mut self) -> Result<()> {
        loop {
            match self.next() {
                Ok(Some(token)) => self.tokens.push(token),
                Ok(None)        => return Ok(()),
                Err(error)      => return Err(error),
            }
        }
    }

    fn next(&mut self) -> Result<Option<Token<'a>>> {
        if self.source.is_empty() {
            return Ok(None)
        }

        for &(kind, ref re) in &self.regexes {
            if let Some(m) = re.find(self.source) {
                let value = m.as_str();
                let start = self.location;
                let end   = self.location.advance_by(value);
                let range = Range { start, end };
                let token = Token { kind, value, range };

                self.location = end;
                self.source = &self.source[m.end()..];

                return Ok(Some(token));
            }
        }

        Err(Error::Syntax {
            message: "Invalid token".to_owned(),
            range: Range {
                start: self.location,
                end: self.location.advance_by(self.source),
            },
        })
    }
}