phile/
lexer.rs

1//
2// lexer.rs
3// The PHiLe Compiler
4//
5// Created by Arpad Goretity (H2CO3)
6// on 07/04/2017
7//
8
9//! This module contains type definitions and functions for
10//! breaking up unstructured source text into lexemes and tokens.
11//! It also provides types for mapping tokens to their original
12//! location within the source code in a human-readable manner.
13
14use std::fmt::{ self, Display, Formatter };
15use regex::Regex;
16use error::{ Error, Result };
17use util::{ grapheme_count, grapheme_count_by };
18
19
/// Represents the location of a single extended grapheme cluster
/// in the sources fed to the lexer.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Location {
    /// 0-based index of the source that this location points into.
    pub src_idx: usize,
    /// 1-based line index within the aforementioned source.
    pub line: usize,
    /// 1-based character index within the line, measured in extended
    /// grapheme clusters (see `Location::advance_by()`), not in bytes.
    pub column: usize,
}
31
/// A half-open range representing a source span:
/// `start` is included in the span, `end` is not.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Range {
    /// Location at the beginning of the source range.
    pub start: Location,
    /// Location one past the end of the source range.
    pub end: Location,
}
40
/// This trait is to be implemented by entities that correspond
/// to some range in the source. This is used for generating
/// location information in user-visible error messages.
pub trait Ranged {
    /// Returns the range `self` was generated from.
    fn range(&self) -> Range;
}
48
/// Describes the type of a single token or lexeme.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum TokenKind {
    /// Horizontal and vertical (e.g. newline) whitespace. Unicode-aware.
    Whitespace,
    /// Currently, a line comment, beginning with '#' and ending with vertical whitespace or end-of-source.
    Comment,
    /// An identifier or a keyword.
    Word,
    /// Operators and other punctuation characters, e.g. semicolons.
    Punctuation,
    /// String literal.
    String,
    /// Integer or floating-point literal.
    Numeric,
}
65
/// Represents a lexeme and its associated type and location information as an abstract token.
/// The lifetime `'a` ties each token to the source string it was sliced from.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Token<'a> {
    /// The kind associated with the recognized lexeme.
    pub kind: TokenKind,
    /// A pointer into the source where the underlying lexeme was found.
    pub value: &'a str,
    /// Human-readable range information for the underlying lexeme.
    pub range: Range,
}
76
// Internal state of the tokenizer. Not part of the public API;
// clients go through the free function `lex()` instead.
#[derive(Debug)]
struct Lexer<'a> {
    // The not-yet-consumed tail of the source currently being lexed;
    // shrinks from the front as tokens are recognized.
    source: &'a str,
    // Human-readable position corresponding to the start of `source`.
    location: Location,
    // Tokens recognized so far, accumulated across all sources.
    tokens: Vec<Token<'a>>,
    // One anchored regex per token kind, tried in array order
    // (see `Lexer::new()` for the actual patterns).
    regexes: [(TokenKind, Regex); NUM_TOKEN_KINDS],
}
84
// Number of entries in `Lexer::regexes`; must be kept in sync
// manually with the number of `TokenKind` variants.
const NUM_TOKEN_KINDS: usize = 6;
86
87
88/// Given an array of source strings, returns an array of tokens
89/// extracted from those strings, or an error if there is a syntactic
90/// (more precisely, lexical) error in any of the source strings.
91///
92/// # Arguments
93///
94/// * `sources`: a slice of `str`-convertible source strings.
95///
96/// # Return value
97///
98/// * `Ok(Vec<Token>)` if the source files were lexically correct.
99/// * `Err(Error::Syntax)` if there was a lexical error in the input.
100pub fn lex<S: AsRef<str>>(sources: &[S]) -> Result<Vec<Token>> {
101    Lexer::new().lex(sources)
102}
103
impl Location {
    /// Returns the location one past the end of `lexeme`, assuming
    /// `lexeme` begins exactly at `self`. This is how the lexer
    /// advances its cursor after recognizing each token.
    fn advance_by(&self, lexeme: &str) -> Location {
        // TODO(H2CO3): Keep this list in sync with the regexes in Lexer::new()
        let line_breaks: &[char] = &['\n', '\x0b', '\x0c', '\r', '\u{0085}', '\u{2028}', '\u{2029}'];
        match lexeme.rfind(line_breaks) {
            // -1 because the \n itself doesn't count,
            // +1 because humans start counting at 1.
            Some(index) => Location {
                src_idx: self.src_idx,
                // Count line breaks per grapheme cluster — presumably so
                // that "\r\n" (one cluster) bumps the line only once;
                // `grapheme_count_by` lives in `util`, verify there.
                line:    self.line + grapheme_count_by(lexeme, |g| g.contains(line_breaks)),
                // `index` is the byte offset of the start of the last break
                // character, so the slice below still contains that break;
                // hence the -1 before converting to a 1-based column.
                column:  grapheme_count(&lexeme[index..]) - 1 + 1,
            },
            // No line break in the lexeme: stay on the same line and
            // advance the column by the lexeme's grapheme count.
            None => Location {
                src_idx: self.src_idx,
                line:    self.line,
                column:  self.column + grapheme_count(lexeme),
            },
        }
    }
}
124
125impl Display for Location {
126    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
127        write!(f, "line {}, char {}", self.line, self.column)
128    }
129}
130
131impl Display for Range {
132    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
133        write!(f, "{}...{}", self.start, self.end)
134    }
135}
136
137impl Ranged for Range {
138    fn range(&self) -> Range {
139        *self
140    }
141}
142
143impl<'a> Ranged for Token<'a> {
144    fn range(&self) -> Range {
145        self.range
146    }
147}
148
149impl<'a> Lexer<'a> {
150    fn new() -> Lexer<'a> {
151        Lexer {
152            source: "",
153            location: Default::default(),
154            tokens: Vec::new(),
155            regexes: [
156                (TokenKind::Whitespace,  Regex::new(r#"^\s+"#).unwrap()),
157                (TokenKind::Comment,     Regex::new(r#"^#[^\n\v\f\r\x{0085}\x{2028}\x{2029}]*(\r\n|[\n\v\f\r\x{0085}\x{2028}\x{2029}])?"#).unwrap()),
158                (TokenKind::Word,        Regex::new(r#"^[_\p{XID_Start}]\p{XID_Continue}*"#).unwrap()),
159                (TokenKind::Numeric,     Regex::new(r#"^((0[bB][0-1]+)|(0[oO][0-7]+)|(0[xX][[:xdigit:]]+)|([0-9]+(\.[0-9]+([eE][\+\-]?[0-9]+)?)?))"#).unwrap()),
160                (TokenKind::Punctuation, Regex::new(r#"^([!\?\*\+]?<\->[!\?\*\+]?|\.{1,3}|[<>]=?|[=!]=|\->|=>?|::?|[\(\)\[\]\{\}\+\-\*/%&\|~\?,;])"#).unwrap()),
161                (TokenKind::String,      Regex::new(r#"^"([^\\"]|\\[\\"nrt]|\\x[[:xdigit:]]{2}|\\u\{[[:xdigit:]]+\})*""#).unwrap()),
162            ],
163        }
164    }
165
166    fn lex<S: AsRef<str>>(mut self, sources: &'a [S]) -> Result<Vec<Token<'a>>> {
167        self.tokens.reserve(sources.iter().map(|s| s.as_ref().len()).sum());
168
169        for source in sources {
170            self.source = source.as_ref();
171            self.location.line = 1;
172            self.location.column = 1;
173            self.lex_single_source()?;
174            self.location.src_idx += 1;
175        }
176
177        Ok(self.tokens)
178    }
179
180    fn lex_single_source(&mut self) -> Result<()> {
181        loop {
182            match self.next() {
183                Ok(Some(token)) => self.tokens.push(token),
184                Ok(None)        => return Ok(()),
185                Err(error)      => return Err(error),
186            }
187        }
188    }
189
190    #[cfg_attr(feature = "cargo-clippy", allow(should_implement_trait))]
191    fn next(&mut self) -> Result<Option<Token<'a>>> {
192        if self.source.is_empty() {
193            return Ok(None)
194        }
195
196        for &(kind, ref re) in &self.regexes {
197            if let Some(m) = re.find(self.source) {
198                let value = m.as_str();
199                let start = self.location;
200                let end   = self.location.advance_by(value);
201                let range = Range { start, end };
202                let token = Token { kind, value, range };
203
204                self.location = end;
205                self.source = &self.source[m.end()..];
206
207                return Ok(Some(token));
208            }
209        }
210
211        Err(Error::Syntax {
212            message: "Invalid token".to_owned(),
213            range: Range {
214                start: self.location,
215                end: self.location.advance_by(self.source),
216            },
217        })
218    }
219}