1use std::fmt::{ self, Display, Formatter };
15use regex::Regex;
16use error::{ Error, Result };
17use util::{ grapheme_count, grapheme_count_by };
18
19
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Location {
    /// Index of the source string (within the slice handed to `lex`)
    /// that this location points into.
    pub src_idx: usize,
    /// 1-based line number within that source.
    pub line: usize,
    /// 1-based column, measured in extended grapheme clusters.
    pub column: usize,
}
31
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Range {
    /// Location of the first grapheme of the ranged item.
    pub start: Location,
    /// Location just past the last grapheme of the item, i.e. where
    /// the next item would begin (exclusive end).
    pub end: Location,
}
40
/// Implemented by items that occupy a contiguous range in some source.
pub trait Ranged {
    /// Returns the source range covered by this item.
    fn range(&self) -> Range;
}
48
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum TokenKind {
    /// A run of whitespace characters (`\s+`).
    Whitespace,
    /// A `#`-prefixed comment running to the end of the line.
    Comment,
    /// An identifier-like word (`XID_Start`/`_` followed by `XID_Continue`s).
    Word,
    /// An operator or delimiter (arrows, brackets, `...`, comparisons, etc.).
    Punctuation,
    /// A double-quoted string literal with backslash escapes.
    String,
    /// A numeric literal: binary, octal, hex, or decimal with an
    /// optional fraction and exponent.
    Numeric,
}
65
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Token<'a> {
    /// Lexical class of the token.
    pub kind: TokenKind,
    /// The exact slice of the source text that the token covers.
    pub value: &'a str,
    /// Start and end positions of `value` within its source.
    pub range: Range,
}
76
/// Internal state machine that performs the actual tokenization.
#[derive(Debug)]
struct Lexer<'a> {
    /// The not-yet-consumed tail of the source currently being lexed.
    source: &'a str,
    /// Position at which the next token will start.
    location: Location,
    /// Tokens produced so far, accumulated across all sources.
    tokens: Vec<Token<'a>>,
    /// One compiled pattern per token kind; array order defines
    /// matching priority (earlier entries win).
    regexes: [(TokenKind, Regex); NUM_TOKEN_KINDS],
}
84
/// Number of `TokenKind` variants; must match the length of the
/// `Lexer::regexes` array built in `Lexer::new`.
const NUM_TOKEN_KINDS: usize = 6;
86
87
88pub fn lex<S: AsRef<str>>(sources: &[S]) -> Result<Vec<Token>> {
101 Lexer::new().lex(sources)
102}
103
impl Location {
    /// Returns the location reached after reading `lexeme` starting at
    /// `self`: `src_idx` is preserved, while `line` and `column` are
    /// advanced according to the line breaks `lexeme` contains.
    fn advance_by(&self, lexeme: &str) -> Location {
        // Every character Unicode treats as a mandatory line break:
        // LF, VT, FF, CR, NEL, LINE SEPARATOR, PARAGRAPH SEPARATOR.
        let line_breaks: &[char] = &['\n', '\x0b', '\x0c', '\r', '\u{0085}', '\u{2028}', '\u{2029}'];

        match lexeme.rfind(line_breaks) {
            // `lexeme` contains at least one line break; `index` is the
            // byte offset of the last break character.
            Some(index) => Location {
                src_idx: self.src_idx,
                // One line per break *grapheme*, so "\r\n" advances the
                // line only once (assumes UAX #29 segmentation treats
                // CR LF as a single cluster — TODO confirm against the
                // project's `grapheme_count_by` implementation).
                line: self.line + grapheme_count_by(lexeme, |g| g.contains(line_breaks)),
                // Graphemes from the last break to the end: `- 1` drops
                // the break grapheme itself, `+ 1` makes the column
                // 1-based. Written out explicitly rather than cancelled
                // so the bookkeeping stays visible.
                column: grapheme_count(&lexeme[index..]) - 1 + 1,
            },
            // No line break: same line, column moves right by the
            // number of grapheme clusters read.
            None => Location {
                src_idx: self.src_idx,
                line: self.line,
                column: self.column + grapheme_count(lexeme),
            },
        }
    }
}
124
125impl Display for Location {
126 fn fmt(&self, f: &mut Formatter) -> fmt::Result {
127 write!(f, "line {}, char {}", self.line, self.column)
128 }
129}
130
131impl Display for Range {
132 fn fmt(&self, f: &mut Formatter) -> fmt::Result {
133 write!(f, "{}...{}", self.start, self.end)
134 }
135}
136
137impl Ranged for Range {
138 fn range(&self) -> Range {
139 *self
140 }
141}
142
143impl<'a> Ranged for Token<'a> {
144 fn range(&self) -> Range {
145 self.range
146 }
147}
148
149impl<'a> Lexer<'a> {
150 fn new() -> Lexer<'a> {
151 Lexer {
152 source: "",
153 location: Default::default(),
154 tokens: Vec::new(),
155 regexes: [
156 (TokenKind::Whitespace, Regex::new(r#"^\s+"#).unwrap()),
157 (TokenKind::Comment, Regex::new(r#"^#[^\n\v\f\r\x{0085}\x{2028}\x{2029}]*(\r\n|[\n\v\f\r\x{0085}\x{2028}\x{2029}])?"#).unwrap()),
158 (TokenKind::Word, Regex::new(r#"^[_\p{XID_Start}]\p{XID_Continue}*"#).unwrap()),
159 (TokenKind::Numeric, Regex::new(r#"^((0[bB][0-1]+)|(0[oO][0-7]+)|(0[xX][[:xdigit:]]+)|([0-9]+(\.[0-9]+([eE][\+\-]?[0-9]+)?)?))"#).unwrap()),
160 (TokenKind::Punctuation, Regex::new(r#"^([!\?\*\+]?<\->[!\?\*\+]?|\.{1,3}|[<>]=?|[=!]=|\->|=>?|::?|[\(\)\[\]\{\}\+\-\*/%&\|~\?,;])"#).unwrap()),
161 (TokenKind::String, Regex::new(r#"^"([^\\"]|\\[\\"nrt]|\\x[[:xdigit:]]{2}|\\u\{[[:xdigit:]]+\})*""#).unwrap()),
162 ],
163 }
164 }
165
166 fn lex<S: AsRef<str>>(mut self, sources: &'a [S]) -> Result<Vec<Token<'a>>> {
167 self.tokens.reserve(sources.iter().map(|s| s.as_ref().len()).sum());
168
169 for source in sources {
170 self.source = source.as_ref();
171 self.location.line = 1;
172 self.location.column = 1;
173 self.lex_single_source()?;
174 self.location.src_idx += 1;
175 }
176
177 Ok(self.tokens)
178 }
179
180 fn lex_single_source(&mut self) -> Result<()> {
181 loop {
182 match self.next() {
183 Ok(Some(token)) => self.tokens.push(token),
184 Ok(None) => return Ok(()),
185 Err(error) => return Err(error),
186 }
187 }
188 }
189
190 #[cfg_attr(feature = "cargo-clippy", allow(should_implement_trait))]
191 fn next(&mut self) -> Result<Option<Token<'a>>> {
192 if self.source.is_empty() {
193 return Ok(None)
194 }
195
196 for &(kind, ref re) in &self.regexes {
197 if let Some(m) = re.find(self.source) {
198 let value = m.as_str();
199 let start = self.location;
200 let end = self.location.advance_by(value);
201 let range = Range { start, end };
202 let token = Token { kind, value, range };
203
204 self.location = end;
205 self.source = &self.source[m.end()..];
206
207 return Ok(Some(token));
208 }
209 }
210
211 Err(Error::Syntax {
212 message: "Invalid token".to_owned(),
213 range: Range {
214 start: self.location,
215 end: self.location.advance_by(self.source),
216 },
217 })
218 }
219}