1use super::BasicError;
2use super::Mark;
3use super::RcStr;
4use super::Source;
5use std::rc::Rc;
6
7#[derive(Debug, Clone, PartialEq)]
8pub enum Token<'a> {
9 Name(&'a str),
10 Number(f64),
11 RawString(&'a str),
12 String(RcStr),
13 EOF,
14
15 Newline,
17 LParen,
18 RParen,
19 LBracket,
20 RBracket,
21 LBrace,
22 RBrace,
23 Dollar,
24 Dot,
25 Dot2,
26 Colon,
27 Comma,
28 Semicolon,
29 Percent,
30 Plus,
31 Minus,
32 Star,
33 Slash,
34 Slash2,
35 Eq,
36 Bar,
37 Excalamation,
38
39 Eq2,
40 Ne,
41 LessThan,
42 GreaterThan,
43 LessThanOrEqual,
44 GreaterThanOrEqual,
45}
46
47impl<'a> Token<'a> {
48 pub fn name_or_keyword(&self) -> Option<&str> {
49 if let Token::Name(s) = self {
50 Some(s)
51 } else {
52 None
53 }
54 }
55 #[allow(dead_code)]
56 pub fn number(&self) -> Option<f64> {
57 if let Token::Number(x) = self {
58 Some(*x)
59 } else {
60 None
61 }
62 }
63 #[allow(dead_code)]
64 pub fn raw_string(&self) -> Option<&str> {
65 if let Token::RawString(x) = self {
66 Some(x)
67 } else {
68 None
69 }
70 }
71 #[allow(dead_code)]
72 pub fn string(self) -> Option<RcStr> {
73 if let Token::String(x) = self {
74 Some(x)
75 } else {
76 None
77 }
78 }
79}
80
81pub fn lex(source: &Rc<Source>) -> Result<Vec<(Token, Mark)>, BasicError> {
82 let s = &source.data;
83 let mut ret = Vec::<(Token, Mark)>::new();
84 let mut state = State::Neutral;
85 let mut last_ig_ws = 0;
86 let mut pstack = ParenStack::new();
87 let mut chars = Chars::new(s);
88 while let Some(c) = chars.next() {
89 let i = chars.index - c.len_utf8();
90 let mark = Mark {
91 source: source.clone(),
92 pos: i,
93 };
94 match state {
95 State::Neutral => {
96 if c.is_whitespace() && (c != '\n' || pstack.ignore_newline()) {
97 last_ig_ws = i;
101 state = State::Neutral;
102 } else if c.is_ascii_digit() {
103 state = State::Digits(i);
104 } else if c == '_' || c.is_alphanumeric() {
105 state = State::Name(i);
106 } else if c == '"' || c == '\'' {
107 if let Some((Token::Name("r"), _)) = ret.last() {
108 ret.pop().unwrap();
109 state = State::RawString(c, i + c.len_utf8());
110 } else {
111 state = State::String(c, String::new());
112 }
113 } else if c == '#' {
114 if let Some((Token::Name("r"), _)) = ret.last() {
115 ret.pop().unwrap();
116 state = State::DeepRawStringStart(i, 1);
117 } else {
118 state = State::LineComment;
119 }
120 } else {
121 let tok = match c {
122 '\0' => Some(Token::EOF),
123 '\n' => Some(Token::Newline),
124 '(' => Some(Token::LParen),
125 ')' => Some(Token::RParen),
126 '[' => Some(Token::LBracket),
127 ']' => Some(Token::RBracket),
128 '{' => Some(Token::LBrace),
129 '}' => Some(Token::RBrace),
130 '$' => Some(Token::Dollar),
131 '.' => Some(
132 if ret.last().map(|p| &p.0) == Some(&Token::Dot) && last_ig_ws < i - 1 {
133 ret.pop().unwrap();
134 Token::Dot2
135 } else {
136 Token::Dot
137 },
138 ),
139 ':' => Some(Token::Colon),
140 ',' => Some(Token::Comma),
141 ';' => Some(Token::Semicolon),
142 '+' => Some(Token::Plus),
143 '-' => Some(Token::Minus),
144 '*' => Some(Token::Star),
145 '/' => Some(
146 if ret.last().map(|p| &p.0) == Some(&Token::Slash) && last_ig_ws < i - 1
147 {
148 ret.pop().unwrap();
149 Token::Slash2
150 } else {
151 Token::Slash
152 },
153 ),
154 '%' => Some(Token::Percent),
155 '|' => Some(Token::Bar),
156 '!' => Some(Token::Excalamation),
157 '<' => Some(Token::LessThan),
158 '>' => Some(Token::GreaterThan),
159 '=' => Some({
160 if last_ig_ws < i - 1 {
161 match ret.last() {
162 Some((Token::LessThan, _)) => {
163 ret.pop().unwrap();
164 Token::LessThanOrEqual
165 }
166 Some((Token::GreaterThan, _)) => {
167 ret.pop().unwrap();
168 Token::GreaterThanOrEqual
169 }
170 Some((Token::Eq, _)) => {
171 ret.pop().unwrap();
172 Token::Eq2
173 }
174 Some((Token::Excalamation, _)) => {
175 ret.pop().unwrap();
176 Token::Ne
177 }
178 _ => Token::Eq,
179 }
180 } else {
181 Token::Eq
182 }
183 }),
184 _ => None,
185 };
186 if let Some(tok) = tok {
187 match tok {
188 Token::LParen | Token::LBracket => pstack.push(true),
189 Token::LBrace => pstack.push(true),
191 Token::RParen | Token::RBracket | Token::RBrace => match pstack.pop() {
192 Ok(()) => {}
193 Err(message) => {
194 return Err(BasicError {
195 marks: vec![mark],
196 message,
197 help: None,
198 })
199 }
200 },
201 _ => (),
202 }
203 ret.push((tok, mark));
204 state = State::Neutral;
205 } else {
206 return Err(BasicError {
207 marks: vec![mark],
208 message: format!("Unrecognized token: {}", c),
209 help: None,
210 });
211 }
212 }
213 }
214 State::Digits(start) => {
215 if c.is_ascii_digit() {
216 state = State::Digits(start);
217 } else if c == '.' {
218 state = State::Number(start);
219 } else {
220 chars.put_back(c);
221 state = State::Number(start);
222 }
223 }
224 State::Number(start) => {
225 if c.is_ascii_digit() {
226 state = State::Number(start);
227 } else {
228 let n: f64 = s[start..i].parse().unwrap();
229 ret.push((
230 Token::Number(n),
231 Mark {
232 source: source.clone(),
233 pos: start,
234 },
235 ));
236 chars.put_back(c);
237 state = State::Neutral;
238 }
239 }
240 State::Name(start) => {
241 if c == '_' || c.is_alphanumeric() {
242 state = State::Name(start);
243 } else {
244 ret.push((
245 Token::Name(&s[start..i]),
246 Mark {
247 source: source.clone(),
248 pos: start,
249 },
250 ));
251 chars.put_back(c);
252 state = State::Neutral;
253 }
254 }
255 State::DeepRawStringStart(start, hlen) => match c {
256 '#' => state = State::DeepRawStringStart(start, hlen + 1),
257 '"' | '\'' => state = State::DeepRawStringBody(i + 1, c, hlen),
258 _ => {
259 return Err(BasicError::new(
260 vec![Mark {
261 source: source.clone(),
262 pos: i,
263 }],
264 "Expected quote for raw string".into(),
265 ))
266 }
267 },
268 State::DeepRawStringBody(start, quote, hlen) => {
269 if c == quote {
270 state = State::DeepRawStringEnd(start, i, quote, hlen, hlen);
271 }
272 }
273 State::DeepRawStringEnd(start, end, quote, shlen, hlen) => {
274 assert!(hlen > 0);
275 if c == '#' {
276 if hlen == 1 {
277 ret.push((
278 Token::RawString(&s[start..end]),
279 Mark {
280 source: source.clone(),
281 pos: start,
282 },
283 ));
284 state = State::Neutral;
285 } else {
286 state = State::DeepRawStringEnd(start, end, quote, shlen, hlen - 1);
287 }
288 } else {
289 state = State::DeepRawStringBody(start, quote, shlen)
290 }
291 }
292 State::RawString(q, start) => {
293 if c == q {
294 ret.push((
295 Token::RawString(&s[start..i]),
296 Mark {
297 source: source.clone(),
298 pos: start,
299 },
300 ));
301 state = State::Neutral;
302 } else {
303 state = State::RawString(q, start);
304 }
305 }
306 State::String(q, mut string) => {
307 if c == q {
308 ret.push((
309 Token::String(string.into()),
310 Mark {
311 source: source.clone(),
312 pos: i,
313 },
314 ));
315 state = State::Neutral;
316 } else if c == '\\' {
317 state = State::StringEscaped(q, string);
318 } else {
319 string.push(c);
320 state = State::String(q, string);
321 }
322 }
323 State::StringEscaped(q, mut string) => {
324 let s = match c {
325 '\\' => "\\",
326 '\'' => "\'",
327 '\"' => "\"",
328 't' => "\t",
329 'n' => "\n",
330 'r' => "\r",
331 _ => {
332 return Err(BasicError {
333 marks: vec![Mark {
334 source: source.clone(),
335 pos: i,
336 }],
337 message: format!("Invalid string escape ({})", c),
338 help: None,
339 })
340 }
341 };
342 string.push_str(s);
343 state = State::String(q, string);
344 }
345 State::LineComment => {
346 if c == '\n' {
347 state = State::Neutral;
348 }
349 }
350 }
351 }
352 if let State::Neutral = &state {
353 Ok(ret)
354 } else {
355 Err(BasicError {
356 marks: vec![Mark {
357 source: source.clone(),
358 pos: s.len(),
359 }],
360 message: format!("Expected more input: {:?}", state),
361 help: None,
362 })
363 }
364}
365
/// The lexer's current position in its state machine.
///
/// All `usize` payloads are byte offsets into the source text unless noted
/// otherwise. The `Debug` rendering appears in the "Expected more input"
/// error raised when input ends in a non-neutral state.
#[derive(Debug)]
enum State {
    /// Between tokens.
    Neutral,
    /// Inside the integer part of a number; payload is where the number
    /// starts.
    Digits(usize),
    /// Past the integer part of a number (after the `.` if there was one);
    /// payload is where the number starts.
    Number(usize),
    /// Inside a name; payload is where the name starts.
    Name(usize),
    /// Counting the `#` characters that follow a raw-string `r` prefix:
    /// (offset of the first `#`, number of `#` seen so far).
    DeepRawStringStart(usize, usize),
    /// Inside the body of a hashed raw string:
    /// (body start, quote character, number of `#` in the delimiter).
    DeepRawStringBody(usize, char, usize),
    /// Just after a quote inside a hashed raw string, checking for the
    /// closing `#` run: (body start, offset of the candidate closing quote,
    /// quote character, number of `#` in the delimiter, `#` still required).
    DeepRawStringEnd(usize, usize, char, usize, usize),
    /// Inside a plain raw string: (quote character, body start).
    RawString(char, usize),
    /// Inside an escaped string: (quote character, text collected so far).
    String(char, String),
    /// Directly after a backslash inside an escaped string:
    /// (quote character, text collected so far).
    StringEscaped(char, String),
    /// Inside a `#` comment, skipping to the end of the line.
    LineComment,
}
380
/// Tracks the grouping symbols that are currently open.
///
/// Each entry records whether newlines are insignificant while that group is
/// the innermost open one.
#[derive(Debug, Default)]
struct ParenStack {
    stack: Vec<bool>,
}

impl ParenStack {
    /// Creates an empty stack (no group open).
    pub fn new() -> ParenStack {
        ParenStack::default()
    }

    /// Records a newly opened group; `ignore_newline` says whether newlines
    /// are to be skipped while it is the innermost group.
    pub fn push(&mut self, ignore_newline: bool) {
        self.stack.push(ignore_newline);
    }

    /// Closes the innermost open group.
    ///
    /// # Errors
    ///
    /// Returns an error message if no group is open.
    pub fn pop(&mut self) -> Result<(), String> {
        match self.stack.pop() {
            Some(_) => Ok(()),
            None => Err(String::from("Mismatched grouping symbols")),
        }
    }

    /// Returns whether newlines are currently insignificant: the flag of the
    /// innermost open group, or `false` when nothing is open.
    pub fn ignore_newline(&self) -> bool {
        self.stack.last().copied().unwrap_or(false)
    }
}
402
/// Character cursor over a string with a single slot of push-back.
///
/// After the characters of the string it yields one extra `'\0'`, which the
/// lexer uses as an end-of-input sentinel.
struct Chars<'a> {
    /// Byte offset just past the most recently returned character (the
    /// sentinel counts as one byte).
    index: usize,
    /// Character handed back by `put_back`, returned by the next `next` call.
    peek: Option<char>,
    /// Remaining input followed by the sentinel.
    chars: std::iter::Chain<std::str::Chars<'a>, std::iter::Once<char>>,
}

impl<'a> Chars<'a> {
    /// Creates a cursor positioned at the start of `s`.
    fn new(s: &'a str) -> Self {
        Chars {
            index: 0,
            peek: None,
            // `once` supplies the sentinel without a heap allocation.
            chars: s.chars().chain(std::iter::once('\0')),
        }
    }

    /// Returns the next character (a pushed-back one first) and advances
    /// `index` past it; `None` once the sentinel has been consumed.
    fn next(&mut self) -> Option<char> {
        let ch = match self.peek.take() {
            Some(pending) => Some(pending),
            None => self.chars.next(),
        };
        if let Some(ch) = ch {
            self.index += ch.len_utf8();
        }
        ch
    }

    /// Hands `c` back so that the next call to `next` returns it again, and
    /// rewinds `index` accordingly.
    ///
    /// # Panics
    ///
    /// Panics if a character is already pending; only one can be held.
    fn put_back(&mut self, c: char) {
        assert!(self.peek.is_none());
        self.peek = Some(c);
        self.index -= c.len_utf8();
    }
}
434
#[cfg(test)]
mod tests {
    use super::*;
    use Token::*;

    /// Wraps `data` in a reference-counted `Source` with a fixed test name.
    fn mksrc(data: &str) -> Rc<Source> {
        Rc::new(Source {
            name: "[for-test]".into(),
            data: data.into(),
        })
    }

    /// Lexes `src`, panicking on a lexer error, and drops the marks.
    fn lex(src: &Rc<Source>) -> Vec<Token> {
        let marked = super::lex(src).unwrap();
        marked.into_iter().map(|(token, _mark)| token).collect()
    }

    #[test]
    fn raw_string_literals() {
        // (source text, expected raw-string body)
        let cases = vec![
            (r####" r"hi" "####, "hi"),
            (r####" r#"hello " "# "####, "hello \" "),
            (r####" r##"world"## "####, "world"),
            (r####" r##"hello "# "## "####, "hello \"# "),
        ];
        for (text, body) in cases {
            let src = mksrc(text);
            assert_eq!(lex(&src), vec![RawString(body), EOF]);
        }
    }

    #[test]
    fn misc() {
        let src = mksrc(r##" x = r"hi" "##);
        let expected = vec![Name("x"), Eq, RawString("hi"), EOF];
        assert_eq!(lex(&src), expected);
    }
}