1use thiserror::Error;
8
9use crate::{
10 gobbler::Gobbler,
11 names::{is_valid_ident_continuing_char, is_valid_ident_initial_char, Identifier, NameError},
12 src_pos::{PosTbl, SrcPos},
13};
14
/// A token with no extra metadata attached (tag is the unit type).
pub type Token = TaggedToken<()>;

/// A token tagged with the source position it was lexed at.
pub(crate) type SrcToken = TaggedToken<SrcPos>;
20
/// A lexed token carrying a tag of type `T`.
///
/// `T` is `()` for position-less tokens ([`Token`]) or a source position
/// for tokens straight out of the lexer ([`SrcToken`]).
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum TaggedToken<T> {
    /// Keyword `import`.
    Import(T),
    /// Keyword `as`.
    As(T),
    /// Keyword `class`.
    Class(T),
    /// `:`
    Colon(T),
    /// `=`
    Eq(T),
    /// `,`
    Comma(T),
    /// `.`
    Dot(T),
    /// End of a source line.
    Newline(T),
    /// Keyword `null`.
    Null(T),

    /// A name that is not a keyword.
    Identifier(T, Identifier),

    /// Decimal integer literal and its parsed value.
    IntegerLiteral(T, u64),
    /// `<<`
    Shl(T),
    /// `*`
    Mul(T),

    /// `[`
    OpenBracket(T),
    /// `]`
    CloseBracket(T),
    /// `(`
    OpenParen(T),
    /// `)`
    CloseParen(T),
    /// One step deeper in indentation.
    Indent(T),
    /// One step shallower in indentation.
    Deindent(T),

    /// `"""`-delimited documentation text (already cleaned of indent).
    DocString(T, String),
}
77
78impl<T> TaggedToken<T> {
79 pub fn tag(&self) -> &T {
81 match self {
82 Self::Import(t) => t,
83 Self::As(t) => t,
84 Self::Class(t) => t,
85 Self::Colon(t) => t,
86 Self::Eq(t) => t,
87 Self::Comma(t) => t,
88 Self::Dot(t) => t,
89 Self::Newline(t) => t,
90 Self::Null(t) => t,
91 Self::Identifier(t, _) => t,
92 Self::IntegerLiteral(t, _) => t,
93 Self::Shl(t) => t,
94 Self::Mul(t) => t,
95 Self::OpenBracket(t) => t,
96 Self::CloseBracket(t) => t,
97 Self::OpenParen(t) => t,
98 Self::CloseParen(t) => t,
99 Self::Indent(t) => t,
100 Self::Deindent(t) => t,
101 Self::DocString(t, _) => t,
102 }
103 }
104
105 pub fn to_untagged(&self) -> Token {
107 match self {
108 Self::Import(_) => Token::Import(()),
109 Self::As(_) => Token::As(()),
110 Self::Class(_) => Token::Class(()),
111 Self::Colon(_) => Token::Colon(()),
112 Self::Eq(_) => Token::Eq(()),
113 Self::Comma(_) => Token::Comma(()),
114 Self::Dot(_) => Token::Dot(()),
115 Self::Newline(_) => Token::Newline(()),
116 Self::Null(_) => Token::Null(()),
117 Self::Identifier(_, ident) => Token::Identifier((), ident.clone()),
118 Self::IntegerLiteral(_, v) => Token::IntegerLiteral((), *v),
119 Self::Shl(_) => Token::Shl(()),
120 Self::Mul(_) => Token::Mul(()),
121 Self::OpenBracket(_) => Token::OpenBracket(()),
122 Self::CloseBracket(_) => Token::CloseBracket(()),
123 Self::OpenParen(_) => Token::OpenParen(()),
124 Self::CloseParen(_) => Token::CloseParen(()),
125 Self::Indent(_) => Token::Indent(()),
126 Self::Deindent(_) => Token::Deindent(()),
127 Self::DocString(_, s) => Token::DocString((), s.clone()),
128 }
129 }
130}
131
/// Errors produced while lexing a character buffer into tokens.
#[derive(Debug, Error)]
pub enum TokenError {
    /// A character no token rule accepts, with its buffer offset.
    #[error("unexpected char '{0}' at pos {1}")]
    UnexpectedChar(char, usize),

    /// Input ended where another character was required.
    #[error("unexpected end of input")]
    UnexpectedEnd,

    /// Input ended inside the named construct (e.g. "docstring#end").
    #[error("unexpected end of input while reading {0}")]
    UnexpectedEndOf(&'static str),

    /// Leading whitespace that does not match the established indent unit.
    #[error("invalid indent at {0} (was {1:?})")]
    InvalidIndent(usize, Indent),

    /// Leading whitespace that is neither all spaces nor all tabs.
    #[error("unrecognizable indent at {0}")]
    UnrecognizableIndent(usize),

    /// A numeric literal that does not parse as `u64`.
    #[error("invalid integer '{0}'")]
    InvalidInt(String),

    /// An identifier rejected by the naming rules.
    #[error("invalid name: {0}")]
    InvalidName(#[from] NameError),
}
155
/// The indentation unit inferred from the first indented line seen.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum Indent {
    /// One indent level is exactly this many spaces.
    Spaces(u8),
    /// One indent level is a single tab character.
    Tab,
}
162
/// Accumulates tokens while tracking the file's indentation style and depth.
pub(crate) struct TokenSeqBuilder {
    // Indent unit, inferred lazily from the first indented line.
    indent_ty: Option<Indent>,
    // Current indent depth in whole levels.
    indent_level: usize,
    // Tokens emitted so far, in source order.
    output: Vec<SrcToken>,
}
168
169impl TokenSeqBuilder {
170 pub(crate) fn new() -> Self {
171 Self {
172 indent_ty: None,
173 indent_level: 0,
174 output: Vec::new(),
175 }
176 }
177
178 fn _indent_level(&self) -> usize {
179 self.indent_level
180 }
181
182 fn infer_indent_level(&mut self, indent: &[char], at: usize) -> Result<usize, TokenError> {
184 match self.indent_ty {
185 Some(i @ Indent::Spaces(n)) => {
186 if is_all_spaces(indent) {
187 let found_spaces = indent.len();
188 if found_spaces.is_multiple_of(n as usize) {
189 let ind_cnt = found_spaces / n as usize;
190 Ok(ind_cnt)
191 } else {
192 Err(TokenError::InvalidIndent(at, i))
193 }
194 } else {
195 Err(TokenError::InvalidIndent(at, i))
196 }
197 }
198 Some(i @ Indent::Tab) => {
199 if is_all_tabs(indent) {
200 let found_tabs = indent.len();
201 Ok(found_tabs)
202 } else {
203 Err(TokenError::InvalidIndent(at, i))
204 }
205 }
206 None => {
207 let is_spaces = is_all_spaces(indent);
208 let is_tabs = is_all_tabs(indent);
209
210 if indent.is_empty() {
212 return Ok(0);
213 }
214
215 if is_spaces {
216 self.indent_ty = Some(Indent::Spaces(indent.len() as u8));
217 Ok(1)
218 } else if is_tabs {
219 self.indent_ty = Some(Indent::Tab);
220 Ok(indent.len())
221 } else {
222 Err(TokenError::UnrecognizableIndent(at))
223 }
224 }
225 }
226 }
227
228 fn push_token(&mut self, t: SrcToken) {
229 self.output.push(t);
230 }
231
232 fn update_indent_level(&mut self, level: usize, sp: SrcPos) {
234 let diff = level as isize - self.indent_level as isize;
235 match diff {
236 0 => {}
237
238 d if d < 0 => {
240 for _ in 0..(-d) {
241 self.push_token(TaggedToken::Deindent(sp));
242 }
243 self.indent_level = level;
244 }
245
246 d if d > 0 => {
248 for _ in 0..d {
249 self.push_token(TaggedToken::Indent(sp));
250 }
251 self.indent_level = level;
252 }
253
254 _ => unreachable!(),
255 }
256 }
257
258 fn strip_indent<'s>(&self, mut s: &'s str) -> Option<&'s str> {
260 match self.indent_ty {
261 Some(ind) => match ind {
262 Indent::Spaces(n) => {
263 for _i in 0..(n as usize * self.indent_level) {
264 s = s.strip_prefix(" ")?;
265 }
266 Some(s)
267 }
268 Indent::Tab => {
269 for _i in 0..self.indent_level {
270 s = s.strip_prefix("\t")?;
271 }
272 Some(s)
273 }
274 },
275
276 None => Some(s),
279 }
280 }
281
282 fn cleanup_docstring(&self, s: &str) -> String {
284 let mut buf = String::new();
285 for l in s.lines() {
286 let ls = match self.strip_indent(l) {
287 Some(s) => s,
288 None => l,
289 };
290
291 #[allow(deprecated)]
292 let ls = ls.trim_right();
293 buf.extend(ls.chars());
294 buf.push('\n');
295 }
296
297 buf.trim().to_owned()
298 }
299
300 fn finish(mut self, sp: SrcPos) -> Result<Vec<SrcToken>, TokenError> {
301 for _ in 0..self.indent_level {
303 self.push_token(TaggedToken::Deindent(sp));
304 }
305
306 Ok(self.output)
307 }
308}
309
/// True when every character is an ASCII space (vacuously true when empty).
fn is_all_spaces<'c>(iter: impl IntoIterator<Item = &'c char>) -> bool {
    !iter.into_iter().any(|&ch| ch != ' ')
}
313
/// True when every character is a tab (vacuously true when empty).
fn is_all_tabs<'c>(iter: impl IntoIterator<Item = &'c char>) -> bool {
    iter.into_iter().all(|&ch| ch == '\t')
}
317
/// Lexes the full character buffer into a position-tagged token sequence.
///
/// Drives a `Gobbler` cursor over `s`, dispatching on the current char;
/// indentation is re-evaluated after every newline via `TokenSeqBuilder`.
/// Branches that consume a variable number of chars `continue`; single-char
/// tokens fall through to the trailing `gobble_one`.
pub(crate) fn parse_char_array_to_tokens(s: &[char]) -> Result<Vec<SrcToken>, TokenError> {
    // Offset -> source-position table for tagging tokens.
    let sp_tbl = PosTbl::generate(s.iter().copied());

    let mut builder = TokenSeqBuilder::new();

    let mut gob = Gobbler::new(s);

    while gob.has_entry() {
        let cur = *gob.get_expect();
        let next = gob.get_rel(1).copied();
        let sp = sp_tbl.expect_srcpos(gob.at());

        #[cfg(test)]
        eprintln!(
            "considering {cur:?} (indent level {})",
            builder._indent_level()
        );

        match cur {
            // Inter-token spaces are skipped; indent is handled at '\n'.
            ' ' => {
                gob.gobble_one();
                continue;
            }

            // Line comment: discard everything up to the newline.
            '#' => {
                gob.gobble_until(|c| *c == '\n');
                continue;
            }

            '\n' => {
                builder.push_token(SrcToken::Newline(sp));
                gob.gobble_one();

                if gob.has_entry() {
                    let new_cur = gob.get_expect();
                    let new_sp = sp_tbl.expect_srcpos(gob.at());
                    // Blank line: don't reinterpret indentation.
                    if *new_cur == '\n' {
                        continue;
                    }

                    let at_before = gob.at();
                    // Leading whitespace of the new line fixes its indent depth.
                    let ws_span = gob
                        .gobble_slice_up_to(|c| !c.is_ascii_whitespace())
                        .expect("token: parse whitespace");
                    let cnt = builder.infer_indent_level(ws_span, at_before)?;

                    builder.update_indent_level(cnt, new_sp);
                }

                continue;
            }

            // Single-char punctuation; consumed by the trailing gobble_one.
            ':' => builder.push_token(SrcToken::Colon(sp)),
            '=' => builder.push_token(SrcToken::Eq(sp)),
            ',' => builder.push_token(SrcToken::Comma(sp)),
            '.' => builder.push_token(SrcToken::Dot(sp)),
            '*' => builder.push_token(SrcToken::Mul(sp)),
            '[' => builder.push_token(SrcToken::OpenBracket(sp)),
            ']' => builder.push_token(SrcToken::CloseBracket(sp)),
            '(' => builder.push_token(SrcToken::OpenParen(sp)),
            ')' => builder.push_token(SrcToken::CloseParen(sp)),

            // Only "<<" is accepted; a lone '<' is an error.
            '<' => {
                if let Some(next) = next {
                    if next == '<' {
                        builder.push_token(SrcToken::Shl(sp));
                        gob.gobble_exact(2);
                        continue;
                    } else {
                        return Err(TokenError::UnexpectedChar(next, gob.at() + 1));
                    }
                } else {
                    return Err(TokenError::UnexpectedEnd);
                }
            }

            // Docstring: must open with `"""` and run to the closing `"""`.
            '"' => {
                let off_1 = gob.get_rel(1).copied();
                let off_2 = gob.get_rel(2).copied();

                match (off_1, off_2) {
                    (Some('"'), Some('"')) => {
                        gob.gobble_exact(3);
                    }
                    _ => return Err(TokenError::UnexpectedEndOf("docstring#begin")),
                }

                let Some(doc_span) = gob.gobble_slice_for_pattern(&['"', '"', '"']) else {
                    return Err(TokenError::UnexpectedEndOf("docstring#end"));
                };

                let doc_text = doc_span.into_iter().collect::<String>();
                // Skip the closing `"""`.
                gob.gobble_exact(3);
                #[cfg(test)]
                {
                    let entry = gob.get();
                    eprintln!("AFTER DOCSTRING {entry:?}");
                }

                let doc_cleaned_text = builder.cleanup_docstring(&doc_text);
                builder.push_token(TaggedToken::DocString(sp, doc_cleaned_text));

                continue;
            }

            // Identifier or keyword: longest run of identifier chars.
            c if is_valid_ident_initial_char(c) => {
                let ident_chars =
                    gob.gobble_slice_up_to_end(|c| !is_valid_ident_continuing_char(*c));

                let ident_str = ident_chars.into_iter().collect::<String>();

                // Keywords win over identifiers with the same spelling.
                if let Some(kwtok) = try_parse_keyword(&ident_str, sp) {
                    builder.push_token(kwtok);
                } else {
                    let ident = Identifier::try_from(ident_str)?;
                    builder.push_token(SrcToken::Identifier(sp, ident));
                }

                continue;
            }

            // Decimal integer literal.
            c if c.is_numeric() => {
                let num_chars = gob.gobble_slice_up_to_end(|c| !char::is_numeric(*c));

                let num_str = num_chars.iter().collect::<String>();
                let v = num_str
                    .parse::<u64>()
                    .map_err(|_| TokenError::InvalidInt(num_str))?;
                builder.push_token(SrcToken::IntegerLiteral(sp, v));
                continue;
            }

            _ => return Err(TokenError::UnexpectedChar(cur, gob.at())),
        }

        // Fall-through for the single-char punctuation arms above.
        gob.gobble_one();
    }

    // Close any indentation still open at end of input.
    builder.finish(sp_tbl.expect_end())
}
471
472fn try_parse_keyword(s: &str, sp: SrcPos) -> Option<SrcToken> {
473 Some(match s {
474 "import" => SrcToken::Import(sp),
475 "as" => SrcToken::As(sp),
476 "class" => SrcToken::Class(sp),
477 "null" => SrcToken::Null(sp),
478 _ => return None,
479 })
480}
481
#[cfg(test)]
mod tests {
    use super::{parse_char_array_to_tokens, TaggedToken, TokenSeqBuilder};

    /// Lexes `src`, panicking on any tokenizer error; shared by the parse tests.
    fn lex(src: &str) -> Vec<super::SrcToken> {
        let chars: Vec<char> = src.chars().collect();
        parse_char_array_to_tokens(&chars).expect("test: invoke parse_char_array_to_tokens")
    }

    #[test]
    fn test_whitespace_spaces() {
        let mut b = TokenSeqBuilder::new();

        // First indent fixes the unit at 4 spaces == one level.
        assert_eq!(b.infer_indent_level(&[' '; 4], 5).unwrap(), 1);
        assert_eq!(b.infer_indent_level(&[' '; 4], 5).unwrap(), 1);
        assert_eq!(b.infer_indent_level(&[' '; 8], 5).unwrap(), 2);

        // 7 spaces is not a multiple of the unit.
        let _ = b
            .infer_indent_level(&[' '; 7], 5)
            .expect_err("test: should have errored");

        // Tabs conflict with the established space style.
        let _ = b
            .infer_indent_level(&['\t'], 5)
            .expect_err("test: should have errored");
    }

    #[test]
    fn test_whitespace_tabs() {
        let mut b = TokenSeqBuilder::new();

        assert_eq!(b.infer_indent_level(&['\t'], 5).unwrap(), 1);
        assert_eq!(b.infer_indent_level(&['\t', '\t'], 5).unwrap(), 2);

        // Spaces conflict with the established tab style.
        let _ = b
            .infer_indent_level(&[' ', ' '], 5)
            .expect_err("test: should have errored");
    }

    #[test]
    fn test_parse_const() {
        let toks = lex("FOO_BAR = 1234");
        eprintln!("{toks:#?}");
    }

    #[test]
    fn test_parse_whitespace_consts() {
        let toks = lex("\nFOO = 123\n\n\nBAR = 555\nBAZ = 999");
        eprintln!("{toks:#?}");
    }

    #[test]
    fn test_parse_shl() {
        let toks = lex("\nFOO = 10 << 30");
        eprintln!("{toks:#?}");
    }

    #[test]
    fn test_parse_container_def() {
        let toks = lex("class Point(Container):\n x_pos: int32\n y_pos: int32\n");
        eprintln!("{toks:#?}");
    }

    #[test]
    fn test_parse_container_def_doc() {
        let toks = lex("class Point(Container):\n \"\"\"2-dimensional cartesian point\"\"\"\n x_pos: int32\n y_pos: int32\n");

        let mut found_doc = false;
        for tok in &toks {
            if let TaggedToken::DocString(_, text) = tok {
                assert_eq!(text, "2-dimensional cartesian point");
                found_doc = true;
            }
        }
        assert!(found_doc);

        eprintln!("{toks:#?}");
    }

    #[test]
    fn test_parse_container_def_doc_multiline() {
        let toks = lex("class Point(Container):\n \"\"\"\n 2-dimensional cartesian point\n i love mathematics\n \"\"\"\n x_pos: int32\n y_pos: int32\n");

        let mut found_doc = false;
        for tok in &toks {
            if let TaggedToken::DocString(_, text) = tok {
                assert_eq!(text, "2-dimensional cartesian point\ni love mathematics");
                found_doc = true;
            }
        }
        assert!(found_doc);

        eprintln!("{toks:#?}");
    }

    #[test]
    fn test_parse_container_def_comments() {
        // Same definition with and without comments must tokenize identically
        // once source positions are erased.
        let plain = "\nclass Point(Container):\n x_pos: int32\n y_pos: int32\n";
        let commented = "# comment \nclass Point(Container):# another\n x_pos: int32 # haha yes yes\n y_pos: int32 # hello\n";

        let toks_plain = lex(plain);
        let toks_commented = lex(commented);

        let untag = |toks: &[super::SrcToken]| -> Vec<super::Token> {
            toks.iter().map(|t| t.to_untagged()).collect()
        };

        assert_eq!(untag(&toks_plain), untag(&toks_commented));

        eprintln!("{toks_plain:#?}");
    }
}