1use core::marker::PhantomData;
2
3use alloc::{format, string::ToString, vec, vec::Vec};
4use unicode_segmentation::UnicodeSegmentation;
5
6use crate::{either::Either, lexeme::Lex, span::Span, string::StringExt, Error, Reader};
7
/// A composable lexer fragment: attempts to read one token from a [`Reader`].
///
/// Implementations exist for chars, string literals, ranges, tuples and the
/// combinator wrappers in this module, so tokenizers can be freely nested.
pub trait Tokenizer {
    /// The value produced on a successful match; may borrow from the input
    /// (hence the lifetime parameter on the associated type).
    type Token<'a>;

    /// Attempts to read a token, advancing the reader on success.
    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error>;

    /// Reads a token and discards its value.
    fn eat(&self, reader: &mut Reader<'_, '_>) -> Result<(), Error> {
        let _ = self.to_token(reader)?;
        Ok(())
    }

    /// Reports whether a token matches at the current position.
    ///
    /// NOTE(review): this default delegates to `to_token`, which advances the
    /// reader on success — presumably callers rely on `Reader::parse`-style
    /// rewinding; confirm before assuming `peek` is non-consuming.
    fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
        Ok(self.to_token(reader).is_ok())
    }
}
22
23impl<'b, T> Tokenizer for &'b T
24where
25 T: Tokenizer,
26{
27 type Token<'a> = T::Token<'a>;
28 fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
29 (*self).to_token(reader)
30 }
31
32 fn eat(&self, reader: &mut Reader<'_, '_>) -> Result<(), Error> {
33 (*self).eat(reader)
34 }
35
36 fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
37 (*self).peek(reader)
38 }
39}
40
41impl<'b, T> Tokenizer for &'b mut T
42where
43 T: Tokenizer,
44{
45 type Token<'a> = T::Token<'a>;
46 fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
47 (**self).to_token(reader)
48 }
49
50 fn eat(&self, reader: &mut Reader<'_, '_>) -> Result<(), Error> {
51 (**self).eat(reader)
52 }
53
54 fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
55 (**self).peek(reader)
56 }
57}
58
/// Adapts a closure `Fn(&mut Reader) -> Result<U, Error>` into a tokenizer.
/// `PhantomData<U>` pins the output type without storing a `U` value.
pub struct Func<T, U>(T, PhantomData<U>);

impl<T, U> Func<T, U> {
    /// Wraps `func` so it can be used wherever a `Tokenizer` is expected.
    pub fn new(func: T) -> Func<T, U> {
        Func(func, PhantomData)
    }
}
66
impl<T, U> Tokenizer for Func<T, U>
where
    // Higher-ranked bound: the closure must accept a reader with any lifetimes.
    for<'a, 'b> T: Fn(&mut Reader<'a, 'b>) -> Result<U, Error>,
{
    // The closure's output type does not borrow from the input.
    type Token<'a> = U;

    /// Invokes the wrapped closure directly.
    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
        (self.0)(reader)
    }
}
77
78impl Tokenizer for core::ops::Range<char> {
79 type Token<'a> = Lex<'a>;
80
81 fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
82 let char = reader.parse(Char)?;
83
84 for n in char.as_str().chars() {
85 if !self.contains(&n) {
86 return Err(reader.error(format!("Expected char in range: {:?}", self)));
87 }
88 }
89
90 Ok(char)
91 }
92}
93
94impl Tokenizer for core::ops::RangeInclusive<char> {
95 type Token<'a> = Lex<'a>;
96
97 fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
98 let char = reader.parse(Char)?;
99
100 for n in char.as_str().chars() {
101 if !self.contains(&n) {
102 return Err(reader.error(format!("Expected char in range: {:?}", self)));
103 }
104 }
105
106 Ok(char)
107 }
108}
109
110impl<L, R> Tokenizer for Either<L, R>
111where
112 L: Tokenizer,
113 R: Tokenizer,
114{
115 type Token<'a> = Either<L::Token<'a>, R::Token<'a>>;
116 fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
117 match self {
118 Self::Left(left) => Ok(Either::Left(left.to_token(reader)?)),
119 Self::Right(right) => Ok(Either::Right(right.to_token(reader)?)),
120 }
121 }
122
123 fn eat(&self, reader: &mut Reader<'_, '_>) -> Result<(), Error> {
124 match self {
125 Self::Left(left) => left.eat(reader),
126 Self::Right(right) => right.eat(reader),
127 }
128 }
129
130 fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
131 match self {
132 Self::Left(left) => left.peek(reader),
133 Self::Right(right) => right.peek(reader),
134 }
135 }
136}
137#[derive(Debug, Clone, Copy, Default)]
139pub struct Ws;
140
141impl Tokenizer for Ws {
142 type Token<'a> = Span;
143 fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
144 let start = reader.position();
145
146 let first = reader.eat_ch()?;
147
148 if !first.is_whitespace() {
149 return Err(reader.error("Expected whitespace"));
150 }
151
152 loop {
153 let Some(ch) = reader.peek_ch() else {
154 break;
155 };
156
157 if !ch.is_whitespace() {
158 break;
159 }
160
161 reader.eat_ch()?;
162 }
163
164 Ok(Span {
165 start,
166 end: reader.position(),
167 })
168 }
169}
170
171impl<'lit> Tokenizer for &'lit str {
173 type Token<'a> = Span;
174 fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
175 let tokens = self.graphemes(true);
176
177 let start = reader.position();
178
179 for token in tokens {
180 let next = reader.eat_ch()?;
181 if token != next {
182 return Err(reader.error(self.to_string()));
183 }
184 }
185
186 if start == reader.position() {
187 return Err(reader.error(self.to_string()));
188 }
189
190 Ok(Span {
191 start,
192 end: reader.position(),
193 })
194 }
195
196 fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
197 let tokens = self.graphemes(true);
198 for (idx, next) in tokens.enumerate() {
199 if Some(next) == reader.peek_chn(idx) {
200 continue;
201 }
202 return Ok(false);
203 }
204
205 Ok(true)
206 }
207}
208
209impl Tokenizer for char {
211 type Token<'a> = Span;
212 fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
213 let start = reader.position();
214
215 let next = reader.eat_ch()?;
216
217 match next.chars().next() {
218 Some(next) if next == *self => Ok(Span {
219 start,
220 end: reader.position(),
221 }),
222 _ => return Err(reader.error(format!("expected '{}'", self))),
223 }
224 }
225
226 fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
227 let Some(next) = reader.peek_ch() else {
228 return Ok(false);
229 };
230 match next.chars().next() {
231 Some(next) if next == *self => Ok(true),
232 _ => return Ok(false),
233 }
234 }
235}
236
237#[derive(Debug, Clone, Copy, Default)]
241pub struct EOF;
242
243impl Tokenizer for EOF {
244 type Token<'a> = ();
245
246 fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
247 if reader.eof() {
248 Ok(())
249 } else {
250 Err(reader.error("expected eof"))
251 }
252 }
253
254 fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
255 Ok(reader.eof())
256 }
257}
258
/// Matches one digit in the given radix (e.g. `Digit(16)` for hex).
#[derive(Debug, Clone, Copy)]
pub struct Digit(pub u32);

impl Default for Digit {
    /// Defaults to base 10.
    fn default() -> Self {
        Digit(10)
    }
}
268
269impl Tokenizer for Digit {
270 type Token<'a> = u32;
271
272 fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
273 let ch = reader.eat_ch()?;
274
275 if !ch.is_digit(self.0) {
276 return Err(reader.error("expected digit"));
277 }
278
279 Ok(ch.chars().next().unwrap().to_digit(self.0).unwrap())
280 }
281
282 fn eat(&self, reader: &mut Reader<'_, '_>) -> Result<(), Error> {
283 let ch = reader.eat_ch()?;
284
285 if !ch.is_digit(self.0) {
286 return Err(reader.error("expected digit"));
287 }
288
289 Ok(())
290 }
291
292 fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
293 let Some(ch) = reader.peek_ch() else {
294 return Ok(false);
295 };
296
297 Ok(ch.is_digit(self.0))
298 }
299}
300
301#[derive(Debug, Clone, Copy, Default)]
303pub struct Char;
304
305impl Tokenizer for Char {
306 type Token<'a> = Lex<'a>;
307 fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
308 let start = reader.position();
309 let ch = reader.eat_ch()?;
310 let end = reader.position();
311 Ok(Lex {
312 value: ch,
313 span: Span { start, end },
314 })
315 }
316
317 fn eat(&self, reader: &mut Reader<'_, '_>) -> Result<(), Error> {
318 let _ = reader.eat_ch()?;
319 Ok(())
320 }
321
322 fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
323 Ok(if reader.eof() { false } else { false })
324 }
325}
326
327#[derive(Debug, Clone, Copy, Default)]
329pub struct Alphabetic;
330
331impl Tokenizer for Alphabetic {
332 type Token<'a> = Lex<'a>;
333
334 fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
335 let ch = reader.parse(Char)?;
336 if ch.value.is_alphabetic() {
337 Ok(ch)
338 } else {
339 Err(reader.error("expected alphabetic"))
340 }
341 }
342}
343
344#[derive(Debug, Clone, Copy, Default)]
346pub struct AlphaNumeric;
347
348impl Tokenizer for AlphaNumeric {
349 type Token<'a> = Lex<'a>;
350
351 fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
352 let ch = reader.parse(Char)?;
353 if ch.value.is_alphanumeric() {
354 Ok(ch)
355 } else {
356 Err(reader.error("expected alphanumeric"))
357 }
358 }
359}
360
361#[derive(Debug, Clone, Copy, Default)]
364pub struct Punctuation;
365
366impl Tokenizer for Punctuation {
367 type Token<'a> = Lex<'a>;
368
369 fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
370 let ch = reader.parse(Char)?;
371 if ch.value.is_ascii_punctuation() {
372 Ok(ch)
373 } else {
374 Err(reader.error("expected punctuation"))
375 }
376 }
377}
378
379#[derive(Debug, Clone, Copy, Default)]
381pub struct Opt<T>(pub T);
382
383impl<T> Tokenizer for Opt<T>
384where
385 T: Tokenizer,
386{
387 type Token<'a> = Option<T::Token<'a>>;
388
389 fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
390 Ok(reader.parse(&self.0).ok())
391 }
392
393 fn eat(&self, reader: &mut Reader<'_, '_>) -> Result<(), Error> {
394 reader.eat(&self.0).ok();
395 Ok(())
396 }
397
398 fn peek(&self, _reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
399 Ok(true)
400 }
401}
402
403#[derive(Debug, Clone, Copy, Default)]
405pub struct Or<L, R>(pub L, pub R);
406
407impl<L, R> Tokenizer for Or<L, R>
408where
409 L: Tokenizer,
410 R: Tokenizer,
411{
412 type Token<'a> = Either<L::Token<'a>, R::Token<'a>>;
413 fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
414 let line_no = reader.line_no();
415 let col_no = reader.col_no();
416
417 let left_err = match reader.parse(&self.0) {
418 Ok(ret) => return Ok(Either::Left(ret)),
419 Err(err) => err,
420 };
421
422 let right_err = match reader.parse(&self.1) {
423 Ok(ret) => return Ok(Either::Right(ret)),
424 Err(err) => err,
425 };
426
427 Err(Error::new_with(
428 "either",
429 reader.position(),
430 line_no,
431 col_no,
432 vec![left_err, right_err],
433 ))
434 }
435
436 fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
437 Ok(reader.peek(&self.0)? || reader.peek(&self.1)?)
438 }
439}
440
441#[derive(Debug, Clone, Copy)]
443pub struct OneOrMany<T>(pub T);
444
445impl<T> Tokenizer for OneOrMany<T>
446where
447 T: Tokenizer,
448{
449 type Token<'a> = Vec<T::Token<'a>>;
450 fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
451 let mut output = vec![reader.parse(&self.0)?];
452
453 loop {
454 let next = match reader.parse(&self.0) {
455 Ok(next) => next,
456 Err(_) => break,
457 };
458
459 output.push(next);
460 }
461
462 Ok(output)
463 }
464
465 fn eat(&self, reader: &mut Reader<'_, '_>) -> Result<(), Error> {
466 reader.eat(&self.0)?;
467
468 loop {
469 match reader.eat(&self.0) {
470 Ok(_) => continue,
471 Err(_) => break,
472 };
473 }
474
475 Ok(())
476 }
477
478 fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
479 reader.peek(&self.0)
480 }
481}
482
483#[derive(Debug, Clone, Copy)]
485pub struct Many<T>(pub T);
486
487impl<T> Tokenizer for Many<T>
488where
489 T: Tokenizer,
490{
491 type Token<'a> = Vec<T::Token<'a>>;
492 fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
493 let mut output = Vec::default();
494
495 loop {
496 let next = match reader.parse(&self.0) {
497 Ok(next) => next,
498 Err(_) => break,
499 };
500
501 output.push(next);
502 }
503
504 Ok(output)
505 }
506
507 fn eat(&self, reader: &mut Reader<'_, '_>) -> Result<(), Error> {
508 loop {
509 match reader.eat(&self.0) {
510 Ok(_) => continue,
511 Err(_) => break,
512 };
513 }
514 Ok(())
515 }
516
517 fn peek(&self, _reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
518 Ok(true)
519 }
520}
521
522impl<'b, T> Tokenizer for &'b [T]
523where
524 T: Tokenizer,
525{
526 type Token<'a> = T::Token<'a>;
527 fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
528 let mut errors = Vec::default();
529 for tokenizer in self.iter() {
530 match tokenizer.to_token(reader) {
531 Ok(ret) => return Ok(ret),
532 Err(err) => {
533 errors.push(err);
534 }
535 }
536 }
537
538 Err(reader.error_with("one of", errors))
539 }
540
541 fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
542 Ok(self.iter().any(|m| reader.peek(m).unwrap_or_default()))
543 }
544}
545
546pub struct Spanned<T>(pub T);
548
549impl<T> Tokenizer for Spanned<T>
550where
551 T: Tokenizer,
552{
553 type Token<'a> = Span;
554 fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
555 let start = reader.position();
556 reader.eat(&self.0)?;
557 let end = reader.position();
558 Ok(Span { start, end })
559 }
560
561 fn eat(&self, reader: &mut Reader<'_, '_>) -> Result<(), Error> {
562 reader.eat(&self.0)
563 }
564
565 fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
566 reader.peek(&self.0)
567 }
568}
569
570#[derive(Debug, Clone, Copy)]
572pub struct Not<T>(pub T);
573
574impl<T> Tokenizer for Not<T>
575where
576 T: Tokenizer,
577{
578 type Token<'a> = ();
579
580 fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
581 if reader.peek(&self.0)? {
582 let ch = reader.peek_ch().unwrap_or("EOF");
583 return Err(reader.error(format!("unexpected token: {ch}")));
584 }
585 Ok(())
586 }
587
588 fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
589 Ok(!reader.peek(&self.0)?)
590 }
591}
592
593#[derive(Debug, Clone, Copy)]
594pub struct Test<T>(pub T);
595
596impl<T> Tokenizer for Test<T>
597where
598 T: Tokenizer,
599{
600 type Token<'a> = T::Token<'a>;
601
602 fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
603 reader.parse(&self.0)
604 }
605
606 fn eat(&self, reader: &mut Reader<'_, '_>) -> Result<(), Error> {
607 reader.eat(&self.0)
608 }
609
610 fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
611 Ok(self.to_token(reader).is_ok())
612 }
613}
614
/// Builds an ordered-choice tokenizer from a list of tokenizers by nesting
/// them into right-associated [`Or`]s: `any![a, b, c]` == `Or(a, Or(b, c))`.
#[macro_export]
macro_rules! any {
    // Base case: a single tokenizer needs no combinator.
    [$one: expr] => {
        $one
    };
    // Recursive case: wrap the head and the folded tail in an `Or`.
    [$first: expr, $($rest: expr),*] => {
        $crate::token::Or($first, $crate::any!($($rest),*))
    };

}
625
/// Implements `Tokenizer` for tuples of tokenizers up to the arity given in
/// the invocation below. A tuple matches each element in sequence and yields
/// a tuple of the element tokens.
macro_rules! tokenizer {
    // Base case: one-element tuple just forwards to its single tokenizer.
    ($first: ident) => {
        impl<$first: Tokenizer> Tokenizer for ($first,) {
            type Token<'a> = $first::Token<'a>;

            fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
                reader.parse(&self.0)
            }

            fn eat<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<(), Error> {
                reader.eat(&self.0)
            }

            fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
                Ok(reader.peek(&self.0)?)
            }
        }
    };
    // Recursive case: implement for the full tuple, then recurse on the tail
    // so every smaller arity is covered too.
    ($first:ident $($rest:ident)*) => {
        tokenizer!($($rest)*);

        impl<$first: Tokenizer, $($rest: Tokenizer),*> Tokenizer for ($first,$($rest),*) {
            type Token<'a> = ($first::Token<'a>, $($rest::Token<'a>),*);

            #[allow(non_snake_case)]
            fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
                // Destructure the tuple and parse each element in order.
                let ($first, $($rest),*) = self;
                Ok((
                    reader.parse(&$first)?,
                    $(
                        reader.parse(&$rest)?
                    ),*
                ))
            }

            #[allow(non_snake_case)]
            fn eat<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<(), Error> {
                let ($first, $($rest),*) = self;

                reader.eat(&$first)?;
                $(
                    reader.eat(&$rest)?;
                )*

                Ok(())
            }

            fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
                // NOTE(review): only the first element is peeked — a true
                // result does not guarantee the whole tuple will match.
                Ok(reader.peek(&self.0)?)
            }
        }
    }
}

// Generate tuple impls for arities 1 through 12.
tokenizer!(T1 T2 T3 T4 T5 T6 T7 T8 T9 T10 T11 T12);
681
#[cfg(test)]
mod test {
    use crate::Input;

    use super::*;

    // Example user-defined tokenizer: greedily matches a run of alphabetic
    // characters and returns the matched text with its span.
    struct Word;

    impl Tokenizer for Word {
        type Token<'a> = Lex<'a>;

        fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
            // Require at least one alphabetic char before committing.
            if !reader.peek(Alphabetic)? {
                return Err(reader.error("expected alphabetic"));
            }

            let start = reader.position();

            // Consume alphabetic chars until EOF or a non-letter.
            loop {
                if reader.eof() {
                    break;
                }

                if !reader.peek(Alphabetic)? {
                    break;
                }

                reader.eat(Alphabetic)?;
            }

            let span = Span::new(start, reader.position());

            // An empty span means nothing was consumed — not a word.
            if !span.is_valid() {
                return Err(reader.error("no word"));
            }

            Ok(Lex::new(span.slice(reader.source()).unwrap(), span))
        }
    }

    // `Opt` must not consume input when the inner tokenizer fails.
    #[test]
    fn opt() {
        let mut input = Input::new("WS");
        assert_eq!(input.parse(Opt("He")).unwrap(), None,);
        assert_eq!(input.position(), 0);
        assert_eq!(input.peek_ch(), Some("W"));
    }

    // `Char` consumes exactly one character with a one-wide span.
    #[test]
    fn char() {
        let mut input = Input::new("char");
        assert_eq!(
            input.parse(Char).unwrap(),
            Lex {
                value: "c",
                span: Span { start: 0, end: 1 }
            }
        );
    }

    // `Alphabetic` accepts letters and rejects punctuation.
    #[test]
    fn alphabetic() {
        let mut input = Input::new("char");
        assert_eq!(
            input.parse(Alphabetic).unwrap(),
            Lex {
                value: "c",
                span: Span { start: 0, end: 1 }
            }
        );

        let mut input = Input::new("-har");
        assert!(input.parse(Alphabetic).is_err());
    }

    // `AlphaNumeric` accepts digits as well as letters.
    #[test]
    fn alphabetic_numeric() {
        let mut input = Input::new("2char");
        assert_eq!(
            input.parse(AlphaNumeric).unwrap(),
            Lex {
                value: "2",
                span: Span { start: 0, end: 1 }
            }
        );

        let mut input = Input::new("-har");
        assert!(input.parse(AlphaNumeric).is_err());
    }

    // `Spanned` reduces the inner token to the span it covered.
    #[test]
    fn spanned() {
        let mut input = Input::new("Test this string");
        assert_eq!(
            input.parse(Spanned(Word)).unwrap(),
            Span { start: 0, end: 4 }
        );
    }

    // Char ranges match a single in-range character.
    #[test]
    fn range() {
        let mut input = Input::new("b");
        assert_eq!(
            input.parse('a'..'z').unwrap(),
            Lex::new("b", Span::new(0, 1))
        )
    }

    // `Not` succeeds when the inner tokenizer fails, and vice versa.
    #[test]
    fn not() {
        assert_eq!(
            Input::new("=-").parse(('=', Not('='))).unwrap(),
            (Span::new(0, 1), ())
        );

        assert!(Input::new("==").parse(('=', Not('='))).is_err())
    }

    // `Func` adapts a plain closure into a tokenizer.
    #[test]
    fn func() {
        let mut input = Input::new("Hello");

        let ret = input
            .parse(Func::new(|ctx: &mut Reader| ctx.parse("Hello")))
            .unwrap();

        assert_eq!(ret, Span::new(0, 5));
    }
}