// udled/token.rs

1use core::marker::PhantomData;
2
3use alloc::{format, string::ToString, vec, vec::Vec};
4use unicode_segmentation::UnicodeSegmentation;
5
6use crate::{either::Either, lexeme::Lex, span::Span, string::StringExt, Error, Reader};
7
/// A parser combinator: reads input at the reader's current position and
/// produces a typed token, or fails with an [`Error`].
pub trait Tokenizer {
    /// The value produced on a successful match; `'a` is the lifetime of the
    /// source text being read.
    type Token<'a>;

    /// Parse one token from `reader`, advancing it on success.
    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error>;

    /// Parse one token and discard it.
    fn eat(&self, reader: &mut Reader<'_, '_>) -> Result<(), Error> {
        let _ = self.to_token(reader)?;
        Ok(())
    }

    /// Report whether a token matches at the current position.
    /// NOTE(review): the default implementation runs `to_token`, which may
    /// advance the reader on success — confirm `Reader::parse`/callers rewind,
    /// or override with a non-consuming lookahead (as most impls here do).
    fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
        Ok(self.to_token(reader).is_ok())
    }
}
22
23impl<'b, T> Tokenizer for &'b T
24where
25    T: Tokenizer,
26{
27    type Token<'a> = T::Token<'a>;
28    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
29        (*self).to_token(reader)
30    }
31
32    fn eat(&self, reader: &mut Reader<'_, '_>) -> Result<(), Error> {
33        (*self).eat(reader)
34    }
35
36    fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
37        (*self).peek(reader)
38    }
39}
40
41impl<'b, T> Tokenizer for &'b mut T
42where
43    T: Tokenizer,
44{
45    type Token<'a> = T::Token<'a>;
46    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
47        (**self).to_token(reader)
48    }
49
50    fn eat(&self, reader: &mut Reader<'_, '_>) -> Result<(), Error> {
51        (**self).eat(reader)
52    }
53
54    fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
55        (**self).peek(reader)
56    }
57}
58
/// Adapts a closure `Fn(&mut Reader) -> Result<U, Error>` into a [`Tokenizer`].
/// The `PhantomData<U>` pins the output type produced by the closure.
pub struct Func<T, U>(T, PhantomData<U>);

impl<T, U> Func<T, U> {
    /// Wrap `func` so it can be used wherever a tokenizer is expected.
    pub fn new(func: T) -> Func<T, U> {
        Func(func, PhantomData)
    }
}
66
impl<T, U> Tokenizer for Func<T, U>
where
    // Higher-ranked bound: the closure must accept a reader with any lifetimes.
    for<'a, 'b> T: Fn(&mut Reader<'a, 'b>) -> Result<U, Error>,
{
    type Token<'a> = U;

    // Parsing simply invokes the wrapped closure.
    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
        (self.0)(reader)
    }
}
77
78impl Tokenizer for core::ops::Range<char> {
79    type Token<'a> = Lex<'a>;
80
81    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
82        let char = reader.parse(Char)?;
83
84        for n in char.as_str().chars() {
85            if !self.contains(&n) {
86                return Err(reader.error(format!("Expected char in range: {:?}", self)));
87            }
88        }
89
90        Ok(char)
91    }
92}
93
94impl Tokenizer for core::ops::RangeInclusive<char> {
95    type Token<'a> = Lex<'a>;
96
97    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
98        let char = reader.parse(Char)?;
99
100        for n in char.as_str().chars() {
101            if !self.contains(&n) {
102                return Err(reader.error(format!("Expected char in range: {:?}", self)));
103            }
104        }
105
106        Ok(char)
107    }
108}
109
110impl<L, R> Tokenizer for Either<L, R>
111where
112    L: Tokenizer,
113    R: Tokenizer,
114{
115    type Token<'a> = Either<L::Token<'a>, R::Token<'a>>;
116    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
117        match self {
118            Self::Left(left) => Ok(Either::Left(left.to_token(reader)?)),
119            Self::Right(right) => Ok(Either::Right(right.to_token(reader)?)),
120        }
121    }
122
123    fn eat(&self, reader: &mut Reader<'_, '_>) -> Result<(), Error> {
124        match self {
125            Self::Left(left) => left.eat(reader),
126            Self::Right(right) => right.eat(reader),
127        }
128    }
129
130    fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
131        match self {
132            Self::Left(left) => left.peek(reader),
133            Self::Right(right) => right.peek(reader),
134        }
135    }
136}
137/// Match any whitespace
138#[derive(Debug, Clone, Copy, Default)]
139pub struct Ws;
140
141impl Tokenizer for Ws {
142    type Token<'a> = Span;
143    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
144        let start = reader.position();
145
146        let first = reader.eat_ch()?;
147
148        if !first.is_whitespace() {
149            return Err(reader.error("Expected whitespace"));
150        }
151
152        loop {
153            let Some(ch) = reader.peek_ch() else {
154                break;
155            };
156
157            if !ch.is_whitespace() {
158                break;
159            }
160
161            reader.eat_ch()?;
162        }
163
164        Ok(Span {
165            start,
166            end: reader.position(),
167        })
168    }
169}
170
171/// Match a literal string
172impl<'lit> Tokenizer for &'lit str {
173    type Token<'a> = Span;
174    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
175        let tokens = self.graphemes(true);
176
177        let start = reader.position();
178
179        for token in tokens {
180            let next = reader.eat_ch()?;
181            if token != next {
182                return Err(reader.error(self.to_string()));
183            }
184        }
185
186        if start == reader.position() {
187            return Err(reader.error(self.to_string()));
188        }
189
190        Ok(Span {
191            start,
192            end: reader.position(),
193        })
194    }
195
196    fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
197        let tokens = self.graphemes(true);
198        for (idx, next) in tokens.enumerate() {
199            if Some(next) == reader.peek_chn(idx) {
200                continue;
201            }
202            return Ok(false);
203        }
204
205        Ok(true)
206    }
207}
208
209/// Match a literal char
210impl Tokenizer for char {
211    type Token<'a> = Span;
212    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
213        let start = reader.position();
214
215        let next = reader.eat_ch()?;
216
217        match next.chars().next() {
218            Some(next) if next == *self => Ok(Span {
219                start,
220                end: reader.position(),
221            }),
222            _ => return Err(reader.error(format!("expected '{}'", self))),
223        }
224    }
225
226    fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
227        let Some(next) = reader.peek_ch() else {
228            return Ok(false);
229        };
230        match next.chars().next() {
231            Some(next) if next == *self => Ok(true),
232            _ => return Ok(false),
233        }
234    }
235}
236
237// Helpers
238
239/// Match EOF
240#[derive(Debug, Clone, Copy, Default)]
241pub struct EOF;
242
243impl Tokenizer for EOF {
244    type Token<'a> = ();
245
246    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
247        if reader.eof() {
248            Ok(())
249        } else {
250            Err(reader.error("expected eof"))
251        }
252    }
253
254    fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
255        Ok(reader.eof())
256    }
257}
258
/// Match a digit with a given radix
// The wrapped `u32` is the radix passed to `is_digit`/`to_digit`.
#[derive(Debug, Clone, Copy)]
pub struct Digit(pub u32);

impl Default for Digit {
    /// Defaults to base 10.
    fn default() -> Self {
        Digit(10)
    }
}
268
269impl Tokenizer for Digit {
270    type Token<'a> = u32;
271
272    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
273        let ch = reader.eat_ch()?;
274
275        if !ch.is_digit(self.0) {
276            return Err(reader.error("expected digit"));
277        }
278
279        Ok(ch.chars().next().unwrap().to_digit(self.0).unwrap())
280    }
281
282    fn eat(&self, reader: &mut Reader<'_, '_>) -> Result<(), Error> {
283        let ch = reader.eat_ch()?;
284
285        if !ch.is_digit(self.0) {
286            return Err(reader.error("expected digit"));
287        }
288
289        Ok(())
290    }
291
292    fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
293        let Some(ch) = reader.peek_ch() else {
294            return Ok(false);
295        };
296
297        Ok(ch.is_digit(self.0))
298    }
299}
300
301/// Match any unnicode graphme
302#[derive(Debug, Clone, Copy, Default)]
303pub struct Char;
304
305impl Tokenizer for Char {
306    type Token<'a> = Lex<'a>;
307    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
308        let start = reader.position();
309        let ch = reader.eat_ch()?;
310        let end = reader.position();
311        Ok(Lex {
312            value: ch,
313            span: Span { start, end },
314        })
315    }
316
317    fn eat(&self, reader: &mut Reader<'_, '_>) -> Result<(), Error> {
318        let _ = reader.eat_ch()?;
319        Ok(())
320    }
321
322    fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
323        Ok(if reader.eof() { false } else { false })
324    }
325}
326
327/// Match a alphabetic character
328#[derive(Debug, Clone, Copy, Default)]
329pub struct Alphabetic;
330
331impl Tokenizer for Alphabetic {
332    type Token<'a> = Lex<'a>;
333
334    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
335        let ch = reader.parse(Char)?;
336        if ch.value.is_alphabetic() {
337            Ok(ch)
338        } else {
339            Err(reader.error("expected alphabetic"))
340        }
341    }
342}
343
344/// Match any unicode alphanumeric character
345#[derive(Debug, Clone, Copy, Default)]
346pub struct AlphaNumeric;
347
348impl Tokenizer for AlphaNumeric {
349    type Token<'a> = Lex<'a>;
350
351    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
352        let ch = reader.parse(Char)?;
353        if ch.value.is_alphanumeric() {
354            Ok(ch)
355        } else {
356            Err(reader.error("expected alphanumeric"))
357        }
358    }
359}
360
361/// Match any ascii punctuation
362/// Match a punctuation
363#[derive(Debug, Clone, Copy, Default)]
364pub struct Punctuation;
365
366impl Tokenizer for Punctuation {
367    type Token<'a> = Lex<'a>;
368
369    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
370        let ch = reader.parse(Char)?;
371        if ch.value.is_ascii_punctuation() {
372            Ok(ch)
373        } else {
374            Err(reader.error("expected punctuation"))
375        }
376    }
377}
378
379/// Optional match T
380#[derive(Debug, Clone, Copy, Default)]
381pub struct Opt<T>(pub T);
382
383impl<T> Tokenizer for Opt<T>
384where
385    T: Tokenizer,
386{
387    type Token<'a> = Option<T::Token<'a>>;
388
389    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
390        Ok(reader.parse(&self.0).ok())
391    }
392
393    fn eat(&self, reader: &mut Reader<'_, '_>) -> Result<(), Error> {
394        reader.eat(&self.0).ok();
395        Ok(())
396    }
397
398    fn peek(&self, _reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
399        Ok(true)
400    }
401}
402
403/// Match either L or R
404#[derive(Debug, Clone, Copy, Default)]
405pub struct Or<L, R>(pub L, pub R);
406
407impl<L, R> Tokenizer for Or<L, R>
408where
409    L: Tokenizer,
410    R: Tokenizer,
411{
412    type Token<'a> = Either<L::Token<'a>, R::Token<'a>>;
413    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
414        let line_no = reader.line_no();
415        let col_no = reader.col_no();
416
417        let left_err = match reader.parse(&self.0) {
418            Ok(ret) => return Ok(Either::Left(ret)),
419            Err(err) => err,
420        };
421
422        let right_err = match reader.parse(&self.1) {
423            Ok(ret) => return Ok(Either::Right(ret)),
424            Err(err) => err,
425        };
426
427        Err(Error::new_with(
428            "either",
429            reader.position(),
430            line_no,
431            col_no,
432            vec![left_err, right_err],
433        ))
434    }
435
436    fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
437        Ok(reader.peek(&self.0)? || reader.peek(&self.1)?)
438    }
439}
440
441/// One or many T's
442#[derive(Debug, Clone, Copy)]
443pub struct OneOrMany<T>(pub T);
444
445impl<T> Tokenizer for OneOrMany<T>
446where
447    T: Tokenizer,
448{
449    type Token<'a> = Vec<T::Token<'a>>;
450    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
451        let mut output = vec![reader.parse(&self.0)?];
452
453        loop {
454            let next = match reader.parse(&self.0) {
455                Ok(next) => next,
456                Err(_) => break,
457            };
458
459            output.push(next);
460        }
461
462        Ok(output)
463    }
464
465    fn eat(&self, reader: &mut Reader<'_, '_>) -> Result<(), Error> {
466        reader.eat(&self.0)?;
467
468        loop {
469            match reader.eat(&self.0) {
470                Ok(_) => continue,
471                Err(_) => break,
472            };
473        }
474
475        Ok(())
476    }
477
478    fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
479        reader.peek(&self.0)
480    }
481}
482
483/// Many of T's
484#[derive(Debug, Clone, Copy)]
485pub struct Many<T>(pub T);
486
487impl<T> Tokenizer for Many<T>
488where
489    T: Tokenizer,
490{
491    type Token<'a> = Vec<T::Token<'a>>;
492    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
493        let mut output = Vec::default();
494
495        loop {
496            let next = match reader.parse(&self.0) {
497                Ok(next) => next,
498                Err(_) => break,
499            };
500
501            output.push(next);
502        }
503
504        Ok(output)
505    }
506
507    fn eat(&self, reader: &mut Reader<'_, '_>) -> Result<(), Error> {
508        loop {
509            match reader.eat(&self.0) {
510                Ok(_) => continue,
511                Err(_) => break,
512            };
513        }
514        Ok(())
515    }
516
517    fn peek(&self, _reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
518        Ok(true)
519    }
520}
521
522impl<'b, T> Tokenizer for &'b [T]
523where
524    T: Tokenizer,
525{
526    type Token<'a> = T::Token<'a>;
527    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
528        let mut errors = Vec::default();
529        for tokenizer in self.iter() {
530            match tokenizer.to_token(reader) {
531                Ok(ret) => return Ok(ret),
532                Err(err) => {
533                    errors.push(err);
534                }
535            }
536        }
537
538        Err(reader.error_with("one of", errors))
539    }
540
541    fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
542        Ok(self.iter().any(|m| reader.peek(m).unwrap_or_default()))
543    }
544}
545
546/// Returns span of T
547pub struct Spanned<T>(pub T);
548
549impl<T> Tokenizer for Spanned<T>
550where
551    T: Tokenizer,
552{
553    type Token<'a> = Span;
554    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
555        let start = reader.position();
556        reader.eat(&self.0)?;
557        let end = reader.position();
558        Ok(Span { start, end })
559    }
560
561    fn eat(&self, reader: &mut Reader<'_, '_>) -> Result<(), Error> {
562        reader.eat(&self.0)
563    }
564
565    fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
566        reader.peek(&self.0)
567    }
568}
569
570/// Match anything but T
571#[derive(Debug, Clone, Copy)]
572pub struct Not<T>(pub T);
573
574impl<T> Tokenizer for Not<T>
575where
576    T: Tokenizer,
577{
578    type Token<'a> = ();
579
580    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
581        if reader.peek(&self.0)? {
582            let ch = reader.peek_ch().unwrap_or("EOF");
583            return Err(reader.error(format!("unexpected token: {ch}")));
584        }
585        Ok(())
586    }
587
588    fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
589        Ok(!reader.peek(&self.0)?)
590    }
591}
592
// Transparent wrapper around another tokenizer: parsing and eating delegate
// straight to the inner value.
#[derive(Debug, Clone, Copy)]
pub struct Test<T>(pub T);

impl<T> Tokenizer for Test<T>
where
    T: Tokenizer,
{
    type Token<'a> = T::Token<'a>;

    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
        reader.parse(&self.0)
    }

    fn eat(&self, reader: &mut Reader<'_, '_>) -> Result<(), Error> {
        reader.eat(&self.0)
    }

    // NOTE(review): this peek performs a full parse, which may advance the
    // reader on success — confirm `reader.parse` restores position if a
    // non-consuming lookahead is intended here.
    fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
        Ok(self.to_token(reader).is_ok())
    }
}
614
/// Builds a right-nested chain of [`Or`](crate::token::Or) combinators:
/// `any![a, b, c]` expands to `Or(a, Or(b, c))`.
#[macro_export]
macro_rules! any {
    // Base case: a single tokenizer needs no combinator.
    [$one: expr] => {
        $one
    };
    // Recursive case: fold the remaining list into the right-hand side.
    [$first: expr, $($rest: expr),*] => {
        $crate::token::Or($first, $crate::any!($($rest),*))
    };

}
625
// Implements `Tokenizer` for tuples `(T1,)` through the full arity given in
// the invocation below: every element is parsed in sequence and the tokens
// are returned as a tuple of the same arity.
macro_rules! tokenizer {
    // Base case: a 1-tuple is transparent — it yields its element's token.
    ($first: ident) => {
        impl<$first: Tokenizer> Tokenizer for ($first,) {
            type Token<'a> = $first::Token<'a>;

            fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
                reader.parse(&self.0)
            }

            fn eat<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<(), Error> {
                reader.eat(&self.0)
            }

            fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
                Ok(reader.peek(&self.0)?)
            }
        }
    };
    // Recursive case: implement for the full tuple, then recurse on the tail
    // so every smaller arity gets an impl as well.
    ($first:ident $($rest:ident)*) => {
        tokenizer!($($rest)*);

        impl<$first: Tokenizer, $($rest: Tokenizer),*> Tokenizer for ($first,$($rest),*) {
            type Token<'a> = ($first::Token<'a>, $($rest::Token<'a>),*);

            #[allow(non_snake_case)]
            fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
                let ($first, $($rest),*) = self;
                Ok((
                    reader.parse(&$first)?,
                    $(
                        reader.parse(&$rest)?
                    ),*
                ))
            }

            #[allow(non_snake_case)]
            fn eat<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<(), Error> {
                let ($first, $($rest),*) = self;

                reader.eat(&$first)?;
                $(
                    reader.eat(&$rest)?;
                )*

                Ok(())
            }

            // NOTE(review): peeking a tuple inspects only the first element;
            // a later element can still fail during `to_token` — confirm this
            // is the intended lookahead contract.
            fn peek(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
                Ok(reader.peek(&self.0)?)
            }
        }
    }
}

// Generate tuple impls up to arity 12.
tokenizer!(T1 T2 T3 T4 T5 T6 T7 T8 T9 T10 T11 T12);
681
#[cfg(test)]
mod test {
    use crate::Input;

    use super::*;

    // Example user-defined tokenizer: matches a maximal run of alphabetic
    // graphemes and returns it as a `Lex`.
    struct Word;

    impl Tokenizer for Word {
        type Token<'a> = Lex<'a>;

        fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
            // Require at least one alphabetic grapheme up front.
            if !reader.peek(Alphabetic)? {
                return Err(reader.error("expected alphabetic"));
            }

            let start = reader.position();

            // Consume alphabetic graphemes until EOF or a non-match.
            loop {
                if reader.eof() {
                    break;
                }

                if !reader.peek(Alphabetic)? {
                    break;
                }

                reader.eat(Alphabetic)?;
            }

            let span = Span::new(start, reader.position());

            if !span.is_valid() {
                return Err(reader.error("no word"));
            }

            Ok(Lex::new(span.slice(reader.source()).unwrap(), span))
        }
    }

    // A failed optional parse must yield `None` and consume no input.
    #[test]
    fn opt() {
        let mut input = Input::new("WS");
        assert_eq!(input.parse(Opt("He")).unwrap(), None,);
        assert_eq!(input.position(), 0);
        assert_eq!(input.peek_ch(), Some("W"));
    }

    // `Char` yields the next grapheme together with its span.
    #[test]
    fn char() {
        let mut input = Input::new("char");
        assert_eq!(
            input.parse(Char).unwrap(),
            Lex {
                value: "c",
                span: Span { start: 0, end: 1 }
            }
        );
    }

    // `Alphabetic` accepts letters and rejects punctuation.
    #[test]
    fn alphabetic() {
        let mut input = Input::new("char");
        assert_eq!(
            input.parse(Alphabetic).unwrap(),
            Lex {
                value: "c",
                span: Span { start: 0, end: 1 }
            }
        );

        let mut input = Input::new("-har");
        assert!(input.parse(Alphabetic).is_err());
    }

    // `AlphaNumeric` accepts digits as well as letters.
    #[test]
    fn alphabetic_numeric() {
        let mut input = Input::new("2char");
        assert_eq!(
            input.parse(AlphaNumeric).unwrap(),
            Lex {
                value: "2",
                span: Span { start: 0, end: 1 }
            }
        );

        let mut input = Input::new("-har");
        assert!(input.parse(AlphaNumeric).is_err());
    }

    // `Spanned` discards the token and reports only the consumed span.
    #[test]
    fn spanned() {
        let mut input = Input::new("Test this string");
        assert_eq!(
            input.parse(Spanned(Word)).unwrap(),
            Span { start: 0, end: 4 }
        );
    }

    // Char ranges match a single in-range grapheme.
    #[test]
    fn range() {
        let mut input = Input::new("b");
        assert_eq!(
            input.parse('a'..'z').unwrap(),
            Lex::new("b", Span::new(0, 1))
        )
    }

    // `Not` succeeds only when the inner tokenizer would fail.
    #[test]
    fn not() {
        assert_eq!(
            Input::new("=-").parse(('=', Not('='))).unwrap(),
            (Span::new(0, 1), ())
        );

        assert!(Input::new("==").parse(('=', Not('='))).is_err())
    }

    // `Func` adapts a plain closure into a tokenizer.
    #[test]
    fn func() {
        let mut input = Input::new("Hello");

        let ret = input
            .parse(Func::new(|ctx: &mut Reader| ctx.parse("Hello")))
            .unwrap();

        assert_eq!(ret, Span::new(0, 5));
    }
}