udled_tokenizers/
comments.rs

1use udled::{token::Spanned, Either, Error, Lex, Reader, Span, Tokenizer};
2
3pub const fn cstyle_line_comment() -> RawLineComment<&'static str> {
4    RawLineComment("//")
5}
6
7pub const fn cstyle_multiline_comment(
8    nested: bool,
9) -> Either<RawMultiLine<&'static str, &'static str>, RawMultiLineNested<&'static str, &'static str>>
10{
11    if nested {
12        Either::Right(RawMultiLineNested("/*", "*/"))
13    } else {
14        Either::Left(RawMultiLine("/*", "*/"))
15    }
16}
17
18pub const fn rust_doc_comment() -> RawLineComment<&'static str> {
19    RawLineComment("///")
20}
21
22pub const fn python_line_comment() -> RawLineComment<&'static str> {
23    RawLineComment("#")
24}
25
26pub const fn python_multiline_comment() -> RawMultiLine<&'static str, &'static str> {
27    RawMultiLine("'''", "'''")
28}
29
30pub const fn javascript_doc_comment() -> RawMultiLine<&'static str, &'static str> {
31    RawMultiLine("/**", "*/")
32}
33
34pub const fn html_comment() -> RawMultiLine<&'static str, &'static str> {
35    RawMultiLine("<!--", "-->")
36}
37
38/// Match a c style line comment
39#[derive(Debug, Clone, Copy, Default)]
40pub struct LineComment;
41
42impl Tokenizer for LineComment {
43    type Token<'a> = Lex<'a>;
44    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
45        reader.parse(cstyle_line_comment())
46    }
47
48    fn peek<'a>(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
49        reader.peek("//")
50    }
51}
52
53/// Match a c style multiline comments with support for nested comments
54#[derive(Debug, Clone, Copy, Default)]
55pub struct MultiLineComment;
56
57impl Tokenizer for MultiLineComment {
58    type Token<'a> = Lex<'a>;
59    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
60        reader
61            .parse(cstyle_multiline_comment(true))
62            .map(|m| m.unify())
63    }
64
65    fn peek<'a>(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
66        reader.peek("/*")
67    }
68}
69
70#[derive(Debug, Clone, Copy)]
71pub struct RawLineComment<T>(T);
72
73impl<T> Tokenizer for RawLineComment<T>
74where
75    T: Tokenizer,
76{
77    type Token<'a> = Lex<'a>;
78    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
79        let start = reader.position();
80
81        let _ = reader.parse(&self.0)?;
82
83        loop {
84            let Some(ch) = reader.peek_ch() else {
85                break;
86            };
87
88            if ch == "\n" {
89                break;
90            }
91
92            let _ = reader.eat_ch()?;
93        }
94
95        let span = Span::new(start, reader.position());
96
97        Ok(Lex {
98            value: span.slice(reader.source()).expect("slice"),
99            span,
100        })
101    }
102
103    fn peek<'a>(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
104        reader.peek(&self.0)
105    }
106}
107
108#[derive(Debug, Clone, Copy)]
109pub struct RawMultiLine<O, C>(O, C);
110
111impl<O, C> Tokenizer for RawMultiLine<O, C>
112where
113    O: Tokenizer,
114    C: Tokenizer,
115{
116    type Token<'a> = Lex<'a>;
117    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
118        let start = reader.parse(Spanned(&self.0))?;
119
120        let end = loop {
121            if reader.eof() {
122                return Err(reader.error("unexpected end of input inside multi-line comment"));
123            } else if let Ok(end) = reader.parse(Spanned(&self.1)) {
124                break end;
125            } else {
126                reader.eat_ch()?;
127            }
128        };
129
130        let span = start + end;
131
132        Ok(Lex {
133            value: span.slice(reader.source()).expect("slice"),
134            span,
135        })
136    }
137
138    fn peek<'a>(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
139        reader.peek(&self.0)
140    }
141}
142
143#[derive(Debug, Clone, Copy)]
144pub struct RawMultiLineNested<O, C>(O, C);
145
146impl<O, C> Tokenizer for RawMultiLineNested<O, C>
147where
148    O: Tokenizer,
149    C: Tokenizer,
150{
151    type Token<'a> = Lex<'a>;
152    fn to_token<'a>(&self, reader: &mut Reader<'_, 'a>) -> Result<Self::Token<'a>, Error> {
153        let start = reader.position();
154
155        let _ = reader.parse(&self.0)?;
156
157        let mut depth = 1;
158
159        loop {
160            if reader.eof() {
161                return Err(reader.error("unexpected end of input inside multi-line comment"));
162            } else if reader.parse(&self.0).is_ok() {
163                depth += 1;
164            } else if reader.parse(&self.1).is_ok() {
165                depth -= 1;
166
167                if depth == 0 {
168                    break;
169                }
170            } else {
171                reader.eat_ch()?;
172            }
173        }
174
175        let span = Span::new(start, reader.position());
176
177        Ok(Lex {
178            value: span.slice(reader.source()).expect("slice"),
179            span,
180        })
181    }
182
183    fn peek<'a>(&self, reader: &mut Reader<'_, '_>) -> Result<bool, Error> {
184        reader.peek(&self.0)
185    }
186}
187
#[cfg(test)]
mod test {
    use udled::Input;

    use super::*;

    /// `LineComment` matches a bare `//`, a comment running to the end of
    /// input, and a comment that stops before a following newline.
    #[test]
    fn line_comment() {
        // (source, expected comment text, expected end of the span)
        let cases = [
            ("//", "//", 2),
            ("// Some tekst", "// Some tekst", 13),
            ("// Some tekst\n test", "// Some tekst", 13),
        ];

        for (source, expected, end) in cases {
            let mut input = Input::new(source);
            let token = input.parse(LineComment).unwrap();
            assert_eq!(token, Lex::new(expected, Span::new(0, end)));
        }
    }
}