scheme_rs/
lex.rs

1use nom::{
2    branch::alt,
3    bytes::complete::{is_not, tag, tag_no_case, take_until, take_while1},
4    character::{
5        complete::{char as match_char, hex_digit1, one_of, satisfy},
6        streaming::anychar,
7    },
8    combinator::{map, opt, value, verify},
9    multi::{fold_many0, many0},
10    sequence::{delimited, preceded, tuple},
11    IResult,
12};
13use nom_locate::{position, LocatedSpan};
14use std::{borrow::Cow, sync::Arc};
15use unicode_categories::UnicodeCategories;
16
17pub type InputSpan<'a> = LocatedSpan<&'a str, Arc<String>>;
18
19#[derive(Clone, Debug, PartialEq, Eq)]
20pub enum Lexeme<'a> {
21    Identifier(Cow<'a, str>),
22    Boolean(bool),
23    Number(Cow<'a, str>),
24    Character(Cow<'a, str>),
25    String(Vec<Fragment<'a>>),
26    LParen,
27    RParen,
28    LBracket,
29    RBracket,
30    HashParen,
31    Vu8Paren,
32    Quote,
33    Tick,
34    Comma,
35    CommaAt,
36    Period,
37    HashQuote,
38    HashTick,
39    HashComma,
40    HashCommaAt,
41    DocComment(String),
42}
43
44impl Lexeme<'static> {
45    fn identifier_owned(s: String) -> Self {
46        Self::Identifier(Cow::Owned(s))
47    }
48
49    fn number_owned(s: String) -> Self {
50        Self::Number(Cow::Owned(s))
51    }
52
53    fn string_owned(v: Vec<Fragment<'static>>) -> Self {
54        Self::String(v)
55    }
56}
57
58impl<'a> Lexeme<'a> {
59    pub fn to_number(&self) -> &str {
60        let Lexeme::Number(num) = self else {
61            panic!("not a number");
62        };
63        num.as_ref()
64    }
65
66    pub fn to_boolean(&self) -> bool {
67        let Lexeme::Boolean(b) = self else {
68            panic!("not a boolean");
69        };
70        *b
71    }
72
73    pub fn to_ident(&self) -> &str {
74        let Lexeme::Identifier(i) = self else {
75            panic!("not an ident");
76        };
77        i.as_ref()
78    }
79
80    pub fn to_char(&self) -> &str {
81        let Lexeme::Character(c) = self else {
82            panic!("not a character");
83        };
84        c.as_ref()
85    }
86
87    pub fn to_string(&self) -> &[Fragment<'a>] {
88        let Lexeme::String(s) = self else {
89            panic!("not a string");
90        };
91        s.as_slice()
92    }
93
94    pub fn string(v: Vec<Fragment<'a>>) -> Self {
95        Self::String(v)
96    }
97}
98
99fn lexeme(i: InputSpan) -> IResult<InputSpan, Lexeme<'static>> {
100    alt((
101        map(identifier, Lexeme::identifier_owned),
102        map(boolean, Lexeme::Boolean),
103        map(string, Lexeme::string_owned),
104        map(number, Lexeme::number_owned),
105        // I _want_ to do something with doc comments, but probably best not
106        // to include them for now.
107        // map(doc_comment, Lexeme::DocComment),
108        map(match_char('.'), |_| Lexeme::Period),
109        map(match_char('\''), |_| Lexeme::Quote),
110        map(match_char('('), |_| Lexeme::LParen),
111        map(match_char(')'), |_| Lexeme::RParen),
112        map(match_char('['), |_| Lexeme::LBracket),
113        map(match_char(']'), |_| Lexeme::RBracket),
114        map(tag("#("), |_| Lexeme::HashParen),
115        map(tag("#'"), |_| Lexeme::HashTick),
116    ))(i)
117}
118
119fn comment(i: InputSpan) -> IResult<InputSpan, ()> {
120    map(
121        delimited(tag(";"), take_until("\n"), many0(whitespace)),
122        |_| (),
123    )(i)
124}
125
126fn whitespace(i: InputSpan) -> IResult<InputSpan, ()> {
127    map(
128        alt((satisfy(UnicodeCategories::is_separator), match_char('\n'))),
129        |_| (),
130    )(i)
131}
132
133fn atmosphere(i: InputSpan) -> IResult<InputSpan, ()> {
134    map(tuple((whitespace,)), |_| ())(i)
135}
136
137fn interlexeme_space(i: InputSpan) -> IResult<InputSpan, ()> {
138    fold_many0(alt((atmosphere, comment)), || (), |_, _| ())(i)
139}
140
141fn identifier(i: InputSpan) -> IResult<InputSpan, String> {
142    alt((
143        map(tuple((initial, many0(subsequent))), |(i, s)| {
144            format!("{i}{}", s.join(""))
145        }),
146        peculiar_identifier,
147    ))(i)
148}
149
150fn boolean(i: InputSpan) -> IResult<InputSpan, bool> {
151    alt((
152        map(tag_no_case("#t"), |_| true),
153        map(tag_no_case("#f"), |_| false),
154    ))(i)
155}
156
157fn initial(i: InputSpan) -> IResult<InputSpan, String> {
158    alt((
159        map(satisfy(is_constituent), String::from),
160        map(satisfy(is_special_initial), String::from),
161        inline_hex_escape,
162    ))(i)
163}
164
165fn subsequent(i: InputSpan) -> IResult<InputSpan, String> {
166    alt((
167        initial,
168        map(satisfy(|c| c.is_ascii_digit()), String::from),
169        map(
170            satisfy(|c| {
171                c.is_number_decimal_digit()
172                    || c.is_mark_spacing_combining()
173                    || c.is_mark_enclosing()
174            }),
175            String::from,
176        ),
177        map(special_subsequent, String::from),
178    ))(i)
179}
180
181fn special_subsequent(i: InputSpan) -> IResult<InputSpan, char> {
182    one_of("+-.@")(i)
183}
184
185fn peculiar_identifier(i: InputSpan) -> IResult<InputSpan, String> {
186    alt((
187        map(match_char('+'), |_| String::from("+")),
188        map(match_char('-'), |_| String::from("-")),
189        map(tag("..."), |_| String::from("...")),
190        map(tuple((tag("->"), many0(subsequent))), |(_, subseq)| {
191            format!("->{}", subseq.join(""))
192        }),
193    ))(i)
194}
195
196fn inline_hex_escape(i: InputSpan) -> IResult<InputSpan, String> {
197    map(
198        tuple((tag("\\x"), hex_scalar_value, match_char(';'))),
199        |(_, value, _)| format!("\\x{value};"),
200    )(i)
201}
202
203fn hex_scalar_value(i: InputSpan) -> IResult<InputSpan, InputSpan> {
204    hex_digit1(i)
205}
206
207fn is_constituent(c: char) -> bool {
208    c.is_ascii_alphabetic()
209        || (c as u32 > 127
210            && (c.is_letter()
211                || c.is_mark_nonspacing()
212                || c.is_number_letter()
213                || c.is_number_other()
214                || c.is_punctuation_dash()
215                || c.is_punctuation_connector()
216                || c.is_punctuation_other()
217                || c.is_symbol()
218                || c.is_other_private_use()))
219}
220
/// Reports whether `c` is one of the punctuation characters that may start an
/// identifier: `! $ % & * / : < = > ? ^ _ ~`.
fn is_special_initial(c: char) -> bool {
    const SPECIAL_INITIALS: &str = "!$%&*/:<=>?^_~";
    SPECIAL_INITIALS.contains(c)
}
227
/// One piece of a string literal, as split up by the `string` parser.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Fragment<'a> {
    /// The hex digits of a `\x<digits>;` escape, kept as text (not decoded).
    HexValue(Cow<'a, str>),
    /// The character denoted by a backslash escape.
    Escaped(char),
    /// A run of characters that contained no backslash and no double quote.
    Unescaped(Cow<'a, str>),
}
234
235fn string(i: InputSpan) -> IResult<InputSpan, Vec<Fragment<'static>>> {
236    delimited(
237        match_char('"'),
238        many0(alt((
239            preceded(
240                match_char('\\'),
241                alt((
242                    map(
243                        tuple((tag_no_case("x"), hex_scalar_value, match_char(';'))),
244                        |(_, hex, _)| Fragment::HexValue(Cow::Owned(hex.to_string())),
245                    ),
246                    value(Fragment::Escaped('\u{07}'), match_char('a')),
247                    value(Fragment::Escaped('\u{08}'), match_char('b')),
248                    value(Fragment::Escaped('\t'), match_char('t')),
249                    value(Fragment::Escaped('\n'), match_char('n')),
250                    value(Fragment::Escaped('\u{0B}'), match_char('v')),
251                    value(Fragment::Escaped('\u{0C}'), match_char('f')),
252                    value(Fragment::Escaped('\r'), match_char('r')),
253                    value(Fragment::Escaped('"'), match_char('"')),
254                    value(Fragment::Escaped('\\'), match_char('\\')),
255                    map(anychar, Fragment::Escaped),
256                )),
257            ),
258            map(
259                verify(is_not("\"\\"), |s: &InputSpan| !s.fragment().is_empty()),
260                |s: InputSpan| Fragment::Unescaped(Cow::Owned(s.fragment().to_string())),
261            ),
262        ))),
263        match_char('"'),
264    )(i)
265}
266
267fn number(i: InputSpan) -> IResult<InputSpan, String> {
268    map(
269        tuple((
270            opt(alt((
271                tag_no_case("#b"),
272                tag_no_case("#o"),
273                tag_no_case("#d"),
274                tag_no_case("#x"),
275            ))),
276            take_while1(|c: char| c.is_ascii_hexdigit()),
277        )),
278        |(radix, real): (Option<InputSpan>, InputSpan)| {
279            format!("{}{real}", radix.map(|s| *s.fragment()).unwrap_or(""))
280        },
281    )(i)
282}
283
284/*
285fn doc_comment(i: InputSpan) -> IResult<InputSpan, String> {
286    fold_many1(
287        delimited(tag(";;"), take_until("\n"), many0(whitespace)),
288        String::new,
289        |mut comment, line| {
290            comment.push_str(&line);
291            comment.push('\n');
292            comment
293        },
294    )(i)
295}
296*/
297
298#[derive(Debug)]
299pub struct Token<'a> {
300    pub lexeme: Lexeme<'static>,
301    pub span: InputSpan<'a>,
302}
303
304pub type LexError<'a> = nom::Err<nom::error::Error<InputSpan<'a>>>;
305
306impl<'a> Token<'a> {
307    pub fn tokenize_file(s: &'a str, filename: &str) -> Result<Vec<Self>, LexError<'a>> {
308        let mut span = InputSpan::new_extra(s, Arc::new(filename.to_string()));
309        let mut output = Vec::new();
310        while !span.is_empty() {
311            let (remaining, ()) = interlexeme_space(span)?;
312            if remaining.is_empty() {
313                break;
314            }
315            let (remaining, curr_span) = position(remaining)?;
316            let (remaining, lexeme) = lexeme(remaining)?;
317            output.push(Token {
318                lexeme,
319                span: curr_span,
320            });
321            span = remaining;
322        }
323        Ok(output)
324    }
325
326    pub fn tokenize_str(s: &'a str) -> Result<Vec<Self>, LexError<'a>> {
327        Self::tokenize_file(s, "<stdin>")
328    }
329}