1use nom::{
2 branch::alt,
3 bytes::complete::{is_not, tag, tag_no_case, take_until, take_while1},
4 character::{
5 complete::{char as match_char, hex_digit1, one_of, satisfy},
6 streaming::anychar,
7 },
8 combinator::{map, opt, value, verify},
9 multi::{fold_many0, many0},
10 sequence::{delimited, preceded, tuple},
11 IResult,
12};
13use nom_locate::{position, LocatedSpan};
14use std::{borrow::Cow, sync::Arc};
15use unicode_categories::UnicodeCategories;
16
/// Input type consumed by every parser in this file: a `&str` slice wrapped in
/// a `LocatedSpan` whose extra payload is an `Arc<String>` (`Token::tokenize_file`
/// stores the file name there).
pub type InputSpan<'a> = LocatedSpan<&'a str, Arc<String>>;
18
/// The kind of a single lexical token.
///
/// Variants with a payload carry the token text (or, for strings, its
/// fragments); the remaining variants are bare punctuation markers.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Lexeme<'a> {
    /// Identifier text as assembled by the `identifier` parser.
    Identifier(Cow<'a, str>),
    /// `#t` / `#f`, matched case-insensitively by the `boolean` parser.
    Boolean(bool),
    /// Number text, including any radix prefix, kept as an unparsed string.
    Number(Cow<'a, str>),
    /// Character literal text.
    /// NOTE(review): no lexer rule visible in this file produces this variant.
    Character(Cow<'a, str>),
    /// String literal, split into escaped and unescaped fragments.
    String(Vec<Fragment<'a>>),
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `#(`
    HashParen,
    /// NOTE(review): presumably `#vu8(`; not produced by the visible lexer rules.
    Vu8Paren,
    /// `'`
    Quote,
    /// NOTE(review): presumably the backtick; not produced by the visible lexer rules.
    Tick,
    /// NOTE(review): presumably `,`; not produced by the visible lexer rules.
    Comma,
    /// NOTE(review): presumably `,@`; not produced by the visible lexer rules.
    CommaAt,
    /// `.`
    Period,
    /// NOTE(review): not produced by the visible lexer rules (which map `#'`
    /// to `HashTick`) — confirm the intended spelling.
    HashQuote,
    /// `#'` as emitted by the `lexeme` parser.
    HashTick,
    /// NOTE(review): presumably `#,`; not produced by the visible lexer rules.
    HashComma,
    /// NOTE(review): presumably `#,@`; not produced by the visible lexer rules.
    HashCommaAt,
    /// Documentation comment text.
    /// NOTE(review): not produced by the visible lexer rules.
    DocComment(String),
}
43
44impl Lexeme<'static> {
45 fn identifier_owned(s: String) -> Self {
46 Self::Identifier(Cow::Owned(s))
47 }
48
49 fn number_owned(s: String) -> Self {
50 Self::Number(Cow::Owned(s))
51 }
52
53 fn string_owned(v: Vec<Fragment<'static>>) -> Self {
54 Self::String(v)
55 }
56}
57
58impl<'a> Lexeme<'a> {
59 pub fn to_number(&self) -> &str {
60 let Lexeme::Number(num) = self else {
61 panic!("not a number");
62 };
63 num.as_ref()
64 }
65
66 pub fn to_boolean(&self) -> bool {
67 let Lexeme::Boolean(b) = self else {
68 panic!("not a boolean");
69 };
70 *b
71 }
72
73 pub fn to_ident(&self) -> &str {
74 let Lexeme::Identifier(i) = self else {
75 panic!("not an ident");
76 };
77 i.as_ref()
78 }
79
80 pub fn to_char(&self) -> &str {
81 let Lexeme::Character(c) = self else {
82 panic!("not a character");
83 };
84 c.as_ref()
85 }
86
87 pub fn to_string(&self) -> &[Fragment<'a>] {
88 let Lexeme::String(s) = self else {
89 panic!("not a string");
90 };
91 s.as_slice()
92 }
93
94 pub fn string(v: Vec<Fragment<'a>>) -> Self {
95 Self::String(v)
96 }
97}
98
99fn lexeme(i: InputSpan) -> IResult<InputSpan, Lexeme<'static>> {
100 alt((
101 map(identifier, Lexeme::identifier_owned),
102 map(boolean, Lexeme::Boolean),
103 map(string, Lexeme::string_owned),
104 map(number, Lexeme::number_owned),
105 map(match_char('.'), |_| Lexeme::Period),
109 map(match_char('\''), |_| Lexeme::Quote),
110 map(match_char('('), |_| Lexeme::LParen),
111 map(match_char(')'), |_| Lexeme::RParen),
112 map(match_char('['), |_| Lexeme::LBracket),
113 map(match_char(']'), |_| Lexeme::RBracket),
114 map(tag("#("), |_| Lexeme::HashParen),
115 map(tag("#'"), |_| Lexeme::HashTick),
116 ))(i)
117}
118
119fn comment(i: InputSpan) -> IResult<InputSpan, ()> {
120 map(
121 delimited(tag(";"), take_until("\n"), many0(whitespace)),
122 |_| (),
123 )(i)
124}
125
126fn whitespace(i: InputSpan) -> IResult<InputSpan, ()> {
127 map(
128 alt((satisfy(UnicodeCategories::is_separator), match_char('\n'))),
129 |_| (),
130 )(i)
131}
132
133fn atmosphere(i: InputSpan) -> IResult<InputSpan, ()> {
134 map(tuple((whitespace,)), |_| ())(i)
135}
136
137fn interlexeme_space(i: InputSpan) -> IResult<InputSpan, ()> {
138 fold_many0(alt((atmosphere, comment)), || (), |_, _| ())(i)
139}
140
141fn identifier(i: InputSpan) -> IResult<InputSpan, String> {
142 alt((
143 map(tuple((initial, many0(subsequent))), |(i, s)| {
144 format!("{i}{}", s.join(""))
145 }),
146 peculiar_identifier,
147 ))(i)
148}
149
150fn boolean(i: InputSpan) -> IResult<InputSpan, bool> {
151 alt((
152 map(tag_no_case("#t"), |_| true),
153 map(tag_no_case("#f"), |_| false),
154 ))(i)
155}
156
157fn initial(i: InputSpan) -> IResult<InputSpan, String> {
158 alt((
159 map(satisfy(is_constituent), String::from),
160 map(satisfy(is_special_initial), String::from),
161 inline_hex_escape,
162 ))(i)
163}
164
165fn subsequent(i: InputSpan) -> IResult<InputSpan, String> {
166 alt((
167 initial,
168 map(satisfy(|c| c.is_ascii_digit()), String::from),
169 map(
170 satisfy(|c| {
171 c.is_number_decimal_digit()
172 || c.is_mark_spacing_combining()
173 || c.is_mark_enclosing()
174 }),
175 String::from,
176 ),
177 map(special_subsequent, String::from),
178 ))(i)
179}
180
181fn special_subsequent(i: InputSpan) -> IResult<InputSpan, char> {
182 one_of("+-.@")(i)
183}
184
185fn peculiar_identifier(i: InputSpan) -> IResult<InputSpan, String> {
186 alt((
187 map(match_char('+'), |_| String::from("+")),
188 map(match_char('-'), |_| String::from("-")),
189 map(tag("..."), |_| String::from("...")),
190 map(tuple((tag("->"), many0(subsequent))), |(_, subseq)| {
191 format!("->{}", subseq.join(""))
192 }),
193 ))(i)
194}
195
196fn inline_hex_escape(i: InputSpan) -> IResult<InputSpan, String> {
197 map(
198 tuple((tag("\\x"), hex_scalar_value, match_char(';'))),
199 |(_, value, _)| format!("\\x{value};"),
200 )(i)
201}
202
/// Parses one or more hexadecimal digits and returns the matched span.
/// The digits are not converted to a number here.
fn hex_scalar_value(i: InputSpan) -> IResult<InputSpan, InputSpan> {
    hex_digit1(i)
}
206
207fn is_constituent(c: char) -> bool {
208 c.is_ascii_alphabetic()
209 || (c as u32 > 127
210 && (c.is_letter()
211 || c.is_mark_nonspacing()
212 || c.is_number_letter()
213 || c.is_number_other()
214 || c.is_punctuation_dash()
215 || c.is_punctuation_connector()
216 || c.is_punctuation_other()
217 || c.is_symbol()
218 || c.is_other_private_use()))
219}
220
/// Reports whether `c` is one of the ASCII punctuation characters that may
/// start an identifier: `! $ % & * / : < = > ? ^ _ ~`.
fn is_special_initial(c: char) -> bool {
    const SPECIAL_INITIALS: &str = "!$%&*/:<=>?^_~";
    SPECIAL_INITIALS.contains(c)
}
227
/// One piece of a string literal, as produced by the `string` parser.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Fragment<'a> {
    /// The hex digits of a `\x<digits>;` escape, stored as text (not decoded).
    HexValue(Cow<'a, str>),
    /// The character denoted by a backslash escape (e.g. `\n` becomes a line feed).
    Escaped(char),
    /// A run of literal text containing no `"` and no `\`.
    Unescaped(Cow<'a, str>),
}
234
235fn string(i: InputSpan) -> IResult<InputSpan, Vec<Fragment<'static>>> {
236 delimited(
237 match_char('"'),
238 many0(alt((
239 preceded(
240 match_char('\\'),
241 alt((
242 map(
243 tuple((tag_no_case("x"), hex_scalar_value, match_char(';'))),
244 |(_, hex, _)| Fragment::HexValue(Cow::Owned(hex.to_string())),
245 ),
246 value(Fragment::Escaped('\u{07}'), match_char('a')),
247 value(Fragment::Escaped('\u{08}'), match_char('b')),
248 value(Fragment::Escaped('\t'), match_char('t')),
249 value(Fragment::Escaped('\n'), match_char('n')),
250 value(Fragment::Escaped('\u{0B}'), match_char('v')),
251 value(Fragment::Escaped('\u{0C}'), match_char('f')),
252 value(Fragment::Escaped('\r'), match_char('r')),
253 value(Fragment::Escaped('"'), match_char('"')),
254 value(Fragment::Escaped('\\'), match_char('\\')),
255 map(anychar, Fragment::Escaped),
256 )),
257 ),
258 map(
259 verify(is_not("\"\\"), |s: &InputSpan| !s.fragment().is_empty()),
260 |s: InputSpan| Fragment::Unescaped(Cow::Owned(s.fragment().to_string())),
261 ),
262 ))),
263 match_char('"'),
264 )(i)
265}
266
267fn number(i: InputSpan) -> IResult<InputSpan, String> {
268 map(
269 tuple((
270 opt(alt((
271 tag_no_case("#b"),
272 tag_no_case("#o"),
273 tag_no_case("#d"),
274 tag_no_case("#x"),
275 ))),
276 take_while1(|c: char| c.is_ascii_hexdigit()),
277 )),
278 |(radix, real): (Option<InputSpan>, InputSpan)| {
279 format!("{}{real}", radix.map(|s| *s.fragment()).unwrap_or(""))
280 },
281 )(i)
282}
283
/// A lexeme paired with the location at which it was found.
#[derive(Debug)]
pub struct Token<'a> {
    /// The lexeme itself; always owns its data.
    pub lexeme: Lexeme<'static>,
    /// Span captured with `position` immediately before the lexeme was parsed.
    pub span: InputSpan<'a>,
}
303
/// Error returned by the tokenizer: nom's error wrapper around nom's default
/// error type, carrying the `InputSpan` at which parsing failed.
pub type LexError<'a> = nom::Err<nom::error::Error<InputSpan<'a>>>;
305
306impl<'a> Token<'a> {
307 pub fn tokenize_file(s: &'a str, filename: &str) -> Result<Vec<Self>, LexError<'a>> {
308 let mut span = InputSpan::new_extra(s, Arc::new(filename.to_string()));
309 let mut output = Vec::new();
310 while !span.is_empty() {
311 let (remaining, ()) = interlexeme_space(span)?;
312 if remaining.is_empty() {
313 break;
314 }
315 let (remaining, curr_span) = position(remaining)?;
316 let (remaining, lexeme) = lexeme(remaining)?;
317 output.push(Token {
318 lexeme,
319 span: curr_span,
320 });
321 span = remaining;
322 }
323 Ok(output)
324 }
325
326 pub fn tokenize_str(s: &'a str) -> Result<Vec<Self>, LexError<'a>> {
327 Self::tokenize_file(s, "<stdin>")
328 }
329}