deskc_lexer/
lib.rs

mod error;

use std::ops::Range;

use chumsky::prelude::*;
use error::LexerError;
use tokens::Token;
use uuid::Uuid;

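/// Tokenizes a whole source string into `(Token, span)` pairs, failing unless
/// every character is consumed. For example, `scan("?;?")` yields `Hole`,
/// `Dot`, `Comma`, `Hole` (see the `semicolon_to_comma_dot` test below).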
pub fn scan(input: &str) -> Result<Vec<(Token, Range<usize>)>, LexerError> {
    lexer().then_ignore(end()).parse(input).map_err(LexerError)
}
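/// The token-level parser. Comments nest, identifiers may contain single
/// spaces, and `;` expands into the two tokens `Dot` and `Comma`.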
pub fn lexer() -> impl Parser<char, Vec<(Token, Range<usize>)>, Error = Simple<char>> {
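    // Comments are parenthesized and nest: "(a (b) c)" lexes as a single
    // `Token::Comment` with the parentheses preserved in its text.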
    let comment = recursive(|comment| {
        none_of("()")
            .repeated()
            .at_least(1)
            .collect::<String>()
            .or(comment)
            .repeated()
            .delimited_by(just('('), just(')'))
            .map(|s| format!("({})", s.join("")))
    })
    .map(Token::Comment);
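    // Identifiers are parsed by `ident` below, which permits internal spaces.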
    let identifier = ident().map(Token::Ident);
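    // Integers: an optional leading '-' followed by decimal digits. `int` is
    // tried before `symbol`, so "-1" lexes as `Int(-1)` rather than `Minus`.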
    let int = just('-')
        .or_not()
        .chain::<char, _, _>(text::int(10))
        .collect::<String>()
        .map(|s: String| Token::Int(s.parse().unwrap()));
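    // Escape sequences are decoded while parsing: each `\x` is mapped to its
    // character (e.g. `\n` to a newline), so the collected string below needs
    // no further post-processing.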
    let escape = just('\\').ignore_then(
        just('\\')
            .or(just('"'))
            .or(just('n').to('\n'))
            .or(just('r').to('\r'))
            .or(just('t').to('\t')),
    );
    let string = just('"')
        .ignore_then(filter(|c| *c != '\\' && *c != '"').or(escape).repeated())
        .then_ignore(just('"'))
        .collect::<String>()
        .map(Token::Str);
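    // Single- and two-character symbols. The two-character alternatives
    // ("<!", "->", "=>") are tried before the bare '-', so `Minus` does not
    // shadow `Arrow`.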
    let symbol = just('$')
        .to(Token::Let)
        .or(just('~').to(Token::In))
        .or(just('/').to(Token::Divide))
        .or(just('!').to(Token::Perform))
        .or(just(':').to(Token::TypeAnnotation))
        .or(just('%').to(Token::Trait))
        .or(just('#').to(Token::Attribute))
        .or(just('^').to(Token::FromHere))
        .or(just('+').to(Token::Sum))
        .or(just('*').to(Token::Product))
        .or(just(',').to(Token::Comma))
        .or(just('.').to(Token::Dot))
        .or(just('>').to(Token::Apply))
        .or(just('[').to(Token::ArrayBegin))
        .or(just(']').to(Token::ArrayEnd))
        .or(just('{').to(Token::SetBegin))
        .or(just('}').to(Token::SetEnd))
        .or(just('?').to(Token::Hole))
        .or(just('_').to(Token::Infer))
        .or(just('\\').to(Token::Lambda))
        .or(just('&').to(Token::Reference))
        .or(just('<').chain(just('!')).to(Token::Continue))
        .or(just('-').chain(just('>')).to(Token::Arrow))
        .or(just('=').chain(just('>')).to(Token::EArrow))
        .or(just('-').to(Token::Minus));
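    // Special keywords are written with a leading single quote, e.g. 'number
    // or 'type; any other word after a quote is a lexing error.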
    let special = just('\'')
        .ignore_then(text::ident())
        .try_map(|ident: String, span| match ident.as_str() {
            "module" => Ok(Token::Include),
            "import" => Ok(Token::Import),
            "export" => Ok(Token::Export),
            "number" => Ok(Token::NumberType),
            "string" => Ok(Token::StringType),
            "alias" => Ok(Token::Alias),
            "brand" => Ok(Token::Brands),
            "type" => Ok(Token::Type),
            "this" => Ok(Token::This),
            "handle" => Ok(Token::Handle),
            "card" => Ok(Token::Card),
            "a" => Ok(Token::A),
            _ => Err(Simple::custom(
                span,
                format!(r#"undefined special keyword: {}"#, ident),
            )),
        });
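    // Brands: '@' followed by an identifier, e.g. @incremented.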
    let brand = just('@').ignore_then(ident()).map(Token::Brand);
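    // UUID literals: "'uuid" followed by groups of hex digits. `flatten`
    // drops the hyphens, which is fine because the uuid crate also accepts
    // the 32-digit simple format.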
    let uuid = just("'uuid")
        .then_ignore(text::whitespace())
        .ignore_then(
            one_of("0123456789abcdefABCDEF")
                .repeated()
                .at_least(4)
                .separated_by(just('-'))
                .at_least(1),
        )
        .flatten()
        .collect::<String>()
        .map(|uuid| uuid.parse::<Uuid>())
        .try_map(|uuid, span| match uuid {
            Ok(uuid) => Ok(Token::Uuid(uuid)),
            Err(_) => Err(Simple::custom(span, "invalid uuid")),
        });
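    // Order matters here: `uuid` must come before `special` (both start with
    // a single quote) and `int` before `symbol` (so '-' can begin a number).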
    let token = comment
        .or(uuid)
        .or(int)
        .or(string)
        .or(symbol)
        .or(special)
        .or(brand)
        .or(identifier);
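    // A ';' is sugar for ". ,": it expands to `Dot` and `Comma`, both
    // carrying the semicolon's span.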
    let semicolon = just(';').to(()).map_with_span(|_, span: Range<usize>| {
        vec![(Token::Dot, span.clone()), (Token::Comma, span)]
    });
    token
        .map_with_span(|token, span| (token, span))
        .padded()
        .repeated()
        .at_least(1)
        .or(semicolon)
        .repeated()
        .flatten()
}
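/// Parses an identifier. Whitespace-separated words are joined with a single
/// space, so " the\t\nnumber  of apples " is `Ident("the number of apples")`.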
pub fn ident() -> impl Parser<char, String, Error = Simple<char>> + Clone {
    let assert_not_whitespace = |c: char, span| {
        if c.is_whitespace() {
            Err(Simple::custom(span, "invalid character"))
        } else {
            Ok(c)
        }
    };
    let non_symbol =
        none_of(r#"%@/&$<>!#*^?\[]{}_-+=;:~,.()"'1234567890"#).try_map(assert_not_whitespace);
    // Unlike `non_symbol`, this set also allows '@', '_', '-', '"', '\'',
    // and digits in non-initial positions.
    let non_symbol_2 = none_of(r#"%/&$<>!#*^?\[]{}+=;:~,.()"#).try_map(assert_not_whitespace);

    non_symbol
        .chain::<char, _, _>(non_symbol_2.repeated())
        .collect()
        .separated_by(text::whitespace())
        .at_least(1)
        .map(|ident: Vec<String>| ident.join(" "))
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn all_syntax() {
        let tokens = lexer()
            .parse(
                r#"
            (defines < 'number -> @incremented 'number >)
            $ \x -> ^ \ 'number, 'number -> @added 'number >
                1, x (1 + x): @incremented 'number > ~
            (increments a value which is padded later)
            \ 'number -> @incremented 'number > ?.
            "#,
            )
            .unwrap()
            .into_iter()
            .map(|(token, _)| token)
            .collect::<Vec<_>>();

        use Token::*;
        assert_eq!(
            tokens,
            vec![
                Comment("(defines < 'number -> @incremented 'number >)".into()),
                Let,
                Lambda,
                Ident("x".into()),
                Arrow,
                FromHere,
                Lambda,
                NumberType,
                Comma,
                NumberType,
                Arrow,
                Brand("added".into()),
                NumberType,
                Apply,
                Int(1),
                Comma,
                Ident("x".into()),
                Comment("(1 + x)".into()),
                TypeAnnotation,
                Brand("incremented".into()),
                NumberType,
                Apply,
                In,
                Comment("(increments a value which is padded later)".into()),
                Lambda,
                NumberType,
                Arrow,
                Brand("incremented".into()),
                NumberType,
                Apply,
                Hole,
                Dot,
            ]
        )
    }

    #[test]
    fn ident_with_spaces() {
        assert_eq!(
            lexer().parse(" the\t\nnumber  of apples ").unwrap(),
            vec![(Token::Ident("the number of apples".into()), 1..23)]
        );
    }

    #[test]
    fn ident_utf8() {
        assert_eq!(
            lexer().parse("あ- a0").unwrap(),
            vec![(Token::Ident("あ- a0".into()), 0..5)]
        );
    }

    #[test]
    fn brand_with_spaces() {
        assert_eq!(
            lexer().parse("@あ- a0").unwrap(),
            vec![(Token::Brand("あ- a0".into()), 0..6)]
        );
    }

    #[test]
    fn string_with_escape() {
        assert_eq!(
            lexer()
                .parse(
                    r#"
            "\\\n\""
            "#
                )
                .unwrap(),
            vec![(Token::Str("\\\n\"".into()), 13..21)]
        );
    }

    #[test]
    fn semicolon_to_comma_dot() {
        assert_eq!(
            lexer().then_ignore(end()).parse("?;?").unwrap(),
            vec![
                (Token::Hole, 0..1),
                (Token::Dot, 1..2),
                (Token::Comma, 1..2),
                (Token::Hole, 2..3)
            ]
        );
    }
}