kanata_parser/cfg/
sexpr.rs

1use std::ops::Index;
2use std::rc::Rc;
3use std::str::Bytes;
4use std::{fmt::Debug, iter};
5
6type HashMap<K, V> = rustc_hash::FxHashMap<K, V>;
7
8use super::{ParseError, Result};
9
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
/// A location within a source file, tracked both as a byte offset and as a
/// line number + line start so errors can be reported as line/column.
pub struct Position {
    /// The position (since the beginning of the file), in bytes.
    pub absolute: usize,
    /// The number of newline characters since the beginning of the file.
    pub line: usize,
    /// The position of beginning of line, in bytes.
    pub line_beginning: usize,
}
19
20impl Position {
21    pub fn new(absolute: usize, line: usize, line_beginning: usize) -> Self {
22        assert!(line <= absolute);
23        assert!(line_beginning <= absolute);
24        Self {
25            absolute,
26            line,
27            line_beginning,
28        }
29    }
30}
31
#[derive(Clone, PartialEq, Eq, Hash)]
/// A contiguous region of a source file, carrying the file's name and full
/// content so diagnostics can be rendered without re-reading the file.
pub struct Span {
    /// Start of the region (inclusive).
    pub start: Position,
    /// End of the region (exclusive; used as the upper bound when slicing).
    pub end: Position,
    /// Name of the file the span points into.
    pub file_name: Rc<str>,
    /// Entire content of the file the span points into.
    pub file_content: Rc<str>,
}
39
impl Debug for Span {
    // Manual impl: prints only the length of `file_content` instead of the
    // whole file text, which would flood debug output.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("Span")
            .field("start", &self.start)
            .field("end", &self.end)
            .field("file_name", &self.file_name)
            .field("file_content [len]", &self.file_content.len())
            .finish()
    }
}
50
51impl Default for Span {
52    fn default() -> Self {
53        Self {
54            start: Position::default(),
55            end: Position::default(),
56            file_name: Rc::from(""),
57            file_content: Rc::from(""),
58        }
59    }
60}
61
62impl Span {
63    pub fn new(start: Position, end: Position, file_name: Rc<str>, file_content: Rc<str>) -> Span {
64        assert!(start.absolute <= end.absolute);
65        assert!(start.line <= end.line);
66        Span {
67            start,
68            end,
69            file_name,
70            file_content,
71        }
72    }
73
74    pub fn cover(&self, other: &Span) -> Span {
75        assert!(self.file_name == other.file_name);
76
77        let start: Position = if self.start() <= other.start() {
78            self.start
79        } else {
80            other.start
81        };
82
83        let end: Position = if self.end() >= other.end() {
84            self.end
85        } else {
86            other.end
87        };
88
89        Span::new(
90            start,
91            end,
92            self.file_name.clone(),
93            self.file_content.clone(),
94        )
95    }
96
97    pub fn start(&self) -> usize {
98        self.start.absolute
99    }
100
101    pub fn end(&self) -> usize {
102        self.end.absolute
103    }
104
105    pub fn file_name(&self) -> String {
106        self.file_name.clone().to_string()
107    }
108
109    pub fn file_content(&self) -> String {
110        self.file_content.clone().to_string()
111    }
112}
113
114impl Index<Span> for str {
115    type Output = str;
116    fn index(&self, span: Span) -> &Self::Output {
117        &self[span.start()..span.end()]
118    }
119}
120
121impl Index<Span> for String {
122    type Output = str;
123    fn index(&self, span: Span) -> &Self::Output {
124        &self[span.start()..span.end()]
125    }
126}
127
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
/// A value of type `T` paired with the source span it was parsed from.
pub struct Spanned<T> {
    // The wrapped value.
    pub t: T,
    // Where in the source `t` came from.
    pub span: Span,
}

impl<T> Spanned<T> {
    /// Pairs `t` with the span it was parsed from.
    pub fn new(t: T, span: Span) -> Spanned<T> {
        Spanned { t, span }
    }
}
139
#[derive(Clone, PartialEq, Eq, Hash)]
/// I know this isn't the classic definition of an S-Expression which uses cons cell and atom, but
/// this is more convenient to work with (I find).
pub enum SExpr {
    /// A single token (bare word or quoted string) with its span.
    Atom(Spanned<String>),
    /// A parenthesized sequence of expressions, spanning from `(` to `)`.
    List(Spanned<Vec<SExpr>>),
}
147
impl SExpr {
    /// Resolves this expression to an atom string, if possible.
    ///
    /// If the atom starts with `$` and `vars` is provided, the name after `$`
    /// is looked up in `vars` and resolution recurses into the variable's
    /// value. An undefined variable resolves to its literal `$name` text.
    /// Returns `None` for lists (and for variables whose value is a list).
    pub fn atom<'a>(&'a self, vars: Option<&'a HashMap<String, SExpr>>) -> Option<&'a str> {
        match self {
            SExpr::Atom(a) => {
                let s = a.t.as_str();
                match (s.strip_prefix('$'), vars) {
                    (Some(varname), Some(vars)) => match vars.get(varname) {
                        Some(var) => {
                            // Record the variable use for language-server
                            // reference lookups.
                            #[cfg(feature = "lsp")]
                            super::LSP_VARIABLE_REFERENCES.with_borrow_mut(|refs| {
                                refs.push(varname, a.span.clone());
                            });
                            var.atom(Some(vars))
                        }
                        // Unknown variable: keep the `$...` text as a plain atom.
                        None => Some(s),
                    },
                    _ => Some(s),
                }
            }
            _ => None,
        }
    }

    /// Resolves this expression to a list slice, if possible.
    ///
    /// `$`-prefixed atoms are resolved through `vars` the same way as in
    /// [`SExpr::atom`], but an undefined variable yields `None` here rather
    /// than falling back to the literal text.
    pub fn list<'a>(&'a self, vars: Option<&'a HashMap<String, SExpr>>) -> Option<&'a [SExpr]> {
        match self {
            SExpr::List(l) => Some(&l.t),
            SExpr::Atom(a) => match (a.t.strip_prefix('$'), vars) {
                (Some(varname), Some(vars)) => match vars.get(varname) {
                    Some(var) => {
                        // Record the variable use for language-server
                        // reference lookups.
                        #[cfg(feature = "lsp")]
                        super::LSP_VARIABLE_REFERENCES.with_borrow_mut(|refs| {
                            refs.push(varname, a.span.clone());
                        });
                        var.list(Some(vars))
                    }
                    None => None,
                },
                _ => None,
            },
        }
    }

    /// Like [`SExpr::list`], but returns the spanned list so the caller also
    /// gets the source location of the list.
    pub fn span_list<'a>(
        &'a self,
        vars: Option<&'a HashMap<String, SExpr>>,
    ) -> Option<&'a Spanned<Vec<SExpr>>> {
        match self {
            SExpr::List(l) => Some(l),
            SExpr::Atom(a) => match (a.t.strip_prefix('$'), vars) {
                (Some(varname), Some(vars)) => match vars.get(varname) {
                    Some(var) => {
                        // Record the variable use for language-server
                        // reference lookups.
                        #[cfg(feature = "lsp")]
                        super::LSP_VARIABLE_REFERENCES.with_borrow_mut(|refs| {
                            refs.push(varname, a.span.clone());
                        });
                        var.span_list(Some(vars))
                    }
                    None => None,
                },
                _ => None,
            },
        }
    }

    /// Returns the source span of this expression, whichever variant it is.
    pub fn span(&self) -> Span {
        match self {
            SExpr::Atom(a) => a.span.clone(),
            SExpr::List(l) => l.span.clone(),
        }
    }
}
219
220impl std::fmt::Debug for SExpr {
221    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
222        match self {
223            SExpr::Atom(a) => write!(f, "{}", &a.t),
224            SExpr::List(l) => {
225                write!(f, "(")?;
226                for i in 0..l.t.len() - 1 {
227                    write!(f, "{:?} ", &l.t[i])?;
228                }
229                if let Some(last) = &l.t.last() {
230                    write!(f, "{last:?}")?;
231                }
232                write!(f, ")")?;
233                Ok(())
234            }
235        }
236    }
237}
238
#[derive(Clone, PartialEq, Eq, Debug)]
/// Complementary to SExpr metadata items.
pub enum SExprMetaData {
    /// A `;;` comment running to the end of its line.
    LineComment(Spanned<String>),
    /// A `#| ... |#` block comment.
    BlockComment(Spanned<String>),
    /// A run of whitespace.
    Whitespace(Spanned<String>),
}
246
247impl SExprMetaData {
248    pub fn span(&self) -> Span {
249        match self {
250            Self::LineComment(x) => x.span.clone(),
251            Self::BlockComment(x) => x.span.clone(),
252            Self::Whitespace(x) => x.span.clone(),
253        }
254    }
255}
256
#[derive(Debug)]
/// The kinds of lexemes produced by the lexer.
enum Token {
    /// `(`
    Open,
    /// `)`
    Close,
    /// A bare word, a `"..."` string, or an `r#"..."#` multiline string.
    StringTok,
    /// A `#| ... |#` block comment.
    BlockComment,
    /// A `;; ...` comment running to end of line.
    LineComment,
    /// A run of ASCII whitespace.
    Whitespace,
}
266
#[derive(Clone)]
/// A wrapper around [`Bytes`] that keeps track of current [`Position`].
struct PositionCountingBytesIterator<'a> {
    // Remaining bytes of the source.
    bytes: Bytes<'a>,
    // Total source length; current offset = source_length - bytes.len().
    source_length: usize,
    // Number of newline bytes consumed so far.
    line: usize,
    // Byte offset where the current line begins.
    line_beginning: usize,
}
275
276impl<'a> PositionCountingBytesIterator<'a> {
277    fn new(s: &'a str) -> Self {
278        Self {
279            bytes: s.bytes(),
280            source_length: s.len(),
281            line: 0,
282            line_beginning: 0,
283        }
284    }
285
286    fn pos(&self) -> Position {
287        let absolute = self.source_length - self.bytes.len();
288        Position::new(absolute, self.line, self.line_beginning)
289    }
290}
291
292impl Iterator for PositionCountingBytesIterator<'_> {
293    type Item = u8;
294
295    fn next(&mut self) -> Option<Self::Item> {
296        self.bytes.next().inspect(|&b| {
297            if b == b'\n' {
298                self.line += 1;
299                self.line_beginning = self.source_length - self.bytes.len()
300            }
301        })
302    }
303}
304
/// Tokenizer over a configuration source string.
pub struct Lexer<'a> {
    // Byte stream over the source, tracking positions for spans.
    bytes: PositionCountingBytesIterator<'a>,
    // When true, whitespace and comment tokens are skipped instead of emitted.
    ignore_whitespace_and_comments: bool,
}
309
/// True when `b` terminates a bare token: parentheses, a double quote, or
/// any ASCII whitespace byte.
fn is_start(b: u8) -> bool {
    b == b'(' || b == b')' || b == b'"' || b.is_ascii_whitespace()
}
313
/// Result of lexing one token: the token kind, or a human-readable message.
type TokenRes = std::result::Result<Token, String>;
315
316impl<'a> Lexer<'a> {
317    #[allow(clippy::new_ret_no_self)]
318    /// `file_name` is used only for indicating a file, where
319    /// a fragment of `source` that caused parsing error came from.
320    fn new(
321        source: &'a str,
322        file_name: &'a str,
323        ignore_whitespace_and_comments: bool,
324    ) -> impl Iterator<Item = Spanned<TokenRes>> + 'a {
325        let _bytes = source.bytes().next();
326
327        let mut lexer = Lexer {
328            bytes: PositionCountingBytesIterator::new(source),
329            ignore_whitespace_and_comments,
330        };
331        let file_name: Rc<str> = Rc::from(file_name);
332        let file_content: Rc<str> = Rc::from(source);
333        iter::from_fn(move || {
334            lexer.next_token().map(|(start, t)| {
335                let end = lexer.bytes.pos();
336                Spanned::new(
337                    t,
338                    Span::new(start, end, file_name.clone(), file_content.clone()),
339                )
340            })
341        })
342    }
343
344    fn next_while(&mut self, f: impl Fn(u8) -> bool) {
345        for b in self.bytes.clone() {
346            if f(b) {
347                // Iterating over a clone of this iterator - this is guaranteed to be Some
348                self.bytes.next().expect("iter lag");
349            } else {
350                break;
351            }
352        }
353    }
354
355    /// Looks for "#, consuming bytes until found. If not found, returns Err(...);
356    fn read_until_multiline_string_end(&mut self) -> TokenRes {
357        for b2 in self.bytes.clone().skip(1) {
358            // Iterating over a clone of this iterator that's 1 item ahead - this is guaranteed to
359            // be Some.
360            let b1 = self.bytes.next().expect("iter lag");
361            if b1 == b'"' && b2 == b'#' {
362                self.bytes.next();
363                return Ok(Token::StringTok);
364            }
365        }
366        Err("Unterminated multiline string. Add \"# after the end of your string.".to_string())
367    }
368
369    /// Looks for "|#", consuming bytes until found. If not found, returns Err(...);
370    fn read_until_multiline_comment_end(&mut self) -> TokenRes {
371        for b2 in self.bytes.clone().skip(1) {
372            // Iterating over a clone of this iterator that's 1 item ahead - this is guaranteed to
373            // be Some.
374            let b1 = self.bytes.next().expect("iter lag");
375            if b1 == b'|' && b2 == b'#' {
376                self.bytes.next();
377                return Ok(Token::BlockComment);
378            }
379        }
380        Err("Unterminated multiline comment. Add |# after the end of your comment.".to_string())
381    }
382
383    fn next_token(&mut self) -> Option<(Position, TokenRes)> {
384        use Token::*;
385        loop {
386            let start = self.bytes.pos();
387            break match self.bytes.next() {
388                Some(b) => Some((
389                    start,
390                    Ok(match b {
391                        b'(' => Open,
392                        b')' => Close,
393                        b'"' => {
394                            self.next_while(|b| b != b'"' && b != b'\n');
395                            match self.bytes.next() {
396                                Some(b'"') => StringTok,
397                                _ => return Some((start, Err("Unterminated string".to_string()))),
398                            }
399                        }
400                        b';' => match self.bytes.clone().next() {
401                            Some(b';') => {
402                                self.next_while(|b| b != b'\n');
403                                // possibly consume the newline (or EOF handled in next iteration)
404                                self.bytes.next();
405                                if self.ignore_whitespace_and_comments {
406                                    continue;
407                                }
408                                Token::LineComment
409                            }
410                            _ => self.next_string(),
411                        },
412                        b'r' => {
413                            match (self.bytes.clone().next(), self.bytes.clone().nth(1)) {
414                                (Some(b'#'), Some(b'"')) => {
415                                    // consume the # and "
416                                    self.bytes.next();
417                                    self.bytes.next();
418                                    let tok: Token = match self.read_until_multiline_string_end() {
419                                        Ok(t) => t,
420                                        e @ Err(_) => return Some((start, e)),
421                                    };
422                                    tok
423                                }
424                                _ => self.next_string(),
425                            }
426                        }
427                        b'#' => match self.bytes.clone().next() {
428                            Some(b'|') => {
429                                // consume the '|'
430                                self.bytes.next();
431                                let tok: Token = match self.read_until_multiline_comment_end() {
432                                    Ok(t) => t,
433                                    e @ Err(_) => return Some((start, e)),
434                                };
435                                if self.ignore_whitespace_and_comments {
436                                    continue;
437                                }
438                                tok
439                            }
440                            _ => self.next_string(),
441                        },
442                        b if b.is_ascii_whitespace() => {
443                            let tok = self.next_whitespace();
444                            if self.ignore_whitespace_and_comments {
445                                continue;
446                            }
447                            tok
448                        }
449                        _ => self.next_string(),
450                    }),
451                )),
452                None => None,
453            };
454        }
455    }
456
457    fn next_string(&mut self) -> Token {
458        // might want to limit this to ascii or XID_START/XID_CONTINUE
459        self.next_while(|b| !is_start(b));
460        Token::StringTok
461    }
462
463    fn next_whitespace(&mut self) -> Token {
464        self.next_while(|b| b.is_ascii_whitespace());
465        Token::Whitespace
466    }
467}
468
469pub type TopLevel = Spanned<Vec<SExpr>>;
470
471pub fn parse(cfg: &str, file_name: &str) -> std::result::Result<Vec<TopLevel>, ParseError> {
472    let ignore_whitespace_and_comments = true;
473    parse_(cfg, file_name, ignore_whitespace_and_comments).map(|(x, _)| x)
474}
475
476pub fn parse_(
477    cfg: &str,
478    file_name: &str,
479    ignore_whitespace_and_comments: bool,
480) -> Result<(Vec<TopLevel>, Vec<SExprMetaData>)> {
481    let cfg = strip_utf8_bom(cfg);
482    parse_with(
483        cfg,
484        Lexer::new(cfg, file_name, ignore_whitespace_and_comments),
485    )
486    .map_err(|e| {
487        if e.msg.contains("Unterminated multiline comment") {
488            if let Some(mut span) = e.span {
489                span.end = span.start;
490                span.end.absolute += 2;
491                ParseError::new(span, e.msg)
492            } else {
493                e
494            }
495        } else {
496            e
497        }
498    })
499}
500
/// Strips a leading UTF-8 byte-order mark (U+FEFF, encoded as EF BB BF)
/// from `s`, if one is present.
fn strip_utf8_bom(s: &str) -> &str {
    // '\u{feff}' encodes to exactly the bytes EF BB BF in UTF-8, so this is
    // equivalent to stripping the byte prefix and re-validating.
    s.strip_prefix('\u{feff}').unwrap_or(s)
}
507
/// Folds a token stream into a forest of top-level lists.
///
/// `s` must be the same source the tokens were lexed from: token spans are
/// used to slice atom and comment text out of it. Returns the top-level
/// expressions plus any whitespace/comment metadata tokens encountered.
/// Errors on unbalanced parentheses and on atoms outside any list.
fn parse_with(
    s: &str,
    mut tokens: impl Iterator<Item = Spanned<TokenRes>>,
) -> Result<(Vec<TopLevel>, Vec<SExprMetaData>)> {
    use Token::*;
    // Bottom element is a placeholder frame that collects top-level exprs.
    let mut stack = vec![Spanned::new(vec![], Span::default())];
    let mut metadata: Vec<SExprMetaData> = vec![];
    loop {
        match tokens.next() {
            None => break,
            // `?` propagates lexing errors, wrapped with their span.
            Some(Spanned { t, span }) => match t.map_err(|s| ParseError::new(span.clone(), s))? {
                // '(' opens a fresh frame; its span starts at the paren.
                Open => stack.push(Spanned::new(vec![], span)),
                Close => {
                    let Spanned {
                        t: exprs,
                        span: stack_span,
                        // There is a placeholder at the bottom of the stack to allow this unwrap;
                        // if the stack is ever empty, return an error.
                    } = stack.pop().expect("placeholder unpopped");
                    if stack.is_empty() {
                        // We just popped the placeholder: this ')' had no '('.
                        return Err(ParseError::new(span, "Unexpected closing parenthesis"));
                    }
                    // The finished list spans from its '(' through this ')'.
                    let expr = SExpr::List(Spanned::new(exprs, stack_span.cover(&span)));
                    stack.last_mut().expect("not empty").t.push(expr);
                }
                // Atom text is sliced out of the source by span.
                StringTok => stack
                    .last_mut()
                    .expect("not empty")
                    .t
                    .push(SExpr::Atom(Spanned::new(s[span.clone()].to_string(), span))),
                BlockComment => metadata.push(SExprMetaData::BlockComment(Spanned::new(
                    s[span.clone()].to_string(),
                    span,
                ))),
                LineComment => metadata.push(SExprMetaData::LineComment(Spanned::new(
                    s[span.clone()].to_string(),
                    span,
                ))),
                Whitespace => metadata.push(SExprMetaData::Whitespace(Spanned::new(
                    s[span.clone()].to_string(),
                    span,
                ))),
            },
        }
    }
    // There is a placeholder at the bottom of the stack to allow this unwrap; if the stack is ever
    // empty, return an error.
    let Spanned { t: exprs, span: sp } = stack.pop().expect("placeholder unpopped");
    if !stack.is_empty() {
        // Frames left below the popped one mean some '(' was never closed.
        return Err(ParseError::new(sp, "Unclosed opening parenthesis"));
    }
    let exprs = exprs
        .into_iter()
        .map(|expr| match expr {
            SExpr::List(es) => Ok(es),
            SExpr::Atom(s) => Err(ParseError::new(s.span, "Everything must be in a list")),
        })
        .collect::<Result<_>>()?;
    Ok((exprs, metadata))
}
568
569use miette::{Diagnostic, SourceSpan};
570use thiserror::Error;
571
#[derive(Error, Debug, Diagnostic)]
#[error("Error in configuration syntax")]
#[diagnostic()]
/// A configuration-syntax error rendered through `miette` diagnostics.
pub struct LexError {
    // Snippets and highlights can be included in the diagnostic!
    /// The offending region of the source, highlighted as "Here".
    #[label("Here")]
    pub err_span: SourceSpan,
    /// Human-readable explanation shown as the diagnostic's help text.
    #[help]
    pub help_msg: String,
}
581}