cirru_parser/
parser.rs

1/*! # Cirru Parser
2This tiny parser parses indentation based syntax into nested a vector,
3then it could used as S-Expressions for evaluation or codegen.
4
5```cirru
6defn fib (x)
7  if (<= x 2) 1
8    +
9      fib $ dec x
10      fib $ - x 2
11```
12
13parses to:
14
15```edn
16[ ["defn" "fib" [ "x" ]
17    [ "if" [ "<=" "x" "2" ] "1"
18      [ "+" [ "fib" ["dec" "x"] ] [ "fib" ["-" "x" "2"] ] ]
19    ]
20] ]
21```
22
23find more on <http://text.cirru.org/> .
24*/
25
26mod primes;
27mod s_expr;
28mod tree;
29mod writer;
30
31#[cfg(feature = "serde-json")]
32mod json;
33
34#[cfg(feature = "serde-json")]
35pub use json::*;
36
37const DEFAULT_EXPR_CAPACITY: usize = 8; // Added for default capacity
38
39use std::cmp::Ordering::*;
40
41use primes::CirruLexState;
42use tree::{resolve_comma, resolve_dollar};
43
44pub use primes::{Cirru, CirruLexItem, CirruLexItemList, escape_cirru_leaf};
45pub use s_expr::format_to_lisp;
46pub use writer::{CirruWriterOptions, format};
47
48/// builds a tree from a flat list of tokens
49fn build_exprs(tokens: &[CirruLexItem]) -> Result<Vec<Cirru>, String> {
50  let mut acc: Vec<Cirru> = Vec::with_capacity(tokens.len() / 6 + 1);
51  let mut idx = 0;
52  let mut pull_token = || {
53    if idx >= tokens.len() {
54      return None;
55    }
56    let pos = idx;
57    idx += 1;
58    Some(&tokens[pos])
59  };
60  loop {
61    let chunk = pull_token();
62
63    match &chunk {
64      None => return Ok(acc),
65      Some(ck) => {
66        match ck {
67          CirruLexItem::Open => {
68            let mut pointer: Vec<Cirru> = Vec::with_capacity(DEFAULT_EXPR_CAPACITY);
69            // guess a nested level of 16
70            let mut pointer_stack: Vec<Vec<Cirru>> = Vec::with_capacity(16);
71            loop {
72              let cursor = pull_token();
73
74              match &cursor {
75                None => return Err(String::from("unexpected end of file")),
76                Some(c) => match c {
77                  CirruLexItem::Close => match pointer_stack.pop() {
78                    None => {
79                      acc.push(Cirru::List(pointer));
80                      break;
81                    }
82                    Some(v) => {
83                      let prev_p = pointer;
84                      pointer = v;
85                      pointer.push(Cirru::List(prev_p));
86                    }
87                  },
88                  CirruLexItem::Open => {
89                    pointer_stack.push(pointer);
90                    pointer = Vec::with_capacity(DEFAULT_EXPR_CAPACITY);
91                  }
92                  CirruLexItem::Str(s) => pointer.push(Cirru::Leaf((**s).into())),
93                  CirruLexItem::Indent(n) => return Err(format!("unknown indent: {n}")),
94                },
95              }
96            }
97          }
98          CirruLexItem::Close => return Err(String::from("unexpected \")\"")),
99          a => return Err(format!("unknown item: {a:?}")),
100        }
101      }
102    }
103  }
104}
105
106fn parse_indentation(size: u8) -> Result<CirruLexItem, String> {
107  if size & 0x1 == 0x0 {
108    // even number
109    Ok(CirruLexItem::Indent(size >> 1))
110  } else {
111    Err(format!("odd indentation size, {size}"))
112  }
113}
114
115const DEFAULT_BUFFER_CAPACITY: usize = 8;
116
117/// The lexer for Cirru syntax. It scans the code and returns a flat list of tokens.
118/// It uses a state machine to handle different parts of the syntax, such as strings,
119/// tokens, and indentation.
120pub fn lex(initial_code: &str) -> Result<CirruLexItemList, String> {
121  // guessed an initial length
122  let mut acc: CirruLexItemList = Vec::with_capacity(initial_code.len() >> 4);
123  let mut state = CirruLexState::Indent;
124  let mut buffer = String::with_capacity(DEFAULT_BUFFER_CAPACITY);
125  let code = initial_code;
126
127  for (idx, c) in code.char_indices() {
128    match state {
129      CirruLexState::Space => match c {
130        ' ' => {
131          state = CirruLexState::Space;
132          buffer.clear();
133        }
134        '\n' => {
135          state = CirruLexState::Indent;
136          buffer.clear();
137        }
138        '(' => {
139          acc.push(CirruLexItem::Open);
140          state = CirruLexState::Space;
141          buffer = String::new()
142        }
143        ')' => {
144          acc.push(CirruLexItem::Close);
145          state = CirruLexState::Space;
146          buffer.clear()
147        }
148        '"' => {
149          state = CirruLexState::Str;
150          buffer.clear();
151        }
152        _ => {
153          state = CirruLexState::Token;
154          buffer.clear();
155          buffer.push(c);
156        }
157      },
158      CirruLexState::Token => match c {
159        ' ' => {
160          acc.push(CirruLexItem::Str(buffer));
161          state = CirruLexState::Space;
162          buffer = String::with_capacity(DEFAULT_BUFFER_CAPACITY);
163        }
164        '"' => {
165          acc.push(CirruLexItem::Str(buffer));
166          state = CirruLexState::Str;
167          buffer = String::with_capacity(DEFAULT_BUFFER_CAPACITY);
168        }
169        '\n' => {
170          acc.push(CirruLexItem::Str(buffer));
171          state = CirruLexState::Indent;
172          buffer = String::with_capacity(DEFAULT_BUFFER_CAPACITY);
173        }
174        '(' => {
175          acc.push(CirruLexItem::Str(buffer));
176          acc.push(CirruLexItem::Open);
177          state = CirruLexState::Space;
178          buffer = String::new()
179        }
180        ')' => {
181          acc.push(CirruLexItem::Str(buffer));
182          acc.push(CirruLexItem::Close);
183          state = CirruLexState::Space;
184          buffer = String::new()
185        }
186        _ => {
187          state = CirruLexState::Token;
188          buffer.push(c);
189        }
190      },
191      CirruLexState::Str => match c {
192        '"' => {
193          acc.push(CirruLexItem::Str(buffer));
194          state = CirruLexState::Space;
195          buffer = String::with_capacity(DEFAULT_BUFFER_CAPACITY);
196        }
197        '\\' => {
198          state = CirruLexState::Escape;
199        }
200        '\n' => {
201          return Err(String::from("unexpected newline in string"));
202        }
203        _ => {
204          state = CirruLexState::Str;
205          buffer.push(c);
206        }
207      },
208      CirruLexState::Escape => match c {
209        '"' => {
210          state = CirruLexState::Str;
211          buffer.push('"');
212        }
213        '\'' => {
214          state = CirruLexState::Str;
215          buffer.push('\'');
216        }
217        't' => {
218          state = CirruLexState::Str;
219          buffer.push('\t');
220        }
221        'n' => {
222          state = CirruLexState::Str;
223          buffer.push('\n');
224        }
225        'r' => {
226          state = CirruLexState::Str;
227          buffer.push('\r');
228        }
229        'u' => {
230          // not supporting, but don't panic
231          let end = idx + 10;
232          let peek = if end >= code.len() { &code[idx..] } else { &code[idx..end] };
233          println!("Unicode escaping is not supported yet: {peek:?} ...");
234          buffer.push_str("\\u");
235          state = CirruLexState::Str;
236        }
237        '\\' => {
238          state = CirruLexState::Str;
239          buffer.push('\\');
240        }
241        _ => return Err(format!("unexpected character during string escaping: {c:?}")),
242      },
243      CirruLexState::Indent => match c {
244        ' ' => {
245          state = CirruLexState::Indent;
246          buffer.push(c);
247        }
248        '\n' => {
249          state = CirruLexState::Indent;
250          buffer.clear();
251        }
252        '"' => {
253          let level = parse_indentation(buffer.len() as u8)?;
254          acc.push(level);
255          state = CirruLexState::Str;
256          buffer = String::new();
257        }
258        '(' => {
259          let level = parse_indentation(buffer.len() as u8)?;
260          acc.push(level);
261          acc.push(CirruLexItem::Open);
262          state = CirruLexState::Space;
263          buffer.clear();
264        }
265        ')' => return Err(String::from("unexpected ) at line start")),
266        _ => {
267          let level = parse_indentation(buffer.len() as u8)?;
268          acc.push(level);
269          state = CirruLexState::Token;
270          buffer.clear();
271          buffer.push(c);
272        }
273      },
274    }
275  }
276
277  match state {
278    CirruLexState::Space => Ok(acc),
279    CirruLexState::Token => {
280      acc.push(CirruLexItem::Str(buffer));
281      Ok(acc)
282    }
283    CirruLexState::Escape => Err(String::from("unknown escape")),
284    CirruLexState::Indent => Ok(acc),
285    CirruLexState::Str => Err(String::from("finished at string")),
286  }
287}
288
289/// This function transforms a flat list of tokens into a tree structure
290/// by handling indentation. It inserts `Open` and `Close` tokens based on
291/// changes in indentation levels.
292///
293/// # Examples
294///
295/// ```
296/// # use cirru_parser::{CirruLexItem, resolve_indentations};
297/// # use cirru_parser::CirruLexItem::*;
298/// let tokens = vec![Indent(0), "a".into(), Indent(1), "b".into()];
299/// let resolved = resolve_indentations(&tokens);
300/// assert_eq!(resolved, vec![Open, "a".into(), Open, "b".into(), Close, Close]);
301/// ```
302pub fn resolve_indentations(tokens: &[CirruLexItem]) -> CirruLexItemList {
303  let mut acc: CirruLexItemList = Vec::with_capacity(tokens.len() * 2);
304  let mut level: u8 = 0;
305
306  if tokens.is_empty() {
307    return vec![];
308  }
309
310  for token in tokens {
311    match token {
312      CirruLexItem::Str(s) => {
313        acc.push(CirruLexItem::Str(s.to_owned()));
314      }
315      CirruLexItem::Open => {
316        acc.push(CirruLexItem::Open);
317      }
318      CirruLexItem::Close => {
319        acc.push(CirruLexItem::Close);
320      }
321      CirruLexItem::Indent(n) => {
322        match n.cmp(&level) {
323          Greater => {
324            // Indent level increased, push open parenthesis.
325            for _ in 0..(n - level) {
326              acc.push(CirruLexItem::Open);
327            }
328          }
329          Less => {
330            // Indent level decreased, push close parenthesis.
331            for _ in 0..(level - n) {
332              acc.push(CirruLexItem::Close);
333            }
334            acc.push(CirruLexItem::Close);
335            acc.push(CirruLexItem::Open);
336          }
337          Equal => {
338            // Same indent level, close previous expression and start a new one.
339            if !acc.is_empty() {
340              acc.push(CirruLexItem::Close);
341              acc.push(CirruLexItem::Open);
342            }
343          }
344        }
345        level = *n;
346      }
347    }
348  }
349
350  // Close all remaining parenthesis.
351  if !acc.is_empty() {
352    let mut new_acc = Vec::with_capacity(1 + acc.len() + level as usize + 1);
353    new_acc.push(CirruLexItem::Open);
354    new_acc.append(&mut acc); // acc is drained
355
356    for _ in 0..level {
357      new_acc.push(CirruLexItem::Close);
358    }
359    new_acc.push(CirruLexItem::Close);
360    new_acc
361  } else {
362    vec![]
363  }
364}
365
366/// Parses a string of Cirru code into a tree of `Cirru` expressions.
367///
368/// This is the main entry point for the parser. It performs the following steps:
369/// 1. Lexing: The code is tokenized into a flat list of `CirruLexItem`s.
370/// 2. Indentation Resolution: The flat list of tokens is transformed into a tree
371///    structure by handling indentation.
372/// 3. Tree Building: The token tree is converted into a tree of `Cirru` expressions.
373/// 4. Syntax Resolution: Special syntax like `$` and `,` is resolved.
374///
375/// # Examples
376///
377/// ```
378/// # use cirru_parser::{parse, Cirru};
379/// let code = "defn main\n  println \"Hello, world!\"";
380/// let expected = Ok(vec![
381///   Cirru::List(vec![
382///     Cirru::Leaf("defn".into()),
383///     Cirru::Leaf("main".into()),
384///     Cirru::List(vec![
385///       Cirru::Leaf("println".into()),
386///       Cirru::Leaf("Hello, world!".into()),
387///     ]),
388///   ]),
389/// ]);
390/// assert_eq!(parse(code), expected);
391/// ```
392pub fn parse(code: &str) -> Result<Vec<Cirru>, String> {
393  let tokens = resolve_indentations(&lex(code)?);
394  // println!("{:?}", tokens);
395  let mut tree = build_exprs(&tokens)?;
396  // println!("tree {:?}", tree);
397  resolve_dollar(&mut tree);
398  resolve_comma(&mut tree);
399  Ok(tree)
400}
401
402/// Converts a string of Cirru code directly to a Lisp-like string.
403///
404/// This function is a convenience wrapper around `parse` and `format_to_lisp`.
405/// It will panic if parsing or formatting fails.
406pub fn cirru_to_lisp(code: &str) -> String {
407  match parse(code) {
408    Ok(tree) => match format_to_lisp(&tree) {
409      Ok(s) => s,
410      Err(_) => panic!("failed to convert to lisp"),
411    },
412    Err(_) => panic!("expected a leaf"),
413  }
414}