cirru_parser/
parser.rs

1/*! # Cirru Parser
2This tiny parser parses indentation-based syntax into a nested vector,
3which can then be used as S-Expressions for evaluation or codegen.
4
5```cirru
6defn fib (x)
7  if (<= x 2) 1
8    +
9      fib $ dec x
10      fib $ - x 2
11```
12
13parses to:
14
15```edn
16[ ["defn" "fib" [ "x" ]
17    [ "if" [ "<=" "x" "2" ] "1"
18      [ "+" [ "fib" ["dec" "x"] ] [ "fib" ["-" "x" "2"] ] ]
19    ]
20] ]
21```
22
23find more on <http://text.cirru.org/> .
24*/
25
26mod primes;
27mod s_expr;
28mod tree;
29mod writer;
30
31#[cfg(feature = "use-serde")]
32mod json;
33
34#[cfg(feature = "use-serde")]
35pub use json::*;
36
37const DEFAULT_EXPR_CAPACITY: usize = 8; // initial capacity of each expression vector allocated in build_exprs
38
39use std::cmp::Ordering::*;
40
41use primes::CirruLexState;
42use tree::{resolve_comma, resolve_dollar};
43
44pub use primes::{Cirru, CirruLexItem, CirruLexItemList, escape_cirru_leaf};
45pub use s_expr::format_to_lisp;
46pub use writer::{CirruWriterOptions, format};
47
48fn build_exprs(tokens: &[CirruLexItem]) -> Result<Vec<Cirru>, String> {
49  let mut acc: Vec<Cirru> = Vec::with_capacity(tokens.len() / 6 + 1);
50  let mut idx = 0;
51  let mut pull_token = || {
52    if idx >= tokens.len() {
53      return None;
54    }
55    let pos = idx;
56    idx += 1;
57    Some(&tokens[pos])
58  };
59  loop {
60    let chunk = pull_token();
61
62    match &chunk {
63      None => return Ok(acc),
64      Some(ck) => {
65        match ck {
66          CirruLexItem::Open => {
67            let mut pointer: Vec<Cirru> = Vec::with_capacity(DEFAULT_EXPR_CAPACITY);
68            // guess a nested level of 16
69            let mut pointer_stack: Vec<Vec<Cirru>> = Vec::with_capacity(16);
70            loop {
71              let cursor = pull_token();
72
73              match &cursor {
74                None => return Err(String::from("unexpected end of file")),
75                Some(c) => match c {
76                  CirruLexItem::Close => match pointer_stack.pop() {
77                    None => {
78                      acc.push(Cirru::List(pointer));
79                      break;
80                    }
81                    Some(v) => {
82                      let prev_p = pointer;
83                      pointer = v;
84                      pointer.push(Cirru::List(prev_p));
85                    }
86                  },
87                  CirruLexItem::Open => {
88                    pointer_stack.push(pointer);
89                    pointer = Vec::with_capacity(DEFAULT_EXPR_CAPACITY);
90                  }
91                  CirruLexItem::Str(s) => pointer.push(Cirru::Leaf((**s).into())),
92                  CirruLexItem::Indent(n) => return Err(format!("unknown indent: {}", n)),
93                },
94              }
95            }
96          }
97          CirruLexItem::Close => return Err(String::from("unexpected \")\"")),
98          a => return Err(format!("unknown item: {:?}", a)),
99        }
100      }
101    }
102  }
103}
104
105fn parse_indentation(size: u8) -> Result<CirruLexItem, String> {
106  if size & 0x1 == 0x0 {
107    // even number
108    Ok(CirruLexItem::Indent(size >> 1))
109  } else {
110    Err(format!("odd indentation size, {}", size))
111  }
112}
113
114const DEFAULT_BUFFER_CAPACITY: usize = 8;
115
116/// internal function for lexing
117pub fn lex(initial_code: &str) -> Result<CirruLexItemList, String> {
118  // guessed an initial length
119  let mut acc: CirruLexItemList = Vec::with_capacity(initial_code.len() >> 4);
120  let mut state = CirruLexState::Indent;
121  let mut buffer = String::with_capacity(DEFAULT_BUFFER_CAPACITY);
122  let code = initial_code;
123
124  for (idx, c) in code.chars().enumerate() {
125    match state {
126      CirruLexState::Space => match c {
127        ' ' => {
128          state = CirruLexState::Space;
129          buffer.clear();
130        }
131        '\n' => {
132          state = CirruLexState::Indent;
133          buffer.clear();
134        }
135        '(' => {
136          acc.push(CirruLexItem::Open);
137          state = CirruLexState::Space;
138          buffer = String::new()
139        }
140        ')' => {
141          acc.push(CirruLexItem::Close);
142          state = CirruLexState::Space;
143          buffer.clear()
144        }
145        '"' => {
146          state = CirruLexState::Str;
147          buffer.clear();
148        }
149        _ => {
150          state = CirruLexState::Token;
151          buffer.clear();
152          buffer.push(c);
153        }
154      },
155      CirruLexState::Token => match c {
156        ' ' => {
157          acc.push(CirruLexItem::Str(buffer));
158          state = CirruLexState::Space;
159          buffer = String::with_capacity(DEFAULT_BUFFER_CAPACITY);
160        }
161        '"' => {
162          acc.push(CirruLexItem::Str(buffer));
163          state = CirruLexState::Str;
164          buffer = String::with_capacity(DEFAULT_BUFFER_CAPACITY);
165        }
166        '\n' => {
167          acc.push(CirruLexItem::Str(buffer));
168          state = CirruLexState::Indent;
169          buffer = String::with_capacity(DEFAULT_BUFFER_CAPACITY);
170        }
171        '(' => {
172          acc.push(CirruLexItem::Str(buffer));
173          acc.push(CirruLexItem::Open);
174          state = CirruLexState::Space;
175          buffer = String::new()
176        }
177        ')' => {
178          acc.push(CirruLexItem::Str(buffer));
179          acc.push(CirruLexItem::Close);
180          state = CirruLexState::Space;
181          buffer = String::new()
182        }
183        _ => {
184          state = CirruLexState::Token;
185          buffer.push(c);
186        }
187      },
188      CirruLexState::Str => match c {
189        '"' => {
190          acc.push(CirruLexItem::Str(buffer));
191          state = CirruLexState::Space;
192          buffer = String::with_capacity(DEFAULT_BUFFER_CAPACITY);
193        }
194        '\\' => {
195          state = CirruLexState::Escape;
196        }
197        '\n' => {
198          return Err(String::from("unexpected newline in string"));
199        }
200        _ => {
201          state = CirruLexState::Str;
202          buffer.push(c);
203        }
204      },
205      CirruLexState::Escape => match c {
206        '"' => {
207          state = CirruLexState::Str;
208          buffer.push('"');
209        }
210        '\'' => {
211          state = CirruLexState::Str;
212          buffer.push('\'');
213        }
214        't' => {
215          state = CirruLexState::Str;
216          buffer.push('\t');
217        }
218        'n' => {
219          state = CirruLexState::Str;
220          buffer.push('\n');
221        }
222        'r' => {
223          state = CirruLexState::Str;
224          buffer.push('\r');
225        }
226        'u' => {
227          // not supporting, but don't panic
228          let end = idx + 10;
229          let peek = if end >= code.len() { &code[idx..] } else { &code[idx..end] };
230          println!("Unicode escaping is not supported yet: {:?} ...", peek);
231          buffer.push_str("\\u");
232          state = CirruLexState::Str;
233        }
234        '\\' => {
235          state = CirruLexState::Str;
236          buffer.push('\\');
237        }
238        _ => return Err(format!("unexpected character during string escaping: {:?}", c)),
239      },
240      CirruLexState::Indent => match c {
241        ' ' => {
242          state = CirruLexState::Indent;
243          buffer.push(c);
244        }
245        '\n' => {
246          state = CirruLexState::Indent;
247          buffer.clear();
248        }
249        '"' => {
250          let level = parse_indentation(buffer.len() as u8)?;
251          acc.push(level);
252          state = CirruLexState::Str;
253          buffer = String::new();
254        }
255        '(' => {
256          let level = parse_indentation(buffer.len() as u8)?;
257          acc.push(level);
258          acc.push(CirruLexItem::Open);
259          state = CirruLexState::Space;
260          buffer.clear();
261        }
262        ')' => return Err(String::from("unexpected ) at line start")),
263        _ => {
264          let level = parse_indentation(buffer.len() as u8)?;
265          acc.push(level);
266          state = CirruLexState::Token;
267          buffer.clear();
268          buffer.push(c);
269        }
270      },
271    }
272  }
273
274  match state {
275    CirruLexState::Space => Ok(acc),
276    CirruLexState::Token => {
277      acc.push(CirruLexItem::Str(buffer));
278      Ok(acc)
279    }
280    CirruLexState::Escape => Err(String::from("unknown escape")),
281    CirruLexState::Indent => Ok(acc),
282    CirruLexState::Str => Err(String::from("finished at string")),
283  }
284}
285
286/// internal function for figuring out indentations after lexing
287pub fn resolve_indentations(tokens: &[CirruLexItem]) -> CirruLexItemList {
288  let size = tokens.len();
289  let mut acc: CirruLexItemList = Vec::new();
290  let mut level = 0;
291  let mut pointer = 0;
292  loop {
293    if pointer >= size {
294      if acc.is_empty() {
295        return vec![];
296      }
297
298      // More efficient way to wrap acc
299      let mut new_acc = Vec::with_capacity(1 + acc.len() + level as usize + 1);
300      new_acc.push(CirruLexItem::Open);
301      new_acc.append(&mut acc); // acc is drained
302
303      for _ in 0..level {
304        new_acc.push(CirruLexItem::Close);
305      }
306      new_acc.push(CirruLexItem::Close);
307      return new_acc;
308    }
309    match &tokens[pointer] {
310      CirruLexItem::Str(s) => {
311        acc.push(CirruLexItem::Str(s.to_owned()));
312        pointer += 1;
313      }
314      CirruLexItem::Open => {
315        acc.push(CirruLexItem::Open);
316        pointer += 1;
317      }
318      CirruLexItem::Close => {
319        acc.push(CirruLexItem::Close);
320        pointer += 1;
321      }
322      CirruLexItem::Indent(n) => match n.cmp(&level) {
323        Greater => {
324          let delta = n - level;
325          for _ in 0..delta {
326            acc.push(CirruLexItem::Open);
327          }
328          pointer += 1;
329          level = *n;
330        }
331        Less => {
332          let delta = level - n;
333          for _ in 0..delta {
334            acc.push(CirruLexItem::Close);
335          }
336          acc.push(CirruLexItem::Close);
337          acc.push(CirruLexItem::Open);
338          pointer += 1;
339          level = *n;
340        }
341        Equal => {
342          if acc.is_empty() {
343            acc = vec![];
344          } else {
345            acc.push(CirruLexItem::Close);
346            acc.push(CirruLexItem::Open);
347          }
348          pointer += 1;
349        }
350      },
351    }
352  }
353}
354
355/// parse function, parse String to Cirru.
356///
357/// ```rust
358/// cirru_parser::parse("def a 1");
359/// ```
360pub fn parse(code: &str) -> Result<Vec<Cirru>, String> {
361  let tokens = resolve_indentations(&lex(code)?);
362  // println!("{:?}", tokens);
363  let tree = build_exprs(&tokens)?;
364  // println!("tree {:?}", tree);
365  Ok(resolve_comma(&resolve_dollar(&tree)))
366}
367
368pub fn cirru_to_lisp(code: &str) -> String {
369  match parse(code) {
370    Ok(tree) => match format_to_lisp(&tree) {
371      Ok(s) => s,
372      Err(_) => panic!("failed to convert to lisp"),
373    },
374    Err(_) => panic!("expected a leaf"),
375  }
376}