sexpression/reader.rs
1//! S-Expression Parser Module
2//!
3//! This module provides a zero-copy S-expression parser optimized for compiler use cases.
4//! The parser uses borrowed string slices to minimize memory allocations while providing
5//! fast parsing performance through various optimizations.
6//!
7//! # Features
8//!
9//! - **Zero-copy parsing**: Uses borrowed string slices to avoid unnecessary allocations
10//! - **Fast-path optimizations**: Optimized number parsing and single-character symbols
11//! - **Production error handling**: Proper error types instead of panics
12//! - **Memory efficient**: Pre-allocated vectors and optimized tokenization
13//!
14//! # Example
15//!
16//! ```rust
17//! use sexpression::{Expression, read};
18//!
19//! let result = read("(define (factorial n) (if (= n 0) 1 (* n (factorial (- n 1)))))");
20//! match result {
21//! Ok(expr) => println!("Parsed: {:?}", expr),
22//! Err(e) => eprintln!("Parse error: {}", e),
23//! }
24//! ```
25
26// Zero-copy Expression that borrows from source
27/// Represents an S-expression as a borrowed data structure.
28///
29/// This enum provides zero-copy access to parsed S-expressions by borrowing
30/// string slices from the original source. This is ideal for compiler use cases
31/// where you need to parse expressions without taking ownership of the data.
32///
33/// # Examples
34///
35/// ```rust
36/// use sexpression::Expression;
37///
38/// // Numbers
39/// let num = Expression::Number(42.0);
40///
41/// // Symbols
42/// let sym = Expression::Symbol("define");
43///
44/// // Lists
45/// let list = Expression::List(vec![
46/// Expression::Symbol("+"),
47/// Expression::Number(1.0),
48/// Expression::Number(2.0)
49/// ]);
50/// ```
51#[derive(PartialEq, Debug)]
52pub enum Expression<'a> {
53 /// A numeric literal (f64)
54 Number(f64),
55 /// A boolean literal
56 Bool(bool),
57 /// A string literal (borrowed from source)
58 Str(&'a str),
59 /// A symbol/identifier (borrowed from source)
60 Symbol(&'a str),
61 /// A list of expressions
62 List(Vec<Expression<'a>>),
63 /// A null value
64 Null,
65}
66
67/// Owned version of Expression for when you need ownership.
68///
69/// This is useful when you need to store expressions independently of the
70/// original source string, or when you need to modify the expressions.
71///
72/// # Examples
73///
74/// ```rust
75/// use sexpression::{Expression, OwnedExpression};
76///
77/// let borrowed = Expression::Symbol("hello");
78/// let owned: OwnedExpression = borrowed.to_owned();
79/// assert_eq!(owned, OwnedExpression::Symbol("hello".to_string()));
80/// ```
81#[derive(PartialEq, Debug, Clone)]
82pub enum OwnedExpression {
83 /// A numeric literal (f64)
84 Number(f64),
85 /// A boolean literal
86 Bool(bool),
87 /// A string literal (owned)
88 Str(String),
89 /// A symbol/identifier (owned)
90 Symbol(String),
91 /// A list of expressions
92 List(Vec<OwnedExpression>),
93 /// A null value
94 Null,
95}
96
97/// Parse errors that can occur during S-expression parsing.
98///
99/// This enum provides detailed error information for debugging and
100/// error handling in production environments.
101#[derive(Debug, thiserror::Error)]
102pub enum ParseError {
103 /// Unexpected end of input while parsing
104 #[error("Unexpected EOF")]
105 UnexpectedEOF,
106 /// Missing closing parenthesis in a list
107 #[error("Missing closing parenthesis")]
108 MissingClosingParen,
109 /// Unexpected closing parenthesis (no matching opening parenthesis)
110 #[error("Unexpected closing parenthesis")]
111 UnexpectedClosingParen,
112}
113
114impl<'a> Expression<'a> {
115 /// Convert a borrowed expression to an owned expression.
116 ///
117 /// This method allocates new strings for all string and symbol data,
118 /// allowing the resulting expression to outlive the original source.
119 ///
120 /// # Examples
121 ///
122 /// ```rust
123 /// use sexpression::Expression;
124 ///
125 /// let borrowed = Expression::Symbol("hello");
126 /// let owned = borrowed.to_owned();
127 /// assert_eq!(owned, sexpression::OwnedExpression::Symbol("hello".to_string()));
128 /// ```
129 pub fn to_owned(&self) -> OwnedExpression {
130 match self {
131 Expression::Number(n) => OwnedExpression::Number(*n),
132 Expression::Bool(b) => OwnedExpression::Bool(*b),
133 Expression::Str(s) => OwnedExpression::Str(s.to_string()),
134 Expression::Symbol(s) => OwnedExpression::Symbol(s.to_string()),
135 Expression::List(list) => OwnedExpression::List(
136 list.iter().map(|expr| expr.to_owned()).collect()
137 ),
138 Expression::Null => OwnedExpression::Null,
139 }
140 }
141}
142
143/// Optimized zero-copy tokenizer using string slices.
144///
145/// This function efficiently tokenizes S-expression source code by:
146/// - Pre-allocating vectors with realistic capacity estimates
147/// - Using efficient string operations instead of character-by-character iteration
148/// - Minimizing memory allocations through zero-copy string slices
149///
150/// # Arguments
151///
152/// * `src` - The source string to tokenize
153///
154/// # Returns
155///
156/// A vector of string slices representing the tokens
157fn tokenize(src: &str) -> Vec<&str> {
158 // More realistic capacity estimate
159 let mut tokens = Vec::with_capacity(src.len() / 2);
160 let mut current = src;
161
162 while !current.is_empty() {
163 // Skip leading whitespace efficiently
164 current = current.trim_start();
165 if current.is_empty() { break; }
166
167 // Find next delimiter or whitespace
168 let (token, rest) = match current.find(|c: char| c.is_whitespace() || "()'".contains(c)) {
169 Some(pos) => {
170 let token = ¤t[..pos];
171 let rest = ¤t[pos..];
172 (token, rest)
173 }
174 None => (current, ""),
175 };
176
177 if !token.is_empty() {
178 tokens.push(token);
179 }
180
181 // Handle delimiter efficiently
182 if !rest.is_empty() {
183 let delimiter = &rest[..1];
184 if "()'".contains(delimiter) {
185 tokens.push(delimiter);
186 }
187 current = &rest[1..];
188 } else {
189 break;
190 }
191 }
192
193 tokens
194}
195
196/// Optimized zero-copy parser with proper error handling.
197///
198/// This function parses a slice of tokens into an S-expression, using:
199/// - Pre-allocated vectors for common list sizes
200/// - Proper error handling instead of panics
201/// - Recursive descent parsing with zero-copy semantics
202///
203/// # Arguments
204///
205/// * `tokens` - A mutable reference to a slice of tokens to parse
206///
207/// # Returns
208///
209/// A `Result` containing either the parsed expression or a parse error
210///
211/// # Errors
212///
213/// Returns `ParseError` variants for various parsing failures
214fn parse<'a>(tokens: &mut &[&'a str]) -> Result<Expression<'a>, ParseError> {
215 if tokens.is_empty() {
216 return Err(ParseError::UnexpectedEOF);
217 }
218
219 let token = tokens[0];
220 *tokens = &tokens[1..]; // Advance slice
221
222 match token {
223 "(" => {
224 // Pre-allocate list vector for common list sizes
225 let mut stack = Vec::with_capacity(8);
226 while !tokens.is_empty() && tokens[0] != ")" {
227 stack.push(parse(tokens)?);
228 }
229 if tokens.is_empty() {
230 return Err(ParseError::MissingClosingParen);
231 }
232 *tokens = &tokens[1..]; // Skip closing paren
233 Ok(Expression::List(stack))
234 }
235 ")" => Err(ParseError::UnexpectedClosingParen),
236 _ => Ok(parse_atom(token)),
237 }
238}
239
240/// Optimized atom parsing with fast paths.
241///
242/// This function parses individual tokens into atomic expressions using:
243/// - Fast-path checks for single-character symbols
244/// - Optimized number parsing with first-character checks
245/// - Bounds-safe string literal handling
246///
247/// # Arguments
248///
249/// * `token` - The token string to parse as an atom
250///
251/// # Returns
252///
253/// The parsed atomic expression
254fn parse_atom(token: &str) -> Expression {
255 // Fast path: single character symbols
256 if token.len() == 1 {
257 return Expression::Symbol(token);
258 }
259
260 // Fast path: check first character for number parsing
261 if let Some(first) = token.chars().next() {
262 if first.is_ascii_digit() || first == '-' || first == '+' {
263 if let Ok(n) = token.parse::<f64>() {
264 return Expression::Number(n);
265 }
266 }
267 }
268
269 // Check for booleans and null
270 match token {
271 "true" => return Expression::Bool(true),
272 "false" => return Expression::Bool(false),
273 "null" => return Expression::Null,
274 _ => {}
275 }
276
277 // Optimized string literal handling with bounds safety
278 if token.len() >= 2 && token.starts_with('"') && token.ends_with('"') {
279 if let Some(content) = token.get(1..token.len()-1) {
280 return Expression::Str(content);
281 }
282 }
283
284 // Default to symbol
285 Expression::Symbol(token)
286}
287
288/// Main parsing function with error handling.
289///
290/// This is the primary entry point for parsing S-expressions. It provides:
291/// - Zero-copy parsing with borrowed string slices
292/// - Comprehensive error handling
293/// - Optimized performance through various fast paths
294///
295/// # Arguments
296///
297/// * `src` - The source string to parse as an S-expression
298///
299/// # Returns
300///
301/// A `Result` containing either the parsed expression or a parse error
302///
303/// # Examples
304///
305/// ```rust
306/// use sexpression::{read, Expression};
307///
308/// // Successful parsing
309/// let result = read("(define x 42)");
310/// assert!(result.is_ok());
311///
312/// // Error handling
313/// let result = read("(unclosed");
314/// assert!(result.is_err());
315/// ```
316pub fn read(src: &str) -> Result<Expression, ParseError> {
317 let tokens = tokenize(src);
318 let mut token_slice = tokens.as_slice();
319 parse(&mut token_slice)
320}
321
322/// Convenience function for backward compatibility (panics on error).
323///
324/// This function provides the same interface as the original parser
325/// but will panic if parsing fails. Use `read()` for production code
326/// that needs proper error handling.
327///
328/// # Arguments
329///
330/// * `src` - The source string to parse as an S-expression
331///
332/// # Returns
333///
334/// The parsed expression
335///
336/// # Panics
337///
338/// Panics if the source cannot be parsed as a valid S-expression
339///
340/// # Examples
341///
342/// ```rust
343/// use sexpression::read_unchecked;
344///
345/// let expr = read_unchecked("(hello world)");
346/// // Use expr safely knowing it was parsed successfully
347/// ```
348pub fn read_unchecked(src: &str) -> Expression {
349 read(src).expect("Failed to parse S-expression")
350}
351
352#[cfg(test)]
353mod tests {
354 use super::*;
355
356 #[test]
357 fn tokenize_test() {
358 assert_eq!(tokenize("this is a test"), vec!["this", "is", "a", "test"]);
359 assert_eq!(tokenize("(hello world)"), vec!["(", "hello", "world", ")"]);
360 }
361
362 #[test]
363 fn read_test() {
364 let result = read("(1 symbol \"string\" true null (1.5))").unwrap();
365 println!("{:?}", result);
366
367 // Test error handling
368 assert!(read("(unclosed").is_err());
369 assert!(read(")unexpected").is_err());
370 }
371
372 #[test]
373 fn fast_path_tests() {
374 // Test single character symbols
375 let result = read("a").unwrap();
376 assert!(matches!(result, Expression::Symbol("a")));
377
378 // Test number parsing
379 let result = read("42").unwrap();
380 assert!(matches!(result, Expression::Number(42.0)));
381
382 // Test negative numbers
383 let result = read("-3.14").unwrap();
384 assert!(matches!(result, Expression::Number(-3.14)));
385 }
386
387 #[test]
388 fn performance_test() {
389 // Simple performance test without unstable features
390 let input = "(define (factorial n) (if (= n 0) 1 (* n (factorial (- n 1)))))";
391 let start = std::time::Instant::now();
392 for _ in 0..1000 {
393 let _ = read(input);
394 }
395 let duration = start.elapsed();
396 println!("Parsed 1000 times in {:?}", duration);
397 }
398}