sexpression/
reader.rs

1//! S-Expression Parser Module
2//! 
3//! This module provides a zero-copy S-expression parser optimized for compiler use cases.
4//! The parser uses borrowed string slices to minimize memory allocations while providing
5//! fast parsing performance through various optimizations.
6//! 
7//! # Features
8//! 
9//! - **Zero-copy parsing**: Uses borrowed string slices to avoid unnecessary allocations
10//! - **Fast-path optimizations**: Optimized number parsing and single-character symbols
11//! - **Production error handling**: Proper error types instead of panics
12//! - **Memory efficient**: Pre-allocated vectors and optimized tokenization
13//! 
14//! # Example
15//! 
16//! ```rust
17//! use sexpression::{Expression, read};
18//! 
19//! let result = read("(define (factorial n) (if (= n 0) 1 (* n (factorial (- n 1)))))");
20//! match result {
21//!     Ok(expr) => println!("Parsed: {:?}", expr),
22//!     Err(e) => eprintln!("Parse error: {}", e),
23//! }
24//! ```
25
26// Zero-copy Expression that borrows from source
27/// Represents an S-expression as a borrowed data structure.
28/// 
29/// This enum provides zero-copy access to parsed S-expressions by borrowing
30/// string slices from the original source. This is ideal for compiler use cases
31/// where you need to parse expressions without taking ownership of the data.
32/// 
33/// # Examples
34/// 
35/// ```rust
36/// use sexpression::Expression;
37/// 
38/// // Numbers
39/// let num = Expression::Number(42.0);
40/// 
41/// // Symbols
42/// let sym = Expression::Symbol("define");
43/// 
44/// // Lists
45/// let list = Expression::List(vec![
46///     Expression::Symbol("+"),
47///     Expression::Number(1.0),
48///     Expression::Number(2.0)
49/// ]);
50/// ```
51#[derive(PartialEq, Debug)]
52pub enum Expression<'a> {
53    /// A numeric literal (f64)
54    Number(f64),
55    /// A boolean literal
56    Bool(bool),
57    /// A string literal (borrowed from source)
58    Str(&'a str),
59    /// A symbol/identifier (borrowed from source)
60    Symbol(&'a str),
61    /// A list of expressions
62    List(Vec<Expression<'a>>),
63    /// A null value
64    Null,
65}
66
67/// Owned version of Expression for when you need ownership.
68/// 
69/// This is useful when you need to store expressions independently of the
70/// original source string, or when you need to modify the expressions.
71/// 
72/// # Examples
73/// 
74/// ```rust
75/// use sexpression::{Expression, OwnedExpression};
76/// 
77/// let borrowed = Expression::Symbol("hello");
78/// let owned: OwnedExpression = borrowed.to_owned();
79/// assert_eq!(owned, OwnedExpression::Symbol("hello".to_string()));
80/// ```
81#[derive(PartialEq, Debug, Clone)]
82pub enum OwnedExpression {
83    /// A numeric literal (f64)
84    Number(f64),
85    /// A boolean literal
86    Bool(bool),
87    /// A string literal (owned)
88    Str(String),
89    /// A symbol/identifier (owned)
90    Symbol(String),
91    /// A list of expressions
92    List(Vec<OwnedExpression>),
93    /// A null value
94    Null,
95}
96
97/// Parse errors that can occur during S-expression parsing.
98/// 
99/// This enum provides detailed error information for debugging and
100/// error handling in production environments.
101#[derive(Debug, thiserror::Error)]
102pub enum ParseError {
103    /// Unexpected end of input while parsing
104    #[error("Unexpected EOF")]
105    UnexpectedEOF,
106    /// Missing closing parenthesis in a list
107    #[error("Missing closing parenthesis")]
108    MissingClosingParen,
109    /// Unexpected closing parenthesis (no matching opening parenthesis)
110    #[error("Unexpected closing parenthesis")]
111    UnexpectedClosingParen,
112}
113
114impl<'a> Expression<'a> {
115    /// Convert a borrowed expression to an owned expression.
116    /// 
117    /// This method allocates new strings for all string and symbol data,
118    /// allowing the resulting expression to outlive the original source.
119    /// 
120    /// # Examples
121    /// 
122    /// ```rust
123    /// use sexpression::Expression;
124    /// 
125    /// let borrowed = Expression::Symbol("hello");
126    /// let owned = borrowed.to_owned();
127    /// assert_eq!(owned, sexpression::OwnedExpression::Symbol("hello".to_string()));
128    /// ```
129    pub fn to_owned(&self) -> OwnedExpression {
130        match self {
131            Expression::Number(n) => OwnedExpression::Number(*n),
132            Expression::Bool(b) => OwnedExpression::Bool(*b),
133            Expression::Str(s) => OwnedExpression::Str(s.to_string()),
134            Expression::Symbol(s) => OwnedExpression::Symbol(s.to_string()),
135            Expression::List(list) => OwnedExpression::List(
136                list.iter().map(|expr| expr.to_owned()).collect()
137            ),
138            Expression::Null => OwnedExpression::Null,
139        }
140    }
141}
142
143/// Optimized zero-copy tokenizer using string slices.
144/// 
145/// This function efficiently tokenizes S-expression source code by:
146/// - Pre-allocating vectors with realistic capacity estimates
147/// - Using efficient string operations instead of character-by-character iteration
148/// - Minimizing memory allocations through zero-copy string slices
149/// 
150/// # Arguments
151/// 
152/// * `src` - The source string to tokenize
153/// 
154/// # Returns
155/// 
156/// A vector of string slices representing the tokens
157fn tokenize(src: &str) -> Vec<&str> {
158    // More realistic capacity estimate
159    let mut tokens = Vec::with_capacity(src.len() / 2);
160    let mut current = src;
161    
162    while !current.is_empty() {
163        // Skip leading whitespace efficiently
164        current = current.trim_start();
165        if current.is_empty() { break; }
166        
167        // Find next delimiter or whitespace
168        let (token, rest) = match current.find(|c: char| c.is_whitespace() || "()'".contains(c)) {
169            Some(pos) => {
170                let token = &current[..pos];
171                let rest = &current[pos..];
172                (token, rest)
173            }
174            None => (current, ""),
175        };
176        
177        if !token.is_empty() {
178            tokens.push(token);
179        }
180        
181        // Handle delimiter efficiently
182        if !rest.is_empty() {
183            let delimiter = &rest[..1];
184            if "()'".contains(delimiter) {
185                tokens.push(delimiter);
186            }
187            current = &rest[1..];
188        } else {
189            break;
190        }
191    }
192    
193    tokens
194}
195
196/// Optimized zero-copy parser with proper error handling.
197/// 
198/// This function parses a slice of tokens into an S-expression, using:
199/// - Pre-allocated vectors for common list sizes
200/// - Proper error handling instead of panics
201/// - Recursive descent parsing with zero-copy semantics
202/// 
203/// # Arguments
204/// 
205/// * `tokens` - A mutable reference to a slice of tokens to parse
206/// 
207/// # Returns
208/// 
209/// A `Result` containing either the parsed expression or a parse error
210/// 
211/// # Errors
212/// 
213/// Returns `ParseError` variants for various parsing failures
214fn parse<'a>(tokens: &mut &[&'a str]) -> Result<Expression<'a>, ParseError> {
215    if tokens.is_empty() {
216        return Err(ParseError::UnexpectedEOF);
217    }
218    
219    let token = tokens[0];
220    *tokens = &tokens[1..]; // Advance slice
221    
222    match token {
223        "(" => {
224            // Pre-allocate list vector for common list sizes
225            let mut stack = Vec::with_capacity(8);
226            while !tokens.is_empty() && tokens[0] != ")" {
227                stack.push(parse(tokens)?);
228            }
229            if tokens.is_empty() {
230                return Err(ParseError::MissingClosingParen);
231            }
232            *tokens = &tokens[1..]; // Skip closing paren
233            Ok(Expression::List(stack))
234        }
235        ")" => Err(ParseError::UnexpectedClosingParen),
236        _ => Ok(parse_atom(token)),
237    }
238}
239
240/// Optimized atom parsing with fast paths.
241/// 
242/// This function parses individual tokens into atomic expressions using:
243/// - Fast-path checks for single-character symbols
244/// - Optimized number parsing with first-character checks
245/// - Bounds-safe string literal handling
246/// 
247/// # Arguments
248/// 
249/// * `token` - The token string to parse as an atom
250/// 
251/// # Returns
252/// 
253/// The parsed atomic expression
254fn parse_atom(token: &str) -> Expression {
255    // Fast path: single character symbols
256    if token.len() == 1 {
257        return Expression::Symbol(token);
258    }
259    
260    // Fast path: check first character for number parsing
261    if let Some(first) = token.chars().next() {
262        if first.is_ascii_digit() || first == '-' || first == '+' {
263            if let Ok(n) = token.parse::<f64>() {
264                return Expression::Number(n);
265            }
266        }
267    }
268    
269    // Check for booleans and null
270    match token {
271        "true" => return Expression::Bool(true),
272        "false" => return Expression::Bool(false),
273        "null" => return Expression::Null,
274        _ => {}
275    }
276    
277    // Optimized string literal handling with bounds safety
278    if token.len() >= 2 && token.starts_with('"') && token.ends_with('"') {
279        if let Some(content) = token.get(1..token.len()-1) {
280            return Expression::Str(content);
281        }
282    }
283    
284    // Default to symbol
285    Expression::Symbol(token)
286}
287
288/// Main parsing function with error handling.
289/// 
290/// This is the primary entry point for parsing S-expressions. It provides:
291/// - Zero-copy parsing with borrowed string slices
292/// - Comprehensive error handling
293/// - Optimized performance through various fast paths
294/// 
295/// # Arguments
296/// 
297/// * `src` - The source string to parse as an S-expression
298/// 
299/// # Returns
300/// 
301/// A `Result` containing either the parsed expression or a parse error
302/// 
303/// # Examples
304/// 
305/// ```rust
306/// use sexpression::{read, Expression};
307/// 
308/// // Successful parsing
309/// let result = read("(define x 42)");
310/// assert!(result.is_ok());
311/// 
312/// // Error handling
313/// let result = read("(unclosed");
314/// assert!(result.is_err());
315/// ```
316pub fn read(src: &str) -> Result<Expression, ParseError> {
317    let tokens = tokenize(src);
318    let mut token_slice = tokens.as_slice();
319    parse(&mut token_slice)
320}
321
322/// Convenience function for backward compatibility (panics on error).
323/// 
324/// This function provides the same interface as the original parser
325/// but will panic if parsing fails. Use `read()` for production code
326/// that needs proper error handling.
327/// 
328/// # Arguments
329/// 
330/// * `src` - The source string to parse as an S-expression
331/// 
332/// # Returns
333/// 
334/// The parsed expression
335/// 
336/// # Panics
337/// 
338/// Panics if the source cannot be parsed as a valid S-expression
339/// 
340/// # Examples
341/// 
342/// ```rust
343/// use sexpression::read_unchecked;
344/// 
345/// let expr = read_unchecked("(hello world)");
346/// // Use expr safely knowing it was parsed successfully
347/// ```
348pub fn read_unchecked(src: &str) -> Expression {
349    read(src).expect("Failed to parse S-expression")
350}
351
352#[cfg(test)]
353mod tests {
354    use super::*;
355
356    #[test]
357    fn tokenize_test() {
358        assert_eq!(tokenize("this is a test"), vec!["this", "is", "a", "test"]);
359        assert_eq!(tokenize("(hello world)"), vec!["(", "hello", "world", ")"]);
360    }
361
362    #[test]
363    fn read_test() {
364        let result = read("(1 symbol \"string\" true null (1.5))").unwrap();
365        println!("{:?}", result);
366        
367        // Test error handling
368        assert!(read("(unclosed").is_err());
369        assert!(read(")unexpected").is_err());
370    }
371
372    #[test]
373    fn fast_path_tests() {
374        // Test single character symbols
375        let result = read("a").unwrap();
376        assert!(matches!(result, Expression::Symbol("a")));
377        
378        // Test number parsing
379        let result = read("42").unwrap();
380        assert!(matches!(result, Expression::Number(42.0)));
381        
382        // Test negative numbers
383        let result = read("-3.14").unwrap();
384        assert!(matches!(result, Expression::Number(-3.14)));
385    }
386    
387    #[test]
388    fn performance_test() {
389        // Simple performance test without unstable features
390        let input = "(define (factorial n) (if (= n 0) 1 (* n (factorial (- n 1)))))";
391        let start = std::time::Instant::now();
392        for _ in 0..1000 {
393            let _ = read(input);
394        }
395        let duration = start.elapsed();
396        println!("Parsed 1000 times in {:?}", duration);
397    }
398}