Skip to main content

shannon_brush_parser/parser/
mod.rs

1use std::path::PathBuf;
2
3use bon::bon;
4
5use crate::ast;
6use crate::tokenizer::{Token, TokenEndReason, Tokenizer, TokenizerOptions, Tokens};
7
8pub mod peg;
9#[cfg(feature = "winnow-parser")]
10pub mod winnow_str;
11
/// Selects which parsing backend is used to build the AST.
#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
pub enum ParserImpl {
    /// The PEG-based backend, which parses a token stream. This is the default.
    #[default]
    Peg,
    /// The winnow-based backend, which parses the input string directly.
    /// Only available when the `winnow-parser` feature is enabled.
    #[cfg(feature = "winnow-parser")]
    Winnow,
}
22
23/// Options used to control the behavior of the parser.
24#[derive(Clone, Eq, Hash, PartialEq)]
25pub struct ParserOptions {
26    /// Whether or not to enable extended globbing (a.k.a. `extglob`).
27    pub enable_extended_globbing: bool,
28    /// Whether or not to enable POSIX compliance mode.
29    pub posix_mode: bool,
30    /// Whether or not to enable maximal compatibility with the `sh` shell.
31    pub sh_mode: bool,
32    /// Whether or not to perform tilde expansion for tildes at the start of words.
33    pub tilde_expansion_at_word_start: bool,
34    /// Whether or not to perform tilde expansion for tildes after colons.
35    pub tilde_expansion_after_colon: bool,
36    /// Select the parser internal implementation
37    pub parser_impl: ParserImpl,
38}
39
40impl Default for ParserOptions {
41    fn default() -> Self {
42        Self {
43            enable_extended_globbing: true,
44            posix_mode: false,
45            sh_mode: false,
46            tilde_expansion_at_word_start: true,
47            tilde_expansion_after_colon: false,
48            parser_impl: ParserImpl::default(),
49        }
50    }
51}
52
53impl ParserOptions {
54    /// Returns the tokenizer options implied by these parser options.
55    pub const fn tokenizer_options(&self) -> TokenizerOptions {
56        TokenizerOptions {
57            enable_extended_globbing: self.enable_extended_globbing,
58            posix_mode: self.posix_mode,
59            sh_mode: self.sh_mode,
60        }
61    }
62}
63
/// Describes where a sequence of tokens came from.
#[derive(Clone, Debug, Default)]
#[allow(dead_code)]
pub struct SourceInfo {
    /// A textual description of the token source (for example, a file path).
    pub source: String,
}

impl From<PathBuf> for SourceInfo {
    /// Builds a [`SourceInfo`] whose `source` is the lossy UTF-8 rendering of `path`.
    fn from(path: PathBuf) -> Self {
        // `to_string_lossy` yields a `Cow<str>`; `into_owned` turns it into a `String`.
        let source = path.to_string_lossy().into_owned();
        Self { source }
    }
}
79
80/// Implements parsing for shell programs.
81pub struct Parser<R: std::io::BufRead> {
82    /// The reader to use for input
83    reader: R,
84    /// Parsing options
85    options: ParserOptions,
86}
87
88#[bon]
89impl<R: std::io::BufRead> Parser<R> {
90    ///
91    /// # Arguments
92    ///
93    /// * `reader` - The reader to use for input.
94    /// * `options` - The options to use when parsing.
95    pub fn new(reader: R, options: &ParserOptions) -> Self {
96        Self {
97            reader,
98            options: options.clone(),
99        }
100    }
101
102    /// Create a new parser instance through a builder
103    #[builder(
104        finish_fn(doc {
105            /// Instantiate a parser with the provided reader as input
106        })
107    )]
108    pub const fn builder(
109        /// The reader to use for input
110        #[builder(finish_fn)]
111        reader: R,
112
113        #[builder(default = true)]
114        /// Whether or not to enable extended globbing (a.k.a. `extglob`).
115        enable_extended_globbing: bool,
116        #[builder(default = false)]
117        /// Whether or not to enable POSIX compliance mode.
118        posix_mode: bool,
119        #[builder(default = false)]
120        /// Whether or not to enable maximal compatibility with the `sh` shell.
121        sh_mode: bool,
122        #[builder(default = true)]
123        /// Whether or not to perform tilde expansion for tildes at the start of words.
124        tilde_expansion_at_word_start: bool,
125        #[builder(default = false)]
126        /// Whether or not to perform tilde expansion for tildes after colons.
127        tilde_expansion_after_colon: bool,
128        #[builder(default)]
129        /// Select the parser internal implementation
130        parser_impl: ParserImpl,
131    ) -> Self {
132        let options = ParserOptions {
133            enable_extended_globbing,
134            posix_mode,
135            sh_mode,
136            tilde_expansion_at_word_start,
137            tilde_expansion_after_colon,
138            parser_impl,
139        };
140        Self { reader, options }
141    }
142
143    /// Parses the input into an abstract syntax tree (AST) of a shell program.
144    pub fn parse_program(&mut self) -> Result<ast::Program, crate::error::ParseError> {
145        //
146        // References:
147        //   * https://www.gnu.org/software/bash/manual/bash.html#Shell-Syntax
148        //   * https://mywiki.wooledge.org/BashParser
149        //   * https://aosabook.org/en/v1/bash.html
150        //   * https://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html
151        //
152        match self.options.parser_impl {
153            ParserImpl::Peg => {
154                let tokens = self.tokenize()?;
155                parse_tokens(&tokens, &self.options)
156            }
157            #[cfg(feature = "winnow-parser")]
158            ParserImpl::Winnow => {
159                // Read entire input to string for winnow_str parser
160                let mut input_str = String::new();
161                std::io::Read::read_to_string(&mut self.reader, &mut input_str).map_err(|e| {
162                    crate::error::ParseError::Tokenizing {
163                        inner: crate::tokenizer::TokenizerError::from(e),
164                        position: None,
165                    }
166                })?;
167
168                winnow_str::parse_program(&input_str, &self.options, &SourceInfo::default())
169                    .map_err(|_e| {
170                        // Convert winnow error to ParseError
171                        // TODO: Extract position information from winnow error
172                        crate::error::ParseError::ParsingAtEndOfInput
173                    })
174            }
175        }
176    }
177
178    /// Parses a function definition body from the input. The body is expected to be
179    /// preceded by "()", but no function name.
180    pub fn parse_function_parens_and_body(
181        &mut self,
182    ) -> Result<ast::FunctionBody, crate::error::ParseError> {
183        let tokens = self.tokenize()?;
184        let parse_result =
185            peg::token_parser::function_parens_and_body(&Tokens { tokens: &tokens }, &self.options);
186        parse_result_to_error(parse_result, &tokens)
187    }
188
189    fn tokenize(&mut self) -> Result<Vec<Token>, crate::error::ParseError> {
190        // First we tokenize the input, according to the policy implied by provided options.
191        let mut tokenizer = Tokenizer::new(&mut self.reader, &self.options.tokenizer_options());
192
193        tracing::debug!(target: "tokenize", "Tokenizing...");
194
195        let mut tokens = vec![];
196        loop {
197            let result = match tokenizer.next_token() {
198                Ok(result) => result,
199                Err(e) => {
200                    return Err(crate::error::ParseError::Tokenizing {
201                        inner: e,
202                        position: tokenizer.current_location(),
203                    });
204                }
205            };
206
207            let reason = result.reason;
208            if let Some(token) = result.token {
209                tracing::debug!(target: "tokenize", "TOKEN {}: {:?} {reason:?}", tokens.len(), token);
210                tokens.push(token);
211            }
212
213            if matches!(reason, TokenEndReason::EndOfInput) {
214                break;
215            }
216        }
217
218        tracing::debug!(target: "tokenize", "  => {} token(s)", tokens.len());
219
220        Ok(tokens)
221    }
222}
223
224/// Parses a sequence of tokens into the abstract syntax tree (AST) of a shell program.
225///
226/// # Arguments
227///
228/// * `tokens` - The tokens to parse.
229/// * `options` - The options to use when parsing.
230pub fn parse_tokens(
231    tokens: &[Token],
232    options: &ParserOptions,
233) -> Result<ast::Program, crate::error::ParseError> {
234    let parse_result = peg::token_parser::program(&Tokens { tokens }, options);
235    parse_result_to_error(parse_result, tokens)
236}
237
238fn parse_result_to_error<R>(
239    parse_result: Result<R, ::peg::error::ParseError<usize>>,
240    tokens: &[Token],
241) -> Result<R, crate::error::ParseError>
242where
243    R: std::fmt::Debug,
244{
245    match parse_result {
246        Ok(program) => {
247            tracing::debug!(target: "parse", "PROG: {:?}", program);
248            Ok(program)
249        }
250        Err(parse_error) => {
251            tracing::debug!(target: "parse", "Parse error: {:?}", parse_error);
252            Err(crate::error::convert_peg_parse_error(&parse_error, tokens))
253        }
254    }
255}
256
257#[cfg(test)]
258mod tests;