// miden_assembly/parser/mod.rs

/// Simple macro used in the grammar definition for constructing [crate::SourceSpan]s.
///
/// The single-location form builds a span at one offset; the two-location form
/// builds a span covering the half-open range `start..end`.
macro_rules! span {
    ($source_id:expr, $at:expr) => {
        crate::SourceSpan::at($source_id, $at)
    };
    ($source_id:expr, $start:expr, $end:expr) => {
        crate::SourceSpan::new($source_id, $start..$end)
    };
}
10
// Include the `grammar` module generated by LALRPOP from the grammar definition.
// Clippy lints are suppressed wholesale since the generated code is not written by hand.
lalrpop_util::lalrpop_mod!(
    #[allow(clippy::all)]
    grammar,
    "/parser/grammar.rs"
);
16
17mod error;
18mod lexer;
19mod scanner;
20mod token;
21
22use alloc::{boxed::Box, collections::BTreeSet, string::ToString, sync::Arc, vec::Vec};
23
24pub use self::{
25    error::{BinErrorKind, HexErrorKind, LiteralErrorKind, ParsingError},
26    lexer::Lexer,
27    scanner::Scanner,
28    token::{BinEncodedValue, DocumentationType, HexEncodedValue, Token},
29};
30use crate::{
31    ast,
32    diagnostics::{Report, SourceFile, SourceSpan, Span, Spanned},
33    sema, LibraryPath, SourceManager,
34};
35
36// TYPE ALIASES
37// ================================================================================================
38
/// Alias for the error type produced by the generated LALRPOP parser: source locations
/// are represented as `u32`, tokens borrow from the source text, and user-defined
/// errors are [ParsingError]s.
type ParseError<'a> = lalrpop_util::ParseError<u32, Token<'a>, ParsingError>;
40
41// MODULE PARSER
42// ================================================================================================
43
/// This is a wrapper around the lower-level parser infrastructure which handles orchestrating all
/// of the pieces needed to parse a [ast::Module] from source, and run semantic analysis on it.
#[derive(Default)]
pub struct ModuleParser {
    /// The kind of module we're parsing.
    ///
    /// This is used when performing semantic analysis to detect when various invalid constructions
    /// are encountered, such as use of the `syscall` instruction in a kernel module.
    kind: ast::ModuleKind,
    /// A set of interned strings allocated during parsing/semantic analysis.
    ///
    /// This is a very primitive and imprecise way of interning strings, but was the least invasive
    /// at the time the new parser was implemented. In essence, we avoid duplicating allocations
    /// for frequently occurring strings, by tracking which strings we've seen before, and
    /// sharing a reference counted pointer instead.
    ///
    /// We may want to replace this eventually with a proper interner, so that we can also gain the
    /// benefits commonly provided by interned string handles (e.g. cheap equality comparisons, no
    /// ref-counting overhead, copyable and of smaller size).
    ///
    /// Note that [Ident], [ProcedureName], [LibraryPath] and others are all implemented in terms
    /// of either the actual reference-counted string, e.g. `Arc<str>`, or in terms of [Ident],
    /// which is essentially the former wrapped in a [SourceSpan]. If we ever replace this with
    /// a better interner, we will also want to update those types to be in terms of whatever
    /// the handle type of the interner is.
    interned: BTreeSet<Arc<str>>,
    /// When true, all warning diagnostics are promoted to error severity.
    warnings_as_errors: bool,
}
73
74impl ModuleParser {
75    /// Construct a new parser for the given `kind` of [ast::Module].
76    pub fn new(kind: ast::ModuleKind) -> Self {
77        Self {
78            kind,
79            interned: Default::default(),
80            warnings_as_errors: false,
81        }
82    }
83
84    /// Configure this parser so that any warning diagnostics are promoted to errors.
85    pub fn set_warnings_as_errors(&mut self, yes: bool) {
86        self.warnings_as_errors = yes;
87    }
88
89    /// Parse a [ast::Module] from `source`, and give it the provided `path`.
90    pub fn parse(
91        &mut self,
92        path: LibraryPath,
93        source: Arc<SourceFile>,
94    ) -> Result<Box<ast::Module>, Report> {
95        let forms = parse_forms_internal(source.clone(), &mut self.interned)
96            .map_err(|err| Report::new(err).with_source_code(source.clone()))?;
97        sema::analyze(source, self.kind, path, forms, self.warnings_as_errors).map_err(Report::new)
98    }
99
100    /// Parse a [ast::Module], `name`, from `path`.
101    #[cfg(feature = "std")]
102    pub fn parse_file<P>(
103        &mut self,
104        name: LibraryPath,
105        path: P,
106        source_manager: &dyn SourceManager,
107    ) -> Result<Box<ast::Module>, Report>
108    where
109        P: AsRef<std::path::Path>,
110    {
111        use vm_core::debuginfo::SourceManagerExt;
112
113        use crate::diagnostics::{IntoDiagnostic, WrapErr};
114
115        let path = path.as_ref();
116        let source_file = source_manager
117            .load_file(path)
118            .into_diagnostic()
119            .wrap_err_with(|| format!("failed to load source file from '{}'", path.display()))?;
120        self.parse(name, source_file)
121    }
122
123    /// Parse a [ast::Module], `name`, from `source`.
124    pub fn parse_str(
125        &mut self,
126        name: LibraryPath,
127        source: impl ToString,
128        source_manager: &dyn SourceManager,
129    ) -> Result<Box<ast::Module>, Report> {
130        use vm_core::debuginfo::SourceContent;
131
132        let path = Arc::from(name.path().into_owned().into_boxed_str());
133        let content = SourceContent::new(Arc::clone(&path), source.to_string().into_boxed_str());
134        let source_file = source_manager.load_from_raw_parts(path, content);
135        self.parse(name, source_file)
136    }
137}
138
/// This is used in tests to parse `source` as a set of raw [ast::Form]s rather than as a
/// [ast::Module].
///
/// NOTE: This does _not_ run semantic analysis.
#[cfg(any(test, feature = "testing"))]
pub fn parse_forms(source: Arc<SourceFile>) -> Result<Vec<ast::Form>, ParsingError> {
    // The interning set is only needed for the duration of this call, so a
    // throwaway set is used and discarded afterwards.
    parse_forms_internal(source, &mut BTreeSet::default())
}
148
149/// Parse `source` as a set of [ast::Form]s
150///
151/// Aside from catching syntax errors, this does little validation of the resulting forms, that is
152/// handled by semantic analysis, which the caller is expected to perform next.
153fn parse_forms_internal(
154    source: Arc<SourceFile>,
155    interned: &mut BTreeSet<Arc<str>>,
156) -> Result<Vec<ast::Form>, ParsingError> {
157    let source_id = source.id();
158    let scanner = Scanner::new(source.as_str());
159    let lexer = Lexer::new(source_id, scanner);
160    grammar::FormsParser::new()
161        .parse(&source, interned, core::marker::PhantomData, lexer)
162        .map_err(|err| ParsingError::from_parse_error(source_id, err))
163}
164
165// DIRECTORY PARSER
166// ================================================================================================
167
/// Read the contents (modules) of this library from `dir`, returning any errors that occur
/// while traversing the file system.
///
/// Errors may also be returned if traversal discovers issues with the modules, such as
/// invalid names, etc.
///
/// Returns an iterator over all parsed modules.
#[cfg(feature = "std")]
pub fn read_modules_from_dir(
    namespace: crate::LibraryNamespace,
    dir: &std::path::Path,
    source_manager: &dyn SourceManager,
) -> Result<impl Iterator<Item = Box<ast::Module>>, Report> {
    use std::collections::{btree_map::Entry, BTreeMap};

    use miette::miette;
    use module_walker::{ModuleEntry, WalkModules};

    use crate::diagnostics::{IntoDiagnostic, WrapErr};

    if !dir.is_dir() {
        return Err(miette!("the provided path '{}' is not a valid directory", dir.display()));
    }

    // mod.masm is not allowed in the root directory
    if dir.join(ast::Module::ROOT_FILENAME).exists() {
        return Err(miette!("{} is not allowed in the root directory", ast::Module::ROOT_FILENAME));
    }

    let mut modules = BTreeMap::default();

    // Reuse a single parser across all modules in the directory so its interned-string
    // set is shared: identifiers occurring in multiple modules then share allocations,
    // instead of being re-interned from scratch per module.
    let mut parser = ModuleParser::new(ast::ModuleKind::Library);

    let walker = WalkModules::new(namespace.clone(), dir)
        .into_diagnostic()
        .wrap_err_with(|| format!("failed to load modules from '{}'", dir.display()))?;
    for entry in walker {
        let ModuleEntry { mut name, source_path } = entry?;
        // A `mod.masm` file is named after its parent directory, so drop the
        // trailing `mod` component from its library path
        if name.last() == ast::Module::ROOT {
            name.pop();
        }

        // Parse module at the given path
        let ast = parser.parse_file(name.clone(), &source_path, source_manager)?;
        match modules.entry(name) {
            Entry::Occupied(ref entry) => {
                // NOTE: `entry.key()` is formatted by reference; cloning it here
                // would be a redundant allocation.
                return Err(miette!("duplicate module '{0}'", entry.key()));
            },
            Entry::Vacant(entry) => {
                entry.insert(ast);
            },
        }
    }

    Ok(modules.into_values())
}
223
#[cfg(feature = "std")]
mod module_walker {
    //! Filesystem traversal which discovers Miden Assembly module files under a root
    //! directory, yielding one [ModuleEntry] per module file found.

    use std::{
        ffi::OsStr,
        fs::{self, DirEntry, FileType},
        io,
        path::{Path, PathBuf},
    };

    use miette::miette;

    use crate::{
        ast::Module,
        diagnostics::{IntoDiagnostic, Report},
        LibraryNamespace, LibraryPath,
    };

    /// A module discovered during traversal: its fully-qualified library path, and the
    /// path of the file it was found in.
    pub struct ModuleEntry {
        pub name: LibraryPath,
        pub source_path: PathBuf,
    }

    /// An iterator over all module files under `root`, with module names qualified
    /// under `namespace`.
    pub struct WalkModules<'a> {
        namespace: LibraryNamespace,
        root: &'a Path,
        // Worklist of directory entries still to be visited; the contents of
        // subdirectories are appended to the back as they are encountered.
        stack: alloc::collections::VecDeque<io::Result<DirEntry>>,
    }

    impl<'a> WalkModules<'a> {
        /// Create a walker over `path`, qualifying discovered module names under `namespace`.
        pub fn new(namespace: LibraryNamespace, path: &'a Path) -> io::Result<Self> {
            use alloc::collections::VecDeque;

            let stack = VecDeque::from_iter(fs::read_dir(path)?);

            Ok(Self { namespace, root: path, stack })
        }

        /// Examine a single directory entry.
        ///
        /// Returns `Ok(Some(_))` when the entry is a module file, `Ok(None)` when it
        /// should be skipped (a directory — whose contents are queued for later
        /// traversal — or a file without the module extension), and `Err` when the
        /// entry is invalid (e.g. non-UTF-8 path components, or a file/directory
        /// name collision).
        fn next_entry(
            &mut self,
            entry: &DirEntry,
            ty: &FileType,
        ) -> Result<Option<ModuleEntry>, Report> {
            if ty.is_dir() {
                // Queue the directory's contents for traversal, and move on
                let dir = entry.path();
                self.stack.extend(fs::read_dir(dir).into_diagnostic()?);
                return Ok(None);
            }

            // Only files with the Miden Assembly file extension are treated as modules
            let mut file_path = entry.path();
            let is_module = file_path
                .extension()
                .map(|ext| ext == AsRef::<OsStr>::as_ref(Module::FILE_EXTENSION))
                .unwrap_or(false);
            if !is_module {
                return Ok(None);
            }

            // Remove the file extension and the root prefix, leaving a namespace-relative path
            file_path.set_extension("");
            if file_path.is_dir() {
                return Err(miette!(
                    "file and directory with same name are not allowed: {}",
                    file_path.display()
                ));
            }
            let relative_path = file_path
                .strip_prefix(self.root)
                .expect("expected path to be a child of the root directory");

            // Construct a [LibraryPath] from the path components, after validating them
            let mut libpath = LibraryPath::from(self.namespace.clone());
            for component in relative_path.iter() {
                // Components must be valid UTF-8 to be usable as module path components
                let component = component.to_str().ok_or_else(|| {
                    let p = entry.path();
                    miette!("{} is an invalid directory entry", p.display())
                })?;
                libpath.push(component).into_diagnostic()?;
            }
            Ok(Some(ModuleEntry { name: libpath, source_path: entry.path() }))
        }
    }

    impl Iterator for WalkModules<'_> {
        type Item = Result<ModuleEntry, Report>;

        fn next(&mut self) -> Option<Self::Item> {
            loop {
                // Pull the next pending entry off the worklist, resolving its file type;
                // `pop_front()?` terminates the iterator once the worklist is empty
                let entry = self
                    .stack
                    .pop_front()?
                    .and_then(|entry| entry.file_type().map(|ft| (entry, ft)))
                    .into_diagnostic();

                match entry {
                    Ok((ref entry, ref file_type)) => {
                        // `next_entry` returning `Ok(None)` means "skip"; transpose
                        // turns that into `None` so we can continue the loop, while
                        // both modules and errors break out and are yielded
                        match self.next_entry(entry, file_type).transpose() {
                            None => continue,
                            result => break result,
                        }
                    },
                    Err(err) => break Some(Err(err)),
                }
            }
        }
    }
}
331
332// TESTS
333// ================================================================================================
334
#[cfg(test)]
mod tests {
    use vm_core::assert_matches;

    use super::*;
    use crate::SourceId;

    /// Asserts that `$lexer` produces exactly the given sequence of tokens next,
    /// in order, each wrapped in `Some(Ok(_))`.
    macro_rules! assert_tokens {
        ($lexer:ident, [$($expected:pat),+ $(,)?]) => {
            $(
                assert_matches!($lexer.next(), Some(Ok($expected)));
            )+
        };
    }

    // This test checks the lexer behavior with regard to tokenizing `exp(.u?[\d]+)?`
    #[test]
    fn lex_exp() {
        let scanner = Scanner::new("begin exp.u9 end");
        let mut lexer =
            Lexer::new(SourceId::default(), scanner).map(|result| result.map(|(_, t, _)| t));
        assert_tokens!(lexer, [Token::Begin, Token::ExpU, Token::Int(9), Token::End]);
    }

    #[test]
    fn lex_block() {
        let scanner = Scanner::new(
            "\
const.ERR1=1

begin
    u32assertw
    u32assertw.err=ERR1
    u32assertw.err=2
end
",
        );
        let mut lexer =
            Lexer::new(SourceId::default(), scanner).map(|result| result.map(|(_, t, _)| t));
        assert_tokens!(
            lexer,
            [
                Token::Const,
                Token::Dot,
                Token::ConstantIdent("ERR1"),
                Token::Equal,
                Token::Int(1),
                Token::Begin,
                Token::U32Assertw,
                Token::U32Assertw,
                Token::Dot,
                Token::Err,
                Token::Equal,
                Token::ConstantIdent("ERR1"),
                Token::U32Assertw,
                Token::Dot,
                Token::Err,
                Token::Equal,
                Token::Int(2),
                Token::End,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn lex_emit() {
        let scanner = Scanner::new(
            "\
begin
    push.1
    emit.1
end
",
        );
        let mut lexer =
            Lexer::new(SourceId::default(), scanner).map(|result| result.map(|(_, t, _)| t));
        assert_tokens!(
            lexer,
            [
                Token::Begin,
                Token::Push,
                Token::Dot,
                Token::Int(1),
                Token::Emit,
                Token::Dot,
                Token::Int(1),
                Token::End,
                Token::Eof,
            ]
        );
    }
}
412}