1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132
// Copyright (c) 2018 King's College London // created by the Software Development Team <http://soft-dev.org/> // // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or // http://www.apache.org/licenses/LICENSE-2.0>, or the MIT license <LICENSE-MIT or // http://opensource.org/licenses/MIT>, or the UPL-1.0 license <http://opensource.org/licenses/UPL> // at your option. This file may not be copied, modified, or distributed except according to those // terms. use std::{error::Error, fmt, hash::Hash, mem::size_of}; use num_traits::{PrimInt, Unsigned}; use static_assertions::const_assert; /// A Lexing error. #[derive(Copy, Clone, Debug)] pub struct LexError { pub idx: usize } impl Error for LexError {} impl fmt::Display for LexError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "Couldn't lex input at position {}", self.idx) } } /// Roughly speaking, `Lexer` is an iterator which collectively produces `Lexeme`s, as well as /// collecting the newlines encountered so that it can later optionally answer queries of the form /// "what's the line and column number of lexeme L". pub trait Lexer<StorageT: Hash + PrimInt + Unsigned> { /// Return the next `Lexeme` in the input or a `LexError`. Returns `None` if the input has been /// fully lexed (or if an error occurred which prevents further lexing). fn next(&mut self) -> Option<Result<Lexeme<StorageT>, LexError>>; /// Return the user input associated with a lexeme. Panics if the lexeme is invalid (i.e. was /// not produced by next()). fn lexeme_str(&self, lexeme: &Lexeme<StorageT>) -> &str; /// Return the line and column number of the byte at position `off`. Note that the column /// number is a character -- not a byte! -- offset, relative to the beginning of the line. /// Panics if the offset exceeds the known input. fn line_col(&self, off: usize) -> (usize, usize); /// Return the line containing the byte at position `off`. Panics if the offset exceeds the /// known input. fn surrounding_line_str(&self, off: usize) -> &str; /// Return all this lexer's remaining lexemes or a `LexError` if there was a problem when lexing. fn all_lexemes(&mut self) -> Result<Vec<Lexeme<StorageT>>, LexError> { let mut lxs = Vec::new(); let mut n = self.next(); while let Some(r) = n { lxs.push(r?); n = self.next(); } Ok(lxs) } } /// A `Lexeme` represents a segment of the user's input that conforms to a known type. All lexemes /// have a starting position in the user's input: lexemes that result from error recovery, however, /// do not have a length (or, therefore, an end). This allows us to differentiate between lexemes /// that are always of zero length (which are required in some grammars) from lexemes that result /// from error recovery (where an error recovery algorithm can know the type that a lexeme should /// have been, but can't know what its contents should have been). #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] pub struct Lexeme<StorageT> { // The long-term aim is to pack this struct so that len can be longer than u32 while everything // still fitting into 2 64-bit words. start: usize, len: u32, tok_id: StorageT } impl<StorageT: Copy> Lexeme<StorageT> { /// Create a new token with ID `tok_id` and a starting position in the input `start`. If the /// token is the result of user input, then `Some(n)` should be passed to `len`; if the token /// is the result of error recovery, then `None` should be passed to `len`. pub fn new(tok_id: StorageT, start: usize, len: Option<usize>) -> Self { const_assert!(size_of::<usize>() >= size_of::<u32>()); let len = if let Some(l) = len { if l >= u32::max_value() as usize { panic!("Can't currently represent lexeme of length {}.", l); } l as u32 } else { u32::max_value() }; Lexeme { start, len, tok_id } } /// The token ID. pub fn tok_id(&self) -> StorageT { self.tok_id } /// Byte offset of the start of the lexeme pub fn start(&self) -> usize { self.start } /// Byte offset of the end of the lexeme. /// /// Note that if this lexeme was inserted by error recovery, it will end at the same place it /// started (i.e. `self.start() == self.end()). pub fn end(&self) -> usize { if self.len == u32::max_value() { self.start } else { self.start + (self.len as usize) } } /// Length in bytes of the lexeme. Note that if this lexeme was inserted by error recovery, it /// will have a length of 0. pub fn len(&self) -> usize { if self.len == u32::max_value() { 0 } else { const_assert!(size_of::<usize>() >= size_of::<u32>()); self.len as usize } } /// Returns `true` if this lexeme was inserted as the result of error recovery, or `false` /// otherwise. pub fn inserted(&self) -> bool { self.len == u32::max_value() } }