lexer_rs/lexer/
lexer_of_str.rs

1//a Imports
2use std::marker::PhantomData;
3
4use crate::BoxDynLexerParseFn;
5use crate::{CharStream, Lexer, LexerError, LexerParseResult};
6use crate::{ParserIterator, PosnInCharStream, StreamCharSpan};
7
8//a LexerOfStr
9//tp LexerOfStr
10/// A [Lexer] of a [str], using an arbitrary stream position type,
11/// lexer token, and lexer error.
12///
13/// This provides implementations of [Lexer] and [CharStream].
14///
15/// The [Lexer] implementation means that a [LexerOfStr] has a 'parse'
16/// method that can be invoked to parse a single token at a position
17/// within the [str], and another 'iter' method that can be invoked to
18/// generate an iterator that returns all the tokens in the [str]
19///
20/// If the iterator or parser return an Err, the that error is of the
21/// generic type 'E' supplied to the [LexerOfStr] which must implement
22/// [LexerError] of the generic position 'P' - so a failure to parse a
23/// character in the string can be indicated at a particular location
24/// (byte offset, with line number and column potentially).
25///
26/// The actual parsing of tokens is supported through the [Lexer]
27/// trait for both the 'parser' and 'iter' trait methods using a
28/// &[BoxDynLexerParseFn]. These must be boxed functions with the signature
29/// like:
30///
31/// ```ignore
32///    fn parse(stream: &LexerOfStr<P, T, E>, pos:P, ch:char) ->
33///               LexerParseResult<P, T, E>
34/// ```
35///
36/// where
37///
38/// ```ignore
39///    LexerParseResult<P, T, E> = Result<Option<P, T>, E>
40/// ```
41///
42/// See the [Lexer] trait for more details on these parse functions
43///
44/// The [LexerOfStr] also provides a [CharStream] implementation,
45/// which provides methods that are can be used by the parse functions.
46///
47/// This provides methods to match strings, get
48///
49// Cannot derive either Copy or Clone without that putting the same bound on T and E
50#[derive(Debug)]
51pub struct LexerOfStr<'a, P, T, E>
52where
53    P: PosnInCharStream,
54{
55    text: &'a str,
56    end: usize,
57    _phantom_posn: PhantomData<&'a P>,
58    _phantom_token: PhantomData<&'a T>,
59    _phantom_error: PhantomData<&'a E>,
60}
61
62//ip Copy for LexerOfStr<'a, P, T, E>
63impl<'a, P, T, E> Copy for LexerOfStr<'a, P, T, E> where P: PosnInCharStream {}
64
65//ip Clone for LexerOfStr<'a, P, T, E>
66impl<'a, P, T, E> Clone for LexerOfStr<'a, P, T, E>
67where
68    P: PosnInCharStream,
69{
70    fn clone(&self) -> Self {
71        *self
72    }
73}
74
75//ip LexerOfStr
76impl<'a, P, T, E> LexerOfStr<'a, P, T, E>
77where
78    P: PosnInCharStream,
79{
80    //fp new
81    /// Create a new [LexerOfStr] by borrowing a [str]
82    pub fn new(text: &'a str) -> Self {
83        let end = text.as_bytes().len();
84        Self {
85            text,
86            end,
87            _phantom_posn: PhantomData,
88            _phantom_token: PhantomData,
89            _phantom_error: PhantomData,
90        }
91    }
92
93    //mp peek_at_offset
94    /// Get the utf8 chararacter at the byte offset, or None at the end of a string
95    #[inline(always)]
96    unsafe fn peek_at_offset(&self, byte_ofs: usize) -> Option<char> {
97        if byte_ofs >= self.end {
98            None
99        } else {
100            let text = self.text.get_unchecked(byte_ofs..self.end);
101            text.chars().next()
102        }
103    }
104
105    //mp remaining_text
106    /// Get the remaining text from a position
107    #[inline(always)]
108    fn remaining_text(&self, p: &P) -> &str {
109        // # Safety
110        //
111        // Safe if p is a valid Posn as then it must be a utf8
112        // character boundary
113        unsafe { self.text.get_unchecked(p.byte_ofs()..self.end) }
114    }
115}
116
117//a Impl Lexer, CharStream
118//ip Lexer for LexerOfStr
119impl<'a, P, T, E> Lexer for LexerOfStr<'a, P, T, E>
120where
121    P: PosnInCharStream,
122    T: std::fmt::Debug + Clone,
123    E: LexerError<P>,
124{
125    type Token = T;
126    type Error = E;
127    type State = P;
128
129    //mp parse
130    fn parse<'iter>(
131        &'iter self,
132        state: Self::State,
133        parsers: &[BoxDynLexerParseFn<'iter, Self>],
134    ) -> LexerParseResult<Self::State, Self::Token, Self::Error> {
135        if let Some(ch) = self.peek_at(&state) {
136            for p in parsers {
137                let result = p(self, state, ch)?;
138                if result.is_some() {
139                    return Ok(result);
140                }
141            }
142            return Err(E::failed_to_parse(state, ch));
143        }
144        Ok(None)
145    }
146
147    //mp iter
148    fn iter<'iter>(
149        &'iter self,
150        parsers: &'iter [BoxDynLexerParseFn<'iter, Self>],
151    ) -> Box<dyn Iterator<Item = Result<T, E>> + 'iter> {
152        let state = Default::default();
153        Box::new(ParserIterator::new(self, state, parsers))
154    }
155}
156
157//ip CharStream for LexerOfStr
158impl<'a, P, T, E> CharStream<P> for LexerOfStr<'a, P, T, E>
159where
160    P: PosnInCharStream,
161{
162    //mp range_as_bytes
163    /// Borrow some bytes of the stream from an offset
164    ///
165    /// Return None if the bytes are out of range
166    fn range_as_bytes(&self, ofs: usize, n: usize) -> &[u8] {
167        assert!(ofs + n <= self.end);
168        &self.text.as_bytes()[ofs..ofs + n]
169    }
170
171    //mp get_text_span
172    /// Get the text of a [StreamCharSpan] provided by a parser
173    ///
174    /// # Safety
175    ///
176    /// The [StreamCharSpan] must have been provided by a parser and
177    /// so the byte offsets are indeed utf8 character boundaries
178    fn get_text_span(&self, span: &StreamCharSpan<P>) -> &str {
179        unsafe { self.text.get_unchecked(span.byte_range()) }
180    }
181
182    //mp get_text
183    /// Get the text between two [crate::StreamCharPos] provided by a parser
184    ///
185    /// # Safety
186    ///
187    /// The [crate::StreamCharPos] must have been provided by a parser and
188    /// so the byte offsets are indeed utf8 character boundaries
189    #[inline(always)]
190    fn get_text(&self, start: P, end: P) -> &str {
191        unsafe { self.text.get_unchecked(start.byte_ofs()..end.byte_ofs()) }
192    }
193
194    //mp peek_at
195    /// Get the utf8 chararacter at the byte offset, or None at the end of a string
196    ///
197    /// # Safety
198    ///
199    /// 'state' is maintained as a utf8 character point boundary
200    /// within or at the end of the 'str' borrowed by [Self]
201    #[inline(always)]
202    fn peek_at(&self, state: &P) -> Option<char> {
203        unsafe { self.peek_at_offset(state.byte_ofs()) }
204    }
205
206    //mp matches_bytes
207    /// Match the text at the offset with a str
208    fn matches_bytes(&self, state: &P, s: &[u8]) -> bool {
209        let n = s.len();
210        let byte_ofs = state.byte_ofs();
211        if byte_ofs + n > self.end {
212            false
213        } else {
214            s == self.range_as_bytes(byte_ofs, n)
215        }
216    }
217
218    //mp matches_str
219    /// Match the text at the offset with a str
220    #[inline(always)]
221    fn matches_str(&self, pos: &P, pat: &str) -> bool {
222        self.remaining_text(pos).starts_with(pat)
223    }
224
225    //mp matches - awaiting Pattern stabilization
226    // Match the text at the offset with a str
227    // fn matches<'call, Pat:std::str::pattern::Pattern<'call>>(&self, pos: &P, pat: Pat) -> bool {
228    // self.remaining_text(pos).starts_with(pat)
229    // }
230
231    //cp consumed
232    #[inline(always)]
233    fn consumed(&self, mut state: P, mut n: usize) -> P {
234        for ch in self.remaining_text(&state).chars() {
235            if n == 0 {
236                break;
237            }
238            if ch == '\n' {
239                state = state.advance_line(1)
240            } else {
241                state = state.advance_cols(ch.len_utf8(), 1)
242            }
243            n -= 1;
244        }
245        state
246    }
247
248    //mp do_while
249    fn do_while<F: Fn(usize, char) -> bool>(
250        &self,
251        mut state: P,
252        ch: char,
253        f: &F,
254    ) -> (P, Option<(P, usize)>) {
255        if !f(0, ch) {
256            return (state, None);
257        }
258        let start = state;
259        let mut n = 1;
260        let mut ofs = state.byte_ofs() + ch.len_utf8();
261        // # Safety
262        //
263        // 'ofs' is maintained as a utf8 character point boundary
264        // within or at the end of the 'str' borrowed by [Self]
265        while let Some(ch) = unsafe { self.peek_at_offset(ofs) } {
266            if !f(n, ch) {
267                break;
268            }
269            n += 1;
270            ofs += ch.len_utf8();
271        }
272        // Does not work if newlines are involved
273        state = unsafe { self.consumed_chars(state, ofs - start.byte_ofs(), n) };
274        (state, Some((start, n)))
275    }
276}