lexer_rs/lexer/lexer_of_str.rs
1//a Imports
2use std::marker::PhantomData;
3
4use crate::BoxDynLexerParseFn;
5use crate::{CharStream, Lexer, LexerError, LexerParseResult};
6use crate::{ParserIterator, PosnInCharStream, StreamCharSpan};
7
8//a LexerOfStr
9//tp LexerOfStr
10/// A [Lexer] of a [str], using an arbitrary stream position type,
11/// lexer token, and lexer error.
12///
13/// This provides implementations of [Lexer] and [CharStream].
14///
15/// The [Lexer] implementation means that a [LexerOfStr] has a 'parse'
16/// method that can be invoked to parse a single token at a position
17/// within the [str], and another 'iter' method that can be invoked to
18/// generate an iterator that returns all the tokens in the [str]
19///
20/// If the iterator or parser return an Err, the that error is of the
21/// generic type 'E' supplied to the [LexerOfStr] which must implement
22/// [LexerError] of the generic position 'P' - so a failure to parse a
23/// character in the string can be indicated at a particular location
24/// (byte offset, with line number and column potentially).
25///
26/// The actual parsing of tokens is supported through the [Lexer]
27/// trait for both the 'parser' and 'iter' trait methods using a
28/// &[BoxDynLexerParseFn]. These must be boxed functions with the signature
29/// like:
30///
31/// ```ignore
32/// fn parse(stream: &LexerOfStr<P, T, E>, pos:P, ch:char) ->
33/// LexerParseResult<P, T, E>
34/// ```
35///
36/// where
37///
38/// ```ignore
39/// LexerParseResult<P, T, E> = Result<Option<P, T>, E>
40/// ```
41///
42/// See the [Lexer] trait for more details on these parse functions
43///
44/// The [LexerOfStr] also provides a [CharStream] implementation,
45/// which provides methods that are can be used by the parse functions.
46///
47/// This provides methods to match strings, get
48///
49// Cannot derive either Copy or Clone without that putting the same bound on T and E
50#[derive(Debug)]
51pub struct LexerOfStr<'a, P, T, E>
52where
53 P: PosnInCharStream,
54{
55 text: &'a str,
56 end: usize,
57 _phantom_posn: PhantomData<&'a P>,
58 _phantom_token: PhantomData<&'a T>,
59 _phantom_error: PhantomData<&'a E>,
60}
61
62//ip Copy for LexerOfStr<'a, P, T, E>
63impl<'a, P, T, E> Copy for LexerOfStr<'a, P, T, E> where P: PosnInCharStream {}
64
65//ip Clone for LexerOfStr<'a, P, T, E>
66impl<'a, P, T, E> Clone for LexerOfStr<'a, P, T, E>
67where
68 P: PosnInCharStream,
69{
70 fn clone(&self) -> Self {
71 *self
72 }
73}
74
75//ip LexerOfStr
76impl<'a, P, T, E> LexerOfStr<'a, P, T, E>
77where
78 P: PosnInCharStream,
79{
80 //fp new
81 /// Create a new [LexerOfStr] by borrowing a [str]
82 pub fn new(text: &'a str) -> Self {
83 let end = text.as_bytes().len();
84 Self {
85 text,
86 end,
87 _phantom_posn: PhantomData,
88 _phantom_token: PhantomData,
89 _phantom_error: PhantomData,
90 }
91 }
92
93 //mp peek_at_offset
94 /// Get the utf8 chararacter at the byte offset, or None at the end of a string
95 #[inline(always)]
96 unsafe fn peek_at_offset(&self, byte_ofs: usize) -> Option<char> {
97 if byte_ofs >= self.end {
98 None
99 } else {
100 let text = self.text.get_unchecked(byte_ofs..self.end);
101 text.chars().next()
102 }
103 }
104
105 //mp remaining_text
106 /// Get the remaining text from a position
107 #[inline(always)]
108 fn remaining_text(&self, p: &P) -> &str {
109 // # Safety
110 //
111 // Safe if p is a valid Posn as then it must be a utf8
112 // character boundary
113 unsafe { self.text.get_unchecked(p.byte_ofs()..self.end) }
114 }
115}
116
117//a Impl Lexer, CharStream
118//ip Lexer for LexerOfStr
119impl<'a, P, T, E> Lexer for LexerOfStr<'a, P, T, E>
120where
121 P: PosnInCharStream,
122 T: std::fmt::Debug + Clone,
123 E: LexerError<P>,
124{
125 type Token = T;
126 type Error = E;
127 type State = P;
128
129 //mp parse
130 fn parse<'iter>(
131 &'iter self,
132 state: Self::State,
133 parsers: &[BoxDynLexerParseFn<'iter, Self>],
134 ) -> LexerParseResult<Self::State, Self::Token, Self::Error> {
135 if let Some(ch) = self.peek_at(&state) {
136 for p in parsers {
137 let result = p(self, state, ch)?;
138 if result.is_some() {
139 return Ok(result);
140 }
141 }
142 return Err(E::failed_to_parse(state, ch));
143 }
144 Ok(None)
145 }
146
147 //mp iter
148 fn iter<'iter>(
149 &'iter self,
150 parsers: &'iter [BoxDynLexerParseFn<'iter, Self>],
151 ) -> Box<dyn Iterator<Item = Result<T, E>> + 'iter> {
152 let state = Default::default();
153 Box::new(ParserIterator::new(self, state, parsers))
154 }
155}
156
157//ip CharStream for LexerOfStr
158impl<'a, P, T, E> CharStream<P> for LexerOfStr<'a, P, T, E>
159where
160 P: PosnInCharStream,
161{
162 //mp range_as_bytes
163 /// Borrow some bytes of the stream from an offset
164 ///
165 /// Return None if the bytes are out of range
166 fn range_as_bytes(&self, ofs: usize, n: usize) -> &[u8] {
167 assert!(ofs + n <= self.end);
168 &self.text.as_bytes()[ofs..ofs + n]
169 }
170
171 //mp get_text_span
172 /// Get the text of a [StreamCharSpan] provided by a parser
173 ///
174 /// # Safety
175 ///
176 /// The [StreamCharSpan] must have been provided by a parser and
177 /// so the byte offsets are indeed utf8 character boundaries
178 fn get_text_span(&self, span: &StreamCharSpan<P>) -> &str {
179 unsafe { self.text.get_unchecked(span.byte_range()) }
180 }
181
182 //mp get_text
183 /// Get the text between two [crate::StreamCharPos] provided by a parser
184 ///
185 /// # Safety
186 ///
187 /// The [crate::StreamCharPos] must have been provided by a parser and
188 /// so the byte offsets are indeed utf8 character boundaries
189 #[inline(always)]
190 fn get_text(&self, start: P, end: P) -> &str {
191 unsafe { self.text.get_unchecked(start.byte_ofs()..end.byte_ofs()) }
192 }
193
194 //mp peek_at
195 /// Get the utf8 chararacter at the byte offset, or None at the end of a string
196 ///
197 /// # Safety
198 ///
199 /// 'state' is maintained as a utf8 character point boundary
200 /// within or at the end of the 'str' borrowed by [Self]
201 #[inline(always)]
202 fn peek_at(&self, state: &P) -> Option<char> {
203 unsafe { self.peek_at_offset(state.byte_ofs()) }
204 }
205
206 //mp matches_bytes
207 /// Match the text at the offset with a str
208 fn matches_bytes(&self, state: &P, s: &[u8]) -> bool {
209 let n = s.len();
210 let byte_ofs = state.byte_ofs();
211 if byte_ofs + n > self.end {
212 false
213 } else {
214 s == self.range_as_bytes(byte_ofs, n)
215 }
216 }
217
218 //mp matches_str
219 /// Match the text at the offset with a str
220 #[inline(always)]
221 fn matches_str(&self, pos: &P, pat: &str) -> bool {
222 self.remaining_text(pos).starts_with(pat)
223 }
224
225 //mp matches - awaiting Pattern stabilization
226 // Match the text at the offset with a str
227 // fn matches<'call, Pat:std::str::pattern::Pattern<'call>>(&self, pos: &P, pat: Pat) -> bool {
228 // self.remaining_text(pos).starts_with(pat)
229 // }
230
231 //cp consumed
232 #[inline(always)]
233 fn consumed(&self, mut state: P, mut n: usize) -> P {
234 for ch in self.remaining_text(&state).chars() {
235 if n == 0 {
236 break;
237 }
238 if ch == '\n' {
239 state = state.advance_line(1)
240 } else {
241 state = state.advance_cols(ch.len_utf8(), 1)
242 }
243 n -= 1;
244 }
245 state
246 }
247
248 //mp do_while
249 fn do_while<F: Fn(usize, char) -> bool>(
250 &self,
251 mut state: P,
252 ch: char,
253 f: &F,
254 ) -> (P, Option<(P, usize)>) {
255 if !f(0, ch) {
256 return (state, None);
257 }
258 let start = state;
259 let mut n = 1;
260 let mut ofs = state.byte_ofs() + ch.len_utf8();
261 // # Safety
262 //
263 // 'ofs' is maintained as a utf8 character point boundary
264 // within or at the end of the 'str' borrowed by [Self]
265 while let Some(ch) = unsafe { self.peek_at_offset(ofs) } {
266 if !f(n, ch) {
267 break;
268 }
269 n += 1;
270 ofs += ch.len_utf8();
271 }
272 // Does not work if newlines are involved
273 state = unsafe { self.consumed_chars(state, ofs - start.byte_ofs(), n) };
274 (state, Some((start, n)))
275 }
276}