lexer_rs/char_stream.rs
1//a Imports
2use crate::{PosnInCharStream, StreamCharSpan};
3
4//tt CharStream
5/// The [CharStream] trait allows a stream of [char] to provide extraa methods
6///
7/// Requires P : PosnInCharStream
8pub trait CharStream<P>
9where
10 P: PosnInCharStream,
11{
12 /// Steps along the stream starting at the provided state (and
13 /// character) while the provided function returns true; the
14 /// function is provided with the index and character (starting at
15 /// 0 / ch), and it returns true if the token continues, otherwise
16 /// false
17 ///
18 /// If the first invocation of 'f' returns false then the token is
19 /// said to not match, and 'do_while' returns the stream state and Ok(None).
20 ///
21 /// If the first N (more than zero) invocations match then the
22 /// result is the stream state after the matched characters, and
23 /// Some(initial state, N)
24 ///
25 /// This can be used to match whitespace (where N is probably
26 /// discarded), or user 'id' values in a language. The text can be
27 /// retrieved with the 'get_text' method
28 fn do_while<F: Fn(usize, char) -> bool>(
29 &self,
30 state: P,
31 ch: char,
32 f: &F,
33 ) -> (P, Option<(P, usize)>) {
34 if !f(0, ch) {
35 return (state, None);
36 }
37 let start = state;
38 let mut n = 1;
39 let mut state = self.consumed_char(state, ch);
40 // # Safety
41 //
42 // 'ofs' is maintained as a utf8 character point boundary
43 // within or at the end of the 'str' borrowed by [Self]
44 while let Some(ch) = self.peek_at(&state) {
45 if !f(n, ch) {
46 break;
47 }
48 n += 1;
49 state = self.consumed_char(state, ch);
50 }
51 (state, Some((start, n)))
52 }
53
54 /// Steps along the stream starting at the provided state,
55 /// character and accumulator value while the provided function
56 /// returns (true, new accumulator); the function is provided with
57 /// the latest accumulator, index, character (starting at 0 / ch),
58 /// and it returns true and a new accumulator if the token
59 /// continues, otherwise false and the final accumulator value
60 ///
61 /// If the first invocation of 'f' returns false then the token is
62 /// said to not match, and 'fold' returns the stream state and Ok(None).
63 ///
64 /// If the first N (more than zero) invocations match then the
65 /// result is the stream state after the matched characters, and
66 /// Some(initial state, N, final accumulator)
67 ///
68 /// This can be used to accumulate significant state about a token
69 /// as it is parsed, in excess of the simple number of characters.
70 fn fold<T, F: Fn(&Self, T, &P, usize, char) -> (T, Option<P>)>(
71 &self,
72 state: P,
73 ch: char,
74 acc: T,
75 f: &F,
76 ) -> (P, Option<(P, usize, T)>) {
77 let (mut acc, some_posn) = f(self, acc, &state, 0, ch);
78 if some_posn.is_none() {
79 return (state, None);
80 }
81 let start = state;
82 let mut n = 1;
83 let mut state = some_posn.unwrap();
84 while let Some(ch) = self.peek_at(&state) {
85 let (new_acc, more_posn) = f(self, acc, &state, n, ch);
86 acc = new_acc;
87 if more_posn.is_none() {
88 break;
89 }
90 n += 1;
91 state = more_posn.unwrap();
92 }
93 (state, Some((start, n, acc)))
94 }
95
96 /// Retrieve a range of bytes from the stream
97 fn range_as_bytes(&self, ofs: usize, n: usize) -> &[u8];
98
99 /// Return true if the content of the stream at 'state' matches
100 /// the byte slice
101 fn matches_bytes(&self, state: &P, s: &[u8]) -> bool;
102
103 /// Get the text between the start of a span (inclusive) and the
104 /// end of the span (exclusive).
105 fn get_text_span(&self, span: &StreamCharSpan<P>) -> &str
106 where
107 P: PosnInCharStream;
108
109 /// Get the text between the start (inclusive) and the
110 /// end (exclusive).
111 fn get_text(&self, start: P, end: P) -> &str;
112
113 // Return true if the text at 'pos' matches the string
114 //
115 // Waiting for pattern stabiliztion
116 // fn matches<'call, P:std::str::pattern::Pattern<'call>>(&self, pos: &P, pat: P) -> bool;
117
118 /// Match the text at the offset with a str; return true if it matches, else false
119 fn matches_str(&self, pos: &P, pat: &str) -> bool;
120
121 /// Peek at the next character in the stream, returning None if
122 /// the state is the end of the stream
123 fn peek_at(&self, state: &P) -> Option<char>;
124
125 //cp consumed
126 /// Move the stream state forward by the specified number of characters
127 ///
128 /// The characters MUST NOT inclulde newlines
129 fn consumed(&self, state: P, num_chars: usize) -> P;
130
131 //cp consumed_char
132 /// Get a stream state after consuming the specified character at
133 /// its current state
134 fn consumed_char(&self, state: P, ch: char) -> P
135 where
136 P: PosnInCharStream,
137 {
138 if ch == '\n' {
139 state.advance_line(1)
140 } else {
141 state.advance_cols(ch.len_utf8(), 1)
142 }
143 }
144
145 //cp consumed_newline
146 /// Get a stream state after consuming a newline at its current state
147 ///
148 /// # Safety
149 ///
150 /// num_bytes *must* correspond to the number of bytes that the
151 /// newline character consists of, and state *must* point to the
152 /// bytes offset of that character
153 unsafe fn consumed_newline(&self, state: P, num_bytes: usize) -> P
154 where
155 P: PosnInCharStream,
156 {
157 state.advance_line(num_bytes)
158 }
159
160 //cp consumed_ascii_str
161 /// Get the state after consuming a particular ascii string
162 /// without newlines
163 ///
164 /// This is safe as there is no unsafe handling of byte offsets
165 /// within *state*; however, there is no check that the provided
166 /// string is ASCII and that it does not contain newlines. If
167 /// these API rules are broke then the lie and column held by
168 /// *state* may be incorrect (which is not *unsafe*, but
169 /// potentially a bug)
170 fn consumed_ascii_str(&self, state: P, s: &str) -> P
171 where
172 P: PosnInCharStream,
173 {
174 let n = s.len();
175 state.advance_cols(n, n)
176 }
177
178 //cp consumed_chars
179 /// Become the span after consuming a particular string of known character length
180 ///
181 /// # Safety
182 ///
183 /// num_bytes *must* correspond to the number of bytes that
184 /// 'num_chars' indicates start at *state*. If this constraint is
185 /// not met then the byte offset indicated by the returned value
186 /// may not correspond to a UTF8 character boundary within the
187 /// stream.
188 unsafe fn consumed_chars(&self, state: P, num_bytes: usize, num_chars: usize) -> P
189 where
190 P: PosnInCharStream,
191 {
192 state.advance_cols(num_bytes, num_chars)
193 }
194
195 //mp commit_consumed
196 /// Invoked by the Lexer to indicate that the stream has been
197 /// consumed up to a certain point, and that (for parsing) no
198 /// state earlier in the stream will be requested in the future
199 ///
200 /// A truly streaming source can drop earlier data in the stream
201 /// if this fits the application
202 fn commit_consumed(&self, _up_to: &P) {}
203}