1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
use crate::errors::{GelError, Span};
use crate::parser::lexer::{next_token, TokenKind};
use crate::stream::chunk_reader::ReadChunks;
/// One batch of tokens handed out by `StreamingLexer::next_batch`.
///
/// The batch borrows the lexer's accumulated text, so it must be dropped before the
/// lexer can be asked for the next batch.
pub struct StreamTokenBatch<'a> {
/// Tokens recognised in this round, in source order. May be empty when the reader
/// delivered no usable data yet.
pub tokens: Vec<BorrowedToken>,
/// `true` when this batch carries the EOF token, i.e. it is the last batch.
pub finished: bool,
/// All source text the lexer has consumed so far. Token `start` offsets are absolute,
/// so they index directly into this string.
pub buffer: &'a str,
}
/// A token described by its kind and its location only; the text itself is not stored.
///
/// The text can be recovered by slicing `StreamTokenBatch::buffer` with
/// `start..start + len`.
#[derive(Debug, Clone)]
pub struct BorrowedToken {
/// Kind reported by the lexer for this token.
pub kind: TokenKind,
/// Absolute byte offset of the token, counted from the beginning of the stream.
pub start: usize,
/// Length of the token text in bytes; `0` for the EOF token and for skipped-garbage
/// placeholder tokens.
pub len: usize,
/// The streaming lexer always sets this to the same value as `start`.
pub position: usize,
}
/// Incremental lexer that pulls byte chunks from a `ReadChunks` source and hands out
/// tokens in batches instead of requiring the whole input up front.
pub struct StreamingLexer<R: ReadChunks> {
// Source of raw byte chunks.
reader: R,
pending: String, // text received from the reader but not yet consumed by tokenization
eof: bool, // reader reported end of input (no more chunks)
global_offset: usize, // total number of bytes consumed so far (absolute stream offset)
finished: bool, // EOF token already emitted; further batches yield `None`
full_buffer: String, // all consumed text so far (temporary until true zero-copy path)
}
impl<R: ReadChunks> StreamingLexer<R> {
/// Creates a lexer that will draw its input from `reader`.
///
/// Nothing is read here; the reader is first touched when a batch is requested.
/// All buffers start empty and the stream offset starts at zero.
pub fn new(reader: R) -> Self {
    let pending = String::new();
    let full_buffer = String::new();
    Self {
        reader,
        pending,
        full_buffer,
        global_offset: 0,
        eof: false,
        finished: false,
    }
}
/// Pull the next chunk from the reader and append its text to `pending`.
///
/// Does nothing once the reader has reported end of input. When the reader has no more
/// chunks, `eof` is set.
///
/// A chunk boundary may fall inside a multi-byte UTF-8 sequence. Decoding such a chunk on
/// its own would fail even though the stream as a whole is valid, so when the only problem
/// is a truncated trailing sequence, further chunks are read and decoded together with the
/// bytes already received.
///
/// # Errors
///
/// * `GelError::Io` when the reader fails.
/// * A lex error positioned at `global_offset` when the bytes are not valid UTF-8, or when
///   the input ends in the middle of a multi-byte sequence.
fn fill(&mut self) -> Result<(), GelError> {
    if self.eof {
        return Ok(());
    }
    // Bytes received so far that could not be decoded yet because they end mid-character.
    let mut carry: Vec<u8> = Vec::new();
    // The "truncated sequence" error to report if the input ends before it is completed.
    let mut truncated: Option<std::str::Utf8Error> = None;
    loop {
        let bytes = match self.reader.next_chunk().map_err(GelError::Io)? {
            Some(bytes) => bytes,
            None => {
                self.eof = true;
                return match truncated {
                    Some(e) => Err(GelError::lex(
                        e.to_string(),
                        Span::new(self.global_offset, 0, 0, 0),
                    )),
                    None => Ok(()),
                };
            }
        };
        let chunk: &[u8] = &bytes;
        // Fast path: with nothing carried over, decode the chunk in place without copying.
        let decoded = if carry.is_empty() {
            std::str::from_utf8(chunk)
        } else {
            carry.extend_from_slice(chunk);
            std::str::from_utf8(&carry)
        };
        let err = match decoded {
            Ok(text) => {
                self.pending.push_str(text);
                return Ok(());
            }
            Err(e) => e,
        };
        // `error_len() == Some(_)` means a genuinely invalid byte; `None` means the data
        // merely stops in the middle of a sequence, which the next chunk may complete.
        if err.error_len().is_some() {
            return Err(GelError::lex(
                err.to_string(),
                Span::new(self.global_offset, 0, 0, 0),
            ));
        }
        if carry.is_empty() {
            carry.extend_from_slice(chunk);
        }
        truncated = Some(err);
    }
}
/// Heuristic: does `rest` begin a string or regex literal whose closing delimiter has not
/// arrived yet?
///
/// Returns `true` only when `rest` starts with `/`, `"` or `'` and no later unescaped
/// occurrence of that same character exists. A backslash escapes the character that follows
/// it. Empty input, and input starting with any other character, yields `false`.
fn looks_fragmented(rest: &str) -> bool {
    let mut remaining = rest.chars();
    let opener = match remaining.next() {
        Some(c) if matches!(c, '/' | '"' | '\'') => c,
        _ => return false,
    };
    // Walk the text after the opening delimiter, remembering whether the previous
    // character was an escaping backslash.
    let mut after_backslash = false;
    let closed = remaining.any(|c| {
        if after_backslash {
            after_backslash = false;
            false
        } else if c == '\\' {
            after_backslash = true;
            false
        } else {
            c == opener
        }
    });
    !closed
}
/// Tokenize as much buffered input as possible and return it as one batch.
///
/// Returns `Ok(None)` once the batch containing the EOF token has been handed out.
/// Otherwise returns a batch whose `buffer` is all text consumed so far and whose
/// `finished` flag is `true` exactly when the EOF token is its last entry. A batch may be
/// empty when the reader delivered no new data.
///
/// Comments and whitespace without a line break are consumed but not emitted. Input the
/// lexer cannot recognise is skipped one character at a time: silently before end of input,
/// and as a zero-length `Comment` placeholder token after it.
///
/// # Errors
///
/// * Any error from reading or decoding the next chunk.
/// * A lex error `"Unterminated literal at EOF"` when the input ends inside a string or
///   regex literal.
pub fn next_batch(&mut self) -> Result<Option<StreamTokenBatch<'_>>, GelError> {
    /// Soft cap on tokens per batch; checked after every lexed token.
    const MAX_BATCH_TOKENS: usize = 2048;

    /// Zero-length EOF token located at `offset`.
    fn eof_token_at(offset: usize) -> BorrowedToken {
        BorrowedToken {
            kind: TokenKind::EOF,
            start: offset,
            len: 0,
            position: offset,
        }
    }

    if self.finished {
        return Ok(None);
    }
    if self.pending.is_empty() {
        self.fill()?;
    }
    if self.pending.is_empty() && self.eof {
        // Empty input, or everything was consumed by earlier batches: emit the lone EOF.
        self.finished = true;
        return Ok(Some(StreamTokenBatch {
            tokens: vec![eof_token_at(self.global_offset)],
            finished: true,
            buffer: &self.full_buffer,
        }));
    }

    // Each iteration tokenizes the current `pending` text. When that yields nothing and
    // the reader may still have data, another chunk is pulled and the scan is repeated.
    // This is a loop rather than recursion so that a long run of token-less chunks cannot
    // grow the stack.
    loop {
        let mut batch: Vec<BorrowedToken> = Vec::new();
        let mut local_consumed = 0usize; // bytes of `pending` consumed in this round
        let mut slice = self.pending.as_str();

        while !slice.is_empty() {
            match next_token(slice) {
                Ok((rest, tok)) => {
                    // A token that reaches the very end of the buffered text may be only
                    // the first part of a longer token (identifier, number, operator,
                    // comment) whose tail is in the next chunk. Hold it back until more
                    // input or EOF arrives. Whitespace/newline tokens are exempt: splitting
                    // them is harmless and line-oriented input stays prompt.
                    // NOTE(review): assumes `next_token` matches only the text it is given
                    // and cannot itself signal "need more input" — confirm against the lexer.
                    if rest.is_empty() && !self.eof && tok.kind != TokenKind::Newline {
                        break;
                    }
                    let consumed = slice.len() - rest.len();
                    // Same filtering as the full lexer: drop comments and whitespace that
                    // does not contain a line break.
                    let emit = tok.kind != TokenKind::Comment
                        && (tok.kind != TokenKind::Newline || tok.slice.contains('\n'));
                    if emit {
                        let abs_start = self.global_offset + local_consumed;
                        let len = tok.slice.len();
                        batch.push(BorrowedToken {
                            kind: tok.kind,
                            start: abs_start,
                            len,
                            position: abs_start,
                        });
                    }
                    slice = rest;
                    local_consumed += consumed;
                    if batch.len() >= MAX_BATCH_TOKENS {
                        break;
                    }
                }
                Err(_) => {
                    // No complete token recognised at the start of `slice`.
                    let fragmented = Self::looks_fragmented(slice);
                    if fragmented && self.eof {
                        return Err(GelError::lex(
                            "Unterminated literal at EOF",
                            Span::new(self.global_offset + local_consumed, 0, 0, 0),
                        ));
                    }
                    if fragmented {
                        // The literal's closing delimiter may still arrive; stop here.
                        break;
                    }
                    // Skip one character so the scan always makes progress.
                    let ch_len = slice.chars().next().map(|c| c.len_utf8()).unwrap_or(1);
                    if self.eof {
                        // After EOF the skipped character is recorded as a zero-length
                        // placeholder (comment kind is ignored downstream).
                        // NOTE(review): before EOF nothing is recorded for skipped input;
                        // the asymmetry is kept as in the original behaviour.
                        let abs_start = self.global_offset + local_consumed;
                        batch.push(BorrowedToken {
                            kind: TokenKind::Comment,
                            start: abs_start,
                            len: 0,
                            position: abs_start,
                        });
                    }
                    slice = &slice[ch_len..];
                    local_consumed += ch_len;
                }
            }
        }

        // Move the consumed prefix from `pending` to `full_buffer` without a temporary copy.
        if local_consumed > 0 {
            self.full_buffer.push_str(&self.pending[..local_consumed]);
            self.pending.replace_range(..local_consumed, "");
            self.global_offset += local_consumed;
        }

        if !batch.is_empty() || self.eof {
            // Append the EOF token (once) when all input has been consumed.
            let finished = self.eof && self.pending.is_empty();
            if finished {
                self.finished = true;
                batch.push(eof_token_at(self.global_offset));
            }
            return Ok(Some(StreamTokenBatch {
                tokens: batch,
                finished,
                buffer: &self.full_buffer,
            }));
        }

        // Nothing produced and the reader is not exhausted: pull another chunk.
        let buffered = self.pending.len();
        self.fill()?;
        let no_new_data = !self.eof && self.pending.len() == buffered;
        if self.pending.is_empty() || no_new_data {
            // Hand control back to the caller instead of spinning on an idle reader.
            return Ok(Some(StreamTokenBatch {
                tokens: Vec::new(),
                finished: false,
                buffer: &self.full_buffer,
            }));
        }
    }
}
}