1use std::borrow::Cow;
2use std::sync::Arc;
3use std::{fs, io, ops::Range, path::Path};
4
5use encoding_rs::{Encoding, GBK, SHIFT_JIS};
6use mel_syntax::{SourceMap, SourceView, TextRange, text_range};
7
8use crate::{
9 DecodeDiagnostic, ParseBudgets, ParseError, SourceEncoding, budget_error,
10 decode::{OffsetMap, decode_source_auto, decode_source_with_encoding},
11 text_len_range,
12};
13
/// Default value for [`LightParseOptions::max_prefix_words`].
const DEFAULT_MAX_PREFIX_WORDS: usize = 64;
/// Default value for [`LightParseOptions::max_prefix_bytes`].
const DEFAULT_MAX_PREFIX_BYTES: usize = 4096;
16
/// Tuning knobs for the light scanner.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct LightParseOptions {
    /// Cap on structured words captured per command (also seeds the
    /// word vector's capacity in `scan_command_statement_tail`).
    pub max_prefix_words: usize,
    /// Byte cap for command-prefix scanning.
    /// NOTE(review): enforcement site is not visible in this view — confirm.
    pub max_prefix_bytes: usize,
    /// Global parse budgets, checked via `max_bytes_error_*` and the
    /// scanner's internal budget tracker.
    pub budgets: ParseBudgets,
}
23
24impl Default for LightParseOptions {
25 fn default() -> Self {
26 Self {
27 max_prefix_words: DEFAULT_MAX_PREFIX_WORDS,
28 max_prefix_bytes: DEFAULT_MAX_PREFIX_BYTES,
29 budgets: ParseBudgets::default(),
30 }
31 }
32}
33
/// The top-level result of a light scan: an ordered list of items.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct LightSourceFile {
    /// Items in source order, as emitted by the scanner.
    pub items: Vec<LightItem>,
}
38
/// One top-level item recognized by the light scanner.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LightItem {
    /// A command statement with a structured word surface.
    Command(LightCommandSurface),
    /// A `proc` definition surface (optionally preceded by `global`).
    Proc(LightProcSurface),
    /// Anything else; only the covered span is recorded.
    Other { span: TextRange },
}
45
/// Shallow surface of a `proc` definition.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LightProcSurface {
    /// Range of the procedure name, when one was found.
    pub name_range: Option<TextRange>,
    /// Whether the definition was introduced with `global proc`.
    pub is_global: bool,
    /// Full span of the definition, including the skipped body.
    pub span: TextRange,
}
52
/// Shallow surface of a command statement.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LightCommandSurface {
    /// Range of the command head word.
    pub head_range: TextRange,
    /// Whether the command appears inside a capture.
    /// NOTE(review): only the `false` construction site is visible here — confirm.
    pub captured: bool,
    /// Structured words following the head, up to the configured prefix cap.
    pub words: Vec<LightWord>,
    /// Remaining argument text that was not structured into words, if any.
    pub opaque_tail: Option<TextRange>,
    /// Full span of the statement.
    pub span: TextRange,
}
61
/// A single command-line word, classified by surface shape.
///
/// Variant meanings follow their names; the construction sites are
/// outside this view. For the first four variants, `text` vs `range`:
/// the exact relation (e.g. whether quotes or sigils are excluded from
/// `text`) is not visible here — confirm at the scanner's emit sites.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LightWord {
    Flag { text: TextRange, range: TextRange },
    NumericLiteral { text: TextRange, range: TextRange },
    BareWord { text: TextRange, range: TextRange },
    QuotedString { text: TextRange, range: TextRange },
    Variable { range: TextRange },
    GroupedExpr { range: TextRange },
    BraceList { range: TextRange },
    VectorLiteral { range: TextRange },
    Capture { range: TextRange },
}
74
75impl LightWord {
76 #[must_use]
77 pub const fn range(&self) -> TextRange {
78 match self {
79 Self::Flag { range, .. }
80 | Self::NumericLiteral { range, .. }
81 | Self::BareWord { range, .. }
82 | Self::QuotedString { range, .. }
83 | Self::Variable { range }
84 | Self::GroupedExpr { range }
85 | Self::BraceList { range }
86 | Self::VectorLiteral { range }
87 | Self::Capture { range } => *range,
88 }
89 }
90}
91
/// Receiver for items as the scanner produces them.
pub trait LightItemSink {
    /// Called once per recognized item, together with the source view
    /// the item's ranges refer to.
    fn on_item(&mut self, source: LightSourceView<'_>, item: LightItem);
}
95
/// Any `FnMut(LightSourceView, LightItem)` closure can act as a sink.
impl<F> LightItemSink for F
where
    F: for<'a> FnMut(LightSourceView<'a>, LightItem),
{
    fn on_item(&mut self, source: LightSourceView<'_>, item: LightItem) {
        self(source, item);
    }
}
104
/// Borrowed source the scanner reads from: either already-decoded text
/// or raw bytes plus the encoding to decode slices with on demand.
#[derive(Clone, Copy)]
pub enum LightSourceView<'a> {
    /// Decoded text with its source map.
    Text(SourceView<'a>),
    /// Undecoded bytes; slices are decoded lazily via `decode_slice*`.
    Bytes {
        bytes: &'a [u8],
        encoding: SourceEncoding,
    },
}
113
114impl<'a> LightSourceView<'a> {
115 #[must_use]
116 pub fn raw_slice(self, range: TextRange) -> &'a [u8] {
117 let start = usize::from(range.start()).min(self.len());
118 let end = usize::from(range.end()).min(self.len()).max(start);
119 match self {
120 Self::Text(source) => &source.text().as_bytes()[start..end],
121 Self::Bytes { bytes, .. } => &bytes[start..end],
122 }
123 }
124
125 #[must_use]
126 pub fn try_ascii_slice(self, range: TextRange) -> Option<&'a str> {
127 std::str::from_utf8(self.raw_slice(range))
128 .ok()
129 .filter(|text| text.is_ascii())
130 }
131
132 #[must_use]
133 pub fn decode_slice_lossy_for_preview(self, range: TextRange) -> Cow<'a, str> {
134 match self {
135 Self::Text(source) => Cow::Borrowed(source.slice(range)),
136 Self::Bytes { bytes, encoding } => {
137 decode_bytes_lossy(slice_range(bytes, range), encoding)
138 }
139 }
140 }
141
142 #[must_use]
143 pub fn decode_slice(self, range: TextRange) -> DecodedLightSlice<'a> {
144 match self {
145 Self::Text(source) => DecodedLightSlice {
146 text: Cow::Borrowed(source.slice(range)),
147 diagnostics: Vec::new(),
148 },
149 Self::Bytes { bytes, encoding } => {
150 decode_bytes_with_diagnostics(slice_range(bytes, range), encoding, range)
151 }
152 }
153 }
154
155 #[must_use]
156 pub fn len(self) -> usize {
157 match self {
158 Self::Text(source) => source.text().len(),
159 Self::Bytes { bytes, .. } => bytes.len(),
160 }
161 }
162
163 #[must_use]
164 pub fn is_empty(self) -> bool {
165 self.len() == 0
166 }
167}
168
/// Result of decoding a slice of a byte-backed view.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DecodedLightSlice<'a> {
    /// Decoded text; borrowed when no transcoding was necessary.
    pub text: Cow<'a, str>,
    /// Non-empty when the slice required replacement decoding.
    pub diagnostics: Vec<DecodeDiagnostic>,
}
174
175fn slice_range(bytes: &[u8], range: TextRange) -> &[u8] {
176 let start = usize::from(range.start()).min(bytes.len());
177 let end = usize::from(range.end()).min(bytes.len()).max(start);
178 &bytes[start..end]
179}
180
181fn decode_bytes_lossy(bytes: &[u8], encoding: SourceEncoding) -> Cow<'_, str> {
182 if matches!(encoding, SourceEncoding::Utf8) {
183 return String::from_utf8_lossy(bytes);
184 }
185 encoding_rs_encoding(encoding).decode(bytes).0
186}
187
188fn decode_bytes_with_diagnostics(
189 bytes: &[u8],
190 encoding: SourceEncoding,
191 range: TextRange,
192) -> DecodedLightSlice<'_> {
193 if matches!(encoding, SourceEncoding::Utf8) {
194 return match std::str::from_utf8(bytes) {
195 Ok(text) => DecodedLightSlice {
196 text: Cow::Borrowed(text),
197 diagnostics: Vec::new(),
198 },
199 Err(_) => DecodedLightSlice {
200 text: String::from_utf8_lossy(bytes),
201 diagnostics: vec![DecodeDiagnostic {
202 message: "source slice is not valid UTF-8; decoded lossily".into(),
203 range,
204 }],
205 },
206 };
207 }
208
209 let (text, _, had_errors) = encoding_rs_encoding(encoding).decode(bytes);
210 DecodedLightSlice {
211 text,
212 diagnostics: had_errors
213 .then(|| DecodeDiagnostic {
214 message: format!(
215 "source slice is not valid {}; decoded with replacement",
216 encoding.label()
217 )
218 .into(),
219 range,
220 })
221 .into_iter()
222 .collect(),
223 }
224}
225
/// Map a `SourceEncoding` onto its `encoding_rs` implementation.
/// CP932 is decoded through encoding_rs' Shift_JIS tables.
fn encoding_rs_encoding(encoding: SourceEncoding) -> &'static Encoding {
    match encoding {
        SourceEncoding::Utf8 => encoding_rs::UTF_8,
        SourceEncoding::Cp932 => SHIFT_JIS,
        SourceEncoding::Gbk => GBK,
    }
}
233
/// Outcome of a sink-driven scan, with owned decoded text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LightScanReport {
    /// Decoded source text. NOTE(review): byte-native scans leave this
    /// empty (see `report_from_summary`).
    pub source_text: String,
    /// Map between source and display coordinates.
    pub source_map: SourceMap,
    /// Encoding the input was decoded from (or assumed to be).
    pub source_encoding: SourceEncoding,
    /// Diagnostics produced while decoding the input.
    pub decode_errors: Vec<DecodeDiagnostic>,
    /// Errors produced while scanning.
    pub errors: Vec<ParseError>,
}
242
/// [`LightScanReport`] variant whose text is cheaply clonable (`Arc<str>`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SharedLightScanReport {
    /// Shared decoded source text. NOTE(review): byte-native scans
    /// leave this empty (see `shared_report_from_summary`).
    pub source_text: Arc<str>,
    /// Map between source and display coordinates.
    pub source_map: SourceMap,
    /// Encoding the input was decoded from (or assumed to be).
    pub source_encoding: SourceEncoding,
    /// Diagnostics produced while decoding the input.
    pub decode_errors: Vec<DecodeDiagnostic>,
    /// Errors produced while scanning.
    pub errors: Vec<ParseError>,
}
251
/// Text-free scan outcome used by the byte-native scanning path.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LightScanSummary {
    /// Encoding the bytes were scanned as.
    pub source_encoding: SourceEncoding,
    /// Decode diagnostics (currently always empty on the byte path).
    pub decode_errors: Vec<DecodeDiagnostic>,
    /// Errors produced while scanning.
    pub errors: Vec<ParseError>,
}
258
259impl LightScanReport {
260 #[must_use]
261 pub fn source_view(&self) -> SourceView<'_> {
262 SourceView::new(&self.source_text, &self.source_map)
263 }
264
265 #[must_use]
266 pub fn source_range(&self, range: TextRange) -> Range<usize> {
267 self.source_view().display_range(range)
268 }
269
270 #[must_use]
271 pub fn source_slice(&self, range: TextRange) -> &str {
272 self.source_view().slice(range)
273 }
274
275 #[must_use]
276 pub fn display_slice(&self, range: TextRange) -> &str {
277 self.source_view().display_slice(range)
278 }
279
280 #[must_use]
281 pub fn string_literal_contents(&self, range: TextRange) -> Option<&str> {
282 self.source_slice(range)
283 .strip_prefix('"')?
284 .strip_suffix('"')
285 }
286}
287
288impl SharedLightScanReport {
289 #[must_use]
290 pub fn source_view(&self) -> SourceView<'_> {
291 SourceView::new(&self.source_text, &self.source_map)
292 }
293
294 #[must_use]
295 pub fn source_range(&self, range: TextRange) -> Range<usize> {
296 self.source_view().display_range(range)
297 }
298
299 #[must_use]
300 pub fn source_slice(&self, range: TextRange) -> &str {
301 self.source_view().slice(range)
302 }
303
304 #[must_use]
305 pub fn display_slice(&self, range: TextRange) -> &str {
306 self.source_view().display_slice(range)
307 }
308
309 #[must_use]
310 pub fn string_literal_contents(&self, range: TextRange) -> Option<&str> {
311 self.source_slice(range)
312 .strip_prefix('"')?
313 .strip_suffix('"')
314 }
315}
316
/// A complete light parse: the collected items plus the scan report fields.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LightParse {
    /// Items collected during the scan.
    pub source: LightSourceFile,
    /// Decoded source text.
    pub source_text: String,
    /// Map between source and display coordinates.
    pub source_map: SourceMap,
    /// Encoding the input was decoded from (or assumed to be).
    pub source_encoding: SourceEncoding,
    /// Diagnostics produced while decoding the input.
    pub decode_errors: Vec<DecodeDiagnostic>,
    /// Errors produced while scanning.
    pub errors: Vec<ParseError>,
}
326
/// [`LightParse`] variant whose text is cheaply clonable (`Arc<str>`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SharedLightParse {
    /// Items collected during the scan.
    pub source: LightSourceFile,
    /// Shared decoded source text.
    pub source_text: Arc<str>,
    /// Map between source and display coordinates.
    pub source_map: SourceMap,
    /// Encoding the input was decoded from (or assumed to be).
    pub source_encoding: SourceEncoding,
    /// Diagnostics produced while decoding the input.
    pub decode_errors: Vec<DecodeDiagnostic>,
    /// Errors produced while scanning.
    pub errors: Vec<ParseError>,
}
336
337impl LightParse {
338 #[must_use]
339 pub fn source_view(&self) -> SourceView<'_> {
340 SourceView::new(&self.source_text, &self.source_map)
341 }
342
343 #[must_use]
344 pub fn source_range(&self, range: TextRange) -> Range<usize> {
345 self.source_view().display_range(range)
346 }
347
348 #[must_use]
349 pub fn source_slice(&self, range: TextRange) -> &str {
350 self.source_view().slice(range)
351 }
352
353 #[must_use]
354 pub fn display_slice(&self, range: TextRange) -> &str {
355 self.source_view().display_slice(range)
356 }
357
358 #[must_use]
359 pub fn string_literal_contents(&self, range: TextRange) -> Option<&str> {
360 self.source_slice(range)
361 .strip_prefix('"')?
362 .strip_suffix('"')
363 }
364}
365
366impl SharedLightParse {
367 #[must_use]
368 pub fn source_view(&self) -> SourceView<'_> {
369 SourceView::new(&self.source_text, &self.source_map)
370 }
371
372 #[must_use]
373 pub fn source_range(&self, range: TextRange) -> Range<usize> {
374 self.source_view().display_range(range)
375 }
376
377 #[must_use]
378 pub fn source_slice(&self, range: TextRange) -> &str {
379 self.source_view().slice(range)
380 }
381
382 #[must_use]
383 pub fn display_slice(&self, range: TextRange) -> &str {
384 self.source_view().display_slice(range)
385 }
386
387 #[must_use]
388 pub fn string_literal_contents(&self, range: TextRange) -> Option<&str> {
389 self.source_slice(range)
390 .strip_prefix('"')?
391 .strip_suffix('"')
392 }
393}
394
395impl From<(LightSourceFile, LightScanReport)> for LightParse {
396 fn from((source, report): (LightSourceFile, LightScanReport)) -> Self {
397 Self {
398 source,
399 source_text: report.source_text,
400 source_map: report.source_map,
401 source_encoding: report.source_encoding,
402 decode_errors: report.decode_errors,
403 errors: report.errors,
404 }
405 }
406}
407
408impl From<(LightSourceFile, SharedLightScanReport)> for SharedLightParse {
409 fn from((source, report): (LightSourceFile, SharedLightScanReport)) -> Self {
410 Self {
411 source,
412 source_text: report.source_text,
413 source_map: report.source_map,
414 source_encoding: report.source_encoding,
415 decode_errors: report.decode_errors,
416 errors: report.errors,
417 }
418 }
419}
420
421impl From<SharedLightScanReport> for LightScanReport {
422 fn from(value: SharedLightScanReport) -> Self {
423 Self {
424 source_text: value.source_text.as_ref().to_owned(),
425 source_map: value.source_map,
426 source_encoding: value.source_encoding,
427 decode_errors: value.decode_errors,
428 errors: value.errors,
429 }
430 }
431}
432
433impl From<SharedLightParse> for LightParse {
434 fn from(value: SharedLightParse) -> Self {
435 Self {
436 source: value.source,
437 source_text: value.source_text.as_ref().to_owned(),
438 source_map: value.source_map,
439 source_encoding: value.source_encoding,
440 decode_errors: value.decode_errors,
441 errors: value.errors,
442 }
443 }
444}
445
446#[must_use]
447pub fn parse_light_source(input: &str) -> LightParse {
448 parse_light_source_with_options(input, LightParseOptions::default())
449}
450
451#[must_use]
452pub fn parse_light_source_with_options(input: &str, options: LightParseOptions) -> LightParse {
453 let mut sink = CollectLightItems::default();
454 let report = scan_light_source_with_options_and_sink(input, options, &mut sink);
455 LightParse::from((sink.finish(), report))
456}
457
458#[must_use]
459pub fn parse_light_shared_source(input: Arc<str>) -> SharedLightParse {
460 parse_light_shared_source_with_options(input, LightParseOptions::default())
461}
462
463#[must_use]
464pub fn parse_light_shared_source_with_options(
465 input: Arc<str>,
466 options: LightParseOptions,
467) -> SharedLightParse {
468 let mut sink = CollectLightItems::default();
469 let report =
470 scan_light_shared_source_with_options_and_sink(Arc::clone(&input), options, &mut sink);
471 SharedLightParse::from((sink.finish(), report))
472}
473
474pub fn scan_light_source_with_sink(input: &str, sink: &mut impl LightItemSink) -> LightScanReport {
475 scan_light_source_with_options_and_sink(input, LightParseOptions::default(), sink)
476}
477
478pub fn scan_light_shared_source_with_sink(
479 input: Arc<str>,
480 sink: &mut impl LightItemSink,
481) -> SharedLightScanReport {
482 scan_light_shared_source_with_options_and_sink(input, LightParseOptions::default(), sink)
483}
484
485pub fn scan_light_shared_source_with_options_and_sink(
486 input: Arc<str>,
487 options: LightParseOptions,
488 sink: &mut impl LightItemSink,
489) -> SharedLightScanReport {
490 let input_len = input.len();
491 if let Some(error) = max_bytes_error_for_text(input_len, options.budgets) {
492 return SharedLightScanReport {
493 source_text: input,
494 source_map: SourceMap::identity(input_len),
495 source_encoding: SourceEncoding::Utf8,
496 decode_errors: Vec::new(),
497 errors: vec![error],
498 };
499 }
500 let source_map = SourceMap::identity(input_len);
501 let source_view = LightSourceView::Text(SourceView::new(&input, &source_map));
502 let mut scanner = LightScanner::new(&input, options);
503 scanner.scan_with_sink(source_view, sink, None);
504 let errors = scanner.errors;
505 SharedLightScanReport {
506 source_text: input,
507 source_map,
508 source_encoding: SourceEncoding::Utf8,
509 decode_errors: Vec::new(),
510 errors,
511 }
512}
513
514pub fn scan_light_source_with_options_and_sink(
515 input: &str,
516 options: LightParseOptions,
517 sink: &mut impl LightItemSink,
518) -> LightScanReport {
519 if let Some(error) = max_bytes_error_for_text(input.len(), options.budgets) {
520 return LightScanReport {
521 source_text: input.to_owned(),
522 source_map: SourceMap::identity(input.len()),
523 source_encoding: SourceEncoding::Utf8,
524 decode_errors: Vec::new(),
525 errors: vec![error],
526 };
527 }
528 let source_map = SourceMap::identity(input.len());
529 let source_view = LightSourceView::Text(SourceView::new(input, &source_map));
530 let mut scanner = LightScanner::new(input, options);
531 scanner.scan_with_sink(source_view, sink, None);
532 LightScanReport {
533 source_text: input.to_owned(),
534 source_map,
535 source_encoding: SourceEncoding::Utf8,
536 decode_errors: Vec::new(),
537 errors: scanner.errors,
538 }
539}
540
541#[must_use]
542pub fn parse_light_bytes(input: &[u8]) -> LightParse {
543 let decoded = decode_source_auto(input);
544 let mut sink = CollectLightItems::default();
545 let report = build_light_scan(decoded, LightParseOptions::default(), &mut sink);
546 LightParse::from((sink.finish(), report))
547}
548
549#[must_use]
550pub fn parse_light_shared_bytes(input: &[u8]) -> SharedLightParse {
551 let decoded = decode_source_auto(input);
552 let mut sink = CollectLightItems::default();
553 let report = build_shared_light_scan(decoded, LightParseOptions::default(), &mut sink);
554 SharedLightParse::from((sink.finish(), report))
555}
556
557#[must_use]
558pub fn parse_light_bytes_with_encoding(input: &[u8], encoding: SourceEncoding) -> LightParse {
559 let decoded = decode_source_with_encoding(input, encoding);
560 let mut sink = CollectLightItems::default();
561 let report = build_light_scan(decoded, LightParseOptions::default(), &mut sink);
562 LightParse::from((sink.finish(), report))
563}
564
565#[must_use]
566pub fn parse_light_shared_bytes_with_encoding(
567 input: &[u8],
568 encoding: SourceEncoding,
569) -> SharedLightParse {
570 let decoded = decode_source_with_encoding(input, encoding);
571 let mut sink = CollectLightItems::default();
572 let report = build_shared_light_scan(decoded, LightParseOptions::default(), &mut sink);
573 SharedLightParse::from((sink.finish(), report))
574}
575
576pub fn scan_light_bytes_with_sink(input: &[u8], sink: &mut impl LightItemSink) -> LightScanReport {
577 scan_light_bytes_with_options_and_sink(input, LightParseOptions::default(), sink)
578}
579
580pub fn scan_light_shared_bytes_with_options_and_sink(
581 input: &[u8],
582 options: LightParseOptions,
583 sink: &mut impl LightItemSink,
584) -> SharedLightScanReport {
585 if let Some(error) = max_bytes_error_for_bytes(input.len(), options.budgets) {
586 return empty_shared_light_scan_report(error);
587 }
588 let summary =
589 scan_light_bytes_native(input, detect_light_source_encoding(input), options, sink);
590 shared_report_from_summary(summary, input.len())
591}
592
593pub fn scan_light_bytes_with_options_and_sink(
594 input: &[u8],
595 options: LightParseOptions,
596 sink: &mut impl LightItemSink,
597) -> LightScanReport {
598 if let Some(error) = max_bytes_error_for_bytes(input.len(), options.budgets) {
599 return empty_light_scan_report(error);
600 }
601 let summary =
602 scan_light_bytes_native(input, detect_light_source_encoding(input), options, sink);
603 report_from_summary(summary, input.len())
604}
605
/// Scan bytes into `sink`, then invoke `then` with the sink, a view over
/// the scanned bytes, and the scan summary — all in one borrow scope.
///
/// This avoids materializing a report when the caller only needs to
/// post-process items against the raw bytes.
pub fn scan_light_bytes_with_options_and_sink_and_then<S, T>(
    input: &[u8],
    options: LightParseOptions,
    sink: &mut S,
    then: impl for<'a> FnOnce(&mut S, LightSourceView<'a>, LightScanSummary) -> T,
) -> T
where
    S: LightItemSink,
{
    if let Some(error) = max_bytes_error_for_bytes(input.len(), options.budgets) {
        // Over budget: hand the continuation an empty text view and the
        // budget error; nothing is scanned.
        let source_map = SourceMap::identity(0);
        return then(
            sink,
            LightSourceView::Text(SourceView::new("", &source_map)),
            LightScanSummary {
                source_encoding: SourceEncoding::Utf8,
                decode_errors: Vec::new(),
                errors: vec![error],
            },
        );
    }
    let encoding = detect_light_source_encoding(input);
    let summary = scan_light_bytes_native(input, encoding, options, sink);
    then(
        sink,
        LightSourceView::Bytes {
            bytes: input,
            encoding,
        },
        summary,
    )
}
638
639pub fn scan_light_bytes_with_encoding_and_sink(
640 input: &[u8],
641 encoding: SourceEncoding,
642 sink: &mut impl LightItemSink,
643) -> LightScanReport {
644 scan_light_bytes_with_encoding_and_options_and_sink(
645 input,
646 encoding,
647 LightParseOptions::default(),
648 sink,
649 )
650}
651
652pub fn scan_light_shared_bytes_with_encoding_and_options_and_sink(
653 input: &[u8],
654 encoding: SourceEncoding,
655 options: LightParseOptions,
656 sink: &mut impl LightItemSink,
657) -> SharedLightScanReport {
658 if let Some(error) = max_bytes_error_for_bytes(input.len(), options.budgets) {
659 return empty_shared_light_scan_report(error);
660 }
661 let summary = scan_light_bytes_native(input, encoding, options, sink);
662 shared_report_from_summary(summary, input.len())
663}
664
665pub fn scan_light_bytes_with_encoding_and_options_and_sink(
666 input: &[u8],
667 encoding: SourceEncoding,
668 options: LightParseOptions,
669 sink: &mut impl LightItemSink,
670) -> LightScanReport {
671 if let Some(error) = max_bytes_error_for_bytes(input.len(), options.budgets) {
672 return empty_light_scan_report(error);
673 }
674 let summary = scan_light_bytes_native(input, encoding, options, sink);
675 report_from_summary(summary, input.len())
676}
677
678pub fn parse_light_file(path: impl AsRef<Path>) -> io::Result<LightParse> {
679 parse_light_file_with_options(path, LightParseOptions::default())
680}
681
682pub fn parse_light_file_with_options(
683 path: impl AsRef<Path>,
684 options: LightParseOptions,
685) -> io::Result<LightParse> {
686 if let Some(error) = max_bytes_error_for_file(path.as_ref(), options.budgets)? {
687 return Ok(LightParse::from((
688 LightSourceFile::default(),
689 empty_light_scan_report(error),
690 )));
691 }
692 let bytes = fs::read(path)?;
693 let decoded = decode_source_auto(&bytes);
694 let mut sink = CollectLightItems::default();
695 let report = build_light_scan(decoded, options, &mut sink);
696 Ok(LightParse::from((sink.finish(), report)))
697}
698
699pub fn parse_light_shared_file(path: impl AsRef<Path>) -> io::Result<SharedLightParse> {
700 if let Some(error) =
701 max_bytes_error_for_file(path.as_ref(), LightParseOptions::default().budgets)?
702 {
703 return Ok(SharedLightParse::from((
704 LightSourceFile::default(),
705 empty_shared_light_scan_report(error),
706 )));
707 }
708 let bytes = fs::read(path)?;
709 Ok(parse_light_shared_bytes(&bytes))
710}
711
712pub fn parse_light_file_with_encoding(
713 path: impl AsRef<Path>,
714 encoding: SourceEncoding,
715) -> io::Result<LightParse> {
716 parse_light_file_with_encoding_and_options(path, encoding, LightParseOptions::default())
717}
718
719pub fn parse_light_file_with_encoding_and_options(
720 path: impl AsRef<Path>,
721 encoding: SourceEncoding,
722 options: LightParseOptions,
723) -> io::Result<LightParse> {
724 if let Some(error) = max_bytes_error_for_file(path.as_ref(), options.budgets)? {
725 return Ok(LightParse::from((
726 LightSourceFile::default(),
727 empty_light_scan_report(error),
728 )));
729 }
730 let bytes = fs::read(path)?;
731 let decoded = decode_source_with_encoding(&bytes, encoding);
732 let mut sink = CollectLightItems::default();
733 let report = build_light_scan(decoded, options, &mut sink);
734 Ok(LightParse::from((sink.finish(), report)))
735}
736
737pub fn parse_light_shared_file_with_encoding(
738 path: impl AsRef<Path>,
739 encoding: SourceEncoding,
740) -> io::Result<SharedLightParse> {
741 if let Some(error) =
742 max_bytes_error_for_file(path.as_ref(), LightParseOptions::default().budgets)?
743 {
744 return Ok(SharedLightParse::from((
745 LightSourceFile::default(),
746 empty_shared_light_scan_report(error),
747 )));
748 }
749 let bytes = fs::read(path)?;
750 Ok(parse_light_shared_bytes_with_encoding(&bytes, encoding))
751}
752
753pub fn scan_light_file_with_sink(
754 path: impl AsRef<Path>,
755 sink: &mut impl LightItemSink,
756) -> io::Result<LightScanReport> {
757 scan_light_file_with_options_and_sink(path, LightParseOptions::default(), sink)
758}
759
760pub fn scan_light_shared_file_with_options_and_sink(
761 path: impl AsRef<Path>,
762 options: LightParseOptions,
763 sink: &mut impl LightItemSink,
764) -> io::Result<SharedLightScanReport> {
765 if let Some(error) = max_bytes_error_for_file(path.as_ref(), options.budgets)? {
766 return Ok(empty_shared_light_scan_report(error));
767 }
768 let bytes = fs::read(path)?;
769 Ok(scan_light_shared_bytes_with_options_and_sink(
770 &bytes, options, sink,
771 ))
772}
773
774pub fn scan_light_file_with_options_and_sink(
775 path: impl AsRef<Path>,
776 options: LightParseOptions,
777 sink: &mut impl LightItemSink,
778) -> io::Result<LightScanReport> {
779 if let Some(error) = max_bytes_error_for_file(path.as_ref(), options.budgets)? {
780 return Ok(empty_light_scan_report(error));
781 }
782 let bytes = fs::read(path)?;
783 Ok(scan_light_bytes_with_options_and_sink(
784 &bytes, options, sink,
785 ))
786}
787
788pub fn scan_light_file_with_encoding_and_sink(
789 path: impl AsRef<Path>,
790 encoding: SourceEncoding,
791 sink: &mut impl LightItemSink,
792) -> io::Result<LightScanReport> {
793 scan_light_file_with_encoding_and_options_and_sink(
794 path,
795 encoding,
796 LightParseOptions::default(),
797 sink,
798 )
799}
800
801pub fn scan_light_shared_file_with_encoding_and_options_and_sink(
802 path: impl AsRef<Path>,
803 encoding: SourceEncoding,
804 options: LightParseOptions,
805 sink: &mut impl LightItemSink,
806) -> io::Result<SharedLightScanReport> {
807 if let Some(error) = max_bytes_error_for_file(path.as_ref(), options.budgets)? {
808 return Ok(empty_shared_light_scan_report(error));
809 }
810 let bytes = fs::read(path)?;
811 Ok(scan_light_shared_bytes_with_encoding_and_options_and_sink(
812 &bytes, encoding, options, sink,
813 ))
814}
815
816pub fn scan_light_file_with_encoding_and_options_and_sink(
817 path: impl AsRef<Path>,
818 encoding: SourceEncoding,
819 options: LightParseOptions,
820 sink: &mut impl LightItemSink,
821) -> io::Result<LightScanReport> {
822 if let Some(error) = max_bytes_error_for_file(path.as_ref(), options.budgets)? {
823 return Ok(empty_light_scan_report(error));
824 }
825 let bytes = fs::read(path)?;
826 Ok(scan_light_bytes_with_encoding_and_options_and_sink(
827 &bytes, encoding, options, sink,
828 ))
829}
830
/// Scan decoded source and assemble an owned-text report.
///
/// Item ranges and error ranges are remapped through the decoder's
/// offset map so they refer to the original (pre-decode) input.
fn build_light_scan(
    decoded: crate::decode::DecodedSource<'_>,
    options: LightParseOptions,
    sink: &mut impl LightItemSink,
) -> LightScanReport {
    let source_text = decoded.text.into_owned();
    let source_map = decoded.offset_map.source_map();
    let source_view = LightSourceView::Text(SourceView::new(&source_text, &source_map));
    let mut scanner = LightScanner::new(&source_text, options);
    // Items are remapped inside the scan; errors are remapped below.
    scanner.scan_with_sink(source_view, sink, Some(&decoded.offset_map));
    let errors = scanner
        .errors
        .into_iter()
        .map(|mut error| {
            error.range = decoded.offset_map.map_range(error.range);
            error
        })
        .collect();
    LightScanReport {
        source_text,
        source_map,
        source_encoding: decoded.encoding,
        decode_errors: decoded.diagnostics,
        errors,
    }
}
857
/// Scan decoded source and assemble a shared-text (`Arc<str>`) report.
///
/// Mirrors `build_light_scan`: item and error ranges are remapped
/// through the decoder's offset map.
fn build_shared_light_scan(
    decoded: crate::decode::DecodedSource<'_>,
    options: LightParseOptions,
    sink: &mut impl LightItemSink,
) -> SharedLightScanReport {
    let source_text: Arc<str> = Arc::from(decoded.text.into_owned());
    let source_map = decoded.offset_map.source_map();
    let source_view = LightSourceView::Text(SourceView::new(&source_text, &source_map));
    let mut scanner = LightScanner::new(&source_text, options);
    // Items are remapped inside the scan; errors are remapped below.
    scanner.scan_with_sink(source_view, sink, Some(&decoded.offset_map));
    let errors = scanner
        .errors
        .into_iter()
        .map(|mut error| {
            error.range = decoded.offset_map.map_range(error.range);
            error
        })
        .collect();
    SharedLightScanReport {
        source_text,
        source_map,
        source_encoding: decoded.encoding,
        decode_errors: decoded.diagnostics,
        errors,
    }
}
884
885fn scan_light_bytes_native(
886 input: &[u8],
887 encoding: SourceEncoding,
888 options: LightParseOptions,
889 sink: &mut impl LightItemSink,
890) -> LightScanSummary {
891 let mut scanner = ByteLightScanner::new(input, encoding, options);
892 scanner.scan_with_sink(sink);
893 LightScanSummary {
894 source_encoding: encoding,
895 decode_errors: Vec::new(),
896 errors: scanner.errors,
897 }
898}
899
/// Build a [`LightScanReport`] from a byte-native scan summary.
///
/// NOTE(review): `source_text` is left empty — the byte path never
/// decodes the input — while `source_map` spans `source_len`. Text
/// accessors on the resulting report therefore operate over an empty
/// string; confirm callers expect this.
fn report_from_summary(summary: LightScanSummary, source_len: usize) -> LightScanReport {
    LightScanReport {
        source_text: String::new(),
        source_map: SourceMap::identity(source_len),
        source_encoding: summary.source_encoding,
        decode_errors: summary.decode_errors,
        errors: summary.errors,
    }
}
909
/// Build a [`SharedLightScanReport`] from a byte-native scan summary.
///
/// NOTE(review): like `report_from_summary`, the text is empty while
/// `source_map` spans `source_len` — confirm callers expect this.
fn shared_report_from_summary(
    summary: LightScanSummary,
    source_len: usize,
) -> SharedLightScanReport {
    SharedLightScanReport {
        source_text: Arc::from(""),
        source_map: SourceMap::identity(source_len),
        source_encoding: summary.source_encoding,
        decode_errors: summary.decode_errors,
        errors: summary.errors,
    }
}
922
923fn detect_light_source_encoding(input: &[u8]) -> SourceEncoding {
924 if ascii_header_declares_codeset_932(input) {
925 return SourceEncoding::Cp932;
926 }
927 if std::str::from_utf8(input).is_ok() {
928 SourceEncoding::Utf8
929 } else {
930 SourceEncoding::Cp932
931 }
932}
933
/// Whether the first 4 KiB contain `Codeset: 932` (ASCII case-insensitive).
///
/// A declaration that straddles the 4 KiB boundary is not detected,
/// matching the original windowed search over the truncated header.
fn ascii_header_declares_codeset_932(input: &[u8]) -> bool {
    const NEEDLE: &[u8] = b"Codeset: 932";
    let header = &input[..input.len().min(4096)];
    header
        .windows(NEEDLE.len())
        .any(|candidate| candidate.eq_ignore_ascii_case(NEEDLE))
}
940
/// Sink that simply accumulates scanned items into a `Vec`.
#[derive(Default)]
struct CollectLightItems {
    // Items in the order the scanner emitted them.
    items: Vec<LightItem>,
}
945
impl CollectLightItems {
    /// Consume the collector, yielding the finished source file.
    fn finish(self) -> LightSourceFile {
        LightSourceFile { items: self.items }
    }
}
951
impl LightItemSink for CollectLightItems {
    fn on_item(&mut self, _: LightSourceView<'_>, item: LightItem) {
        // The source view is not needed here; only the item is kept.
        self.items.push(item);
    }
}
957
958fn remap_light_item(item: &mut LightItem, map: &OffsetMap) {
959 match item {
960 LightItem::Command(command) => {
961 command.head_range = map.map_range(command.head_range);
962 if let Some(opaque_tail) = &mut command.opaque_tail {
963 *opaque_tail = map.map_range(*opaque_tail);
964 }
965 for word in &mut command.words {
966 match word {
967 LightWord::Flag { text, range }
968 | LightWord::NumericLiteral { text, range }
969 | LightWord::BareWord { text, range }
970 | LightWord::QuotedString { text, range } => {
971 *text = map.map_range(*text);
972 *range = map.map_range(*range);
973 }
974 LightWord::Variable { range }
975 | LightWord::GroupedExpr { range }
976 | LightWord::BraceList { range }
977 | LightWord::VectorLiteral { range }
978 | LightWord::Capture { range } => {
979 *range = map.map_range(*range);
980 }
981 }
982 }
983 command.span = map.map_range(command.span);
984 }
985 LightItem::Proc(proc_def) => {
986 if let Some(name_range) = &mut proc_def.name_range {
987 *name_range = map.map_range(*name_range);
988 }
989 proc_def.span = map.map_range(proc_def.span);
990 }
991 LightItem::Other { span } => *span = map.map_range(*span),
992 }
993}
994
/// Cursor-based scanner over decoded text.
struct LightScanner<'a> {
    // The full text being scanned.
    text: &'a str,
    // Options captured at construction.
    options: LightParseOptions,
    // Errors accumulated while scanning; callers may remap their ranges.
    errors: Vec<ParseError>,
    // Presumably dedupes the unterminated-block-comment error; the
    // set/read sites are outside this view — confirm.
    reported_unterminated_block_comment: bool,
    // Presumably dedupes the budget-exhaustion error; set/read sites
    // are outside this view — confirm.
    reported_budget_error: bool,
    // Tracks scan budgets (see `record_statement` / `record_token` usage).
    budget: LightBudgetTracker,
}
1003
1004impl<'a> LightScanner<'a> {
    /// Create a scanner over `text` with a fresh budget tracker.
    fn new(text: &'a str, options: LightParseOptions) -> Self {
        Self {
            text,
            options,
            errors: Vec::new(),
            reported_unterminated_block_comment: false,
            reported_budget_error: false,
            budget: LightBudgetTracker::new(options.budgets),
        }
    }
1015
    /// Drive the scan, emitting each recognized item to `sink`.
    ///
    /// When `remap` is given, item ranges are translated through the
    /// offset map just before emission.
    fn scan_with_sink(
        &mut self,
        source: LightSourceView<'_>,
        sink: &mut impl LightItemSink,
        remap: Option<&OffsetMap>,
    ) {
        // Skip whitespace/comments before the first item.
        let mut cursor = self.skip_trivia(0);

        while cursor < self.text.len() && !self.is_halted() {
            // `proc` heads get a dedicated scan; everything else is a
            // statement.
            let (mut item, next_cursor) = if self.is_proc_start(cursor) {
                self.scan_proc_item(cursor)
            } else {
                self.scan_statement_item(cursor)
            };
            // The item scan itself may have halted the scanner.
            if self.is_halted() {
                break;
            }
            // Statement budget: stop emitting once it is exhausted.
            if !self.record_statement(start_range(&item)) {
                break;
            }
            if let Some(map) = remap {
                remap_light_item(&mut item, map);
            }
            sink.on_item(source, item);
            cursor = self.skip_trivia(next_cursor);
        }
    }
1043
    /// Scan a `proc` definition starting at `start`.
    ///
    /// Accepts an optional leading `global`, then `proc`, then the name
    /// (possibly preceded by one qualifier word); the body is skipped
    /// up to its matching end without structuring its contents.
    fn scan_proc_item(&mut self, start: usize) -> (LightItem, usize) {
        let mut cursor = start;
        let mut is_global = false;
        if let Some(after_global) = self.consume_keyword(cursor, "global") {
            is_global = true;
            cursor = self.skip_trivia(after_global);
        }
        // `proc` is expected here since `is_proc_start` matched; fall
        // back to the current cursor if it is absent.
        let after_proc = self.consume_keyword(cursor, "proc").unwrap_or(cursor);
        cursor = self.skip_trivia(after_proc);

        let first_word = self.scan_simple_word(cursor);
        let mut name_range = None;
        let mut body_scan_start = cursor;
        if let Some((first_start, first_end)) = first_word {
            let after_first = self.skip_trivia(first_end);
            body_scan_start = after_first;
            if self.peek_byte(after_first) == Some(b'(') {
                // `proc name(` — the first word is the name.
                name_range = Some(text_range(first_start as u32, first_end as u32));
            } else if let Some((name_start, name_end)) = self.scan_simple_word(after_first) {
                // Two words — the first is a qualifier (e.g. a return
                // type), the second is the name.
                name_range = Some(text_range(name_start as u32, name_end as u32));
                body_scan_start = self.skip_trivia(name_end);
            }
        }

        let end = self.scan_until_matching_body_end(start, body_scan_start);
        (
            LightItem::Proc(LightProcSurface {
                name_range,
                is_global,
                span: text_range(start as u32, end as u32),
            }),
            end,
        )
    }
1078
    /// Scans one statement starting at `start`, classifying it as either a
    /// command surface (bare head word followed by argument words) or an
    /// opaque `Other` span.
    fn scan_statement_item(&mut self, start: usize) -> (LightItem, usize) {
        // No head word at all: consume the whole statement opaquely.
        let Some((head_start, head_end)) = self.scan_simple_word(start) else {
            let end = self.scan_statement_tail(start);
            return (
                LightItem::Other {
                    span: text_range(start as u32, end as u32),
                },
                end,
            );
        };
        let head_range = text_range(head_start as u32, head_end as u32);
        let head_is_non_command = is_non_command_head(&self.text[head_start..head_end]);
        let after_head = self.skip_trivia(head_end);
        // `head(...)` looks like a call/expression rather than a command, as
        // do heads on the non-command list; both are treated as opaque.
        if self.peek_byte(after_head) == Some(b'(') || head_is_non_command {
            let end = self.scan_statement_tail(after_head);
            return (
                LightItem::Other {
                    span: text_range(start as u32, end as u32),
                },
                end,
            );
        }

        let (end, words, opaque_tail) =
            self.scan_command_statement_tail(start, head_end, after_head);

        (
            LightItem::Command(LightCommandSurface {
                head_range,
                captured: false,
                words,
                opaque_tail,
                span: text_range(start as u32, end as u32),
            }),
            end,
        )
    }
1116
    /// Collects argument words after a command head until the statement ends.
    /// When the word/byte prefix limits are reached, or a word fails to scan,
    /// the remainder of the statement is skipped and returned as an
    /// `opaque_tail` range instead of individual words.
    fn scan_command_statement_tail(
        &mut self,
        start: usize,
        head_end: usize,
        after_head: usize,
    ) -> (usize, Vec<LightWord>, Option<TextRange>) {
        let mut words = Vec::with_capacity(self.options.max_prefix_words.min(8));
        let mut cursor = after_head;
        loop {
            cursor = self.skip_trivia(cursor);
            if cursor >= self.text.len() {
                return (self.text.len(), words, None);
            }
            // `;` terminates the command; a budget failure here is ignored
            // because the statement is already complete.
            if self.byte_at(cursor) == Some(b';') {
                let _ = self.record_token(cursor, cursor + 1);
                return (cursor + 1, words, None);
            }

            let consumed_bytes = cursor.saturating_sub(head_end);
            if words.len() >= self.options.max_prefix_words
                || consumed_bytes >= self.options.max_prefix_bytes
            {
                // Prefix limits hit: skip to the statement end and expose the
                // unparsed remainder as an opaque tail.
                let end = self.scan_statement_tail(cursor);
                let body_end = self.statement_body_end(start, end);
                let opaque_tail =
                    (cursor < body_end).then(|| text_range(cursor as u32, body_end as u32));
                return (end, words, opaque_tail);
            }

            let Some((word, next_cursor)) = self.scan_word(cursor, self.text.len()) else {
                if self.is_halted() {
                    return (self.text.len(), words, None);
                }
                // Unscannable word: same opaque-tail fallback as above.
                let end = self.scan_statement_tail(cursor);
                let body_end = self.statement_body_end(start, end);
                let opaque_tail =
                    (cursor < body_end).then(|| text_range(cursor as u32, body_end as u32));
                return (end, words, opaque_tail);
            };
            words.push(word);
            cursor = next_cursor;
        }
    }
1160
    /// Skips to the end of the current statement: the first `;` that is not
    /// inside parentheses/brackets/braces, a string, a backquote capture, or a
    /// comment — or end of input. Returns the offset just past that point.
    /// Budget failures make it bail out at end of text.
    fn scan_statement_tail(&mut self, start: usize) -> usize {
        let mut cursor = start;
        let mut paren_depth = 0usize;
        let mut bracket_depth = 0usize;
        let mut brace_depth = 0usize;
        let mut in_string = false;
        let mut in_backquote = false;

        while cursor < self.text.len() && !self.is_halted() {
            // String/backquote bodies are consumed without per-char tokens;
            // only the opening delimiter was recorded below.
            if in_string {
                cursor = self.advance_string_body(cursor);
                in_string = false;
                continue;
            }
            if in_backquote {
                cursor = self.advance_backquote_body(cursor);
                in_backquote = false;
                continue;
            }
            if self.starts_with(cursor, "//") {
                cursor = self.skip_line_comment(cursor);
                continue;
            }
            if self.starts_with(cursor, "/*") {
                cursor = self.skip_block_comment(cursor);
                continue;
            }

            match self.byte_at(cursor) {
                Some(b'"') => {
                    if !self.record_token(cursor, cursor + 1) {
                        return self.text.len();
                    }
                    in_string = true;
                    cursor += 1;
                }
                Some(b'`') => {
                    if !self.record_token(cursor, cursor + 1) {
                        return self.text.len();
                    }
                    in_backquote = true;
                    cursor += 1;
                }
                Some(b'(') => {
                    if !self.record_token(cursor, cursor + 1)
                        || !self.enter_nesting(cursor, cursor + 1)
                    {
                        return self.text.len();
                    }
                    paren_depth += 1;
                    cursor += 1;
                }
                // Closers: unbalanced ones are tolerated — depth never
                // underflows and the nesting budget is only released when a
                // matching opener was seen.
                Some(b')') => {
                    if !self.record_token(cursor, cursor + 1) {
                        return self.text.len();
                    }
                    if paren_depth > 0 {
                        self.exit_nesting();
                    }
                    paren_depth = paren_depth.saturating_sub(1);
                    cursor += 1;
                }
                Some(b'[') => {
                    if !self.record_token(cursor, cursor + 1)
                        || !self.enter_nesting(cursor, cursor + 1)
                    {
                        return self.text.len();
                    }
                    bracket_depth += 1;
                    cursor += 1;
                }
                Some(b']') => {
                    if !self.record_token(cursor, cursor + 1) {
                        return self.text.len();
                    }
                    if bracket_depth > 0 {
                        self.exit_nesting();
                    }
                    bracket_depth = bracket_depth.saturating_sub(1);
                    cursor += 1;
                }
                Some(b'{') => {
                    if !self.record_token(cursor, cursor + 1)
                        || !self.enter_nesting(cursor, cursor + 1)
                    {
                        return self.text.len();
                    }
                    brace_depth += 1;
                    cursor += 1;
                }
                Some(b'}') => {
                    if !self.record_token(cursor, cursor + 1) {
                        return self.text.len();
                    }
                    if brace_depth > 0 {
                        self.exit_nesting();
                    }
                    brace_depth = brace_depth.saturating_sub(1);
                    cursor += 1;
                }
                // A `;` outside every bracket kind ends the statement.
                Some(b';')
                    if paren_depth == 0
                        && bracket_depth == 0
                        && brace_depth == 0
                        && !in_string
                        && !in_backquote =>
                {
                    let _ = self.record_token(cursor, cursor + 1);
                    return cursor + 1;
                }
                Some(ch) if (ch as char).is_whitespace() => cursor = self.next_offset(cursor),
                Some(_) => {
                    // Word-like run; a zero-length word (punctuation) is
                    // consumed as a single char token.
                    let end = self.scan_simple_word_until(cursor, self.text.len());
                    if end <= cursor {
                        if !self.record_token(cursor, self.next_offset(cursor)) {
                            return self.text.len();
                        }
                        cursor = self.next_offset(cursor);
                    } else {
                        if !self.record_token(cursor, end) {
                            return self.text.len();
                        }
                        cursor = end;
                    }
                }
                None => break,
            }
        }

        self.text.len()
    }
1292
1293 fn statement_body_end(&self, start: usize, end: usize) -> usize {
1294 let mut body_end = end;
1295 if body_end > start && self.byte_at(body_end - 1) == Some(b';') {
1296 body_end -= 1;
1297 }
1298 while body_end > start {
1299 let prev = self.prev_offset(body_end);
1300 let segment = &self.text[prev..body_end];
1301 if segment.chars().all(char::is_whitespace) {
1302 body_end = prev;
1303 continue;
1304 }
1305 break;
1306 }
1307 body_end
1308 }
1309
    /// Scans the next word at `start` (which must be `< body_end`) and
    /// classifies it: quoted string, backquote capture, `{...}` brace list,
    /// `<<...>>` vector literal, `(...)` grouped expression, or a simple word
    /// further split into variable (`$`), flag (`-x`), numeric, or bare word.
    /// Returns `None` at `body_end` or when a budget check fails.
    ///
    /// NOTE: backquote captures are not charged against the literal budget,
    /// unlike the other delimited forms.
    fn scan_word(&mut self, start: usize, body_end: usize) -> Option<(LightWord, usize)> {
        if start >= body_end {
            return None;
        }
        if self.byte_at(start) == Some(b'"') {
            let end = self.scan_quoted_string(start);
            let range = text_range(start as u32, end as u32);
            if !self.check_literal(range) {
                return None;
            }
            return Some((LightWord::QuotedString { text: range, range }, end));
        }
        if self.byte_at(start) == Some(b'`') {
            let end = self.scan_backquote(start);
            let range = text_range(start as u32, end as u32);
            return Some((LightWord::Capture { range }, end));
        }
        if self.byte_at(start) == Some(b'{') {
            let end = self.scan_balanced(start, b'{', b'}');
            let range = text_range(start as u32, end as u32);
            if !self.check_literal(range) {
                return None;
            }
            return Some((LightWord::BraceList { range }, end));
        }
        if self.starts_with(start, "<<") {
            let end = self.scan_vector_literal(start);
            let range = text_range(start as u32, end as u32);
            if !self.check_literal(range) {
                return None;
            }
            return Some((LightWord::VectorLiteral { range }, end));
        }
        if self.byte_at(start) == Some(b'(') {
            let end = self.scan_balanced(start, b'(', b')');
            let range = text_range(start as u32, end as u32);
            if !self.check_literal(range) {
                return None;
            }
            return Some((LightWord::GroupedExpr { range }, end));
        }

        // Plain word: classify by its leading character(s).
        let end = self.scan_simple_word_until(start, body_end);
        if end <= start {
            return None;
        }
        if !self.record_token(start, end) {
            return None;
        }
        let range = text_range(start as u32, end as u32);
        let text = &self.text[start..end];
        let word = if text.starts_with('$') {
            LightWord::Variable { range }
        } else if text.starts_with('-') && text.len() > 1 {
            // A lone `-` is not a flag; it falls through to numeric/bare.
            LightWord::Flag { text: range, range }
        } else if looks_numeric_like(text) {
            LightWord::NumericLiteral { text: range, range }
        } else {
            LightWord::BareWord { text: range, range }
        };
        Some((word, end))
    }
1372
1373 fn scan_quoted_string(&mut self, start: usize) -> usize {
1374 let mut cursor = start + 1;
1375 while cursor < self.text.len() {
1376 match self.byte_at(cursor) {
1377 Some(b'\\') => {
1378 cursor = self.next_offset(cursor + 1);
1379 }
1380 Some(b'"') => {
1381 let end = cursor + 1;
1382 let _ = self.record_token(start, end);
1383 return end;
1384 }
1385 Some(_) => cursor = self.next_offset(cursor),
1386 None => break,
1387 }
1388 }
1389 if self.is_halted() {
1390 return self.text.len();
1391 }
1392 let _ = self.record_token(start, self.text.len());
1393 self.errors.push(ParseError {
1394 message: "unterminated string literal in lightweight surface parse",
1395 range: text_range(start as u32, self.text.len() as u32),
1396 });
1397 self.text.len()
1398 }
1399
1400 fn scan_backquote(&mut self, start: usize) -> usize {
1401 let mut cursor = start + 1;
1402 while cursor < self.text.len() {
1403 match self.byte_at(cursor) {
1404 Some(b'\\') => {
1405 cursor = self.next_offset(cursor + 1);
1406 }
1407 Some(b'`') => {
1408 let end = cursor + 1;
1409 let _ = self.record_token(start, end);
1410 return end;
1411 }
1412 Some(b'"') => cursor = self.scan_quoted_string(cursor),
1413 Some(_) => cursor = self.next_offset(cursor),
1414 None => break,
1415 }
1416 }
1417 if self.is_halted() {
1418 return self.text.len();
1419 }
1420 let _ = self.record_token(start, self.text.len());
1421 self.errors.push(ParseError {
1422 message: "unterminated backquote capture in lightweight surface parse",
1423 range: text_range(start as u32, self.text.len() as u32),
1424 });
1425 self.text.len()
1426 }
1427
    /// Consumes a delimited group starting at `start`, matching `open` /
    /// `close` pairs of one delimiter kind while skipping comments, strings,
    /// and backquote captures. Delimiters of the *other* kinds are recorded
    /// as tokens but do not affect depth. Returns the offset just past the
    /// matching closer, or end of text with a diagnostic if unterminated.
    fn scan_balanced(&mut self, start: usize, open: u8, close: u8) -> usize {
        let mut cursor = start;
        let mut depth = 0usize;
        while cursor < self.text.len() && !self.is_halted() {
            if self.starts_with(cursor, "//") {
                cursor = self.skip_line_comment(cursor);
                continue;
            }
            if self.starts_with(cursor, "/*") {
                cursor = self.skip_block_comment(cursor);
                continue;
            }
            match self.byte_at(cursor) {
                Some(b'"') => cursor = self.scan_quoted_string(cursor),
                Some(b'`') => cursor = self.scan_backquote(cursor),
                Some(ch) if ch == open => {
                    if !self.record_token(cursor, cursor + 1)
                        || !self.enter_nesting(cursor, cursor + 1)
                    {
                        return self.text.len();
                    }
                    depth += 1;
                    cursor += 1;
                }
                Some(ch) if ch == close => {
                    if !self.record_token(cursor, cursor + 1) {
                        return self.text.len();
                    }
                    if depth > 0 {
                        self.exit_nesting();
                    }
                    depth = depth.saturating_sub(1);
                    cursor += 1;
                    // Depth 0 again: the group that began at `start` is closed.
                    if depth == 0 {
                        return cursor;
                    }
                }
                // Other bracket kinds and commas: single-char tokens only.
                Some(b'(' | b')' | b'[' | b']' | b'{' | b'}' | b',') => {
                    if !self.record_token(cursor, cursor + 1) {
                        return self.text.len();
                    }
                    cursor += 1;
                }
                Some(ch) if (ch as char).is_whitespace() => cursor = self.next_offset(cursor),
                Some(_) => {
                    let end = self.scan_simple_word_until(cursor, self.text.len());
                    if end <= cursor {
                        if !self.record_token(cursor, self.next_offset(cursor)) {
                            return self.text.len();
                        }
                        cursor = self.next_offset(cursor);
                    } else {
                        if !self.record_token(cursor, end) {
                            return self.text.len();
                        }
                        cursor = end;
                    }
                }
                None => break,
            }
        }
        if self.is_halted() {
            return self.text.len();
        }
        self.errors.push(ParseError {
            message: "unterminated grouped surface in lightweight parse",
            range: text_range(start as u32, self.text.len() as u32),
        });
        self.text.len()
    }
1498
    /// Consumes a `<< ... >>` vector literal starting at `start`. Quoted
    /// strings inside are skipped; nested `<<` is not tracked, so the first
    /// `>>` closes the literal. Unterminated literals consume the rest of
    /// the text and push a diagnostic.
    fn scan_vector_literal(&mut self, start: usize) -> usize {
        let mut cursor = start + 2;
        // Charge the opening `<<` as both a token and a nesting level.
        if !self.record_token(start, start + 2) || !self.enter_nesting(start, start + 2) {
            return self.text.len();
        }
        while cursor < self.text.len() && !self.is_halted() {
            if self.starts_with(cursor, ">>") {
                let _ = self.record_token(cursor, cursor + 2);
                self.exit_nesting();
                return cursor + 2;
            }
            if self.byte_at(cursor) == Some(b'"') {
                cursor = self.scan_quoted_string(cursor);
                continue;
            }
            if self
                .byte_at(cursor)
                .is_some_and(|ch| (ch as char).is_whitespace())
            {
                cursor = self.next_offset(cursor);
                continue;
            }
            let end = self.scan_simple_word_until(cursor, self.text.len());
            if end <= cursor {
                // Punctuation: consume as a single char token.
                let next = self.next_offset(cursor);
                if !self.record_token(cursor, next) {
                    return self.text.len();
                }
                cursor = next;
            } else {
                if !self.record_token(cursor, end) {
                    return self.text.len();
                }
                cursor = end;
            }
        }
        if self.is_halted() {
            return self.text.len();
        }
        self.errors.push(ParseError {
            message: "unterminated vector literal in lightweight parse",
            range: text_range(start as u32, self.text.len() as u32),
        });
        self.text.len()
    }
1544
    /// Scans forward from `cursor` until the `}` matching the first `{` of a
    /// proc body is closed, skipping comments, strings, and captures. A `}`
    /// seen before any `{` is consumed as a stray single-char token. Returns
    /// the offset just past the closing `}`, or end of text with a
    /// diagnostic when the body never closes. `start` is used only for the
    /// diagnostic range.
    fn scan_until_matching_body_end(&mut self, start: usize, cursor: usize) -> usize {
        let mut cursor = cursor;
        let mut depth = 0usize;
        let mut saw_body = false;
        while cursor < self.text.len() && !self.is_halted() {
            if self.starts_with(cursor, "//") {
                cursor = self.skip_line_comment(cursor);
                continue;
            }
            if self.starts_with(cursor, "/*") {
                cursor = self.skip_block_comment(cursor);
                continue;
            }
            match self.byte_at(cursor) {
                Some(b'"') => cursor = self.scan_quoted_string(cursor),
                Some(b'`') => cursor = self.scan_backquote(cursor),
                Some(b'{') => {
                    if !self.record_token(cursor, cursor + 1)
                        || !self.enter_nesting(cursor, cursor + 1)
                    {
                        return self.text.len();
                    }
                    saw_body = true;
                    depth += 1;
                    cursor += 1;
                }
                // `}` only balances once a `{` has been seen.
                Some(b'}') if saw_body => {
                    if !self.record_token(cursor, cursor + 1) {
                        return self.text.len();
                    }
                    if depth > 0 {
                        self.exit_nesting();
                    }
                    depth = depth.saturating_sub(1);
                    cursor += 1;
                    if depth == 0 {
                        return cursor;
                    }
                }
                // Other punctuation: recorded but not nesting-tracked here.
                Some(b'(' | b')' | b'[' | b']' | b',' | b';') => {
                    if !self.record_token(cursor, cursor + 1) {
                        return self.text.len();
                    }
                    cursor += 1;
                }
                Some(ch) if (ch as char).is_whitespace() => cursor = self.next_offset(cursor),
                Some(_) => {
                    let end = self.scan_simple_word_until(cursor, self.text.len());
                    if end <= cursor {
                        if !self.record_token(cursor, self.next_offset(cursor)) {
                            return self.text.len();
                        }
                        cursor = self.next_offset(cursor);
                    } else {
                        if !self.record_token(cursor, end) {
                            return self.text.len();
                        }
                        cursor = end;
                    }
                }
                None => break,
            }
        }
        if self.is_halted() {
            return self.text.len();
        }
        self.errors.push(ParseError {
            message: "unterminated proc body in lightweight surface parse",
            range: text_range(start as u32, self.text.len() as u32),
        });
        self.text.len()
    }
1617
1618 fn scan_simple_word(&mut self, start: usize) -> Option<(usize, usize)> {
1619 let start = self.skip_trivia(start);
1620 let end = self.scan_simple_word_until(start, self.text.len());
1621 if end > start && !self.record_token(start, end) {
1622 return None;
1623 }
1624 (end > start).then_some((start, end))
1625 }
1626
1627 fn scan_simple_word_until(&self, start: usize, body_end: usize) -> usize {
1628 let mut cursor = start;
1629 while cursor < body_end {
1630 if self.starts_with(cursor, "//") || self.starts_with(cursor, "/*") {
1631 break;
1632 }
1633 match self.byte_at(cursor) {
1634 Some(b';' | b'(' | b')' | b'{' | b'}' | b'[' | b']' | b'`' | b'"') | None => break,
1635 Some(ch) if (ch as char).is_whitespace() => break,
1636 Some(_) => cursor = self.next_offset(cursor),
1637 }
1638 }
1639 cursor
1640 }
1641
1642 fn skip_trivia(&mut self, start: usize) -> usize {
1643 let mut cursor = start;
1644 while cursor < self.text.len() {
1645 if self.starts_with(cursor, "//") {
1646 cursor = self.skip_line_comment(cursor);
1647 continue;
1648 }
1649 if self.starts_with(cursor, "/*") {
1650 cursor = self.skip_block_comment(cursor);
1651 continue;
1652 }
1653 let Some(ch) = self.text[cursor..].chars().next() else {
1654 break;
1655 };
1656 if ch.is_whitespace() {
1657 cursor += ch.len_utf8();
1658 continue;
1659 }
1660 break;
1661 }
1662 cursor
1663 }
1664
1665 fn skip_trivia_peek(&self, start: usize) -> usize {
1666 let mut cursor = start;
1667 while cursor < self.text.len() {
1668 if self.starts_with(cursor, "//") {
1669 cursor = self.skip_line_comment(cursor);
1670 continue;
1671 }
1672 if self.starts_with(cursor, "/*") {
1673 let Some(after_comment) = self.skip_block_comment_peek(cursor) else {
1674 return self.text.len();
1675 };
1676 cursor = after_comment;
1677 continue;
1678 }
1679 let Some(ch) = self.text[cursor..].chars().next() else {
1680 break;
1681 };
1682 if ch.is_whitespace() {
1683 cursor += ch.len_utf8();
1684 continue;
1685 }
1686 break;
1687 }
1688 cursor
1689 }
1690
1691 fn skip_line_comment(&self, start: usize) -> usize {
1692 let mut cursor = start + 2;
1693 while cursor < self.text.len() {
1694 match self.byte_at(cursor) {
1695 Some(b'\n') => return cursor + 1,
1696 Some(_) => cursor = self.next_offset(cursor),
1697 None => break,
1698 }
1699 }
1700 self.text.len()
1701 }
1702
1703 fn skip_block_comment(&mut self, start: usize) -> usize {
1704 let mut cursor = start + 2;
1705 while cursor < self.text.len() {
1706 if self.starts_with(cursor, "*/") {
1707 return cursor + 2;
1708 }
1709 cursor = self.next_offset(cursor);
1710 }
1711 if !self.reported_unterminated_block_comment {
1712 self.errors.push(ParseError {
1713 message: "unterminated block comment",
1714 range: text_range(start as u32, self.text.len() as u32),
1715 });
1716 self.reported_unterminated_block_comment = true;
1717 }
1718 self.text.len()
1719 }
1720
1721 fn skip_block_comment_peek(&self, start: usize) -> Option<usize> {
1722 let mut cursor = start + 2;
1723 while cursor < self.text.len() {
1724 if self.starts_with(cursor, "*/") {
1725 return Some(cursor + 2);
1726 }
1727 cursor = self.next_offset(cursor);
1728 }
1729 None
1730 }
1731
1732 fn advance_string_body(&self, start: usize) -> usize {
1733 let mut cursor = start;
1734 while cursor < self.text.len() {
1735 match self.byte_at(cursor) {
1736 Some(b'\\') => cursor = self.next_offset(cursor + 1),
1737 Some(b'"') => return cursor + 1,
1738 Some(_) => cursor = self.next_offset(cursor),
1739 None => break,
1740 }
1741 }
1742 self.text.len()
1743 }
1744
1745 fn advance_backquote_body(&self, start: usize) -> usize {
1746 let mut cursor = start;
1747 while cursor < self.text.len() {
1748 match self.byte_at(cursor) {
1749 Some(b'\\') => cursor = self.next_offset(cursor + 1),
1750 Some(b'`') => return cursor + 1,
1751 Some(b'"') => cursor = self.advance_string_body(cursor + 1),
1752 Some(_) => cursor = self.next_offset(cursor),
1753 None => break,
1754 }
1755 }
1756 self.text.len()
1757 }
1758
1759 fn is_proc_start(&mut self, start: usize) -> bool {
1760 if self.peek_keyword_end(start, "proc").is_some() {
1761 return true;
1762 }
1763 let Some(after_global) = self.peek_keyword_end(start, "global") else {
1764 return false;
1765 };
1766 let after_global = self.skip_trivia_peek(after_global);
1767 self.peek_keyword_end(after_global, "proc").is_some()
1768 }
1769
1770 fn peek_keyword_end(&self, start: usize, keyword: &str) -> Option<usize> {
1771 let cursor = self.skip_trivia_peek(start);
1772 if !self.text[cursor..].starts_with(keyword) {
1773 return None;
1774 }
1775 let end = cursor + keyword.len();
1776 let next = self.text[end..].chars().next();
1777 if next.is_some_and(is_word_continue) {
1778 return None;
1779 }
1780 Some(end)
1781 }
1782
1783 fn consume_keyword(&mut self, start: usize, keyword: &str) -> Option<usize> {
1784 let cursor = self.skip_trivia(start);
1785 if !self.text[cursor..].starts_with(keyword) {
1786 return None;
1787 }
1788 let end = cursor + keyword.len();
1789 let next = self.text[end..].chars().next();
1790 if next.is_some_and(is_word_continue) {
1791 return None;
1792 }
1793 if !self.record_token(cursor, end) {
1794 return None;
1795 }
1796 Some(end)
1797 }
1798
    /// Whether `text[start..]` begins with `needle`.
    fn starts_with(&self, start: usize, needle: &str) -> bool {
        self.text[start..].starts_with(needle)
    }

    /// Byte at `offset`, or `None` past the end of the text.
    fn byte_at(&self, offset: usize) -> Option<u8> {
        self.text.as_bytes().get(offset).copied()
    }

    /// Alias of `byte_at`, used where the call site reads as lookahead.
    fn peek_byte(&self, offset: usize) -> Option<u8> {
        self.byte_at(offset)
    }

    /// Offset just past the char starting at `offset`, clamped to text end.
    fn next_offset(&self, offset: usize) -> usize {
        self.text[offset..]
            .chars()
            .next()
            .map_or(self.text.len(), |ch| offset + ch.len_utf8())
    }

    /// Char boundary immediately before `offset` (0 at the start of text).
    fn prev_offset(&self, offset: usize) -> usize {
        let mut index = offset.saturating_sub(1);
        while !self.text.is_char_boundary(index) {
            index = index.saturating_sub(1);
        }
        index
    }
1825
    /// True once any budget has been exhausted; scan loops exit promptly.
    fn is_halted(&self) -> bool {
        self.budget.halted
    }

    /// Records the first budget error and flips the halt flag. Subsequent
    /// calls are no-ops so only one budget diagnostic is ever emitted.
    fn halt(&mut self, error: ParseError) {
        if self.reported_budget_error {
            return;
        }
        self.reported_budget_error = true;
        self.budget.halted = true;
        self.errors.push(error);
    }

    /// Charges one token; on overflow halts with `max_tokens` and returns false.
    fn record_token(&mut self, start: usize, end: usize) -> bool {
        let range = text_range(start as u32, end as u32);
        if !self.budget.record_token() {
            self.halt(budget_error("max_tokens", range));
            return false;
        }
        true
    }

    /// Charges one statement; on overflow halts with `max_statements`.
    fn record_statement(&mut self, range: TextRange) -> bool {
        if !self.budget.record_statement() {
            self.halt(budget_error("max_statements", range));
            return false;
        }
        true
    }

    /// Charges one nesting level; on overflow halts with `max_nesting_depth`.
    fn enter_nesting(&mut self, start: usize, end: usize) -> bool {
        let range = text_range(start as u32, end as u32);
        if !self.budget.enter_nesting() {
            self.halt(budget_error("max_nesting_depth", range));
            return false;
        }
        true
    }

    /// Releases one nesting level.
    fn exit_nesting(&mut self) {
        self.budget.exit_nesting();
    }

    /// Checks a literal's byte length; on overflow halts with `max_literal_bytes`.
    fn check_literal(&mut self, range: TextRange) -> bool {
        if !self.budget.check_literal(usize::from(range.len())) {
            self.halt(budget_error("max_literal_bytes", range));
            return false;
        }
        true
    }
1876}
1877
/// Byte-level counterpart of `LightScanner`: scans undecoded input bytes and
/// carries the declared source `encoding` along for consumers of the emitted
/// items.
struct ByteLightScanner<'a> {
    bytes: &'a [u8],
    encoding: SourceEncoding,
    options: LightParseOptions,
    errors: Vec<ParseError>,
    // Set once the first unterminated block comment has been reported.
    reported_unterminated_block_comment: bool,
    // Set once the first budget error has been reported.
    reported_budget_error: bool,
    budget: LightBudgetTracker,
}
1887
1888impl<'a> ByteLightScanner<'a> {
    /// Creates a byte scanner over `bytes` with empty diagnostics and a fresh
    /// budget tracker derived from `options.budgets`.
    fn new(bytes: &'a [u8], encoding: SourceEncoding, options: LightParseOptions) -> Self {
        Self {
            bytes,
            encoding,
            options,
            errors: Vec::new(),
            reported_unterminated_block_comment: false,
            reported_budget_error: false,
            budget: LightBudgetTracker::new(options.budgets),
        }
    }
1900
    /// Main scan loop over the raw bytes; mirrors
    /// `LightScanner::scan_with_sink` but without offset remapping.
    fn scan_with_sink(&mut self, sink: &mut impl LightItemSink) {
        let source = LightSourceView::Bytes {
            bytes: self.bytes,
            encoding: self.encoding,
        };
        let mut cursor = self.skip_trivia(0);
        while cursor < self.bytes.len() && !self.is_halted() {
            let (item, next_cursor) = if self.is_proc_start(cursor) {
                self.scan_proc_item(cursor)
            } else {
                self.scan_statement_item(cursor)
            };
            // A halt raised while scanning the item invalidates it.
            if self.is_halted() {
                break;
            }
            if !self.record_statement(start_range(&item)) {
                break;
            }
            sink.on_item(source, item);
            cursor = self.skip_trivia(next_cursor);
        }
    }
1923
    /// Byte-level mirror of `LightScanner::scan_proc_item`: optional
    /// `global`, then `proc`, up to two simple words, then a braced body.
    /// The word directly before `(` is the proc name.
    fn scan_proc_item(&mut self, start: usize) -> (LightItem, usize) {
        let mut cursor = start;
        let mut is_global = false;
        if let Some(after_global) = self.consume_keyword(cursor, b"global") {
            is_global = true;
            cursor = self.skip_trivia(after_global);
        }
        let after_proc = self.consume_keyword(cursor, b"proc").unwrap_or(cursor);
        cursor = self.skip_trivia(after_proc);
        let first_word = self.scan_simple_word(cursor);
        let mut name_range = None;
        let mut body_scan_start = cursor;
        if let Some((first_start, first_end)) = first_word {
            let after_first = self.skip_trivia(first_end);
            body_scan_start = after_first;
            if self.peek_byte(after_first) == Some(b'(') {
                // `proc name(...)` — the single word is the name.
                name_range = Some(text_range(first_start as u32, first_end as u32));
            } else if let Some((name_start, name_end)) = self.scan_simple_word(after_first) {
                // Two words before the body — the second is the name.
                name_range = Some(text_range(name_start as u32, name_end as u32));
                body_scan_start = self.skip_trivia(name_end);
            }
        }
        let end = self.scan_until_matching_body_end(start, body_scan_start);
        (
            LightItem::Proc(LightProcSurface {
                name_range,
                is_global,
                span: text_range(start as u32, end as u32),
            }),
            end,
        )
    }
1956
    /// Byte-level mirror of `LightScanner::scan_statement_item`: classifies
    /// one statement as a command surface or an opaque `Other` span.
    fn scan_statement_item(&mut self, start: usize) -> (LightItem, usize) {
        // No head word: consume the whole statement opaquely.
        let Some((head_start, head_end)) = self.scan_simple_word(start) else {
            let end = self.scan_statement_tail(start);
            return (
                LightItem::Other {
                    span: text_range(start as u32, end as u32),
                },
                end,
            );
        };
        let head_range = text_range(head_start as u32, head_end as u32);
        let head_is_non_command = is_non_command_head_bytes(&self.bytes[head_start..head_end]);
        let after_head = self.skip_trivia(head_end);
        // `head(...)` or a known non-command head: opaque statement.
        if self.peek_byte(after_head) == Some(b'(') || head_is_non_command {
            let end = self.scan_statement_tail(after_head);
            return (
                LightItem::Other {
                    span: text_range(start as u32, end as u32),
                },
                end,
            );
        }
        let (end, words, opaque_tail) =
            self.scan_command_statement_tail(start, head_end, after_head);
        (
            LightItem::Command(LightCommandSurface {
                head_range,
                captured: false,
                words,
                opaque_tail,
                span: text_range(start as u32, end as u32),
            }),
            end,
        )
    }
1992
    /// Byte-level mirror of `LightScanner::scan_command_statement_tail`:
    /// collects argument words until the statement ends, falling back to an
    /// `opaque_tail` range once prefix limits are hit or a word fails.
    fn scan_command_statement_tail(
        &mut self,
        start: usize,
        head_end: usize,
        after_head: usize,
    ) -> (usize, Vec<LightWord>, Option<TextRange>) {
        let mut words = Vec::with_capacity(self.options.max_prefix_words.min(8));
        let mut cursor = after_head;
        loop {
            cursor = self.skip_trivia(cursor);
            if cursor >= self.bytes.len() {
                return (self.bytes.len(), words, None);
            }
            // `;` ends the command; budget failure is ignored here because
            // the statement is already complete.
            if self.byte_at(cursor) == Some(b';') {
                let _ = self.record_token(cursor, cursor + 1);
                return (cursor + 1, words, None);
            }
            if words.len() >= self.options.max_prefix_words
                || cursor.saturating_sub(head_end) >= self.options.max_prefix_bytes
            {
                // Prefix limits hit: expose the remainder as an opaque tail.
                let end = self.scan_statement_tail(cursor);
                let body_end = self.statement_body_end(start, end);
                let opaque_tail =
                    (cursor < body_end).then(|| text_range(cursor as u32, body_end as u32));
                return (end, words, opaque_tail);
            }
            let Some((word, next_cursor)) = self.scan_word(cursor, self.bytes.len()) else {
                if self.is_halted() {
                    return (self.bytes.len(), words, None);
                }
                // Unscannable word: same opaque-tail fallback as above.
                let end = self.scan_statement_tail(cursor);
                let body_end = self.statement_body_end(start, end);
                let opaque_tail =
                    (cursor < body_end).then(|| text_range(cursor as u32, body_end as u32));
                return (end, words, opaque_tail);
            };
            words.push(word);
            cursor = next_cursor;
        }
    }
2033
    /// Byte-level mirror of `LightScanner::scan_statement_tail`: skips to the
    /// first `;` outside every bracket kind (strings/captures/comments are
    /// consumed via the dedicated scanners), or end of input.
    fn scan_statement_tail(&mut self, start: usize) -> usize {
        let mut cursor = start;
        let mut paren_depth = 0usize;
        let mut bracket_depth = 0usize;
        let mut brace_depth = 0usize;
        while cursor < self.bytes.len() && !self.is_halted() {
            if self.starts_with(cursor, b"//") {
                cursor = self.skip_line_comment(cursor);
                continue;
            }
            if self.starts_with(cursor, b"/*") {
                cursor = self.skip_block_comment(cursor);
                continue;
            }
            match self.byte_at(cursor) {
                Some(b'"') => cursor = self.scan_quoted_string(cursor),
                Some(b'`') => cursor = self.scan_backquote(cursor),
                Some(b'(') => {
                    if !self.record_token(cursor, cursor + 1)
                        || !self.enter_nesting(cursor, cursor + 1)
                    {
                        return self.bytes.len();
                    }
                    paren_depth += 1;
                    cursor += 1;
                }
                // Closers: unbalanced ones are tolerated — depth never
                // underflows, and nesting is only released when an opener
                // was actually seen.
                Some(b')') => {
                    if !self.record_token(cursor, cursor + 1) {
                        return self.bytes.len();
                    }
                    if paren_depth > 0 {
                        self.exit_nesting();
                    }
                    paren_depth = paren_depth.saturating_sub(1);
                    cursor += 1;
                }
                Some(b'[') => {
                    if !self.record_token(cursor, cursor + 1)
                        || !self.enter_nesting(cursor, cursor + 1)
                    {
                        return self.bytes.len();
                    }
                    bracket_depth += 1;
                    cursor += 1;
                }
                Some(b']') => {
                    if !self.record_token(cursor, cursor + 1) {
                        return self.bytes.len();
                    }
                    if bracket_depth > 0 {
                        self.exit_nesting();
                    }
                    bracket_depth = bracket_depth.saturating_sub(1);
                    cursor += 1;
                }
                Some(b'{') => {
                    if !self.record_token(cursor, cursor + 1)
                        || !self.enter_nesting(cursor, cursor + 1)
                    {
                        return self.bytes.len();
                    }
                    brace_depth += 1;
                    cursor += 1;
                }
                Some(b'}') => {
                    if !self.record_token(cursor, cursor + 1) {
                        return self.bytes.len();
                    }
                    if brace_depth > 0 {
                        self.exit_nesting();
                    }
                    brace_depth = brace_depth.saturating_sub(1);
                    cursor += 1;
                }
                // A `;` outside every bracket kind ends the statement.
                Some(b';') if paren_depth == 0 && bracket_depth == 0 && brace_depth == 0 => {
                    let _ = self.record_token(cursor, cursor + 1);
                    return cursor + 1;
                }
                Some(ch) if is_ascii_ws(ch) => cursor += 1,
                Some(_) => {
                    // Word-like run; a zero-length word (punctuation) is
                    // consumed as a single token.
                    let end = self.scan_simple_word_until(cursor, self.bytes.len());
                    if end <= cursor {
                        let next = self.next_offset(cursor);
                        if !self.record_token(cursor, next) {
                            return self.bytes.len();
                        }
                        cursor = next;
                    } else {
                        if !self.record_token(cursor, end) {
                            return self.bytes.len();
                        }
                        cursor = end;
                    }
                }
                None => break,
            }
        }
        self.bytes.len()
    }
2133
2134 fn statement_body_end(&self, start: usize, end: usize) -> usize {
2135 let mut body_end = end;
2136 if body_end > start && self.byte_at(body_end - 1) == Some(b';') {
2137 body_end -= 1;
2138 }
2139 while body_end > start && self.byte_at(body_end - 1).is_some_and(is_ascii_ws) {
2140 body_end -= 1;
2141 }
2142 body_end
2143 }
2144
    /// Byte-level mirror of `LightScanner::scan_word`: classifies the next
    /// word as quoted string, capture, brace list, vector literal, grouped
    /// expression, or a simple word (variable/flag/numeric/bare). Returns
    /// `None` at `body_end` or when a budget check fails.
    ///
    /// NOTE: backquote captures are not charged against the literal budget,
    /// unlike the other delimited forms.
    fn scan_word(&mut self, start: usize, body_end: usize) -> Option<(LightWord, usize)> {
        if start >= body_end {
            return None;
        }
        if self.byte_at(start) == Some(b'"') {
            let end = self.scan_quoted_string(start);
            let range = text_range(start as u32, end as u32);
            if !self.check_literal(range) {
                return None;
            }
            return Some((LightWord::QuotedString { text: range, range }, end));
        }
        if self.byte_at(start) == Some(b'`') {
            let end = self.scan_backquote(start);
            let range = text_range(start as u32, end as u32);
            return Some((LightWord::Capture { range }, end));
        }
        if self.byte_at(start) == Some(b'{') {
            let end = self.scan_balanced(start, b'{', b'}');
            let range = text_range(start as u32, end as u32);
            if !self.check_literal(range) {
                return None;
            }
            return Some((LightWord::BraceList { range }, end));
        }
        if self.starts_with(start, b"<<") {
            let end = self.scan_vector_literal(start);
            let range = text_range(start as u32, end as u32);
            if !self.check_literal(range) {
                return None;
            }
            return Some((LightWord::VectorLiteral { range }, end));
        }
        if self.byte_at(start) == Some(b'(') {
            let end = self.scan_balanced(start, b'(', b')');
            let range = text_range(start as u32, end as u32);
            if !self.check_literal(range) {
                return None;
            }
            return Some((LightWord::GroupedExpr { range }, end));
        }
        // Plain word: classify by its leading byte(s).
        let end = self.scan_simple_word_until(start, body_end);
        if end <= start {
            return None;
        }
        if !self.record_token(start, end) {
            return None;
        }
        let range = text_range(start as u32, end as u32);
        let text = &self.bytes[start..end];
        let word = if text.starts_with(b"$") {
            LightWord::Variable { range }
        } else if text.starts_with(b"-") && text.len() > 1 {
            // A lone `-` is not a flag; it falls through to numeric/bare.
            LightWord::Flag { text: range, range }
        } else if looks_numeric_like_bytes(text) {
            LightWord::NumericLiteral { text: range, range }
        } else {
            LightWord::BareWord { text: range, range }
        };
        Some((word, end))
    }
2206
    /// Scans a `"`-delimited string whose opening quote sits at `start`.
    /// `\` escapes the next character (including `"`). Returns the offset
    /// just past the closing quote; an unterminated string consumes the
    /// rest of the input and reports an error unless a budget halt already
    /// occurred.
    fn scan_quoted_string(&mut self, start: usize) -> usize {
        let mut cursor = start + 1;
        while cursor < self.bytes.len() {
            match self.byte_at(cursor) {
                // Skip the escaped character, whatever it is.
                Some(b'\\') => cursor = self.next_offset(cursor + 1),
                Some(b'"') => {
                    let end = cursor + 1;
                    // Budget exhaustion here is deliberately ignored — the
                    // string is complete and the caller re-checks budgets.
                    let _ = self.record_token(start, end);
                    return end;
                }
                Some(_) => cursor = self.next_offset(cursor),
                None => break,
            }
        }
        if self.is_halted() {
            return self.bytes.len();
        }
        let _ = self.record_token(start, self.bytes.len());
        self.errors.push(ParseError {
            message: "unterminated string literal in lightweight surface parse",
            range: text_range(start as u32, self.bytes.len() as u32),
        });
        self.bytes.len()
    }
2231
    /// Scans a backquote capture whose opening `` ` `` sits at `start`.
    /// `\` escapes the next character, and embedded `"` strings are skipped
    /// wholesale so a quote cannot hide the closing backquote. Returns the
    /// offset just past the closing `` ` ``; an unterminated capture
    /// consumes the rest of the input and reports an error unless a budget
    /// halt already occurred.
    fn scan_backquote(&mut self, start: usize) -> usize {
        let mut cursor = start + 1;
        while cursor < self.bytes.len() {
            match self.byte_at(cursor) {
                Some(b'\\') => cursor = self.next_offset(cursor + 1),
                Some(b'`') => {
                    let end = cursor + 1;
                    let _ = self.record_token(start, end);
                    return end;
                }
                // Nested string: skip it entirely.
                Some(b'"') => cursor = self.scan_quoted_string(cursor),
                Some(_) => cursor = self.next_offset(cursor),
                None => break,
            }
        }
        if self.is_halted() {
            return self.bytes.len();
        }
        let _ = self.record_token(start, self.bytes.len());
        self.errors.push(ParseError {
            message: "unterminated backquote capture in lightweight surface parse",
            range: text_range(start as u32, self.bytes.len() as u32),
        });
        self.bytes.len()
    }
2257
    /// Scans from the `open` delimiter at `start` to its matching `close`,
    /// counting nesting depth for that one delimiter pair only. Comments,
    /// strings, and backquote captures are skipped so their contents cannot
    /// spoof a delimiter; other bracket/comma bytes are recorded as
    /// single-byte tokens without affecting depth.
    ///
    /// Returns the offset just past the matching `close`. On a budget halt
    /// or a missing close it consumes the rest of the input, reporting an
    /// "unterminated grouped surface" error in the unterminated case.
    fn scan_balanced(&mut self, start: usize, open: u8, close: u8) -> usize {
        let mut cursor = start;
        let mut depth = 0usize;
        while cursor < self.bytes.len() && !self.is_halted() {
            if self.starts_with(cursor, b"//") {
                cursor = self.skip_line_comment(cursor);
                continue;
            }
            if self.starts_with(cursor, b"/*") {
                cursor = self.skip_block_comment(cursor);
                continue;
            }
            match self.byte_at(cursor) {
                Some(b'"') => cursor = self.scan_quoted_string(cursor),
                Some(b'`') => cursor = self.scan_backquote(cursor),
                Some(ch) if ch == open => {
                    // Each `open` costs one token plus one nesting level.
                    if !self.record_token(cursor, cursor + 1)
                        || !self.enter_nesting(cursor, cursor + 1)
                    {
                        return self.bytes.len();
                    }
                    depth += 1;
                    cursor += 1;
                }
                Some(ch) if ch == close => {
                    if !self.record_token(cursor, cursor + 1) {
                        return self.bytes.len();
                    }
                    if depth > 0 {
                        self.exit_nesting();
                    }
                    depth = depth.saturating_sub(1);
                    cursor += 1;
                    // Depth back to zero: the delimiter that started the
                    // scan has just been closed.
                    if depth == 0 {
                        return cursor;
                    }
                }
                Some(b'(' | b')' | b'[' | b']' | b'{' | b'}' | b',') => {
                    // Foreign delimiters: tokenized, no depth tracking.
                    if !self.record_token(cursor, cursor + 1) {
                        return self.bytes.len();
                    }
                    cursor += 1;
                }
                Some(ch) if is_ascii_ws(ch) => cursor += 1,
                Some(_) => {
                    let end = self.scan_simple_word_until(cursor, self.bytes.len());
                    if end <= cursor {
                        // Defensive: consume one character so the loop
                        // always makes forward progress.
                        let next = self.next_offset(cursor);
                        if !self.record_token(cursor, next) {
                            return self.bytes.len();
                        }
                        cursor = next;
                    } else {
                        if !self.record_token(cursor, end) {
                            return self.bytes.len();
                        }
                        cursor = end;
                    }
                }
                None => break,
            }
        }
        if self.is_halted() {
            return self.bytes.len();
        }
        self.errors.push(ParseError {
            message: "unterminated grouped surface in lightweight parse",
            range: text_range(start as u32, self.bytes.len() as u32),
        });
        self.bytes.len()
    }
2329
    /// Scans a `<<…>>` vector literal whose `<<` sits at `start`. Embedded
    /// `"` strings are skipped wholesale; everything else is tokenized as
    /// whitespace-separated simple words. Returns the offset just past the
    /// closing `>>`; an unterminated literal consumes the rest of the input
    /// and reports an error unless a budget halt already occurred.
    fn scan_vector_literal(&mut self, start: usize) -> usize {
        let mut cursor = start + 2;
        // The `<<` itself costs one token and one nesting level.
        if !self.record_token(start, start + 2) || !self.enter_nesting(start, start + 2) {
            return self.bytes.len();
        }
        while cursor < self.bytes.len() && !self.is_halted() {
            if self.starts_with(cursor, b">>") {
                let _ = self.record_token(cursor, cursor + 2);
                self.exit_nesting();
                return cursor + 2;
            }
            if self.byte_at(cursor) == Some(b'"') {
                cursor = self.scan_quoted_string(cursor);
                continue;
            }
            if self.byte_at(cursor).is_some_and(is_ascii_ws) {
                cursor += 1;
                continue;
            }
            let end = self.scan_simple_word_until(cursor, self.bytes.len());
            if end <= cursor {
                // Bytes no word can start (e.g. a stray delimiter): consume
                // one character so the loop always progresses.
                let next = self.next_offset(cursor);
                if !self.record_token(cursor, next) {
                    return self.bytes.len();
                }
                cursor = next;
            } else {
                if !self.record_token(cursor, end) {
                    return self.bytes.len();
                }
                cursor = end;
            }
        }
        if self.is_halted() {
            return self.bytes.len();
        }
        self.errors.push(ParseError {
            message: "unterminated vector literal in lightweight parse",
            range: text_range(start as u32, self.bytes.len() as u32),
        });
        self.bytes.len()
    }
2372
    /// Scans forward from `cursor` (typically just past a proc signature)
    /// until the brace-delimited body opened along the way closes again.
    ///
    /// `saw_body` stays false until the first `{`, so a stray `}` seen
    /// before the body cannot end the scan early. Comments, strings, and
    /// backquote captures are skipped so their contents cannot spoof a
    /// brace; other delimiter bytes are tokenized without affecting depth.
    ///
    /// Returns the offset just past the closing `}`. On a budget halt or a
    /// missing close it consumes the rest of the input, reporting an
    /// "unterminated proc body" error in the unterminated case.
    fn scan_until_matching_body_end(&mut self, start: usize, cursor: usize) -> usize {
        let mut cursor = cursor;
        let mut depth = 0usize;
        let mut saw_body = false;
        while cursor < self.bytes.len() && !self.is_halted() {
            if self.starts_with(cursor, b"//") {
                cursor = self.skip_line_comment(cursor);
                continue;
            }
            if self.starts_with(cursor, b"/*") {
                cursor = self.skip_block_comment(cursor);
                continue;
            }
            match self.byte_at(cursor) {
                Some(b'"') => cursor = self.scan_quoted_string(cursor),
                Some(b'`') => cursor = self.scan_backquote(cursor),
                Some(b'{') => {
                    // Every `{` costs one token plus one nesting level.
                    if !self.record_token(cursor, cursor + 1)
                        || !self.enter_nesting(cursor, cursor + 1)
                    {
                        return self.bytes.len();
                    }
                    saw_body = true;
                    depth += 1;
                    cursor += 1;
                }
                Some(b'}') if saw_body => {
                    if !self.record_token(cursor, cursor + 1) {
                        return self.bytes.len();
                    }
                    if depth > 0 {
                        self.exit_nesting();
                    }
                    depth = depth.saturating_sub(1);
                    cursor += 1;
                    // Depth back to zero: the proc body just closed.
                    if depth == 0 {
                        return cursor;
                    }
                }
                Some(b'(' | b')' | b'[' | b']' | b',' | b';') => {
                    // Non-brace delimiters: tokenized, no depth tracking.
                    if !self.record_token(cursor, cursor + 1) {
                        return self.bytes.len();
                    }
                    cursor += 1;
                }
                Some(ch) if is_ascii_ws(ch) => cursor += 1,
                Some(_) => {
                    let end = self.scan_simple_word_until(cursor, self.bytes.len());
                    if end <= cursor {
                        // Reached for bytes no word can start (e.g. a `}`
                        // before any `{`): consume one character as a token
                        // so the loop always progresses.
                        let next = self.next_offset(cursor);
                        if !self.record_token(cursor, next) {
                            return self.bytes.len();
                        }
                        cursor = next;
                    } else {
                        if !self.record_token(cursor, end) {
                            return self.bytes.len();
                        }
                        cursor = end;
                    }
                }
                None => break,
            }
        }
        if self.is_halted() {
            return self.bytes.len();
        }
        self.errors.push(ParseError {
            message: "unterminated proc body in lightweight surface parse",
            range: text_range(start as u32, self.bytes.len() as u32),
        });
        self.bytes.len()
    }
2446
2447 fn scan_simple_word(&mut self, start: usize) -> Option<(usize, usize)> {
2448 let start = self.skip_trivia(start);
2449 let end = self.scan_simple_word_until(start, self.bytes.len());
2450 if end > start && !self.record_token(start, end) {
2451 return None;
2452 }
2453 (end > start).then_some((start, end))
2454 }
2455
2456 fn scan_simple_word_until(&self, start: usize, body_end: usize) -> usize {
2457 let mut cursor = start;
2458 while cursor < body_end {
2459 if self.starts_with(cursor, b"//") || self.starts_with(cursor, b"/*") {
2460 break;
2461 }
2462 match self.byte_at(cursor) {
2463 Some(b';' | b'(' | b')' | b'{' | b'}' | b'[' | b']' | b'`' | b'"') | None => break,
2464 Some(ch) if is_ascii_ws(ch) => break,
2465 Some(_) => cursor = self.next_offset(cursor),
2466 }
2467 }
2468 cursor
2469 }
2470
2471 fn skip_trivia(&mut self, start: usize) -> usize {
2472 let mut cursor = start;
2473 while cursor < self.bytes.len() {
2474 if self.starts_with(cursor, b"//") {
2475 cursor = self.skip_line_comment(cursor);
2476 continue;
2477 }
2478 if self.starts_with(cursor, b"/*") {
2479 cursor = self.skip_block_comment(cursor);
2480 continue;
2481 }
2482 if self.byte_at(cursor).is_some_and(is_ascii_ws) {
2483 cursor += 1;
2484 continue;
2485 }
2486 break;
2487 }
2488 cursor
2489 }
2490
2491 fn skip_trivia_peek(&self, start: usize) -> usize {
2492 let mut cursor = start;
2493 while cursor < self.bytes.len() {
2494 if self.starts_with(cursor, b"//") {
2495 cursor = self.skip_line_comment(cursor);
2496 continue;
2497 }
2498 if self.starts_with(cursor, b"/*") {
2499 let Some(after_comment) = self.skip_block_comment_peek(cursor) else {
2500 return self.bytes.len();
2501 };
2502 cursor = after_comment;
2503 continue;
2504 }
2505 if self.byte_at(cursor).is_some_and(is_ascii_ws) {
2506 cursor += 1;
2507 continue;
2508 }
2509 break;
2510 }
2511 cursor
2512 }
2513
2514 fn skip_line_comment(&self, start: usize) -> usize {
2515 let mut cursor = start + 2;
2516 while cursor < self.bytes.len() {
2517 match self.byte_at(cursor) {
2518 Some(b'\n') => return cursor + 1,
2519 Some(_) => cursor = self.next_offset(cursor),
2520 None => break,
2521 }
2522 }
2523 self.bytes.len()
2524 }
2525
2526 fn skip_block_comment(&mut self, start: usize) -> usize {
2527 let mut cursor = start + 2;
2528 while cursor < self.bytes.len() {
2529 if self.starts_with(cursor, b"*/") {
2530 return cursor + 2;
2531 }
2532 cursor = self.next_offset(cursor);
2533 }
2534 if !self.reported_unterminated_block_comment {
2535 self.errors.push(ParseError {
2536 message: "unterminated block comment",
2537 range: text_range(start as u32, self.bytes.len() as u32),
2538 });
2539 self.reported_unterminated_block_comment = true;
2540 }
2541 self.bytes.len()
2542 }
2543
2544 fn skip_block_comment_peek(&self, start: usize) -> Option<usize> {
2545 let mut cursor = start + 2;
2546 while cursor < self.bytes.len() {
2547 if self.starts_with(cursor, b"*/") {
2548 return Some(cursor + 2);
2549 }
2550 cursor = self.next_offset(cursor);
2551 }
2552 None
2553 }
2554
2555 fn is_proc_start(&self, start: usize) -> bool {
2556 if self.peek_keyword_end(start, b"proc").is_some() {
2557 return true;
2558 }
2559 let Some(after_global) = self.peek_keyword_end(start, b"global") else {
2560 return false;
2561 };
2562 let after_global = self.skip_trivia_peek(after_global);
2563 self.peek_keyword_end(after_global, b"proc").is_some()
2564 }
2565
2566 fn peek_keyword_end(&self, start: usize, keyword: &[u8]) -> Option<usize> {
2567 let cursor = self.skip_trivia_peek(start);
2568 if !self.bytes.get(cursor..)?.starts_with(keyword) {
2569 return None;
2570 }
2571 let end = cursor + keyword.len();
2572 if self.byte_at(end).is_some_and(is_word_continue_byte) {
2573 return None;
2574 }
2575 Some(end)
2576 }
2577
2578 fn consume_keyword(&mut self, start: usize, keyword: &[u8]) -> Option<usize> {
2579 let cursor = self.skip_trivia(start);
2580 if !self.bytes.get(cursor..)?.starts_with(keyword) {
2581 return None;
2582 }
2583 let end = cursor + keyword.len();
2584 if self.byte_at(end).is_some_and(is_word_continue_byte) {
2585 return None;
2586 }
2587 if !self.record_token(cursor, end) {
2588 return None;
2589 }
2590 Some(end)
2591 }
2592
2593 fn starts_with(&self, start: usize, needle: &[u8]) -> bool {
2594 self.bytes
2595 .get(start..)
2596 .is_some_and(|bytes| bytes.starts_with(needle))
2597 }
2598
    /// Returns the byte at `offset`, or `None` past the end of input.
    fn byte_at(&self, offset: usize) -> Option<u8> {
        self.bytes.get(offset).copied()
    }
2602
    /// Alias for [`Self::byte_at`], kept for call-site readability.
    fn peek_byte(&self, offset: usize) -> Option<u8> {
        self.byte_at(offset)
    }
2606
2607 fn next_offset(&self, offset: usize) -> usize {
2608 if offset >= self.bytes.len() {
2609 return self.bytes.len();
2610 }
2611 let first = self.bytes[offset];
2612 if first < 0x80 {
2613 return offset + 1;
2614 }
2615 let len = match self.encoding {
2616 SourceEncoding::Utf8 => utf8_sequence_len(first, &self.bytes[offset..]),
2617 SourceEncoding::Cp932 => multibyte_len_cp932(first, self.byte_at(offset + 1)),
2618 SourceEncoding::Gbk => multibyte_len_gbk(first, self.byte_at(offset + 1)),
2619 };
2620 offset.saturating_add(len).min(self.bytes.len())
2621 }
2622
    /// True once any budget limit has tripped; scanners poll this to bail
    /// out of their loops early.
    fn is_halted(&self) -> bool {
        self.budget.halted
    }
2626
2627 fn halt(&mut self, error: ParseError) {
2628 if self.reported_budget_error {
2629 return;
2630 }
2631 self.reported_budget_error = true;
2632 self.budget.halted = true;
2633 self.errors.push(error);
2634 }
2635
2636 fn record_token(&mut self, start: usize, end: usize) -> bool {
2637 let range = text_range(start as u32, end as u32);
2638 if !self.budget.record_token() {
2639 self.halt(budget_error("max_tokens", range));
2640 return false;
2641 }
2642 true
2643 }
2644
2645 fn record_statement(&mut self, range: TextRange) -> bool {
2646 if !self.budget.record_statement() {
2647 self.halt(budget_error("max_statements", range));
2648 return false;
2649 }
2650 true
2651 }
2652
2653 fn enter_nesting(&mut self, start: usize, end: usize) -> bool {
2654 let range = text_range(start as u32, end as u32);
2655 if !self.budget.enter_nesting() {
2656 self.halt(budget_error("max_nesting_depth", range));
2657 return false;
2658 }
2659 true
2660 }
2661
    /// Returns one nesting level to the budget (saturates at the configured
    /// maximum inside the tracker).
    fn exit_nesting(&mut self) {
        self.budget.exit_nesting();
    }
2665
2666 fn check_literal(&mut self, range: TextRange) -> bool {
2667 if !self.budget.check_literal(usize::from(range.len())) {
2668 self.halt(budget_error("max_literal_bytes", range));
2669 return false;
2670 }
2671 true
2672 }
2673}
2674
2675#[derive(Debug, Clone, Copy)]
2676struct LightBudgetTracker {
2677 max_nesting_depth: usize,
2678 max_literal_bytes: usize,
2679 remaining_tokens: usize,
2680 remaining_statements: usize,
2681 remaining_nesting: usize,
2682 halted: bool,
2683}
2684
2685impl LightBudgetTracker {
2686 fn new(budgets: ParseBudgets) -> Self {
2687 Self {
2688 max_nesting_depth: budgets.max_nesting_depth,
2689 max_literal_bytes: budgets.max_literal_bytes,
2690 remaining_tokens: budgets.max_tokens,
2691 remaining_statements: budgets.max_statements,
2692 remaining_nesting: budgets.max_nesting_depth,
2693 halted: false,
2694 }
2695 }
2696
2697 fn record_token(&mut self) -> bool {
2698 if self.remaining_tokens == 0 {
2699 self.halted = true;
2700 return false;
2701 }
2702 self.remaining_tokens -= 1;
2703 true
2704 }
2705
2706 fn record_statement(&mut self) -> bool {
2707 if self.remaining_statements == 0 {
2708 self.halted = true;
2709 return false;
2710 }
2711 self.remaining_statements -= 1;
2712 true
2713 }
2714
2715 fn enter_nesting(&mut self) -> bool {
2716 if self.remaining_nesting == 0 {
2717 self.halted = true;
2718 return false;
2719 }
2720 self.remaining_nesting -= 1;
2721 true
2722 }
2723
2724 fn exit_nesting(&mut self) {
2725 if self.remaining_nesting < self.max_nesting_depth {
2726 self.remaining_nesting += 1;
2727 }
2728 }
2729
2730 fn check_literal(&mut self, len: usize) -> bool {
2731 if len > self.max_literal_bytes {
2732 self.halted = true;
2733 return false;
2734 }
2735 true
2736 }
2737}
2738
2739fn start_range(item: &LightItem) -> TextRange {
2740 match item {
2741 LightItem::Command(command) => command.span,
2742 LightItem::Proc(proc_def) => proc_def.span,
2743 LightItem::Other { span } => *span,
2744 }
2745}
2746
2747fn max_bytes_error_for_text(len: usize, budgets: ParseBudgets) -> Option<ParseError> {
2748 (len > budgets.max_bytes).then(|| budget_error("max_bytes", text_len_range(len)))
2749}
2750
2751fn max_bytes_error_for_bytes(len: usize, budgets: ParseBudgets) -> Option<ParseError> {
2752 (len > budgets.max_bytes).then(|| budget_error("max_bytes", text_range(0, 0)))
2753}
2754
2755fn max_bytes_error_for_file(path: &Path, budgets: ParseBudgets) -> io::Result<Option<ParseError>> {
2756 match fs::metadata(path) {
2757 Ok(metadata) if metadata.len() > budgets.max_bytes as u64 => {
2758 Ok(Some(budget_error("max_bytes", text_range(0, 0))))
2759 }
2760 Ok(_) => Ok(None),
2761 Err(error) if error.kind() == io::ErrorKind::NotFound => Err(error),
2762 Err(_) => Ok(None),
2763 }
2764}
2765
2766fn empty_light_scan_report(error: ParseError) -> LightScanReport {
2767 LightScanReport {
2768 source_text: String::new(),
2769 source_map: SourceMap::identity(0),
2770 source_encoding: SourceEncoding::Utf8,
2771 decode_errors: Vec::new(),
2772 errors: vec![error],
2773 }
2774}
2775
2776fn empty_shared_light_scan_report(error: ParseError) -> SharedLightScanReport {
2777 SharedLightScanReport {
2778 source_text: Arc::from(""),
2779 source_map: SourceMap::identity(0),
2780 source_encoding: SourceEncoding::Utf8,
2781 decode_errors: Vec::new(),
2782 errors: vec![error],
2783 }
2784}
2785
/// True for characters that may continue an identifier-like word:
/// ASCII letters, digits, `_`, or `$`.
fn is_word_continue(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '$')
}
2789
/// Byte-level twin of `is_word_continue`: ASCII letters, digits, `_`, `$`.
fn is_word_continue_byte(ch: u8) -> bool {
    matches!(ch, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'$')
}
2793
/// True for the ASCII whitespace bytes recognized by the lightweight
/// scanner: space, tab, LF, CR, and form feed (0x0C).
///
/// This set is exactly what [`u8::is_ascii_whitespace`] matches, so we
/// defer to the standard-library predicate instead of hand-rolling the
/// byte list.
fn is_ascii_ws(ch: u8) -> bool {
    ch.is_ascii_whitespace()
}
2797
/// True when `head` is a MEL keyword or type name that can never start a
/// command statement (so the caller should parse a structured item instead).
fn is_non_command_head(head: &str) -> bool {
    const NON_COMMAND_HEADS: [&str; 15] = [
        "global", "proc", "if", "while", "do", "for", "switch", "return",
        "break", "continue", "int", "float", "string", "vector", "matrix",
    ];
    NON_COMMAND_HEADS.contains(&head)
}
2818
/// Byte-slice twin of `is_non_command_head`, for undecoded source bytes.
fn is_non_command_head_bytes(head: &[u8]) -> bool {
    const NON_COMMAND_HEADS: [&[u8]; 15] = [
        b"global", b"proc", b"if", b"while", b"do", b"for", b"switch", b"return",
        b"break", b"continue", b"int", b"float", b"string", b"vector", b"matrix",
    ];
    NON_COMMAND_HEADS.contains(&head)
}
2839
/// Heuristic numeric-literal test: an optional `+`/`-` sign followed by an
/// ASCII digit, or by `.` with a digit immediately after it.
fn looks_numeric_like(text: &str) -> bool {
    let mut chars = text.chars().peekable();
    // At most one leading sign is skipped.
    if matches!(chars.peek(), Some('+' | '-')) {
        chars.next();
    }
    match chars.next() {
        Some(ch) if ch.is_ascii_digit() => true,
        Some('.') => chars.next().is_some_and(|ch| ch.is_ascii_digit()),
        _ => false,
    }
}
2852
/// Byte-slice twin of `looks_numeric_like`: optional `+`/`-` sign, then an
/// ASCII digit, or `.` followed immediately by a digit.
fn looks_numeric_like_bytes(text: &[u8]) -> bool {
    // At most one leading sign is skipped.
    let rest = match text.first() {
        Some(b'+' | b'-') => &text[1..],
        _ => text,
    };
    match rest {
        [first, ..] if first.is_ascii_digit() => true,
        [b'.', second, ..] => second.is_ascii_digit(),
        _ => false,
    }
}
2864
/// Returns the length in bytes of the UTF-8 sequence starting with `first`
/// at the head of `bytes`, or 1 when the sequence is truncated, has invalid
/// continuation bytes, or `first` is not a multibyte lead byte. This keeps
/// the scanner advancing byte-by-byte over malformed input.
fn utf8_sequence_len(first: u8, bytes: &[u8]) -> usize {
    let expected = match first {
        b if b & 0b1110_0000 == 0b1100_0000 => 2,
        b if b & 0b1111_0000 == 0b1110_0000 => 3,
        b if b & 0b1111_1000 == 0b1111_0000 => 4,
        // ASCII or a stray continuation byte: single byte.
        _ => return 1,
    };
    match bytes.get(1..expected) {
        Some(tail) if tail.iter().all(|byte| byte & 0b1100_0000 == 0b1000_0000) => expected,
        _ => 1,
    }
}
2885
/// Character width under CP932 (Shift-JIS): 2 when `first` is a lead byte
/// (0x81–0x9F or 0xE0–0xFC) and `second` is a valid trail byte
/// (0x40–0x7E or 0x80–0xFC), otherwise 1.
fn multibyte_len_cp932(first: u8, second: Option<u8>) -> usize {
    if !matches!(first, 0x81..=0x9F | 0xE0..=0xFC) {
        return 1;
    }
    match second {
        Some(0x40..=0x7E | 0x80..=0xFC) => 2,
        _ => 1,
    }
}
2891
/// Character width under GBK: 2 when `first` is a lead byte (0x81–0xFE)
/// and `second` is a valid trail byte (0x40–0x7E or 0x80–0xFE), otherwise 1.
fn multibyte_len_gbk(first: u8, second: Option<u8>) -> usize {
    if !matches!(first, 0x81..=0xFE) {
        return 1;
    }
    match second {
        Some(0x40..=0x7E | 0x80..=0xFE) => 2,
        _ => 1,
    }
}
2897
#[cfg(test)]
mod tests {
    use super::parse_light_source;
    use mel_syntax::text_range;

    /// The lightweight parser must still surface a diagnostic when the
    /// input ends inside a `/* ... */` block comment.
    #[test]
    fn unterminated_block_comment_reports_light_parse_error() {
        let parse = parse_light_source("createNode file -n \"f\";\n/* hidden tail");

        // The statement before the comment still parses as one item.
        assert_eq!(parse.source.items.len(), 1);
        // Exactly one error: the unterminated comment, spanning from the
        // `/*` at offset 24 to the end of the input at offset 38.
        assert_eq!(parse.errors.len(), 1);
        assert_eq!(parse.errors[0].message, "unterminated block comment");
        assert_eq!(parse.errors[0].range, text_range(24, 38));
    }
}