Skip to main content

parse_that/span_parser/
mod.rs

1use regex::Regex;
2use std::sync::Arc;
3
4use crate::leaf::{trim_leading_whitespace, trim_leading_whitespace_mut};
5use crate::parse::ParserFn;
6use crate::state::{ParserState, Span};
7
8use aho_corasick::{AhoCorasick, Anchored, Input};
9
10// ── Flags (same values as Parser flags) ───────────────────────
11
12const FLAG_TRIM_WS: u8 = 0b0001;
13const FLAG_SAVE_STATE: u8 = 0b0010;
14
15// ── SpanParser: enum-dispatched, zero-boxing for Span hot path ─
16
/// Helper macro for constructing SpanParser with conditional label field.
///
/// This is the `diagnostics` build of the macro: the resulting `SpanParser`
/// carries a `label` field. Both arms start with `flags: 0`, i.e. no
/// whitespace trimming and no state checkpointing.
#[cfg(feature = "diagnostics")]
macro_rules! sp_new {
    // Labelled form: `sp_new!(kind, label)` stores `Some(label)`.
    ($kind:expr, $label:expr) => {
        SpanParser {
            kind: $kind,
            flags: 0,
            label: Some($label),
        }
    };
    // Unlabelled form: `sp_new!(kind)` stores `label: None`.
    ($kind:expr) => {
        SpanParser {
            kind: $kind,
            flags: 0,
            label: None,
        }
    };
}
35
/// Non-`diagnostics` build of `sp_new!`: `SpanParser` has no `label` field
/// in this configuration, so both arms expand to the same struct literal.
#[cfg(not(feature = "diagnostics"))]
macro_rules! sp_new {
    // The label argument is accepted so call sites compile unchanged under
    // either feature setting, but `$label` does not appear in the expansion.
    ($kind:expr, $label:expr) => {
        SpanParser {
            kind: $kind,
            flags: 0,
        }
    };
    // Unlabelled form.
    ($kind:expr) => {
        SpanParser {
            kind: $kind,
            flags: 0,
        }
    };
}
51
/// A parser that yields a `Span` over the source rather than a built value.
///
/// The behaviour is selected by `kind` (an enum, so dispatch is a `match`
/// rather than a boxed closure call, except for `SpanKind::Boxed`).
pub struct SpanParser<'a> {
    /// Which parsing strategy this parser runs.
    pub(super) kind: SpanKind<'a>,
    /// Bit set of `FLAG_TRIM_WS` / `FLAG_SAVE_STATE`; `0` means neither.
    pub(super) flags: u8,
    /// Optional name passed to `state.add_expected` when the parser fails.
    /// Only present when the `diagnostics` feature is enabled.
    #[cfg(feature = "diagnostics")]
    pub(super) label: Option<&'static str>,
}
58
/// The closed set of strategies a `SpanParser` can execute.
///
/// Each variant is interpreted by `SpanParser::call_inner`.
pub(super) enum SpanKind<'a> {
    // === Leaves (no inner parser, no vtable) ===
    /// Matches exactly these bytes at the current offset. An empty literal
    /// succeeds without consuming anything.
    StringLiteral(&'static [u8]),
    /// Matches the regex against the remaining input; only a match that
    /// starts at the current offset counts as success.
    RegexMatch(Arc<Regex>),
    /// Anchored Aho-Corasick search at the current offset.
    AhoCorasickMatch(AhoCorasick),
    /// Consumes bytes while the predicate returns `true`; fails if the very
    /// first byte is rejected (or there is no input left).
    TakeWhileByte(fn(u8) -> bool),
    /// Consumes `char`s while the predicate returns `true`; fails if no
    /// character is accepted.
    TakeWhileChar(Box<dyn Fn(char) -> bool + 'a>),
    /// Consumes exactly this many bytes; fails if fewer remain.
    NextN(usize),
    /// Always succeeds with an empty span, consuming nothing.
    Epsilon,
    /// Fast path for negated byte classes with one excluded byte.
    TakeUntilAny1(u8),
    /// Fast path for negated byte classes with two excluded bytes.
    TakeUntilAny2(u8, u8),
    /// Fast path for negated byte classes with three excluded bytes.
    TakeUntilAny3(u8, u8, u8),
    /// LUT-based byte scanner for negated character classes (`[^...]+`) when
    /// the excluded set is larger than three bytes.
    ///
    /// `lut[b] == true` marks `b` as a byte that stops the scan.
    TakeUntilAnyLut(Box<[bool; 256]>),
    /// SIMD nibble-LUT byte scanner for negated character classes when the
    /// excluded set has 4–8 unique bytes. Two 16-byte LUTs classify 16 bytes/cycle
    /// via `swizzle_dyn` (vpshufb / tbl).
    ///
    /// A byte `b` stops the scan when
    /// `lo_lut[b & 0x0F] & hi_lut[b >> 4] != 0`.
    TakeUntilAnySIMD {
        /// Table indexed by the low nibble of the input byte.
        lo_lut: [u8; 16],
        /// Table indexed by the high nibble of the input byte.
        hi_lut: [u8; 16],
    },

    // === Domain-specific monolithic scanners (JSON, CSS, etc.) ===
    /// Delegates to `SpanScanner::call`.
    Scanner(SpanScanner),

    // === Flat combinators (no nesting depth) ===
    /// Runs every parser in order; fails as soon as one fails. The resulting
    /// span runs from the starting offset to the final offset.
    Seq(Vec<SpanParser<'a>>),
    /// Tries each parser in order and returns the first success, rewinding
    /// the offset after each failed alternative.
    OneOf(Vec<SpanParser<'a>>),
    /// Repeats `inner` at most `hi` times; succeeds when at least `lo`
    /// repetitions matched. Stops early on a zero-length match.
    Many {
        inner: Box<SpanParser<'a>>,
        lo: usize,
        hi: usize,
    },
    /// `inner (sep inner)*` with at most `hi` elements and at least `lo`.
    /// A separator that is not followed by an element is not consumed.
    SepBy {
        inner: Box<SpanParser<'a>>,
        sep: Box<SpanParser<'a>>,
        lo: usize,
        hi: usize,
    },
    /// Fused sep_by + whitespace trimming: single trim between each step,
    /// no redundant double-trims from nested trim_whitespace wrappers.
    SepByWs {
        inner: Box<SpanParser<'a>>,
        sep: Box<SpanParser<'a>>,
        lo: usize,
        hi: usize,
    },
    /// Optional parser: always succeeds; yields an empty span when the inner
    /// parser fails.
    Opt(Box<SpanParser<'a>>),
    /// `left inner right`; the resulting span covers only what `inner`
    /// matched.
    Wrap {
        left: Box<SpanParser<'a>>,
        inner: Box<SpanParser<'a>>,
        right: Box<SpanParser<'a>>,
    },
    /// Runs both parsers in order and keeps the span of the first.
    Skip(Box<SpanParser<'a>>, Box<SpanParser<'a>>),
    /// Runs both parsers in order and keeps the span of the second.
    Next(Box<SpanParser<'a>>, Box<SpanParser<'a>>),
    /// Matches the first parser, then requires that the second does NOT match
    /// at the following position. The second parser never consumes input.
    Not(Box<SpanParser<'a>>, Box<SpanParser<'a>>),
    /// Set difference: match main only if excluded would NOT match at the same
    /// starting position. Used for EBNF/BNF exception (`-`) semantics.
    Minus(Box<SpanParser<'a>>, Box<SpanParser<'a>>),
    /// Matches the first parser, then requires that the second DOES match at
    /// the following position, without consuming what the second matched.
    LookAhead(Box<SpanParser<'a>>, Box<SpanParser<'a>>),
    /// Zero-width negative assertion: succeeds (empty Span) when inner fails.
    Negate(Box<SpanParser<'a>>),
    /// Zero-width positive assertion: succeeds with inner's Span but does NOT
    /// consume input. The dual of `Negate`.
    Peek(Box<SpanParser<'a>>),
    /// End-of-input check: succeeds (empty Span) if at end of source.
    Eof,

    // === Escape hatch ===
    /// Arbitrary boxed parser function; the only variant that goes through
    /// dynamic dispatch.
    Boxed(Box<dyn ParserFn<'a, Span<'a>> + 'a>),
}
134
135impl<'a> SpanParser<'a> {
136    // ── Core dispatch ─────────────────────────────────────────
137
138    #[inline(always)]
139    pub fn call(&self, state: &mut ParserState<'a>) -> Option<Span<'a>> {
140        if self.flags == 0 {
141            return self.call_inner(state);
142        }
143        // Fast path: trim_ws only (most common flag combination)
144        if self.flags == FLAG_TRIM_WS {
145            state.offset += trim_leading_whitespace(state);
146            let result = self.call_inner(state);
147            if result.is_some() {
148                state.offset += trim_leading_whitespace(state);
149            }
150            return result;
151        }
152        self.call_with_flags_cold(state)
153    }
154
155    #[inline(never)]
156    fn call_with_flags_cold(&self, state: &mut ParserState<'a>) -> Option<Span<'a>> {
157        if self.flags & FLAG_TRIM_WS != 0 {
158            state.offset += trim_leading_whitespace(state);
159        }
160        let checkpoint = if self.flags & FLAG_SAVE_STATE != 0 {
161            Some(state.offset)
162        } else {
163            None
164        };
165
166        let result = self.call_inner(state);
167
168        if let Some(cp) = checkpoint {
169            if result.is_none() {
170                state.furthest_offset = state.furthest_offset.max(state.offset);
171                state.offset = cp;
172                return None;
173            }
174        }
175        // Skip post-trim on failure
176        if result.is_some() && self.flags & FLAG_TRIM_WS != 0 {
177            state.offset += trim_leading_whitespace(state);
178        }
179        result
180    }
181
182    #[inline(always)]
183    fn call_inner(&self, state: &mut ParserState<'a>) -> Option<Span<'a>> {
184        match &self.kind {
185            SpanKind::StringLiteral(s_bytes) => {
186                let end = s_bytes.len();
187                if end == 0 {
188                    return Some(Span::new(state.offset, state.offset, state.src));
189                }
190                let slc = state.src_bytes.get(state.offset..)?;
191                if slc.len() >= end
192                    && slc[0] == s_bytes[0]
193                    && (end == 1 || slc[1..end].starts_with(&s_bytes[1..]))
194                {
195                    let start = state.offset;
196                    state.offset += end;
197                    Some(Span::new(start, state.offset, state.src))
198                } else {
199                    #[cfg(feature = "diagnostics")]
200                    if let Some(lbl) = self.label {
201                        state.add_expected(lbl);
202                    }
203                    None
204                }
205            }
206
207            SpanKind::RegexMatch(re) => {
208                let slc = state.src.get(state.offset..)?;
209                match re.find_at(slc, 0) {
210                    Some(m) if m.start() == 0 => {
211                        let start = state.offset;
212                        state.offset += m.end();
213                        Some(Span::new(start, state.offset, state.src))
214                    }
215                    _ => {
216                        #[cfg(feature = "diagnostics")]
217                        if let Some(lbl) = self.label {
218                            state.add_expected(lbl);
219                        }
220                        None
221                    }
222                }
223            }
224
225            SpanKind::AhoCorasickMatch(ac) => {
226                let slc = state.src.get(state.offset..)?;
227                let input = Input::new(slc).anchored(Anchored::Yes);
228                match ac.find(input) {
229                    Some(m) => {
230                        let start = state.offset;
231                        state.offset += m.end();
232                        Some(Span::new(start, state.offset, state.src))
233                    }
234                    None => {
235                        #[cfg(feature = "diagnostics")]
236                        if let Some(lbl) = self.label {
237                            state.add_expected(lbl);
238                        }
239                        None
240                    }
241                }
242            }
243
244            SpanKind::TakeWhileByte(f) => {
245                let bytes = state.src_bytes;
246                let start = state.offset;
247                let end = bytes.len();
248                let mut i = start;
249                while i < end && f(unsafe { *bytes.get_unchecked(i) }) {
250                    i += 1;
251                }
252                if i == start {
253                    #[cfg(feature = "diagnostics")]
254                    if let Some(lbl) = self.label {
255                        state.add_expected(lbl);
256                    }
257                    return None;
258                }
259                state.offset = i;
260                Some(Span::new(start, i, state.src))
261            }
262
263            SpanKind::TakeWhileChar(f) => {
264                let slc = state.src.get(state.offset..)?;
265                match slc
266                    .char_indices()
267                    .take_while(|(_, c)| f(*c))
268                    .map(|(i, _)| i)
269                    .last()
270                {
271                    Some(mut len) => {
272                        len += 1;
273                        while len < slc.len() && !slc.is_char_boundary(len) {
274                            len += 1;
275                        }
276                        let start = state.offset;
277                        state.offset += len;
278                        Some(Span::new(start, state.offset, state.src))
279                    }
280                    None => {
281                        #[cfg(feature = "diagnostics")]
282                        if let Some(lbl) = self.label {
283                            state.add_expected(lbl);
284                        }
285                        None
286                    }
287                }
288            }
289
290            SpanKind::NextN(amount) => {
291                let start = state.offset;
292                let new_offset = start + amount;
293                if new_offset > state.src.len() {
294                    #[cfg(feature = "diagnostics")]
295                    if let Some(lbl) = self.label {
296                        state.add_expected(lbl);
297                    }
298                    return None;
299                }
300                state.offset = new_offset;
301                Some(Span::new(start, new_offset, state.src))
302            }
303
304            SpanKind::Epsilon => Some(Span::new(state.offset, state.offset, state.src)),
305
306            // Domain-specific scanners delegate to SpanScanner dispatch
307            SpanKind::Scanner(scanner) => {
308                let result = scanner.call(state);
309                #[cfg(feature = "diagnostics")]
310                if result.is_none() {
311                    if let Some(lbl) = self.label {
312                        state.add_expected(lbl);
313                    }
314                }
315                result
316            }
317
318            SpanKind::TakeUntilAny1(b1) => {
319                let bytes = state.src_bytes;
320                let start = state.offset;
321                if start >= bytes.len() {
322                    #[cfg(feature = "diagnostics")]
323                    if let Some(lbl) = self.label {
324                        state.add_expected(lbl);
325                    }
326                    return None;
327                }
328                let scan_len = memchr::memchr(*b1, &bytes[start..]).unwrap_or(bytes.len() - start);
329                if scan_len == 0 {
330                    #[cfg(feature = "diagnostics")]
331                    if let Some(lbl) = self.label {
332                        state.add_expected(lbl);
333                    }
334                    return None;
335                }
336                let end = start + scan_len;
337                state.offset = end;
338                Some(Span::new(start, end, state.src))
339            }
340            SpanKind::TakeUntilAny2(b1, b2) => {
341                let bytes = state.src_bytes;
342                let start = state.offset;
343                if start >= bytes.len() {
344                    #[cfg(feature = "diagnostics")]
345                    if let Some(lbl) = self.label {
346                        state.add_expected(lbl);
347                    }
348                    return None;
349                }
350                let scan_len =
351                    memchr::memchr2(*b1, *b2, &bytes[start..]).unwrap_or(bytes.len() - start);
352                if scan_len == 0 {
353                    #[cfg(feature = "diagnostics")]
354                    if let Some(lbl) = self.label {
355                        state.add_expected(lbl);
356                    }
357                    return None;
358                }
359                let end = start + scan_len;
360                state.offset = end;
361                Some(Span::new(start, end, state.src))
362            }
363            SpanKind::TakeUntilAny3(b1, b2, b3) => {
364                let bytes = state.src_bytes;
365                let start = state.offset;
366                if start >= bytes.len() {
367                    #[cfg(feature = "diagnostics")]
368                    if let Some(lbl) = self.label {
369                        state.add_expected(lbl);
370                    }
371                    return None;
372                }
373                let scan_len =
374                    memchr::memchr3(*b1, *b2, *b3, &bytes[start..]).unwrap_or(bytes.len() - start);
375                if scan_len == 0 {
376                    #[cfg(feature = "diagnostics")]
377                    if let Some(lbl) = self.label {
378                        state.add_expected(lbl);
379                    }
380                    return None;
381                }
382                let end = start + scan_len;
383                state.offset = end;
384                Some(Span::new(start, end, state.src))
385            }
386            SpanKind::TakeUntilAnyLut(lut) => {
387                let bytes = state.src_bytes;
388                let start = state.offset;
389                let end = bytes.len();
390                let mut i = start;
391                while i < end && !lut[unsafe { *bytes.get_unchecked(i) } as usize] {
392                    i += 1;
393                }
394                if i == start {
395                    #[cfg(feature = "diagnostics")]
396                    if let Some(lbl) = self.label {
397                        state.add_expected(lbl);
398                    }
399                    return None;
400                }
401                state.offset = i;
402                Some(Span::new(start, i, state.src))
403            }
404            SpanKind::TakeUntilAnySIMD { lo_lut, hi_lut } => {
405                use std::simd::prelude::*;
406
407                let lo = u8x16::from_array(*lo_lut);
408                let hi = u8x16::from_array(*hi_lut);
409                let lo_mask_const = u8x16::splat(0x0F);
410
411                let bytes = state.src_bytes;
412                let start = state.offset;
413                let end = bytes.len();
414                let mut i = start;
415
416                // SIMD: classify 16 bytes at a time
417                while i + 16 <= end {
418                    let chunk = u8x16::from_slice(&bytes[i..i + 16]);
419                    let lo_nibbles = chunk & lo_mask_const;
420                    let hi_nibbles = chunk >> 4;
421
422                    let lo_result = lo.swizzle_dyn(lo_nibbles);
423                    let hi_result = hi.swizzle_dyn(hi_nibbles);
424                    let matched = lo_result & hi_result;
425
426                    let is_excluded = matched.simd_ne(u8x16::splat(0));
427                    if !is_excluded.any() {
428                        i += 16;
429                        continue;
430                    }
431                    i += is_excluded.to_bitmask().trailing_zeros() as usize;
432                    // Found an excluded byte — break to return result
433                    if i == start {
434                        #[cfg(feature = "diagnostics")]
435                        if let Some(lbl) = self.label {
436                            state.add_expected(lbl);
437                        }
438                        return None;
439                    }
440                    state.offset = i;
441                    return Some(Span::new(start, i, state.src));
442                }
443
444                // Scalar tail: use nibble LUTs for remaining bytes
445                while i < end {
446                    let b = unsafe { *bytes.get_unchecked(i) };
447                    if lo_lut[(b & 0x0F) as usize] & hi_lut[(b >> 4) as usize] != 0 {
448                        break;
449                    }
450                    i += 1;
451                }
452
453                if i == start {
454                    #[cfg(feature = "diagnostics")]
455                    if let Some(lbl) = self.label {
456                        state.add_expected(lbl);
457                    }
458                    return None;
459                }
460                state.offset = i;
461                Some(Span::new(start, i, state.src))
462            }
463
464            SpanKind::Seq(parsers) => {
465                let start = state.offset;
466                for p in parsers {
467                    p.call(state)?;
468                }
469                Some(Span::new(start, state.offset, state.src))
470            }
471
472            SpanKind::OneOf(parsers) => {
473                for p in parsers {
474                    let cp = state.offset;
475                    if let Some(span) = p.call(state) {
476                        return Some(span);
477                    }
478                    state.furthest_offset = state.furthest_offset.max(state.offset);
479                    state.offset = cp;
480                }
481                None
482            }
483
484            SpanKind::Many { inner, lo, hi } => {
485                let start = state.offset;
486                let mut end = state.offset;
487                let mut count = 0;
488                while count < *hi {
489                    let prev_offset = state.offset;
490                    match inner.call(state) {
491                        Some(span) => {
492                            end = span.end;
493                            count += 1;
494                            // Guard: break on zero-length match to prevent infinite loops.
495                            if state.offset == prev_offset {
496                                break;
497                            }
498                        }
499                        None => {
500                            state.offset = prev_offset;
501                            break;
502                        }
503                    }
504                }
505                if count >= *lo {
506                    Some(Span::new(start, end, state.src))
507                } else {
508                    None
509                }
510            }
511
512            SpanKind::SepBy { inner, sep, lo, hi } => {
513                let start = state.offset;
514                let mut count = 0;
515                // Parse first element
516                let Some(first_span) = inner.call(state) else {
517                    if *lo == 0 {
518                        return Some(Span::new(start, start, state.src));
519                    }
520                    return None;
521                };
522                let mut end = first_span.end;
523                count += 1;
524                // Parse (sep elem)* — checkpoint before separator to reject
525                // trailing separators.
526                while count < *hi {
527                    let cp = state.offset;
528                    if sep.call(state).is_none() {
529                        state.offset = cp;
530                        break;
531                    }
532                    if let Some(span) = inner.call(state) {
533                        end = span.end;
534                        count += 1;
535                    } else {
536                        // Element after separator failed — backtrack past
537                        // the separator (reject trailing sep).
538                        state.offset = cp;
539                        break;
540                    }
541                }
542                if count >= *lo {
543                    Some(Span::new(start, end, state.src))
544                } else {
545                    None
546                }
547            }
548
549            SpanKind::SepByWs { inner, sep, lo, hi } => {
550                let start = state.offset;
551                let mut count = 0;
552                // Pre-trim before first element
553                trim_leading_whitespace_mut(state);
554                // Parse first element
555                if inner.call(state).is_none() {
556                    if *lo == 0 {
557                        return Some(Span::new(start, state.offset, state.src));
558                    }
559                    return None;
560                }
561                count += 1;
562                while count < *hi {
563                    let cp = state.offset;
564                    // Trim before separator
565                    trim_leading_whitespace_mut(state);
566                    if sep.call(state).is_none() {
567                        state.offset = cp;
568                        break;
569                    }
570                    // Trim before next element
571                    trim_leading_whitespace_mut(state);
572                    if inner.call(state).is_some() {
573                        count += 1;
574                    } else {
575                        state.offset = cp;
576                        break;
577                    }
578                }
579                if count >= *lo {
580                    // Post-trim after the last element
581                    trim_leading_whitespace_mut(state);
582                    Some(Span::new(start, state.offset, state.src))
583                } else {
584                    None
585                }
586            }
587
588            SpanKind::Opt(inner) => {
589                let start = state.offset;
590                if inner.call(state).is_none() {
591                    return Some(Span::new(start, start, state.src));
592                }
593                Some(Span::new(start, state.offset, state.src))
594            }
595
596            SpanKind::Wrap { left, inner, right } => {
597                #[cfg(feature = "diagnostics")]
598                let open_offset = state.offset;
599                left.call(state)?;
600                #[cfg(feature = "diagnostics")]
601                let open_end = state.offset;
602                let middle = inner.call(state)?;
603                if right.call(state).is_some() {
604                    Some(Span::new(middle.start, middle.end, state.src))
605                } else {
606                    #[cfg(feature = "diagnostics")]
607                    {
608                        let delimiter = state.src[open_offset..open_end].to_string();
609                        state.add_suggestion(|| crate::state::Suggestion {
610                            kind: crate::state::SuggestionKind::UnclosedDelimiter {
611                                delimiter: delimiter.clone(),
612                                open_offset,
613                            },
614                            message: format!(
615                                "close the delimiter with matching `{}`",
616                                match delimiter.as_str() {
617                                    "{" => "}",
618                                    "[" => "]",
619                                    "(" => ")",
620                                    d => d,
621                                }
622                            ),
623                        });
624                        state.add_secondary_span(
625                            open_offset,
626                            format!("unclosed `{}` opened here", delimiter),
627                        );
628                    }
629                    None
630                }
631            }
632
633            SpanKind::Skip(first, second) => {
634                let span = first.call(state)?;
635                second.call(state)?;
636                Some(span)
637            }
638
639            SpanKind::Next(first, second) => {
640                first.call(state)?;
641                second.call(state)
642            }
643
644            SpanKind::Not(main, negated) => {
645                let span = main.call(state)?;
646                let checkpoint = state.offset;
647                let saved_furthest = state.furthest_offset;
648                if negated.call(state).is_none() {
649                    state.offset = checkpoint;
650                    state.furthest_offset = saved_furthest;
651                    return Some(span);
652                }
653                state.offset = checkpoint;
654                state.furthest_offset = saved_furthest;
655                None
656            }
657
658            SpanKind::Minus(main, excluded) => {
659                let checkpoint = state.offset;
660                let saved_furthest = state.furthest_offset;
661                if excluded.call(state).is_some() {
662                    state.offset = checkpoint;
663                    state.furthest_offset = saved_furthest;
664                    return None;
665                }
666                state.offset = checkpoint;
667                state.furthest_offset = saved_furthest;
668                main.call(state)
669            }
670
671            SpanKind::LookAhead(main, lookahead) => {
672                let span = main.call(state)?;
673                let offset_after = state.offset;
674                let result = lookahead.call(state);
675                state.offset = offset_after;
676                result?;
677                Some(span)
678            }
679
680            SpanKind::Negate(inner) => {
681                let checkpoint = state.offset;
682                let saved_furthest = state.furthest_offset;
683                if inner.call(state).is_none() {
684                    state.offset = checkpoint;
685                    state.furthest_offset = saved_furthest;
686                    return Some(Span::new(checkpoint, checkpoint, state.src));
687                }
688                state.offset = checkpoint;
689                state.furthest_offset = saved_furthest;
690                None
691            }
692
693            SpanKind::Peek(inner) => {
694                let checkpoint = state.offset;
695                let saved_furthest = state.furthest_offset;
696                let span = inner.call(state)?;
697                state.offset = checkpoint;
698                state.furthest_offset = saved_furthest;
699                Some(span)
700            }
701
702            SpanKind::Eof => {
703                if state.is_at_end() {
704                    Some(Span::new(state.offset, state.offset, state.src))
705                } else {
706                    #[cfg(feature = "diagnostics")]
707                    if let Some(lbl) = self.label {
708                        state.add_expected(lbl);
709                    }
710                    None
711                }
712            }
713
714            SpanKind::Boxed(inner) => inner.call(state),
715        }
716    }
717
718}
719
720mod span_scanner;
721pub(super) use span_scanner::SpanScanner;
722
723mod methods;
724
725mod constructors;
726pub use constructors::*;