// wpl/ast/syntax/sep_pattern.rs

1use serde::{Deserialize, Deserializer, Serialize, Serializer};
2use smol_str::SmolStr;
3
4// ── Error formatting helpers ─────────────────────────────────────────
5
/// Build a user-friendly error message with a visual pointer to the problematic position.
///
/// * `raw` – the pattern content as written between `{` and `}`.
/// * `pos` – byte offset into `raw` of the offending character.
/// * `msg` – human-readable description of the problem.
///
/// The caret column is derived from the number of *characters* that start before `pos`,
/// not from the raw byte offset, so the pointer stays aligned when the pattern contains
/// multi-byte UTF-8 characters. A `pos` past the end of `raw` points just after the
/// last character.
///
/// Example output:
/// ```text
/// sep pattern error: at most one * allowed
///   {*a*}
///      ^
/// ```
fn fmt_err(raw: &str, pos: usize, msg: &str) -> String {
    let display = format!("{{{}}}", raw);
    // Count characters (not bytes) preceding `pos`; a byte offset would over-indent the
    // caret by one column for every continuation byte in front of it.
    let chars_before = raw
        .char_indices()
        .take_while(|&(idx, _)| idx < pos)
        .count();
    // +1 column for the leading `{` of the displayed pattern.
    let pointer_line = format!("{}^", " ".repeat(chars_before + 1));
    format!(
        "sep pattern error: {}\n  {}\n  {}",
        msg, display, pointer_line
    )
}
24
/// Format a position-less error for problems that concern the pattern as a whole.
///
/// The result has the shape `sep pattern error: <msg> in {<raw>}`.
fn fmt_err_no_pos(raw: &str, msg: &str) -> String {
    let mut out = String::from("sep pattern error: ");
    out.push_str(msg);
    out.push_str(" in {");
    out.push_str(raw);
    out.push('}');
    out
}
29
30// ── Data structures ──────────────────────────────────────────────────
31
/// Result of a successful pattern match.
///
/// All lengths are byte counts. For pure-literal patterns both fields equal the
/// literal's length; for glob patterns `matched` additionally includes the length of
/// the preserve tail, while `consumed` does not.
#[derive(Debug, Clone, PartialEq)]
pub struct SepMatch {
    /// Bytes consumed (not including preserve portion).
    pub consumed: usize,
    /// Total bytes matched (including preserve, for debugging).
    pub matched: usize,
}
40
/// A single segment inside a glob pattern.
///
/// A parsed pattern is a sequence of these segments, matched left to right.
#[derive(Debug, Clone, PartialEq)]
pub enum GlobSegment {
    /// Contiguous literal characters (escape sequences already resolved).
    Literal(SmolStr),
    /// `*` — zero or more arbitrary characters (non-greedy).
    Star,
    /// `?` — exactly one arbitrary character.
    Any,
    /// `\s` — one or more whitespace characters `[ \t\r\n]+`.
    Whitespace,
    /// `\S` — one or more non-whitespace characters `[^ \t\r\n]+`.
    NonWhitespace,
    /// `\h` — one or more horizontal whitespace `[ \t]+`.
    HorizontalWhitespace,
    /// `\H` — one or more non-horizontal-whitespace `[^ \t]+`.
    NonHorizontalWhitespace,
}
59
/// A compiled glob pattern with optional preserve tail.
#[derive(Debug, Clone, PartialEq)]
pub struct GlobPattern {
    /// Segments of the main body (everything before the trailing `(…)`, if any).
    pub segments: Vec<GlobSegment>,
    /// Segments of the trailing `(…)` group; `None` when the pattern has no such group.
    /// These must match after the main body but are not counted as consumed.
    pub preserve: Option<Vec<GlobSegment>>,
}
66
/// Compiled matcher – either a plain literal or a glob.
///
/// `build_pattern` selects `Literal` only when the pattern has no wildcard or
/// whitespace-class segment and no preserve group; everything else becomes `Glob`.
#[derive(Debug, Clone, PartialEq)]
pub enum SepMatcher {
    /// Pure literal, use `str::find` (internally memchr / two-way).
    Literal(SmolStr),
    /// Contains wildcards / whitespace macros.
    Glob(GlobPattern),
}
75
/// A compiled separator pattern built from `{…}` syntax.
#[derive(Debug, Clone, PartialEq)]
pub struct SepPattern {
    /// The pattern text exactly as passed to `build_pattern` (content inside the braces).
    pub(crate) raw: SmolStr,
    /// The matcher compiled from `raw`.
    pub(crate) compiled: SepMatcher,
}
82
83// ── build_pattern parser ─────────────────────────────────────────────
84
85/// Build a `SepPattern` from the raw content inside `{…}`.
86pub fn build_pattern(raw: &str) -> Result<SepPattern, String> {
87    if raw.is_empty() {
88        return Err("sep pattern error: pattern is empty, expected content inside {}".to_string());
89    }
90
91    // 1. Separate preserve portion: find un-escaped `(` … `)` at the very end.
92    let (main_raw, preserve_raw) = split_preserve(raw)?;
93
94    // 2. Parse main body segments.
95    let main_offset = 0;
96    let (segments, star_count) = parse_segments(raw, main_raw, main_offset)?;
97
98    // 3. Parse preserve segments (if any).
99    let preserve = if let Some(pr) = preserve_raw {
100        let preserve_offset = main_raw.len() + 1; // +1 for '('
101        let (psegs, _) = parse_segments(raw, pr, preserve_offset)?;
102        Some(psegs)
103    } else {
104        None
105    };
106
107    // 4. Validate star count.
108    if star_count > 1 {
109        // Find position of the second `*` for the error pointer.
110        let second_star_pos = find_nth_unescaped(raw, b'*', 2).unwrap_or(raw.len() - 1);
111        return Err(fmt_err(raw, second_star_pos, "at most one * allowed"));
112    }
113
114    // 5. Ensure non-empty after parsing.
115    if segments.is_empty() && preserve.as_ref().is_none_or(|p| p.is_empty()) {
116        return Err(fmt_err_no_pos(
117            raw,
118            "pattern resolves to empty after parsing",
119        ));
120    }
121
122    // 6. Choose matcher.
123    let has_wildcard = segments.iter().any(|s| {
124        matches!(
125            s,
126            GlobSegment::Star
127                | GlobSegment::Any
128                | GlobSegment::Whitespace
129                | GlobSegment::NonWhitespace
130                | GlobSegment::HorizontalWhitespace
131                | GlobSegment::NonHorizontalWhitespace
132        )
133    });
134    let compiled = if !has_wildcard && preserve.is_none() {
135        // Pure literal – collapse all Literal segments into one string.
136        let lit: String = segments
137            .iter()
138            .map(|s| match s {
139                GlobSegment::Literal(l) => l.as_str(),
140                _ => unreachable!(),
141            })
142            .collect();
143        SepMatcher::Literal(SmolStr::from(lit))
144    } else {
145        SepMatcher::Glob(GlobPattern { segments, preserve })
146    };
147
148    Ok(SepPattern {
149        raw: SmolStr::from(raw),
150        compiled,
151    })
152}
153
154/// Find the byte position of the n-th un-escaped occurrence of `target` in `s`.
155fn find_nth_unescaped(s: &str, target: u8, n: usize) -> Option<usize> {
156    let bytes = s.as_bytes();
157    let mut count = 0;
158    for i in 0..bytes.len() {
159        if bytes[i] == target && !is_escaped(bytes, i) {
160            count += 1;
161            if count == n {
162                return Some(i);
163            }
164        }
165    }
166    None
167}
168
169/// Split raw pattern into (main, Option<preserve>).
170/// `(…)` must be at the very end of the string and un-escaped.
171fn split_preserve(raw: &str) -> Result<(&str, Option<&str>), String> {
172    let bytes = raw.as_bytes();
173    let len = bytes.len();
174    if len == 0 || bytes[len - 1] != b')' {
175        return Ok((raw, None));
176    }
177    // Check the `)` is not escaped.
178    if is_escaped(bytes, len - 1) {
179        return Ok((raw, None));
180    }
181    // Walk backwards to find matching un-escaped `(`.
182    let mut depth = 0i32;
183    let mut open_pos = None;
184    let mut i = len;
185    while i > 0 {
186        i -= 1;
187        if bytes[i] == b')' && !is_escaped(bytes, i) {
188            depth += 1;
189        } else if bytes[i] == b'(' && !is_escaped(bytes, i) {
190            depth -= 1;
191            if depth == 0 {
192                open_pos = Some(i);
193                break;
194            }
195        }
196    }
197    let open = match open_pos {
198        Some(p) => p,
199        None => return Ok((raw, None)), // unbalanced – treat as literal
200    };
201
202    // Validate that `(` is at a valid position (nothing after `)` except end).
203    // The `)` is already the last byte, so we only need to check that nothing
204    // between the closing `)` position and end is unexpected. Since we matched
205    // the *last* `)`, this is already guaranteed.
206
207    // Also validate that there's no un-escaped `(` before `open` that also has
208    // a `)` – this would mean `()` is not at the end. Actually, the simplest
209    // check: there must be no un-escaped `(` in the main portion.
210    let main_part = &raw[..open];
211    {
212        let mb = main_part.as_bytes();
213        for j in 0..mb.len() {
214            if mb[j] == b'(' && !is_escaped(mb, j) {
215                return Err(fmt_err(
216                    raw,
217                    j,
218                    "(...) must appear only at the end; found earlier '(' here",
219                ));
220            }
221        }
222    }
223
224    let preserve_content = &raw[open + 1..len - 1];
225    Ok((main_part, Some(preserve_content)))
226}
227
/// Report whether the byte at `pos` is escaped, i.e. immediately preceded by an odd
/// number of consecutive backslashes.
fn is_escaped(bytes: &[u8], pos: usize) -> bool {
    let backslashes = bytes[..pos]
        .iter()
        .rev()
        .take_while(|&&byte| byte == b'\\')
        .count();
    backslashes % 2 == 1
}
242
243/// Parse a segment string into `Vec<GlobSegment>` and count of `*`.
244/// `raw` is the full original pattern (for error messages), `s` is the slice being parsed,
245/// `base_offset` is the byte offset of `s` within `raw`.
246fn parse_segments(
247    raw: &str,
248    s: &str,
249    base_offset: usize,
250) -> Result<(Vec<GlobSegment>, usize), String> {
251    let mut segs = Vec::new();
252    let mut lit_buf = String::new();
253    let mut star_count = 0usize;
254    let bytes = s.as_bytes();
255    let len = bytes.len();
256    let mut i = 0;
257
258    while i < len {
259        let b = bytes[i];
260        if b == b'\\' && i + 1 < len {
261            let next = bytes[i + 1];
262            match next {
263                b'\\' | b'*' | b'?' | b'{' | b'}' | b'(' | b')' => {
264                    lit_buf.push(next as char);
265                    i += 2;
266                }
267                b'0' => {
268                    lit_buf.push('\0');
269                    i += 2;
270                }
271                b'n' => {
272                    lit_buf.push('\n');
273                    i += 2;
274                }
275                b't' => {
276                    lit_buf.push('\t');
277                    i += 2;
278                }
279                b'r' => {
280                    lit_buf.push('\r');
281                    i += 2;
282                }
283                b's' => {
284                    flush_literal(&mut lit_buf, &mut segs);
285                    segs.push(GlobSegment::Whitespace);
286                    i += 2;
287                }
288                b'S' => {
289                    flush_literal(&mut lit_buf, &mut segs);
290                    segs.push(GlobSegment::NonWhitespace);
291                    i += 2;
292                }
293                b'h' => {
294                    flush_literal(&mut lit_buf, &mut segs);
295                    segs.push(GlobSegment::HorizontalWhitespace);
296                    i += 2;
297                }
298                b'H' => {
299                    flush_literal(&mut lit_buf, &mut segs);
300                    segs.push(GlobSegment::NonHorizontalWhitespace);
301                    i += 2;
302                }
303                _ => {
304                    // Unknown escape: treat as literal character (e.g. \: → ':').
305                    // This preserves backward compatibility with existing configs
306                    // that use non-standard escapes like \:, \= etc.
307                    lit_buf.push(next as char);
308                    i += 2;
309                }
310            }
311        } else if b == b'*' {
312            flush_literal(&mut lit_buf, &mut segs);
313            segs.push(GlobSegment::Star);
314            star_count += 1;
315            if star_count > 1 {
316                return Err(fmt_err(
317                    raw,
318                    base_offset + i,
319                    "at most one * allowed; use \\* to match a literal asterisk",
320                ));
321            }
322            i += 1;
323        } else if b == b'?' {
324            flush_literal(&mut lit_buf, &mut segs);
325            segs.push(GlobSegment::Any);
326            i += 1;
327        } else if b == b'(' || b == b')' {
328            return Err(fmt_err(
329                raw,
330                base_offset + i,
331                &format!(
332                    "unexpected '{}'; (...) preserve must be at the end, use \\{} for literal",
333                    b as char, b as char
334                ),
335            ));
336        } else {
337            // Regular character – but must handle UTF-8 properly.
338            let ch = s[i..].chars().next().unwrap();
339            lit_buf.push(ch);
340            i += ch.len_utf8();
341        }
342    }
343    flush_literal(&mut lit_buf, &mut segs);
344    Ok((segs, star_count))
345}
346
347fn flush_literal(buf: &mut String, segs: &mut Vec<GlobSegment>) {
348    if !buf.is_empty() {
349        segs.push(GlobSegment::Literal(SmolStr::from(buf.as_str())));
350        buf.clear();
351    }
352}
353
354// ── Matching engine ──────────────────────────────────────────────────
355
356impl SepPattern {
357    /// Find the first match in `haystack`. Returns `(offset, SepMatch)` where
358    /// `offset` is the byte position where the match starts (= field content length).
359    pub fn find(&self, haystack: &str) -> Option<(usize, SepMatch)> {
360        match &self.compiled {
361            SepMatcher::Literal(lit) => {
362                let pos = haystack.find(lit.as_str())?;
363                Some((
364                    pos,
365                    SepMatch {
366                        consumed: lit.len(),
367                        matched: lit.len(),
368                    },
369                ))
370            }
371            SepMatcher::Glob(glob) => glob_find(glob, haystack),
372        }
373    }
374
375    /// Match only at the start of `haystack` (for `consume_sep`).
376    pub fn match_at_start(&self, haystack: &str) -> Option<SepMatch> {
377        match &self.compiled {
378            SepMatcher::Literal(lit) => {
379                if haystack.starts_with(lit.as_str()) {
380                    Some(SepMatch {
381                        consumed: lit.len(),
382                        matched: lit.len(),
383                    })
384                } else {
385                    None
386                }
387            }
388            SepMatcher::Glob(glob) => glob_match_at(glob, haystack, 0).map(|total| {
389                let main_len = try_match_segments(&glob.segments, haystack).unwrap_or(0);
390                let consumed = main_len;
391                SepMatch {
392                    consumed,
393                    matched: total,
394                }
395            }),
396        }
397    }
398
399    /// Return the raw pattern string.
400    pub fn raw(&self) -> &str {
401        self.raw.as_str()
402    }
403}
404
405/// For a Star-at-start pattern, find how many bytes Star consumes (non-greedy)
406/// and how many bytes the remaining main segments consume.
407/// Returns `(star_bytes, rest_bytes)`.
408fn try_match_star_split(segments: &[GlobSegment], s: &str) -> Option<(usize, usize)> {
409    debug_assert!(matches!(segments.first(), Some(GlobSegment::Star)));
410    let remaining = &segments[1..];
411    // Non-greedy: try expanding Star from 0 chars upwards.
412    if let Some(rest_len) = try_match_segments(remaining, s) {
413        return Some((0, rest_len));
414    }
415    let mut char_iter = s.char_indices();
416    while let Some((_, _)) = char_iter.next() {
417        let byte_pos = char_iter.clone().next().map(|(p, _)| p).unwrap_or(s.len());
418        let after = &s[byte_pos..];
419        if let Some(rest_len) = try_match_segments(remaining, after) {
420            return Some((byte_pos, rest_len));
421        }
422    }
423    None
424}
425
426/// Find first occurrence of glob pattern in haystack.
427fn glob_find(glob: &GlobPattern, haystack: &str) -> Option<(usize, SepMatch)> {
428    let segs = &glob.segments;
429    if segs.is_empty() {
430        // Only preserve – scan haystack for the first position where preserve matches.
431        // The offset is the field content length; consumed is 0 (preserve is not consumed).
432        if let Some(preserve) = &glob.preserve {
433            // Optimization: if first preserve segment is Literal, use str::find for fast skip.
434            if let Some(GlobSegment::Literal(first_lit)) = preserve.first() {
435                let lit = first_lit.as_str();
436                let mut search_start = 0;
437                while search_start <= haystack.len() {
438                    if let Some(pos) = haystack[search_start..].find(lit) {
439                        let abs_pos = search_start + pos;
440                        if let Some(plen) = try_match_segments(preserve, &haystack[abs_pos..]) {
441                            return Some((
442                                abs_pos,
443                                SepMatch {
444                                    consumed: 0,
445                                    matched: plen,
446                                },
447                            ));
448                        }
449                        let next_char_len = haystack[abs_pos..]
450                            .chars()
451                            .next()
452                            .map(|c| c.len_utf8())
453                            .unwrap_or(1);
454                        search_start = abs_pos + next_char_len;
455                    } else {
456                        break;
457                    }
458                }
459                return None;
460            }
461            // General case: scan char by char.
462            for (pos, _) in haystack.char_indices() {
463                if let Some(plen) = try_match_segments(preserve, &haystack[pos..]) {
464                    return Some((
465                        pos,
466                        SepMatch {
467                            consumed: 0,
468                            matched: plen,
469                        },
470                    ));
471                }
472            }
473            return None;
474        }
475        return None;
476    }
477
478    // Star-at-start: Star's consumed bytes = field content (offset),
479    // remaining segments' consumed bytes = separator (consumed).
480    if matches!(segs.first(), Some(GlobSegment::Star)) {
481        let (star_bytes, rest_bytes) = try_match_star_split(segs, haystack)?;
482        let preserve_bytes = if let Some(preserve) = &glob.preserve {
483            let after_main = &haystack[star_bytes + rest_bytes..];
484            try_match_segments(preserve, after_main)?
485        } else {
486            0
487        };
488        return Some((
489            star_bytes,
490            SepMatch {
491                consumed: rest_bytes,
492                matched: rest_bytes + preserve_bytes,
493            },
494        ));
495    }
496
497    // Optimization: if first segment is Literal, use str::find for fast skip.
498    if let Some(GlobSegment::Literal(first_lit)) = segs.first() {
499        let lit = first_lit.as_str();
500        let mut search_start = 0;
501        while search_start <= haystack.len() {
502            if let Some(pos) = haystack[search_start..].find(lit) {
503                let abs_pos = search_start + pos;
504                if let Some(total) = glob_match_at(glob, haystack, abs_pos) {
505                    let main_len = try_match_segments(segs, &haystack[abs_pos..]).unwrap_or(0);
506                    return Some((
507                        abs_pos,
508                        SepMatch {
509                            consumed: main_len,
510                            matched: total,
511                        },
512                    ));
513                }
514                // Advance by one char (not lit.len()) to avoid skipping overlapping positions.
515                let next_char_len = haystack[abs_pos..]
516                    .chars()
517                    .next()
518                    .map(|c| c.len_utf8())
519                    .unwrap_or(1);
520                search_start = abs_pos + next_char_len;
521            } else {
522                break;
523            }
524        }
525        return None;
526    }
527
528    // General case: scan char by char.
529    for (pos, _) in haystack.char_indices() {
530        if let Some(total) = glob_match_at(glob, haystack, pos) {
531            let main_len = try_match_segments(segs, &haystack[pos..]).unwrap_or(0);
532            return Some((
533                pos,
534                SepMatch {
535                    consumed: main_len,
536                    matched: total,
537                },
538            ));
539        }
540    }
541    None
542}
543
544/// Attempt full match of glob pattern (main + preserve) starting at byte offset `start`.
545/// Returns total matched length (main + preserve) or None.
546fn glob_match_at(glob: &GlobPattern, haystack: &str, start: usize) -> Option<usize> {
547    let s = &haystack[start..];
548    let main_len = try_match_segments(&glob.segments, s)?;
549    if let Some(preserve) = &glob.preserve {
550        let rest = &s[main_len..];
551        let plen = try_match_segments(preserve, rest)?;
552        Some(main_len + plen)
553    } else {
554        Some(main_len)
555    }
556}
557
558/// Try to match segments against the start of `s`. Returns consumed byte count.
559fn try_match_segments(segments: &[GlobSegment], s: &str) -> Option<usize> {
560    if segments.is_empty() {
561        return Some(0);
562    }
563    match &segments[0] {
564        GlobSegment::Literal(lit) => {
565            if s.starts_with(lit.as_str()) {
566                let rest = &s[lit.len()..];
567                let tail = try_match_segments(&segments[1..], rest)?;
568                Some(lit.len() + tail)
569            } else {
570                None
571            }
572        }
573        GlobSegment::Any => {
574            let ch = s.chars().next()?;
575            let clen = ch.len_utf8();
576            let rest = &s[clen..];
577            let tail = try_match_segments(&segments[1..], rest)?;
578            Some(clen + tail)
579        }
580        GlobSegment::Whitespace => {
581            match_char_class_backtrack(consume_whitespace, s, &segments[1..])
582        }
583        GlobSegment::NonWhitespace => {
584            match_char_class_backtrack(consume_non_whitespace, s, &segments[1..])
585        }
586        GlobSegment::HorizontalWhitespace => {
587            match_char_class_backtrack(consume_horizontal_whitespace, s, &segments[1..])
588        }
589        GlobSegment::NonHorizontalWhitespace => {
590            match_char_class_backtrack(consume_non_horizontal_whitespace, s, &segments[1..])
591        }
592        GlobSegment::Star => {
593            // Non-greedy: try expanding from 0 chars upwards.
594            let remaining = &segments[1..];
595            let mut char_iter = s.char_indices();
596            // Try matching 0 chars consumed by Star.
597            if let Some(tail) = try_match_segments(remaining, s) {
598                return Some(tail);
599            }
600            // Expand one char at a time.
601            while let Some((_, ch)) = char_iter.next() {
602                let byte_pos = char_iter.clone().next().map(|(p, _)| p).unwrap_or(s.len());
603                // byte_pos points to start of next char (or end).
604                // But we need to account for the current char's UTF-8 length:
605                let after = &s[byte_pos..];
606                if let Some(tail) = try_match_segments(remaining, after) {
607                    return Some(byte_pos + tail);
608                }
609                // Don't expand past string.
610                let _ = ch;
611            }
612            None
613        }
614    }
615}
616
617/// Match a character-class segment (like `\s`, `\S`, `\h`, `\H`) with greedy-then-backtrack.
618///
619/// `consume_fn` returns the maximum number of bytes that the character class matches at the
620/// start of `s`. We try that maximum first (fast path); if the remaining segments don't match,
621/// we backtrack one character at a time until we find a length that works (minimum 1 character).
622fn match_char_class_backtrack(
623    consume_fn: fn(&str) -> usize,
624    s: &str,
625    remaining: &[GlobSegment],
626) -> Option<usize> {
627    let max = consume_fn(s);
628    if max == 0 {
629        return None;
630    }
631    // Fast path: greedy consumption (covers most cases like \s followed by non-ws literal).
632    let rest = &s[max..];
633    if let Some(tail) = try_match_segments(remaining, rest) {
634        return Some(max + tail);
635    }
636    // Slow path: backtrack from (max - 1 char) down to 1 char.
637    // Walk backwards through char boundaries within consumed range.
638    let consumed_slice = &s[..max];
639    let mut pos = max;
640    for (i, _) in consumed_slice.char_indices().rev() {
641        // `i` is the start of the last char; skip it to try one less char.
642        pos = i;
643        if pos == 0 {
644            break; // Must consume at least 1 char.
645        }
646        let rest = &s[pos..];
647        if let Some(tail) = try_match_segments(remaining, rest) {
648            return Some(pos + tail);
649        }
650    }
651    let _ = pos;
652    None
653}
654
/// Byte length of the leading run of whitespace (` `, `\t`, `\r`, `\n`) in `s`.
fn consume_whitespace(s: &str) -> usize {
    s.chars()
        .take_while(|&c| matches!(c, ' ' | '\t' | '\r' | '\n'))
        .map(char::len_utf8)
        .sum()
}
666
/// Byte length of the leading run of characters in `s` that are none of
/// ` `, `\t`, `\r`, `\n`.
fn consume_non_whitespace(s: &str) -> usize {
    s.chars()
        .take_while(|&c| !matches!(c, ' ' | '\t' | '\r' | '\n'))
        .map(char::len_utf8)
        .sum()
}
678
/// Byte length of the leading run of horizontal whitespace (` ` or `\t`) in `s`.
fn consume_horizontal_whitespace(s: &str) -> usize {
    s.chars()
        .take_while(|&c| matches!(c, ' ' | '\t'))
        .map(char::len_utf8)
        .sum()
}
690
/// Byte length of the leading run of characters in `s` that are neither ` ` nor `\t`.
fn consume_non_horizontal_whitespace(s: &str) -> usize {
    s.chars()
        .take_while(|&c| !matches!(c, ' ' | '\t'))
        .map(char::len_utf8)
        .sum()
}
702
703// ── Serde ────────────────────────────────────────────────────────────
704
705impl Serialize for SepPattern {
706    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
707    where
708        S: Serializer,
709    {
710        serializer.serialize_str(self.raw.as_str())
711    }
712}
713
714impl<'de> Deserialize<'de> for SepPattern {
715    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
716    where
717        D: Deserializer<'de>,
718    {
719        let s = String::deserialize(deserializer)?;
720        build_pattern(&s).map_err(serde::de::Error::custom)
721    }
722}
723
724// ── Tests ────────────────────────────────────────────────────────────
725
726#[cfg(test)]
727mod tests {
728    use super::*;
729
730    // ── build_pattern parsing ────────────────────────────────────────
731
732    #[test]
733    fn test_parse_literal() {
734        let p = build_pattern("abc").unwrap();
735        assert_eq!(p.compiled, SepMatcher::Literal("abc".into()));
736    }
737
738    #[test]
739    fn test_parse_literal_with_newline() {
740        let p = build_pattern("ab\\n").unwrap();
741        assert_eq!(p.compiled, SepMatcher::Literal("ab\n".into()));
742    }
743
744    #[test]
745    fn test_parse_literal_with_null() {
746        let p = build_pattern("ab\\0").unwrap();
747        assert_eq!(p.compiled, SepMatcher::Literal("ab\0".into()));
748    }
749
750    #[test]
751    fn test_parse_literal_with_tab() {
752        let p = build_pattern("ab\\t").unwrap();
753        assert_eq!(p.compiled, SepMatcher::Literal("ab\t".into()));
754    }
755
756    #[test]
757    fn test_parse_literal_with_cr() {
758        let p = build_pattern("ab\\r").unwrap();
759        assert_eq!(p.compiled, SepMatcher::Literal("ab\r".into()));
760    }
761
762    #[test]
763    fn test_parse_escaped_chars() {
764        let p = build_pattern("a\\*b\\?c").unwrap();
765        assert_eq!(p.compiled, SepMatcher::Literal("a*b?c".into()));
766    }
767
768    #[test]
769    fn test_parse_escaped_braces() {
770        let p = build_pattern("a\\{b\\}c").unwrap();
771        assert_eq!(p.compiled, SepMatcher::Literal("a{b}c".into()));
772    }
773
774    #[test]
775    fn test_parse_escaped_parens() {
776        let p = build_pattern("a\\(b\\)").unwrap();
777        assert_eq!(p.compiled, SepMatcher::Literal("a(b)".into()));
778    }
779
780    #[test]
781    fn test_parse_glob_star_eq() {
782        let p = build_pattern("*=").unwrap();
783        match &p.compiled {
784            SepMatcher::Glob(g) => {
785                assert_eq!(g.segments.len(), 2);
786                assert_eq!(g.segments[0], GlobSegment::Star);
787                assert_eq!(g.segments[1], GlobSegment::Literal("=".into()));
788                assert!(g.preserve.is_none());
789            }
790            _ => panic!("expected Glob"),
791        }
792    }
793
794    #[test]
795    fn test_parse_glob_key_star() {
796        let p = build_pattern("key=*").unwrap();
797        match &p.compiled {
798            SepMatcher::Glob(g) => {
799                assert_eq!(g.segments.len(), 2);
800                assert_eq!(g.segments[0], GlobSegment::Literal("key=".into()));
801                assert_eq!(g.segments[1], GlobSegment::Star);
802            }
803            _ => panic!("expected Glob"),
804        }
805    }
806
807    #[test]
808    fn test_parse_glob_field_any() {
809        let p = build_pattern("field?:").unwrap();
810        match &p.compiled {
811            SepMatcher::Glob(g) => {
812                assert_eq!(g.segments.len(), 3);
813                assert_eq!(g.segments[0], GlobSegment::Literal("field".into()));
814                assert_eq!(g.segments[1], GlobSegment::Any);
815                assert_eq!(g.segments[2], GlobSegment::Literal(":".into()));
816            }
817            _ => panic!("expected Glob"),
818        }
819    }
820
821    #[test]
822    fn test_parse_whitespace() {
823        let p = build_pattern("\\s=").unwrap();
824        match &p.compiled {
825            SepMatcher::Glob(g) => {
826                assert_eq!(g.segments.len(), 2);
827                assert_eq!(g.segments[0], GlobSegment::Whitespace);
828                assert_eq!(g.segments[1], GlobSegment::Literal("=".into()));
829            }
830            _ => panic!("expected Glob"),
831        }
832    }
833
834    #[test]
835    fn test_parse_horizontal_whitespace() {
836        let p = build_pattern("\\h:\\h").unwrap();
837        match &p.compiled {
838            SepMatcher::Glob(g) => {
839                assert_eq!(g.segments.len(), 3);
840                assert_eq!(g.segments[0], GlobSegment::HorizontalWhitespace);
841                assert_eq!(g.segments[1], GlobSegment::Literal(":".into()));
842                assert_eq!(g.segments[2], GlobSegment::HorizontalWhitespace);
843            }
844            _ => panic!("expected Glob"),
845        }
846    }
847
848    #[test]
849    fn test_parse_non_whitespace() {
850        let p = build_pattern("\\s\\S=").unwrap();
851        match &p.compiled {
852            SepMatcher::Glob(g) => {
853                assert_eq!(g.segments.len(), 3);
854                assert_eq!(g.segments[0], GlobSegment::Whitespace);
855                assert_eq!(g.segments[1], GlobSegment::NonWhitespace);
856                assert_eq!(g.segments[2], GlobSegment::Literal("=".into()));
857            }
858            _ => panic!("expected Glob"),
859        }
860    }
861
862    #[test]
863    fn test_parse_non_horizontal_whitespace() {
864        let p = build_pattern("\\h\\H:\\H").unwrap();
865        match &p.compiled {
866            SepMatcher::Glob(g) => {
867                assert_eq!(g.segments.len(), 4);
868                assert_eq!(g.segments[0], GlobSegment::HorizontalWhitespace);
869                assert_eq!(g.segments[1], GlobSegment::NonHorizontalWhitespace);
870                assert_eq!(g.segments[2], GlobSegment::Literal(":".into()));
871                assert_eq!(g.segments[3], GlobSegment::NonHorizontalWhitespace);
872            }
873            _ => panic!("expected Glob"),
874        }
875    }
876
877    #[test]
878    fn test_parse_preserve() {
879        let p = build_pattern("*(key=)").unwrap();
880        match &p.compiled {
881            SepMatcher::Glob(g) => {
882                assert_eq!(g.segments, vec![GlobSegment::Star]);
883                let preserve = g.preserve.as_ref().unwrap();
884                assert_eq!(preserve.len(), 1);
885                assert_eq!(preserve[0], GlobSegment::Literal("key=".into()));
886            }
887            _ => panic!("expected Glob"),
888        }
889    }
890
891    #[test]
892    fn test_parse_preserve_with_whitespace() {
893        let p = build_pattern("*\\s(next)").unwrap();
894        match &p.compiled {
895            SepMatcher::Glob(g) => {
896                assert_eq!(g.segments, vec![GlobSegment::Star, GlobSegment::Whitespace]);
897                let preserve = g.preserve.as_ref().unwrap();
898                assert_eq!(preserve.len(), 1);
899                assert_eq!(preserve[0], GlobSegment::Literal("next".into()));
900            }
901            _ => panic!("expected Glob"),
902        }
903    }
904
905    // ── Constraint violations ────────────────────────────────────────
906
907    #[test]
908    fn test_err_multi_star() {
909        let e = build_pattern("*a*").unwrap_err();
910        assert!(e.contains("at most one * allowed"), "got: {}", e);
911        // Verify visual pointer is present
912        assert!(
913            e.contains("{*a*}"),
914            "should show the full pattern, got: {}",
915            e
916        );
917        assert!(e.contains("^"), "should have a pointer, got: {}", e);
918    }
919
920    #[test]
921    fn test_err_preserve_not_end() {
922        let e = build_pattern("(key)*=").unwrap_err();
923        assert!(
924            e.contains("(...)") || e.contains("preserve") || e.contains("unexpected '('"),
925            "got: {}",
926            e
927        );
928    }
929
930    #[test]
931    fn test_parse_star_in_preserve() {
932        // `*(c*=)` — Star in main + anchored Star in preserve
933        let p = build_pattern("*(c*=)").unwrap();
934        match &p.compiled {
935            SepMatcher::Glob(g) => {
936                assert_eq!(g.segments, vec![GlobSegment::Star]);
937                let preserve = g.preserve.as_ref().unwrap();
938                assert_eq!(preserve.len(), 3);
939                assert_eq!(preserve[0], GlobSegment::Literal("c".into()));
940                assert_eq!(preserve[1], GlobSegment::Star);
941                assert_eq!(preserve[2], GlobSegment::Literal("=".into()));
942            }
943            _ => panic!("expected Glob"),
944        }
945    }
946
947    #[test]
948    fn test_err_empty() {
949        let e = build_pattern("").unwrap_err();
950        assert!(e.contains("empty"), "got: {}", e);
951    }
952
953    #[test]
954    fn test_unknown_escape_as_literal() {
955        // Unknown escapes like \x, \z, \:, \= are treated as literal characters
956        // for backward compatibility with existing configs.
957        let p = build_pattern("ab\\x").unwrap();
958        assert_eq!(p.compiled, SepMatcher::Literal("abx".into()));
959
960        let p = build_pattern("field\\:=").unwrap();
961        assert_eq!(p.compiled, SepMatcher::Literal("field:=".into()));
962
963        let p = build_pattern("\\z").unwrap();
964        assert_eq!(p.compiled, SepMatcher::Literal("z".into()));
965    }
966
967    #[test]
968    fn test_err_visual_pointer_position() {
969        // In `{*a*}`, the second `*` is at raw position 2 → display position 3
970        let e = build_pattern("*a*").unwrap_err();
971        let lines: Vec<&str> = e.lines().collect();
972        assert!(lines.len() >= 3, "expected 3 lines, got: {}", e);
973        // Line 2: `  {*a*}`
974        assert!(lines[1].contains("{*a*}"), "got line1: {}", lines[1]);
975        // Line 3: pointer `     ^` — the `^` should be under the second `*`
976        let pointer_line = lines[2];
977        let caret_pos = pointer_line.find('^').expect("no ^ found");
978        // In `  {*a*}`, second `*` is at display col 4 (2 spaces + { + * + a + *)
979        // base_offset=0 in main body, i=2 for second star → pointer_offset=2+1=3
980        // with 2 leading spaces: col 5
981        assert_eq!(
982            caret_pos, 5,
983            "caret at wrong position in: {:?}",
984            pointer_line
985        );
986    }
987
988    #[test]
989    fn test_err_messages_display() {
990        // This test prints all error messages for visual inspection.
991        // Run with: cargo test -p wp-lang -- test_err_messages_display --nocapture
992        let cases = vec![
993            ("", "empty pattern"),
994            ("*a*", "multiple stars"),
995            ("(key)*=", "preserve not at end"),
996            ("test(mid)abc", "paren not at end"),
997        ];
998        for (input, label) in cases {
999            let err = build_pattern(input).unwrap_err();
1000            println!("--- {} ---\n{}\n", label, err);
1001        }
1002    }
1003
1004    // ── Matching ─────────────────────────────────────────────────────
1005
1006    #[test]
1007    fn test_match_literal() {
1008        let p = build_pattern("abc").unwrap();
1009        let (off, m) = p.find("xyzabcdef").unwrap();
1010        assert_eq!(off, 3);
1011        assert_eq!(m.consumed, 3);
1012        assert_eq!(m.matched, 3);
1013    }
1014
1015    #[test]
1016    fn test_match_literal_no_match() {
1017        let p = build_pattern("abc").unwrap();
1018        assert!(p.find("xyzdef").is_none());
1019    }
1020
1021    #[test]
1022    fn test_match_star_eq_non_greedy() {
1023        // `{*=}` on "a=b=c" → non-greedy: Star matches "a", "=" is separator
1024        // offset = 1 (Star consumed "a" = field content)
1025        // consumed = 1 ("=" = separator)
1026        let p = build_pattern("*=").unwrap();
1027        let (off, m) = p.find("a=b=c").unwrap();
1028        assert_eq!(off, 1);
1029        assert_eq!(m.consumed, 1);
1030        assert_eq!(m.matched, 1);
1031    }
1032
1033    #[test]
1034    fn test_match_whitespace_eq() {
1035        // `{\s=}` on "key  =val" → offset=3, consumed=3 (" " " " "=")
1036        let p = build_pattern("\\s=").unwrap();
1037        let (off, m) = p.find("key  =val").unwrap();
1038        assert_eq!(off, 3);
1039        assert_eq!(m.consumed, 3);
1040        assert_eq!(m.matched, 3);
1041    }
1042
1043    #[test]
1044    fn test_match_preserve() {
1045        // `{*\s(key=)}` on "hello  key=value"
1046        // Star matches "hello" (5 bytes = field content = offset)
1047        // \s matches "  " (2 bytes = separator consumed)
1048        // preserve "key=" (4 bytes, not consumed)
1049        let p = build_pattern("*\\s(key=)").unwrap();
1050        let (off, m) = p.find("hello  key=value").unwrap();
1051        assert_eq!(off, 5);
1052        assert_eq!(m.consumed, 2);
1053        assert_eq!(m.matched, 6); // 2 (\s) + 4 (preserve "key=")
1054    }
1055
1056    #[test]
1057    fn test_match_field_any() {
1058        // `{field?:}` on "fieldA:value" → offset=0, consumed=7
1059        let p = build_pattern("field?:").unwrap();
1060        let (off, m) = p.find("fieldA:value").unwrap();
1061        assert_eq!(off, 0);
1062        assert_eq!(m.consumed, 7);
1063        assert_eq!(m.matched, 7);
1064    }
1065
1066    #[test]
1067    fn test_match_horizontal_whitespace() {
1068        // `{\h:\h}` on "key\t:\tval" → offset=3, consumed=3
1069        let p = build_pattern("\\h:\\h").unwrap();
1070        let (off, m) = p.find("key\t:\tval").unwrap();
1071        assert_eq!(off, 3);
1072        assert_eq!(m.consumed, 3);
1073        assert_eq!(m.matched, 3);
1074    }
1075
1076    #[test]
1077    fn test_match_non_whitespace() {
1078        // `{\s\S=}` on "msg=Test message externalId=0"
1079        // \s matches at first space (pos 8), but \S then consumes "message"
1080        // and "=" doesn't match " externalId..." → fail at pos 8.
1081        // At pos 16: \s matches " ", \S matches "externalId", "=" matches → success
1082        let p = build_pattern("\\s\\S=").unwrap();
1083        let (off, m) = p.find("msg=Test message externalId=0").unwrap();
1084        assert_eq!(off, 16); // split before " externalId="
1085        assert_eq!(m.consumed, 12); // " " + "externalId" + "="
1086        assert_eq!(m.matched, 12);
1087    }
1088
1089    #[test]
1090    fn test_match_non_whitespace_preserve_kvarr() {
1091        // `{\s(\S=)}` — the kvarr separator pattern:
1092        // \s consumed (separator), \S= preserved (lookahead for next key=)
1093        let p = build_pattern("\\s(\\S=)").unwrap();
1094        let (off, m) = p.find("msg=Test message externalId=0").unwrap();
1095        assert_eq!(off, 16); // field content: "msg=Test message"
1096        assert_eq!(m.consumed, 1); // consumed: " " (space)
1097        assert_eq!(m.matched, 12); // matched: " " + "externalId" + "="
1098    }
1099
1100    #[test]
1101    fn test_match_non_horizontal_whitespace() {
1102        // `{\h\H=}` on "key\t:\tval\texternalId=0"
1103        let p = build_pattern("\\H=").unwrap();
1104        let (off, m) = p.find("key\t:\tval\texternalId=0").unwrap();
1105        // \H matches "key" (stops at \t), then "=" doesn't match "\t:..." → fail
1106        // Scanning... \H at "externalId=0": matches "externalId", "=" matches → success
1107        assert_eq!(off, 10);
1108        assert_eq!(m.consumed, 11); // "externalId="
1109    }
1110
1111    #[test]
1112    fn test_match_no_match() {
1113        let p = build_pattern("\\s=").unwrap();
1114        assert!(p.find("key=val").is_none());
1115    }
1116
1117    #[test]
1118    fn test_match_at_start_literal() {
1119        let p = build_pattern("abc").unwrap();
1120        let m = p.match_at_start("abcdef").unwrap();
1121        assert_eq!(m.consumed, 3);
1122        assert!(p.match_at_start("xabc").is_none());
1123    }
1124
1125    #[test]
1126    fn test_match_at_start_glob() {
1127        let p = build_pattern("\\s=").unwrap();
1128        let m = p.match_at_start("  =val").unwrap();
1129        assert_eq!(m.consumed, 3);
1130        assert!(p.match_at_start("val  =").is_none());
1131    }
1132
1133    #[test]
1134    fn test_match_star_at_end() {
1135        // `{key=*}` on "key=value" → offset=0, consumed=9
1136        let p = build_pattern("key=*").unwrap();
1137        let (off, m) = p.find("key=value").unwrap();
1138        assert_eq!(off, 0);
1139        // Star matches "value" (all remaining since no following segment)
1140        // But non-greedy star with no remaining segments matches 0 chars
1141        // Actually, non-greedy star with no remaining segments: try 0 first → succeeds
1142        assert_eq!(m.consumed, 4); // "key=" + 0 chars from Star
1143        assert_eq!(m.matched, 4);
1144    }
1145
1146    #[test]
1147    fn test_match_star_newline() {
1148        // `{\s=*\n}` on "  =hello\n"
1149        let p = build_pattern("\\s=*\\n").unwrap();
1150        let (off, m) = p.find("  =hello\n").unwrap();
1151        assert_eq!(off, 0);
1152        assert_eq!(m.consumed, 9);
1153    }
1154
1155    #[test]
1156    fn test_match_preserve_only() {
1157        // Pattern with only preserve: `(abc)` applied to "abcdef"
1158        let p = build_pattern("(abc)").unwrap();
1159        match &p.compiled {
1160            SepMatcher::Glob(g) => {
1161                assert!(g.segments.is_empty());
1162                assert!(g.preserve.is_some());
1163            }
1164            _ => panic!("expected Glob"),
1165        }
1166        // Match at position 0
1167        let (off, m) = p.find("abcdef").unwrap();
1168        assert_eq!(off, 0);
1169        assert_eq!(m.consumed, 0);
1170        assert_eq!(m.matched, 3);
1171
1172        // Match at non-zero offset: field content is "xyz", preserve "abc" found at pos 3
1173        let (off, m) = p.find("xyzabcdef").unwrap();
1174        assert_eq!(off, 3);
1175        assert_eq!(m.consumed, 0);
1176        assert_eq!(m.matched, 3);
1177
1178        // No match
1179        assert!(p.find("xyzdef").is_none());
1180    }
1181
1182    #[test]
1183    fn test_match_preserve_only_command() {
1184        // Real-world pattern: `{(command=)}` — find "command=" as lookahead separator
1185        let p = build_pattern("(command=)").unwrap();
1186        let (off, m) = p.find("hello command=value").unwrap();
1187        assert_eq!(off, 6); // "hello " is field content
1188        assert_eq!(m.consumed, 0); // separator is zero-width
1189        assert_eq!(m.matched, 8); // "command=".len()
1190
1191        // Match at start
1192        let (off, m) = p.find("command=value").unwrap();
1193        assert_eq!(off, 0);
1194        assert_eq!(m.consumed, 0);
1195        assert_eq!(m.matched, 8);
1196    }
1197
1198    #[test]
1199    fn test_match_preserve_with_star() {
1200        // `{(c*=)}` — preserve-only with anchored Star
1201        // On "hello cmd=value": find first position where c*= matches
1202        let p = build_pattern("(c*=)").unwrap();
1203        let (off, m) = p.find("hello cmd=value").unwrap();
1204        assert_eq!(off, 6); // "hello " is field content
1205        assert_eq!(m.consumed, 0);
1206        assert_eq!(m.matched, 4); // "cmd=" matched by c + Star("md") + =
1207
1208        // Multiple candidates: picks first
1209        let (off, m) = p.find("hello cat=1 cmd=2").unwrap();
1210        assert_eq!(off, 6); // first "c" at position 6
1211        assert_eq!(m.consumed, 0);
1212        assert_eq!(m.matched, 4); // "cat="
1213    }
1214
1215    // ── Serde round-trip ─────────────────────────────────────────────
1216
1217    #[test]
1218    fn test_serde_roundtrip() {
1219        let p = build_pattern("*\\s(key=)").unwrap();
1220        let json = serde_json::to_string(&p).unwrap();
1221        // JSON escapes the backslash: raw `*\s(key=)` → JSON `"*\\s(key=)"`
1222        assert_eq!(json, r#""*\\s(key=)""#);
1223        let p2: SepPattern = serde_json::from_str(&json).unwrap();
1224        assert_eq!(p.raw, p2.raw);
1225        assert_eq!(p.compiled, p2.compiled);
1226    }
1227
1228    #[test]
1229    fn test_serde_roundtrip_literal() {
1230        let p = build_pattern("abc").unwrap();
1231        let json = serde_json::to_string(&p).unwrap();
1232        let p2: SepPattern = serde_json::from_str(&json).unwrap();
1233        assert_eq!(p, p2);
1234    }
1235}