Skip to main content

formatparse_core/
lookaround.rs

1//! Trailing regex lookaround assertions in format type tails (issue #9 / parse#209).
2
3use crate::parser::count_capturing_groups;
4use fancy_regex::Regex;
5
/// Upper bound, in bytes, on the whole (trimmed) lookaround tail — every group counted together.
/// Tails longer than this are rejected before any group is scanned or compiled.
const MAX_LOOKAROUND_TAIL_BYTES: usize = 4 * 1024;
8
9/// Split `type_str` into the type token(s) and any trailing `(?=…)` / `(?!…)` / `(?<=…)` / `(?<!…)`.
10///
11/// Call only for non-strftime type tails (caller handles `%…` and rejects embedded lookarounds).
12pub fn split_type_base_and_lookaround_tail(type_str: &str) -> (&str, &str) {
13    let t = type_str.trim();
14    if let Some(i) = find_first_lookaround_start(t) {
15        let base = t[..i].trim_end();
16        let tail = t[i..].trim_start();
17        (base, tail)
18    } else {
19        (t, "")
20    }
21}
22
23fn find_first_lookaround_start(s: &str) -> Option<usize> {
24    s.char_indices()
25        .map(|(i, _)| i)
26        .find(|&i| starts_with_lookaround(s, i))
27}
28
/// Report whether the text of `s` beginning at byte offset `i` opens a lookaround assertion
/// (`(?<=`, `(?<!`, `(?=`, or `(?!`).
///
/// `i` is used to slice `s`, so it must be a character boundary no greater than `s.len()`.
fn starts_with_lookaround(s: &str, i: usize) -> bool {
    const OPENERS: [&str; 4] = ["(?<=", "(?<!", "(?=", "(?!"];
    let candidate = &s[i..];
    OPENERS.iter().any(|&opener| candidate.starts_with(opener))
}
36
/// Find where the parenthesized group that opens at byte `open_idx` of `s` closes.
///
/// Returns the exclusive end byte index (one past the matching `)`), or `None` when `s` does
/// not have `(` at `open_idx` or the group is never closed.
///
/// A backslash escapes the character that follows it, so `\(` and `\)` do not affect nesting.
/// Every other `(` / `)` counts towards the depth — including ones written inside a `[...]`
/// character class, which this scanner does not treat specially.
fn balanced_paren_group_end(s: &str, open_idx: usize) -> Option<usize> {
    let region = &s[open_idx..];
    if !region.starts_with('(') {
        return None;
    }
    let mut nesting = 0i32;
    let mut escaped = false;
    for (offset, ch) in region.char_indices() {
        if escaped {
            // Whatever follows a backslash is taken literally.
            escaped = false;
            continue;
        }
        match ch {
            '\\' => escaped = true,
            '(' => nesting += 1,
            ')' => {
                nesting -= 1;
                if nesting == 0 {
                    // `)` is one byte wide, so the exclusive end is just past it.
                    return Some(open_idx + offset + 1);
                }
            }
            _ => {}
        }
    }
    None
}
70
71/// Parse `tail` into concatenated lookbehind and lookahead fragments (order preserved).
72/// Each segment must be a single non-capturing lookaround group with no capturing groups inside.
73pub fn parse_lookaround_tail(tail: &str) -> Result<(String, String), String> {
74    let tail = tail.trim();
75    if tail.is_empty() {
76        return Ok((String::new(), String::new()));
77    }
78    if tail.len() > MAX_LOOKAROUND_TAIL_BYTES {
79        return Err(format!(
80            "Lookaround tail exceeds maximum length of {} bytes",
81            MAX_LOOKAROUND_TAIL_BYTES
82        ));
83    }
84
85    let mut lookbehind = String::new();
86    let mut lookahead = String::new();
87    let mut pos = 0usize;
88    let t = tail;
89
90    while pos < t.len() {
91        while let Some(c) = t[pos..].chars().next() {
92            if c.is_whitespace() {
93                pos += c.len_utf8();
94            } else {
95                break;
96            }
97        }
98        if pos >= t.len() {
99            break;
100        }
101        if t.as_bytes().get(pos) != Some(&b'(') {
102            return Err(format!(
103                "Unexpected text in lookaround tail at byte {}: expected '('",
104                pos
105            ));
106        }
107        let end = balanced_paren_group_end(t, pos).ok_or_else(|| {
108            format!(
109                "Unclosed parenthesis in lookaround tail starting at byte {}",
110                pos
111            )
112        })?;
113        let group = &t[pos..end];
114        if !is_allowed_lookaround_prefix(group) {
115            return Err(format!(
116                "Invalid lookaround group (must start with (?=, (?!, (?<=, or (?<!): {:?}",
117                truncate(group, 64)
118            ));
119        }
120        if count_capturing_groups(group) != 0 {
121            return Err("Lookaround groups must not contain capturing parentheses".to_string());
122        }
123        Regex::new(group).map_err(|e| format!("Invalid lookaround regex: {}", e))?;
124
125        if group.starts_with("(?<=") || group.starts_with("(?<!") {
126            lookbehind.push_str(group);
127        } else {
128            lookahead.push_str(group);
129        }
130        pos = end;
131    }
132
133    Ok((lookbehind, lookahead))
134}
135
/// Shorten `s` to at most `max` bytes for display in error messages.
///
/// Returns `s` unchanged when it already fits. Otherwise keeps the longest prefix of at most
/// `max` bytes that ends on a UTF-8 character boundary and appends `…`. Backing off to a
/// boundary is required because slicing a `str` in the middle of a multi-byte character
/// panics, and the text being truncated comes from user-supplied patterns.
fn truncate(s: &str, max: usize) -> String {
    if s.len() <= max {
        return s.to_string();
    }
    // Here `max < s.len()`, so every probed index is in range; offset 0 is always a boundary,
    // which makes the fallback unreachable in practice.
    let cut = (0..=max)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);
    format!("{}…", &s[..cut])
}
143
/// True when `group` opens with one of the four lookaround forms: `(?=`, `(?!`, `(?<=`, `(?<!`.
fn is_allowed_lookaround_prefix(group: &str) -> bool {
    let Some(after_marker) = group.strip_prefix("(?") else {
        return false;
    };
    // An optional single `<` selects lookbehind; the next byte must then be `=` or `!`.
    let after_direction = after_marker.strip_prefix('<').unwrap_or(after_marker);
    matches!(after_direction.as_bytes().first(), Some(b'=' | b'!'))
}
150
/// Decide whether a lookaround body is free of regex operators.
///
/// A backslash followed by any character is accepted as an escape pair. Outside such pairs the
/// characters `| ( ) [ ] . * + ? { } ^ $` make the body non-literal, as does a trailing
/// backslash with nothing after it. The empty body counts as literal.
fn is_literal_lookaround_body(body: &str) -> bool {
    let mut awaiting_escaped_char = false;
    for ch in body.chars() {
        if awaiting_escaped_char {
            // Second half of a `\x` pair: accepted whatever it is.
            awaiting_escaped_char = false;
            continue;
        }
        if ch == '\\' {
            awaiting_escaped_char = true;
            continue;
        }
        let is_operator = matches!(
            ch,
            '|' | '(' | ')' | '[' | ']' | '.' | '*' | '+' | '?' | '{' | '}' | '^' | '$'
        );
        if is_operator {
            return false;
        }
    }
    // A backslash left dangling at the end is not a complete escape.
    !awaiting_escaped_char
}
170
171/// fancy-regex 0.14 does not match some anchored patterns that combine `^` / `$` with
172/// **positive** lookbehind at the start or **positive** lookahead before the end anchor.
173/// For literal-only bodies, rewrite those assertions to non-capturing groups so spans stay
174/// on the field capture and full-string parse still works.
175///
176/// Returns `(prefix, field_body_rest, lookahead_suffix)` to be assembled as
177/// `prefix + "(?P<name>" + field_body_rest + ")" + lookahead_suffix` (or the unnamed variant).
178pub fn rewrite_field_fragments_for_engine_anchor(
179    field_body: &str,
180    trailing_lookahead: &str,
181) -> (String, String, String) {
182    let mut prefix = String::new();
183    let mut rest = field_body;
184    while rest.starts_with("(?<=") {
185        let Some(end) = balanced_paren_group_end(rest, 0) else {
186            break;
187        };
188        let group = &rest[..end];
189        let inner = group
190            .strip_prefix("(?<=")
191            .and_then(|g| g.strip_suffix(')'))
192            .unwrap_or("");
193        if !is_literal_lookaround_body(inner) {
194            break;
195        }
196        prefix.push_str("(?:");
197        prefix.push_str(inner);
198        prefix.push(')');
199        rest = &rest[end..];
200    }
201    let la = lower_positive_lookahead_suffix(trailing_lookahead);
202    (prefix, rest.to_string(), la)
203}
204
205fn lower_positive_lookahead_suffix(trailing_lookahead: &str) -> String {
206    let t = trailing_lookahead.trim();
207    if t.is_empty() {
208        return String::new();
209    }
210    let mut out = String::new();
211    let mut pos = 0usize;
212    while pos < t.len() {
213        while let Some(c) = t[pos..].chars().next() {
214            if c.is_whitespace() {
215                pos += c.len_utf8();
216            } else {
217                break;
218            }
219        }
220        if pos >= t.len() {
221            break;
222        }
223        if t.as_bytes().get(pos) != Some(&b'(') {
224            out.push_str(&t[pos..]);
225            break;
226        }
227        let Some(end) = balanced_paren_group_end(t, pos) else {
228            out.push_str(&t[pos..]);
229            break;
230        };
231        let group = &t[pos..end];
232        if let Some(inner) = group.strip_prefix("(?=").and_then(|g| g.strip_suffix(')')) {
233            if is_literal_lookaround_body(inner) {
234                out.push_str("(?:");
235                out.push_str(inner);
236                out.push(')');
237            } else {
238                out.push_str(group);
239            }
240        } else {
241            out.push_str(group);
242        }
243        pos = end;
244    }
245    out
246}
247
248/// If `type_str` is a strftime tail (`%…` but not exactly `%`), reject when lookarounds are present.
249pub fn reject_lookaround_in_strftime(type_str: &str) -> Result<(), String> {
250    let t = type_str.trim();
251    if t == "%" {
252        return Ok(());
253    }
254    if t.starts_with('%') && find_first_lookaround_start(t).is_some() {
255        return Err(
256            "Lookaround assertions are not supported with strftime (%…) format types".to_string(),
257        );
258    }
259    Ok(())
260}
261
#[cfg(test)]
mod tests {
    use super::*;

    /// A built-in type token followed directly by a lookahead is cut at the opener.
    #[test]
    fn split_d_lookahead() {
        let (base, tail) = split_type_base_and_lookaround_tail("d(?=px)");
        assert_eq!(base, "d");
        assert_eq!(tail, "(?=px)");
    }

    /// Multi-character (custom) type names are split the same way.
    #[test]
    fn split_custom_lookahead() {
        let (base, tail) = split_type_base_and_lookaround_tail("MyType(?=x)");
        assert_eq!(base, "MyType");
        assert_eq!(tail, "(?=x)");
    }

    /// strftime types must not carry lookarounds.
    #[test]
    fn strftime_rejects_embedded_lookaround() {
        let err = reject_lookaround_in_strftime("%Y(?=x)").unwrap_err();
        assert!(err.contains("strftime"), "{}", err);
    }

    /// Groups are routed into the lookbehind / lookahead fragments verbatim.
    #[test]
    fn parse_tail_orders_lb_then_la() {
        // Pin the exact fragments, not just their prefixes, so mangled or mis-routed groups fail.
        let (lb, la) = parse_lookaround_tail("(?<=\\$)(?=px)").unwrap();
        assert_eq!(lb, r"(?<=\$)");
        assert_eq!(la, "(?=px)");

        // Routing is by group kind, so a lookahead written first still lands in the second slot.
        let (lb, la) = parse_lookaround_tail("(?=px)(?<=\\$)").unwrap();
        assert_eq!(lb, r"(?<=\$)");
        assert_eq!(la, "(?=px)");
    }

    /// Sanity check that the regex engine itself accepts the patterns from the issue.
    #[test]
    fn regex_engine_accepts_issue_examples() {
        Regex::new(r"\d+(?=px)").expect("lookahead");
        Regex::new(r"(?<=\$)\d+").expect("lookbehind");
        Regex::new(r"(?<=\$)\d+(?=px)").expect("combined");
    }

    /// Capturing groups inside a lookaround are rejected with a descriptive message.
    #[test]
    fn reject_capture_inside_lookaround() {
        let err = parse_lookaround_tail(r"(?=([0-9]))").unwrap_err();
        assert!(err.contains("capturing"), "{}", err);
    }

    /// Literal-only positive lookarounds are lowered to `(?:…)`; anything else is left alone.
    #[test]
    fn rewrite_lowers_literal_positive_lb_and_la() {
        let (p, b, la) = rewrite_field_fragments_for_engine_anchor(r"(?<=\$)\d+", "(?=(?:px))");
        assert_eq!(p, r"(?:\$)");
        assert_eq!(b, r"\d+");
        // Non-simple lookahead body is preserved
        assert_eq!(la, "(?=(?:px))");

        let (p2, b2, la2) = rewrite_field_fragments_for_engine_anchor(r"(?<=\$)\d+", "(?=px)");
        assert_eq!(p2, r"(?:\$)");
        assert_eq!(b2, r"\d+");
        assert_eq!(la2, "(?:px)");
    }
}