formatparse_core/
lookaround.rs1use crate::parser::count_capturing_groups;
4use fancy_regex::Regex;
5
/// Upper bound, in bytes, on the (trimmed) lookaround tail accepted by
/// `parse_lookaround_tail`; longer tails are rejected with an error.
const MAX_LOOKAROUND_TAIL_BYTES: usize = 4 * 1024;
8
9pub fn split_type_base_and_lookaround_tail(type_str: &str) -> (&str, &str) {
13 let t = type_str.trim();
14 if let Some(i) = find_first_lookaround_start(t) {
15 let base = t[..i].trim_end();
16 let tail = t[i..].trim_start();
17 (base, tail)
18 } else {
19 (t, "")
20 }
21}
22
23fn find_first_lookaround_start(s: &str) -> Option<usize> {
24 s.char_indices()
25 .map(|(i, _)| i)
26 .find(|&i| starts_with_lookaround(s, i))
27}
28
/// Reports whether the text of `s` beginning at byte offset `i` opens a lookaround group.
///
/// `i` is used to slice `s`, so it must lie on a character boundary within `s`.
fn starts_with_lookaround(s: &str, i: usize) -> bool {
    const OPENERS: [&str; 4] = ["(?<=", "(?<!", "(?=", "(?!"];
    let rest = &s[i..];
    OPENERS.iter().any(|&opener| rest.starts_with(opener))
}
36
/// Finds the end of the parenthesised group that opens at byte offset `open_idx` in `s`.
///
/// Returns the byte offset just past the `)` that balances the `(` at `open_idx`, or `None`
/// when `s` does not have `(` at that offset or the group is never closed. A backslash
/// escapes the character that follows it, so escaped parentheses do not affect nesting.
fn balanced_paren_group_end(s: &str, open_idx: usize) -> Option<usize> {
    let tail = &s[open_idx..];
    if !tail.starts_with('(') {
        return None;
    }
    let mut nesting = 0i32;
    let mut chars = tail.char_indices();
    while let Some((offset, c)) = chars.next() {
        match c {
            '\\' => {
                // Consume the escaped character (if any) without interpreting it.
                chars.next();
            }
            '(' => nesting += 1,
            ')' => {
                nesting -= 1;
                if nesting == 0 {
                    // Offsets from `char_indices` are relative to `tail`; convert back to `s`.
                    return Some(open_idx + offset + c.len_utf8());
                }
            }
            _ => {}
        }
    }
    None
}
70
71pub fn parse_lookaround_tail(tail: &str) -> Result<(String, String), String> {
74 let tail = tail.trim();
75 if tail.is_empty() {
76 return Ok((String::new(), String::new()));
77 }
78 if tail.len() > MAX_LOOKAROUND_TAIL_BYTES {
79 return Err(format!(
80 "Lookaround tail exceeds maximum length of {} bytes",
81 MAX_LOOKAROUND_TAIL_BYTES
82 ));
83 }
84
85 let mut lookbehind = String::new();
86 let mut lookahead = String::new();
87 let mut pos = 0usize;
88 let t = tail;
89
90 while pos < t.len() {
91 while let Some(c) = t[pos..].chars().next() {
92 if c.is_whitespace() {
93 pos += c.len_utf8();
94 } else {
95 break;
96 }
97 }
98 if pos >= t.len() {
99 break;
100 }
101 if t.as_bytes().get(pos) != Some(&b'(') {
102 return Err(format!(
103 "Unexpected text in lookaround tail at byte {}: expected '('",
104 pos
105 ));
106 }
107 let end = balanced_paren_group_end(t, pos).ok_or_else(|| {
108 format!(
109 "Unclosed parenthesis in lookaround tail starting at byte {}",
110 pos
111 )
112 })?;
113 let group = &t[pos..end];
114 if !is_allowed_lookaround_prefix(group) {
115 return Err(format!(
116 "Invalid lookaround group (must start with (?=, (?!, (?<=, or (?<!): {:?}",
117 truncate(group, 64)
118 ));
119 }
120 if count_capturing_groups(group) != 0 {
121 return Err("Lookaround groups must not contain capturing parentheses".to_string());
122 }
123 Regex::new(group).map_err(|e| format!("Invalid lookaround regex: {}", e))?;
124
125 if group.starts_with("(?<=") || group.starts_with("(?<!") {
126 lookbehind.push_str(group);
127 } else {
128 lookahead.push_str(group);
129 }
130 pos = end;
131 }
132
133 Ok((lookbehind, lookahead))
134}
135
/// Returns `s` unchanged when it is at most `max` bytes long; otherwise returns a prefix
/// of at most `max` bytes followed by an ellipsis (`…`).
///
/// The cut point is moved back to the nearest UTF-8 character boundary, so truncating
/// text that contains multi-byte characters never panics or splits a character.
fn truncate(s: &str, max: usize) -> String {
    if s.len() <= max {
        return s.to_string();
    }
    // `max < s.len()` here, and offset 0 is always a boundary, so this loop terminates.
    let mut cut = max;
    while !s.is_char_boundary(cut) {
        cut -= 1;
    }
    format!("{}…", &s[..cut])
}
143
/// Reports whether `group` begins with one of the four lookaround openers:
/// `(?=`, `(?!`, `(?<=` or `(?<!`.
fn is_allowed_lookaround_prefix(group: &str) -> bool {
    match group.strip_prefix("(?") {
        Some(after_marker) => {
            // An optional single '<' selects the lookbehind form; '=' or '!' must follow.
            let polarity = after_marker.strip_prefix('<').unwrap_or(after_marker);
            polarity.starts_with('=') || polarity.starts_with('!')
        }
        None => false,
    }
}
150
/// Reports whether a lookaround body is free of unescaped regex metacharacters.
///
/// A backslash together with the character after it is accepted as a unit; a trailing
/// lone backslash makes the body non-literal. Any unescaped occurrence of
/// `| ( ) [ ] . * + ? { } ^ $` also makes it non-literal. The empty body is literal.
fn is_literal_lookaround_body(body: &str) -> bool {
    let mut chars = body.chars();
    loop {
        match chars.next() {
            None => return true,
            Some('\\') => {
                // The escaped character is skipped; a dangling backslash is rejected.
                if chars.next().is_none() {
                    return false;
                }
            }
            Some('|' | '(' | ')' | '[' | ']' | '.' | '*' | '+' | '?' | '{' | '}' | '^' | '$') => {
                return false;
            }
            Some(_) => {}
        }
    }
}
170
171pub fn rewrite_field_fragments_for_engine_anchor(
179 field_body: &str,
180 trailing_lookahead: &str,
181) -> (String, String, String) {
182 let mut prefix = String::new();
183 let mut rest = field_body;
184 while rest.starts_with("(?<=") {
185 let Some(end) = balanced_paren_group_end(rest, 0) else {
186 break;
187 };
188 let group = &rest[..end];
189 let inner = group
190 .strip_prefix("(?<=")
191 .and_then(|g| g.strip_suffix(')'))
192 .unwrap_or("");
193 if !is_literal_lookaround_body(inner) {
194 break;
195 }
196 prefix.push_str("(?:");
197 prefix.push_str(inner);
198 prefix.push(')');
199 rest = &rest[end..];
200 }
201 let la = lower_positive_lookahead_suffix(trailing_lookahead);
202 (prefix, rest.to_string(), la)
203}
204
205fn lower_positive_lookahead_suffix(trailing_lookahead: &str) -> String {
206 let t = trailing_lookahead.trim();
207 if t.is_empty() {
208 return String::new();
209 }
210 let mut out = String::new();
211 let mut pos = 0usize;
212 while pos < t.len() {
213 while let Some(c) = t[pos..].chars().next() {
214 if c.is_whitespace() {
215 pos += c.len_utf8();
216 } else {
217 break;
218 }
219 }
220 if pos >= t.len() {
221 break;
222 }
223 if t.as_bytes().get(pos) != Some(&b'(') {
224 out.push_str(&t[pos..]);
225 break;
226 }
227 let Some(end) = balanced_paren_group_end(t, pos) else {
228 out.push_str(&t[pos..]);
229 break;
230 };
231 let group = &t[pos..end];
232 if let Some(inner) = group.strip_prefix("(?=").and_then(|g| g.strip_suffix(')')) {
233 if is_literal_lookaround_body(inner) {
234 out.push_str("(?:");
235 out.push_str(inner);
236 out.push(')');
237 } else {
238 out.push_str(group);
239 }
240 } else {
241 out.push_str(group);
242 }
243 pos = end;
244 }
245 out
246}
247
248pub fn reject_lookaround_in_strftime(type_str: &str) -> Result<(), String> {
250 let t = type_str.trim();
251 if t == "%" {
252 return Ok(());
253 }
254 if t.starts_with('%') && find_first_lookaround_start(t).is_some() {
255 return Err(
256 "Lookaround assertions are not supported with strftime (%…) format types".to_string(),
257 );
258 }
259 Ok(())
260}
261
#[cfg(test)]
mod tests {
    use super::*;

    /// A single-letter type followed by a lookahead splits at the opener.
    #[test]
    fn split_d_lookahead() {
        let (base, tail) = split_type_base_and_lookaround_tail("d(?=px)");
        assert_eq!((base, tail), ("d", "(?=px)"));
    }

    /// A custom type name followed by a lookahead splits at the opener.
    #[test]
    fn split_custom_lookahead() {
        let (base, tail) = split_type_base_and_lookaround_tail("MyType(?=x)");
        assert_eq!((base, tail), ("MyType", "(?=x)"));
    }

    /// A strftime-style type with an embedded lookaround is rejected with a telling message.
    #[test]
    fn strftime_rejects_embedded_lookaround() {
        let message = reject_lookaround_in_strftime("%Y(?=x)").unwrap_err();
        assert!(message.contains("strftime"), "{}", message);
    }

    /// Lookbehind and lookahead groups end up in their respective output strings.
    #[test]
    fn parse_tail_orders_lb_then_la() {
        let (behind, ahead) = parse_lookaround_tail(r"(?<=\$)(?=px)").unwrap();
        assert!(behind.starts_with("(?<="));
        assert!(ahead.starts_with("(?="));
    }

    /// The regex engine compiles lookahead, lookbehind and combined patterns.
    #[test]
    fn regex_engine_accepts_issue_examples() {
        let cases = [
            (r"\d+(?=px)", "lookahead"),
            (r"(?<=\$)\d+", "lookbehind"),
            (r"(?<=\$)\d+(?=px)", "combined"),
        ];
        for (pattern, label) in cases {
            Regex::new(pattern).expect(label);
        }
    }

    /// A capturing group inside a lookaround is refused.
    #[test]
    fn reject_capture_inside_lookaround() {
        let message = parse_lookaround_tail(r"(?=([0-9]))").unwrap_err();
        assert!(message.contains("capturing"));
    }

    /// Literal positive lookbehind/lookahead are lowered; non-literal lookahead is kept.
    #[test]
    fn rewrite_lowers_literal_positive_lb_and_la() {
        // The lookahead body "(?:px)" contains parentheses, so it is not literal and stays.
        let (prefix, body, lookahead) =
            rewrite_field_fragments_for_engine_anchor(r"(?<=\$)\d+", "(?=(?:px))");
        assert_eq!(prefix, r"(?:\$)");
        assert_eq!(body, r"\d+");
        assert_eq!(lookahead, "(?=(?:px))");

        // A plain literal lookahead body is lowered to a non-capturing group.
        let (prefix, body, lookahead) =
            rewrite_field_fragments_for_engine_anchor(r"(?<=\$)\d+", "(?=px)");
        assert_eq!(prefix, r"(?:\$)");
        assert_eq!(body, r"\d+");
        assert_eq!(lookahead, "(?:px)");
    }
}