Skip to main content

nu_command/strings/
parse.rs

1use fancy_regex::{Captures, Regex, RegexBuilder};
2use nu_engine::command_prelude::*;
3use nu_protocol::shell_error::generic::GenericError;
4use nu_protocol::{ListStream, Signals, engine::StateWorkingSet};
5use std::collections::VecDeque;
6
7#[derive(Clone)]
8pub struct Parse;
9
10impl Command for Parse {
11    fn name(&self) -> &str {
12        "parse"
13    }
14
15    fn description(&self) -> &str {
16        "Parse columns from string data using a simple pattern or a supplied regular expression."
17    }
18
19    fn search_terms(&self) -> Vec<&str> {
20        vec!["pattern", "match", "regex", "str extract"]
21    }
22
23    fn extra_description(&self) -> &str {
24        "The parse command always uses regular expressions even when you use a simple pattern. If a simple pattern is supplied, parse will transform that pattern into a regular expression."
25    }
26
27    fn signature(&self) -> nu_protocol::Signature {
28        Signature::build("parse")
29            .required("pattern", SyntaxShape::String, "The pattern to match.")
30            .input_output_types(vec![
31                (Type::String, Type::table()),
32                (Type::List(Box::new(Type::Any)), Type::table()),
33            ])
34            .switch("regex", "Use full regex syntax for patterns.", Some('r'))
35            .named(
36                "backtrack",
37                SyntaxShape::Int,
38                "Set the max backtrack limit for regex.",
39                Some('b'),
40            )
41            .allow_variants_without_examples(true)
42            .category(Category::Strings)
43    }
44
45    fn examples(&self) -> Vec<Example<'_>> {
46        vec![
47            Example {
48                description: "Parse a string into two named columns.",
49                example: "\"hi there\" | parse \"{foo} {bar}\"",
50                result: Some(Value::test_list(vec![Value::test_record(record! {
51                    "foo" => Value::test_string("hi"),
52                    "bar" => Value::test_string("there"),
53                })])),
54            },
55            Example {
56                description: "Parse a string, ignoring a column with _.",
57                example: "\"hello world\" | parse \"{foo} {_}\"",
58                result: Some(Value::test_list(vec![Value::test_record(record! {
59                    "foo" => Value::test_string("hello"),
60                })])),
61            },
62            Example {
63                description: "This is how the first example is interpreted in the source code.",
64                example: "\"hi there\" | parse --regex '(?s)\\A(?P<foo>.*?) (?P<bar>.*?)\\z'",
65                result: Some(Value::test_list(vec![Value::test_record(record! {
66                    "foo" => Value::test_string("hi"),
67                    "bar" => Value::test_string("there"),
68                })])),
69            },
70            Example {
71                description: "Parse a string using fancy-regex named capture group pattern.",
72                example: "\"foo bar.\" | parse --regex '\\s*(?<name>\\w+)(?=\\.)'",
73                result: Some(Value::test_list(vec![Value::test_record(record! {
74                    "name" => Value::test_string("bar"),
75                })])),
76            },
77            Example {
78                description: "Parse a string using fancy-regex capture group pattern.",
79                example: "\"foo! bar.\" | parse --regex '(\\w+)(?=\\.)|(\\w+)(?=!)'",
80                result: Some(Value::test_list(vec![
81                    Value::test_record(record! {
82                        "capture0" => Value::test_nothing(),
83                        "capture1" => Value::test_string("foo"),
84                    }),
85                    Value::test_record(record! {
86                        "capture0" => Value::test_string("bar"),
87                        "capture1" => Value::test_nothing(),
88                    }),
89                ])),
90            },
91            Example {
92                description: "Parse a string using fancy-regex look behind pattern.",
93                example: "\" @another(foo bar)   \" | parse --regex '\\s*(?<=[() ])(@\\w+)(\\([^)]*\\))?\\s*'",
94                result: Some(Value::test_list(vec![Value::test_record(record! {
95                    "capture0" => Value::test_string("@another"),
96                    "capture1" => Value::test_string("(foo bar)"),
97                })])),
98            },
99            Example {
100                description: "Parse a string using fancy-regex look ahead atomic group pattern.",
101                example: "\"abcd\" | parse --regex '^a(bc(?=d)|b)cd$'",
102                result: Some(Value::test_list(vec![Value::test_record(record! {
103                    "capture0" => Value::test_string("b"),
104                })])),
105            },
106            Example {
107                description: "Parse a string with a manually set fancy-regex backtrack limit.",
108                example: "\"hi there\" | parse --backtrack 1500000 \"{foo} {bar}\"",
109                result: Some(Value::test_list(vec![Value::test_record(record! {
110                    "foo" => Value::test_string("hi"),
111                    "bar" => Value::test_string("there"),
112                })])),
113            },
114        ]
115    }
116
117    fn is_const(&self) -> bool {
118        true
119    }
120
121    fn run(
122        &self,
123        engine_state: &EngineState,
124        stack: &mut Stack,
125        call: &Call,
126        input: PipelineData,
127    ) -> Result<PipelineData, ShellError> {
128        let pattern: Spanned<String> = call.req(engine_state, stack, 0)?;
129        let regex: bool = call.has_flag(engine_state, stack, "regex")?;
130        let backtrack_limit: usize = call
131            .get_flag(engine_state, stack, "backtrack")?
132            .unwrap_or(1_000_000); // 1_000_000 is fancy_regex default
133        operate(engine_state, pattern, regex, backtrack_limit, call, input)
134    }
135
136    fn run_const(
137        &self,
138        working_set: &StateWorkingSet,
139        call: &Call,
140        input: PipelineData,
141    ) -> Result<PipelineData, ShellError> {
142        let pattern: Spanned<String> = call.req_const(working_set, 0)?;
143        let regex: bool = call.has_flag_const(working_set, "regex")?;
144        let backtrack_limit: usize = call
145            .get_flag_const(working_set, "backtrack")?
146            .unwrap_or(1_000_000);
147        operate(
148            working_set.permanent(),
149            pattern,
150            regex,
151            backtrack_limit,
152            call,
153            input,
154        )
155    }
156}
157
158fn operate(
159    engine_state: &EngineState,
160    pattern: Spanned<String>,
161    regex: bool,
162    backtrack_limit: usize,
163    call: &Call,
164    input: PipelineData,
165) -> Result<PipelineData, ShellError> {
166    let head = call.head;
167
168    let pattern_item = pattern.item;
169    let pattern_span = pattern.span;
170
171    let item_to_parse = if regex {
172        pattern_item
173    } else {
174        build_regex(&pattern_item, pattern_span)?
175    };
176
177    let regex = RegexBuilder::new(&item_to_parse)
178        .backtrack_limit(backtrack_limit)
179        .build()
180        .map_err(|e| {
181            ShellError::Generic(GenericError::new(
182                "Error with regular expression",
183                e.to_string(),
184                pattern_span,
185            ))
186        })?;
187
188    let columns = regex
189        .capture_names()
190        .skip(1)
191        .enumerate()
192        .map(|(i, name)| {
193            name.map(String::from)
194                .unwrap_or_else(|| format!("capture{i}"))
195        })
196        .collect::<Vec<_>>();
197
198    match input {
199        PipelineData::Empty => Ok(PipelineData::empty()),
200        PipelineData::Value(value, ..) => match value {
201            Value::String { val, .. } => {
202                let captures = regex
203                    .captures_iter(&val)
204                    .map(|captures| captures_to_value(captures, &columns, head))
205                    .collect::<Result<_, _>>()?;
206
207                Ok(Value::list(captures, head).into_pipeline_data())
208            }
209            Value::List { vals, .. } => {
210                let iter = vals.into_iter().map(move |val| {
211                    let span = val.span();
212                    let type_ = val.get_type();
213                    val.into_string()
214                        .map_err(|_| ShellError::OnlySupportsThisInputType {
215                            exp_input_type: "string".into(),
216                            wrong_type: type_.to_string(),
217                            dst_span: head,
218                            src_span: span,
219                        })
220                });
221
222                let iter = ParseIter {
223                    captures: VecDeque::new(),
224                    regex,
225                    columns,
226                    iter,
227                    span: head,
228                    signals: engine_state.signals().clone(),
229                };
230
231                Ok(ListStream::new(iter, head, Signals::empty()).into())
232            }
233            value => Err(ShellError::OnlySupportsThisInputType {
234                exp_input_type: "string".into(),
235                wrong_type: value.get_type().to_string(),
236                dst_span: head,
237                src_span: value.span(),
238            }),
239        },
240        PipelineData::ListStream(stream, ..) => Ok(stream
241            .modify(|stream| {
242                let iter = stream.map(move |val| {
243                    let span = val.span();
244                    val.into_string().map_err(|_| ShellError::PipelineMismatch {
245                        exp_input_type: "string".into(),
246                        dst_span: head,
247                        src_span: span,
248                    })
249                });
250
251                ParseIter {
252                    captures: VecDeque::new(),
253                    regex,
254                    columns,
255                    iter,
256                    span: head,
257                    signals: engine_state.signals().clone(),
258                }
259            })
260            .into()),
261        PipelineData::ByteStream(stream, ..) => {
262            if let Some(lines) = stream.lines() {
263                let iter = ParseIter {
264                    captures: VecDeque::new(),
265                    regex,
266                    columns,
267                    iter: lines,
268                    span: head,
269                    signals: engine_state.signals().clone(),
270                };
271
272                Ok(ListStream::new(iter, head, Signals::empty()).into())
273            } else {
274                Ok(PipelineData::empty())
275            }
276        }
277    }
278}
279
280fn build_regex(input: &str, span: Span) -> Result<String, ShellError> {
281    let mut output = "(?s)\\A".to_string();
282
283    // Single-pass scanner keeps parsing state explicit and avoids byte-offset bookkeeping.
284    let mut loop_input = input.char_indices().peekable();
285    let mut before = String::new();
286    let mut column = String::new();
287    let mut in_column = false;
288
289    while let Some((_, c)) = loop_input.next() {
290        if !in_column {
291            if c == '{' {
292                // If '{{', still creating a plaintext parse command, but just for a single '{' char.
293                let mut literal_lbrace = false;
294                if let Some((next_idx, '{')) = loop_input.peek().copied() {
295                    // Don't consume the second `{` if it starts a trailing capture like `{{name}`.
296                    let after = &input[next_idx + 1..];
297                    literal_lbrace = true;
298
299                    if !is_trailing_capture(after) {
300                        loop_input.next();
301                    }
302                }
303
304                if literal_lbrace {
305                    before.push(c);
306                    continue;
307                }
308
309                if !before.is_empty() {
310                    output.push_str(&fancy_regex::escape(&before));
311                    before.clear();
312                }
313
314                in_column = true;
315                continue;
316            }
317
318            before.push(c);
319            continue;
320        }
321
322        if c == '}' {
323            if !column.is_empty() {
324                output.push_str("(?");
325                if column == "_" {
326                    // discard placeholder column(s)
327                    output.push(':');
328                } else {
329                    // create capture group for column
330                    output.push_str("P<");
331                    output.push_str(&column);
332                    output.push('>');
333                }
334                output.push_str(".*?)");
335                column.clear();
336            }
337
338            in_column = false;
339            continue;
340        }
341
342        column.push(c);
343        if loop_input.peek().is_none() {
344            return Err(ShellError::DelimiterError {
345                msg: "Found opening `{` without an associated closing `}`".to_owned(),
346                span,
347            });
348        }
349    }
350
351    if !before.is_empty() {
352        output.push_str(&fancy_regex::escape(&before));
353    }
354
355    output.push_str("\\z");
356    Ok(output)
357}
358
/// Tells whether `after` (the text following the second `{` of a `{{`) is a trailing capture:
/// a run of non-brace characters closed by a `}` that is the very last character.
///
/// `name}` (from `{{name}`) qualifies; `name}x{tail}` does not, and neither does text without
/// a closing brace.
fn is_trailing_capture(after: &str) -> bool {
    match after.strip_suffix('}') {
        // The final `}` must also be the first brace of either kind.
        Some(name) => !name.contains(['{', '}']),
        None => false,
    }
}
367
368struct ParseIter<I: Iterator<Item = Result<String, ShellError>>> {
369    captures: VecDeque<Value>,
370    regex: Regex,
371    columns: Vec<String>,
372    iter: I,
373    span: Span,
374    signals: Signals,
375}
376
377impl<I: Iterator<Item = Result<String, ShellError>>> ParseIter<I> {
378    fn populate_captures(&mut self, str: &str) -> Result<(), ShellError> {
379        for captures in self.regex.captures_iter(str) {
380            self.captures
381                .push_back(captures_to_value(captures, &self.columns, self.span)?);
382        }
383        Ok(())
384    }
385}
386
387impl<I: Iterator<Item = Result<String, ShellError>>> Iterator for ParseIter<I> {
388    type Item = Value;
389
390    fn next(&mut self) -> Option<Value> {
391        loop {
392            if self.signals.interrupted() {
393                return None;
394            }
395
396            if let Some(val) = self.captures.pop_front() {
397                return Some(val);
398            }
399
400            let result = self
401                .iter
402                .next()?
403                .and_then(|str| self.populate_captures(&str));
404
405            if let Err(err) = result {
406                return Some(Value::error(err, self.span));
407            }
408        }
409    }
410}
411
412fn captures_to_value(
413    captures: Result<Captures, fancy_regex::Error>,
414    columns: &[String],
415    span: Span,
416) -> Result<Value, ShellError> {
417    let captures = captures.map_err(|err| {
418        ShellError::Generic(GenericError::new(
419            "Error with regular expression captures",
420            err.to_string(),
421            span,
422        ))
423    })?;
424
425    let record = columns
426        .iter()
427        .zip(captures.iter().skip(1))
428        .map(|(column, match_)| {
429            let match_value = match_
430                .map(|m| Value::string(m.as_str(), span))
431                .unwrap_or(Value::nothing(span));
432            (column.clone(), match_value)
433        })
434        .collect();
435
436    Ok(Value::record(record, span))
437}
438
#[cfg(test)]
mod test {
    use super::*;

    // Hands `Parse` to the `nu_test_support` examples check.
    // NOTE(review): presumably this runs each entry of `Parse::examples()` and compares the
    // output with its `result` — confirm against `nu_test_support`.
    #[test]
    fn test_examples() -> nu_test_support::Result {
        nu_test_support::test().examples(Parse)
    }
}