nu_command/strings/
parse.rs

1use fancy_regex::{Captures, Regex, RegexBuilder};
2use nu_engine::command_prelude::*;
3use nu_protocol::{ListStream, Signals, engine::StateWorkingSet};
4use std::collections::VecDeque;
5
/// The `parse` command: extracts columns from string input using either a simple
/// `{column}` pattern or a full regular expression.
#[derive(Clone)]
pub struct Parse;
8
9impl Command for Parse {
10    fn name(&self) -> &str {
11        "parse"
12    }
13
14    fn description(&self) -> &str {
15        "Parse columns from string data using a simple pattern or a supplied regular expression."
16    }
17
18    fn search_terms(&self) -> Vec<&str> {
19        vec!["pattern", "match", "regex", "str extract"]
20    }
21
22    fn extra_description(&self) -> &str {
23        "The parse command always uses regular expressions even when you use a simple pattern. If a simple pattern is supplied, parse will transform that pattern into a regular expression."
24    }
25
26    fn signature(&self) -> nu_protocol::Signature {
27        Signature::build("parse")
28            .required("pattern", SyntaxShape::String, "The pattern to match.")
29            .input_output_types(vec![
30                (Type::String, Type::table()),
31                (Type::List(Box::new(Type::Any)), Type::table()),
32            ])
33            .switch("regex", "use full regex syntax for patterns", Some('r'))
34            .named(
35                "backtrack",
36                SyntaxShape::Int,
37                "set the max backtrack limit for regex",
38                Some('b'),
39            )
40            .allow_variants_without_examples(true)
41            .category(Category::Strings)
42    }
43
44    fn examples(&self) -> Vec<Example> {
45        vec![
46            Example {
47                description: "Parse a string into two named columns",
48                example: "\"hi there\" | parse \"{foo} {bar}\"",
49                result: Some(Value::test_list(vec![Value::test_record(record! {
50                    "foo" => Value::test_string("hi"),
51                    "bar" => Value::test_string("there"),
52                })])),
53            },
54            Example {
55                description: "Parse a string, ignoring a column with _",
56                example: "\"hello world\" | parse \"{foo} {_}\"",
57                result: Some(Value::test_list(vec![Value::test_record(record! {
58                    "foo" => Value::test_string("hello"),
59                })])),
60            },
61            Example {
62                description: "This is how the first example is interpreted in the source code",
63                example: "\"hi there\" | parse --regex '(?s)\\A(?P<foo>.*?) (?P<bar>.*?)\\z'",
64                result: Some(Value::test_list(vec![Value::test_record(record! {
65                    "foo" => Value::test_string("hi"),
66                    "bar" => Value::test_string("there"),
67                })])),
68            },
69            Example {
70                description: "Parse a string using fancy-regex named capture group pattern",
71                example: "\"foo bar.\" | parse --regex '\\s*(?<name>\\w+)(?=\\.)'",
72                result: Some(Value::test_list(vec![Value::test_record(record! {
73                    "name" => Value::test_string("bar"),
74                })])),
75            },
76            Example {
77                description: "Parse a string using fancy-regex capture group pattern",
78                example: "\"foo! bar.\" | parse --regex '(\\w+)(?=\\.)|(\\w+)(?=!)'",
79                result: Some(Value::test_list(vec![
80                    Value::test_record(record! {
81                        "capture0" => Value::test_nothing(),
82                        "capture1" => Value::test_string("foo"),
83                    }),
84                    Value::test_record(record! {
85                        "capture0" => Value::test_string("bar"),
86                        "capture1" => Value::test_nothing(),
87                    }),
88                ])),
89            },
90            Example {
91                description: "Parse a string using fancy-regex look behind pattern",
92                example: "\" @another(foo bar)   \" | parse --regex '\\s*(?<=[() ])(@\\w+)(\\([^)]*\\))?\\s*'",
93                result: Some(Value::test_list(vec![Value::test_record(record! {
94                    "capture0" => Value::test_string("@another"),
95                    "capture1" => Value::test_string("(foo bar)"),
96                })])),
97            },
98            Example {
99                description: "Parse a string using fancy-regex look ahead atomic group pattern",
100                example: "\"abcd\" | parse --regex '^a(bc(?=d)|b)cd$'",
101                result: Some(Value::test_list(vec![Value::test_record(record! {
102                    "capture0" => Value::test_string("b"),
103                })])),
104            },
105            Example {
106                description: "Parse a string with a manually set fancy-regex backtrack limit",
107                example: "\"hi there\" | parse --backtrack 1500000 \"{foo} {bar}\"",
108                result: Some(Value::test_list(vec![Value::test_record(record! {
109                    "foo" => Value::test_string("hi"),
110                    "bar" => Value::test_string("there"),
111                })])),
112            },
113        ]
114    }
115
116    fn is_const(&self) -> bool {
117        true
118    }
119
120    fn run(
121        &self,
122        engine_state: &EngineState,
123        stack: &mut Stack,
124        call: &Call,
125        input: PipelineData,
126    ) -> Result<PipelineData, ShellError> {
127        let pattern: Spanned<String> = call.req(engine_state, stack, 0)?;
128        let regex: bool = call.has_flag(engine_state, stack, "regex")?;
129        let backtrack_limit: usize = call
130            .get_flag(engine_state, stack, "backtrack")?
131            .unwrap_or(1_000_000); // 1_000_000 is fancy_regex default
132        operate(engine_state, pattern, regex, backtrack_limit, call, input)
133    }
134
135    fn run_const(
136        &self,
137        working_set: &StateWorkingSet,
138        call: &Call,
139        input: PipelineData,
140    ) -> Result<PipelineData, ShellError> {
141        let pattern: Spanned<String> = call.req_const(working_set, 0)?;
142        let regex: bool = call.has_flag_const(working_set, "regex")?;
143        let backtrack_limit: usize = call
144            .get_flag_const(working_set, "backtrack")?
145            .unwrap_or(1_000_000);
146        operate(
147            working_set.permanent(),
148            pattern,
149            regex,
150            backtrack_limit,
151            call,
152            input,
153        )
154    }
155}
156
157fn operate(
158    engine_state: &EngineState,
159    pattern: Spanned<String>,
160    regex: bool,
161    backtrack_limit: usize,
162    call: &Call,
163    input: PipelineData,
164) -> Result<PipelineData, ShellError> {
165    let head = call.head;
166
167    let pattern_item = pattern.item;
168    let pattern_span = pattern.span;
169
170    let item_to_parse = if regex {
171        pattern_item
172    } else {
173        build_regex(&pattern_item, pattern_span)?
174    };
175
176    let regex = RegexBuilder::new(&item_to_parse)
177        .backtrack_limit(backtrack_limit)
178        .build()
179        .map_err(|e| ShellError::GenericError {
180            error: "Error with regular expression".into(),
181            msg: e.to_string(),
182            span: Some(pattern_span),
183            help: None,
184            inner: vec![],
185        })?;
186
187    let columns = regex
188        .capture_names()
189        .skip(1)
190        .enumerate()
191        .map(|(i, name)| {
192            name.map(String::from)
193                .unwrap_or_else(|| format!("capture{i}"))
194        })
195        .collect::<Vec<_>>();
196
197    match input {
198        PipelineData::Empty => Ok(PipelineData::Empty),
199        PipelineData::Value(value, ..) => match value {
200            Value::String { val, .. } => {
201                let captures = regex
202                    .captures_iter(&val)
203                    .map(|captures| captures_to_value(captures, &columns, head))
204                    .collect::<Result<_, _>>()?;
205
206                Ok(Value::list(captures, head).into_pipeline_data())
207            }
208            Value::List { vals, .. } => {
209                let iter = vals.into_iter().map(move |val| {
210                    let span = val.span();
211                    let type_ = val.get_type();
212                    val.into_string()
213                        .map_err(|_| ShellError::OnlySupportsThisInputType {
214                            exp_input_type: "string".into(),
215                            wrong_type: type_.to_string(),
216                            dst_span: head,
217                            src_span: span,
218                        })
219                });
220
221                let iter = ParseIter {
222                    captures: VecDeque::new(),
223                    regex,
224                    columns,
225                    iter,
226                    span: head,
227                    signals: engine_state.signals().clone(),
228                };
229
230                Ok(ListStream::new(iter, head, Signals::empty()).into())
231            }
232            value => Err(ShellError::OnlySupportsThisInputType {
233                exp_input_type: "string".into(),
234                wrong_type: value.get_type().to_string(),
235                dst_span: head,
236                src_span: value.span(),
237            }),
238        },
239        PipelineData::ListStream(stream, ..) => Ok(stream
240            .modify(|stream| {
241                let iter = stream.map(move |val| {
242                    let span = val.span();
243                    val.into_string().map_err(|_| ShellError::PipelineMismatch {
244                        exp_input_type: "string".into(),
245                        dst_span: head,
246                        src_span: span,
247                    })
248                });
249
250                ParseIter {
251                    captures: VecDeque::new(),
252                    regex,
253                    columns,
254                    iter,
255                    span: head,
256                    signals: engine_state.signals().clone(),
257                }
258            })
259            .into()),
260        PipelineData::ByteStream(stream, ..) => {
261            if let Some(lines) = stream.lines() {
262                let iter = ParseIter {
263                    captures: VecDeque::new(),
264                    regex,
265                    columns,
266                    iter: lines,
267                    span: head,
268                    signals: engine_state.signals().clone(),
269                };
270
271                Ok(ListStream::new(iter, head, Signals::empty()).into())
272            } else {
273                Ok(PipelineData::Empty)
274            }
275        }
276    }
277}
278
279fn build_regex(input: &str, span: Span) -> Result<String, ShellError> {
280    let mut output = "(?s)\\A".to_string();
281
282    let mut loop_input = input.chars().peekable();
283    loop {
284        let mut before = String::new();
285        while let Some(c) = loop_input.next() {
286            if c == '{' {
287                // If '{{', still creating a plaintext parse command, but just for a single '{' char
288                if loop_input.peek() == Some(&'{') {
289                    let _ = loop_input.next();
290                } else {
291                    break;
292                }
293            }
294            before.push(c);
295        }
296
297        if !before.is_empty() {
298            output.push_str(&fancy_regex::escape(&before));
299        }
300
301        // Look for column as we're now at one
302        let mut column = String::new();
303        while let Some(c) = loop_input.next() {
304            if c == '}' {
305                break;
306            }
307            column.push(c);
308
309            if loop_input.peek().is_none() {
310                return Err(ShellError::DelimiterError {
311                    msg: "Found opening `{` without an associated closing `}`".to_owned(),
312                    span,
313                });
314            }
315        }
316
317        if !column.is_empty() {
318            output.push_str("(?");
319            if column == "_" {
320                // discard placeholder column(s)
321                output.push(':');
322            } else {
323                // create capture group for column
324                output.push_str("P<");
325                output.push_str(&column);
326                output.push('>');
327            }
328            output.push_str(".*?)");
329        }
330
331        if before.is_empty() && column.is_empty() {
332            break;
333        }
334    }
335
336    output.push_str("\\z");
337    Ok(output)
338}
339
340struct ParseIter<I: Iterator<Item = Result<String, ShellError>>> {
341    captures: VecDeque<Value>,
342    regex: Regex,
343    columns: Vec<String>,
344    iter: I,
345    span: Span,
346    signals: Signals,
347}
348
349impl<I: Iterator<Item = Result<String, ShellError>>> ParseIter<I> {
350    fn populate_captures(&mut self, str: &str) -> Result<(), ShellError> {
351        for captures in self.regex.captures_iter(str) {
352            self.captures
353                .push_back(captures_to_value(captures, &self.columns, self.span)?);
354        }
355        Ok(())
356    }
357}
358
359impl<I: Iterator<Item = Result<String, ShellError>>> Iterator for ParseIter<I> {
360    type Item = Value;
361
362    fn next(&mut self) -> Option<Value> {
363        loop {
364            if self.signals.interrupted() {
365                return None;
366            }
367
368            if let Some(val) = self.captures.pop_front() {
369                return Some(val);
370            }
371
372            let result = self
373                .iter
374                .next()?
375                .and_then(|str| self.populate_captures(&str));
376
377            if let Err(err) = result {
378                return Some(Value::error(err, self.span));
379            }
380        }
381    }
382}
383
384fn captures_to_value(
385    captures: Result<Captures, fancy_regex::Error>,
386    columns: &[String],
387    span: Span,
388) -> Result<Value, ShellError> {
389    let captures = captures.map_err(|err| ShellError::GenericError {
390        error: "Error with regular expression captures".into(),
391        msg: err.to_string(),
392        span: Some(span),
393        help: None,
394        inner: vec![],
395    })?;
396
397    let record = columns
398        .iter()
399        .zip(captures.iter().skip(1))
400        .map(|(column, match_)| {
401            let match_value = match_
402                .map(|m| Value::string(m.as_str(), span))
403                .unwrap_or(Value::nothing(span));
404            (column.clone(), match_value)
405        })
406        .collect();
407
408    Ok(Value::record(record, span))
409}
410
#[cfg(test)]
mod test {
    use super::Parse;

    /// Passes the `Parse` command to `crate::test_examples`.
    #[test]
    fn test_examples() {
        crate::test_examples(Parse)
    }
}