nu_command/strings/
parse.rs

1use fancy_regex::{Captures, Regex};
2use nu_engine::command_prelude::*;
3use nu_protocol::{ListStream, Signals, engine::StateWorkingSet};
4use std::collections::VecDeque;
5
/// The `parse` command: extracts columns from string input using either a simple `{column}`
/// pattern or, with `--regex`, a full regular expression.
#[derive(Clone)]
pub struct Parse;
8
9impl Command for Parse {
10    fn name(&self) -> &str {
11        "parse"
12    }
13
14    fn description(&self) -> &str {
15        "Parse columns from string data using a simple pattern or a supplied regular expression."
16    }
17
18    fn search_terms(&self) -> Vec<&str> {
19        vec!["pattern", "match", "regex", "str extract"]
20    }
21
22    fn extra_description(&self) -> &str {
23        "The parse command always uses regular expressions even when you use a simple pattern. If a simple pattern is supplied, parse will transform that pattern into a regular expression."
24    }
25
26    fn signature(&self) -> nu_protocol::Signature {
27        Signature::build("parse")
28            .required("pattern", SyntaxShape::String, "The pattern to match.")
29            .input_output_types(vec![
30                (Type::String, Type::table()),
31                (Type::List(Box::new(Type::Any)), Type::table()),
32            ])
33            .switch("regex", "use full regex syntax for patterns", Some('r'))
34            .allow_variants_without_examples(true)
35            .category(Category::Strings)
36    }
37
38    fn examples(&self) -> Vec<Example> {
39        vec![
40            Example {
41                description: "Parse a string into two named columns",
42                example: "\"hi there\" | parse \"{foo} {bar}\"",
43                result: Some(Value::test_list(vec![Value::test_record(record! {
44                    "foo" => Value::test_string("hi"),
45                    "bar" => Value::test_string("there"),
46                })])),
47            },
48            Example {
49                description: "Parse a string, ignoring a column with _",
50                example: "\"hello world\" | parse \"{foo} {_}\"",
51                result: Some(Value::test_list(vec![Value::test_record(record! {
52                    "foo" => Value::test_string("hello"),
53                })])),
54            },
55            Example {
56                description: "This is how the first example is interpreted in the source code",
57                example: "\"hi there\" | parse --regex '(?s)\\A(?P<foo>.*?) (?P<bar>.*?)\\z'",
58                result: Some(Value::test_list(vec![Value::test_record(record! {
59                    "foo" => Value::test_string("hi"),
60                    "bar" => Value::test_string("there"),
61                })])),
62            },
63            Example {
64                description: "Parse a string using fancy-regex named capture group pattern",
65                example: "\"foo bar.\" | parse --regex '\\s*(?<name>\\w+)(?=\\.)'",
66                result: Some(Value::test_list(vec![Value::test_record(record! {
67                    "name" => Value::test_string("bar"),
68                })])),
69            },
70            Example {
71                description: "Parse a string using fancy-regex capture group pattern",
72                example: "\"foo! bar.\" | parse --regex '(\\w+)(?=\\.)|(\\w+)(?=!)'",
73                result: Some(Value::test_list(vec![
74                    Value::test_record(record! {
75                        "capture0" => Value::test_string(""),
76                        "capture1" => Value::test_string("foo"),
77                    }),
78                    Value::test_record(record! {
79                        "capture0" => Value::test_string("bar"),
80                        "capture1" => Value::test_string(""),
81                    }),
82                ])),
83            },
84            Example {
85                description: "Parse a string using fancy-regex look behind pattern",
86                example: "\" @another(foo bar)   \" | parse --regex '\\s*(?<=[() ])(@\\w+)(\\([^)]*\\))?\\s*'",
87                result: Some(Value::test_list(vec![Value::test_record(record! {
88                    "capture0" => Value::test_string("@another"),
89                    "capture1" => Value::test_string("(foo bar)"),
90                })])),
91            },
92            Example {
93                description: "Parse a string using fancy-regex look ahead atomic group pattern",
94                example: "\"abcd\" | parse --regex '^a(bc(?=d)|b)cd$'",
95                result: Some(Value::test_list(vec![Value::test_record(record! {
96                    "capture0" => Value::test_string("b"),
97                })])),
98            },
99        ]
100    }
101
102    fn is_const(&self) -> bool {
103        true
104    }
105
106    fn run(
107        &self,
108        engine_state: &EngineState,
109        stack: &mut Stack,
110        call: &Call,
111        input: PipelineData,
112    ) -> Result<PipelineData, ShellError> {
113        let pattern: Spanned<String> = call.req(engine_state, stack, 0)?;
114        let regex: bool = call.has_flag(engine_state, stack, "regex")?;
115        operate(engine_state, pattern, regex, call, input)
116    }
117
118    fn run_const(
119        &self,
120        working_set: &StateWorkingSet,
121        call: &Call,
122        input: PipelineData,
123    ) -> Result<PipelineData, ShellError> {
124        let pattern: Spanned<String> = call.req_const(working_set, 0)?;
125        let regex: bool = call.has_flag_const(working_set, "regex")?;
126        operate(working_set.permanent(), pattern, regex, call, input)
127    }
128}
129
130fn operate(
131    engine_state: &EngineState,
132    pattern: Spanned<String>,
133    regex: bool,
134    call: &Call,
135    input: PipelineData,
136) -> Result<PipelineData, ShellError> {
137    let head = call.head;
138
139    let pattern_item = pattern.item;
140    let pattern_span = pattern.span;
141
142    let item_to_parse = if regex {
143        pattern_item
144    } else {
145        build_regex(&pattern_item, pattern_span)?
146    };
147
148    let regex = Regex::new(&item_to_parse).map_err(|e| ShellError::GenericError {
149        error: "Error with regular expression".into(),
150        msg: e.to_string(),
151        span: Some(pattern_span),
152        help: None,
153        inner: vec![],
154    })?;
155
156    let columns = regex
157        .capture_names()
158        .skip(1)
159        .enumerate()
160        .map(|(i, name)| {
161            name.map(String::from)
162                .unwrap_or_else(|| format!("capture{i}"))
163        })
164        .collect::<Vec<_>>();
165
166    match input {
167        PipelineData::Empty => Ok(PipelineData::Empty),
168        PipelineData::Value(value, ..) => match value {
169            Value::String { val, .. } => {
170                let captures = regex
171                    .captures_iter(&val)
172                    .map(|captures| captures_to_value(captures, &columns, head))
173                    .collect::<Result<_, _>>()?;
174
175                Ok(Value::list(captures, head).into_pipeline_data())
176            }
177            Value::List { vals, .. } => {
178                let iter = vals.into_iter().map(move |val| {
179                    let span = val.span();
180                    let type_ = val.get_type();
181                    val.into_string()
182                        .map_err(|_| ShellError::OnlySupportsThisInputType {
183                            exp_input_type: "string".into(),
184                            wrong_type: type_.to_string(),
185                            dst_span: head,
186                            src_span: span,
187                        })
188                });
189
190                let iter = ParseIter {
191                    captures: VecDeque::new(),
192                    regex,
193                    columns,
194                    iter,
195                    span: head,
196                    signals: engine_state.signals().clone(),
197                };
198
199                Ok(ListStream::new(iter, head, Signals::empty()).into())
200            }
201            value => Err(ShellError::OnlySupportsThisInputType {
202                exp_input_type: "string".into(),
203                wrong_type: value.get_type().to_string(),
204                dst_span: head,
205                src_span: value.span(),
206            }),
207        },
208        PipelineData::ListStream(stream, ..) => Ok(stream
209            .modify(|stream| {
210                let iter = stream.map(move |val| {
211                    let span = val.span();
212                    val.into_string().map_err(|_| ShellError::PipelineMismatch {
213                        exp_input_type: "string".into(),
214                        dst_span: head,
215                        src_span: span,
216                    })
217                });
218
219                ParseIter {
220                    captures: VecDeque::new(),
221                    regex,
222                    columns,
223                    iter,
224                    span: head,
225                    signals: engine_state.signals().clone(),
226                }
227            })
228            .into()),
229        PipelineData::ByteStream(stream, ..) => {
230            if let Some(lines) = stream.lines() {
231                let iter = ParseIter {
232                    captures: VecDeque::new(),
233                    regex,
234                    columns,
235                    iter: lines,
236                    span: head,
237                    signals: engine_state.signals().clone(),
238                };
239
240                Ok(ListStream::new(iter, head, Signals::empty()).into())
241            } else {
242                Ok(PipelineData::Empty)
243            }
244        }
245    }
246}
247
248fn build_regex(input: &str, span: Span) -> Result<String, ShellError> {
249    let mut output = "(?s)\\A".to_string();
250
251    let mut loop_input = input.chars().peekable();
252    loop {
253        let mut before = String::new();
254        while let Some(c) = loop_input.next() {
255            if c == '{' {
256                // If '{{', still creating a plaintext parse command, but just for a single '{' char
257                if loop_input.peek() == Some(&'{') {
258                    let _ = loop_input.next();
259                } else {
260                    break;
261                }
262            }
263            before.push(c);
264        }
265
266        if !before.is_empty() {
267            output.push_str(&fancy_regex::escape(&before));
268        }
269
270        // Look for column as we're now at one
271        let mut column = String::new();
272        while let Some(c) = loop_input.next() {
273            if c == '}' {
274                break;
275            }
276            column.push(c);
277
278            if loop_input.peek().is_none() {
279                return Err(ShellError::DelimiterError {
280                    msg: "Found opening `{` without an associated closing `}`".to_owned(),
281                    span,
282                });
283            }
284        }
285
286        if !column.is_empty() {
287            output.push_str("(?");
288            if column == "_" {
289                // discard placeholder column(s)
290                output.push(':');
291            } else {
292                // create capture group for column
293                output.push_str("P<");
294                output.push_str(&column);
295                output.push('>');
296            }
297            output.push_str(".*?)");
298        }
299
300        if before.is_empty() && column.is_empty() {
301            break;
302        }
303    }
304
305    output.push_str("\\z");
306    Ok(output)
307}
308
309struct ParseIter<I: Iterator<Item = Result<String, ShellError>>> {
310    captures: VecDeque<Value>,
311    regex: Regex,
312    columns: Vec<String>,
313    iter: I,
314    span: Span,
315    signals: Signals,
316}
317
318impl<I: Iterator<Item = Result<String, ShellError>>> ParseIter<I> {
319    fn populate_captures(&mut self, str: &str) -> Result<(), ShellError> {
320        for captures in self.regex.captures_iter(str) {
321            self.captures
322                .push_back(captures_to_value(captures, &self.columns, self.span)?);
323        }
324        Ok(())
325    }
326}
327
328impl<I: Iterator<Item = Result<String, ShellError>>> Iterator for ParseIter<I> {
329    type Item = Value;
330
331    fn next(&mut self) -> Option<Value> {
332        loop {
333            if self.signals.interrupted() {
334                return None;
335            }
336
337            if let Some(val) = self.captures.pop_front() {
338                return Some(val);
339            }
340
341            let result = self
342                .iter
343                .next()?
344                .and_then(|str| self.populate_captures(&str));
345
346            if let Err(err) = result {
347                return Some(Value::error(err, self.span));
348            }
349        }
350    }
351}
352
353fn captures_to_value(
354    captures: Result<Captures, fancy_regex::Error>,
355    columns: &[String],
356    span: Span,
357) -> Result<Value, ShellError> {
358    let captures = captures.map_err(|err| ShellError::GenericError {
359        error: "Error with regular expression captures".into(),
360        msg: err.to_string(),
361        span: Some(span),
362        help: None,
363        inner: vec![],
364    })?;
365
366    let record = columns
367        .iter()
368        .zip(captures.iter().skip(1))
369        .map(|(column, match_)| {
370            let match_str = match_.map(|m| m.as_str()).unwrap_or("");
371            (column.clone(), Value::string(match_str, span))
372        })
373        .collect();
374
375    Ok(Value::record(record, span))
376}
377
#[cfg(test)]
mod test {
    use super::*;

    /// Passes `Parse` to the crate's `test_examples` helper
    /// (presumably runs the entries returned by `Parse::examples` — confirm in the helper).
    #[test]
    fn test_examples() {
        crate::test_examples(Parse)
    }
}
386}