nu_command/strings/
parse.rs

1use fancy_regex::{Captures, Regex};
2use nu_engine::command_prelude::*;
3use nu_protocol::{engine::StateWorkingSet, ListStream, Signals};
4use std::collections::VecDeque;
5
6#[derive(Clone)]
7pub struct Parse;
8
9impl Command for Parse {
10    fn name(&self) -> &str {
11        "parse"
12    }
13
14    fn description(&self) -> &str {
15        "Parse columns from string data using a simple pattern or a supplied regular expression."
16    }
17
18    fn search_terms(&self) -> Vec<&str> {
19        vec!["pattern", "match", "regex", "str extract"]
20    }
21
22    fn extra_description(&self) -> &str {
23        "The parse command always uses regular expressions even when you use a simple pattern. If a simple pattern is supplied, parse will transform that pattern into a regular expression."
24    }
25
26    fn signature(&self) -> nu_protocol::Signature {
27        Signature::build("parse")
28            .required("pattern", SyntaxShape::String, "The pattern to match.")
29            .input_output_types(vec![
30                (Type::String, Type::table()),
31                (Type::List(Box::new(Type::Any)), Type::table()),
32            ])
33            .switch("regex", "use full regex syntax for patterns", Some('r'))
34            .allow_variants_without_examples(true)
35            .category(Category::Strings)
36    }
37
38    fn examples(&self) -> Vec<Example> {
39        vec![
40            Example {
41                description: "Parse a string into two named columns",
42                example: "\"hi there\" | parse \"{foo} {bar}\"",
43                result: Some(Value::test_list(
44                    vec![Value::test_record(record! {
45                        "foo" => Value::test_string("hi"),
46                        "bar" => Value::test_string("there"),
47                    })])),
48            },
49            Example {
50                description: "This is how the first example is interpreted in the source code",
51                example: "\"hi there\" | parse --regex '(?s)\\A(?P<foo>.*?) (?P<bar>.*?)\\z'",
52                result: Some(Value::test_list(
53                    vec![Value::test_record(record! {
54                        "foo" => Value::test_string("hi"),
55                        "bar" => Value::test_string("there"),
56                    })])),
57            },
58            Example {
59                description: "Parse a string using fancy-regex named capture group pattern",
60                example: "\"foo bar.\" | parse --regex '\\s*(?<name>\\w+)(?=\\.)'",
61                result: Some(Value::test_list(
62                    vec![Value::test_record(record! {
63                        "name" => Value::test_string("bar"),
64                    })],
65                )),
66            },
67            Example {
68                description: "Parse a string using fancy-regex capture group pattern",
69                example: "\"foo! bar.\" | parse --regex '(\\w+)(?=\\.)|(\\w+)(?=!)'",
70                result: Some(Value::test_list(
71                    vec![
72                        Value::test_record(record! {
73                            "capture0" => Value::test_string(""),
74                            "capture1" => Value::test_string("foo"),
75                        }),
76                        Value::test_record(record! {
77                            "capture0" => Value::test_string("bar"),
78                            "capture1" => Value::test_string(""),
79                        }),
80                    ],
81                )),
82            },
83            Example {
84                description: "Parse a string using fancy-regex look behind pattern",
85                example:
86                    "\" @another(foo bar)   \" | parse --regex '\\s*(?<=[() ])(@\\w+)(\\([^)]*\\))?\\s*'",
87                result: Some(Value::test_list(
88                    vec![Value::test_record(record! {
89                        "capture0" => Value::test_string("@another"),
90                        "capture1" => Value::test_string("(foo bar)"),
91                    })],
92                )),
93            },
94            Example {
95                description: "Parse a string using fancy-regex look ahead atomic group pattern",
96                example: "\"abcd\" | parse --regex '^a(bc(?=d)|b)cd$'",
97                result: Some(Value::test_list(
98                    vec![Value::test_record(record! {
99                        "capture0" => Value::test_string("b"),
100                    })],
101                )),
102            },
103        ]
104    }
105
106    fn is_const(&self) -> bool {
107        true
108    }
109
110    fn run(
111        &self,
112        engine_state: &EngineState,
113        stack: &mut Stack,
114        call: &Call,
115        input: PipelineData,
116    ) -> Result<PipelineData, ShellError> {
117        let pattern: Spanned<String> = call.req(engine_state, stack, 0)?;
118        let regex: bool = call.has_flag(engine_state, stack, "regex")?;
119        operate(engine_state, pattern, regex, call, input)
120    }
121
122    fn run_const(
123        &self,
124        working_set: &StateWorkingSet,
125        call: &Call,
126        input: PipelineData,
127    ) -> Result<PipelineData, ShellError> {
128        let pattern: Spanned<String> = call.req_const(working_set, 0)?;
129        let regex: bool = call.has_flag_const(working_set, "regex")?;
130        operate(working_set.permanent(), pattern, regex, call, input)
131    }
132}
133
134fn operate(
135    engine_state: &EngineState,
136    pattern: Spanned<String>,
137    regex: bool,
138    call: &Call,
139    input: PipelineData,
140) -> Result<PipelineData, ShellError> {
141    let head = call.head;
142
143    let pattern_item = pattern.item;
144    let pattern_span = pattern.span;
145
146    let item_to_parse = if regex {
147        pattern_item
148    } else {
149        build_regex(&pattern_item, pattern_span)?
150    };
151
152    let regex = Regex::new(&item_to_parse).map_err(|e| ShellError::GenericError {
153        error: "Error with regular expression".into(),
154        msg: e.to_string(),
155        span: Some(pattern_span),
156        help: None,
157        inner: vec![],
158    })?;
159
160    let columns = regex
161        .capture_names()
162        .skip(1)
163        .enumerate()
164        .map(|(i, name)| {
165            name.map(String::from)
166                .unwrap_or_else(|| format!("capture{i}"))
167        })
168        .collect::<Vec<_>>();
169
170    match input {
171        PipelineData::Empty => Ok(PipelineData::Empty),
172        PipelineData::Value(value, ..) => match value {
173            Value::String { val, .. } => {
174                let captures = regex
175                    .captures_iter(&val)
176                    .map(|captures| captures_to_value(captures, &columns, head))
177                    .collect::<Result<_, _>>()?;
178
179                Ok(Value::list(captures, head).into_pipeline_data())
180            }
181            Value::List { vals, .. } => {
182                let iter = vals.into_iter().map(move |val| {
183                    let span = val.span();
184                    let type_ = val.get_type();
185                    val.into_string()
186                        .map_err(|_| ShellError::OnlySupportsThisInputType {
187                            exp_input_type: "string".into(),
188                            wrong_type: type_.to_string(),
189                            dst_span: head,
190                            src_span: span,
191                        })
192                });
193
194                let iter = ParseIter {
195                    captures: VecDeque::new(),
196                    regex,
197                    columns,
198                    iter,
199                    span: head,
200                    signals: engine_state.signals().clone(),
201                };
202
203                Ok(ListStream::new(iter, head, Signals::empty()).into())
204            }
205            value => Err(ShellError::OnlySupportsThisInputType {
206                exp_input_type: "string".into(),
207                wrong_type: value.get_type().to_string(),
208                dst_span: head,
209                src_span: value.span(),
210            }),
211        },
212        PipelineData::ListStream(stream, ..) => Ok(stream
213            .modify(|stream| {
214                let iter = stream.map(move |val| {
215                    let span = val.span();
216                    val.into_string().map_err(|_| ShellError::PipelineMismatch {
217                        exp_input_type: "string".into(),
218                        dst_span: head,
219                        src_span: span,
220                    })
221                });
222
223                ParseIter {
224                    captures: VecDeque::new(),
225                    regex,
226                    columns,
227                    iter,
228                    span: head,
229                    signals: engine_state.signals().clone(),
230                }
231            })
232            .into()),
233        PipelineData::ByteStream(stream, ..) => {
234            if let Some(lines) = stream.lines() {
235                let iter = ParseIter {
236                    captures: VecDeque::new(),
237                    regex,
238                    columns,
239                    iter: lines,
240                    span: head,
241                    signals: engine_state.signals().clone(),
242                };
243
244                Ok(ListStream::new(iter, head, Signals::empty()).into())
245            } else {
246                Ok(PipelineData::Empty)
247            }
248        }
249    }
250}
251
252fn build_regex(input: &str, span: Span) -> Result<String, ShellError> {
253    let mut output = "(?s)\\A".to_string();
254
255    let mut loop_input = input.chars().peekable();
256    loop {
257        let mut before = String::new();
258        while let Some(c) = loop_input.next() {
259            if c == '{' {
260                // If '{{', still creating a plaintext parse command, but just for a single '{' char
261                if loop_input.peek() == Some(&'{') {
262                    let _ = loop_input.next();
263                } else {
264                    break;
265                }
266            }
267            before.push(c);
268        }
269
270        if !before.is_empty() {
271            output.push_str(&fancy_regex::escape(&before));
272        }
273
274        // Look for column as we're now at one
275        let mut column = String::new();
276        while let Some(c) = loop_input.next() {
277            if c == '}' {
278                break;
279            }
280            column.push(c);
281
282            if loop_input.peek().is_none() {
283                return Err(ShellError::DelimiterError {
284                    msg: "Found opening `{` without an associated closing `}`".to_owned(),
285                    span,
286                });
287            }
288        }
289
290        if !column.is_empty() {
291            output.push_str("(?P<");
292            output.push_str(&column);
293            output.push_str(">.*?)");
294        }
295
296        if before.is_empty() && column.is_empty() {
297            break;
298        }
299    }
300
301    output.push_str("\\z");
302    Ok(output)
303}
304
305struct ParseIter<I: Iterator<Item = Result<String, ShellError>>> {
306    captures: VecDeque<Value>,
307    regex: Regex,
308    columns: Vec<String>,
309    iter: I,
310    span: Span,
311    signals: Signals,
312}
313
314impl<I: Iterator<Item = Result<String, ShellError>>> ParseIter<I> {
315    fn populate_captures(&mut self, str: &str) -> Result<(), ShellError> {
316        for captures in self.regex.captures_iter(str) {
317            self.captures
318                .push_back(captures_to_value(captures, &self.columns, self.span)?);
319        }
320        Ok(())
321    }
322}
323
324impl<I: Iterator<Item = Result<String, ShellError>>> Iterator for ParseIter<I> {
325    type Item = Value;
326
327    fn next(&mut self) -> Option<Value> {
328        loop {
329            if self.signals.interrupted() {
330                return None;
331            }
332
333            if let Some(val) = self.captures.pop_front() {
334                return Some(val);
335            }
336
337            let result = self
338                .iter
339                .next()?
340                .and_then(|str| self.populate_captures(&str));
341
342            if let Err(err) = result {
343                return Some(Value::error(err, self.span));
344            }
345        }
346    }
347}
348
349fn captures_to_value(
350    captures: Result<Captures, fancy_regex::Error>,
351    columns: &[String],
352    span: Span,
353) -> Result<Value, ShellError> {
354    let captures = captures.map_err(|err| ShellError::GenericError {
355        error: "Error with regular expression captures".into(),
356        msg: err.to_string(),
357        span: Some(span),
358        help: None,
359        inner: vec![],
360    })?;
361
362    let record = columns
363        .iter()
364        .zip(captures.iter().skip(1))
365        .map(|(column, match_)| {
366            let match_str = match_.map(|m| m.as_str()).unwrap_or("");
367            (column.clone(), Value::string(match_str, span))
368        })
369        .collect();
370
371    Ok(Value::record(record, span))
372}
373
#[cfg(test)]
mod test {
    use super::Parse;

    /// Hands the `Parse` command to `crate::test_examples`.
    #[test]
    fn test_examples() {
        crate::test_examples(Parse);
    }
}
382}