1use fancy_regex::{Captures, Regex, RegexBuilder};
2use nu_engine::command_prelude::*;
3use nu_protocol::shell_error::generic::GenericError;
4use nu_protocol::{ListStream, Signals, engine::StateWorkingSet};
5use std::collections::VecDeque;
6
/// The `parse` command: a stateless unit type that carries the command's
/// `Command` implementation.
#[derive(Clone)]
pub struct Parse;
9
10impl Command for Parse {
11 fn name(&self) -> &str {
12 "parse"
13 }
14
15 fn description(&self) -> &str {
16 "Parse columns from string data using a simple pattern or a supplied regular expression."
17 }
18
19 fn search_terms(&self) -> Vec<&str> {
20 vec!["pattern", "match", "regex", "str extract"]
21 }
22
23 fn extra_description(&self) -> &str {
24 "The parse command always uses regular expressions even when you use a simple pattern. If a simple pattern is supplied, parse will transform that pattern into a regular expression."
25 }
26
27 fn signature(&self) -> nu_protocol::Signature {
28 Signature::build("parse")
29 .required("pattern", SyntaxShape::String, "The pattern to match.")
30 .input_output_types(vec![
31 (Type::String, Type::table()),
32 (Type::List(Box::new(Type::Any)), Type::table()),
33 ])
34 .switch("regex", "Use full regex syntax for patterns.", Some('r'))
35 .named(
36 "backtrack",
37 SyntaxShape::Int,
38 "Set the max backtrack limit for regex.",
39 Some('b'),
40 )
41 .allow_variants_without_examples(true)
42 .category(Category::Strings)
43 }
44
45 fn examples(&self) -> Vec<Example<'_>> {
46 vec![
47 Example {
48 description: "Parse a string into two named columns.",
49 example: "\"hi there\" | parse \"{foo} {bar}\"",
50 result: Some(Value::test_list(vec![Value::test_record(record! {
51 "foo" => Value::test_string("hi"),
52 "bar" => Value::test_string("there"),
53 })])),
54 },
55 Example {
56 description: "Parse a string, ignoring a column with _.",
57 example: "\"hello world\" | parse \"{foo} {_}\"",
58 result: Some(Value::test_list(vec![Value::test_record(record! {
59 "foo" => Value::test_string("hello"),
60 })])),
61 },
62 Example {
63 description: "This is how the first example is interpreted in the source code.",
64 example: "\"hi there\" | parse --regex '(?s)\\A(?P<foo>.*?) (?P<bar>.*?)\\z'",
65 result: Some(Value::test_list(vec![Value::test_record(record! {
66 "foo" => Value::test_string("hi"),
67 "bar" => Value::test_string("there"),
68 })])),
69 },
70 Example {
71 description: "Parse a string using fancy-regex named capture group pattern.",
72 example: "\"foo bar.\" | parse --regex '\\s*(?<name>\\w+)(?=\\.)'",
73 result: Some(Value::test_list(vec![Value::test_record(record! {
74 "name" => Value::test_string("bar"),
75 })])),
76 },
77 Example {
78 description: "Parse a string using fancy-regex capture group pattern.",
79 example: "\"foo! bar.\" | parse --regex '(\\w+)(?=\\.)|(\\w+)(?=!)'",
80 result: Some(Value::test_list(vec![
81 Value::test_record(record! {
82 "capture0" => Value::test_nothing(),
83 "capture1" => Value::test_string("foo"),
84 }),
85 Value::test_record(record! {
86 "capture0" => Value::test_string("bar"),
87 "capture1" => Value::test_nothing(),
88 }),
89 ])),
90 },
91 Example {
92 description: "Parse a string using fancy-regex look behind pattern.",
93 example: "\" @another(foo bar) \" | parse --regex '\\s*(?<=[() ])(@\\w+)(\\([^)]*\\))?\\s*'",
94 result: Some(Value::test_list(vec![Value::test_record(record! {
95 "capture0" => Value::test_string("@another"),
96 "capture1" => Value::test_string("(foo bar)"),
97 })])),
98 },
99 Example {
100 description: "Parse a string using fancy-regex look ahead atomic group pattern.",
101 example: "\"abcd\" | parse --regex '^a(bc(?=d)|b)cd$'",
102 result: Some(Value::test_list(vec![Value::test_record(record! {
103 "capture0" => Value::test_string("b"),
104 })])),
105 },
106 Example {
107 description: "Parse a string with a manually set fancy-regex backtrack limit.",
108 example: "\"hi there\" | parse --backtrack 1500000 \"{foo} {bar}\"",
109 result: Some(Value::test_list(vec![Value::test_record(record! {
110 "foo" => Value::test_string("hi"),
111 "bar" => Value::test_string("there"),
112 })])),
113 },
114 ]
115 }
116
117 fn is_const(&self) -> bool {
118 true
119 }
120
121 fn run(
122 &self,
123 engine_state: &EngineState,
124 stack: &mut Stack,
125 call: &Call,
126 input: PipelineData,
127 ) -> Result<PipelineData, ShellError> {
128 let pattern: Spanned<String> = call.req(engine_state, stack, 0)?;
129 let regex: bool = call.has_flag(engine_state, stack, "regex")?;
130 let backtrack_limit: usize = call
131 .get_flag(engine_state, stack, "backtrack")?
132 .unwrap_or(1_000_000); operate(engine_state, pattern, regex, backtrack_limit, call, input)
134 }
135
136 fn run_const(
137 &self,
138 working_set: &StateWorkingSet,
139 call: &Call,
140 input: PipelineData,
141 ) -> Result<PipelineData, ShellError> {
142 let pattern: Spanned<String> = call.req_const(working_set, 0)?;
143 let regex: bool = call.has_flag_const(working_set, "regex")?;
144 let backtrack_limit: usize = call
145 .get_flag_const(working_set, "backtrack")?
146 .unwrap_or(1_000_000);
147 operate(
148 working_set.permanent(),
149 pattern,
150 regex,
151 backtrack_limit,
152 call,
153 input,
154 )
155 }
156}
157
158fn operate(
159 engine_state: &EngineState,
160 pattern: Spanned<String>,
161 regex: bool,
162 backtrack_limit: usize,
163 call: &Call,
164 input: PipelineData,
165) -> Result<PipelineData, ShellError> {
166 let head = call.head;
167
168 let pattern_item = pattern.item;
169 let pattern_span = pattern.span;
170
171 let item_to_parse = if regex {
172 pattern_item
173 } else {
174 build_regex(&pattern_item, pattern_span)?
175 };
176
177 let regex = RegexBuilder::new(&item_to_parse)
178 .backtrack_limit(backtrack_limit)
179 .build()
180 .map_err(|e| {
181 ShellError::Generic(GenericError::new(
182 "Error with regular expression",
183 e.to_string(),
184 pattern_span,
185 ))
186 })?;
187
188 let columns = regex
189 .capture_names()
190 .skip(1)
191 .enumerate()
192 .map(|(i, name)| {
193 name.map(String::from)
194 .unwrap_or_else(|| format!("capture{i}"))
195 })
196 .collect::<Vec<_>>();
197
198 match input {
199 PipelineData::Empty => Ok(PipelineData::empty()),
200 PipelineData::Value(value, ..) => match value {
201 Value::String { val, .. } => {
202 let captures = regex
203 .captures_iter(&val)
204 .map(|captures| captures_to_value(captures, &columns, head))
205 .collect::<Result<_, _>>()?;
206
207 Ok(Value::list(captures, head).into_pipeline_data())
208 }
209 Value::List { vals, .. } => {
210 let iter = vals.into_iter().map(move |val| {
211 let span = val.span();
212 let type_ = val.get_type();
213 val.into_string()
214 .map_err(|_| ShellError::OnlySupportsThisInputType {
215 exp_input_type: "string".into(),
216 wrong_type: type_.to_string(),
217 dst_span: head,
218 src_span: span,
219 })
220 });
221
222 let iter = ParseIter {
223 captures: VecDeque::new(),
224 regex,
225 columns,
226 iter,
227 span: head,
228 signals: engine_state.signals().clone(),
229 };
230
231 Ok(ListStream::new(iter, head, Signals::empty()).into())
232 }
233 value => Err(ShellError::OnlySupportsThisInputType {
234 exp_input_type: "string".into(),
235 wrong_type: value.get_type().to_string(),
236 dst_span: head,
237 src_span: value.span(),
238 }),
239 },
240 PipelineData::ListStream(stream, ..) => Ok(stream
241 .modify(|stream| {
242 let iter = stream.map(move |val| {
243 let span = val.span();
244 val.into_string().map_err(|_| ShellError::PipelineMismatch {
245 exp_input_type: "string".into(),
246 dst_span: head,
247 src_span: span,
248 })
249 });
250
251 ParseIter {
252 captures: VecDeque::new(),
253 regex,
254 columns,
255 iter,
256 span: head,
257 signals: engine_state.signals().clone(),
258 }
259 })
260 .into()),
261 PipelineData::ByteStream(stream, ..) => {
262 if let Some(lines) = stream.lines() {
263 let iter = ParseIter {
264 captures: VecDeque::new(),
265 regex,
266 columns,
267 iter: lines,
268 span: head,
269 signals: engine_state.signals().clone(),
270 };
271
272 Ok(ListStream::new(iter, head, Signals::empty()).into())
273 } else {
274 Ok(PipelineData::empty())
275 }
276 }
277 }
278}
279
280fn build_regex(input: &str, span: Span) -> Result<String, ShellError> {
281 let mut output = "(?s)\\A".to_string();
282
283 let mut loop_input = input.char_indices().peekable();
285 let mut before = String::new();
286 let mut column = String::new();
287 let mut in_column = false;
288
289 while let Some((_, c)) = loop_input.next() {
290 if !in_column {
291 if c == '{' {
292 let mut literal_lbrace = false;
294 if let Some((next_idx, '{')) = loop_input.peek().copied() {
295 let after = &input[next_idx + 1..];
297 literal_lbrace = true;
298
299 if !is_trailing_capture(after) {
300 loop_input.next();
301 }
302 }
303
304 if literal_lbrace {
305 before.push(c);
306 continue;
307 }
308
309 if !before.is_empty() {
310 output.push_str(&fancy_regex::escape(&before));
311 before.clear();
312 }
313
314 in_column = true;
315 continue;
316 }
317
318 before.push(c);
319 continue;
320 }
321
322 if c == '}' {
323 if !column.is_empty() {
324 output.push_str("(?");
325 if column == "_" {
326 output.push(':');
328 } else {
329 output.push_str("P<");
331 output.push_str(&column);
332 output.push('>');
333 }
334 output.push_str(".*?)");
335 column.clear();
336 }
337
338 in_column = false;
339 continue;
340 }
341
342 column.push(c);
343 if loop_input.peek().is_none() {
344 return Err(ShellError::DelimiterError {
345 msg: "Found opening `{` without an associated closing `}`".to_owned(),
346 span,
347 });
348 }
349 }
350
351 if !before.is_empty() {
352 output.push_str(&fancy_regex::escape(&before));
353 }
354
355 output.push_str("\\z");
356 Ok(output)
357}
358
/// Reports whether `after` consists of brace-free text followed by exactly one
/// closing `}` as its final character.
///
/// Returns `false` when `after` contains no brace at all, when the first
/// brace is `{`, or when anything follows the first `}`.
fn is_trailing_capture(after: &str) -> bool {
    match after.find(['}', '{']) {
        // The remainder starting at the first brace must be just "}".
        Some(first_brace) => &after[first_brace..] == "}",
        None => false,
    }
}
367
368struct ParseIter<I: Iterator<Item = Result<String, ShellError>>> {
369 captures: VecDeque<Value>,
370 regex: Regex,
371 columns: Vec<String>,
372 iter: I,
373 span: Span,
374 signals: Signals,
375}
376
377impl<I: Iterator<Item = Result<String, ShellError>>> ParseIter<I> {
378 fn populate_captures(&mut self, str: &str) -> Result<(), ShellError> {
379 for captures in self.regex.captures_iter(str) {
380 self.captures
381 .push_back(captures_to_value(captures, &self.columns, self.span)?);
382 }
383 Ok(())
384 }
385}
386
387impl<I: Iterator<Item = Result<String, ShellError>>> Iterator for ParseIter<I> {
388 type Item = Value;
389
390 fn next(&mut self) -> Option<Value> {
391 loop {
392 if self.signals.interrupted() {
393 return None;
394 }
395
396 if let Some(val) = self.captures.pop_front() {
397 return Some(val);
398 }
399
400 let result = self
401 .iter
402 .next()?
403 .and_then(|str| self.populate_captures(&str));
404
405 if let Err(err) = result {
406 return Some(Value::error(err, self.span));
407 }
408 }
409 }
410}
411
412fn captures_to_value(
413 captures: Result<Captures, fancy_regex::Error>,
414 columns: &[String],
415 span: Span,
416) -> Result<Value, ShellError> {
417 let captures = captures.map_err(|err| {
418 ShellError::Generic(GenericError::new(
419 "Error with regular expression captures",
420 err.to_string(),
421 span,
422 ))
423 })?;
424
425 let record = columns
426 .iter()
427 .zip(captures.iter().skip(1))
428 .map(|(column, match_)| {
429 let match_value = match_
430 .map(|m| Value::string(m.as_str(), span))
431 .unwrap_or(Value::nothing(span));
432 (column.clone(), match_value)
433 })
434 .collect();
435
436 Ok(Value::record(record, span))
437}
438
#[cfg(test)]
mod test {
    use super::Parse;

    /// Runs every example declared by `Parse::examples` through the shared
    /// example test harness.
    #[test]
    fn test_examples() -> nu_test_support::Result {
        nu_test_support::test().examples(Parse)
    }
}