1use fancy_regex::{Captures, Regex, RegexBuilder};
2use nu_engine::command_prelude::*;
3use nu_protocol::{ListStream, Signals, engine::StateWorkingSet};
4use std::collections::VecDeque;
5
/// The `parse` command: a stateless unit type whose behavior lives entirely in
/// its `Command` implementation.
#[derive(Clone)]
pub struct Parse;
8
9impl Command for Parse {
10 fn name(&self) -> &str {
11 "parse"
12 }
13
14 fn description(&self) -> &str {
15 "Parse columns from string data using a simple pattern or a supplied regular expression."
16 }
17
18 fn search_terms(&self) -> Vec<&str> {
19 vec!["pattern", "match", "regex", "str extract"]
20 }
21
22 fn extra_description(&self) -> &str {
23 "The parse command always uses regular expressions even when you use a simple pattern. If a simple pattern is supplied, parse will transform that pattern into a regular expression."
24 }
25
26 fn signature(&self) -> nu_protocol::Signature {
27 Signature::build("parse")
28 .required("pattern", SyntaxShape::String, "The pattern to match.")
29 .input_output_types(vec![
30 (Type::String, Type::table()),
31 (Type::List(Box::new(Type::Any)), Type::table()),
32 ])
33 .switch("regex", "use full regex syntax for patterns", Some('r'))
34 .named(
35 "backtrack",
36 SyntaxShape::Int,
37 "set the max backtrack limit for regex",
38 Some('b'),
39 )
40 .allow_variants_without_examples(true)
41 .category(Category::Strings)
42 }
43
44 fn examples(&self) -> Vec<Example> {
45 vec![
46 Example {
47 description: "Parse a string into two named columns",
48 example: "\"hi there\" | parse \"{foo} {bar}\"",
49 result: Some(Value::test_list(vec![Value::test_record(record! {
50 "foo" => Value::test_string("hi"),
51 "bar" => Value::test_string("there"),
52 })])),
53 },
54 Example {
55 description: "Parse a string, ignoring a column with _",
56 example: "\"hello world\" | parse \"{foo} {_}\"",
57 result: Some(Value::test_list(vec![Value::test_record(record! {
58 "foo" => Value::test_string("hello"),
59 })])),
60 },
61 Example {
62 description: "This is how the first example is interpreted in the source code",
63 example: "\"hi there\" | parse --regex '(?s)\\A(?P<foo>.*?) (?P<bar>.*?)\\z'",
64 result: Some(Value::test_list(vec![Value::test_record(record! {
65 "foo" => Value::test_string("hi"),
66 "bar" => Value::test_string("there"),
67 })])),
68 },
69 Example {
70 description: "Parse a string using fancy-regex named capture group pattern",
71 example: "\"foo bar.\" | parse --regex '\\s*(?<name>\\w+)(?=\\.)'",
72 result: Some(Value::test_list(vec![Value::test_record(record! {
73 "name" => Value::test_string("bar"),
74 })])),
75 },
76 Example {
77 description: "Parse a string using fancy-regex capture group pattern",
78 example: "\"foo! bar.\" | parse --regex '(\\w+)(?=\\.)|(\\w+)(?=!)'",
79 result: Some(Value::test_list(vec![
80 Value::test_record(record! {
81 "capture0" => Value::test_nothing(),
82 "capture1" => Value::test_string("foo"),
83 }),
84 Value::test_record(record! {
85 "capture0" => Value::test_string("bar"),
86 "capture1" => Value::test_nothing(),
87 }),
88 ])),
89 },
90 Example {
91 description: "Parse a string using fancy-regex look behind pattern",
92 example: "\" @another(foo bar) \" | parse --regex '\\s*(?<=[() ])(@\\w+)(\\([^)]*\\))?\\s*'",
93 result: Some(Value::test_list(vec![Value::test_record(record! {
94 "capture0" => Value::test_string("@another"),
95 "capture1" => Value::test_string("(foo bar)"),
96 })])),
97 },
98 Example {
99 description: "Parse a string using fancy-regex look ahead atomic group pattern",
100 example: "\"abcd\" | parse --regex '^a(bc(?=d)|b)cd$'",
101 result: Some(Value::test_list(vec![Value::test_record(record! {
102 "capture0" => Value::test_string("b"),
103 })])),
104 },
105 Example {
106 description: "Parse a string with a manually set fancy-regex backtrack limit",
107 example: "\"hi there\" | parse --backtrack 1500000 \"{foo} {bar}\"",
108 result: Some(Value::test_list(vec![Value::test_record(record! {
109 "foo" => Value::test_string("hi"),
110 "bar" => Value::test_string("there"),
111 })])),
112 },
113 ]
114 }
115
116 fn is_const(&self) -> bool {
117 true
118 }
119
120 fn run(
121 &self,
122 engine_state: &EngineState,
123 stack: &mut Stack,
124 call: &Call,
125 input: PipelineData,
126 ) -> Result<PipelineData, ShellError> {
127 let pattern: Spanned<String> = call.req(engine_state, stack, 0)?;
128 let regex: bool = call.has_flag(engine_state, stack, "regex")?;
129 let backtrack_limit: usize = call
130 .get_flag(engine_state, stack, "backtrack")?
131 .unwrap_or(1_000_000); operate(engine_state, pattern, regex, backtrack_limit, call, input)
133 }
134
135 fn run_const(
136 &self,
137 working_set: &StateWorkingSet,
138 call: &Call,
139 input: PipelineData,
140 ) -> Result<PipelineData, ShellError> {
141 let pattern: Spanned<String> = call.req_const(working_set, 0)?;
142 let regex: bool = call.has_flag_const(working_set, "regex")?;
143 let backtrack_limit: usize = call
144 .get_flag_const(working_set, "backtrack")?
145 .unwrap_or(1_000_000);
146 operate(
147 working_set.permanent(),
148 pattern,
149 regex,
150 backtrack_limit,
151 call,
152 input,
153 )
154 }
155}
156
157fn operate(
158 engine_state: &EngineState,
159 pattern: Spanned<String>,
160 regex: bool,
161 backtrack_limit: usize,
162 call: &Call,
163 input: PipelineData,
164) -> Result<PipelineData, ShellError> {
165 let head = call.head;
166
167 let pattern_item = pattern.item;
168 let pattern_span = pattern.span;
169
170 let item_to_parse = if regex {
171 pattern_item
172 } else {
173 build_regex(&pattern_item, pattern_span)?
174 };
175
176 let regex = RegexBuilder::new(&item_to_parse)
177 .backtrack_limit(backtrack_limit)
178 .build()
179 .map_err(|e| ShellError::GenericError {
180 error: "Error with regular expression".into(),
181 msg: e.to_string(),
182 span: Some(pattern_span),
183 help: None,
184 inner: vec![],
185 })?;
186
187 let columns = regex
188 .capture_names()
189 .skip(1)
190 .enumerate()
191 .map(|(i, name)| {
192 name.map(String::from)
193 .unwrap_or_else(|| format!("capture{i}"))
194 })
195 .collect::<Vec<_>>();
196
197 match input {
198 PipelineData::Empty => Ok(PipelineData::Empty),
199 PipelineData::Value(value, ..) => match value {
200 Value::String { val, .. } => {
201 let captures = regex
202 .captures_iter(&val)
203 .map(|captures| captures_to_value(captures, &columns, head))
204 .collect::<Result<_, _>>()?;
205
206 Ok(Value::list(captures, head).into_pipeline_data())
207 }
208 Value::List { vals, .. } => {
209 let iter = vals.into_iter().map(move |val| {
210 let span = val.span();
211 let type_ = val.get_type();
212 val.into_string()
213 .map_err(|_| ShellError::OnlySupportsThisInputType {
214 exp_input_type: "string".into(),
215 wrong_type: type_.to_string(),
216 dst_span: head,
217 src_span: span,
218 })
219 });
220
221 let iter = ParseIter {
222 captures: VecDeque::new(),
223 regex,
224 columns,
225 iter,
226 span: head,
227 signals: engine_state.signals().clone(),
228 };
229
230 Ok(ListStream::new(iter, head, Signals::empty()).into())
231 }
232 value => Err(ShellError::OnlySupportsThisInputType {
233 exp_input_type: "string".into(),
234 wrong_type: value.get_type().to_string(),
235 dst_span: head,
236 src_span: value.span(),
237 }),
238 },
239 PipelineData::ListStream(stream, ..) => Ok(stream
240 .modify(|stream| {
241 let iter = stream.map(move |val| {
242 let span = val.span();
243 val.into_string().map_err(|_| ShellError::PipelineMismatch {
244 exp_input_type: "string".into(),
245 dst_span: head,
246 src_span: span,
247 })
248 });
249
250 ParseIter {
251 captures: VecDeque::new(),
252 regex,
253 columns,
254 iter,
255 span: head,
256 signals: engine_state.signals().clone(),
257 }
258 })
259 .into()),
260 PipelineData::ByteStream(stream, ..) => {
261 if let Some(lines) = stream.lines() {
262 let iter = ParseIter {
263 captures: VecDeque::new(),
264 regex,
265 columns,
266 iter: lines,
267 span: head,
268 signals: engine_state.signals().clone(),
269 };
270
271 Ok(ListStream::new(iter, head, Signals::empty()).into())
272 } else {
273 Ok(PipelineData::Empty)
274 }
275 }
276 }
277}
278
279fn build_regex(input: &str, span: Span) -> Result<String, ShellError> {
280 let mut output = "(?s)\\A".to_string();
281
282 let mut loop_input = input.chars().peekable();
283 loop {
284 let mut before = String::new();
285 while let Some(c) = loop_input.next() {
286 if c == '{' {
287 if loop_input.peek() == Some(&'{') {
289 let _ = loop_input.next();
290 } else {
291 break;
292 }
293 }
294 before.push(c);
295 }
296
297 if !before.is_empty() {
298 output.push_str(&fancy_regex::escape(&before));
299 }
300
301 let mut column = String::new();
303 while let Some(c) = loop_input.next() {
304 if c == '}' {
305 break;
306 }
307 column.push(c);
308
309 if loop_input.peek().is_none() {
310 return Err(ShellError::DelimiterError {
311 msg: "Found opening `{` without an associated closing `}`".to_owned(),
312 span,
313 });
314 }
315 }
316
317 if !column.is_empty() {
318 output.push_str("(?");
319 if column == "_" {
320 output.push(':');
322 } else {
323 output.push_str("P<");
325 output.push_str(&column);
326 output.push('>');
327 }
328 output.push_str(".*?)");
329 }
330
331 if before.is_empty() && column.is_empty() {
332 break;
333 }
334 }
335
336 output.push_str("\\z");
337 Ok(output)
338}
339
340struct ParseIter<I: Iterator<Item = Result<String, ShellError>>> {
341 captures: VecDeque<Value>,
342 regex: Regex,
343 columns: Vec<String>,
344 iter: I,
345 span: Span,
346 signals: Signals,
347}
348
349impl<I: Iterator<Item = Result<String, ShellError>>> ParseIter<I> {
350 fn populate_captures(&mut self, str: &str) -> Result<(), ShellError> {
351 for captures in self.regex.captures_iter(str) {
352 self.captures
353 .push_back(captures_to_value(captures, &self.columns, self.span)?);
354 }
355 Ok(())
356 }
357}
358
359impl<I: Iterator<Item = Result<String, ShellError>>> Iterator for ParseIter<I> {
360 type Item = Value;
361
362 fn next(&mut self) -> Option<Value> {
363 loop {
364 if self.signals.interrupted() {
365 return None;
366 }
367
368 if let Some(val) = self.captures.pop_front() {
369 return Some(val);
370 }
371
372 let result = self
373 .iter
374 .next()?
375 .and_then(|str| self.populate_captures(&str));
376
377 if let Err(err) = result {
378 return Some(Value::error(err, self.span));
379 }
380 }
381 }
382}
383
384fn captures_to_value(
385 captures: Result<Captures, fancy_regex::Error>,
386 columns: &[String],
387 span: Span,
388) -> Result<Value, ShellError> {
389 let captures = captures.map_err(|err| ShellError::GenericError {
390 error: "Error with regular expression captures".into(),
391 msg: err.to_string(),
392 span: Some(span),
393 help: None,
394 inner: vec![],
395 })?;
396
397 let record = columns
398 .iter()
399 .zip(captures.iter().skip(1))
400 .map(|(column, match_)| {
401 let match_value = match_
402 .map(|m| Value::string(m.as_str(), span))
403 .unwrap_or(Value::nothing(span));
404 (column.clone(), match_value)
405 })
406 .collect();
407
408 Ok(Value::record(record, span))
409}
410
#[cfg(test)]
mod test {
    use super::*;

    /// Hands the `Parse` command to the crate-level `test_examples` helper.
    // NOTE(review): presumably this runs each entry of `Parse::examples` and
    // compares it with its declared result; confirm against the helper.
    #[test]
    fn test_examples() {
        crate::test_examples(Parse)
    }
}