nu_command/strings/
detect_columns.rs

1use itertools::Itertools;
2use nu_engine::command_prelude::*;
3use nu_protocol::{Config, Range};
4use std::{io::Cursor, iter::Peekable, str::CharIndices, sync::Arc};
5
/// Character stream over a line of text: yields `(byte offset, char)` pairs
/// and supports one-item lookahead via `peek`. Consumed by `baseline`.
type Input<'t> = Peekable<CharIndices<'t>>;
7
/// The `detect columns` command: attempts to split plain text into a table
/// of columns. Stateless; all behavior lives in its `Command` impl.
#[derive(Clone)]
pub struct DetectColumns;
10
11impl Command for DetectColumns {
12    fn name(&self) -> &str {
13        "detect columns"
14    }
15
16    fn signature(&self) -> Signature {
17        Signature::build("detect columns")
18            .named(
19                "skip",
20                SyntaxShape::Int,
21                "number of rows to skip before detecting",
22                Some('s'),
23            )
24            .input_output_types(vec![(Type::String, Type::table())])
25            .switch("no-headers", "don't detect headers", Some('n'))
26            .named(
27                "combine-columns",
28                SyntaxShape::Range,
29                "columns to be combined; listed as a range",
30                Some('c'),
31            )
32            .switch(
33                "guess",
34                "detect columns by guessing width, it may be useful if default one doesn't work",
35                None,
36            )
37            .category(Category::Strings)
38    }
39
40    fn description(&self) -> &str {
41        "Attempt to automatically split text into multiple columns."
42    }
43
44    fn search_terms(&self) -> Vec<&str> {
45        vec!["split", "tabular"]
46    }
47
48    fn examples(&self) -> Vec<Example> {
49        vec![
50            Example {
51                description: "use --guess if you find default algorithm not working",
52                example: r"
53'Filesystem     1K-blocks      Used Available Use% Mounted on
54none             8150224         4   8150220   1% /mnt/c' | detect columns --guess",
55                result: Some(Value::test_list(vec![Value::test_record(record! {
56                    "Filesystem" => Value::test_string("none"),
57                    "1K-blocks" => Value::test_string("8150224"),
58                    "Used" => Value::test_string("4"),
59                    "Available" => Value::test_string("8150220"),
60                    "Use%" => Value::test_string("1%"),
61                    "Mounted on" => Value::test_string("/mnt/c")
62                })])),
63            },
64            Example {
65                description: "detect columns with no headers",
66                example: "'a b c' | detect columns  --no-headers",
67                result: Some(Value::test_list(vec![Value::test_record(record! {
68                        "column0" => Value::test_string("a"),
69                        "column1" => Value::test_string("b"),
70                        "column2" => Value::test_string("c"),
71                })])),
72            },
73            Example {
74                description: "",
75                example: "$'c1 c2 c3 c4 c5(char nl)a b c d e' | detect columns --combine-columns 0..1 ",
76                result: None,
77            },
78            Example {
79                description: "Splits a multi-line string into columns with headers detected",
80                example: "$'c1 c2 c3 c4 c5(char nl)a b c d e' | detect columns --combine-columns -2..-1 ",
81                result: None,
82            },
83            Example {
84                description: "Splits a multi-line string into columns with headers detected",
85                example: "$'c1 c2 c3 c4 c5(char nl)a b c d e' | detect columns --combine-columns 2.. ",
86                result: None,
87            },
88            Example {
89                description: "Parse external ls command and combine columns for datetime",
90                example: "^ls -lh | detect columns --no-headers --skip 1 --combine-columns 5..7",
91                result: None,
92            },
93        ]
94    }
95
96    fn is_const(&self) -> bool {
97        true
98    }
99
100    fn run(
101        &self,
102        engine_state: &EngineState,
103        stack: &mut Stack,
104        call: &Call,
105        input: PipelineData,
106    ) -> Result<PipelineData, ShellError> {
107        let num_rows_to_skip: Option<usize> = call.get_flag(engine_state, stack, "skip")?;
108        let noheader = call.has_flag(engine_state, stack, "no-headers")?;
109        let range: Option<Range> = call.get_flag(engine_state, stack, "combine-columns")?;
110        let config = stack.get_config(engine_state);
111
112        let args = Arguments {
113            noheader,
114            num_rows_to_skip,
115            range,
116            config,
117        };
118
119        if call.has_flag(engine_state, stack, "guess")? {
120            guess_width(engine_state, call, input, args)
121        } else {
122            detect_columns(engine_state, call, input, args)
123        }
124    }
125
126    fn run_const(
127        &self,
128        working_set: &StateWorkingSet,
129        call: &Call,
130        input: PipelineData,
131    ) -> Result<PipelineData, ShellError> {
132        let num_rows_to_skip: Option<usize> = call.get_flag_const(working_set, "skip")?;
133        let noheader = call.has_flag_const(working_set, "no-headers")?;
134        let range: Option<Range> = call.get_flag_const(working_set, "combine-columns")?;
135        let config = working_set.get_config().clone();
136
137        let args = Arguments {
138            noheader,
139            num_rows_to_skip,
140            range,
141            config,
142        };
143
144        if call.has_flag_const(working_set, "guess")? {
145            guess_width(working_set.permanent(), call, input, args)
146        } else {
147            detect_columns(working_set.permanent(), call, input, args)
148        }
149    }
150}
151
/// Flag values gathered by `run` / `run_const` and handed to the detection
/// routines.
struct Arguments {
    // Value of `--skip`: number of leading input lines to drop, if given.
    num_rows_to_skip: Option<usize>,
    // `--no-headers`: treat the first line as data and synthesize
    // `column{n}` names instead.
    noheader: bool,
    // Value of `--combine-columns`: range of column indexes to merge, if given.
    range: Option<Range>,
    // Configuration used when collecting the pipeline input into a string.
    config: Arc<Config>,
}
158
159fn guess_width(
160    engine_state: &EngineState,
161    call: &Call,
162    input: PipelineData,
163    args: Arguments,
164) -> Result<PipelineData, ShellError> {
165    use super::guess_width::GuessWidth;
166    let input_span = input.span().unwrap_or(call.head);
167
168    let mut input = input.collect_string("", &args.config)?;
169    if let Some(rows) = args.num_rows_to_skip {
170        input = input.lines().skip(rows).map(|x| x.to_string()).join("\n");
171    }
172
173    let mut guess_width = GuessWidth::new_reader(Box::new(Cursor::new(input)));
174
175    let result = guess_width.read_all();
176
177    if result.is_empty() {
178        return Ok(Value::nothing(input_span).into_pipeline_data());
179    }
180    if !args.noheader {
181        let columns = result[0].clone();
182        Ok(result
183            .into_iter()
184            .skip(1)
185            .map(move |s| {
186                let mut values: Vec<Value> = s
187                    .into_iter()
188                    .map(|v| Value::string(v, input_span))
189                    .collect();
190                // some rows may has less columns, fill it with ""
191                for _ in values.len()..columns.len() {
192                    values.push(Value::string("", input_span));
193                }
194                let record =
195                    Record::from_raw_cols_vals(columns.clone(), values, input_span, input_span);
196                match record {
197                    Ok(r) => match &args.range {
198                        Some(range) => merge_record(r, range, input_span),
199                        None => Value::record(r, input_span),
200                    },
201                    Err(e) => Value::error(e, input_span),
202                }
203            })
204            .into_pipeline_data(input_span, engine_state.signals().clone()))
205    } else {
206        let length = result[0].len();
207        let columns: Vec<String> = (0..length).map(|n| format!("column{n}")).collect();
208        Ok(result
209            .into_iter()
210            .map(move |s| {
211                let mut values: Vec<Value> = s
212                    .into_iter()
213                    .map(|v| Value::string(v, input_span))
214                    .collect();
215                // some rows may has less columns, fill it with ""
216                for _ in values.len()..columns.len() {
217                    values.push(Value::string("", input_span));
218                }
219                let record =
220                    Record::from_raw_cols_vals(columns.clone(), values, input_span, input_span);
221                match record {
222                    Ok(r) => match &args.range {
223                        Some(range) => merge_record(r, range, input_span),
224                        None => Value::record(r, input_span),
225                    },
226                    Err(e) => Value::error(e, input_span),
227                }
228            })
229            .into_pipeline_data(input_span, engine_state.signals().clone()))
230    }
231}
232
233fn detect_columns(
234    engine_state: &EngineState,
235    call: &Call,
236    input: PipelineData,
237    args: Arguments,
238) -> Result<PipelineData, ShellError> {
239    let name_span = call.head;
240    let input_span = input.span().unwrap_or(Span::unknown());
241    let input = input.collect_string("", &args.config)?;
242
243    let input: Vec<_> = input
244        .lines()
245        .skip(args.num_rows_to_skip.unwrap_or_default())
246        .map(|x| x.to_string())
247        .collect();
248
249    let mut input = input.into_iter();
250    let headers = input.next();
251
252    if let Some(orig_headers) = headers {
253        let mut headers = find_columns(&orig_headers);
254
255        if args.noheader {
256            for header in headers.iter_mut().enumerate() {
257                header.1.item = format!("column{}", header.0);
258            }
259        }
260
261        Ok(args
262            .noheader
263            .then_some(orig_headers)
264            .into_iter()
265            .chain(input)
266            .map(move |x| {
267                let row = find_columns(&x);
268
269                let mut record = Record::new();
270
271                if headers.len() == row.len() {
272                    for (header, val) in headers.iter().zip(row.iter()) {
273                        record.push(&header.item, Value::string(&val.item, name_span));
274                    }
275                } else {
276                    let mut pre_output = vec![];
277
278                    // column counts don't line up, so see if we can figure out why
279                    for cell in row {
280                        for header in &headers {
281                            if cell.span.start <= header.span.end
282                                && cell.span.end > header.span.start
283                            {
284                                pre_output.push((
285                                    header.item.to_string(),
286                                    Value::string(&cell.item, name_span),
287                                ));
288                            }
289                        }
290                    }
291
292                    for header in &headers {
293                        let mut found = false;
294                        for pre_o in &pre_output {
295                            if pre_o.0 == header.item {
296                                found = true;
297                                break;
298                            }
299                        }
300
301                        if !found {
302                            pre_output.push((header.item.to_string(), Value::nothing(name_span)));
303                        }
304                    }
305
306                    for header in &headers {
307                        for pre_o in &pre_output {
308                            if pre_o.0 == header.item {
309                                record.push(&header.item, pre_o.1.clone());
310                            }
311                        }
312                    }
313                }
314
315                let has_column_duplicates = record.columns().duplicates().count() > 0;
316                if has_column_duplicates {
317                    return Err(ShellError::ColumnDetectionFailure {
318                        bad_value: input_span,
319                        failure_site: name_span,
320                    });
321                }
322
323                Ok(match &args.range {
324                    Some(range) => merge_record(record, range, name_span),
325                    None => Value::record(record, name_span),
326                })
327            })
328            .collect::<Result<Vec<_>, _>>()?
329            .into_pipeline_data(call.head, engine_state.signals().clone()))
330    } else {
331        Ok(PipelineData::empty())
332    }
333}
334
335pub fn find_columns(input: &str) -> Vec<Spanned<String>> {
336    let mut chars = input.char_indices().peekable();
337    let mut output = vec![];
338
339    while let Some((_, c)) = chars.peek() {
340        if c.is_whitespace() {
341            // If the next character is non-newline whitespace, skip it.
342
343            let _ = chars.next();
344        } else {
345            // Otherwise, try to consume an unclassified token.
346
347            let result = baseline(&mut chars);
348
349            output.push(result);
350        }
351    }
352
353    output
354}
355
/// Kind of paired delimiter that is currently open while `baseline` scans a
/// token.
#[derive(Clone, Copy)]
enum BlockKind {
    // Opened by `(`, closed by `)`.
    Parenthesis,
    // Opened by `{`, closed by `}`.
    Brace,
    // Opened by `[`, closed by `]`.
    Bracket,
}
362
363fn baseline(src: &mut Input) -> Spanned<String> {
364    let mut token_contents = String::new();
365
366    let start_offset = if let Some((pos, _)) = src.peek() {
367        *pos
368    } else {
369        0
370    };
371
372    // This variable tracks the starting character of a string literal, so that
373    // we remain inside the string literal lexer mode until we encounter the
374    // closing quote.
375    let mut quote_start: Option<char> = None;
376
377    // This Vec tracks paired delimiters
378    let mut block_level: Vec<BlockKind> = vec![];
379
380    // A baseline token is terminated if it's not nested inside of a paired
381    // delimiter and the next character is one of: `|`, `;`, `#` or any
382    // whitespace.
383    fn is_termination(block_level: &[BlockKind], c: char) -> bool {
384        block_level.is_empty() && (c.is_whitespace())
385    }
386
387    // The process of slurping up a baseline token repeats:
388    //
389    // - String literal, which begins with `'`, `"` or `\``, and continues until
390    //   the same character is encountered again.
391    // - Delimiter pair, which begins with `[`, `(`, or `{`, and continues until
392    //   the matching closing delimiter is found, skipping comments and string
393    //   literals.
394    // - When not nested inside of a delimiter pair, when a terminating
395    //   character (whitespace, `|`, `;` or `#`) is encountered, the baseline
396    //   token is done.
397    // - Otherwise, accumulate the character into the current baseline token.
398    while let Some((_, c)) = src.peek() {
399        let c = *c;
400
401        if quote_start.is_some() {
402            // If we encountered the closing quote character for the current
403            // string, we're done with the current string.
404            if Some(c) == quote_start {
405                quote_start = None;
406            }
407        } else if c == '\n' {
408            if is_termination(&block_level, c) {
409                break;
410            }
411        } else if c == '\'' || c == '"' || c == '`' {
412            // We encountered the opening quote of a string literal.
413            quote_start = Some(c);
414        } else if c == '[' {
415            // We encountered an opening `[` delimiter.
416            block_level.push(BlockKind::Bracket);
417        } else if c == ']' {
418            // We encountered a closing `]` delimiter. Pop off the opening `[`
419            // delimiter.
420            if let Some(BlockKind::Bracket) = block_level.last() {
421                let _ = block_level.pop();
422            }
423        } else if c == '{' {
424            // We encountered an opening `{` delimiter.
425            block_level.push(BlockKind::Brace);
426        } else if c == '}' {
427            // We encountered a closing `}` delimiter. Pop off the opening `{`.
428            if let Some(BlockKind::Brace) = block_level.last() {
429                let _ = block_level.pop();
430            }
431        } else if c == '(' {
432            // We enceountered an opening `(` delimiter.
433            block_level.push(BlockKind::Parenthesis);
434        } else if c == ')' {
435            // We encountered a closing `)` delimiter. Pop off the opening `(`.
436            if let Some(BlockKind::Parenthesis) = block_level.last() {
437                let _ = block_level.pop();
438            }
439        } else if is_termination(&block_level, c) {
440            break;
441        }
442
443        // Otherwise, accumulate the character into the current token.
444        token_contents.push(c);
445
446        // Consume the character.
447        let _ = src.next();
448    }
449
450    let span = Span::new(start_offset, start_offset + token_contents.len());
451
452    // If there is still unclosed opening delimiters, close them and add
453    // synthetic closing characters to the accumulated token.
454    if block_level.last().is_some() {
455        // let delim: char = (*block).closing();
456        // let cause = ParseError::unexpected_eof(delim.to_string(), span);
457
458        // while let Some(bk) = block_level.pop() {
459        //     token_contents.push(bk.closing());
460        // }
461
462        return Spanned {
463            item: token_contents,
464            span,
465        };
466    }
467
468    if quote_start.is_some() {
469        // The non-lite parse trims quotes on both sides, so we add the expected quote so that
470        // anyone wanting to consume this partial parse (e.g., completions) will be able to get
471        // correct information from the non-lite parse.
472        // token_contents.push(delimiter);
473
474        // return (
475        //     token_contents.spanned(span),
476        //     Some(ParseError::unexpected_eof(delimiter.to_string(), span)),
477        // );
478        return Spanned {
479            item: token_contents,
480            span,
481        };
482    }
483
484    Spanned {
485        item: token_contents,
486        span,
487    }
488}
489
490fn merge_record(record: Record, range: &Range, input_span: Span) -> Value {
491    let (start_index, end_index) = match process_range(range, record.len(), input_span) {
492        Ok(Some((l_idx, r_idx))) => (l_idx, r_idx),
493        Ok(None) => return Value::record(record, input_span),
494        Err(e) => return Value::error(e, input_span),
495    };
496
497    match merge_record_impl(record, start_index, end_index, input_span) {
498        Ok(rec) => Value::record(rec, input_span),
499        Err(err) => Value::error(err, input_span),
500    }
501}
502
503fn process_range(
504    range: &Range,
505    length: usize,
506    input_span: Span,
507) -> Result<Option<(usize, usize)>, ShellError> {
508    match nu_cmd_base::util::process_range(range) {
509        Ok((l_idx, r_idx)) => {
510            let l_idx = if l_idx < 0 {
511                length as isize + l_idx
512            } else {
513                l_idx
514            };
515
516            let r_idx = if r_idx < 0 {
517                length as isize + r_idx
518            } else {
519                r_idx
520            };
521
522            if !(l_idx <= r_idx && (r_idx >= 0 || l_idx < (length as isize))) {
523                return Ok(None);
524            }
525
526            Ok(Some((
527                l_idx.max(0) as usize,
528                (r_idx as usize + 1).min(length),
529            )))
530        }
531        Err(processing_error) => Err(processing_error("could not find range index", input_span)),
532    }
533}
534
535fn merge_record_impl(
536    record: Record,
537    start_index: usize,
538    end_index: usize,
539    input_span: Span,
540) -> Result<Record, ShellError> {
541    let (mut cols, mut vals): (Vec<_>, Vec<_>) = record.into_iter().unzip();
542    // Merge Columns
543    ((start_index + 1)..(cols.len() - end_index + start_index + 1)).for_each(|idx| {
544        cols.swap(idx, end_index - start_index - 1 + idx);
545    });
546    cols.truncate(cols.len() - end_index + start_index + 1);
547
548    // Merge Values
549    let combined = vals
550        .iter()
551        .take(end_index)
552        .skip(start_index)
553        .map(|v| v.coerce_str().unwrap_or_default())
554        .join(" ");
555    let binding = Value::string(combined, Span::unknown());
556    let last_seg = vals.split_off(end_index);
557    vals.truncate(start_index);
558    vals.push(binding);
559    vals.extend(last_seg);
560
561    Record::from_raw_cols_vals(cols, vals, Span::unknown(), input_span)
562}
563
#[cfg(test)]
mod test {
    use super::*;

    // Hands the command to the crate-level `test_examples` helper, which
    // presumably checks the examples declared in `examples()` — confirm
    // against that helper's definition.
    #[test]
    fn test_examples() {
        crate::test_examples(DetectColumns)
    }
}