nu_command/strings/
detect_columns.rs

1use itertools::Itertools;
2use nu_engine::command_prelude::*;
3use nu_protocol::{Config, Range};
4use std::{io::Cursor, iter::Peekable, str::CharIndices, sync::Arc};
5
6type Input<'t> = Peekable<CharIndices<'t>>;
7
8#[derive(Clone)]
9pub struct DetectColumns;
10
11impl Command for DetectColumns {
12    fn name(&self) -> &str {
13        "detect columns"
14    }
15
16    fn signature(&self) -> Signature {
17        Signature::build("detect columns")
18            .named(
19                "skip",
20                SyntaxShape::Int,
21                "number of rows to skip before detecting",
22                Some('s'),
23            )
24            .input_output_types(vec![(Type::String, Type::table())])
25            .switch("no-headers", "don't detect headers", Some('n'))
26            .named(
27                "combine-columns",
28                SyntaxShape::Range,
29                "columns to be combined; listed as a range",
30                Some('c'),
31            )
32            .switch(
33                "guess",
34                "detect columns by guessing width, it may be useful if default one doesn't work",
35                None,
36            )
37            .category(Category::Strings)
38    }
39
40    fn description(&self) -> &str {
41        "Attempt to automatically split text into multiple columns."
42    }
43
44    fn search_terms(&self) -> Vec<&str> {
45        vec!["split", "tabular"]
46    }
47
48    fn examples(&self) -> Vec<Example> {
49        vec![
50            Example {
51                description: "use --guess if you find default algorithm not working",
52                example: r"
53'Filesystem     1K-blocks      Used Available Use% Mounted on
54none             8150224         4   8150220   1% /mnt/c' | detect columns --guess",
55                result: Some(Value::test_list(vec![Value::test_record(record! {
56                    "Filesystem" => Value::test_string("none"),
57                    "1K-blocks" => Value::test_string("8150224"),
58                    "Used" => Value::test_string("4"),
59                    "Available" => Value::test_string("8150220"),
60                    "Use%" => Value::test_string("1%"),
61                    "Mounted on" => Value::test_string("/mnt/c")
62                })])),
63            },
64            Example {
65                description: "detect columns with no headers",
66                example: "'a b c' | detect columns  --no-headers",
67                result: Some(Value::test_list(vec![Value::test_record(record! {
68                        "column0" => Value::test_string("a"),
69                        "column1" => Value::test_string("b"),
70                        "column2" => Value::test_string("c"),
71                })])),
72            },
73            Example {
74                description: "",
75                example:
76                    "$'c1 c2 c3 c4 c5(char nl)a b c d e' | detect columns --combine-columns 0..1 ",
77                result: None,
78            },
79            Example {
80                description: "Splits a multi-line string into columns with headers detected",
81                example:
82                    "$'c1 c2 c3 c4 c5(char nl)a b c d e' | detect columns --combine-columns -2..-1 ",
83                result: None,
84            },
85            Example {
86                description: "Splits a multi-line string into columns with headers detected",
87                example:
88                    "$'c1 c2 c3 c4 c5(char nl)a b c d e' | detect columns --combine-columns 2.. ",
89                result: None,
90            },
91            Example {
92                description: "Parse external ls command and combine columns for datetime",
93                example: "^ls -lh | detect columns --no-headers --skip 1 --combine-columns 5..7",
94                result: None,
95            },
96        ]
97    }
98
99    fn is_const(&self) -> bool {
100        true
101    }
102
103    fn run(
104        &self,
105        engine_state: &EngineState,
106        stack: &mut Stack,
107        call: &Call,
108        input: PipelineData,
109    ) -> Result<PipelineData, ShellError> {
110        let num_rows_to_skip: Option<usize> = call.get_flag(engine_state, stack, "skip")?;
111        let noheader = call.has_flag(engine_state, stack, "no-headers")?;
112        let range: Option<Range> = call.get_flag(engine_state, stack, "combine-columns")?;
113        let config = stack.get_config(engine_state);
114
115        let args = Arguments {
116            noheader,
117            num_rows_to_skip,
118            range,
119            config,
120        };
121
122        if call.has_flag(engine_state, stack, "guess")? {
123            guess_width(engine_state, call, input, args)
124        } else {
125            detect_columns(engine_state, call, input, args)
126        }
127    }
128
129    fn run_const(
130        &self,
131        working_set: &StateWorkingSet,
132        call: &Call,
133        input: PipelineData,
134    ) -> Result<PipelineData, ShellError> {
135        let num_rows_to_skip: Option<usize> = call.get_flag_const(working_set, "skip")?;
136        let noheader = call.has_flag_const(working_set, "no-headers")?;
137        let range: Option<Range> = call.get_flag_const(working_set, "combine-columns")?;
138        let config = working_set.get_config().clone();
139
140        let args = Arguments {
141            noheader,
142            num_rows_to_skip,
143            range,
144            config,
145        };
146
147        if call.has_flag_const(working_set, "guess")? {
148            guess_width(working_set.permanent(), call, input, args)
149        } else {
150            detect_columns(working_set.permanent(), call, input, args)
151        }
152    }
153}
154
/// Flag values collected by `run` / `run_const` and handed to the detection helpers.
155struct Arguments {
    /// Value of `--skip`: how many leading input lines are dropped before detection.
156    num_rows_to_skip: Option<usize>,
    /// Set by `--no-headers`: the first line is data and columns are named `columnN`.
157    noheader: bool,
    /// Value of `--combine-columns`: range of columns to merge into a single column.
158    range: Option<Range>,
    /// Configuration passed to `collect_string` when the input is gathered into text.
159    config: Arc<Config>,
160}
161
162fn guess_width(
163    engine_state: &EngineState,
164    call: &Call,
165    input: PipelineData,
166    args: Arguments,
167) -> Result<PipelineData, ShellError> {
168    use super::guess_width::GuessWidth;
169    let input_span = input.span().unwrap_or(call.head);
170
171    let mut input = input.collect_string("", &args.config)?;
172    if let Some(rows) = args.num_rows_to_skip {
173        input = input.lines().skip(rows).map(|x| x.to_string()).join("\n");
174    }
175
176    let mut guess_width = GuessWidth::new_reader(Box::new(Cursor::new(input)));
177
178    let result = guess_width.read_all();
179
180    if result.is_empty() {
181        return Ok(Value::nothing(input_span).into_pipeline_data());
182    }
183    if !args.noheader {
184        let columns = result[0].clone();
185        Ok(result
186            .into_iter()
187            .skip(1)
188            .map(move |s| {
189                let mut values: Vec<Value> = s
190                    .into_iter()
191                    .map(|v| Value::string(v, input_span))
192                    .collect();
193                // some rows may has less columns, fill it with ""
194                for _ in values.len()..columns.len() {
195                    values.push(Value::string("", input_span));
196                }
197                let record =
198                    Record::from_raw_cols_vals(columns.clone(), values, input_span, input_span);
199                match record {
200                    Ok(r) => match &args.range {
201                        Some(range) => merge_record(r, range, input_span),
202                        None => Value::record(r, input_span),
203                    },
204                    Err(e) => Value::error(e, input_span),
205                }
206            })
207            .into_pipeline_data(input_span, engine_state.signals().clone()))
208    } else {
209        let length = result[0].len();
210        let columns: Vec<String> = (0..length).map(|n| format!("column{n}")).collect();
211        Ok(result
212            .into_iter()
213            .map(move |s| {
214                let mut values: Vec<Value> = s
215                    .into_iter()
216                    .map(|v| Value::string(v, input_span))
217                    .collect();
218                // some rows may has less columns, fill it with ""
219                for _ in values.len()..columns.len() {
220                    values.push(Value::string("", input_span));
221                }
222                let record =
223                    Record::from_raw_cols_vals(columns.clone(), values, input_span, input_span);
224                match record {
225                    Ok(r) => match &args.range {
226                        Some(range) => merge_record(r, range, input_span),
227                        None => Value::record(r, input_span),
228                    },
229                    Err(e) => Value::error(e, input_span),
230                }
231            })
232            .into_pipeline_data(input_span, engine_state.signals().clone()))
233    }
234}
235
236fn detect_columns(
237    engine_state: &EngineState,
238    call: &Call,
239    input: PipelineData,
240    args: Arguments,
241) -> Result<PipelineData, ShellError> {
242    let name_span = call.head;
243    let input = input.collect_string("", &args.config)?;
244
245    let input: Vec<_> = input
246        .lines()
247        .skip(args.num_rows_to_skip.unwrap_or_default())
248        .map(|x| x.to_string())
249        .collect();
250
251    let mut input = input.into_iter();
252    let headers = input.next();
253
254    if let Some(orig_headers) = headers {
255        let mut headers = find_columns(&orig_headers);
256
257        if args.noheader {
258            for header in headers.iter_mut().enumerate() {
259                header.1.item = format!("column{}", header.0);
260            }
261        }
262
263        Ok(args
264            .noheader
265            .then_some(orig_headers)
266            .into_iter()
267            .chain(input)
268            .map(move |x| {
269                let row = find_columns(&x);
270
271                let mut record = Record::new();
272
273                if headers.len() == row.len() {
274                    for (header, val) in headers.iter().zip(row.iter()) {
275                        record.push(&header.item, Value::string(&val.item, name_span));
276                    }
277                } else {
278                    let mut pre_output = vec![];
279
280                    // column counts don't line up, so see if we can figure out why
281                    for cell in row {
282                        for header in &headers {
283                            if cell.span.start <= header.span.end
284                                && cell.span.end > header.span.start
285                            {
286                                pre_output.push((
287                                    header.item.to_string(),
288                                    Value::string(&cell.item, name_span),
289                                ));
290                            }
291                        }
292                    }
293
294                    for header in &headers {
295                        let mut found = false;
296                        for pre_o in &pre_output {
297                            if pre_o.0 == header.item {
298                                found = true;
299                                break;
300                            }
301                        }
302
303                        if !found {
304                            pre_output.push((header.item.to_string(), Value::nothing(name_span)));
305                        }
306                    }
307
308                    for header in &headers {
309                        for pre_o in &pre_output {
310                            if pre_o.0 == header.item {
311                                record.push(&header.item, pre_o.1.clone());
312                            }
313                        }
314                    }
315                }
316
317                match &args.range {
318                    Some(range) => merge_record(record, range, name_span),
319                    None => Value::record(record, name_span),
320                }
321            })
322            .into_pipeline_data(call.head, engine_state.signals().clone()))
323    } else {
324        Ok(PipelineData::empty())
325    }
326}
327
328pub fn find_columns(input: &str) -> Vec<Spanned<String>> {
329    let mut chars = input.char_indices().peekable();
330    let mut output = vec![];
331
332    while let Some((_, c)) = chars.peek() {
333        if c.is_whitespace() {
334            // If the next character is non-newline whitespace, skip it.
335
336            let _ = chars.next();
337        } else {
338            // Otherwise, try to consume an unclassified token.
339
340            let result = baseline(&mut chars);
341
342            output.push(result);
343        }
344    }
345
346    output
347}
348
/// Kind of opening delimiter that `baseline` has seen and not yet closed.
349#[derive(Clone, Copy)]
350enum BlockKind {
    /// Opened by `(`, closed by `)`.
351    Parenthesis,
    /// Opened by `{`, closed by `}`.
352    Brace,
    /// Opened by `[`, closed by `]`.
353    Bracket,
354}
355
356fn baseline(src: &mut Input) -> Spanned<String> {
357    let mut token_contents = String::new();
358
359    let start_offset = if let Some((pos, _)) = src.peek() {
360        *pos
361    } else {
362        0
363    };
364
365    // This variable tracks the starting character of a string literal, so that
366    // we remain inside the string literal lexer mode until we encounter the
367    // closing quote.
368    let mut quote_start: Option<char> = None;
369
370    // This Vec tracks paired delimiters
371    let mut block_level: Vec<BlockKind> = vec![];
372
373    // A baseline token is terminated if it's not nested inside of a paired
374    // delimiter and the next character is one of: `|`, `;`, `#` or any
375    // whitespace.
376    fn is_termination(block_level: &[BlockKind], c: char) -> bool {
377        block_level.is_empty() && (c.is_whitespace())
378    }
379
380    // The process of slurping up a baseline token repeats:
381    //
382    // - String literal, which begins with `'`, `"` or `\``, and continues until
383    //   the same character is encountered again.
384    // - Delimiter pair, which begins with `[`, `(`, or `{`, and continues until
385    //   the matching closing delimiter is found, skipping comments and string
386    //   literals.
387    // - When not nested inside of a delimiter pair, when a terminating
388    //   character (whitespace, `|`, `;` or `#`) is encountered, the baseline
389    //   token is done.
390    // - Otherwise, accumulate the character into the current baseline token.
391    while let Some((_, c)) = src.peek() {
392        let c = *c;
393
394        if quote_start.is_some() {
395            // If we encountered the closing quote character for the current
396            // string, we're done with the current string.
397            if Some(c) == quote_start {
398                quote_start = None;
399            }
400        } else if c == '\n' {
401            if is_termination(&block_level, c) {
402                break;
403            }
404        } else if c == '\'' || c == '"' || c == '`' {
405            // We encountered the opening quote of a string literal.
406            quote_start = Some(c);
407        } else if c == '[' {
408            // We encountered an opening `[` delimiter.
409            block_level.push(BlockKind::Bracket);
410        } else if c == ']' {
411            // We encountered a closing `]` delimiter. Pop off the opening `[`
412            // delimiter.
413            if let Some(BlockKind::Bracket) = block_level.last() {
414                let _ = block_level.pop();
415            }
416        } else if c == '{' {
417            // We encountered an opening `{` delimiter.
418            block_level.push(BlockKind::Brace);
419        } else if c == '}' {
420            // We encountered a closing `}` delimiter. Pop off the opening `{`.
421            if let Some(BlockKind::Brace) = block_level.last() {
422                let _ = block_level.pop();
423            }
424        } else if c == '(' {
425            // We enceountered an opening `(` delimiter.
426            block_level.push(BlockKind::Parenthesis);
427        } else if c == ')' {
428            // We encountered a closing `)` delimiter. Pop off the opening `(`.
429            if let Some(BlockKind::Parenthesis) = block_level.last() {
430                let _ = block_level.pop();
431            }
432        } else if is_termination(&block_level, c) {
433            break;
434        }
435
436        // Otherwise, accumulate the character into the current token.
437        token_contents.push(c);
438
439        // Consume the character.
440        let _ = src.next();
441    }
442
443    let span = Span::new(start_offset, start_offset + token_contents.len());
444
445    // If there is still unclosed opening delimiters, close them and add
446    // synthetic closing characters to the accumulated token.
447    if block_level.last().is_some() {
448        // let delim: char = (*block).closing();
449        // let cause = ParseError::unexpected_eof(delim.to_string(), span);
450
451        // while let Some(bk) = block_level.pop() {
452        //     token_contents.push(bk.closing());
453        // }
454
455        return Spanned {
456            item: token_contents,
457            span,
458        };
459    }
460
461    if quote_start.is_some() {
462        // The non-lite parse trims quotes on both sides, so we add the expected quote so that
463        // anyone wanting to consume this partial parse (e.g., completions) will be able to get
464        // correct information from the non-lite parse.
465        // token_contents.push(delimiter);
466
467        // return (
468        //     token_contents.spanned(span),
469        //     Some(ParseError::unexpected_eof(delimiter.to_string(), span)),
470        // );
471        return Spanned {
472            item: token_contents,
473            span,
474        };
475    }
476
477    Spanned {
478        item: token_contents,
479        span,
480    }
481}
482
483fn merge_record(record: Record, range: &Range, input_span: Span) -> Value {
484    let (start_index, end_index) = match process_range(range, record.len(), input_span) {
485        Ok(Some((l_idx, r_idx))) => (l_idx, r_idx),
486        Ok(None) => return Value::record(record, input_span),
487        Err(e) => return Value::error(e, input_span),
488    };
489
490    match merge_record_impl(record, start_index, end_index, input_span) {
491        Ok(rec) => Value::record(rec, input_span),
492        Err(err) => Value::error(err, input_span),
493    }
494}
495
496fn process_range(
497    range: &Range,
498    length: usize,
499    input_span: Span,
500) -> Result<Option<(usize, usize)>, ShellError> {
501    match nu_cmd_base::util::process_range(range) {
502        Ok((l_idx, r_idx)) => {
503            let l_idx = if l_idx < 0 {
504                length as isize + l_idx
505            } else {
506                l_idx
507            };
508
509            let r_idx = if r_idx < 0 {
510                length as isize + r_idx
511            } else {
512                r_idx
513            };
514
515            if !(l_idx <= r_idx && (r_idx >= 0 || l_idx < (length as isize))) {
516                return Ok(None);
517            }
518
519            Ok(Some((
520                l_idx.max(0) as usize,
521                (r_idx as usize + 1).min(length),
522            )))
523        }
524        Err(processing_error) => Err(processing_error("could not find range index", input_span)),
525    }
526}
527
528fn merge_record_impl(
529    record: Record,
530    start_index: usize,
531    end_index: usize,
532    input_span: Span,
533) -> Result<Record, ShellError> {
534    let (mut cols, mut vals): (Vec<_>, Vec<_>) = record.into_iter().unzip();
535    // Merge Columns
536    ((start_index + 1)..(cols.len() - end_index + start_index + 1)).for_each(|idx| {
537        cols.swap(idx, end_index - start_index - 1 + idx);
538    });
539    cols.truncate(cols.len() - end_index + start_index + 1);
540
541    // Merge Values
542    let combined = vals
543        .iter()
544        .take(end_index)
545        .skip(start_index)
546        .map(|v| v.coerce_str().unwrap_or_default())
547        .join(" ");
548    let binding = Value::string(combined, Span::unknown());
549    let last_seg = vals.split_off(end_index);
550    vals.truncate(start_index);
551    vals.push(binding);
552    vals.extend(last_seg);
553
554    Record::from_raw_cols_vals(cols, vals, Span::unknown(), input_span)
555}
556
// Unit tests for the `detect columns` command.
557#[cfg(test)]
558mod test {
559    use super::*;
560
    // Hands the command to the crate-level `test_examples` helper; presumably this runs
    // the entries returned by `examples()` and compares them with their declared results
    // (confirm against the helper's definition).
561    #[test]
562    fn test_examples() {
563        crate::test_examples(DetectColumns)
564    }
565}
565}