nu_command/strings/
detect_columns.rs

1use itertools::Itertools;
2use nu_engine::command_prelude::*;
3use nu_protocol::{Config, Range};
4use std::{io::Cursor, iter::Peekable, str::CharIndices, sync::Arc};
5
6type Input<'t> = Peekable<CharIndices<'t>>;
7
/// Returns `true` when `c` is a character used to draw table borders.
///
/// Two groups are recognised:
/// - every code point of the Unicode "Box Drawing" block (`U+2500..=U+257F`),
///   which covers light, heavy, dashed, double-line and rounded-corner
///   variants of horizontal lines, vertical lines, corners and junctions;
/// - the ASCII stand-ins `-`, `=`, `|` and `+`.
///
/// Matching the whole block (rather than a hand-picked list) means tables
/// drawn with rounded corners (`╭ ╮ ╰ ╯`) or double lines (`═ ║ ╔ ╗ …`) are
/// recognised as well.
fn is_box_char(c: char) -> bool {
    matches!(c, '\u{2500}'..='\u{257F}' | '-' | '=' | '|' | '+')
}
22
23/// Attempts to automatically split text into multiple columns.
24///
25/// This command parses tabular data from strings or passes through existing tables.
26/// When `--ignore-box-chars` is used, it ignores separator lines and cleans box drawing characters from tokens.
27#[derive(Clone)]
28pub struct DetectColumns;
29
30impl Command for DetectColumns {
31    fn name(&self) -> &str {
32        "detect columns"
33    }
34
35    fn signature(&self) -> Signature {
36        Signature::build("detect columns")
37            .named(
38                "skip",
39                SyntaxShape::Int,
40                "Number of rows to skip before detecting.",
41                Some('s'),
42            )
43            .input_output_types(vec![
44                (Type::String, Type::table()),
45                (Type::table(), Type::table()),
46            ])
47            .switch("no-headers", "Don't detect headers.", Some('n'))
48            .switch(
49                "ignore-box-chars",
50                "Ignore lines consisting entirely of box drawing characters and clean box characters from tokens.",
51                Some('i'),
52            )
53            .named(
54                "combine-columns",
55                SyntaxShape::Range,
56                "Columns to be combined; listed as a range.",
57                Some('c'),
58            )
59            .switch(
60                "guess",
61                "Detect columns by guessing width, it may be useful if default one doesn't work.",
62                None,
63            )
64            .category(Category::Strings)
65    }
66
67    fn description(&self) -> &str {
68        "Attempt to automatically split text into multiple columns."
69    }
70
71    fn search_terms(&self) -> Vec<&str> {
72        vec!["split", "tabular"]
73    }
74
75    fn examples(&self) -> Vec<Example<'_>> {
76        vec![
77            Example {
78                description: "use --guess if you find default algorithm not working",
79                example: r"
80'Filesystem     1K-blocks      Used Available Use% Mounted on
81none             8150224         4   8150220   1% /mnt/c' | detect columns --guess",
82                result: Some(Value::test_list(vec![Value::test_record(record! {
83                    "Filesystem" => Value::test_string("none"),
84                    "1K-blocks" => Value::test_string("8150224"),
85                    "Used" => Value::test_string("4"),
86                    "Available" => Value::test_string("8150220"),
87                    "Use%" => Value::test_string("1%"),
88                    "Mounted on" => Value::test_string("/mnt/c")
89                })])),
90            },
91            Example {
92                description: "detect columns with no headers",
93                example: "'a b c' | detect columns  --no-headers",
94                result: Some(Value::test_list(vec![Value::test_record(record! {
95                        "column0" => Value::test_string("a"),
96                        "column1" => Value::test_string("b"),
97                        "column2" => Value::test_string("c"),
98                })])),
99            },
100            Example {
101                description: "",
102                example: "$'c1 c2 c3 c4 c5(char nl)a b c d e' | detect columns --combine-columns 0..1 ",
103                result: None,
104            },
105            Example {
106                description: "Splits a multi-line string into columns with headers detected",
107                example: "$'c1 c2 c3 c4 c5(char nl)a b c d e' | detect columns --combine-columns -2..-1 ",
108                result: None,
109            },
110            Example {
111                description: "Splits a multi-line string into columns with headers detected",
112                example: "$'c1 c2 c3 c4 c5(char nl)a b c d e' | detect columns --combine-columns 2.. ",
113                result: None,
114            },
115            Example {
116                description: "Parse external ls command and combine columns for datetime",
117                example: "^ls -lh | detect columns --no-headers --skip 1 --combine-columns 5..7",
118                result: None,
119            },
120            Example {
121                description: "Table literal input is passed through unchanged",
122                example: "[[name, age]; [Alice, 25]] | detect columns",
123                result: Some(Value::test_list(vec![Value::test_record(record! {
124                    "name" => Value::test_string("Alice"),
125                    "age" => Value::test_int(25)
126                })])),
127            },
128            Example {
129                description: "List of records input is passed through unchanged",
130                example: "[{name: Alice, age: 25}, {name: Bob, age: 30}] | detect columns",
131                result: Some(Value::test_list(vec![
132                    Value::test_record(record! {
133                        "name" => Value::test_string("Alice"),
134                        "age" => Value::test_int(25)
135                    }),
136                    Value::test_record(record! {
137                        "name" => Value::test_string("Bob"),
138                        "age" => Value::test_int(30)
139                    }),
140                ])),
141            },
142            Example {
143                description: "Parse a box-bordered table by ignoring separator lines and using header positions",
144                example: r#""+-------+-------+
145| col1  | col2  |
146+-------+-------+
147| a     | b     |
148+-------+-------+" | detect columns --ignore-box-chars"#,
149                result: Some(Value::test_list(vec![Value::test_record(record! {
150                    "col1" => Value::test_string("a"),
151                    "col2" => Value::test_string("b"),
152                })])),
153            },
154        ]
155    }
156
157    fn is_const(&self) -> bool {
158        true
159    }
160
161    fn run(
162        &self,
163        engine_state: &EngineState,
164        stack: &mut Stack,
165        call: &Call,
166        input: PipelineData,
167    ) -> Result<PipelineData, ShellError> {
168        // Extract command arguments
169        let num_rows_to_skip: Option<usize> = call.get_flag(engine_state, stack, "skip")?;
170        let noheader = call.has_flag(engine_state, stack, "no-headers")?;
171        let range: Option<Range> = call.get_flag(engine_state, stack, "combine-columns")?;
172        let ignore_box_chars = call.has_flag(engine_state, stack, "ignore-box-chars")?;
173        let config = stack.get_config(engine_state);
174
175        let args = Arguments {
176            noheader,
177            num_rows_to_skip,
178            range,
179            config,
180            ignore_box_chars,
181        };
182
183        // Dispatch to appropriate implementation based on guess flag
184        if call.has_flag(engine_state, stack, "guess")? {
185            guess_width(engine_state, call, input, args)
186        } else {
187            detect_columns(engine_state, call, input, args)
188        }
189    }
190
191    fn run_const(
192        &self,
193        working_set: &StateWorkingSet,
194        call: &Call,
195        input: PipelineData,
196    ) -> Result<PipelineData, ShellError> {
197        let num_rows_to_skip: Option<usize> = call.get_flag_const(working_set, "skip")?;
198        let noheader = call.has_flag_const(working_set, "no-headers")?;
199        let range: Option<Range> = call.get_flag_const(working_set, "combine-columns")?;
200        let ignore_box_chars = call.has_flag_const(working_set, "ignore-box-chars")?;
201        let config = working_set.get_config().clone();
202
203        let args = Arguments {
204            noheader,
205            num_rows_to_skip,
206            range,
207            config,
208            ignore_box_chars,
209        };
210
211        if call.has_flag_const(working_set, "guess")? {
212            guess_width(working_set.permanent(), call, input, args)
213        } else {
214            detect_columns(working_set.permanent(), call, input, args)
215        }
216    }
217}
218
/// Flag values of one `detect columns` invocation, bundled so they can be
/// handed to the parsing functions as a single value.
struct Arguments {
    /// Value of `--skip`: number of leading input lines to drop, if given.
    num_rows_to_skip: Option<usize>,
    /// `--no-headers`: name columns `column0`, `column1`, … instead of using
    /// the first line as header names.
    noheader: bool,
    /// Value of `--combine-columns`: range of columns to combine, if given.
    range: Option<Range>,
    /// Configuration used when collecting the input into a string.
    config: Arc<Config>,
    /// `--ignore-box-chars`: drop separator lines and clean border characters.
    ignore_box_chars: bool,
}
226
227fn guess_width(
228    engine_state: &EngineState,
229    call: &Call,
230    input: PipelineData,
231    args: Arguments,
232) -> Result<PipelineData, ShellError> {
233    use super::guess_width::GuessWidth;
234    let input_span = input.span().unwrap_or(call.head);
235
236    let mut input = input.collect_string("", &args.config)?;
237    if let Some(rows) = args.num_rows_to_skip {
238        input = input.lines().skip(rows).map(|x| x.to_string()).join("\n");
239    }
240
241    // Apply box character filtering if requested
242    if args.ignore_box_chars {
243        let filtered_lines = filter_box_chars(input.lines().map(|s| s.to_string()));
244        input = filtered_lines.join("\n");
245    }
246
247    let mut guess_width = GuessWidth::new_reader(Box::new(Cursor::new(input)));
248
249    let result = guess_width.read_all();
250
251    if result.is_empty() {
252        return Ok(Value::nothing(input_span).into_pipeline_data());
253    }
254    if !args.noheader {
255        let columns = result[0].clone();
256        Ok(result
257            .into_iter()
258            .skip(1)
259            .map(move |s| {
260                let mut values: Vec<Value> = s
261                    .into_iter()
262                    .map(|v| Value::string(v, input_span))
263                    .collect();
264                // some rows may has less columns, fill it with ""
265                for _ in values.len()..columns.len() {
266                    values.push(Value::string("", input_span));
267                }
268                let record =
269                    Record::from_raw_cols_vals(columns.clone(), values, input_span, input_span);
270                match record {
271                    Ok(r) => match &args.range {
272                        Some(range) => merge_record(r, range, input_span),
273                        None => Value::record(r, input_span),
274                    },
275                    Err(e) => Value::error(e, input_span),
276                }
277            })
278            .into_pipeline_data(input_span, engine_state.signals().clone()))
279    } else {
280        let length = result[0].len();
281        let columns: Vec<String> = (0..length).map(|n| format!("column{n}")).collect();
282        Ok(result
283            .into_iter()
284            .map(move |s| {
285                let mut values: Vec<Value> = s
286                    .into_iter()
287                    .map(|v| Value::string(v, input_span))
288                    .collect();
289                // some rows may has less columns, fill it with ""
290                for _ in values.len()..columns.len() {
291                    values.push(Value::string("", input_span));
292                }
293                let record =
294                    Record::from_raw_cols_vals(columns.clone(), values, input_span, input_span);
295                match record {
296                    Ok(r) => match &args.range {
297                        Some(range) => merge_record(r, range, input_span),
298                        None => Value::record(r, input_span),
299                    },
300                    Err(e) => Value::error(e, input_span),
301                }
302            })
303            .into_pipeline_data(input_span, engine_state.signals().clone()))
304    }
305}
306
307/// Core function to detect columns from input data.
308/// Handles different input types: passes through tables, parses strings.
309/// Applies filtering and cleaning based on the ignore_box_chars flag.
310fn detect_columns(
311    _engine_state: &EngineState,
312    call: &Call,
313    input: PipelineData,
314    args: Arguments,
315) -> Result<PipelineData, ShellError> {
316    let name_span = call.head;
317    let input_span = input.span().unwrap_or(Span::unknown());
318
319    // Handle different input types
320    match input {
321        // If input is already a table (list of records), pass it through unchanged
322        PipelineData::Value(val, _) => {
323            if let Value::List { vals, .. } = &val
324                && vals.iter().all(|v| matches!(v, Value::Record { .. }))
325            {
326                return Ok(val.into_pipeline_data());
327            }
328            // Otherwise, coerce to string for parsing
329            let input_str = val.coerce_str()?.to_string();
330            process_string_input(input_str, args, name_span, input_span)
331        }
332        // Table streams are passed through directly
333        PipelineData::ListStream(_, _) => Ok(input),
334        // External command output is collected as string
335        PipelineData::ByteStream(_, _) => {
336            let input_str = input.collect_string("", &args.config)?;
337            process_string_input(input_str, args, name_span, input_span)
338        }
339        // Empty input yields empty string
340        PipelineData::Empty => Ok(PipelineData::empty()),
341    }
342}
343
344/// Process string input for column detection.
345fn process_string_input(
346    input_str: String,
347    args: Arguments,
348    name_span: Span,
349    input_span: Span,
350) -> Result<PipelineData, ShellError> {
351    // Split input string into lines and skip the specified number of rows
352    let lines_iter = input_str
353        .lines()
354        .skip(args.num_rows_to_skip.unwrap_or_default());
355
356    // Conditionally filter out lines consisting entirely of box drawing characters
357    // and clean box characters from the remaining lines
358    // This helps clean up tabular output from commands like `iptab` that use box drawings
359    let filtered_lines: Vec<_> = if args.ignore_box_chars {
360        filter_box_chars(lines_iter.map(|s| s.to_string()))
361    } else {
362        // No filtering: pass through all lines as-is
363        lines_iter.map(|x| x.to_string()).collect()
364    };
365
366    let mut lines = filtered_lines.into_iter();
367    let header_line = lines.next();
368
369    if let Some(header_line) = header_line {
370        if args.ignore_box_chars {
371            process_with_box_filter(header_line, lines, args, name_span, input_span)
372        } else {
373            process_standard(header_line, lines, args, name_span, input_span)
374        }
375    } else {
376        Ok(PipelineData::empty())
377    }
378}
379
380/// Process input when ignore_box_chars is enabled.
381/// Handles both position-based and whitespace-based splitting depending on table format.
382fn process_with_box_filter(
383    header_line: String,
384    lines: impl Iterator<Item = String>,
385    args: Arguments,
386    name_span: Span,
387    input_span: Span,
388) -> Result<PipelineData, ShellError> {
389    // Check if the header line contains internal | separators
390    // If so, replace them with spaces so whitespace-based detection works
391    let has_internal_separators = header_line.contains('|') || header_line.contains('│');
392
393    let (processed_headers, processed_lines): (String, Vec<String>) = if has_internal_separators {
394        // Replace internal | with spaces for whitespace-based splitting
395        let replace_separators = |s: &str| {
396            s.chars()
397                .map(|c| if c == '|' || c == '│' { ' ' } else { c })
398                .collect::<String>()
399        };
400        (
401            replace_separators(&header_line),
402            lines.map(|line| replace_separators(&line)).collect(),
403        )
404    } else {
405        // No internal separators - use position-based splitting
406        (header_line.clone(), lines.collect())
407    };
408
409    // Use position-based splitting for tables without internal separators (like iptab)
410    if !has_internal_separators {
411        let header_positions = find_header_positions(&header_line);
412
413        if header_positions.is_empty() {
414            return Ok(PipelineData::empty());
415        }
416
417        // Extract header names
418        let mut header_names: Vec<String> = header_positions
419            .iter()
420            .map(|(_, name)| name.clone())
421            .collect();
422
423        if args.noheader {
424            for (i, name) in header_names.iter_mut().enumerate() {
425                *name = format!("column{i}");
426            }
427        }
428
429        // Check for duplicate column names
430        check_duplicate_string_headers(&header_names, input_span, name_span)?;
431
432        // Collect all lines for processing
433        let all_lines: Vec<_> = args
434            .noheader
435            .then_some(header_line.clone())
436            .into_iter()
437            .chain(processed_lines)
438            .collect();
439
440        return Ok(Value::list(
441            all_lines
442                .into_iter()
443                .map(|line| {
444                    let values = split_line_by_positions(&line, &header_positions);
445                    let mut record = Record::new();
446
447                    for (header, val) in header_names.iter().zip(values.iter()) {
448                        record.push(header, Value::string(val, name_span));
449                    }
450
451                    // Fill in missing columns with empty strings
452                    for header in header_names.iter().skip(values.len()) {
453                        record.push(header, Value::string("", name_span));
454                    }
455
456                    Ok::<Value, ShellError>(match &args.range {
457                        Some(range) => merge_record(record, range, name_span),
458                        None => Value::record(record, name_span),
459                    })
460                })
461                .collect::<Result<Vec<_>, _>>()?,
462            name_span,
463        )
464        .into_pipeline_data());
465    }
466
467    // Tables with internal separators: use whitespace-based splitting on processed data
468    let mut headers = find_columns(&processed_headers);
469
470    if args.noheader {
471        for header in headers.iter_mut().enumerate() {
472            header.1.item = format!("column{}", header.0);
473        }
474    }
475
476    // Check for duplicate column names
477    check_duplicate_headers(&headers, input_span, name_span)?;
478
479    // Collect all lines for processing
480    let all_lines: Vec<_> = args
481        .noheader
482        .then_some(processed_headers.clone())
483        .into_iter()
484        .chain(processed_lines)
485        .collect();
486
487    Ok(Value::list(
488        all_lines
489            .into_iter()
490            .map(|line| {
491                let row = find_columns(&line);
492                let mut record = Record::new();
493
494                for (header, val) in headers.iter().zip(row.iter()) {
495                    record.push(&header.item, Value::string(&val.item, name_span));
496                }
497
498                // Fill in missing columns with empty strings
499                for header in headers.iter().skip(row.len()) {
500                    record.push(&header.item, Value::string("", name_span));
501                }
502
503                Ok::<Value, ShellError>(match &args.range {
504                    Some(range) => merge_record(record, range, name_span),
505                    None => Value::record(record, name_span),
506                })
507            })
508            .collect::<Result<Vec<_>, _>>()?,
509        name_span,
510    )
511    .into_pipeline_data())
512}
513
514/// Process input with standard whitespace-based column detection.
515fn process_standard(
516    header_line: String,
517    lines: impl Iterator<Item = String>,
518    args: Arguments,
519    name_span: Span,
520    input_span: Span,
521) -> Result<PipelineData, ShellError> {
522    // Standard whitespace-based column detection
523    let mut headers = find_columns(&header_line);
524
525    if args.noheader {
526        for header in headers.iter_mut().enumerate() {
527            header.1.item = format!("column{}", header.0);
528        }
529    }
530
531    // Check for duplicate column names - this would create an invalid record
532    check_duplicate_headers(&headers, input_span, name_span)?;
533
534    // Collect remaining lines
535    let remaining_lines: Vec<_> = lines.collect();
536
537    // Check if column detection is working: if the first data row doesn't match
538    // the header structure, detection has failed and we should output all lines
539    // in a consistent "data" column to preserve the original data.
540    let detection_failed = remaining_lines
541        .first()
542        .is_some_and(|first_line| find_columns(first_line).len() != headers.len());
543
544    // When detection fails, include ALL original lines (including the first "header" line)
545    // When detection succeeds, only include header line if --no-headers was specified
546    let all_lines: Vec<_> = if detection_failed {
547        // Include the original first line since detection failed
548        std::iter::once(header_line.clone())
549            .chain(remaining_lines)
550            .collect()
551    } else {
552        // Detection succeeded - only include first line if --no-headers
553        args.noheader
554            .then_some(header_line.clone())
555            .into_iter()
556            .chain(remaining_lines)
557            .collect()
558    };
559
560    Ok(Value::list(
561        all_lines
562            .into_iter()
563            .map(move |x| {
564                let row = find_columns(&x);
565
566                let mut record = Record::new();
567
568                if !detection_failed && headers.len() == row.len() {
569                    for (header, val) in headers.iter().zip(row.iter()) {
570                        record.push(&header.item, Value::string(&val.item, name_span));
571                    }
572                } else {
573                    // Output the raw data - either detection failed or row doesn't match
574                    record.push("data", Value::string(&x, name_span));
575                }
576
577                Ok::<Value, ShellError>(match &args.range {
578                    Some(range) => merge_record(record, range, name_span),
579                    None => Value::record(record, name_span),
580                })
581            })
582            .collect::<Result<Vec<_>, _>>()?,
583        name_span,
584    )
585    .into_pipeline_data())
586}
587
588pub fn find_columns(input: &str) -> Vec<Spanned<String>> {
589    // For space-separated format, use the original baseline method
590    let mut chars = input.char_indices().peekable();
591    let mut output = vec![];
592
593    while let Some((_, c)) = chars.peek() {
594        if c.is_whitespace() {
595            // If the next character is non-newline whitespace, skip it.
596            let _ = chars.next();
597        } else {
598            // Otherwise, try to consume an unclassified token.
599            let result = baseline(&mut chars);
600            output.push(result);
601        }
602    }
603
604    output
605}
606
/// Reports whether the same name occurs more than once in `iter`.
///
/// Any item type that can be viewed as a `&str` is accepted (`&str`, `String`,
/// `&String`, …). Names are copied into the set of already-seen names, so the
/// items themselves may be temporaries. Iteration stops at the first repeat.
fn has_duplicate_names<I, S>(iter: I) -> bool
where
    I: IntoIterator<Item = S>,
    S: AsRef<str>,
{
    let mut seen = std::collections::HashSet::<String>::new();
    // `insert` returns false when the name was already present.
    iter.into_iter()
        .any(|name| !seen.insert(name.as_ref().to_owned()))
}
628
629/// Check for duplicate column names and return an error if found.
630fn check_duplicate_headers(
631    headers: &[Spanned<String>],
632    input_span: Span,
633    name_span: Span,
634) -> Result<(), ShellError> {
635    if has_duplicate_names(headers.iter().map(|h| &h.item)) {
636        Err(ShellError::ColumnDetectionFailure {
637            bad_value: input_span,
638            failure_site: name_span,
639        })
640    } else {
641        Ok(())
642    }
643}
644
645/// Check for duplicate column names in string headers and return an error if found.
646fn check_duplicate_string_headers(
647    headers: &[String],
648    input_span: Span,
649    name_span: Span,
650) -> Result<(), ShellError> {
651    if has_duplicate_names(headers.iter().map(|s| s.as_str())) {
652        Err(ShellError::ColumnDetectionFailure {
653            bad_value: input_span,
654            failure_site: name_span,
655        })
656    } else {
657        Ok(())
658    }
659}
660
661/// Filter and clean box drawing characters from lines.
662/// Returns filtered lines with box-only lines removed and border characters stripped.
663fn filter_box_chars<I>(lines_iter: I) -> Vec<String>
664where
665    I: Iterator<Item = String>,
666{
667    lines_iter
668        // Filter out lines where all non-whitespace characters are box drawing characters
669        .filter(|r| !r.trim().chars().all(is_box_char))
670        // Clean border characters from each line
671        .map(|line| {
672            let trimmed = line.trim();
673            // Strip only leading border character (| or │) and one optional space
674            let cleaned = trimmed
675                .strip_prefix('|')
676                .or_else(|| trimmed.strip_prefix('│'))
677                .unwrap_or(trimmed);
678            let cleaned = cleaned.strip_prefix(' ').unwrap_or(cleaned);
679            // Strip only trailing border character and one optional space
680            let cleaned = cleaned
681                .strip_suffix('|')
682                .or_else(|| cleaned.strip_suffix('│'))
683                .unwrap_or(cleaned);
684            let cleaned = cleaned.strip_suffix(' ').unwrap_or(cleaned);
685            cleaned.to_string()
686        })
687        .collect()
688}
689
/// Locates every whitespace-delimited word of `header_line`.
///
/// Returns one `(start, word)` pair per word, in order, where `start` is the
/// byte offset of the word's first character within `header_line`.
fn find_header_positions(header_line: &str) -> Vec<(usize, String)> {
    let mut columns = Vec::new();
    // Byte offset at which the word currently being scanned began, if any.
    let mut open_at: Option<usize> = None;

    for (offset, ch) in header_line.char_indices() {
        match (ch.is_whitespace(), open_at) {
            // Whitespace right after a word closes it.
            (true, Some(begin)) => {
                columns.push((begin, header_line[begin..offset].to_string()));
                open_at = None;
            }
            // First non-whitespace character opens a new word.
            (false, None) => open_at = Some(offset),
            // Still between words, or still inside one.
            _ => {}
        }
    }

    // A word that runs up to the end of the line is still open here.
    if let Some(begin) = open_at {
        columns.push((begin, header_line[begin..].to_string()));
    }

    columns
}
723
/// Snaps the byte index `idx` to a character boundary of `s`.
///
/// An index that already is a boundary is returned as is. Otherwise:
/// - with `backward == true` the index moves towards the start of the string
///   until it hits a boundary (used for column starts, so that a start landing
///   inside a multibyte character moves to the beginning of that character);
/// - with `backward == false` it moves towards the end until it hits a
///   boundary (used for column ends, so that no character is cut in half).
///
/// Indices past the end of `s` are snapped to `s.len()` in both directions.
#[inline]
fn adjust_char_boundary(s: &str, idx: usize, backward: bool) -> usize {
    let mut pos = idx;

    while !s.is_char_boundary(pos) {
        if backward {
            // Cannot underflow: index 0 is always a boundary.
            pos -= 1;
        } else if pos < s.len() {
            pos += 1;
        } else {
            // Past the end of the string: the end is the nearest boundary.
            return s.len();
        }
    }

    pos
}
746
747/// Given the raw header-derived byte `start`/`end` positions, compute a safe
748/// (start,end) pair for `line`, clamped to `prev_end` to avoid overlap.  Both
749/// returned indices are guaranteed to be valid char boundaries.
750fn safe_slice_range(line: &str, start: usize, end: usize, prev_end: usize) -> (usize, usize) {
751    let line_len = line.len();
752    let actual_end = end.min(line_len);
753
754    let mut safe_start = adjust_char_boundary(line, start, true);
755    if safe_start < prev_end {
756        safe_start = prev_end;
757    }
758
759    let mut safe_end = adjust_char_boundary(line, actual_end, false);
760    if safe_end < safe_start {
761        safe_end = safe_start;
762    }
763
764    (safe_start, safe_end)
765}
766
767/// Split a data line into columns based on header positions.
768/// Each column's value is the substring from its header position to the next header position.
769///
770/// Note that the header positions are computed from the first line only and are
771/// therefore byte offsets **in that header string**. subsequent rows may contain
772/// wider characters (e.g. an ellipsis or accented letter) which makes those offsets
773/// invalid for the later lines. we therefore adjust each start/end offset to a
774/// valid character boundary *for the line being sliced* to avoid panics.
775fn split_line_by_positions(line: &str, positions: &[(usize, String)]) -> Vec<String> {
776    if positions.is_empty() {
777        return vec![line.to_string()];
778    }
779
780    let mut values = vec![];
781    let line_len = line.len();
782
783    let mut prev_end = 0;
784    for (i, (start, _)) in positions.iter().enumerate() {
785        let start = *start;
786        let end = if i + 1 < positions.len() {
787            positions[i + 1].0
788        } else {
789            line_len
790        };
791
792        if start < line_len {
793            let (safe_start, safe_end) = safe_slice_range(line, start, end, prev_end);
794            let value = &line[safe_start..safe_end];
795            values.push(value.trim().to_string());
796            prev_end = safe_end;
797        } else {
798            values.push(String::new());
799        }
800    }
801
802    values
803}
804
/// Kind of paired delimiter; the tokenizer keeps a list of these to track
/// paired delimiters while reading a token.
#[derive(Clone, Copy)]
enum BlockKind {
    // NOTE(review): presumably `(` … `)` — confirm against the tokenizer.
    Parenthesis,
    // NOTE(review): presumably `{` … `}` — confirm against the tokenizer.
    Brace,
    // NOTE(review): presumably `[` … `]` — confirm against the tokenizer.
    Bracket,
}
811
812/// Tokenizes a single "baseline" token from the input stream.
813/// A baseline token is a sequence of characters that can span multiple lines,
814/// but is bounded by whitespace, pipes, semicolons, or other shell syntax elements.
815/// It handles string literals, nested delimiters (parentheses, braces, brackets),
816/// and stops at terminating characters.
817fn baseline(src: &mut Input) -> Spanned<String> {
818    let mut token_contents = String::new();
819
820    let start_offset = if let Some((pos, _)) = src.peek() {
821        *pos
822    } else {
823        0
824    };
825
826    // This variable tracks the starting character of a string literal, so that
827    // we remain inside the string literal lexer mode until we encounter the
828    // closing quote.
829    let mut quote_start: Option<char> = None;
830
831    // This Vec tracks paired delimiters
832    let mut block_level: Vec<BlockKind> = vec![];
833
834    // A baseline token is terminated if it's not nested inside of a paired
835    // delimiter and the next character is one of: `|`, `;`, `#` or any
836    // whitespace.
837    fn is_termination(block_level: &[BlockKind], c: char) -> bool {
838        block_level.is_empty() && (c.is_whitespace())
839    }
840
841    // The process of slurping up a baseline token repeats:
842    //
843    // - String literal, which begins with `'`, `"` or `\``, and continues until
844    //   the same character is encountered again.
845    // - Delimiter pair, which begins with `[`, `(`, or `{`, and continues until
846    //   the matching closing delimiter is found, skipping comments and string
847    //   literals.
848    // - When not nested inside of a delimiter pair, when a terminating
849    //   character (whitespace, `|`, `;` or `#`) is encountered, the baseline
850    //   token is done.
851    // - Otherwise, accumulate the character into the current baseline token.
852    while let Some((_, c)) = src.peek() {
853        let c = *c;
854
855        if quote_start.is_some() {
856            // If we encountered the closing quote character for the current
857            // string, we're done with the current string.
858            if Some(c) == quote_start {
859                quote_start = None;
860            }
861        } else if c == '\n' {
862            if is_termination(&block_level, c) {
863                break;
864            }
865        } else if c == '\'' || c == '"' || c == '`' {
866            // We encountered the opening quote of a string literal.
867            quote_start = Some(c);
868        } else if c == '[' {
869            // We encountered an opening `[` delimiter.
870            block_level.push(BlockKind::Bracket);
871        } else if c == ']' {
872            // We encountered a closing `]` delimiter. Pop off the opening `[`
873            // delimiter.
874            if let Some(BlockKind::Bracket) = block_level.last() {
875                let _ = block_level.pop();
876            }
877        } else if c == '{' {
878            // We encountered an opening `{` delimiter.
879            block_level.push(BlockKind::Brace);
880        } else if c == '}' {
881            // We encountered a closing `}` delimiter. Pop off the opening `{`.
882            if let Some(BlockKind::Brace) = block_level.last() {
883                let _ = block_level.pop();
884            }
885        } else if c == '(' {
886            // We enceountered an opening `(` delimiter.
887            block_level.push(BlockKind::Parenthesis);
888        } else if c == ')' {
889            // We encountered a closing `)` delimiter. Pop off the opening `(`.
890            if let Some(BlockKind::Parenthesis) = block_level.last() {
891                let _ = block_level.pop();
892            }
893        } else if is_termination(&block_level, c) {
894            break;
895        }
896
897        // Otherwise, accumulate the character into the current token.
898        token_contents.push(c);
899
900        // Consume the character.
901        let _ = src.next();
902    }
903
904    let span = Span::new(start_offset, start_offset + token_contents.len());
905
906    // If there is still unclosed opening delimiters, close them and add
907    // synthetic closing characters to the accumulated token.
908    if block_level.last().is_some() {
909        // let delim: char = (*block).closing();
910        // let cause = ParseError::unexpected_eof(delim.to_string(), span);
911
912        // while let Some(bk) = block_level.pop() {
913        //     token_contents.push(bk.closing());
914        // }
915
916        return Spanned {
917            item: token_contents,
918            span,
919        };
920    }
921
922    if quote_start.is_some() {
923        // The non-lite parse trims quotes on both sides, so we add the expected quote so that
924        // anyone wanting to consume this partial parse (e.g., completions) will be able to get
925        // correct information from the non-lite parse.
926        // token_contents.push(delimiter);
927
928        // return (
929        //     token_contents.spanned(span),
930        //     Some(ParseError::unexpected_eof(delimiter.to_string(), span)),
931        // );
932        return Spanned {
933            item: token_contents,
934            span,
935        };
936    }
937
938    Spanned {
939        item: token_contents,
940        span,
941    }
942}
943
944fn merge_record(record: Record, range: &Range, input_span: Span) -> Value {
945    let (start_index, end_index) = match process_range(range, record.len(), input_span) {
946        Ok(Some((l_idx, r_idx))) => (l_idx, r_idx),
947        Ok(None) => return Value::record(record, input_span),
948        Err(e) => return Value::error(e, input_span),
949    };
950
951    match merge_record_impl(record, start_index, end_index, input_span) {
952        Ok(rec) => Value::record(rec, input_span),
953        Err(err) => Value::error(err, input_span),
954    }
955}
956
957fn process_range(
958    range: &Range,
959    length: usize,
960    input_span: Span,
961) -> Result<Option<(usize, usize)>, ShellError> {
962    match nu_cmd_base::util::process_range(range) {
963        Ok((l_idx, r_idx)) => {
964            let l_idx = if l_idx < 0 {
965                length as isize + l_idx
966            } else {
967                l_idx
968            };
969
970            let r_idx = if r_idx < 0 {
971                length as isize + r_idx
972            } else {
973                r_idx
974            };
975
976            if !(l_idx <= r_idx && (r_idx >= 0 || l_idx < (length as isize))) {
977                return Ok(None);
978            }
979
980            Ok(Some((
981                l_idx.max(0) as usize,
982                (r_idx as usize + 1).min(length),
983            )))
984        }
985        Err(processing_error) => Err(processing_error("could not find range index", input_span)),
986    }
987}
988
989fn merge_record_impl(
990    record: Record,
991    start_index: usize,
992    end_index: usize,
993    input_span: Span,
994) -> Result<Record, ShellError> {
995    let (mut cols, mut vals): (Vec<_>, Vec<_>) = record.into_iter().unzip();
996    // Merge Columns
997    ((start_index + 1)..(cols.len() - end_index + start_index + 1)).for_each(|idx| {
998        cols.swap(idx, end_index - start_index - 1 + idx);
999    });
1000    cols.truncate(cols.len() - end_index + start_index + 1);
1001
1002    // Merge Values
1003    let combined = vals
1004        .iter()
1005        .take(end_index)
1006        .skip(start_index)
1007        .map(|v| v.coerce_str().unwrap_or_default())
1008        .join(" ");
1009    let binding = Value::string(combined, Span::unknown());
1010    let last_seg = vals.split_off(end_index);
1011    vals.truncate(start_index);
1012    vals.push(binding);
1013    vals.extend(last_seg);
1014
1015    Record::from_raw_cols_vals(cols, vals, Span::unknown(), input_span)
1016}
1017
#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_examples() {
        crate::test_examples(DetectColumns)
    }

    /// Regression test: a header offset that falls inside a multibyte
    /// character of a data row must not make the slicing panic, and the row
    /// must still be split sensibly. An ellipsis in a data row used to trigger
    /// exactly this crash.
    #[test]
    fn split_line_by_positions_multibyte_boundary() {
        // `…` occupies bytes 1..4, so byte offset 2 is in the middle of it.
        let line = "a…b";
        assert!(!line.is_char_boundary(2));

        // Pretend the header placed the second column at byte offset 2.
        let positions = vec![(0, String::from("a")), (2, String::from("b"))];

        // The offset is moved to a character boundary: the first cell keeps
        // the ellipsis and the second cell starts right after it.
        let cols = split_line_by_positions(line, &positions);
        assert_eq!(cols, ["a…", "b"]);
    }

    /// Rows containing assorted non-ASCII text must split without panicking
    /// and must always yield one cell per header column.
    #[test]
    fn split_line_with_various_unicode() {
        // Header with three simple space-separated columns.
        let positions = find_header_positions("a b c");

        let examples = [
            "x é y",         // accented letter
            "x 😄 y",        // single emoji
            "x 👨‍👩‍👧‍👦 y",        // ZWJ family emoji
            "x 中 y",        // CJK character
            "x a\u{0301} y", // decomposed accent
        ];

        for line in examples {
            // The exact cell contents are not asserted because wide graphemes
            // may be cut at different places; only the column count has to
            // stay stable.
            let cols = split_line_by_positions(line, &positions);
            assert_eq!(cols.len(), 3, "line produced wrong column count: {}", line);
        }
    }
}
nu_command/strings/detect_columns.rs

nu_command/strings/
detect_columns.rs