use itertools::Itertools;
use nu_engine::command_prelude::*;
use nu_protocol::{Config, Range};
use std::{io::Cursor, iter::Peekable, str::CharIndices, sync::Arc};
/// Peekable cursor of `(byte offset, char)` pairs over the text being tokenized.
type Input<'t> = Peekable<CharIndices<'t>>;
/// Reports whether `c` is a character used to draw table borders.
///
/// Covers horizontal rules, vertical rules and corner/junction pieces, in both
/// their Unicode box-drawing forms and the ASCII stand-ins `-`, `=`, `|`, `+`.
fn is_box_char(c: char) -> bool {
    match c {
        // Horizontal rules.
        '─' | '━' | '┄' | '┅' | '┈' | '┉' | '-' | '=' => true,
        // Vertical rules.
        '│' | '┃' | '┆' | '┇' | '┊' | '┋' | '|' => true,
        // Corners and junctions.
        '+' | '├' | '┤' | '┬' | '┴' | '┼' | '┌' | '┐' | '└' | '┘' => true,
        _ => false,
    }
}
/// The `detect columns` command: splits text into table columns.
#[derive(Clone)]
pub struct DetectColumns;
impl Command for DetectColumns {
fn name(&self) -> &str {
"detect columns"
}
fn signature(&self) -> Signature {
Signature::build("detect columns")
.named(
"skip",
SyntaxShape::Int,
"Number of rows to skip before detecting.",
Some('s'),
)
.input_output_types(vec![
(Type::String, Type::table()),
(Type::table(), Type::table()),
])
.switch("no-headers", "Don't detect headers.", Some('n'))
.switch(
"ignore-box-chars",
"Ignore lines consisting entirely of box drawing characters and clean box characters from tokens.",
Some('i'),
)
.named(
"combine-columns",
SyntaxShape::Range,
"Columns to be combined; listed as a range.",
Some('c'),
)
.switch(
"guess",
"Detect columns by guessing width, it may be useful if default one doesn't work.",
None,
)
.category(Category::Strings)
}
fn description(&self) -> &str {
"Attempt to automatically split text into multiple columns."
}
fn search_terms(&self) -> Vec<&str> {
vec!["split", "tabular"]
}
fn examples(&self) -> Vec<Example<'_>> {
vec![
Example {
description: "use --guess if you find default algorithm not working",
example: "
'Filesystem 1K-blocks Used Available Use% Mounted on
none 8150224 4 8150220 1% /mnt/c' | detect columns --guess",
result: Some(Value::test_list(vec![Value::test_record(record! {
"Filesystem" => Value::test_string("none"),
"1K-blocks" => Value::test_string("8150224"),
"Used" => Value::test_string("4"),
"Available" => Value::test_string("8150220"),
"Use%" => Value::test_string("1%"),
"Mounted on" => Value::test_string("/mnt/c")
})])),
},
Example {
description: "detect columns with no headers",
example: "'a b c' | detect columns --no-headers",
result: Some(Value::test_list(vec![Value::test_record(record! {
"column0" => Value::test_string("a"),
"column1" => Value::test_string("b"),
"column2" => Value::test_string("c"),
})])),
},
Example {
description: "",
example: "$'c1 c2 c3 c4 c5(char nl)a b c d e' | detect columns --combine-columns 0..1 ",
result: None,
},
Example {
description: "Splits a multi-line string into columns with headers detected",
example: "$'c1 c2 c3 c4 c5(char nl)a b c d e' | detect columns --combine-columns -2..-1 ",
result: None,
},
Example {
description: "Splits a multi-line string into columns with headers detected",
example: "$'c1 c2 c3 c4 c5(char nl)a b c d e' | detect columns --combine-columns 2.. ",
result: None,
},
Example {
description: "Parse external ls command and combine columns for datetime",
example: "^ls -lh | detect columns --no-headers --skip 1 --combine-columns 5..7",
result: None,
},
Example {
description: "Table literal input is passed through unchanged",
example: "[[name, age]; [Alice, 25]] | detect columns",
result: Some(Value::test_list(vec![Value::test_record(record! {
"name" => Value::test_string("Alice"),
"age" => Value::test_int(25)
})])),
},
Example {
description: "List of records input is passed through unchanged",
example: "[{name: Alice, age: 25}, {name: Bob, age: 30}] | detect columns",
result: Some(Value::test_list(vec![
Value::test_record(record! {
"name" => Value::test_string("Alice"),
"age" => Value::test_int(25)
}),
Value::test_record(record! {
"name" => Value::test_string("Bob"),
"age" => Value::test_int(30)
}),
])),
},
Example {
description: "Parse a box-bordered table by ignoring separator lines and using header positions",
example: r#""+-------+-------+
| col1 | col2 |
+-------+-------+
| a | b |
+-------+-------+" | detect columns --ignore-box-chars"#,
result: Some(Value::test_list(vec![Value::test_record(record! {
"col1" => Value::test_string("a"),
"col2" => Value::test_string("b"),
})])),
},
]
}
fn is_const(&self) -> bool {
true
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let num_rows_to_skip: Option<usize> = call.get_flag(engine_state, stack, "skip")?;
let noheader = call.has_flag(engine_state, stack, "no-headers")?;
let range: Option<Range> = call.get_flag(engine_state, stack, "combine-columns")?;
let ignore_box_chars = call.has_flag(engine_state, stack, "ignore-box-chars")?;
let config = stack.get_config(engine_state);
let args = Arguments {
noheader,
num_rows_to_skip,
range,
config,
ignore_box_chars,
};
if call.has_flag(engine_state, stack, "guess")? {
guess_width(engine_state, call, input, args)
} else {
detect_columns(engine_state, call, input, args)
}
}
fn run_const(
&self,
working_set: &StateWorkingSet,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let num_rows_to_skip: Option<usize> = call.get_flag_const(working_set, "skip")?;
let noheader = call.has_flag_const(working_set, "no-headers")?;
let range: Option<Range> = call.get_flag_const(working_set, "combine-columns")?;
let ignore_box_chars = call.has_flag_const(working_set, "ignore-box-chars")?;
let config = working_set.get_config().clone();
let args = Arguments {
noheader,
num_rows_to_skip,
range,
config,
ignore_box_chars,
};
if call.has_flag_const(working_set, "guess")? {
guess_width(working_set.permanent(), call, input, args)
} else {
detect_columns(working_set.permanent(), call, input, args)
}
}
}
/// Flag values for one invocation, gathered in `run`/`run_const` and handed to the
/// detection routines.
struct Arguments {
/// Value of `--skip`: how many leading input lines to drop, if given.
num_rows_to_skip: Option<usize>,
/// Set by `--no-headers`: name columns `column0`, `column1`, ... instead of using the first line.
noheader: bool,
/// Value of `--combine-columns`: range of columns to merge into one, if given.
range: Option<Range>,
/// Configuration passed to `collect_string` when the input has to be read as text.
config: Arc<Config>,
/// Set by `--ignore-box-chars`: drop border-only lines and strip border characters.
ignore_box_chars: bool,
}
fn guess_width(
engine_state: &EngineState,
call: &Call,
input: PipelineData,
args: Arguments,
) -> Result<PipelineData, ShellError> {
use super::guess_width::GuessWidth;
let input_span = input.span().unwrap_or(call.head);
let mut input = input.collect_string("", &args.config)?;
if let Some(rows) = args.num_rows_to_skip {
input = input.lines().skip(rows).map(|x| x.to_string()).join("\n");
}
if args.ignore_box_chars {
let filtered_lines = filter_box_chars(input.lines().map(|s| s.to_string()));
input = filtered_lines.join("\n");
}
let mut guess_width = GuessWidth::new_reader(Box::new(Cursor::new(input)));
let result = guess_width.read_all();
if result.is_empty() {
return Ok(Value::nothing(input_span).into_pipeline_data());
}
if !args.noheader {
let columns = result[0].clone();
Ok(result
.into_iter()
.skip(1)
.map(move |s| {
let mut values: Vec<Value> = s
.into_iter()
.map(|v| Value::string(v, input_span))
.collect();
for _ in values.len()..columns.len() {
values.push(Value::string("", input_span));
}
let record =
Record::from_raw_cols_vals(columns.clone(), values, input_span, input_span);
match record {
Ok(r) => match &args.range {
Some(range) => merge_record(r, range, input_span),
None => Value::record(r, input_span),
},
Err(e) => Value::error(e, input_span),
}
})
.into_pipeline_data(input_span, engine_state.signals().clone()))
} else {
let length = result[0].len();
let columns: Vec<String> = (0..length).map(|n| format!("column{n}")).collect();
Ok(result
.into_iter()
.map(move |s| {
let mut values: Vec<Value> = s
.into_iter()
.map(|v| Value::string(v, input_span))
.collect();
for _ in values.len()..columns.len() {
values.push(Value::string("", input_span));
}
let record =
Record::from_raw_cols_vals(columns.clone(), values, input_span, input_span);
match record {
Ok(r) => match &args.range {
Some(range) => merge_record(r, range, input_span),
None => Value::record(r, input_span),
},
Err(e) => Value::error(e, input_span),
}
})
.into_pipeline_data(input_span, engine_state.signals().clone()))
}
}
/// Default detection strategy.
///
/// Input that is already tabular (a list made up only of records, or a list stream)
/// is returned untouched, and empty input stays empty. Any other value is coerced to
/// a string, and byte streams are collected as text, before being handed to
/// `process_string_input`.
fn detect_columns(
    _engine_state: &EngineState,
    call: &Call,
    input: PipelineData,
    args: Arguments,
) -> Result<PipelineData, ShellError> {
    let name_span = call.head;
    let input_span = input.span().unwrap_or(name_span);

    match input {
        PipelineData::Empty => Ok(PipelineData::empty()),
        PipelineData::ListStream(..) => Ok(input),
        PipelineData::ByteStream(..) => {
            let text = input.collect_string("", &args.config)?;
            process_string_input(text, args, name_span, input_span)
        }
        PipelineData::Value(val, _) => {
            let is_record_list = match &val {
                Value::List { vals, .. } => vals
                    .iter()
                    .all(|item| matches!(item, Value::Record { .. })),
                _ => false,
            };
            if is_record_list {
                return Ok(val.into_pipeline_data());
            }
            let text = val.coerce_str()?.to_string();
            process_string_input(text, args, name_span, input_span)
        }
    }
}
/// Splits `input_str` into lines, applies `--skip` and (optionally) box-character
/// filtering, then treats the first remaining line as the header line and forwards
/// everything to the box-aware or the standard processor. Yields empty pipeline data
/// when no line is left.
fn process_string_input(
    input_str: String,
    args: Arguments,
    name_span: Span,
    input_span: Span,
) -> Result<PipelineData, ShellError> {
    let skipped = args.num_rows_to_skip.unwrap_or_default();
    let candidates = input_str.lines().skip(skipped).map(|s| s.to_string());
    let kept: Vec<String> = if args.ignore_box_chars {
        filter_box_chars(candidates)
    } else {
        candidates.collect()
    };

    let mut rows = kept.into_iter();
    let Some(header_line) = rows.next() else {
        return Ok(PipelineData::empty());
    };

    if args.ignore_box_chars {
        process_with_box_filter(header_line, rows, args, name_span, input_span)
    } else {
        process_standard(header_line, rows, args, name_span, input_span)
    }
}
fn process_with_box_filter(
header_line: String,
lines: impl Iterator<Item = String>,
args: Arguments,
name_span: Span,
input_span: Span,
) -> Result<PipelineData, ShellError> {
let has_internal_separators = header_line.contains('|') || header_line.contains('│');
let (processed_headers, processed_lines): (String, Vec<String>) = if has_internal_separators {
let replace_separators = |s: &str| {
s.chars()
.map(|c| if c == '|' || c == '│' { ' ' } else { c })
.collect::<String>()
};
(
replace_separators(&header_line),
lines.map(|line| replace_separators(&line)).collect(),
)
} else {
(header_line.clone(), lines.collect())
};
if !has_internal_separators {
let header_positions = find_header_positions(&header_line);
if header_positions.is_empty() {
return Ok(PipelineData::empty());
}
let mut header_names: Vec<String> = header_positions
.iter()
.map(|(_, name)| name.clone())
.collect();
if args.noheader {
for (i, name) in header_names.iter_mut().enumerate() {
*name = format!("column{i}");
}
}
check_duplicate_string_headers(&header_names, input_span, name_span)?;
let all_lines: Vec<_> = args
.noheader
.then_some(header_line.clone())
.into_iter()
.chain(processed_lines)
.collect();
return Ok(Value::list(
all_lines
.into_iter()
.map(|line| {
let values = split_line_by_positions(&line, &header_positions);
let mut record = Record::new();
for (header, val) in header_names.iter().zip(values.iter()) {
record.push(header, Value::string(val, name_span));
}
for header in header_names.iter().skip(values.len()) {
record.push(header, Value::string("", name_span));
}
Ok::<Value, ShellError>(match &args.range {
Some(range) => merge_record(record, range, name_span),
None => Value::record(record, name_span),
})
})
.collect::<Result<Vec<_>, _>>()?,
name_span,
)
.into_pipeline_data());
}
let mut headers = find_columns(&processed_headers);
if args.noheader {
for header in headers.iter_mut().enumerate() {
header.1.item = format!("column{}", header.0);
}
}
check_duplicate_headers(&headers, input_span, name_span)?;
let all_lines: Vec<_> = args
.noheader
.then_some(processed_headers.clone())
.into_iter()
.chain(processed_lines)
.collect();
Ok(Value::list(
all_lines
.into_iter()
.map(|line| {
let row = find_columns(&line);
let mut record = Record::new();
for (header, val) in headers.iter().zip(row.iter()) {
record.push(&header.item, Value::string(&val.item, name_span));
}
for header in headers.iter().skip(row.len()) {
record.push(&header.item, Value::string("", name_span));
}
Ok::<Value, ShellError>(match &args.range {
Some(range) => merge_record(record, range, name_span),
None => Value::record(record, name_span),
})
})
.collect::<Result<Vec<_>, _>>()?,
name_span,
)
.into_pipeline_data())
}
/// Builds the table for plain (non-box) text.
///
/// The header line is tokenized into column names (replaced by `column0`, `column1`, ...
/// under `--no-headers`). If the first data line does not tokenize into the same number
/// of columns, detection is considered failed: every line, header included, is emitted
/// as a single-field record named `data`. Otherwise each line whose token count matches
/// the header becomes a regular record, and any other line falls back to a `data` record.
/// The header line itself is only emitted as a row under `--no-headers` or on failure.
fn process_standard(
    header_line: String,
    lines: impl Iterator<Item = String>,
    args: Arguments,
    name_span: Span,
    input_span: Span,
) -> Result<PipelineData, ShellError> {
    let mut headers = find_columns(&header_line);
    if args.noheader {
        for (index, header) in headers.iter_mut().enumerate() {
            header.item = format!("column{index}");
        }
    }
    check_duplicate_headers(&headers, input_span, name_span)?;

    let body: Vec<String> = lines.collect();
    let detection_failed = match body.first() {
        Some(first_line) => find_columns(first_line).len() != headers.len(),
        None => false,
    };

    let emit_header_row = detection_failed || args.noheader;
    let rows: Vec<Value> = emit_header_row
        .then_some(header_line)
        .into_iter()
        .chain(body)
        .map(|line| {
            let cells = find_columns(&line);
            let mut record = Record::new();
            if detection_failed || cells.len() != headers.len() {
                record.push("data", Value::string(&line, name_span));
            } else {
                for (header, cell) in headers.iter().zip(&cells) {
                    record.push(&header.item, Value::string(&cell.item, name_span));
                }
            }
            match &args.range {
                Some(range) => merge_record(record, range, name_span),
                None => Value::record(record, name_span),
            }
        })
        .collect();
    Ok(Value::list(rows, name_span).into_pipeline_data())
}
/// Tokenizes `input` into whitespace-separated columns.
///
/// Whitespace between tokens is skipped; each token is read by `baseline`, which
/// keeps quoted text and bracketed groups together and records the token's span.
pub fn find_columns(input: &str) -> Vec<Spanned<String>> {
    let mut cursor = input.char_indices().peekable();
    let mut tokens = Vec::new();
    while let Some(&(_, ch)) = cursor.peek() {
        if ch.is_whitespace() {
            cursor.next();
            continue;
        }
        tokens.push(baseline(&mut cursor));
    }
    tokens
}
/// Returns `true` as soon as a name yielded by `iter` repeats an earlier one
/// (exact, case-sensitive comparison); `false` if all names are distinct.
fn has_duplicate_names<I, S>(iter: I) -> bool
where
    I: IntoIterator<Item = S>,
    S: AsRef<str>,
{
    let mut seen = std::collections::HashSet::new();
    // `insert` reports `false` for a value that is already present.
    iter.into_iter()
        .any(|name| !seen.insert(name.as_ref().to_string()))
}
/// Fails with `ShellError::ColumnDetectionFailure` when two spanned headers share a
/// name; `input_span` marks the offending value and `name_span` the failure site.
fn check_duplicate_headers(
    headers: &[Spanned<String>],
    input_span: Span,
    name_span: Span,
) -> Result<(), ShellError> {
    if !has_duplicate_names(headers.iter().map(|header| &header.item)) {
        return Ok(());
    }
    Err(ShellError::ColumnDetectionFailure {
        bad_value: input_span,
        failure_site: name_span,
    })
}
/// Fails with `ShellError::ColumnDetectionFailure` when two plain-string headers share
/// a name; `input_span` marks the offending value and `name_span` the failure site.
fn check_duplicate_string_headers(
    headers: &[String],
    input_span: Span,
    name_span: Span,
) -> Result<(), ShellError> {
    if !has_duplicate_names(headers.iter().map(|name| name.as_str())) {
        return Ok(());
    }
    Err(ShellError::ColumnDetectionFailure {
        bad_value: input_span,
        failure_site: name_span,
    })
}
/// Removes table borders from a sequence of lines.
///
/// Lines that, once trimmed, consist only of box characters (this includes blank
/// lines) are dropped. Every remaining line is trimmed, then loses at most one
/// leading `|`/`│` plus one following space, and at most one trailing `|`/`│` plus
/// one preceding space.
fn filter_box_chars<I>(lines_iter: I) -> Vec<String>
where
    I: Iterator<Item = String>,
{
    const EDGES: &[char] = &['|', '│'];

    let mut kept = Vec::new();
    for line in lines_iter {
        let body = line.trim();
        if body.chars().all(is_box_char) {
            continue;
        }
        let body = body.strip_prefix(EDGES).unwrap_or(body);
        let body = body.strip_prefix(' ').unwrap_or(body);
        let body = body.strip_suffix(EDGES).unwrap_or(body);
        let body = body.strip_suffix(' ').unwrap_or(body);
        kept.push(body.to_string());
    }
    kept
}
/// Locates every whitespace-separated word in `header_line`.
///
/// Returns `(byte offset of the word's first character, word)` pairs in order of
/// appearance.
fn find_header_positions(header_line: &str) -> Vec<(usize, String)> {
    let mut found = Vec::new();
    // Byte offset where the word currently being scanned began, if any.
    let mut open_word: Option<usize> = None;

    for (offset, ch) in header_line.char_indices() {
        match (ch.is_whitespace(), open_word) {
            (true, Some(begin)) => {
                found.push((begin, header_line[begin..offset].to_string()));
                open_word = None;
            }
            (false, None) => open_word = Some(offset),
            _ => {}
        }
    }
    // A word running up to the end of the line has no closing whitespace.
    if let Some(begin) = open_word {
        found.push((begin, header_line[begin..].to_string()));
    }
    found
}
/// Snaps byte index `idx` to a UTF-8 character boundary of `s`.
///
/// An index already on a boundary is returned unchanged. Otherwise the nearest
/// boundary below `idx` is returned when `backward` is set (0 if none), and the
/// nearest boundary above it otherwise (`s.len()` if none, including when `idx`
/// lies past the end of `s`).
#[inline]
fn adjust_char_boundary(s: &str, idx: usize, backward: bool) -> usize {
    if s.is_char_boundary(idx) {
        idx
    } else if backward {
        (0..idx).rfind(|&i| s.is_char_boundary(i)).unwrap_or(0)
    } else {
        // `s.len()` is always a boundary, so it doubles as the fallback.
        (idx..s.len())
            .find(|&i| s.is_char_boundary(i))
            .unwrap_or(s.len())
    }
}
/// Turns the requested byte range `start..end` into one that is safe to slice `line` with.
///
/// `end` is clamped to the line length; `start` is moved back and `end` forward to
/// character boundaries; the start is never allowed before `prev_end` (the end of the
/// previously returned range) and the end is never allowed before the start.
fn safe_slice_range(line: &str, start: usize, end: usize, prev_end: usize) -> (usize, usize) {
    let clamped_end = end.min(line.len());
    let lower = adjust_char_boundary(line, start, true).max(prev_end);
    let upper = adjust_char_boundary(line, clamped_end, false).max(lower);
    (lower, upper)
}
/// Cuts `line` into cells at the byte offsets stored in `positions`.
///
/// Cell `i` spans from its own offset up to the next position's offset (or the end of
/// the line for the last one) and is trimmed. Ranges are adjusted so they never split
/// a multi-byte character or overlap the previous cell. A position at or beyond the
/// end of the line yields an empty cell. With no positions the whole line is returned
/// as the only cell.
fn split_line_by_positions(line: &str, positions: &[(usize, String)]) -> Vec<String> {
    if positions.is_empty() {
        return vec![line.to_string()];
    }

    let line_len = line.len();
    // End of the last slice actually taken; later slices may not start before it.
    let mut consumed = 0;
    positions
        .iter()
        .enumerate()
        .map(|(index, &(begin, _))| {
            if begin >= line_len {
                return String::new();
            }
            let limit = positions
                .get(index + 1)
                .map_or(line_len, |next| next.0);
            let (lower, upper) = safe_slice_range(line, begin, limit, consumed);
            consumed = upper;
            line[lower..upper].trim().to_string()
        })
        .collect()
}
/// Kind of bracketed group the tokenizer is currently inside of.
#[derive(Clone, Copy)]
enum BlockKind {
/// Opened by `(`, closed by `)`.
Parenthesis,
/// Opened by `{`, closed by `}`.
Brace,
/// Opened by `[`, closed by `]`.
Bracket,
}
/// Reads one token from `src`, leaving the cursor on the character that ended it.
///
/// A token ends at the first whitespace character found outside of quotes and
/// outside of any open `(`, `[` or `{` group, or at the end of input. Text between a
/// quote character (`'`, `"` or `` ` ``) and its matching closer is taken verbatim.
/// A closing bracket only closes a group when it matches the innermost open one.
/// Unterminated quotes or groups simply run to the end of input. The returned span
/// covers the token's byte range.
fn baseline(src: &mut Input) -> Spanned<String> {
    let start_offset = src.peek().map_or(0, |&(pos, _)| pos);
    let mut token = String::new();
    let mut open_quote: Option<char> = None;
    let mut open_blocks: Vec<BlockKind> = Vec::new();

    while let Some(&(_, c)) = src.peek() {
        if let Some(quote) = open_quote {
            if c == quote {
                open_quote = None;
            }
        } else {
            match c {
                '\'' | '"' | '`' => open_quote = Some(c),
                '[' => open_blocks.push(BlockKind::Bracket),
                '{' => open_blocks.push(BlockKind::Brace),
                '(' => open_blocks.push(BlockKind::Parenthesis),
                ']' => {
                    if matches!(open_blocks.last(), Some(BlockKind::Bracket)) {
                        open_blocks.pop();
                    }
                }
                '}' => {
                    if matches!(open_blocks.last(), Some(BlockKind::Brace)) {
                        open_blocks.pop();
                    }
                }
                ')' => {
                    if matches!(open_blocks.last(), Some(BlockKind::Parenthesis)) {
                        open_blocks.pop();
                    }
                }
                // Whitespace (newlines included) ends the token only at top level.
                _ if open_blocks.is_empty() && c.is_whitespace() => break,
                _ => {}
            }
        }
        token.push(c);
        src.next();
    }

    let span = Span::new(start_offset, start_offset + token.len());
    Spanned { item: token, span }
}
/// Applies `--combine-columns` to one record.
///
/// Resolves `range` against the record's width and merges the selected columns.
/// The record is returned unchanged when the range selects nothing; any failure
/// is returned as an error value instead of a record.
fn merge_record(record: Record, range: &Range, input_span: Span) -> Value {
    match process_range(range, record.len(), input_span) {
        Err(err) => Value::error(err, input_span),
        Ok(None) => Value::record(record, input_span),
        Ok(Some((start_index, end_index))) => {
            match merge_record_impl(record, start_index, end_index, input_span) {
                Ok(merged) => Value::record(merged, input_span),
                Err(err) => Value::error(err, input_span),
            }
        }
    }
}
/// Resolves a user-supplied column range against a record with `length` columns.
///
/// Negative bounds count from the end of the record. Returns
/// `Ok(Some((start, end)))` with a non-empty half-open index range clamped to
/// `0..length`, or `Ok(None)` when the range is reversed or does not overlap the
/// record at all (in which case nothing should be merged).
///
/// # Errors
///
/// Returns the error produced by `nu_cmd_base::util::process_range` when the range
/// cannot be converted to indices.
fn process_range(
    range: &Range,
    length: usize,
    input_span: Span,
) -> Result<Option<(usize, usize)>, ShellError> {
    match nu_cmd_base::util::process_range(range) {
        Ok((l_idx, r_idx)) => {
            let len = length as isize;
            // Negative indices are offsets from the end of the record.
            let resolve = |idx: isize| if idx < 0 { len + idx } else { idx };
            let (l_idx, r_idx) = (resolve(l_idx), resolve(r_idx));

            // The range must be ordered and overlap `0..length`. The previous check
            // (`r_idx >= 0 || l_idx < length`) was always true for ordered ranges, so
            // a range entirely before the first column reached the cast below with a
            // negative `r_idx` (overflowing on -1), and a range entirely past the last
            // column produced `start > end`.
            if l_idx > r_idx || r_idx < 0 || l_idx >= len {
                return Ok(None);
            }

            // Both casts are lossless here: `r_idx >= 0` and `l_idx.max(0) >= 0`.
            let start = l_idx.max(0) as usize;
            let end = (r_idx as usize).saturating_add(1).min(length);
            if start >= end {
                return Ok(None);
            }
            Ok(Some((start, end)))
        }
        Err(processing_error) => Err(processing_error("could not find range index", input_span)),
    }
}
/// Collapses the fields at indices `start_index..end_index` of `record` into one field.
///
/// The values in that span are coerced to strings (a value that cannot be coerced
/// contributes an empty string) and joined with single spaces. For
/// `start_index < end_index <= record.len()` the merged field keeps the name of the
/// column at `start_index`, and the columns after the span keep their names and values.
///
/// Returns whatever `Record::from_raw_cols_vals` returns for the rebuilt column and
/// value lists.
fn merge_record_impl(
record: Record,
start_index: usize,
end_index: usize,
input_span: Span,
) -> Result<Record, ShellError> {
let (mut cols, mut vals): (Vec<_>, Vec<_>) = record.into_iter().unzip();
// Shift the column names that follow the merged span left so they sit directly
// after `start_index`; the names inside the span end up at the tail...
((start_index + 1)..(cols.len() - end_index + start_index + 1)).for_each(|idx| {
cols.swap(idx, end_index - start_index - 1 + idx);
});
// ...which is then cut off, leaving one name for the merged span.
cols.truncate(cols.len() - end_index + start_index + 1);
// Join the span's values into a single space-separated string.
let combined = vals
.iter()
.take(end_index)
.skip(start_index)
.map(|v| v.coerce_str().unwrap_or_default())
.join(" ");
let binding = Value::string(combined, input_span);
// Rebuild the values as: values before the span, the merged value, values after it.
let last_seg = vals.split_off(end_index);
vals.truncate(start_index);
vals.push(binding);
vals.extend(last_seg);
Record::from_raw_cols_vals(cols, vals, input_span, input_span)
}
#[cfg(test)]
mod test {
use super::*;
/// Runs the examples declared by the command through the shared example test harness.
#[test]
fn test_examples() -> nu_test_support::Result {
nu_test_support::test().examples(DetectColumns)
}
/// A column offset that falls inside a multi-byte character must not panic: the
/// first cell is extended to include the whole character and the second starts after it.
#[test]
fn split_line_by_positions_multibyte_boundary() {
let line = "a…b";
// Byte 2 lies in the middle of the three-byte '…'.
assert!(!line.is_char_boundary(2));
let positions = vec![(0, "a".to_string()), (2, "b".to_string())];
let cols = split_line_by_positions(line, &positions);
assert_eq!(cols, vec!["a…".to_string(), "b".to_string()]);
}
/// Slicing lines that contain assorted non-ASCII text at ASCII-derived offsets must
/// still produce one cell per header position.
#[test]
fn split_line_with_various_unicode() {
let positions = find_header_positions("a b c");
let examples = [
"x é y", "x 😄 y", "x 👨👩👧👦 y", "x 中 y", "x a\u{0301} y", ];
for &line in examples.iter() {
let cols = split_line_by_positions(line, &positions);
assert_eq!(cols.len(), 3, "line produced wrong column count: {}", line);
}
}
}