1use itertools::Itertools;
2use nu_engine::command_prelude::*;
3use nu_protocol::{Config, Range};
4use std::{io::Cursor, iter::Peekable, str::CharIndices, sync::Arc};
5
6type Input<'t> = Peekable<CharIndices<'t>>;
7
/// Zero-sized marker type that carries the `detect columns` command implementation.
#[derive(Clone)]
pub struct DetectColumns;
10
11impl Command for DetectColumns {
12 fn name(&self) -> &str {
13 "detect columns"
14 }
15
16 fn signature(&self) -> Signature {
17 Signature::build("detect columns")
18 .named(
19 "skip",
20 SyntaxShape::Int,
21 "number of rows to skip before detecting",
22 Some('s'),
23 )
24 .input_output_types(vec![(Type::String, Type::table())])
25 .switch("no-headers", "don't detect headers", Some('n'))
26 .named(
27 "combine-columns",
28 SyntaxShape::Range,
29 "columns to be combined; listed as a range",
30 Some('c'),
31 )
32 .switch(
33 "guess",
34 "detect columns by guessing width, it may be useful if default one doesn't work",
35 None,
36 )
37 .category(Category::Strings)
38 }
39
40 fn description(&self) -> &str {
41 "Attempt to automatically split text into multiple columns."
42 }
43
44 fn search_terms(&self) -> Vec<&str> {
45 vec!["split", "tabular"]
46 }
47
48 fn examples(&self) -> Vec<Example<'_>> {
49 vec![
50 Example {
51 description: "use --guess if you find default algorithm not working",
52 example: r"
53'Filesystem 1K-blocks Used Available Use% Mounted on
54none 8150224 4 8150220 1% /mnt/c' | detect columns --guess",
55 result: Some(Value::test_list(vec![Value::test_record(record! {
56 "Filesystem" => Value::test_string("none"),
57 "1K-blocks" => Value::test_string("8150224"),
58 "Used" => Value::test_string("4"),
59 "Available" => Value::test_string("8150220"),
60 "Use%" => Value::test_string("1%"),
61 "Mounted on" => Value::test_string("/mnt/c")
62 })])),
63 },
64 Example {
65 description: "detect columns with no headers",
66 example: "'a b c' | detect columns --no-headers",
67 result: Some(Value::test_list(vec![Value::test_record(record! {
68 "column0" => Value::test_string("a"),
69 "column1" => Value::test_string("b"),
70 "column2" => Value::test_string("c"),
71 })])),
72 },
73 Example {
74 description: "",
75 example: "$'c1 c2 c3 c4 c5(char nl)a b c d e' | detect columns --combine-columns 0..1 ",
76 result: None,
77 },
78 Example {
79 description: "Splits a multi-line string into columns with headers detected",
80 example: "$'c1 c2 c3 c4 c5(char nl)a b c d e' | detect columns --combine-columns -2..-1 ",
81 result: None,
82 },
83 Example {
84 description: "Splits a multi-line string into columns with headers detected",
85 example: "$'c1 c2 c3 c4 c5(char nl)a b c d e' | detect columns --combine-columns 2.. ",
86 result: None,
87 },
88 Example {
89 description: "Parse external ls command and combine columns for datetime",
90 example: "^ls -lh | detect columns --no-headers --skip 1 --combine-columns 5..7",
91 result: None,
92 },
93 ]
94 }
95
96 fn is_const(&self) -> bool {
97 true
98 }
99
100 fn run(
101 &self,
102 engine_state: &EngineState,
103 stack: &mut Stack,
104 call: &Call,
105 input: PipelineData,
106 ) -> Result<PipelineData, ShellError> {
107 let num_rows_to_skip: Option<usize> = call.get_flag(engine_state, stack, "skip")?;
108 let noheader = call.has_flag(engine_state, stack, "no-headers")?;
109 let range: Option<Range> = call.get_flag(engine_state, stack, "combine-columns")?;
110 let config = stack.get_config(engine_state);
111
112 let args = Arguments {
113 noheader,
114 num_rows_to_skip,
115 range,
116 config,
117 };
118
119 if call.has_flag(engine_state, stack, "guess")? {
120 guess_width(engine_state, call, input, args)
121 } else {
122 detect_columns(engine_state, call, input, args)
123 }
124 }
125
126 fn run_const(
127 &self,
128 working_set: &StateWorkingSet,
129 call: &Call,
130 input: PipelineData,
131 ) -> Result<PipelineData, ShellError> {
132 let num_rows_to_skip: Option<usize> = call.get_flag_const(working_set, "skip")?;
133 let noheader = call.has_flag_const(working_set, "no-headers")?;
134 let range: Option<Range> = call.get_flag_const(working_set, "combine-columns")?;
135 let config = working_set.get_config().clone();
136
137 let args = Arguments {
138 noheader,
139 num_rows_to_skip,
140 range,
141 config,
142 };
143
144 if call.has_flag_const(working_set, "guess")? {
145 guess_width(working_set.permanent(), call, input, args)
146 } else {
147 detect_columns(working_set.permanent(), call, input, args)
148 }
149 }
150}
151
152struct Arguments {
153 num_rows_to_skip: Option<usize>,
154 noheader: bool,
155 range: Option<Range>,
156 config: Arc<Config>,
157}
158
159fn guess_width(
160 engine_state: &EngineState,
161 call: &Call,
162 input: PipelineData,
163 args: Arguments,
164) -> Result<PipelineData, ShellError> {
165 use super::guess_width::GuessWidth;
166 let input_span = input.span().unwrap_or(call.head);
167
168 let mut input = input.collect_string("", &args.config)?;
169 if let Some(rows) = args.num_rows_to_skip {
170 input = input.lines().skip(rows).map(|x| x.to_string()).join("\n");
171 }
172
173 let mut guess_width = GuessWidth::new_reader(Box::new(Cursor::new(input)));
174
175 let result = guess_width.read_all();
176
177 if result.is_empty() {
178 return Ok(Value::nothing(input_span).into_pipeline_data());
179 }
180 if !args.noheader {
181 let columns = result[0].clone();
182 Ok(result
183 .into_iter()
184 .skip(1)
185 .map(move |s| {
186 let mut values: Vec<Value> = s
187 .into_iter()
188 .map(|v| Value::string(v, input_span))
189 .collect();
190 for _ in values.len()..columns.len() {
192 values.push(Value::string("", input_span));
193 }
194 let record =
195 Record::from_raw_cols_vals(columns.clone(), values, input_span, input_span);
196 match record {
197 Ok(r) => match &args.range {
198 Some(range) => merge_record(r, range, input_span),
199 None => Value::record(r, input_span),
200 },
201 Err(e) => Value::error(e, input_span),
202 }
203 })
204 .into_pipeline_data(input_span, engine_state.signals().clone()))
205 } else {
206 let length = result[0].len();
207 let columns: Vec<String> = (0..length).map(|n| format!("column{n}")).collect();
208 Ok(result
209 .into_iter()
210 .map(move |s| {
211 let mut values: Vec<Value> = s
212 .into_iter()
213 .map(|v| Value::string(v, input_span))
214 .collect();
215 for _ in values.len()..columns.len() {
217 values.push(Value::string("", input_span));
218 }
219 let record =
220 Record::from_raw_cols_vals(columns.clone(), values, input_span, input_span);
221 match record {
222 Ok(r) => match &args.range {
223 Some(range) => merge_record(r, range, input_span),
224 None => Value::record(r, input_span),
225 },
226 Err(e) => Value::error(e, input_span),
227 }
228 })
229 .into_pipeline_data(input_span, engine_state.signals().clone()))
230 }
231}
232
233fn detect_columns(
234 engine_state: &EngineState,
235 call: &Call,
236 input: PipelineData,
237 args: Arguments,
238) -> Result<PipelineData, ShellError> {
239 let name_span = call.head;
240 let input_span = input.span().unwrap_or(Span::unknown());
241 let input = input.collect_string("", &args.config)?;
242
243 let input: Vec<_> = input
244 .lines()
245 .skip(args.num_rows_to_skip.unwrap_or_default())
246 .map(|x| x.to_string())
247 .collect();
248
249 let mut input = input.into_iter();
250 let headers = input.next();
251
252 if let Some(orig_headers) = headers {
253 let mut headers = find_columns(&orig_headers);
254
255 if args.noheader {
256 for header in headers.iter_mut().enumerate() {
257 header.1.item = format!("column{}", header.0);
258 }
259 }
260
261 Ok(args
262 .noheader
263 .then_some(orig_headers)
264 .into_iter()
265 .chain(input)
266 .map(move |x| {
267 let row = find_columns(&x);
268
269 let mut record = Record::new();
270
271 if headers.len() == row.len() {
272 for (header, val) in headers.iter().zip(row.iter()) {
273 record.push(&header.item, Value::string(&val.item, name_span));
274 }
275 } else {
276 let mut pre_output = vec![];
277
278 for cell in row {
280 for header in &headers {
281 if cell.span.start <= header.span.end
282 && cell.span.end > header.span.start
283 {
284 pre_output.push((
285 header.item.to_string(),
286 Value::string(&cell.item, name_span),
287 ));
288 }
289 }
290 }
291
292 for header in &headers {
293 let mut found = false;
294 for pre_o in &pre_output {
295 if pre_o.0 == header.item {
296 found = true;
297 break;
298 }
299 }
300
301 if !found {
302 pre_output.push((header.item.to_string(), Value::nothing(name_span)));
303 }
304 }
305
306 for header in &headers {
307 for pre_o in &pre_output {
308 if pre_o.0 == header.item {
309 record.push(&header.item, pre_o.1.clone());
310 }
311 }
312 }
313 }
314
315 let has_column_duplicates = record.columns().duplicates().count() > 0;
316 if has_column_duplicates {
317 return Err(ShellError::ColumnDetectionFailure {
318 bad_value: input_span,
319 failure_site: name_span,
320 });
321 }
322
323 Ok(match &args.range {
324 Some(range) => merge_record(record, range, name_span),
325 None => Value::record(record, name_span),
326 })
327 })
328 .collect::<Result<Vec<_>, _>>()?
329 .into_pipeline_data(call.head, engine_state.signals().clone()))
330 } else {
331 Ok(PipelineData::empty())
332 }
333}
334
335pub fn find_columns(input: &str) -> Vec<Spanned<String>> {
336 let mut chars = input.char_indices().peekable();
337 let mut output = vec![];
338
339 while let Some((_, c)) = chars.peek() {
340 if c.is_whitespace() {
341 let _ = chars.next();
344 } else {
345 let result = baseline(&mut chars);
348
349 output.push(result);
350 }
351 }
352
353 output
354}
355
/// Kind of bracket pair the tokenizer is currently inside of.
#[derive(Copy, Clone)]
enum BlockKind {
    /// Opened by `(`, closed by `)`.
    Parenthesis,
    /// Opened by `{`, closed by `}`.
    Brace,
    /// Opened by `[`, closed by `]`.
    Bracket,
}
362
363fn baseline(src: &mut Input) -> Spanned<String> {
364 let mut token_contents = String::new();
365
366 let start_offset = if let Some((pos, _)) = src.peek() {
367 *pos
368 } else {
369 0
370 };
371
372 let mut quote_start: Option<char> = None;
376
377 let mut block_level: Vec<BlockKind> = vec![];
379
380 fn is_termination(block_level: &[BlockKind], c: char) -> bool {
384 block_level.is_empty() && (c.is_whitespace())
385 }
386
387 while let Some((_, c)) = src.peek() {
399 let c = *c;
400
401 if quote_start.is_some() {
402 if Some(c) == quote_start {
405 quote_start = None;
406 }
407 } else if c == '\n' {
408 if is_termination(&block_level, c) {
409 break;
410 }
411 } else if c == '\'' || c == '"' || c == '`' {
412 quote_start = Some(c);
414 } else if c == '[' {
415 block_level.push(BlockKind::Bracket);
417 } else if c == ']' {
418 if let Some(BlockKind::Bracket) = block_level.last() {
421 let _ = block_level.pop();
422 }
423 } else if c == '{' {
424 block_level.push(BlockKind::Brace);
426 } else if c == '}' {
427 if let Some(BlockKind::Brace) = block_level.last() {
429 let _ = block_level.pop();
430 }
431 } else if c == '(' {
432 block_level.push(BlockKind::Parenthesis);
434 } else if c == ')' {
435 if let Some(BlockKind::Parenthesis) = block_level.last() {
437 let _ = block_level.pop();
438 }
439 } else if is_termination(&block_level, c) {
440 break;
441 }
442
443 token_contents.push(c);
445
446 let _ = src.next();
448 }
449
450 let span = Span::new(start_offset, start_offset + token_contents.len());
451
452 if block_level.last().is_some() {
455 return Spanned {
463 item: token_contents,
464 span,
465 };
466 }
467
468 if quote_start.is_some() {
469 return Spanned {
479 item: token_contents,
480 span,
481 };
482 }
483
484 Spanned {
485 item: token_contents,
486 span,
487 }
488}
489
490fn merge_record(record: Record, range: &Range, input_span: Span) -> Value {
491 let (start_index, end_index) = match process_range(range, record.len(), input_span) {
492 Ok(Some((l_idx, r_idx))) => (l_idx, r_idx),
493 Ok(None) => return Value::record(record, input_span),
494 Err(e) => return Value::error(e, input_span),
495 };
496
497 match merge_record_impl(record, start_index, end_index, input_span) {
498 Ok(rec) => Value::record(rec, input_span),
499 Err(err) => Value::error(err, input_span),
500 }
501}
502
503fn process_range(
504 range: &Range,
505 length: usize,
506 input_span: Span,
507) -> Result<Option<(usize, usize)>, ShellError> {
508 match nu_cmd_base::util::process_range(range) {
509 Ok((l_idx, r_idx)) => {
510 let l_idx = if l_idx < 0 {
511 length as isize + l_idx
512 } else {
513 l_idx
514 };
515
516 let r_idx = if r_idx < 0 {
517 length as isize + r_idx
518 } else {
519 r_idx
520 };
521
522 if !(l_idx <= r_idx && (r_idx >= 0 || l_idx < (length as isize))) {
523 return Ok(None);
524 }
525
526 Ok(Some((
527 l_idx.max(0) as usize,
528 (r_idx as usize + 1).min(length),
529 )))
530 }
531 Err(processing_error) => Err(processing_error("could not find range index", input_span)),
532 }
533}
534
535fn merge_record_impl(
536 record: Record,
537 start_index: usize,
538 end_index: usize,
539 input_span: Span,
540) -> Result<Record, ShellError> {
541 let (mut cols, mut vals): (Vec<_>, Vec<_>) = record.into_iter().unzip();
542 ((start_index + 1)..(cols.len() - end_index + start_index + 1)).for_each(|idx| {
544 cols.swap(idx, end_index - start_index - 1 + idx);
545 });
546 cols.truncate(cols.len() - end_index + start_index + 1);
547
548 let combined = vals
550 .iter()
551 .take(end_index)
552 .skip(start_index)
553 .map(|v| v.coerce_str().unwrap_or_default())
554 .join(" ");
555 let binding = Value::string(combined, Span::unknown());
556 let last_seg = vals.split_off(end_index);
557 vals.truncate(start_index);
558 vals.push(binding);
559 vals.extend(last_seg);
560
561 Record::from_raw_cols_vals(cols, vals, Span::unknown(), input_span)
562}
563
#[cfg(test)]
mod test {
    use super::DetectColumns;

    /// Hands the command to the crate-level `test_examples` helper.
    #[test]
    fn test_examples() {
        crate::test_examples(DetectColumns)
    }
}