uu_expand/
expand.rs

1// This file is part of the uutils coreutils package.
2//
3// For the full copyright and license information, please view the LICENSE
4// file that was distributed with this source code.
5
6// spell-checker:ignore (ToDO) ctype cwidth iflag nbytes nspaces nums tspaces uflag Preprocess
7
8use clap::{Arg, ArgAction, ArgMatches, Command};
9use std::ffi::OsString;
10use std::fs::File;
11use std::io::{BufRead, BufReader, BufWriter, Read, Write, stdin, stdout};
12use std::num::IntErrorKind;
13use std::path::Path;
14use std::str::from_utf8;
15use thiserror::Error;
16use unicode_width::UnicodeWidthChar;
17use uucore::display::Quotable;
18use uucore::error::{FromIo, UError, UResult, set_exit_code};
19use uucore::translate;
20use uucore::{format_usage, show_error};
21
/// Identifiers of the command-line arguments, shared between the clap
/// definition in `uu_app` and the lookups in `Options::new`.
pub mod options {
    /// Tab stop list (`-t` / `--tabs`); may be given more than once.
    pub static TABS: &str = "tabs";
    /// Flag `-i` / `--initial`.
    pub static INITIAL: &str = "initial";
    /// Flag `-U` / `--no-utf8`.
    pub static NO_UTF8: &str = "no-utf8";
    /// Positional input file arguments.
    pub static FILES: &str = "FILES";
}
28
/// Text passed to clap's `after_help`; currently empty.
static LONG_HELP: &str = "";

/// Tab stop used when `--tabs` is absent or yields no numbers.
static DEFAULT_TABSTOP: usize = 8;
32
/// The mode to use when replacing tabs beyond the last one specified in
/// the `--tabs` argument.
#[derive(PartialEq)]
enum RemainingMode {
    /// No specifier is in effect. With a single tab stop the stop repeats
    /// at every multiple of its value; with several, a tab past the last
    /// stop advances by one column.
    None,
    /// A `/` specifier was given: past the explicit stops, tabs advance to
    /// the next multiple of the last value in the list.
    Slash,
    /// A `+` specifier was given: past the explicit stops, the last value
    /// in the list is a step size counted from the second-to-last stop.
    Plus,
}
41
/// Report whether `c` is one of the two separators accepted between
/// entries of a tab stop list: a space or a comma.
///
/// # Examples
///
/// ```rust,ignore
/// assert!(is_space_or_comma(' '))
/// assert!(is_space_or_comma(','))
/// assert!(!is_space_or_comma('a'))
/// ```
fn is_space_or_comma(c: char) -> bool {
    matches!(c, ' ' | ',')
}
54
/// Report whether `c` is an ASCII digit (`0`-`9`) or a comma, the only
/// characters allowed in a `-N[,M...]` tab stop shortcut.
fn is_digit_or_comma(c: char) -> bool {
    matches!(c, '0'..='9' | ',')
}
59
/// Errors that can occur when parsing a `--tabs` argument.
///
/// Each variant's message is looked up through `translate!`; the payload
/// strings are quoted before being inserted into the message.
#[derive(Debug, Error)]
enum ParseError {
    /// An entry contains text that is not a number; the payload is the
    /// part of the entry left after its leading digits.
    #[error("{}", translate!("expand-error-invalid-character", "char" => .0.quote()))]
    InvalidCharacter(String),
    /// A `/` or `+` appeared after the digits of an entry. Payload: the
    /// specifier, then the text starting at that specifier.
    #[error("{}", translate!("expand-error-specifier-not-at-start", "specifier" => .0.quote(), "number" => .1.quote()))]
    SpecifierNotAtStartOfNumber(String, String),
    /// A number followed one that already carried a `/` or `+` specifier.
    /// Payload: the specifier in effect.
    #[error("{}", translate!("expand-error-specifier-only-allowed-with-last", "specifier" => .0.quote()))]
    SpecifierOnlyAllowedWithLastValue(String),
    /// An entry parsed to `0`.
    #[error("{}", translate!("expand-error-tab-size-cannot-be-zero"))]
    TabSizeCannotBeZero,
    /// An entry does not fit in `usize`. Payload: the entry's text.
    #[error("{}", translate!("expand-error-tab-size-too-large", "size" => .0.quote()))]
    TabSizeTooLarge(String),
    /// An entry is not strictly greater than the one before it.
    #[error("{}", translate!("expand-error-tab-sizes-must-be-ascending"))]
    TabSizesMustBeAscending,
}

// No methods are overridden, so the trait's defaults apply.
impl UError for ParseError {}
78
79/// Parse a list of tabstops from a `--tabs` argument.
80///
81/// This function returns both the vector of numbers appearing in the
82/// comma- or space-separated list, and also an optional mode, specified
83/// by either a "/" or a "+" character appearing before the final number
84/// in the list. This mode defines the strategy to use for computing the
85/// number of spaces to use for columns beyond the end of the tab stop
86/// list specified here.
87fn tabstops_parse(s: &str) -> Result<(RemainingMode, Vec<usize>), ParseError> {
88    // Leading commas and spaces are ignored.
89    let s = s.trim_start_matches(is_space_or_comma);
90
91    // If there were only commas and spaces in the string, just use the
92    // default tabstops.
93    if s.is_empty() {
94        return Ok((RemainingMode::None, vec![DEFAULT_TABSTOP]));
95    }
96
97    let mut nums = vec![];
98    let mut remaining_mode = RemainingMode::None;
99    let mut is_specifier_already_used = false;
100    for word in s.split(is_space_or_comma) {
101        let bytes = word.as_bytes();
102        for i in 0..bytes.len() {
103            match bytes[i] {
104                b'+' => remaining_mode = RemainingMode::Plus,
105                b'/' => remaining_mode = RemainingMode::Slash,
106                _ => {
107                    // Parse a number from the byte sequence.
108                    let s = from_utf8(&bytes[i..]).unwrap();
109                    match s.parse::<usize>() {
110                        Ok(num) => {
111                            // Tab size must be positive.
112                            if num == 0 {
113                                return Err(ParseError::TabSizeCannotBeZero);
114                            }
115
116                            // Tab sizes must be ascending.
117                            if let Some(last_stop) = nums.last() {
118                                if *last_stop >= num {
119                                    return Err(ParseError::TabSizesMustBeAscending);
120                                }
121                            }
122
123                            if is_specifier_already_used {
124                                let specifier = if remaining_mode == RemainingMode::Slash {
125                                    "/".to_string()
126                                } else {
127                                    "+".to_string()
128                                };
129                                return Err(ParseError::SpecifierOnlyAllowedWithLastValue(
130                                    specifier,
131                                ));
132                            } else if remaining_mode != RemainingMode::None {
133                                is_specifier_already_used = true;
134                            }
135
136                            // Append this tab stop to the list of all tabstops.
137                            nums.push(num);
138                            break;
139                        }
140                        Err(e) => {
141                            if *e.kind() == IntErrorKind::PosOverflow {
142                                return Err(ParseError::TabSizeTooLarge(s.to_string()));
143                            }
144
145                            let s = s.trim_start_matches(char::is_numeric);
146                            return if s.starts_with('/') || s.starts_with('+') {
147                                Err(ParseError::SpecifierNotAtStartOfNumber(
148                                    s[0..1].to_string(),
149                                    s.to_string(),
150                                ))
151                            } else {
152                                Err(ParseError::InvalidCharacter(s.to_string()))
153                            };
154                        }
155                    }
156                }
157            }
158        }
159    }
160    // If no numbers could be parsed (for example, if `s` were "+,+,+"),
161    // then just use the default tabstops.
162    if nums.is_empty() {
163        nums = vec![DEFAULT_TABSTOP];
164    }
165
166    if nums.len() < 2 {
167        remaining_mode = RemainingMode::None;
168    }
169    Ok((remaining_mode, nums))
170}
171
/// Settings for one run, built from the parsed command line by
/// `Options::new`.
struct Options {
    /// Inputs to process, in order; `-` is opened as standard input.
    files: Vec<OsString>,
    /// Tab stop values from `--tabs`, or the default when none were given.
    tabstops: Vec<usize>,
    /// Precomputed run of spaces, as long as the widest gap between
    /// consecutive tab stops; sliced when a tab is replaced.
    tspaces: String,
    /// Set by `-i` / `--initial`: once a character other than a space has
    /// been written on a line, later tabs are copied through unchanged.
    iflag: bool,
    /// `false` when `-U` / `--no-utf8` was given. When `true`, input is
    /// decoded as UTF-8 to measure character widths.
    uflag: bool,

    /// Strategy for expanding tabs for columns beyond those specified
    /// in `tabstops`.
    remaining_mode: RemainingMode,
}
183
184impl Options {
185    fn new(matches: &ArgMatches) -> Result<Self, ParseError> {
186        let (remaining_mode, tabstops) = match matches.get_many::<String>(options::TABS) {
187            Some(s) => tabstops_parse(&s.map(|s| s.as_str()).collect::<Vec<_>>().join(","))?,
188            None => (RemainingMode::None, vec![DEFAULT_TABSTOP]),
189        };
190
191        let iflag = matches.get_flag(options::INITIAL);
192        let uflag = !matches.get_flag(options::NO_UTF8);
193
194        // avoid allocations when dumping out long sequences of spaces
195        // by precomputing the longest string of spaces we will ever need
196        let nspaces = tabstops
197            .iter()
198            .scan(0, |pr, &it| {
199                let ret = Some(it - *pr);
200                *pr = it;
201                ret
202            })
203            .max()
204            .unwrap(); // length of tabstops is guaranteed >= 1
205        let tspaces = " ".repeat(nspaces);
206
207        let files: Vec<OsString> = match matches.get_many::<OsString>(options::FILES) {
208            Some(s) => s.cloned().collect(),
209            None => vec![OsString::from("-")],
210        };
211
212        Ok(Self {
213            files,
214            tabstops,
215            tspaces,
216            iflag,
217            uflag,
218            remaining_mode,
219        })
220    }
221}
222
223/// Preprocess command line arguments and expand shortcuts. For example, "-7" is expanded to
224/// "--tabs=7" and "-1,3" to "--tabs=1 --tabs=3".
225fn expand_shortcuts(args: Vec<OsString>) -> Vec<OsString> {
226    let mut processed_args = Vec::with_capacity(args.len());
227
228    for arg in args {
229        if let Some(arg) = arg.to_str() {
230            if arg.starts_with('-') && arg[1..].chars().all(is_digit_or_comma) {
231                arg[1..]
232                    .split(',')
233                    .filter(|s| !s.is_empty())
234                    .for_each(|s| processed_args.push(OsString::from(format!("--tabs={s}"))));
235                continue;
236            }
237        }
238        processed_args.push(arg);
239    }
240
241    processed_args
242}
243
244#[uucore::main]
245pub fn uumain(args: impl uucore::Args) -> UResult<()> {
246    let matches =
247        uucore::clap_localization::handle_clap_result(uu_app(), expand_shortcuts(args.collect()))?;
248
249    expand(&Options::new(&matches)?)
250}
251
252pub fn uu_app() -> Command {
253    uucore::clap_localization::configure_localized_command(
254        Command::new(uucore::util_name())
255            .version(uucore::crate_version!())
256            .about(translate!("expand-about"))
257            .after_help(LONG_HELP)
258            .override_usage(format_usage(&translate!("expand-usage"))),
259    )
260    .infer_long_args(true)
261    .args_override_self(true)
262    .arg(
263        Arg::new(options::INITIAL)
264            .long(options::INITIAL)
265            .short('i')
266            .help(translate!("expand-help-initial"))
267            .action(ArgAction::SetTrue),
268    )
269    .arg(
270        Arg::new(options::TABS)
271            .long(options::TABS)
272            .short('t')
273            .value_name("N, LIST")
274            .action(ArgAction::Append)
275            .help(translate!("expand-help-tabs")),
276    )
277    .arg(
278        Arg::new(options::NO_UTF8)
279            .long(options::NO_UTF8)
280            .short('U')
281            .help(translate!("expand-help-no-utf8"))
282            .action(ArgAction::SetTrue),
283    )
284    .arg(
285        Arg::new(options::FILES)
286            .action(ArgAction::Append)
287            .hide(true)
288            .value_hint(clap::ValueHint::FilePath)
289            .value_parser(clap::value_parser!(OsString)),
290    )
291}
292
293fn open(path: &OsString) -> UResult<BufReader<Box<dyn Read + 'static>>> {
294    let file_buf;
295    if path == "-" {
296        Ok(BufReader::new(Box::new(stdin()) as Box<dyn Read>))
297    } else {
298        let path_ref = Path::new(path);
299        file_buf = File::open(path_ref).map_err_context(|| path.to_string_lossy().to_string())?;
300        Ok(BufReader::new(Box::new(file_buf) as Box<dyn Read>))
301    }
302}
303
304/// Compute the number of spaces to the next tabstop.
305///
306/// `tabstops` is the sequence of tabstop locations.
307///
308/// `col` is the index of the current cursor in the line being written.
309///
310/// If `remaining_mode` is [`RemainingMode::Plus`], then the last entry
311/// in the `tabstops` slice is interpreted as a relative number of
312/// spaces, which this function will return for every input value of
313/// `col` beyond the end of the second-to-last element of `tabstops`.
314fn next_tabstop(tabstops: &[usize], col: usize, remaining_mode: &RemainingMode) -> usize {
315    let num_tabstops = tabstops.len();
316    match remaining_mode {
317        RemainingMode::Plus => match tabstops[0..num_tabstops - 1].iter().find(|&&t| t > col) {
318            Some(t) => t - col,
319            None => {
320                let step_size = tabstops[num_tabstops - 1];
321                let last_fixed_tabstop = tabstops[num_tabstops - 2];
322                let characters_since_last_tabstop = col - last_fixed_tabstop;
323
324                let steps_required = 1 + characters_since_last_tabstop / step_size;
325                steps_required * step_size - characters_since_last_tabstop
326            }
327        },
328        RemainingMode::Slash => match tabstops[0..num_tabstops - 1].iter().find(|&&t| t > col) {
329            Some(t) => t - col,
330            None => tabstops[num_tabstops - 1] - col % tabstops[num_tabstops - 1],
331        },
332        RemainingMode::None => {
333            if num_tabstops == 1 {
334                tabstops[0] - col % tabstops[0]
335            } else {
336                match tabstops.iter().find(|&&t| t > col) {
337                    Some(t) => t - col,
338                    None => 1,
339                }
340            }
341        }
342    }
343}
344
/// Classification of an input character, as used by `expand_line` to
/// decide how the character affects the current column.
#[derive(PartialEq, Eq, Debug)]
enum CharType {
    /// Backspace (`0x08`): moves the column back by one, never below zero.
    Backspace,
    /// Horizontal tab (`0x09`): advances the column to the next tab stop.
    Tab,
    /// Anything else: advances the column by the character's width.
    Other,
}
351
352#[allow(clippy::cognitive_complexity)]
353fn expand_line(
354    buf: &mut Vec<u8>,
355    output: &mut BufWriter<std::io::Stdout>,
356    tabstops: &[usize],
357    options: &Options,
358) -> std::io::Result<()> {
359    use self::CharType::{Backspace, Other, Tab};
360
361    let mut col = 0;
362    let mut byte = 0;
363    let mut init = true;
364
365    while byte < buf.len() {
366        let (ctype, cwidth, nbytes) = if options.uflag {
367            let nbytes = char::from(buf[byte]).len_utf8();
368
369            if byte + nbytes > buf.len() {
370                // don't overrun buffer because of invalid UTF-8
371                (Other, 1, 1)
372            } else if let Ok(t) = from_utf8(&buf[byte..byte + nbytes]) {
373                match t.chars().next() {
374                    Some('\t') => (Tab, 0, nbytes),
375                    Some('\x08') => (Backspace, 0, nbytes),
376                    Some(c) => (Other, UnicodeWidthChar::width(c).unwrap_or(0), nbytes),
377                    None => {
378                        // no valid char at start of t, so take 1 byte
379                        (Other, 1, 1)
380                    }
381                }
382            } else {
383                (Other, 1, 1) // implicit assumption: non-UTF-8 char is 1 col wide
384            }
385        } else {
386            (
387                match buf.get(byte) {
388                    // always take exactly 1 byte in strict ASCII mode
389                    Some(0x09) => Tab,
390                    Some(0x08) => Backspace,
391                    _ => Other,
392                },
393                1,
394                1,
395            )
396        };
397
398        // figure out how many columns this char takes up
399        match ctype {
400            Tab => {
401                // figure out how many spaces to the next tabstop
402                let nts = next_tabstop(tabstops, col, &options.remaining_mode);
403                col += nts;
404
405                // now dump out either spaces if we're expanding, or a literal tab if we're not
406                if init || !options.iflag {
407                    if nts <= options.tspaces.len() {
408                        output.write_all(&options.tspaces.as_bytes()[..nts])?;
409                    } else {
410                        output.write_all(" ".repeat(nts).as_bytes())?;
411                    }
412                } else {
413                    output.write_all(&buf[byte..byte + nbytes])?;
414                }
415            }
416            _ => {
417                col = if ctype == Other {
418                    col + cwidth
419                } else if col > 0 {
420                    col - 1
421                } else {
422                    0
423                };
424
425                // if we're writing anything other than a space, then we're
426                // done with the line's leading spaces
427                if buf[byte] != 0x20 {
428                    init = false;
429                }
430
431                output.write_all(&buf[byte..byte + nbytes])?;
432            }
433        }
434
435        byte += nbytes; // advance the pointer
436    }
437
438    output.flush()?;
439    buf.truncate(0); // clear the buffer
440
441    Ok(())
442}
443
444fn expand(options: &Options) -> UResult<()> {
445    let mut output = BufWriter::new(stdout());
446    let ts = options.tabstops.as_ref();
447    let mut buf = Vec::new();
448
449    for file in &options.files {
450        if Path::new(file).is_dir() {
451            show_error!(
452                "{}",
453                translate!("expand-error-is-directory", "file" => file.to_string_lossy())
454            );
455            set_exit_code(1);
456            continue;
457        }
458        match open(file) {
459            Ok(mut fh) => {
460                while match fh.read_until(b'\n', &mut buf) {
461                    Ok(s) => s > 0,
462                    Err(_) => buf.is_empty(),
463                } {
464                    expand_line(&mut buf, &mut output, ts, options)
465                        .map_err_context(|| translate!("expand-error-failed-to-write-output"))?;
466                }
467            }
468            Err(e) => {
469                show_error!("{e}");
470                set_exit_code(1);
471            }
472        }
473    }
474    Ok(())
475}
476
#[cfg(test)]
mod tests {
    use super::{RemainingMode, is_digit_or_comma, next_tabstop};

    /// Check `next_tabstop` with the stops `[1, 5]` for every
    /// `(column, expected distance)` pair in `cases`.
    fn check_distances(mode: &RemainingMode, cases: &[(usize, usize)]) {
        for &(col, expected) in cases {
            assert_eq!(next_tabstop(&[1, 5], col, mode), expected, "column {col}");
        }
    }

    #[test]
    fn test_next_tabstop_remaining_mode_none() {
        check_distances(&RemainingMode::None, &[(0, 1), (3, 2), (6, 1)]);
    }

    #[test]
    fn test_next_tabstop_remaining_mode_plus() {
        check_distances(&RemainingMode::Plus, &[(0, 1), (3, 3), (6, 5)]);
    }

    #[test]
    fn test_next_tabstop_remaining_mode_slash() {
        check_distances(&RemainingMode::Slash, &[(0, 1), (3, 2), (6, 4)]);
    }

    #[test]
    fn test_is_digit_or_comma() {
        for accepted in ['1', ','] {
            assert!(is_digit_or_comma(accepted));
        }
        assert!(!is_digit_or_comma('a'));
    }
}