1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
//  * This file is part of the uutils coreutils package.
//  *
//  * (c) Virgile Andreani <virgile.andreani@anbuco.fr>
//  * (c) kwantam <kwantam@gmail.com>
//  *     * 2015-04-28 ~ updated to work with both UTF-8 and non-UTF-8 encodings
//  *
//  * For the full copyright and license information, please view the LICENSE
//  * file that was distributed with this source code.

// spell-checker:ignore (ToDO) nums aflag uflag scol prevtab amode ctype cwidth nbytes lastcol pctype

#[macro_use]
extern crate uucore;
use clap::{App, Arg};
use std::fs::File;
use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Read, Stdout, Write};
use std::str::from_utf8;
use unicode_width::UnicodeWidthChar;

// Program name registered with the clap `App` in `uumain`.
static NAME: &str = "unexpand";
// Version string, taken from the crate's Cargo metadata at compile time.
static VERSION: &str = env!("CARGO_PKG_VERSION");
// Usage line passed to the clap `App` in `uumain`.
static USAGE: &str = "unexpand [OPTION]... [FILE]...";
// Description passed to the clap `App` as its "about" text.
static SUMMARY: &str = "Convert blanks in each FILE to tabs, writing to standard output.\n
                 With no FILE, or when FILE is -, read standard input.";

// Tab stop interval used when no `--tabs` value is supplied.
const DEFAULT_TABSTOP: usize = 8;

/// Parses a comma-separated list of tab stops such as `"4"` or `"2,6,10"`.
///
/// Terminates the process (via `crash!`, exit code 1) when an entry is not an
/// unsigned integer, when any entry is zero, or when an entry is smaller than
/// the one before it. Equal neighbouring entries are accepted.
fn tabstops_parse(s: String) -> Vec<usize> {
    let mut stops: Vec<usize> = Vec::new();
    for field in s.split(',') {
        match field.parse::<usize>() {
            Ok(stop) => stops.push(stop),
            Err(_) => crash!(1, "{}\n", "tab size contains invalid character(s)"),
        }
    }

    if stops.contains(&0) {
        crash!(1, "{}\n", "tab size cannot be 0");
    }

    // Only a decrease between neighbours is rejected; repeats pass.
    if stops.windows(2).any(|pair| pair[0] > pair[1]) {
        crash!(1, "{}\n", "tab sizes must be ascending");
    }

    stops
}

/// Names under which the command-line arguments are registered with clap
/// and later looked up in the parsed matches.
mod options {
    // Hidden positional argument holding the input file operand(s).
    pub const FILE: &str = "file";
    // `-a` / `--all`
    pub const ALL: &str = "all";
    // `--first-only`
    pub const FIRST_ONLY: &str = "first-only";
    // `-t` / `--tabs`
    pub const TABS: &str = "tabs";
    // `-U` / `--no-utf8`
    pub const NO_UTF8: &str = "no-utf8";
}

/// Runtime configuration derived from the command line.
struct Options {
    // Inputs to process, in order; "-" is opened as standard input.
    files: Vec<String>,
    // Tab stops: a single value is a repeating interval, several values are
    // explicit column positions (see `next_tabstop`).
    tabstops: Vec<usize>,
    // Convert blanks anywhere on the line, not only the leading run.
    aflag: bool,
    // Treat the input as UTF-8 (false when `--no-utf8` is given).
    uflag: bool,
}

impl Options {
    /// Builds the runtime configuration from the parsed command line.
    ///
    /// * Tab stops come from `--tabs`, falling back to `DEFAULT_TABSTOP`.
    /// * Conversion of all blanks is enabled by `--all` or `--tabs`, unless
    ///   `--first-only` is present, which always wins.
    /// * UTF-8 handling is on unless `--no-utf8` is given.
    /// * Every FILE operand is kept, in command-line order; with no operand
    ///   the single input is "-" (standard input).
    fn new(matches: clap::ArgMatches) -> Options {
        let tabstops = match matches.value_of(options::TABS) {
            None => vec![DEFAULT_TABSTOP],
            Some(s) => tabstops_parse(s.to_string()),
        };

        let aflag = (matches.is_present(options::ALL) || matches.is_present(options::TABS))
            && !matches.is_present(options::FIRST_ONLY);
        let uflag = !matches.is_present(options::NO_UTF8);

        // The FILE argument accepts multiple values, so collect all of them;
        // `value_of` would return only the first and drop the rest.
        let files = match matches.values_of(options::FILE) {
            Some(values) => values.map(|v| v.to_owned()).collect(),
            None => vec!["-".to_owned()],
        };

        Options {
            files,
            tabstops,
            aflag,
            uflag,
        }
    }
}

/// Entry point of the utility: parses the command line, runs the conversion
/// and returns the process exit code (always 0 when this function returns).
pub fn uumain(args: impl uucore::Args) -> i32 {
    let argv = args.collect_str();

    // Positional operands; hidden from the generated help.
    let file_arg = Arg::with_name(options::FILE).hidden(true).multiple(true);

    let all_arg = Arg::with_name(options::ALL)
        .short("a")
        .long(options::ALL)
        .help("convert all blanks, instead of just initial blanks")
        .takes_value(false);

    let first_only_arg = Arg::with_name(options::FIRST_ONLY)
        .long(options::FIRST_ONLY)
        .help("convert only leading sequences of blanks (overrides -a)")
        .takes_value(false);

    let tabs_arg = Arg::with_name(options::TABS)
        .short("t")
        .long(options::TABS)
        .long_help("use comma separated LIST of tab positions or have tabs N characters apart instead of 8 (enables -a)")
        .takes_value(true);

    let no_utf8_arg = Arg::with_name(options::NO_UTF8)
        .short("U")
        .long(options::NO_UTF8)
        .takes_value(false)
        .help("interpret input file as 8-bit ASCII rather than UTF-8");

    // Arguments are registered in the same order as they are declared above.
    let app = App::new(executable!())
        .name(NAME)
        .version(VERSION)
        .usage(USAGE)
        .about(SUMMARY)
        .arg(file_arg)
        .arg(all_arg)
        .arg(first_only_arg)
        .arg(tabs_arg)
        .arg(no_utf8_arg);

    let parsed = app.get_matches_from(argv);
    unexpand(Options::new(parsed));

    0
}

/// Opens `path` for buffered reading; the path "-" selects standard input.
///
/// If the file cannot be opened, the error is reported through `crash!`
/// (exit code 1) and this function does not return.
fn open(path: String) -> BufReader<Box<dyn Read + 'static>> {
    let source: Box<dyn Read> = if path == "-" {
        Box::new(stdin())
    } else {
        match File::open(&path[..]) {
            Ok(handle) => Box::new(handle),
            Err(e) => crash!(1, "{}: {}", &path[..], e),
        }
    };

    BufReader::new(source)
}

/// Returns the distance in columns from `col` to the next tab stop.
///
/// A single entry in `tabstops` is a repeating interval. With any other
/// number of entries they are explicit positions, and `None` is returned
/// when no position lies beyond `col` (callers then treat a tab as one
/// column wide).
fn next_tabstop(tabstops: &[usize], col: usize) -> Option<usize> {
    match tabstops {
        [interval] => Some(*interval - col % *interval),
        positions => positions
            .iter()
            .find(|&&stop| stop > col)
            .map(|&stop| stop - col),
    }
}

/// Writes the pending run of blanks that spans columns `scol..col`.
///
/// As many tab characters as fit are written first, provided conversion is
/// allowed for this run; whatever is left up to `col` is padded with spaces.
///
/// * `scol`    - column up to which output has already been written
/// * `col`     - column the output must reach
/// * `prevtab` - whether the last character of the run was a tab
/// * `init`    - whether the run sits at the start of the line
/// * `amode`   - whether blanks after the start of the line may be converted
fn write_tabs(
    output: &mut BufWriter<Stdout>,
    tabstops: &[usize],
    mut scol: usize,
    col: usize,
    prevtab: bool,
    init: bool,
    amode: bool,
) {
    let convertible = init || amode;

    // A single space in front of a non-blank is never turned into a tab,
    // except at the start of the line.
    let wide_run = convertible && !prevtab && col > scol + 1;
    let leading_or_tab_run = col > scol && (init || (convertible && prevtab));

    if wide_run || leading_or_tab_run {
        loop {
            // Stop as soon as the next stop would overshoot the target column
            // or there is no further stop.
            let step = match next_tabstop(tabstops, scol) {
                Some(step) if col >= scol + step => step,
                _ => break,
            };

            safe_unwrap!(output.write_all(b"\t"));
            scol += step;
        }
    }

    // Pad the remainder with plain spaces.
    for _ in scol..col {
        safe_unwrap!(output.write_all(b" "));
    }
}

/// Classification of a character for the purpose of column tracking.
#[derive(PartialEq, Eq, Debug)]
enum CharType {
    // Backspace (0x08); moves the column back by one where possible.
    Backspace,
    // Space (0x20).
    Space,
    // Horizontal tab (0x09).
    Tab,
    // Any other character or byte.
    Other,
}

/// Classifies the character that starts at `buf[byte]`.
///
/// Returns `(type, display width in columns, length in bytes)`.
///
/// * With `uflag` unset every byte is one character of width 1.
/// * With `uflag` set the input is decoded as UTF-8. The sequence length is
///   derived from the lead byte, and the width of a non-blank character is its
///   Unicode display width (0 when it has none). A byte that does not start a
///   valid, complete sequence is reported as a single `Other` byte of width 1.
///
/// The width reported for space, tab and backspace in UTF-8 mode is 0; the
/// caller computes the column change for those types itself.
///
/// Panics if `byte` is not a valid index into `buf`.
fn next_char_info(uflag: bool, buf: &[u8], byte: usize) -> (CharType, usize, usize) {
    let lead = buf[byte];

    if !uflag {
        // always take exactly 1 byte in strict ASCII mode
        let ctype = match lead {
            0x20 => CharType::Space,
            0x09 => CharType::Tab,
            0x08 => CharType::Backspace,
            _ => CharType::Other,
        };
        return (ctype, 1, 1);
    }

    // Length of the UTF-8 sequence announced by the lead byte. Converting the
    // byte to a `char` and asking for `len_utf8()` would be wrong here: that
    // yields at most 2 and so never recognises 3- or 4-byte sequences.
    let nbytes = match lead {
        0x00..=0x7F => 1,
        0xC2..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xF4 => 4,
        // continuation byte or a byte that can never start a sequence
        _ => 1,
    };

    // `get` guards against a sequence truncated by the end of the buffer;
    // `from_utf8` rejects anything that is not well-formed.
    let decoded = buf
        .get(byte..byte + nbytes)
        .and_then(|seq| from_utf8(seq).ok())
        .and_then(|text| text.chars().next());

    match decoded {
        Some(' ') => (CharType::Space, 0, 1),
        Some('\t') => (CharType::Tab, 0, 1),
        Some('\x08') => (CharType::Backspace, 0, 1),
        Some(c) => (
            CharType::Other,
            UnicodeWidthChar::width(c).unwrap_or(0),
            nbytes,
        ),
        // implicit assumption: a non-UTF-8 byte has display width 1
        None => (CharType::Other, 1, 1),
    }
}

/// Converts blanks to tabs for every input in `options.files`, writing the
/// result to standard output.
///
/// Input is read one line at a time (up to and including `\n`) into a buffer
/// that is reused across lines. Within a line, runs of spaces and tabs are
/// held back while conversion is allowed (at the start of the line, or
/// anywhere when `options.aflag` is set) and emitted through `write_tabs`
/// when the next non-blank character is reached or the line ends. When
/// conversion is not allowed, blanks are copied through unchanged.
///
/// With more than one tab stop configured, nothing past the last stop is
/// converted: the rest of the line is copied verbatim.
fn unexpand(options: Options) {
    let mut output = BufWriter::new(stdout());
    let ts = &options.tabstops[..];
    let mut buf = Vec::new();
    // Last explicit tab stop, or 0 when a single repeating interval is used.
    let lastcol = if ts.len() > 1 { *ts.last().unwrap() } else { 0 };

    for file in options.files.into_iter() {
        let mut fh = open(file);

        // Keep going while a read produced data; on a read error, still
        // process whatever was already placed in the buffer.
        while match fh.read_until(b'\n', &mut buf) {
            Ok(s) => s > 0,
            Err(_) => !buf.is_empty(),
        } {
            let mut byte = 0; // offset into the buffer
            let mut col = 0; // the current column
            let mut scol = 0; // the start col for the current span, i.e., the already-printed width
            let mut init = true; // are we at the start of the line?
            let mut pctype = CharType::Other;

            while byte < buf.len() {
                // when we have a finite number of columns, never convert past the last column
                if lastcol > 0 && col >= lastcol {
                    write_tabs(
                        &mut output,
                        ts,
                        scol,
                        col,
                        pctype == CharType::Tab,
                        init,
                        true,
                    );
                    // copy the remainder of the line unchanged
                    safe_unwrap!(output.write_all(&buf[byte..]));
                    scol = col;
                    break;
                }

                // figure out how big the next char is, if it's UTF-8
                let (ctype, cwidth, nbytes) = next_char_info(options.uflag, &buf, byte);

                // now figure out how many columns this char takes up, and maybe print it
                let tabs_buffered = init || options.aflag;
                match ctype {
                    CharType::Space | CharType::Tab => {
                        // compute next col, but only write space or tab chars if not buffering
                        col += if ctype == CharType::Space {
                            1
                        } else {
                            // a tab with no further stop counts as one column
                            next_tabstop(ts, col).unwrap_or(1)
                        };

                        if !tabs_buffered {
                            safe_unwrap!(output.write_all(&buf[byte..byte + nbytes]));
                            scol = col; // now printed up to this column
                        }
                    }
                    CharType::Other | CharType::Backspace => {
                        // always
                        write_tabs(
                            &mut output,
                            ts,
                            scol,
                            col,
                            pctype == CharType::Tab,
                            init,
                            options.aflag,
                        );
                        init = false; // no longer at the start of a line
                        col = if ctype == CharType::Other {
                            // use computed width
                            col + cwidth
                        } else if col > 0 {
                            // Backspace case, but only if col > 0
                            col - 1
                        } else {
                            0
                        };
                        safe_unwrap!(output.write_all(&buf[byte..byte + nbytes]));
                        scol = col; // we've now printed up to this column
                    }
                }

                byte += nbytes; // move on to next char
                pctype = ctype; // save the previous type
            }

            // write out anything remaining
            write_tabs(
                &mut output,
                ts,
                scol,
                col,
                pctype == CharType::Tab,
                init,
                true,
            );
            // output is flushed after every line
            safe_unwrap!(output.flush());
            buf.truncate(0); // clear out the buffer
        }
    }
    // NOTE(review): presumably reports the error and exits with status 1 if
    // the final flush fails; confirm against uucore's `crash_if_err!`.
    crash_if_err!(1, output.flush())
}