Skip to main content

uu_paste/
paste.rs

1// This file is part of the uutils coreutils package.
2//
3// For the full copyright and license information, please view the LICENSE
4// file that was distributed with this source code.
5
6use clap::{Arg, ArgAction, Command};
7use std::cell::{OnceCell, RefCell};
8use std::ffi::OsString;
9use std::fs::File;
10use std::io::{BufRead, BufReader, Read, Stdin, Write, stdin, stdout};
11use std::iter::Cycle;
12use std::path::Path;
13use std::rc::Rc;
14use std::slice::Iter;
15use uucore::error::{UResult, USimpleError};
16use uucore::format_usage;
17use uucore::i18n::charmap::mb_char_len;
18use uucore::line_ending::LineEnding;
19use uucore::translate;
20
/// Identifiers of the command-line arguments understood by `paste`.
///
/// Each constant doubles as the clap argument id and, for the flags, as the long option name.
mod options {
    /// `-s` / `--serial`: paste one file at a time instead of in parallel.
    pub const SERIAL: &str = "serial";
    /// `-d` / `--delimiters`: list of delimiters to use instead of a tab.
    pub const DELIMITER: &str = "delimiters";
    /// `-z` / `--zero-terminated`: lines end with NUL rather than newline.
    pub const ZERO_TERMINATED: &str = "zero-terminated";
    /// Positional input file operands.
    pub const FILE: &str = "file";
}
27
28#[uucore::main]
29pub fn uumain(args: impl uucore::Args) -> UResult<()> {
30    let matches = uucore::clap_localization::handle_clap_result(uu_app(), args)?;
31
32    let serial = matches.get_flag(options::SERIAL);
33    let delimiters = matches.get_one::<OsString>(options::DELIMITER).unwrap();
34    let files = matches
35        .get_many::<OsString>(options::FILE)
36        .unwrap()
37        .cloned()
38        .collect();
39    let line_ending = LineEnding::from_zero_flag(matches.get_flag(options::ZERO_TERMINATED));
40
41    paste(files, serial, delimiters, line_ending)
42}
43
44pub fn uu_app() -> Command {
45    Command::new(uucore::util_name())
46        .version(uucore::crate_version!())
47        .help_template(uucore::localized_help_template(uucore::util_name()))
48        .about(translate!("paste-about"))
49        .override_usage(format_usage(&translate!("paste-usage")))
50        .infer_long_args(true)
51        .arg(
52            Arg::new(options::SERIAL)
53                .long(options::SERIAL)
54                .short('s')
55                .help(translate!("paste-help-serial"))
56                .action(ArgAction::SetTrue),
57        )
58        .arg(
59            Arg::new(options::DELIMITER)
60                .long(options::DELIMITER)
61                .short('d')
62                .help(translate!("paste-help-delimiter"))
63                .value_name("LIST")
64                .default_value("\t")
65                .hide_default_value(true)
66                .value_parser(clap::value_parser!(OsString)),
67        )
68        .arg(
69            Arg::new(options::FILE)
70                .value_name("FILE")
71                .action(ArgAction::Append)
72                .default_value("-")
73                .value_hint(clap::ValueHint::FilePath)
74                .value_parser(clap::value_parser!(OsString)),
75        )
76        .arg(
77            Arg::new(options::ZERO_TERMINATED)
78                .long(options::ZERO_TERMINATED)
79                .short('z')
80                .help(translate!("paste-help-zero-terminated"))
81                .action(ArgAction::SetTrue),
82        )
83}
84
85#[allow(clippy::cognitive_complexity)]
86fn paste(
87    filenames: Vec<OsString>,
88    serial: bool,
89    delimiters: &OsString,
90    line_ending: LineEnding,
91) -> UResult<()> {
92    let unescaped_and_encoded_delimiters = parse_delimiters(delimiters)?;
93
94    let stdin_once_cell = OnceCell::<Rc<RefCell<Stdin>>>::new();
95
96    let mut input_source_vec = Vec::with_capacity(filenames.len());
97
98    for filename in filenames {
99        let input_source = if filename == "-" {
100            InputSource::StandardInput(
101                stdin_once_cell
102                    .get_or_init(|| Rc::new(RefCell::new(stdin())))
103                    .clone(),
104            )
105        } else {
106            let path = Path::new(&filename);
107            let file = File::open(path)?;
108            InputSource::File(BufReader::new(file))
109        };
110
111        input_source_vec.push(input_source);
112    }
113
114    let line_ending_byte = u8::from(line_ending);
115    let input_source_vec_len = input_source_vec.len();
116    let mut stdout = stdout().lock();
117
118    if !serial && input_source_vec_len == 1 {
119        // With a single input source (no -s), `paste` output is identical to input,
120        // except that a missing final line ending must be added.
121        // Stream directly to avoid unbounded line buffering on inputs like /dev/zero.
122        return write_single_input_source(
123            &mut stdout,
124            input_source_vec
125                .pop()
126                .expect("input_source_vec_len was checked to be exactly one"),
127            line_ending_byte,
128        );
129    }
130
131    let line_ending_byte_array_ref = &[line_ending_byte];
132
133    let mut delimiter_state = DelimiterState::new(&unescaped_and_encoded_delimiters);
134
135    let mut output = Vec::new();
136
137    if serial {
138        for input_source in &mut input_source_vec {
139            output.clear();
140
141            loop {
142                if input_source.read_until(line_ending_byte, &mut output)? == 0 {
143                    break;
144                }
145                remove_trailing_line_ending_byte(line_ending_byte, &mut output);
146
147                delimiter_state.write_delimiter(&mut output);
148            }
149
150            delimiter_state.remove_trailing_delimiter(&mut output);
151
152            stdout.write_all(&output)?;
153            stdout.write_all(line_ending_byte_array_ref)?;
154        }
155    } else {
156        let mut eof = vec![false; input_source_vec_len];
157
158        loop {
159            output.clear();
160
161            let mut eof_count = 0;
162
163            for (i, input_source) in input_source_vec.iter_mut().enumerate() {
164                if eof[i] {
165                    eof_count += 1;
166                } else {
167                    match input_source.read_until(line_ending_byte, &mut output)? {
168                        0 => {
169                            eof[i] = true;
170                            eof_count += 1;
171                        }
172                        _ => {
173                            remove_trailing_line_ending_byte(line_ending_byte, &mut output);
174                        }
175                    }
176                }
177
178                delimiter_state.write_delimiter(&mut output);
179            }
180
181            if eof_count == input_source_vec_len {
182                break;
183            }
184
185            delimiter_state.remove_trailing_delimiter(&mut output);
186
187            stdout.write_all(&output)?;
188            stdout.write_all(line_ending_byte_array_ref)?;
189
190            // Quote:
191            //     When the -s option is not specified:
192            //     [...]
193            //     The delimiter shall be reset to the first element of list after each file operand is processed.
194            // https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html
195            delimiter_state.reset_to_first_delimiter();
196        }
197    }
198
199    Ok(())
200}
201
202fn write_single_input_source(
203    writer: &mut impl Write,
204    mut input_source: InputSource,
205    line_ending_byte: u8,
206) -> UResult<()> {
207    let mut buffer = [0_u8; 8 * 1024];
208    let mut has_data = false;
209    let mut last_byte = line_ending_byte;
210
211    loop {
212        let bytes_read = input_source.read(&mut buffer)?;
213
214        if bytes_read == 0 {
215            break;
216        }
217
218        has_data = true;
219        last_byte = buffer[bytes_read - 1];
220
221        writer.write_all(&buffer[..bytes_read])?;
222    }
223
224    if has_data && last_byte != line_ending_byte {
225        writer.write_all(&[line_ending_byte])?;
226    }
227
228    Ok(())
229}
230
231fn parse_delimiters(delimiters: &OsString) -> UResult<Box<[Box<[u8]>]>> {
232    let bytes = uucore::os_str_as_bytes(delimiters)?;
233    let mut vec = Vec::<Box<[u8]>>::with_capacity(bytes.len());
234    let mut i = 0;
235
236    while i < bytes.len() {
237        if bytes[i] == b'\\' {
238            i += 1;
239            if i >= bytes.len() {
240                return Err(USimpleError::new(
241                    1,
242                    translate!("paste-error-delimiter-unescaped-backslash", "delimiters" => delimiters.to_string_lossy()),
243                ));
244            }
245            match bytes[i] {
246                b'0' => vec.push(Box::new([])),
247                b'\\' => vec.push(Box::new([b'\\'])),
248                b'n' => vec.push(Box::new([b'\n'])),
249                b't' => vec.push(Box::new([b'\t'])),
250                b'b' => vec.push(Box::new([b'\x08'])),
251                b'f' => vec.push(Box::new([b'\x0C'])),
252                b'r' => vec.push(Box::new([b'\r'])),
253                b'v' => vec.push(Box::new([b'\x0B'])),
254                _ => {
255                    // Unknown escape: strip backslash, use the following character(s)
256                    let remaining = &bytes[i..];
257                    let len = mb_char_len(remaining).min(remaining.len());
258                    vec.push(Box::from(&bytes[i..i + len]));
259                    i += len;
260                    continue;
261                }
262            }
263            i += 1;
264        } else {
265            let remaining = &bytes[i..];
266            let len = mb_char_len(remaining).min(remaining.len());
267            vec.push(Box::from(&bytes[i..i + len]));
268            i += len;
269        }
270    }
271
272    Ok(vec.into_boxed_slice())
273}
274
/// Drop the final byte of `output` if, and only if, it equals `line_ending_byte`.
///
/// At most one byte is removed; an empty `output` is left untouched.
fn remove_trailing_line_ending_byte(line_ending_byte: u8, output: &mut Vec<u8>) {
    if output.ends_with(&[line_ending_byte]) {
        output.truncate(output.len() - 1);
    }
}
282
/// Tracks which delimiter has to be written next.
enum DelimiterState<'a> {
    /// Nothing is ever written between fields.
    NoDelimiters,
    /// The same delimiter is written every time.
    OneDelimiter(&'a [u8]),
    /// The delimiters are used in turn, starting over once the list is exhausted.
    MultipleDelimiters {
        /// The delimiter most recently written (initially the first one of the list).
        current_delimiter: &'a [u8],
        /// The complete list, kept so that the cycle can be restarted.
        delimiters: &'a [Box<[u8]>],
        /// Endless iterator yielding the delimiter to write next.
        delimiters_iterator: Cycle<Iter<'a, Box<[u8]>>>,
    },
}

impl<'a> DelimiterState<'a> {
    /// Choose the state matching the number of parsed delimiters.
    ///
    /// A list consisting of a single empty delimiter (`-d '\0'`) is treated like an empty
    /// list (`-d ''`).
    fn new(unescaped_and_encoded_delimiters: &'a [Box<[u8]>]) -> Self {
        let Some(first) = unescaped_and_encoded_delimiters.first() else {
            return Self::NoDelimiters;
        };

        if unescaped_and_encoded_delimiters.len() > 1 {
            return Self::MultipleDelimiters {
                current_delimiter: first,
                delimiters: unescaped_and_encoded_delimiters,
                delimiters_iterator: unescaped_and_encoded_delimiters.iter().cycle(),
            };
        }

        if first.is_empty() {
            Self::NoDelimiters
        } else {
            Self::OneDelimiter(first)
        }
    }

    /// Rewind the delimiter cycle, so that the next [`Self::write_delimiter`] call emits the
    /// first delimiter of the list again.
    /// This is a no-op unless there are multiple delimiters.
    fn reset_to_first_delimiter(&mut self) {
        match self {
            Self::MultipleDelimiters {
                delimiters,
                delimiters_iterator,
                ..
            } => {
                *delimiters_iterator = delimiters.iter().cycle();
            }
            Self::NoDelimiters | Self::OneDelimiter(_) => {}
        }
    }

    /// Cut the most recently written delimiter off the end of `output`.
    /// If there are no delimiters, or that delimiter is empty, this is a no-op.
    fn remove_trailing_delimiter(&mut self, output: &mut Vec<u8>) {
        let last_written: &[u8] = match self {
            Self::NoDelimiters => return,
            Self::OneDelimiter(only_delimiter) => *only_delimiter,
            Self::MultipleDelimiters {
                current_delimiter, ..
            } => *current_delimiter,
        };

        // An empty ("\0") delimiter left nothing behind that could be removed
        if last_written.is_empty() {
            return;
        }

        let output_len = output.len();

        match output_len.checked_sub(last_written.len()) {
            Some(kept_len) => output.truncate(kept_len),
            // Reachable when nothing was written at all; `output` must then be empty
            None => assert_eq!(output_len, 0),
        }
    }

    /// Append the next delimiter to `output` and remember it as the current one.
    /// If there are no delimiters, this is a no-op.
    fn write_delimiter(&mut self, output: &mut Vec<u8>) {
        let next_delimiter: &[u8] = match self {
            Self::NoDelimiters => return,
            Self::OneDelimiter(only_delimiter) => *only_delimiter,
            Self::MultipleDelimiters {
                current_delimiter,
                delimiters_iterator,
                ..
            } => {
                // Unwrap because the cycle iterator was created from a non-empty slice
                let upcoming: &'a [u8] = delimiters_iterator.next().unwrap();

                *current_delimiter = upcoming;

                upcoming
            }
        };

        output.extend_from_slice(next_delimiter);
    }
}
378
379enum InputSource {
380    File(BufReader<File>),
381    StandardInput(Rc<RefCell<Stdin>>),
382}
383
384impl InputSource {
385    fn read(&mut self, buf: &mut [u8]) -> UResult<usize> {
386        let us = match self {
387            Self::File(bu) => bu.read(buf)?,
388            Self::StandardInput(rc) => rc
389                .try_borrow()
390                .map_err(|bo| {
391                    USimpleError::new(1, translate!("paste-error-stdin-borrow", "error" => bo))
392                })?
393                .lock()
394                .read(buf)?,
395        };
396
397        Ok(us)
398    }
399
400    fn read_until(&mut self, byte: u8, buf: &mut Vec<u8>) -> UResult<usize> {
401        let us = match self {
402            Self::File(bu) => bu.read_until(byte, buf)?,
403            Self::StandardInput(rc) => rc
404                .try_borrow()
405                .map_err(|bo| {
406                    USimpleError::new(1, translate!("paste-error-stdin-borrow", "error" => bo))
407                })?
408                .lock()
409                .read_until(byte, buf)?,
410        };
411
412        Ok(us)
413    }
414}