Skip to main content

coreutils_rs/paste/
core.rs

1use std::io::Write;
2
/// Configuration for the paste command.
///
/// The [`Default`] implementation yields a single TAB delimiter, parallel
/// (non-serial) mode and newline-terminated lines.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PasteConfig {
    /// Delimiter bytes, used one after another and cycled when exhausted.
    /// An empty list joins fields with nothing between them.
    pub delimiters: Vec<u8>,
    /// Serial mode: paste one file at a time, joining all of its lines into
    /// a single output line.
    pub serial: bool,
    /// Use NUL as line terminator instead of newline (applies to both the
    /// input splitting and the emitted output).
    pub zero_terminated: bool,
}
12
13impl Default for PasteConfig {
14    fn default() -> Self {
15        Self {
16            delimiters: vec![b'\t'],
17            serial: false,
18            zero_terminated: false,
19        }
20    }
21}
22
/// Expand a delimiter specification into the list of delimiter bytes.
///
/// Recognised escapes: `\n` (newline), `\t` (tab), `\\` (backslash) and
/// `\0` (a NUL byte). A backslash followed by any other byte, or a backslash
/// at the very end of the string, is kept as a literal backslash; the byte
/// after it (if any) is then processed on its own. All other bytes are copied
/// unchanged. An empty string produces an empty list (no delimiter).
pub fn parse_delimiters(s: &str) -> Vec<u8> {
    let mut delimiters = Vec::with_capacity(s.len());
    let mut rest = s.as_bytes();

    while let Some((&head, tail)) = rest.split_first() {
        // `Some(byte)` only when `head` starts one of the recognised
        // two-byte escapes.
        let escape = if head == b'\\' {
            match tail.first().copied() {
                Some(b'n') => Some(b'\n'),
                Some(b't') => Some(b'\t'),
                Some(b'\\') => Some(b'\\'),
                Some(b'0') => Some(0u8),
                _ => None,
            }
        } else {
            None
        };

        rest = match escape {
            Some(byte) => {
                delimiters.push(byte);
                // Skip the escape letter as well; `tail` is non-empty here.
                &tail[1..]
            }
            None => {
                // Plain byte, unknown escape or trailing backslash: emit the
                // current byte literally and advance by one.
                delimiters.push(head);
                tail
            }
        };
    }

    delimiters
}
64
65/// Build line start/end offsets for a given data buffer.
66/// Returns a Vec of (start, end) pairs where end is exclusive and does NOT include the terminator.
67#[inline]
68fn build_line_offsets(data: &[u8], terminator: u8) -> Vec<(usize, usize)> {
69    let mut offsets = Vec::new();
70    if data.is_empty() {
71        return offsets;
72    }
73    // Pre-count lines for exact allocation
74    let count = memchr::memchr_iter(terminator, data).count()
75        + if data.last() != Some(&terminator) {
76            1
77        } else {
78            0
79        };
80    offsets.reserve_exact(count);
81    let mut start = 0;
82    for pos in memchr::memchr_iter(terminator, data) {
83        offsets.push((start, pos));
84        start = pos + 1;
85    }
86    // Last line without trailing terminator
87    if start < data.len() {
88        offsets.push((start, data.len()));
89    }
90    offsets
91}
92
93/// Paste files in normal (parallel) mode and return the output buffer.
94/// For each line index, concatenate corresponding lines from all files with delimiters.
95pub fn paste_parallel_to_vec(file_data: &[&[u8]], config: &PasteConfig) -> Vec<u8> {
96    let terminator = if config.zero_terminated { 0u8 } else { b'\n' };
97
98    // Build line offset arrays for each file
99    let all_offsets: Vec<Vec<(usize, usize)>> = file_data
100        .iter()
101        .map(|d| build_line_offsets(d, terminator))
102        .collect();
103
104    let max_lines = all_offsets.iter().map(|o| o.len()).max().unwrap_or(0);
105    if max_lines == 0 && file_data.iter().all(|d| d.is_empty()) {
106        return Vec::new();
107    }
108
109    // Estimate output size
110    let total_input: usize = file_data.iter().map(|d| d.len()).sum();
111    let delim_overhead = max_lines * file_data.len();
112    let mut output = Vec::with_capacity(total_input + delim_overhead);
113
114    let delims = &config.delimiters;
115
116    for line_idx in 0..max_lines {
117        for (file_idx, (offsets, data)) in all_offsets.iter().zip(file_data.iter()).enumerate() {
118            if file_idx > 0 && !delims.is_empty() {
119                output.push(delims[(file_idx - 1) % delims.len()]);
120            }
121            if line_idx < offsets.len() {
122                let (start, end) = offsets[line_idx];
123                output.extend_from_slice(&data[start..end]);
124            }
125        }
126        output.push(terminator);
127    }
128
129    output
130}
131
132/// Paste files in serial mode and return the output buffer.
133/// For each file, join all lines with the delimiter list (cycling).
134pub fn paste_serial_to_vec(file_data: &[&[u8]], config: &PasteConfig) -> Vec<u8> {
135    let terminator = if config.zero_terminated { 0u8 } else { b'\n' };
136    let delims = &config.delimiters;
137
138    // Estimate output size
139    let total_input: usize = file_data.iter().map(|d| d.len()).sum();
140    let mut output = Vec::with_capacity(total_input + file_data.len());
141
142    for data in file_data {
143        let offsets = build_line_offsets(data, terminator);
144        for (i, &(start, end)) in offsets.iter().enumerate() {
145            if i > 0 && !delims.is_empty() {
146                output.push(delims[(i - 1) % delims.len()]);
147            }
148            output.extend_from_slice(&data[start..end]);
149        }
150        output.push(terminator);
151    }
152
153    output
154}
155
156/// Main paste entry point. Writes directly to the provided writer.
157pub fn paste(
158    file_data: &[&[u8]],
159    config: &PasteConfig,
160    out: &mut impl Write,
161) -> std::io::Result<()> {
162    let output = if config.serial {
163        paste_serial_to_vec(file_data, config)
164    } else {
165        paste_parallel_to_vec(file_data, config)
166    };
167    out.write_all(&output)
168}
169
170/// Build the paste output as a Vec, then return it for the caller to write.
171/// This allows the binary to use raw write() for maximum throughput.
172pub fn paste_to_vec(file_data: &[&[u8]], config: &PasteConfig) -> Vec<u8> {
173    if config.serial {
174        paste_serial_to_vec(file_data, config)
175    } else {
176        paste_parallel_to_vec(file_data, config)
177    }
178}