Skip to main content

coreutils_rs/paste/
core.rs

1use std::io::Write;
2
/// Configuration for the paste command.
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so a configuration can be
/// logged, duplicated and compared by callers and tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PasteConfig {
    /// Delimiter bytes inserted between joined fields.
    ///
    /// The list is cycled when more joins are needed than there are
    /// delimiters; an empty list means nothing is inserted between fields.
    pub delimiters: Vec<u8>,
    /// Serial mode: paste one file at a time (all lines of a file are joined
    /// onto a single output line).
    pub serial: bool,
    /// Use NUL as the line terminator, for both input and output, instead of
    /// newline.
    pub zero_terminated: bool,
}
12
13impl Default for PasteConfig {
14    fn default() -> Self {
15        Self {
16            delimiters: vec![b'\t'],
17            serial: false,
18            zero_terminated: false,
19        }
20    }
21}
22
/// Converts a delimiter specification into the raw delimiter bytes.
///
/// Bytes are copied through unchanged, except for four two-character escape
/// sequences that are collapsed into a single byte:
///
/// | escape | result    |
/// |--------|-----------|
/// | `\n`   | newline   |
/// | `\t`   | tab       |
/// | `\\`   | backslash |
/// | `\0`   | NUL byte  |
///
/// A backslash followed by any other byte, or a backslash at the very end of
/// `s`, is kept as a literal backslash; the byte after it (if any) is then
/// copied as usual. An empty `s` produces an empty list.
///
/// NOTE(review): POSIX `paste` describes `\0` as an *empty string* rather
/// than a NUL byte — confirm that emitting a NUL delimiter here is intended.
pub fn parse_delimiters(s: &str) -> Vec<u8> {
    let mut parsed = Vec::with_capacity(s.len());
    let mut input = s.bytes().peekable();

    while let Some(byte) = input.next() {
        if byte != b'\\' {
            parsed.push(byte);
            continue;
        }

        // Look at the byte after the backslash without consuming it, so an
        // unrecognised follower is still processed on the next iteration.
        let escaped = match input.peek().copied() {
            Some(b'n') => Some(b'\n'),
            Some(b't') => Some(b'\t'),
            Some(b'\\') => Some(b'\\'),
            Some(b'0') => Some(0u8),
            _ => None,
        };

        match escaped {
            Some(value) => {
                parsed.push(value);
                // Consume the escape's second byte.
                input.next();
            }
            // Unknown escape or trailing backslash: keep the backslash itself.
            None => parsed.push(b'\\'),
        }
    }

    parsed
}
64
65/// Paste files in normal (parallel) mode and return the output buffer.
66/// Uses cursor-based scanning — no offset arrays, minimal allocation.
67pub fn paste_parallel_to_vec(file_data: &[&[u8]], config: &PasteConfig) -> Vec<u8> {
68    let terminator = if config.zero_terminated { 0u8 } else { b'\n' };
69    let delims = &config.delimiters;
70
71    if file_data.is_empty() || file_data.iter().all(|d| d.is_empty()) {
72        return Vec::new();
73    }
74
75    // Count max lines using SIMD memchr (fast count pass, no allocation)
76    let max_lines = file_data
77        .iter()
78        .map(|data| {
79            if data.is_empty() {
80                return 0;
81            }
82            let count = memchr::memchr_iter(terminator, data).count();
83            if data.last() != Some(&terminator) {
84                count + 1
85            } else {
86                count
87            }
88        })
89        .max()
90        .unwrap_or(0);
91
92    if max_lines == 0 {
93        return Vec::new();
94    }
95
96    // Estimate output size
97    let total_input: usize = file_data.iter().map(|d| d.len()).sum();
98    let delim_overhead = max_lines * file_data.len();
99    let mut output = Vec::with_capacity(total_input + delim_overhead);
100
101    // Cursors track current position in each file (no offset arrays needed)
102    let mut cursors = vec![0usize; file_data.len()];
103
104    for _ in 0..max_lines {
105        for (file_idx, data) in file_data.iter().enumerate() {
106            if file_idx > 0 && !delims.is_empty() {
107                output.push(delims[(file_idx - 1) % delims.len()]);
108            }
109            let cursor = &mut cursors[file_idx];
110            if *cursor < data.len() {
111                match memchr::memchr(terminator, &data[*cursor..]) {
112                    Some(pos) => {
113                        output.extend_from_slice(&data[*cursor..*cursor + pos]);
114                        *cursor += pos + 1;
115                    }
116                    None => {
117                        output.extend_from_slice(&data[*cursor..]);
118                        *cursor = data.len();
119                    }
120                }
121            }
122        }
123        output.push(terminator);
124    }
125
126    output
127}
128
129/// Paste files in serial mode and return the output buffer.
130/// For each file, join all lines with the delimiter list (cycling).
131/// Uses inline memchr scanning — no offset arrays needed.
132pub fn paste_serial_to_vec(file_data: &[&[u8]], config: &PasteConfig) -> Vec<u8> {
133    let terminator = if config.zero_terminated { 0u8 } else { b'\n' };
134    let delims = &config.delimiters;
135
136    // Estimate output size
137    let total_input: usize = file_data.iter().map(|d| d.len()).sum();
138    let mut output = Vec::with_capacity(total_input + file_data.len());
139
140    for data in file_data {
141        if data.is_empty() {
142            output.push(terminator);
143            continue;
144        }
145        // Strip trailing terminator if present (we add our own at the end)
146        let effective = if data.last() == Some(&terminator) {
147            &data[..data.len() - 1]
148        } else {
149            *data
150        };
151        // Scan through data, replacing terminators with cycling delimiters
152        let mut cursor = 0;
153        let mut delim_idx = 0;
154        while cursor < effective.len() {
155            match memchr::memchr(terminator, &effective[cursor..]) {
156                Some(pos) => {
157                    output.extend_from_slice(&effective[cursor..cursor + pos]);
158                    if !delims.is_empty() {
159                        output.push(delims[delim_idx % delims.len()]);
160                        delim_idx += 1;
161                    }
162                    cursor += pos + 1;
163                }
164                None => {
165                    output.extend_from_slice(&effective[cursor..]);
166                    break;
167                }
168            }
169        }
170        output.push(terminator);
171    }
172
173    output
174}
175
176/// Main paste entry point. Writes directly to the provided writer.
177pub fn paste(
178    file_data: &[&[u8]],
179    config: &PasteConfig,
180    out: &mut impl Write,
181) -> std::io::Result<()> {
182    let output = if config.serial {
183        paste_serial_to_vec(file_data, config)
184    } else {
185        paste_parallel_to_vec(file_data, config)
186    };
187    out.write_all(&output)
188}
189
190/// Build the paste output as a Vec, then return it for the caller to write.
191/// This allows the binary to use raw write() for maximum throughput.
192pub fn paste_to_vec(file_data: &[&[u8]], config: &PasteConfig) -> Vec<u8> {
193    if config.serial {
194        paste_serial_to_vec(file_data, config)
195    } else {
196        paste_parallel_to_vec(file_data, config)
197    }
198}