use std::io::Write;
/// Options controlling how the input files are merged.
pub struct PasteConfig {
/// Delimiter bytes placed between joined fields. The list is cycled when it
/// is shorter than the number of joins; an empty list emits no delimiter.
pub delimiters: Vec<u8>,
/// When `true`, each file's lines are joined onto a single output line
/// (the `paste_serial_*` functions); otherwise corresponding lines of all
/// files are merged side by side (the `paste_parallel_*` functions).
pub serial: bool,
/// When `true`, lines are terminated by a NUL byte (0) instead of `\n`.
pub zero_terminated: bool,
}
impl Default for PasteConfig {
fn default() -> Self {
Self {
delimiters: vec![b'\t'],
serial: false,
zero_terminated: false,
}
}
}
/// Expands a delimiter specification into the raw delimiter bytes.
///
/// Recognised escapes are `\n` (newline), `\t` (tab), `\\` (backslash) and
/// `\0` (NUL byte). A backslash followed by any other byte, or a backslash at
/// the very end of the input, is kept as a literal backslash and the byte
/// after it (if any) is then processed normally. An empty input yields an
/// empty list.
pub fn parse_delimiters(s: &str) -> Vec<u8> {
    let mut rest = s.as_bytes();
    let mut out = Vec::with_capacity(rest.len());
    loop {
        match rest {
            [] => break,
            [b'\\', code, tail @ ..] => {
                let expanded = match *code {
                    b'n' => Some(b'\n'),
                    b't' => Some(b'\t'),
                    b'\\' => Some(b'\\'),
                    b'0' => Some(0u8),
                    _ => None,
                };
                match expanded {
                    Some(byte) => {
                        out.push(byte);
                        rest = tail;
                    }
                    None => {
                        // Unknown escape: keep the backslash and let the
                        // following byte be handled on the next pass.
                        out.push(b'\\');
                        rest = &rest[1..];
                    }
                }
            }
            [plain, tail @ ..] => {
                out.push(*plain);
                rest = tail;
            }
        }
    }
    out
}
/// Size (2 MiB) at which the streaming writers flush their output buffer to stdout.
const BUF_SIZE: usize = 2 * 1024 * 1024;
/// Writes all of `data` to file descriptor 1 (stdout) using `write(2)` directly.
///
/// Short writes are continued from where they stopped and `EINTR` is retried.
///
/// # Errors
///
/// Returns `ErrorKind::WriteZero` if `write` reports zero bytes written, or
/// the OS error for any failure other than an interrupted call.
#[cfg(unix)]
pub fn raw_write_all(data: &[u8]) -> std::io::Result<()> {
    let mut pending = data;
    while !pending.is_empty() {
        // SAFETY: `pending` is a live slice, so its pointer is valid for reads
        // of `pending.len()` bytes for the duration of the call.
        let outcome = unsafe {
            libc::write(
                1,
                pending.as_ptr() as *const libc::c_void,
                pending.len() as _,
            )
        };
        if outcome < 0 {
            let err = std::io::Error::last_os_error();
            if err.kind() == std::io::ErrorKind::Interrupted {
                continue;
            }
            return Err(err);
        }
        if outcome == 0 {
            return Err(std::io::Error::new(
                std::io::ErrorKind::WriteZero,
                "write returned 0",
            ));
        }
        pending = &pending[outcome as usize..];
    }
    Ok(())
}
/// Writes all of `data` to the process's standard output and flushes it.
///
/// # Errors
///
/// Propagates any error from writing to or flushing stdout.
#[cfg(not(unix))]
pub fn raw_write_all(data: &[u8]) -> std::io::Result<()> {
    let mut out = std::io::stdout().lock();
    out.write_all(data).and_then(|()| out.flush())
}
/// Merges corresponding lines of all inputs side by side and streams the
/// result to stdout.
///
/// Nothing is written when there are no inputs or every input is empty. A
/// single input is written through unchanged, with a terminator appended if
/// it does not already end with one. Two inputs with exactly one delimiter
/// take the dedicated two-file path; everything else uses the general path.
///
/// # Errors
///
/// Propagates any error from writing to stdout.
pub fn paste_parallel_stream(file_data: &[&[u8]], config: &PasteConfig) -> std::io::Result<()> {
    // `all` is vacuously true for an empty list, so this also covers "no files".
    if file_data.iter().all(|d| d.is_empty()) {
        return Ok(());
    }
    let terminator = if config.zero_terminated { 0u8 } else { b'\n' };
    let delims = config.delimiters.as_slice();
    match (file_data, delims) {
        ([only], _) => {
            raw_write_all(only)?;
            if only.last() == Some(&terminator) {
                Ok(())
            } else {
                raw_write_all(&[terminator])
            }
        }
        ([first, second], [delim]) => {
            paste_two_files_streaming(first, second, *delim, terminator)
        }
        _ => paste_n_files_streaming(
            file_data,
            delims,
            !delims.is_empty(),
            file_data.len(),
            terminator,
        ),
    }
}
/// Streams the side-by-side merge of exactly two inputs to stdout.
///
/// Each output row is `<line of a><delim><line of b><terminator>`. When one
/// input runs out of lines before the other, its field is left empty. Output
/// is staged in a buffer that is flushed once it reaches `BUF_SIZE` bytes.
///
/// # Errors
///
/// Propagates any error from writing to stdout.
fn paste_two_files_streaming(
    data_a: &[u8],
    data_b: &[u8],
    delim: u8,
    terminator: u8,
) -> std::io::Result<()> {
    /// Returns the `(offset, length)` of the next line of one input, or `None`
    /// once that input has no more lines. `hits` yields terminator positions,
    /// `cursor` is the start of the next line, and `exhausted` is set when
    /// `hits` runs dry. A final line without a terminator is still reported.
    fn next_span<I: Iterator<Item = usize>>(
        hits: &mut I,
        cursor: &mut usize,
        exhausted: &mut bool,
        total: usize,
    ) -> Option<(usize, usize)> {
        if *exhausted {
            return None;
        }
        let begin = *cursor;
        if let Some(hit) = hits.next() {
            *cursor = hit + 1;
            return Some((begin, hit - begin));
        }
        *exhausted = true;
        if begin < total {
            *cursor = total;
            Some((begin, total - begin))
        } else {
            None
        }
    }

    if data_a.is_empty() && data_b.is_empty() {
        return Ok(());
    }

    let flush_at = BUF_SIZE;
    let mut out: Vec<u8> = Vec::with_capacity(flush_at + 65536);
    // Number of initialised bytes at the front of `out`'s allocation; the Vec's
    // own length is only set right before a flush.
    let mut filled = 0usize;

    let mut hits_a = memchr::memchr_iter(terminator, data_a);
    let mut hits_b = memchr::memchr_iter(terminator, data_b);
    let (mut cursor_a, mut cursor_b) = (0usize, 0usize);
    let mut exhausted_a = data_a.is_empty();
    let mut exhausted_b = data_b.is_empty();

    while !(exhausted_a && exhausted_b) {
        let span_a = next_span(&mut hits_a, &mut cursor_a, &mut exhausted_a, data_a.len());
        let span_b = next_span(&mut hits_b, &mut cursor_b, &mut exhausted_b, data_b.len());
        if span_a.is_none() && span_b.is_none() {
            break;
        }
        let (a_start, a_len) = span_a.unwrap_or((0, 0));
        let (b_start, b_len) = span_b.unwrap_or((0, 0));

        debug_assert!(a_start + a_len <= data_a.len(), "a out of bounds");
        debug_assert!(b_start + b_len <= data_b.len(), "b out of bounds");
        debug_assert!(a_len < isize::MAX as usize && b_len < isize::MAX as usize);
        debug_assert!(a_len
            .checked_add(b_len)
            .and_then(|x| x.checked_add(2))
            .is_some());

        // Two extra bytes: the delimiter and the terminator.
        let row_len = a_len + b_len + 2;
        if filled + row_len > out.capacity() {
            // SAFETY: the first `filled` bytes were written below and
            // `filled` never exceeds the capacity.
            unsafe { out.set_len(filled) };
            raw_write_all(&out)?;
            out.clear();
            filled = 0;
            if row_len > out.capacity() {
                out.reserve(row_len);
            }
        }

        // SAFETY: the check above guarantees `filled + row_len <= capacity`,
        // so every write stays inside the allocation; the source ranges lie
        // within `data_a` / `data_b` and cannot overlap the output buffer.
        unsafe {
            let dst = out.as_mut_ptr();
            if a_len > 0 {
                std::ptr::copy_nonoverlapping(
                    data_a.as_ptr().add(a_start),
                    dst.add(filled),
                    a_len,
                );
                filled += a_len;
            }
            *dst.add(filled) = delim;
            filled += 1;
            if b_len > 0 {
                std::ptr::copy_nonoverlapping(
                    data_b.as_ptr().add(b_start),
                    dst.add(filled),
                    b_len,
                );
                filled += b_len;
            }
            *dst.add(filled) = terminator;
            filled += 1;
        }

        if filled >= flush_at {
            // SAFETY: as above, the first `filled` bytes are initialised.
            unsafe { out.set_len(filled) };
            raw_write_all(&out)?;
            out.clear();
            filled = 0;
        }
    }

    if filled > 0 {
        // SAFETY: as above, the first `filled` bytes are initialised.
        unsafe { out.set_len(filled) };
        raw_write_all(&out)?;
    }
    Ok(())
}
/// Streams the side-by-side merge of any number of inputs to stdout.
///
/// Every output row holds one line from each input, in order, separated by
/// delimiters taken cyclically from `delims` (only when `has_delims` is true)
/// and followed by `terminator`. Inputs that have run out of lines contribute
/// an empty field. Rows are produced until every input is exhausted.
///
/// `nfiles` is the number of entries of `file_data` to merge; `has_delims`
/// must only be true when `delims` is non-empty.
///
/// # Errors
///
/// Returns an error when `nfiles` exceeds 65536, and propagates any error
/// from writing to stdout.
fn paste_n_files_streaming(
    file_data: &[&[u8]],
    delims: &[u8],
    has_delims: bool,
    nfiles: usize,
    terminator: u8,
) -> std::io::Result<()> {
    /// Writes the first `*pos` bytes of `buf` to stdout, then resets both.
    fn flush(buf: &mut Vec<u8>, pos: &mut usize) -> std::io::Result<()> {
        // SAFETY: callers only advance `pos` past bytes they have written, and
        // never beyond the buffer's capacity.
        unsafe { buf.set_len(*pos) };
        raw_write_all(buf.as_slice())?;
        buf.clear();
        *pos = 0;
        Ok(())
    }

    /// Copies `bytes` to `buf` at `*pos`, flushing (and growing, for an
    /// oversized line) first when they would not fit.
    fn append(buf: &mut Vec<u8>, pos: &mut usize, bytes: &[u8]) -> std::io::Result<()> {
        if *pos + bytes.len() > buf.capacity() {
            flush(buf, pos)?;
            if bytes.len() > buf.capacity() {
                buf.reserve(bytes.len() + 4096);
            }
        }
        // SAFETY: `*pos + bytes.len() <= buf.capacity()` holds after the check
        // above, and `bytes` borrows input data, not the output buffer.
        unsafe {
            std::ptr::copy_nonoverlapping(bytes.as_ptr(), buf.as_mut_ptr().add(*pos), bytes.len());
        }
        *pos += bytes.len();
        Ok(())
    }

    /// Stores a single byte at `*pos`, flushing first when the buffer is full.
    fn append_byte(buf: &mut Vec<u8>, pos: &mut usize, byte: u8) -> std::io::Result<()> {
        // A line that exactly fills the buffer leaves `*pos == capacity`, so
        // this state is reachable and must be handled, not asserted away.
        if *pos >= buf.capacity() {
            flush(buf, pos)?;
        }
        // SAFETY: `*pos < buf.capacity()` after the check above.
        unsafe { *buf.as_mut_ptr().add(*pos) = byte };
        *pos += 1;
        Ok(())
    }

    // The 65536 bytes of slack in the buffer cover one delimiter per file, so
    // a row made only of delimiters can never trigger a flush (see the
    // rollback below).
    if nfiles > 65536 {
        return Err(std::io::Error::other("too many files"));
    }

    let mut cursors: Vec<usize> = vec![0; nfiles];
    let mut done: Vec<bool> = file_data.iter().map(|d| d.is_empty()).collect();
    let mut files_remaining = done.iter().filter(|&&d| !d).count();
    let buf_cap = BUF_SIZE;
    let mut buf: Vec<u8> = Vec::with_capacity(buf_cap + 65536);
    // Number of initialised bytes at the front of `buf`'s allocation.
    let mut pos: usize = 0;
    let mut iters: Vec<memchr::Memchr<'_>> = file_data
        .iter()
        .map(|d| memchr::memchr_iter(terminator, d))
        .collect();

    while files_remaining > 0 {
        debug_assert!(
            pos < buf_cap,
            "saved_pos invariant: pos must be < buf_cap at iteration start"
        );
        // Remembered so a row that turns out to contain no data can be undone.
        let saved_pos = pos;
        let mut any_iter_advanced = false;

        for file_idx in 0..nfiles {
            if file_idx > 0 && has_delims {
                let d = delims[(file_idx - 1) % delims.len()];
                append_byte(&mut buf, &mut pos, d)?;
            }
            if done[file_idx] {
                continue;
            }
            let data = file_data[file_idx];
            let cur = cursors[file_idx];
            match iters[file_idx].next() {
                Some(nl_pos) => {
                    any_iter_advanced = true;
                    append(&mut buf, &mut pos, &data[cur..nl_pos])?;
                    cursors[file_idx] = nl_pos + 1;
                }
                None => {
                    // Final line without a terminator, if any.
                    if cur < data.len() {
                        any_iter_advanced = true;
                        append(&mut buf, &mut pos, &data[cur..])?;
                    }
                    done[file_idx] = true;
                    files_remaining -= 1;
                    cursors[file_idx] = data.len();
                }
            }
        }

        if !any_iter_advanced {
            // Every input merely reported end-of-data: only delimiters were
            // staged for this row, so drop them and stop.
            debug_assert_eq!(files_remaining, 0);
            pos = saved_pos;
            break;
        }

        append_byte(&mut buf, &mut pos, terminator)?;
        if pos >= buf_cap {
            flush(&mut buf, &mut pos)?;
        }
    }

    if pos > 0 {
        flush(&mut buf, &mut pos)?;
    }
    Ok(())
}
/// Joins the lines of each input onto a single output line (serial mode) and
/// streams the result to stdout.
///
/// For every input, the terminators *between* lines are replaced by
/// delimiters taken cyclically from `config.delimiters` (or removed when the
/// list is empty), and exactly one terminator is written at the end. A
/// trailing terminator in the input never produces a trailing delimiter. An
/// empty input produces a lone terminator.
///
/// # Errors
///
/// Propagates any error from writing to stdout.
pub fn paste_serial_stream(file_data: &[&[u8]], config: &PasteConfig) -> std::io::Result<()> {
    /// Writes `buf` to stdout and empties it once it holds `BUF_SIZE` bytes or more.
    fn flush_if_full(buf: &mut Vec<u8>) -> std::io::Result<()> {
        if buf.len() >= BUF_SIZE {
            raw_write_all(buf.as_slice())?;
            buf.clear();
        }
        Ok(())
    }

    /// Appends `bytes` to `buf`, flushing first when they would not fit.
    /// Pieces larger than the whole buffer are written straight to stdout
    /// instead of growing the buffer.
    fn append(buf: &mut Vec<u8>, bytes: &[u8]) -> std::io::Result<()> {
        if buf.len() + bytes.len() > buf.capacity() {
            raw_write_all(buf.as_slice())?;
            buf.clear();
            if bytes.len() > buf.capacity() {
                return raw_write_all(bytes);
            }
        }
        buf.extend_from_slice(bytes);
        Ok(())
    }

    let terminator = if config.zero_terminated { 0u8 } else { b'\n' };
    let delims = config.delimiters.as_slice();
    let mut buf: Vec<u8> = Vec::with_capacity(BUF_SIZE + 4096);

    for data in file_data {
        // Drop the final terminator so that only terminators separating two
        // lines remain; each of those maps to exactly one delimiter.
        let body: &[u8] = match data.split_last() {
            Some((&last, head)) if last == terminator => head,
            _ => *data,
        };

        if let &[replacement] = delims {
            // Single delimiter: bulk-copy and overwrite terminators in place.
            let needs_replace = replacement != terminator;
            for chunk in body.chunks(BUF_SIZE) {
                let start = buf.len();
                buf.extend_from_slice(chunk);
                if needs_replace {
                    for hit in memchr::memchr_iter(terminator, chunk) {
                        buf[start + hit] = replacement;
                    }
                }
                flush_if_full(&mut buf)?;
            }
        } else {
            // Zero or several delimiters: emit line, delimiter, line, ...
            let mut cursor = 0usize;
            for (gap_idx, hit) in memchr::memchr_iter(terminator, body).enumerate() {
                append(&mut buf, &body[cursor..hit])?;
                if !delims.is_empty() {
                    buf.push(delims[gap_idx % delims.len()]);
                }
                cursor = hit + 1;
                flush_if_full(&mut buf)?;
            }
            append(&mut buf, &body[cursor..])?;
        }

        buf.push(terminator);
        flush_if_full(&mut buf)?;
    }

    if !buf.is_empty() {
        raw_write_all(&buf)?;
    }
    Ok(())
}
/// Streams the merged output to stdout, using serial mode when
/// `config.serial` is set and parallel mode otherwise.
///
/// # Errors
///
/// Propagates the error returned by the selected streaming routine.
pub fn paste_stream(file_data: &[&[u8]], config: &PasteConfig) -> std::io::Result<()> {
    match config.serial {
        true => paste_serial_stream(file_data, config),
        false => paste_parallel_stream(file_data, config),
    }
}
/// Returns the `(start, end)` byte offsets of every line in `data`, with the
/// terminator excluded from each range.
///
/// A final line that lacks a terminator is included; a trailing terminator
/// does not produce an extra empty line. Empty input yields an empty list.
///
/// # Panics
///
/// Panics when `data` is longer than `u32::MAX` bytes, because offsets are
/// stored as `u32`.
#[inline]
fn presplit_lines(data: &[u8], terminator: u8) -> Vec<(u32, u32)> {
    let total = data.len();
    if total == 0 {
        return Vec::new();
    }
    assert!(
        total <= u32::MAX as usize,
        "presplit_lines: data exceeds 4 GiB"
    );
    // Rough guess of 40 bytes per line to limit reallocations.
    let mut spans: Vec<(u32, u32)> = Vec::with_capacity(total / 40 + 1);
    let mut line_start = 0u32;
    spans.extend(memchr::memchr_iter(terminator, data).map(|hit| {
        let hit = hit as u32;
        let span = (line_start, hit);
        line_start = hit + 1;
        span
    }));
    // `line_start` sits before the end only when the last byte is not a
    // terminator, i.e. when an unterminated final line remains.
    if (line_start as usize) < total {
        spans.push((line_start, total as u32));
    }
    spans
}
/// Merges corresponding lines of all inputs side by side and returns the
/// result as one buffer.
///
/// Each output row holds one line per input, separated by delimiters taken
/// cyclically from `config.delimiters` (none when the list is empty) and
/// followed by the terminator. Inputs with fewer lines contribute empty
/// fields. Returns an empty buffer when there are no inputs or all are empty.
///
/// # Panics
///
/// Panics if any input is longer than `u32::MAX` bytes (see `presplit_lines`).
pub fn paste_parallel_to_vec(file_data: &[&[u8]], config: &PasteConfig) -> Vec<u8> {
    // `all` is vacuously true for an empty list, so this also covers "no files".
    if file_data.iter().all(|d| d.is_empty()) {
        return Vec::new();
    }
    let terminator = if config.zero_terminated { 0u8 } else { b'\n' };
    let delims = config.delimiters.as_slice();

    let spans_per_file: Vec<Vec<(u32, u32)>> = file_data
        .iter()
        .map(|bytes| presplit_lines(bytes, terminator))
        .collect();
    let row_count = spans_per_file.iter().map(Vec::len).max().unwrap_or(0);
    if row_count == 0 {
        return Vec::new();
    }

    let emit_delims = !delims.is_empty();
    // Per row: one terminator, plus one delimiter between each pair of files.
    let per_row_overhead = if emit_delims { file_data.len() } else { 1 };
    let payload: usize = spans_per_file
        .iter()
        .flatten()
        .map(|&(s, e)| (e - s) as usize)
        .sum();
    let exact_size = row_count * per_row_overhead + payload;

    let mut output: Vec<u8> = Vec::with_capacity(exact_size);
    let dst: *mut u8 = output.as_mut_ptr();
    let mut written = 0usize;

    for row in 0..row_count {
        for (file_idx, (spans, bytes)) in spans_per_file.iter().zip(file_data).enumerate() {
            if emit_delims && file_idx > 0 {
                let d = delims[(file_idx - 1) % delims.len()];
                // SAFETY: `exact_size` counts this delimiter, so `written` is
                // still inside the reserved capacity.
                unsafe { *dst.add(written) = d };
                written += 1;
            }
            if let Some(&(s, e)) = spans.get(row) {
                let len = (e - s) as usize;
                if len > 0 {
                    // SAFETY: `s..e` was produced from `bytes` by
                    // `presplit_lines`, and `exact_size` counts these `len`
                    // bytes, so both ranges are in bounds and disjoint.
                    unsafe {
                        std::ptr::copy_nonoverlapping(
                            bytes.as_ptr().add(s as usize),
                            dst.add(written),
                            len,
                        );
                    }
                    written += len;
                }
            }
        }
        // SAFETY: `exact_size` counts one terminator per row.
        unsafe { *dst.add(written) = terminator };
        written += 1;
    }

    assert_eq!(written, exact_size, "exact_size miscalculated");
    // SAFETY: exactly `written == exact_size <= capacity` bytes were initialised above.
    unsafe { output.set_len(written) };
    output
}
/// Joins the lines of each input onto a single output line (serial mode) and
/// returns the result as one buffer.
///
/// For every input, the terminators between lines are replaced by delimiters
/// taken cyclically from `config.delimiters` (or dropped when the list is
/// empty), and one terminator ends the joined line. An empty input produces a
/// lone terminator.
///
/// # Panics
///
/// With zero or several delimiters, panics if an input is longer than
/// `u32::MAX` bytes (see `presplit_lines`).
pub fn paste_serial_to_vec(file_data: &[&[u8]], config: &PasteConfig) -> Vec<u8> {
    let terminator = if config.zero_terminated { 0u8 } else { b'\n' };
    let delims = config.delimiters.as_slice();
    let input_bytes: usize = file_data.iter().map(|d| d.len()).sum();
    // Output is never longer than the input plus one terminator per file.
    let mut output = Vec::with_capacity(input_bytes + file_data.len());

    if let &[delim] = delims {
        // Single delimiter: bulk-copy each file and overwrite the interior
        // terminators in place.
        for data in file_data {
            let body: &[u8] = match data.split_last() {
                Some((&last, head)) if last == terminator => head,
                _ => *data,
            };
            let base = output.len();
            output.extend_from_slice(body);
            if delim != terminator {
                for hit in memchr::memchr_iter(terminator, body) {
                    output[base + hit] = delim;
                }
            }
            output.push(terminator);
        }
        return output;
    }

    for data in file_data {
        for (idx, &(s, e)) in presplit_lines(data, terminator).iter().enumerate() {
            if idx > 0 && !delims.is_empty() {
                output.push(delims[(idx - 1) % delims.len()]);
            }
            output.extend_from_slice(&data[s as usize..e as usize]);
        }
        output.push(terminator);
    }
    output
}
/// Builds the complete merged output in memory and writes it to `out`.
///
/// Serial mode is used when `config.serial` is set, parallel mode otherwise.
///
/// # Errors
///
/// Propagates any error from writing to `out`.
pub fn paste(
    file_data: &[&[u8]],
    config: &PasteConfig,
    out: &mut impl Write,
) -> std::io::Result<()> {
    let rendered = match config.serial {
        true => paste_serial_to_vec(file_data, config),
        false => paste_parallel_to_vec(file_data, config),
    };
    out.write_all(rendered.as_slice())
}
/// Returns the complete merged output as one buffer, using serial mode when
/// `config.serial` is set and parallel mode otherwise.
pub fn paste_to_vec(file_data: &[&[u8]], config: &PasteConfig) -> Vec<u8> {
    if !config.serial {
        return paste_parallel_to_vec(file_data, config);
    }
    paste_serial_to_vec(file_data, config)
}