use memchr::memchr_iter;
use std::io::{self, BufRead, IoSlice, Write};
const PARALLEL_THRESHOLD: usize = 4 * 1024 * 1024;
const MAX_IOV: usize = 1024;
const SEQ_CHUNK: usize = 8 * 1024 * 1024;
/// Runs `process_fn` over `data` in line-aligned chunks and writes each
/// chunk's output to `out`.
///
/// Inputs of at most `SEQ_CHUNK` bytes are processed in a single call.
/// Larger inputs are cut just after the last `line_delim` inside each
/// `SEQ_CHUNK`-sized window. When a window contains no `line_delim` at all
/// (one line longer than the window) the chunk is extended forward to the end
/// of that line, so a line is never split between two `process_fn` calls.
///
/// `process_fn` receives the chunk and an empty scratch buffer; whatever it
/// leaves in the buffer is written to `out`.
///
/// # Errors
/// Returns the first error reported by `out.write_all`.
fn process_chunked(
    data: &[u8],
    line_delim: u8,
    out: &mut impl Write,
    mut process_fn: impl FnMut(&[u8], &mut Vec<u8>),
) -> io::Result<()> {
    if data.len() <= SEQ_CHUNK {
        let mut buf = Vec::with_capacity(data.len() + 256);
        process_fn(data, &mut buf);
        if !buf.is_empty() {
            out.write_all(&buf)?;
        }
        return Ok(());
    }

    let mut buf = Vec::with_capacity(SEQ_CHUNK * 2);
    let mut start = 0;
    while start < data.len() {
        let end = if data.len() - start <= SEQ_CHUNK {
            data.len()
        } else {
            let window_end = start + SEQ_CHUNK;
            match memchr::memrchr(line_delim, &data[start..window_end]) {
                Some(pos) => start + pos + 1,
                // No terminator inside the window: keep the over-long line
                // intact by extending the chunk to where it actually ends.
                None => memchr::memchr(line_delim, &data[window_end..])
                    .map_or(data.len(), |pos| window_end + pos + 1),
            }
        };
        buf.clear();
        process_fn(&data[start..end], &mut buf);
        if !buf.is_empty() {
            out.write_all(&buf)?;
        }
        start = end;
    }
    Ok(())
}
/// Options for one cut operation, borrowed for the lifetime `'a`.
pub struct CutConfig<'a> {
/// Cutting mode. `CutMode` is declared outside this section.
/// NOTE(review): confirm which variants exist and which code paths honour them.
pub mode: CutMode,
/// Selected ranges. The field fast paths read `ranges.last()` as the highest
/// selected range, so they presumably expect the sorted, merged output of
/// `parse_ranges` — confirm against callers.
pub ranges: &'a [Range],
/// When `true`, the positions *not* covered by `ranges` are selected.
pub complement: bool,
/// Byte that separates fields on an input line.
pub delim: u8,
/// Bytes placed between output fields. Several fast paths are only taken
/// when this is exactly the single byte `delim`.
pub output_delim: &'a [u8],
/// When `true`, lines that contain no `delim` are dropped instead of being
/// copied through unchanged.
pub suppress_no_delim: bool,
/// Byte that terminates a line in the input and in the output.
pub line_delim: u8,
}
/// An inclusive range of 1-based positions (`start..=end`).
///
/// `parse_ranges` rejects a `start` of 0 and stores `usize::MAX` in `end` for
/// an open-ended spec such as `3-`.
#[derive(Debug, Clone)]
pub struct Range {
pub start: usize, pub end: usize, }
/// Parses a comma-separated list such as `1,3-5,7-` into sorted, merged ranges.
///
/// Each item is `N`, `N-M`, `N-` (open end, stored as `usize::MAX`) or `-M`
/// (start defaults to 1). Surrounding whitespace is trimmed and empty items
/// are skipped. Overlapping ranges are always merged; ranges that merely touch
/// (`1-2,3-4`) are merged as well unless `no_merge_adjacent` is `true`.
///
/// # Errors
/// Returns a message when an item is not numeric, uses position 0, is a bare
/// `-`, is decreasing, or when the list contains no ranges at all.
pub fn parse_ranges(spec: &str, no_merge_adjacent: bool) -> Result<Vec<Range>, String> {
    let mut parsed: Vec<Range> = Vec::new();

    for item in spec.split(',').map(str::trim).filter(|s| !s.is_empty()) {
        let range = match item.split_once('-') {
            Some(("", "")) => {
                return Err("invalid range with no endpoint: -".to_string());
            }
            Some((lo, hi)) => {
                let endpoint = |text: &str| {
                    text.parse::<usize>()
                        .map_err(|_| format!("invalid range: '{}'", item))
                };
                let start = if lo.is_empty() { 1 } else { endpoint(lo)? };
                let end = if hi.is_empty() {
                    usize::MAX
                } else {
                    endpoint(hi)?
                };
                if start == 0 {
                    return Err("fields and positions are numbered from 1".to_string());
                }
                if start > end {
                    return Err(format!("invalid decreasing range: '{}'", item));
                }
                Range { start, end }
            }
            None => {
                let n = item
                    .parse::<usize>()
                    .map_err(|_| format!("invalid field: '{}'", item))?;
                if n == 0 {
                    return Err("fields and positions are numbered from 1".to_string());
                }
                Range { start: n, end: n }
            }
        };
        parsed.push(range);
    }

    if parsed.is_empty() {
        return Err("you must specify a list of bytes, characters, or fields".to_string());
    }

    parsed.sort_by(|a, b| a.start.cmp(&b.start).then(a.end.cmp(&b.end)));

    // Highest start position that still counts as touching a range ending at `end`.
    let reach = |end: usize| {
        if no_merge_adjacent {
            end
        } else {
            end.saturating_add(1)
        }
    };

    let mut merged: Vec<Range> = Vec::with_capacity(parsed.len());
    for next in parsed {
        if let Some(prev) = merged
            .last_mut()
            .filter(|prev| next.start <= reach(prev.end))
        {
            prev.end = prev.end.max(next.end);
            continue;
        }
        merged.push(next);
    }
    Ok(merged)
}
/// Reports whether `pos` falls inside one of `ranges`.
///
/// The scan stops at the first range that either starts after `pos` or ends
/// at or after it; `pos` is contained exactly when that range also starts at
/// or before `pos`.
#[inline(always)]
fn in_ranges(ranges: &[Range], pos: usize) -> bool {
    ranges
        .iter()
        .find(|r| pos < r.start || pos <= r.end)
        .map_or(false, |r| r.start <= pos)
}
/// Builds a bitmask for fields 1 through 64: bit `n - 1` is set when field
/// `n` is selected, i.e. when its membership in `ranges` differs from
/// `complement`.
#[inline]
fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
    (1..=64usize)
        .filter(|&field| in_ranges(ranges, field) != complement)
        .fold(0u64, |mask, field| mask | (1u64 << (field - 1)))
}
/// Tells whether the 1-based `field_num` is selected.
///
/// Fields 1 through 64 are answered from `mask` (bit `field_num - 1`); higher
/// fields fall back to a range lookup combined with `complement`.
#[inline(always)]
fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
    if field_num > 64 {
        return in_ranges(ranges, field_num) != complement;
    }
    (mask & (1u64 << (field_num - 1))) != 0
}
/// Appends `data` to `buf` without checking or growing its capacity.
///
/// # Safety
/// `buf` must already have at least `data.len()` bytes of spare capacity
/// (`buf.capacity() - buf.len() >= data.len()`); otherwise the copy writes
/// past the end of the allocation. Debug builds assert this precondition.
#[inline(always)]
unsafe fn buf_extend(buf: &mut Vec<u8>, data: &[u8]) {
    let len = buf.len();
    debug_assert!(
        buf.capacity() - len >= data.len(),
        "buf_extend: {} spare bytes, {} needed",
        buf.capacity() - len,
        data.len()
    );
    // SAFETY: the caller guarantees the spare capacity; `data` is a shared
    // borrow and `buf` a unique one, so the two regions cannot overlap.
    unsafe {
        std::ptr::copy_nonoverlapping(data.as_ptr(), buf.as_mut_ptr().add(len), data.len());
        buf.set_len(len + data.len());
    }
}
/// Appends the single byte `b` to `buf` without checking or growing its
/// capacity.
///
/// # Safety
/// `buf` must already have at least one byte of spare capacity
/// (`buf.len() < buf.capacity()`); otherwise the store writes past the end of
/// the allocation. Debug builds assert this precondition.
#[inline(always)]
unsafe fn buf_push(buf: &mut Vec<u8>, b: u8) {
    let len = buf.len();
    debug_assert!(len < buf.capacity(), "buf_push: no spare capacity");
    // SAFETY: the caller guarantees one spare byte past `len`.
    unsafe {
        *buf.as_mut_ptr().add(len) = b;
        buf.set_len(len + 1);
    }
}
/// Writes every slice in `slices` to `out`, in order, using vectored writes.
///
/// The slices are submitted in batches of at most `MAX_IOV`. A vectored write
/// that fails with `ErrorKind::Interrupted` is retried, matching what
/// `write_all` does for plain writes. When a batch is only partially written,
/// the remainder is finished by `write_ioslices_slow`.
///
/// # Errors
/// Returns `ErrorKind::WriteZero` if a non-empty batch makes no progress, or
/// any other error reported by the writer.
#[inline]
fn write_ioslices(out: &mut impl Write, slices: &[IoSlice]) -> io::Result<()> {
    for batch in slices.chunks(MAX_IOV) {
        let total: usize = batch.iter().map(|s| s.len()).sum();
        let written = loop {
            match out.write_vectored(batch) {
                Ok(n) => break n,
                // Interrupted is non-fatal: nothing was written, so retry.
                Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
                Err(e) => return Err(e),
            }
        };
        if written >= total {
            continue;
        }
        if written == 0 {
            return Err(io::Error::new(io::ErrorKind::WriteZero, "write zero"));
        }
        write_ioslices_slow(out, batch, written)?;
    }
    Ok(())
}
/// Finishes a partially written batch: skips the first `skip` bytes of
/// `slices` (they were already written) and writes everything after them
/// with `write_all`.
#[cold]
#[inline(never)]
fn write_ioslices_slow(
    out: &mut impl Write,
    slices: &[IoSlice],
    mut skip: usize,
) -> io::Result<()> {
    for slice in slices {
        let bytes: &[u8] = slice;
        match bytes.get(skip..) {
            // Part of this slice is still unwritten.
            Some(rest) if !rest.is_empty() => {
                out.write_all(rest)?;
                skip = 0;
            }
            // Slice lies entirely inside the already-written prefix.
            _ => skip -= bytes.len(),
        }
    }
    Ok(())
}
/// Returns the parallelism reported by the standard library, or 1 when it
/// cannot be determined.
#[inline]
fn num_cpus() -> usize {
    match std::thread::available_parallelism() {
        Ok(count) => count.get(),
        Err(_) => 1,
    }
}
/// Splits `data` into roughly equal pieces, one per available CPU, with every
/// piece ending just after a `line_delim` (or at the end of `data`).
///
/// Returns `data` as a single piece when it is shorter than
/// `PARALLEL_THRESHOLD` or only one CPU is available.
fn split_for_scope<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
    let workers = num_cpus().max(1);
    if workers <= 1 || data.len() < PARALLEL_THRESHOLD {
        return vec![data];
    }

    let total = data.len();
    let stride = total / workers;
    let mut pieces = Vec::with_capacity(workers);
    let mut cursor = 0;

    // The last worker takes whatever remains, so only `workers - 1` cuts are made.
    for _ in 1..workers {
        let probe = cursor + stride;
        if probe >= total {
            break;
        }
        let cut = match memchr::memchr(line_delim, &data[probe..]) {
            Some(offset) => probe + offset + 1,
            None => total,
        };
        if cut > cursor {
            pieces.push(&data[cursor..cut]);
        }
        cursor = cut;
    }
    if cursor < total {
        pieces.push(&data[cursor..]);
    }
    pieces
}
/// Cuts several field ranges from every line of `data` and writes the result
/// to `out`.
///
/// Inputs below `PARALLEL_THRESHOLD` go through `process_chunked`. Larger
/// inputs are split with `split_for_scope`, each piece is processed by
/// `multi_select_chunk` inside a rayon scope, and the per-piece buffers are
/// written in input order.
fn process_fields_multi_select(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    ranges: &[Range],
    suppress: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    let max_field = match ranges.last() {
        Some(last) => last.end,
        None => 0,
    };

    if data.len() < PARALLEL_THRESHOLD {
        process_chunked(data, line_delim, out, |chunk, buf| {
            multi_select_chunk(chunk, delim, line_delim, ranges, max_field, suppress, buf);
        })?;
        return Ok(());
    }

    let pieces = split_for_scope(data, line_delim);
    let mut outputs: Vec<Vec<u8>> = pieces.iter().map(|_| Vec::new()).collect();
    rayon::scope(|s| {
        for (result, chunk) in outputs.iter_mut().zip(pieces.iter().copied()) {
            s.spawn(move |_| {
                result.reserve(chunk.len() * 3 / 4);
                multi_select_chunk(
                    chunk, delim, line_delim, ranges, max_field, suppress, result,
                );
            });
        }
    });

    let slices: Vec<IoSlice> = outputs
        .iter()
        .filter(|piece| !piece.is_empty())
        .map(|piece| IoSlice::new(piece))
        .collect();
    write_ioslices(out, &slices)?;
    Ok(())
}
/// Applies a multi-range field selection to every line of `data`, appending
/// the result to `buf`.
///
/// When the highest selected field is at most 64 and `delim` differs from
/// `line_delim`, the ranges are folded into a 64-bit mask and the work is
/// delegated to `multi_select_chunk_bitmask` (`max_field <= 8`) or
/// `multi_select_twolevel`. Otherwise each line is handed to
/// `multi_select_line_fast`.
fn multi_select_chunk(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    ranges: &[Range],
    max_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    if max_field <= 64 && delim != line_delim {
        let mut mask: u64 = 0;
        for r in ranges {
            let s = r.start.max(1);
            let e = r.end.min(64);
            for f in s..=e {
                mask |= 1u64 << (f - 1);
            }
        }
        if max_field <= 8 {
            multi_select_chunk_bitmask(data, delim, line_delim, mask, max_field, suppress, buf);
        } else {
            multi_select_twolevel(data, delim, line_delim, mask, max_field, suppress, buf);
        }
        return;
    }

    // An unterminated final line is emitted with a synthesised `line_delim`,
    // so the output can be one byte longer than the input. The per-line
    // writer may use unchecked writes, hence the extra byte must be reserved.
    buf.reserve(data.len() + 1);
    let base = data.as_ptr();
    let max_delims = max_field.min(128);
    let mut start = 0;
    for end_pos in memchr_iter(line_delim, data) {
        multi_select_line_fast(
            &data[start..end_pos],
            delim,
            line_delim,
            ranges,
            max_delims,
            suppress,
            buf,
            start,
            base,
        );
        start = end_pos + 1;
    }
    if start < data.len() {
        multi_select_line_fast(
            &data[start..],
            delim,
            line_delim,
            ranges,
            max_delims,
            suppress,
            buf,
            start,
            base,
        );
    }
}
/// Single-pass field selection for one chunk, driven by a 64-bit field mask.
///
/// Scans `data` once for both `delim` and `line_delim` and copies every field
/// whose bit is set in `mask` (bit `n - 1` stands for field `n`) into the
/// spare capacity of `buf`. Selected fields are joined with `delim` and every
/// emitted line ends with `line_delim`. Lines without any `delim` are copied
/// whole, or dropped when `suppress` is set.
///
/// The end of `data` is treated as a line end whenever a line is still
/// pending. That includes an unterminated last line that ends with a field
/// delimiter (such as `a:`), which still gets its line terminator and its
/// empty last field.
///
/// `max_field` must not exceed 64: it bounds the shift used to test `mask`.
fn multi_select_chunk_bitmask(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    mask: u64,
    max_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    // Each line produces at most its own length plus one terminator byte, so
    // the whole output fits in `data.len() + 1` bytes.
    buf.reserve(data.len() + 1);
    let initial_len = buf.len();
    // SAFETY: `initial_len <= capacity`, so the pointer stays in the allocation.
    let out_base = unsafe { buf.as_mut_ptr().add(initial_len) };
    let src = data.as_ptr();
    let mut wp: usize = 0;
    let mut field_num: usize = 1;
    let mut field_start: usize = 0;
    let mut first_output = true;
    let mut has_delim = false;

    // `data.len()` is appended as a sentinel so the final, unterminated line
    // goes through the same end-of-line code as every other line.
    let stops = memchr::memchr2_iter(delim, line_delim, data).chain(std::iter::once(data.len()));
    for pos in stops {
        let at_eof = pos == data.len();
        if at_eof && !has_delim && field_start == data.len() {
            // Nothing is pending after the last terminator.
            break;
        }

        if !at_eof && data[pos] != line_delim {
            // Field delimiter: emit the field that just ended if selected.
            has_delim = true;
            if field_num <= max_field && (mask & (1u64 << (field_num - 1))) != 0 {
                let len = pos - field_start;
                // SAFETY: writes stay within the `data.len() + 1` reservation.
                unsafe {
                    if !first_output {
                        *out_base.add(wp) = delim;
                        wp += 1;
                    }
                    std::ptr::copy_nonoverlapping(src.add(field_start), out_base.add(wp), len);
                }
                wp += len;
                first_output = false;
            }
            field_num += 1;
            field_start = pos + 1;
            continue;
        }

        // End of line (terminator or end of data).
        let emit_last_field =
            has_delim && field_num <= 64 && (mask & (1u64 << (field_num - 1))) != 0;
        let copy_whole_line = !has_delim && !suppress;
        if emit_last_field || copy_whole_line {
            let len = pos - field_start;
            // SAFETY: writes stay within the `data.len() + 1` reservation.
            unsafe {
                if emit_last_field && !first_output {
                    *out_base.add(wp) = delim;
                    wp += 1;
                }
                std::ptr::copy_nonoverlapping(src.add(field_start), out_base.add(wp), len);
            }
            wp += len;
        }
        if has_delim || !suppress {
            // SAFETY: as above; this is the line's single terminator byte.
            unsafe {
                *out_base.add(wp) = line_delim;
            }
            wp += 1;
        }
        field_num = 1;
        field_start = pos + 1;
        first_output = true;
        has_delim = false;
    }

    debug_assert!(wp <= data.len() + 1);
    // SAFETY: exactly `wp` bytes past `initial_len` were initialised above.
    unsafe {
        buf.set_len(initial_len + wp);
    }
}
/// Two-level field selection for one chunk: the outer scan finds line ends,
/// the inner scan walks the field delimiters of each line.
///
/// Fields whose bit is set in `mask` (bit `n - 1` stands for field `n`) are
/// copied into the spare capacity of `buf`, joined with `delim`, and every
/// emitted line ends with `line_delim`. Lines without any `delim` are copied
/// whole, or dropped when `suppress` is set. The inner scan stops once the
/// field number exceeds `max_field`.
fn multi_select_twolevel(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    mask: u64,
    max_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    buf.reserve(data.len() + 1);
    let initial_len = buf.len();
    let out_base = unsafe { buf.as_mut_ptr().add(initial_len) };
    let mut wp: usize = 0;
    let mut line_start: usize = 0;

    for nl_pos in memchr_iter(line_delim, data) {
        wp = unsafe {
            twolevel_emit_line(
                &data[line_start..nl_pos],
                delim,
                line_delim,
                mask,
                max_field,
                suppress,
                out_base,
                wp,
            )
        };
        line_start = nl_pos + 1;
    }
    // Trailing line without a terminator.
    if line_start < data.len() {
        wp = unsafe {
            twolevel_emit_line(
                &data[line_start..],
                delim,
                line_delim,
                mask,
                max_field,
                suppress,
                out_base,
                wp,
            )
        };
    }

    debug_assert!(
        wp <= data.len() + 1,
        "wp={} exceeded reservation data.len()+1={}",
        wp,
        data.len() + 1
    );
    unsafe {
        buf.set_len(initial_len + wp);
    }
}

/// Writes the selected fields of one `line` at `out_base + wp` and returns
/// the advanced write position.
///
/// # Safety
/// `out_base` must be valid for writes of at least `line.len() + 1` bytes
/// starting at offset `wp`.
#[inline(always)]
unsafe fn twolevel_emit_line(
    line: &[u8],
    delim: u8,
    line_delim: u8,
    mask: u64,
    max_field: usize,
    suppress: bool,
    out_base: *mut u8,
    mut wp: usize,
) -> usize {
    let src = line.as_ptr();
    let mut field_num: usize = 1;
    let mut field_start: usize = 0;
    let mut wrote_field = false;
    let mut saw_delim = false;

    for dp in memchr::memchr_iter(delim, line) {
        saw_delim = true;
        if (mask >> (field_num - 1)) & 1 == 1 {
            let flen = dp - field_start;
            unsafe {
                if wrote_field {
                    *out_base.add(wp) = delim;
                    wp += 1;
                }
                std::ptr::copy_nonoverlapping(src.add(field_start), out_base.add(wp), flen);
            }
            wp += flen;
            wrote_field = true;
        }
        field_num += 1;
        field_start = dp + 1;
        if field_num > max_field {
            break;
        }
    }

    if !saw_delim {
        // No delimiter: the line is either passed through or dropped.
        if suppress {
            return wp;
        }
        unsafe {
            std::ptr::copy_nonoverlapping(src, out_base.add(wp), line.len());
        }
        wp += line.len();
    } else if field_num <= 64 && (mask >> (field_num - 1)) & 1 == 1 {
        // Last field of the line, running up to the line end.
        let flen = line.len() - field_start;
        unsafe {
            if wrote_field {
                *out_base.add(wp) = delim;
                wp += 1;
            }
            std::ptr::copy_nonoverlapping(src.add(field_start), out_base.add(wp), flen);
        }
        wp += flen;
    }

    unsafe {
        *out_base.add(wp) = line_delim;
    }
    wp + 1
}
/// Emits the fields of `line` selected by `ranges`, joined with `delim` and
/// followed by `line_delim`.
///
/// `ranges` must be sorted and non-overlapping, as produced by
/// `parse_ranges`. An empty line or a line without any `delim` is copied
/// through unchanged (plus `line_delim`) unless `suppress` is set, in which
/// case nothing is written.
///
/// The line is walked field by field alongside a cursor into `ranges`, so
/// there is no limit on the field numbers that can be selected. The scan
/// stops as soon as the last range has been passed.
///
/// `max_delims`, `_line_abs_start` and `_data_base` are accepted for
/// call-site compatibility and are not used. In particular the capped
/// `max_delims` cannot express a highest field above 128, so the stopping
/// point is derived from `ranges` instead.
#[inline(always)]
fn multi_select_line_fast(
    line: &[u8],
    delim: u8,
    line_delim: u8,
    ranges: &[Range],
    max_delims: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
    _line_abs_start: usize,
    _data_base: *const u8,
) {
    let _ = max_delims;

    if line.is_empty() {
        if !suppress {
            buf.push(line_delim);
        }
        return;
    }

    let mut delims = memchr_iter(delim, line);
    let mut range_idx = 0usize;
    let mut field_num = 1usize;
    let mut field_start = 0usize;
    let mut first_output = true;
    let mut saw_delim = false;

    loop {
        let (field_end, is_last_field) = match delims.next() {
            Some(pos) => {
                saw_delim = true;
                (pos, false)
            }
            None => (line.len(), true),
        };

        if !saw_delim {
            // The very first lookup found no delimiter at all.
            if !suppress {
                buf.extend_from_slice(line);
                buf.push(line_delim);
            }
            return;
        }

        // Drop ranges that end before the current field.
        while range_idx < ranges.len() && ranges[range_idx].end < field_num {
            range_idx += 1;
        }
        if range_idx == ranges.len() {
            // Past the last selected field: nothing more can match.
            break;
        }
        if ranges[range_idx].start <= field_num {
            if !first_output {
                buf.push(delim);
            }
            buf.extend_from_slice(&line[field_start..field_end]);
            first_output = false;
        }

        if is_last_field {
            break;
        }
        field_num += 1;
        field_start = field_end + 1;
    }

    buf.push(line_delim);
}
/// Entry point for field cutting: picks the most specialised routine for the
/// configured selection and falls back to the generic per-line extractor.
///
/// Specialised routines exist for a single field, the complement of a single
/// field or of one bounded range, a prefix (`1-N`), a suffix (`N-`), one
/// bounded middle range, and several bounded ranges. All except the plain
/// single-field case require the output delimiter to be exactly `delim`.
fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
    let delim = cfg.delim;
    let line_delim = cfg.line_delim;
    let ranges = cfg.ranges;
    let complement = cfg.complement;
    let output_delim = cfg.output_delim;
    let suppress = cfg.suppress_no_delim;

    // True when fields are re-joined with the very byte they were split on.
    let same_delim = matches!(output_delim, [d] if *d == delim);

    if let [only] = ranges {
        let (first, last) = (only.start, only.end);
        let bounded = last < usize::MAX;
        if !complement {
            if first == last {
                return process_single_field(data, delim, line_delim, first, suppress, out);
            }
            if same_delim {
                if first == 1 && bounded {
                    return process_fields_prefix(data, delim, line_delim, last, suppress, out);
                }
                if first > 1 && !bounded {
                    return process_fields_suffix(data, delim, line_delim, first, suppress, out);
                }
                if first > 1 && bounded {
                    return process_fields_mid_range(
                        data, delim, line_delim, first, last, suppress, out,
                    );
                }
            }
        } else if same_delim {
            if first == last {
                return process_complement_single_field(
                    data, delim, line_delim, first, suppress, out,
                );
            }
            if first > 1 && bounded {
                return process_complement_range(
                    data, delim, line_delim, first, last, suppress, out,
                );
            }
        }
    } else if !complement
        && ranges.len() > 1
        && ranges.last().map_or(false, |r| r.end < usize::MAX)
        && same_delim
        && delim != line_delim
    {
        return process_fields_multi_select(data, delim, line_delim, ranges, suppress, out);
    }

    // Generic path.
    let max_field = if complement {
        usize::MAX
    } else {
        ranges.last().map_or(0, |r| r.end)
    };
    let field_mask = compute_field_mask(ranges, complement);

    if data.len() < PARALLEL_THRESHOLD {
        process_chunked(data, line_delim, out, |chunk, buf| {
            process_fields_chunk(
                chunk,
                delim,
                ranges,
                output_delim,
                suppress,
                max_field,
                field_mask,
                line_delim,
                complement,
                buf,
            );
        })?;
        return Ok(());
    }

    let pieces = split_for_scope(data, line_delim);
    let mut outputs: Vec<Vec<u8>> = pieces.iter().map(|_| Vec::new()).collect();
    rayon::scope(|s| {
        for (result, chunk) in outputs.iter_mut().zip(pieces.iter().copied()) {
            s.spawn(move |_| {
                result.reserve(chunk.len() + 1);
                process_fields_chunk(
                    chunk,
                    delim,
                    ranges,
                    output_delim,
                    suppress,
                    max_field,
                    field_mask,
                    line_delim,
                    complement,
                    result,
                );
            });
        }
    });
    let slices: Vec<IoSlice> = outputs
        .iter()
        .filter(|piece| !piece.is_empty())
        .map(|piece| IoSlice::new(piece))
        .collect();
    write_ioslices(out, &slices)?;
    Ok(())
}
/// Runs the generic extractor `extract_fields_to_buf` on every line of
/// `data`, including a trailing line without `line_delim`.
///
/// Space for `data.len()` bytes is reserved up front only when `delim`
/// differs from `line_delim`.
fn process_fields_chunk(
    data: &[u8],
    delim: u8,
    ranges: &[Range],
    output_delim: &[u8],
    suppress: bool,
    max_field: usize,
    field_mask: u64,
    line_delim: u8,
    complement: bool,
    buf: &mut Vec<u8>,
) {
    if delim != line_delim {
        buf.reserve(data.len());
    }

    let mut line_start = 0;
    for line_end in memchr_iter(line_delim, data) {
        extract_fields_to_buf(
            &data[line_start..line_end],
            delim,
            ranges,
            output_delim,
            suppress,
            max_field,
            field_mask,
            line_delim,
            buf,
            complement,
        );
        line_start = line_end + 1;
    }

    if line_start < data.len() {
        extract_fields_to_buf(
            &data[line_start..],
            delim,
            ranges,
            output_delim,
            suppress,
            max_field,
            field_mask,
            line_delim,
            buf,
            complement,
        );
    }
}
/// Extracts the single 1-based field `target` from every line of `data`.
///
/// Field 1 with distinct delimiters and no suppression has its own routines
/// (`single_field1_parallel` / `single_field1_to_buf`). Every other case runs
/// `process_single_field_chunk`, either directly or across rayon workers once
/// `data` reaches `PARALLEL_THRESHOLD`. Only the buffer-size estimates differ
/// between distinct and identical field/line delimiters.
fn process_single_field(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    target: usize,
    suppress: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    let target_idx = target - 1;
    let distinct_delims = delim != line_delim;
    let parallel = data.len() >= PARALLEL_THRESHOLD;

    if distinct_delims && target_idx == 0 && !suppress {
        if parallel {
            return single_field1_parallel(data, delim, line_delim, out);
        }
        return process_chunked(data, line_delim, out, |chunk, buf| {
            single_field1_to_buf(chunk, delim, line_delim, buf);
        });
    }

    if !parallel {
        let capacity = if distinct_delims {
            data.len().min(4 * 1024 * 1024)
        } else {
            data.len() / 4
        };
        let mut buf = Vec::with_capacity(capacity);
        process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
        if !buf.is_empty() {
            out.write_all(&buf)?;
        }
        return Ok(());
    }

    // Output-size guess per piece: half the input for distinct delimiters,
    // a quarter otherwise.
    let reserve_divisor = if distinct_delims { 2 } else { 4 };
    let pieces = split_for_scope(data, line_delim);
    let mut outputs: Vec<Vec<u8>> = pieces.iter().map(|_| Vec::new()).collect();
    rayon::scope(|s| {
        for (result, chunk) in outputs.iter_mut().zip(pieces.iter().copied()) {
            s.spawn(move |_| {
                result.reserve(chunk.len() / reserve_divisor);
                process_single_field_chunk(
                    chunk, delim, target_idx, line_delim, suppress, result,
                );
            });
        }
    });
    let slices: Vec<IoSlice> = outputs
        .iter()
        .filter(|piece| !piece.is_empty())
        .map(|piece| IoSlice::new(piece))
        .collect();
    write_ioslices(out, &slices)?;
    Ok(())
}
/// Writes every line of `data` with fields `skip_start..=skip_end` removed.
///
/// Inputs below `PARALLEL_THRESHOLD` go through `process_chunked`; larger
/// ones are split with `split_for_scope`, processed by
/// `complement_range_chunk` inside a rayon scope, and written in input order.
fn process_complement_range(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    skip_start: usize,
    skip_end: usize,
    suppress: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    if data.len() < PARALLEL_THRESHOLD {
        process_chunked(data, line_delim, out, |chunk, buf| {
            complement_range_chunk(
                chunk, delim, skip_start, skip_end, line_delim, suppress, buf,
            );
        })?;
        return Ok(());
    }

    let pieces = split_for_scope(data, line_delim);
    let mut outputs: Vec<Vec<u8>> = pieces.iter().map(|_| Vec::new()).collect();
    rayon::scope(|s| {
        for (result, chunk) in outputs.iter_mut().zip(pieces.iter().copied()) {
            s.spawn(move |_| {
                result.reserve(chunk.len());
                complement_range_chunk(
                    chunk, delim, skip_start, skip_end, line_delim, suppress, result,
                );
            });
        }
    });
    let slices: Vec<IoSlice> = outputs
        .iter()
        .filter(|piece| !piece.is_empty())
        .map(|piece| IoSlice::new(piece))
        .collect();
    write_ioslices(out, &slices)?;
    Ok(())
}
/// Applies `complement_range_line` to every line of `data`, including a
/// trailing line without `line_delim`, appending the output to `buf`.
fn complement_range_chunk(
    data: &[u8],
    delim: u8,
    skip_start: usize,
    skip_end: usize,
    line_delim: u8,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    // A line never grows, but an unterminated final line gets a synthesised
    // `line_delim`, so the output can be one byte longer than the input. The
    // per-line writer may use unchecked writes, so that byte must be reserved.
    buf.reserve(data.len() + 1);
    let mut start = 0;
    for end_pos in memchr_iter(line_delim, data) {
        let line = &data[start..end_pos];
        complement_range_line(line, delim, skip_start, skip_end, line_delim, suppress, buf);
        start = end_pos + 1;
    }
    if start < data.len() {
        complement_range_line(
            &data[start..],
            delim,
            skip_start,
            skip_end,
            line_delim,
            suppress,
            buf,
        );
    }
}
#[inline(always)]
fn complement_range_line(
line: &[u8],
delim: u8,
skip_start: usize,
skip_end: usize,
line_delim: u8,
suppress: bool,
buf: &mut Vec<u8>,
) {
let len = line.len();
if len == 0 {
if !suppress {
unsafe { buf_push(buf, line_delim) };
}
return;
}
let base = line.as_ptr();
let need_prefix_delims = skip_start - 1; let need_skip_delims = skip_end - skip_start + 1; let total_need = need_prefix_delims + need_skip_delims;
let mut delim_count: usize = 0;
let mut prefix_end_pos: usize = usize::MAX; let mut suffix_start_pos: usize = usize::MAX;
for pos in memchr_iter(delim, line) {
delim_count += 1;
if delim_count == need_prefix_delims {
prefix_end_pos = pos;
}
if delim_count == total_need {
suffix_start_pos = pos + 1;
break;
}
}
if delim_count == 0 {
if !suppress {
unsafe {
buf_extend(buf, line);
buf_push(buf, line_delim);
}
}
return;
}
if delim_count < need_prefix_delims {
unsafe {
buf_extend(buf, line);
buf_push(buf, line_delim);
}
return;
}
let has_prefix = need_prefix_delims > 0;
let has_suffix = suffix_start_pos != usize::MAX && suffix_start_pos < len;
if has_prefix && has_suffix {
unsafe {
buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
buf_push(buf, delim);
buf_extend(
buf,
std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
);
buf_push(buf, line_delim);
}
} else if has_prefix {
unsafe {
buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
buf_push(buf, line_delim);
}
} else if has_suffix {
unsafe {
buf_extend(
buf,
std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
);
buf_push(buf, line_delim);
}
} else {
unsafe { buf_push(buf, line_delim) };
}
}
/// Writes every line of `data` with the 1-based field `skip_field` removed.
///
/// Inputs below `PARALLEL_THRESHOLD` go through `process_chunked`; larger
/// ones are split with `split_for_scope`, processed by
/// `complement_single_field_chunk` inside a rayon scope, and written in input
/// order.
fn process_complement_single_field(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    skip_field: usize,
    suppress: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    let skip_idx = skip_field - 1;

    if data.len() < PARALLEL_THRESHOLD {
        process_chunked(data, line_delim, out, |chunk, buf| {
            complement_single_field_chunk(chunk, delim, skip_idx, line_delim, suppress, buf);
        })?;
        return Ok(());
    }

    let pieces = split_for_scope(data, line_delim);
    let mut outputs: Vec<Vec<u8>> = pieces.iter().map(|_| Vec::new()).collect();
    rayon::scope(|s| {
        for (result, chunk) in outputs.iter_mut().zip(pieces.iter().copied()) {
            s.spawn(move |_| {
                result.reserve(chunk.len());
                complement_single_field_chunk(
                    chunk, delim, skip_idx, line_delim, suppress, result,
                );
            });
        }
    });
    let slices: Vec<IoSlice> = outputs
        .iter()
        .filter(|piece| !piece.is_empty())
        .map(|piece| IoSlice::new(piece))
        .collect();
    write_ioslices(out, &slices)?;
    Ok(())
}
/// Applies `complement_single_field_line` to every line of `data`, including
/// a trailing line without `line_delim`, appending the output to `buf`.
fn complement_single_field_chunk(
    data: &[u8],
    delim: u8,
    skip_idx: usize,
    line_delim: u8,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    // A line never grows, but an unterminated final line gets a synthesised
    // `line_delim`, so the output can be one byte longer than the input. The
    // per-line writer uses unchecked writes, so that byte must be reserved.
    buf.reserve(data.len() + 1);
    let mut start = 0;
    for end_pos in memchr_iter(line_delim, data) {
        let line = &data[start..end_pos];
        complement_single_field_line(line, delim, skip_idx, line_delim, suppress, buf);
        start = end_pos + 1;
    }
    if start < data.len() {
        complement_single_field_line(&data[start..], delim, skip_idx, line_delim, suppress, buf);
    }
}
/// Writes `line` without the field at 0-based index `skip_idx`, followed by
/// `line_delim`.
///
/// An empty line or a line without any `delim` is copied through unchanged
/// unless `suppress` is set. A line with too few fields to reach `skip_idx`
/// is copied through unchanged. Otherwise the part before the skipped field
/// and the part after it are joined with one `delim`.
///
/// Uses the unchecked `buf_extend` / `buf_push` helpers: `buf` must already
/// have room for `line.len() + 1` more bytes.
#[inline(always)]
fn complement_single_field_line(
    line: &[u8],
    delim: u8,
    skip_idx: usize,
    line_delim: u8,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    if line.is_empty() {
        if !suppress {
            unsafe { buf_push(buf, line_delim) };
        }
        return;
    }

    let delims_before = skip_idx;
    let delims_through = skip_idx + 1;
    let mut seen: usize = 0;
    // First byte of the skipped field (set only when it is not field 1).
    let mut skipped_from: Option<usize> = None;
    // Delimiter that terminates the skipped field, if any follows it.
    let mut skipped_until: Option<usize> = None;
    for pos in memchr_iter(delim, line) {
        seen += 1;
        if seen == delims_before {
            skipped_from = Some(pos + 1);
        }
        if seen == delims_through {
            skipped_until = Some(pos);
            break;
        }
    }

    if seen == 0 {
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }
    if seen < delims_before {
        unsafe {
            buf_extend(buf, line);
            buf_push(buf, line_delim);
        }
        return;
    }

    unsafe {
        if let Some(from) = skipped_from {
            // Everything up to, but excluding, the delimiter before the field.
            buf_extend(buf, &line[..from - 1]);
        }
        if let Some(until) = skipped_until {
            if skipped_from.is_some() {
                buf_push(buf, delim);
            }
            buf_extend(buf, &line[until + 1..]);
        }
        buf_push(buf, line_delim);
    }
}
/// Writes fields `1..=last_field` of every line of `data`.
///
/// Below `PARALLEL_THRESHOLD` the zero-copy writer `fields_prefix_zerocopy`
/// is used when lines without a delimiter are kept, and `process_chunked`
/// with `fields_prefix_chunk` when they are suppressed. Larger inputs are
/// split with `split_for_scope`, processed inside a rayon scope, and written
/// in input order.
fn process_fields_prefix(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    last_field: usize,
    suppress: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    if data.len() < PARALLEL_THRESHOLD {
        if suppress {
            process_chunked(data, line_delim, out, |chunk, buf| {
                fields_prefix_chunk(chunk, delim, line_delim, last_field, suppress, buf);
            })?;
        } else {
            fields_prefix_zerocopy(data, delim, line_delim, last_field, out)?;
        }
        return Ok(());
    }

    let pieces = split_for_scope(data, line_delim);
    let mut outputs: Vec<Vec<u8>> = pieces.iter().map(|_| Vec::new()).collect();
    rayon::scope(|s| {
        for (result, chunk) in outputs.iter_mut().zip(pieces.iter().copied()) {
            s.spawn(move |_| {
                result.reserve(chunk.len());
                fields_prefix_chunk(chunk, delim, line_delim, last_field, suppress, result);
            });
        }
    });
    let slices: Vec<IoSlice> = outputs
        .iter()
        .filter(|piece| !piece.is_empty())
        .map(|piece| IoSlice::new(piece))
        .collect();
    write_ioslices(out, &slices)?;
    Ok(())
}
/// Writes fields `1..=last_field` of every line straight from `data`, without
/// copying into an intermediate buffer.
///
/// Lines that already have at most `last_field` fields are left inside a
/// contiguous pass-through run of `data`. A line with more fields ends the
/// current run and is emitted as its truncated prefix plus a `line_delim`.
/// Slices are queued and flushed through `write_ioslices` shortly before the
/// queue reaches `MAX_IOV`. A final line without `line_delim` gets one
/// appended.
#[inline]
fn fields_prefix_zerocopy(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    last_field: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    let terminator: [u8; 1] = [line_delim];
    // 0-based index of the delimiter that closes field `last_field`.
    let cut_index = last_field.saturating_sub(1);
    let mut pending: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
    let mut line_start = 0;
    // Start of the run of untouched lines not yet queued.
    let mut passthrough_from: usize = 0;

    for line_end in memchr_iter(line_delim, data) {
        let cut = memchr_iter(delim, &data[line_start..line_end]).nth(cut_index);
        if let Some(offset) = cut {
            if passthrough_from < line_start {
                pending.push(IoSlice::new(&data[passthrough_from..line_start]));
            }
            pending.push(IoSlice::new(&data[line_start..line_start + offset]));
            pending.push(IoSlice::new(&terminator));
            passthrough_from = line_end + 1;
            // Up to three slices are queued per line; flush before overflow.
            if pending.len() >= MAX_IOV - 2 {
                write_ioslices(out, &pending)?;
                pending.clear();
            }
        }
        line_start = line_end + 1;
    }

    if line_start < data.len() {
        let cut = memchr_iter(delim, &data[line_start..]).nth(cut_index);
        if let Some(offset) = cut {
            if passthrough_from < line_start {
                pending.push(IoSlice::new(&data[passthrough_from..line_start]));
            }
            pending.push(IoSlice::new(&data[line_start..line_start + offset]));
            pending.push(IoSlice::new(&terminator));
            write_ioslices(out, &pending)?;
            return Ok(());
        }
    }

    if passthrough_from < data.len() {
        pending.push(IoSlice::new(&data[passthrough_from..]));
        if data.last() != Some(&line_delim) {
            pending.push(IoSlice::new(&terminator));
        }
    }
    if !pending.is_empty() {
        write_ioslices(out, &pending)?;
    }
    Ok(())
}
/// Applies `fields_prefix_line` to every line of `data`, including a trailing
/// line without `line_delim`, appending the output to `buf`.
fn fields_prefix_chunk(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    last_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    // A line never grows, but an unterminated final line gets a synthesised
    // `line_delim`, so the output can be one byte longer than the input. The
    // per-line writer uses unchecked writes, so that byte must be reserved.
    buf.reserve(data.len() + 1);
    let mut start = 0;
    for end_pos in memchr_iter(line_delim, data) {
        let line = &data[start..end_pos];
        fields_prefix_line(line, delim, line_delim, last_field, suppress, buf);
        start = end_pos + 1;
    }
    if start < data.len() {
        fields_prefix_line(&data[start..], delim, line_delim, last_field, suppress, buf);
    }
}
/// Writes fields `1..=last_field` of `line`, followed by `line_delim`.
///
/// An empty line or a line without any `delim` is copied through unchanged
/// unless `suppress` is set. A line with at most `last_field` fields is
/// copied whole; a longer one is cut just before the delimiter that follows
/// field `last_field`.
///
/// Uses the unchecked `buf_extend` / `buf_push` helpers: `buf` must already
/// have room for `line.len() + 1` more bytes.
#[inline(always)]
fn fields_prefix_line(
    line: &[u8],
    delim: u8,
    line_delim: u8,
    last_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    if line.is_empty() {
        if !suppress {
            unsafe { buf_push(buf, line_delim) };
        }
        return;
    }

    let mut delims = memchr_iter(delim, line);
    let first = match delims.next() {
        Some(pos) => pos,
        None => {
            if !suppress {
                unsafe {
                    buf_extend(buf, line);
                    buf_push(buf, line_delim);
                }
            }
            return;
        }
    };

    // The cut is at delimiter number `last_field`; the first delimiter has
    // already been consumed from the iterator.
    let cut = if last_field <= 1 {
        Some(first)
    } else {
        delims.nth(last_field - 2)
    };
    let kept = match cut {
        Some(pos) => &line[..pos],
        None => line,
    };
    unsafe {
        buf_extend(buf, kept);
        buf_push(buf, line_delim);
    }
}
/// Writes fields `start_field..` of every line of `data`.
///
/// Inputs below `PARALLEL_THRESHOLD` go through `process_chunked`; larger
/// ones are split with `split_for_scope`, processed by `fields_suffix_chunk`
/// inside a rayon scope, and written in input order.
fn process_fields_suffix(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    start_field: usize,
    suppress: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    if data.len() < PARALLEL_THRESHOLD {
        process_chunked(data, line_delim, out, |chunk, buf| {
            fields_suffix_chunk(chunk, delim, line_delim, start_field, suppress, buf);
        })?;
        return Ok(());
    }

    let pieces = split_for_scope(data, line_delim);
    let mut outputs: Vec<Vec<u8>> = pieces.iter().map(|_| Vec::new()).collect();
    rayon::scope(|s| {
        for (result, chunk) in outputs.iter_mut().zip(pieces.iter().copied()) {
            s.spawn(move |_| {
                result.reserve(chunk.len());
                fields_suffix_chunk(chunk, delim, line_delim, start_field, suppress, result);
            });
        }
    });
    let slices: Vec<IoSlice> = outputs
        .iter()
        .filter(|piece| !piece.is_empty())
        .map(|piece| IoSlice::new(piece))
        .collect();
    write_ioslices(out, &slices)?;
    Ok(())
}
/// Applies `fields_suffix_line` to every line of `data`, including a trailing
/// line without `line_delim`, appending the output to `buf`.
fn fields_suffix_chunk(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    start_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    // A line never grows, but an unterminated final line gets a synthesised
    // `line_delim`, so the output can be one byte longer than the input. The
    // per-line writer uses unchecked writes, so that byte must be reserved.
    buf.reserve(data.len() + 1);
    let mut start = 0;
    for end_pos in memchr_iter(line_delim, data) {
        let line = &data[start..end_pos];
        fields_suffix_line(line, delim, line_delim, start_field, suppress, buf);
        start = end_pos + 1;
    }
    if start < data.len() {
        fields_suffix_line(
            &data[start..],
            delim,
            line_delim,
            start_field,
            suppress,
            buf,
        );
    }
}
/// Writes fields `start_field..` of `line`, followed by `line_delim`.
///
/// An empty line or a line without any `delim` is copied through unchanged
/// unless `suppress` is set. A line with delimiters but fewer than
/// `start_field` fields produces just `line_delim`.
///
/// Uses the unchecked `buf_extend` / `buf_push` helpers: `buf` must already
/// have room for `line.len() + 1` more bytes.
#[inline(always)]
fn fields_suffix_line(
    line: &[u8],
    delim: u8,
    line_delim: u8,
    start_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    if line.is_empty() {
        if !suppress {
            unsafe { buf_push(buf, line_delim) };
        }
        return;
    }

    // Number of delimiters that precede the first kept field.
    let skip_delims = start_field - 1;
    let mut delims = memchr_iter(delim, line);
    let first = match delims.next() {
        Some(pos) => pos,
        None => {
            if !suppress {
                unsafe {
                    buf_extend(buf, line);
                    buf_push(buf, line_delim);
                }
            }
            return;
        }
    };

    // The first delimiter has already been consumed from the iterator.
    let boundary = if skip_delims <= 1 {
        Some(first)
    } else {
        delims.nth(skip_delims - 2)
    };
    unsafe {
        if let Some(pos) = boundary {
            buf_extend(buf, &line[pos + 1..]);
        }
        buf_push(buf, line_delim);
    }
}
/// Writes fields `start_field..=end_field` of every line of `data`.
///
/// Inputs below `PARALLEL_THRESHOLD` go through `process_chunked`; larger
/// ones are split with `split_for_scope`, processed by
/// `fields_mid_range_chunk` inside a rayon scope, and written in input order.
fn process_fields_mid_range(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    start_field: usize,
    end_field: usize,
    suppress: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    if data.len() < PARALLEL_THRESHOLD {
        process_chunked(data, line_delim, out, |chunk, buf| {
            fields_mid_range_chunk(
                chunk,
                delim,
                line_delim,
                start_field,
                end_field,
                suppress,
                buf,
            );
        })?;
        return Ok(());
    }

    let pieces = split_for_scope(data, line_delim);
    let mut outputs: Vec<Vec<u8>> = pieces.iter().map(|_| Vec::new()).collect();
    rayon::scope(|s| {
        for (result, chunk) in outputs.iter_mut().zip(pieces.iter().copied()) {
            s.spawn(move |_| {
                result.reserve(chunk.len());
                fields_mid_range_chunk(
                    chunk,
                    delim,
                    line_delim,
                    start_field,
                    end_field,
                    suppress,
                    result,
                );
            });
        }
    });
    let slices: Vec<IoSlice> = outputs
        .iter()
        .filter(|piece| !piece.is_empty())
        .map(|piece| IoSlice::new(piece))
        .collect();
    write_ioslices(out, &slices)?;
    Ok(())
}
/// Applies the `start_field..=end_field` cut to every line of `data`.
///
/// Lines are separated by `line_delim`; a final unterminated line is handled
/// too. Each line is forwarded to [`fields_mid_range_line`], which appends its
/// result (terminated with `line_delim`) to `buf`.
fn fields_mid_range_chunk(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    start_field: usize,
    end_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    // An unterminated last line is echoed with a terminator added, so the
    // output can be one byte longer than the input. The per-line helper
    // appends through the unsafe `buf_push`/`buf_extend` helpers (presumably
    // unchecked; confirm against their definition), so reserve that extra
    // byte, as `single_field1_to_buf` does.
    buf.reserve(data.len() + 1);

    let mut line_start = 0;
    for newline in memchr_iter(line_delim, data) {
        fields_mid_range_line(
            &data[line_start..newline],
            delim,
            line_delim,
            start_field,
            end_field,
            suppress,
            buf,
        );
        line_start = newline + 1;
    }
    // Trailing line that is not terminated by `line_delim`.
    if line_start < data.len() {
        fields_mid_range_line(
            &data[line_start..],
            delim,
            line_delim,
            start_field,
            end_field,
            suppress,
            buf,
        );
    }
}
/// Emits fields `start_field..=end_field` of a single line into `buf`.
///
/// * Empty line or line without `delim`: echoed with `line_delim` appended,
///   unless `suppress` is set, in which case nothing is written.
/// * Line with delimiters: the bytes between the delimiter preceding
///   `start_field` and the delimiter following `end_field` (or the end of the
///   line) are written, followed by `line_delim`. A line with fewer than
///   `start_field` fields produces just `line_delim`.
///
/// The caller must have reserved room for `line.len() + 1` bytes: output goes
/// through the unsafe `buf_push`/`buf_extend` helpers.
#[inline(always)]
fn fields_mid_range_line(
    line: &[u8],
    delim: u8,
    line_delim: u8,
    start_field: usize,
    end_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    if line.is_empty() {
        if !suppress {
            unsafe { buf_push(buf, line_delim) };
        }
        return;
    }

    // Delimiters to pass before the range begins, and the ordinal of the
    // delimiter that closes the range.
    let skip_before = start_field - 1;
    let field_span = end_field - start_field;
    let target_end_delim = skip_before + field_span + 1;

    let mut seen = 0usize;
    let mut range_start = 0usize;
    for pos in memchr_iter(delim, line) {
        seen += 1;
        if seen == skip_before {
            range_start = pos + 1;
        }
        if seen == target_end_delim {
            unsafe {
                buf_extend(buf, &line[range_start..pos]);
                buf_push(buf, line_delim);
            }
            return;
        }
    }

    if seen == 0 {
        // No delimiter at all: pass the line through unless suppressed.
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }

    unsafe {
        // The range runs to the end of the line when the closing delimiter
        // was never reached; with too few fields only the terminator is
        // emitted.
        if seen >= skip_before {
            buf_extend(buf, &line[range_start..]);
        }
        buf_push(buf, line_delim);
    }
}
/// Extracts the first field of every line of `data` using rayon tasks.
///
/// `data` is split with `split_for_scope`; every piece is converted by
/// [`single_field1_to_buf`] into its own buffer, and the non-empty buffers are
/// then written in order through `write_ioslices`.
fn single_field1_parallel(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    out: &mut impl Write,
) -> io::Result<()> {
    let chunks = split_for_scope(data, line_delim);
    let mut results: Vec<Vec<u8>> = vec![Vec::new(); chunks.len()];

    rayon::scope(|scope| {
        for (piece, sink) in chunks.iter().zip(results.iter_mut()) {
            scope.spawn(move |_| {
                // One spare byte for a terminator added to an unterminated line.
                sink.reserve(piece.len() + 1);
                single_field1_to_buf(piece, delim, line_delim, sink);
            });
        }
    });

    let slices: Vec<IoSlice> = results
        .iter()
        .filter(|r| !r.is_empty())
        .map(|r| IoSlice::new(r))
        .collect();
    write_ioslices(out, &slices)
}
/// Appends the first `delim`-separated field of every line in `data` to `buf`.
///
/// Lines that contain no `delim` are copied through unchanged. Consecutive
/// such lines are not copied one by one: they are tracked as a "run"
/// (`run_start..start`) and flushed with a single copy when a line that needs
/// cutting is reached, or at the end of the terminated lines. Every emitted
/// line ends with `line_delim`, including a final unterminated one.
///
/// Output is written through a raw pointer into the spare capacity reserved at
/// the top of the function; the vector length is fixed up once at the end.
#[inline]
fn single_field1_to_buf(data: &[u8], delim: u8, line_delim: u8, buf: &mut Vec<u8>) {
debug_assert_ne!(delim, line_delim, "delim and line_delim must differ");
// Output never exceeds the input by more than the one terminator that may be
// appended to an unterminated last line.
buf.reserve(data.len() + 1);
let base = data.as_ptr();
let initial_len = buf.len();
// Write cursor: starts right after the bytes already present in `buf`.
let mut out_ptr = unsafe { buf.as_mut_ptr().add(initial_len) };
let mut start = 0;
let mut run_start: usize = 0;
// `in_run` is never set to false in this function, so the `!in_run` branch
// below is not taken.
let mut in_run = true;
for end_pos in memchr_iter(line_delim, data) {
let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
match memchr::memchr(delim, line) {
Some(dp) => {
// Flush the pending run of untouched lines that precedes this line.
if in_run && run_start < start {
let run_len = start - run_start;
unsafe {
std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
out_ptr = out_ptr.add(run_len);
}
}
// Copy the first field (`dp` bytes) and terminate it.
unsafe {
std::ptr::copy_nonoverlapping(base.add(start), out_ptr, dp);
out_ptr = out_ptr.add(dp);
*out_ptr = line_delim;
out_ptr = out_ptr.add(1);
}
// The next run, if any, begins after this line's terminator.
run_start = end_pos + 1;
in_run = true;
}
None => {
// Delimiter-free line: it stays part of the current run.
if !in_run {
run_start = start;
in_run = true;
}
}
}
start = end_pos + 1;
}
// Flush whatever run is still pending after the last terminated line.
if in_run && run_start < start {
let run_len = start - run_start;
unsafe {
std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
out_ptr = out_ptr.add(run_len);
}
}
// Trailing line without a terminator: emit it (cut or whole) plus `line_delim`.
if start < data.len() {
let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
match memchr::memchr(delim, line) {
Some(dp) => {
unsafe {
std::ptr::copy_nonoverlapping(base.add(start), out_ptr, dp);
out_ptr = out_ptr.add(dp);
*out_ptr = line_delim;
out_ptr = out_ptr.add(1);
}
}
None => {
let len = data.len() - start;
unsafe {
std::ptr::copy_nonoverlapping(base.add(start), out_ptr, len);
out_ptr = out_ptr.add(len);
*out_ptr = line_delim;
out_ptr = out_ptr.add(1);
}
}
}
}
// Publish the bytes written through `out_ptr` by adjusting the length.
unsafe {
let new_len = out_ptr as usize - buf.as_ptr() as usize;
debug_assert!(new_len >= initial_len && new_len <= buf.capacity());
buf.set_len(new_len);
}
}
/// Writes the first field of every line of `data` to `out` without copying.
///
/// The output is assembled as a list of `IoSlice`s that point into `data`:
/// stretches of delimiter-free lines are emitted as one slice, and each cut
/// line contributes its first field plus a one-byte terminator slice. The
/// list is flushed through `write_ioslices` whenever it gets close to
/// `MAX_IOV` entries. An unterminated final line gets a terminator appended.
#[inline]
#[allow(dead_code)]
fn single_field1_zerocopy(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    out: &mut impl Write,
) -> io::Result<()> {
    let terminator: [u8; 1] = [line_delim];
    let mut pending: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
    // Start of the stretch of untouched lines not yet queued.
    let mut passthrough_from: usize = 0;
    let mut line_start = 0;

    for newline in memchr_iter(line_delim, data) {
        if let Some(cut) = memchr::memchr(delim, &data[line_start..newline]) {
            if passthrough_from < line_start {
                pending.push(IoSlice::new(&data[passthrough_from..line_start]));
            }
            pending.push(IoSlice::new(&data[line_start..line_start + cut]));
            pending.push(IoSlice::new(&terminator));
            passthrough_from = newline + 1;
            // Leave headroom for the up-to-three slices of the next cut line.
            if pending.len() >= MAX_IOV - 2 {
                write_ioslices(out, &pending)?;
                pending.clear();
            }
        }
        line_start = newline + 1;
    }

    // Unterminated last line that needs cutting.
    if line_start < data.len() {
        if let Some(cut) = memchr::memchr(delim, &data[line_start..]) {
            if passthrough_from < line_start {
                pending.push(IoSlice::new(&data[passthrough_from..line_start]));
            }
            pending.push(IoSlice::new(&data[line_start..line_start + cut]));
            pending.push(IoSlice::new(&terminator));
            return write_ioslices(out, &pending);
        }
    }

    // Remaining untouched tail, terminated if the input was not.
    if passthrough_from < data.len() {
        pending.push(IoSlice::new(&data[passthrough_from..]));
        if data.last() != Some(&line_delim) {
            pending.push(IoSlice::new(&terminator));
        }
    }
    if pending.is_empty() {
        return Ok(());
    }
    write_ioslices(out, &pending)
}
/// Extracts one field (zero-based `target_idx`) from every line of `data` into `buf`.
///
/// Per line:
/// * no `delim` (including empty lines): the line is passed through unchanged,
///   or dropped entirely when `suppress` is set;
/// * field found: the field is written followed by `line_delim`;
/// * line has delimiters but too few fields: only `line_delim` is written.
///
/// Pass-through lines are not copied individually. They are tracked as a
/// "run" (`run_start..start`) and flushed with one copy when the run ends.
/// Output is written through a raw pointer into capacity reserved up front;
/// the vector length is fixed up once at the end.
///
/// Fix over the previous version: with `suppress` set, an empty line that
/// followed a matched line was left inside the pending run and therefore
/// emitted. Empty lines now end the run exactly like other delimiter-free
/// lines do.
fn process_single_field_chunk(
    data: &[u8],
    delim: u8,
    target_idx: usize,
    line_delim: u8,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    // Output never exceeds the input by more than the terminator that may be
    // appended to an unterminated last line.
    buf.reserve(data.len() + 1);
    let base = data.as_ptr();
    let initial_len = buf.len();
    // SAFETY: `initial_len <= capacity`, so the cursor stays inside the allocation.
    let mut out_ptr = unsafe { buf.as_mut_ptr().add(initial_len) };
    let mut start = 0;
    let mut run_start: usize = 0;
    // When suppressing, delimiter-free lines never form a pass-through run.
    let mut in_run = !suppress;

    for end_pos in memchr_iter(line_delim, data) {
        let line = &data[start..end_pos];
        let line_len = end_pos - start;

        if line_len == 0 {
            if !suppress {
                // Empty line is passed through as part of the current run.
                if !in_run {
                    run_start = start;
                    in_run = true;
                }
            } else {
                // Empty line has no delimiter: drop it and end the run so its
                // terminator is not flushed later.
                if in_run && run_start < start {
                    let run_len = start - run_start;
                    // SAFETY: source lies within `data`; destination within reserved capacity.
                    unsafe {
                        std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
                        out_ptr = out_ptr.add(run_len);
                    }
                }
                in_run = false;
                run_start = end_pos + 1;
            }
            start = end_pos + 1;
            continue;
        }

        let mut field_start_offset = 0;
        let mut field_idx = 0;
        let mut found = false;
        let mut has_delim = false;
        for pos in memchr_iter(delim, line) {
            has_delim = true;
            if field_idx == target_idx {
                // Target field ends at this delimiter.
                if in_run && run_start < start {
                    let run_len = start - run_start;
                    // SAFETY: see above.
                    unsafe {
                        std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
                        out_ptr = out_ptr.add(run_len);
                    }
                }
                let field_len = pos - field_start_offset;
                // SAFETY: field lies within the current line; output fits the reservation.
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        base.add(start + field_start_offset),
                        out_ptr,
                        field_len,
                    );
                    out_ptr = out_ptr.add(field_len);
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
                run_start = end_pos + 1;
                in_run = true;
                found = true;
                break;
            }
            field_idx += 1;
            field_start_offset = pos + 1;
        }

        if !found {
            if !has_delim {
                if !suppress {
                    // Pass-through line: extend (or start) the run.
                    if !in_run {
                        run_start = start;
                        in_run = true;
                    }
                } else {
                    // Suppressed line: flush what precedes it and skip it.
                    if in_run && run_start < start {
                        let run_len = start - run_start;
                        // SAFETY: see above.
                        unsafe {
                            std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
                            out_ptr = out_ptr.add(run_len);
                        }
                    }
                    in_run = false;
                    run_start = end_pos + 1;
                }
            } else if field_idx == target_idx {
                // Target is the last field of the line (no trailing delimiter).
                if in_run && run_start < start {
                    let run_len = start - run_start;
                    // SAFETY: see above.
                    unsafe {
                        std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
                        out_ptr = out_ptr.add(run_len);
                    }
                }
                let field_len = line_len - field_start_offset;
                // SAFETY: see above.
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        base.add(start + field_start_offset),
                        out_ptr,
                        field_len,
                    );
                    out_ptr = out_ptr.add(field_len);
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
                run_start = end_pos + 1;
                in_run = true;
            } else {
                // Line has delimiters but not enough fields: empty output line.
                if in_run && run_start < start {
                    let run_len = start - run_start;
                    // SAFETY: see above.
                    unsafe {
                        std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
                        out_ptr = out_ptr.add(run_len);
                    }
                }
                // SAFETY: one byte, covered by the reservation.
                unsafe {
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
                run_start = end_pos + 1;
                in_run = true;
            }
        }
        start = end_pos + 1;
    }

    // Flush the run that is still pending after the last terminated line.
    if in_run && run_start < start {
        let run_len = start - run_start;
        // SAFETY: see above.
        unsafe {
            std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
            out_ptr = out_ptr.add(run_len);
        }
    }

    // Trailing line without a terminator (always non-empty here).
    if start < data.len() {
        let line = &data[start..];
        let line_len = data.len() - start;
        let mut field_start_offset = 0;
        let mut field_idx = 0;
        let mut found = false;
        let mut has_delim = false;
        for pos in memchr_iter(delim, line) {
            has_delim = true;
            if field_idx == target_idx {
                let field_len = pos - field_start_offset;
                // SAFETY: see above.
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        base.add(start + field_start_offset),
                        out_ptr,
                        field_len,
                    );
                    out_ptr = out_ptr.add(field_len);
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
                found = true;
                break;
            }
            field_idx += 1;
            field_start_offset = pos + 1;
        }
        if !found {
            if !has_delim {
                if !suppress {
                    // SAFETY: `line_len + 1` bytes; the `+ 1` is the spare byte reserved above.
                    unsafe {
                        std::ptr::copy_nonoverlapping(base.add(start), out_ptr, line_len);
                        out_ptr = out_ptr.add(line_len);
                        *out_ptr = line_delim;
                        out_ptr = out_ptr.add(1);
                    }
                }
            } else if field_idx == target_idx {
                let field_len = line_len - field_start_offset;
                // SAFETY: see above.
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        base.add(start + field_start_offset),
                        out_ptr,
                        field_len,
                    );
                    out_ptr = out_ptr.add(field_len);
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
            } else {
                // SAFETY: one byte, covered by the reservation.
                unsafe {
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
            }
        }
    }

    // SAFETY: every byte in `initial_len..new_len` was written above and
    // `new_len` does not exceed the reserved capacity.
    unsafe {
        let new_len = out_ptr as usize - buf.as_ptr() as usize;
        debug_assert!(new_len >= initial_len && new_len <= buf.capacity());
        buf.set_len(new_len);
    }
}
/// Appends the selected fields of one line to `buf`, joined by `output_delim`
/// and terminated by `line_delim`.
///
/// * Empty line or line without `delim`: echoed with `line_delim`, unless
///   `suppress` is set.
/// * Otherwise each field number is tested with `is_selected(field_num,
///   field_mask, ranges, complement)`. Scanning stops once the field number
///   exceeds `max_field`; the remainder of the line is then only considered
///   when `complement` is set.
/// * A line with delimiters but no selected field yields just `line_delim`.
///
/// Fix over the previous version: the capacity reserved before the unchecked
/// appends assumed at most 16 output delimiters. With a multi-byte
/// `output_delim` and many selected fields the output could exceed that
/// reservation. The bound now covers the worst case, where every input
/// delimiter is replaced by `output_delim`. (`buf_extend`/`buf_push` are
/// unsafe and presumably do not grow the vector; confirm against their
/// definition.)
#[inline(always)]
fn extract_fields_to_buf(
    line: &[u8],
    delim: u8,
    ranges: &[Range],
    output_delim: &[u8],
    suppress: bool,
    max_field: usize,
    field_mask: u64,
    line_delim: u8,
    buf: &mut Vec<u8>,
    complement: bool,
) {
    let len = line.len();
    if len == 0 {
        if !suppress {
            buf.push(line_delim);
        }
        return;
    }

    // Worst case: each of the (at most `len`) input bytes is either kept or is
    // a delimiter replaced by `output_delim`, plus the terminator.
    let needed = len
        .saturating_mul(output_delim.len().max(1))
        .saturating_add(1);
    if buf.capacity() - buf.len() < needed {
        buf.reserve(needed);
    }

    let mut field_num: usize = 1;
    let mut field_start: usize = 0;
    let mut first_output = true;
    let mut has_delim = false;
    for delim_pos in memchr_iter(delim, line) {
        has_delim = true;
        if is_selected(field_num, field_mask, ranges, complement) {
            if !first_output {
                unsafe { buf_extend(buf, output_delim) };
            }
            unsafe { buf_extend(buf, &line[field_start..delim_pos]) };
            first_output = false;
        }
        field_num += 1;
        field_start = delim_pos + 1;
        // Nothing beyond `max_field` can be selected by the plain ranges.
        if field_num > max_field {
            break;
        }
    }

    // Last field (or, when the scan stopped early under `complement`, the
    // rest of the line).
    if (field_num <= max_field || complement)
        && has_delim
        && is_selected(field_num, field_mask, ranges, complement)
    {
        if !first_output {
            unsafe { buf_extend(buf, output_delim) };
        }
        unsafe { buf_extend(buf, &line[field_start..]) };
        first_output = false;
    }

    if !first_output {
        unsafe { buf_push(buf, line_delim) };
    } else if !has_delim {
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
    } else {
        unsafe { buf_push(buf, line_delim) };
    }
}
/// Keeps at most the first `max_bytes` bytes of every line of `data`.
///
/// Strategy:
/// 1. For inputs under 64 MiB, first scan whether any line is longer than
///    `max_bytes`. If none is, the input is written as-is (with a terminator
///    appended when the last line lacks one).
/// 2. Inputs of at least `PARALLEL_THRESHOLD` bytes are split and processed
///    on rayon tasks.
/// 3. Otherwise small limits (`max_bytes <= 512`) go through a single
///    buffered pass and larger limits through the zero-copy writer.
fn process_bytes_from_start(
    data: &[u8],
    max_bytes: usize,
    line_delim: u8,
    out: &mut impl Write,
) -> io::Result<()> {
    if data.len() < 64 * 1024 * 1024 && max_bytes > 0 && max_bytes < usize::MAX {
        let mut line_start = 0;
        let mut needs_truncation = false;
        for newline in memchr_iter(line_delim, data) {
            if newline - line_start > max_bytes {
                needs_truncation = true;
                break;
            }
            line_start = newline + 1;
        }
        // The unterminated tail counts as a line too.
        if !needs_truncation && line_start < data.len() && data.len() - line_start > max_bytes {
            needs_truncation = true;
        }
        if !needs_truncation {
            return match data.last() {
                None => Ok(()),
                Some(&last) if last == line_delim => out.write_all(data),
                Some(_) => {
                    out.write_all(data)?;
                    out.write_all(&[line_delim])
                }
            };
        }
    }

    if data.len() >= PARALLEL_THRESHOLD {
        let chunks = split_for_scope(data, line_delim);
        let mut results: Vec<Vec<u8>> = vec![Vec::new(); chunks.len()];
        rayon::scope(|scope| {
            for (piece, sink) in chunks.iter().zip(results.iter_mut()) {
                scope.spawn(move |_| {
                    sink.reserve(piece.len());
                    bytes_from_start_chunk(piece, max_bytes, line_delim, sink);
                });
            }
        });
        let slices: Vec<IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| IoSlice::new(r))
            .collect();
        return write_ioslices(out, &slices);
    }

    if max_bytes > 512 {
        return bytes_from_start_zerocopy(data, max_bytes, line_delim, out);
    }

    // Short prefixes: one buffered pass with a rough size estimate.
    let est_out = (data.len() / 4).max(max_bytes + 2);
    let mut buf = Vec::with_capacity(est_out.min(data.len()));
    bytes_from_start_chunk(data, max_bytes, line_delim, &mut buf);
    if !buf.is_empty() {
        out.write_all(&buf)?;
    }
    Ok(())
}
/// Writes the first `max_bytes` bytes of every line of `data` to `out` without copying.
///
/// Lines that already fit are emitted as part of one contiguous slice of
/// `data`; each over-long line contributes its truncated prefix plus a
/// one-byte terminator slice. Slices are flushed through `write_ioslices`
/// whenever the list gets close to `MAX_IOV`. An unterminated final line gets
/// a terminator appended.
#[inline]
fn bytes_from_start_zerocopy(
    data: &[u8],
    max_bytes: usize,
    line_delim: u8,
    out: &mut impl Write,
) -> io::Result<()> {
    let terminator: [u8; 1] = [line_delim];
    let mut pending: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
    let mut line_start = 0;
    // Start of the stretch of untouched lines not yet queued.
    let mut passthrough_from: usize = 0;

    for newline in memchr_iter(line_delim, data) {
        if newline - line_start > max_bytes {
            if passthrough_from < line_start {
                pending.push(IoSlice::new(&data[passthrough_from..line_start]));
            }
            pending.push(IoSlice::new(&data[line_start..line_start + max_bytes]));
            pending.push(IoSlice::new(&terminator));
            passthrough_from = newline + 1;
            // Leave headroom for the up-to-three slices of the next long line.
            if pending.len() >= MAX_IOV - 2 {
                write_ioslices(out, &pending)?;
                pending.clear();
            }
        }
        line_start = newline + 1;
    }

    // Over-long unterminated last line.
    if line_start < data.len() && data.len() - line_start > max_bytes {
        if passthrough_from < line_start {
            pending.push(IoSlice::new(&data[passthrough_from..line_start]));
        }
        pending.push(IoSlice::new(&data[line_start..line_start + max_bytes]));
        pending.push(IoSlice::new(&terminator));
        return write_ioslices(out, &pending);
    }

    // Remaining untouched tail, terminated if the input was not.
    if passthrough_from < data.len() {
        pending.push(IoSlice::new(&data[passthrough_from..]));
        if data.last() != Some(&line_delim) {
            pending.push(IoSlice::new(&terminator));
        }
    }
    if pending.is_empty() {
        return Ok(());
    }
    write_ioslices(out, &pending)
}
/// Appends the first `max_bytes` bytes of every line of `data` to `buf`,
/// each followed by `line_delim` (also for an unterminated last line).
///
/// Output is written through raw pointers into capacity reserved up front and
/// the vector length is set once at the end.
///
/// Fix over the previous version: only `data.len()` bytes were reserved, but
/// when no line is truncated and the last line lacks a terminator the output
/// is `data.len() + 1` bytes, i.e. one byte was written past the reservation.
#[inline]
fn bytes_from_start_chunk(data: &[u8], max_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
    // Each terminated line produces at most as many bytes as it consumes; an
    // unterminated last line may produce one byte more.
    buf.reserve(data.len() + 1);
    let src = data.as_ptr();
    let dst_base = buf.as_mut_ptr();
    let mut wp = buf.len();
    let mut start = 0;
    for pos in memchr_iter(line_delim, data) {
        let take = (pos - start).min(max_bytes);
        // SAFETY: `start + take <= pos < data.len()`; `wp + take + 1` stays
        // within the capacity reserved above.
        unsafe {
            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
            *dst_base.add(wp + take) = line_delim;
        }
        wp += take + 1;
        start = pos + 1;
    }
    if start < data.len() {
        let take = (data.len() - start).min(max_bytes);
        // SAFETY: as above; the extra terminator is covered by the `+ 1`.
        unsafe {
            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
            *dst_base.add(wp + take) = line_delim;
        }
        wp += take + 1;
    }
    // SAFETY: bytes up to `wp` were initialised above and `wp <= capacity`.
    unsafe { buf.set_len(wp) };
}
/// Drops the first `skip_bytes` bytes of every line of `data` and writes the rest.
///
/// Inputs of at least `PARALLEL_THRESHOLD` bytes are split with
/// `split_for_scope` and processed on rayon tasks into per-piece buffers;
/// smaller inputs go through the zero-copy writer.
fn process_bytes_from_offset(
    data: &[u8],
    skip_bytes: usize,
    line_delim: u8,
    out: &mut impl Write,
) -> io::Result<()> {
    if data.len() < PARALLEL_THRESHOLD {
        return bytes_from_offset_zerocopy(data, skip_bytes, line_delim, out);
    }

    let chunks = split_for_scope(data, line_delim);
    let mut results: Vec<Vec<u8>> = vec![Vec::new(); chunks.len()];
    rayon::scope(|scope| {
        for (piece, sink) in chunks.iter().zip(results.iter_mut()) {
            scope.spawn(move |_| {
                sink.reserve(piece.len());
                bytes_from_offset_chunk(piece, skip_bytes, line_delim, sink);
            });
        }
    });

    let slices: Vec<IoSlice> = results
        .iter()
        .filter(|r| !r.is_empty())
        .map(|r| IoSlice::new(r))
        .collect();
    write_ioslices(out, &slices)
}
/// Writes every line of `data` minus its first `skip_bytes` bytes, without copying.
///
/// Each line contributes a slice of its remaining bytes (if any) and a
/// one-byte terminator slice; lines no longer than `skip_bytes` become empty
/// lines. The slice list is flushed through `write_ioslices` whenever it gets
/// close to `MAX_IOV`. An unterminated final line gets a terminator appended.
#[inline]
fn bytes_from_offset_zerocopy(
    data: &[u8],
    skip_bytes: usize,
    line_delim: u8,
    out: &mut impl Write,
) -> io::Result<()> {
    let terminator = [line_delim];
    let mut pending: Vec<IoSlice> = Vec::with_capacity(256);
    let mut line_start = 0;

    for newline in memchr_iter(line_delim, data) {
        if newline - line_start > skip_bytes {
            pending.push(IoSlice::new(&data[line_start + skip_bytes..newline]));
        }
        pending.push(IoSlice::new(&terminator));
        // Leave headroom for the up-to-two slices of the next line.
        if pending.len() >= MAX_IOV - 1 {
            write_ioslices(out, &pending)?;
            pending.clear();
        }
        line_start = newline + 1;
    }

    // Unterminated last line.
    if line_start < data.len() {
        if data.len() - line_start > skip_bytes {
            pending.push(IoSlice::new(&data[line_start + skip_bytes..]));
        }
        pending.push(IoSlice::new(&terminator));
    }

    if pending.is_empty() {
        return Ok(());
    }
    write_ioslices(out, &pending)
}
/// Appends every line of `data` minus its first `skip_bytes` bytes to `buf`,
/// each followed by `line_delim` (also for an unterminated last line).
///
/// Output is written through raw pointers into capacity reserved up front and
/// the vector length is set once at the end.
///
/// Fix over the previous version: only `data.len()` bytes were reserved. With
/// `skip_bytes == 0` and an unterminated last line the output is
/// `data.len() + 1` bytes, which overran the reservation from a safe
/// function. One spare byte is now reserved.
#[inline]
fn bytes_from_offset_chunk(data: &[u8], skip_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
    buf.reserve(data.len() + 1);
    let src = data.as_ptr();
    let dst_base = buf.as_mut_ptr();
    let mut wp = buf.len();
    let mut start = 0;
    for pos in memchr_iter(line_delim, data) {
        let line_len = pos - start;
        if line_len > skip_bytes {
            let take = line_len - skip_bytes;
            // SAFETY: the source range ends at `pos`; the destination stays
            // within the capacity reserved above.
            unsafe {
                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
            }
            wp += take;
        }
        // SAFETY: one terminator per consumed terminator.
        unsafe {
            *dst_base.add(wp) = line_delim;
        }
        wp += 1;
        start = pos + 1;
    }
    if start < data.len() {
        let line_len = data.len() - start;
        if line_len > skip_bytes {
            let take = line_len - skip_bytes;
            // SAFETY: the source range ends at `data.len()`.
            unsafe {
                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
            }
            wp += take;
        }
        // SAFETY: the added terminator is covered by the `+ 1` reservation.
        unsafe {
            *dst_base.add(wp) = line_delim;
        }
        wp += 1;
    }
    // SAFETY: bytes up to `wp` were initialised above and `wp <= capacity`.
    unsafe { buf.set_len(wp) };
}
/// Keeps bytes `start_byte..=end_byte` (1-based) of every line of `data`.
///
/// Inputs below `PARALLEL_THRESHOLD` are streamed through `process_chunked`;
/// larger ones are split with `split_for_scope` and processed on rayon tasks
/// into per-piece buffers that are written in order.
fn process_bytes_mid_range(
    data: &[u8],
    start_byte: usize,
    end_byte: usize,
    line_delim: u8,
    out: &mut impl Write,
) -> io::Result<()> {
    // Zero-based offset of the first byte to keep.
    let skip = start_byte.saturating_sub(1);

    if data.len() < PARALLEL_THRESHOLD {
        return process_chunked(data, line_delim, out, |chunk, buf| {
            bytes_mid_range_chunk(chunk, skip, end_byte, line_delim, buf);
        });
    }

    let chunks = split_for_scope(data, line_delim);
    let mut results: Vec<Vec<u8>> = vec![Vec::new(); chunks.len()];
    rayon::scope(|scope| {
        for (piece, sink) in chunks.iter().zip(results.iter_mut()) {
            scope.spawn(move |_| {
                sink.reserve(piece.len());
                bytes_mid_range_chunk(piece, skip, end_byte, line_delim, sink);
            });
        }
    });

    let slices: Vec<IoSlice> = results
        .iter()
        .filter(|r| !r.is_empty())
        .map(|r| IoSlice::new(r))
        .collect();
    write_ioslices(out, &slices)
}
/// Appends bytes `skip..end_byte` (zero-based start, exclusive end) of every
/// line of `data` to `buf`, each followed by `line_delim` (also for an
/// unterminated last line). Lines no longer than `skip` become empty lines.
///
/// Output is written through raw pointers into capacity reserved up front and
/// the vector length is set once at the end.
///
/// Fixes over the previous version:
/// * one spare byte is reserved, because with `skip == 0` and an unterminated
///   last line the output is `data.len() + 1` bytes;
/// * `end_byte < skip` no longer underflows (panic in debug builds, oversized
///   copy in release builds); such a range now selects nothing.
#[inline]
fn bytes_mid_range_chunk(
    data: &[u8],
    skip: usize,
    end_byte: usize,
    line_delim: u8,
    buf: &mut Vec<u8>,
) {
    buf.reserve(data.len() + 1);
    let src = data.as_ptr();
    let dst_base = buf.as_mut_ptr();
    let mut wp = buf.len();
    let mut start = 0;
    for pos in memchr_iter(line_delim, data) {
        let line_len = pos - start;
        if line_len > skip {
            let take = line_len.min(end_byte).saturating_sub(skip);
            // SAFETY: `skip + take <= line_len`, so the source stays inside
            // the line; the destination stays within the reserved capacity.
            unsafe {
                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
            }
            wp += take;
        }
        // SAFETY: one terminator per consumed terminator.
        unsafe {
            *dst_base.add(wp) = line_delim;
        }
        wp += 1;
        start = pos + 1;
    }
    if start < data.len() {
        let line_len = data.len() - start;
        if line_len > skip {
            let take = line_len.min(end_byte).saturating_sub(skip);
            // SAFETY: as above.
            unsafe {
                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
            }
            wp += take;
        }
        // SAFETY: the added terminator is covered by the `+ 1` reservation.
        unsafe {
            *dst_base.add(wp) = line_delim;
        }
        wp += 1;
    }
    // SAFETY: bytes up to `wp` were initialised above and `wp <= capacity`.
    unsafe { buf.set_len(wp) };
}
/// Removes bytes `skip_start..=skip_end` (1-based) from every line of `data`.
///
/// Inputs below `PARALLEL_THRESHOLD` are streamed through `process_chunked`;
/// larger ones are split with `split_for_scope` and processed on rayon tasks
/// into per-piece buffers that are written in order.
fn process_bytes_complement_mid(
    data: &[u8],
    skip_start: usize,
    skip_end: usize,
    line_delim: u8,
    out: &mut impl Write,
) -> io::Result<()> {
    // Number of leading bytes kept in front of the removed window.
    let prefix_bytes = skip_start - 1;

    if data.len() < PARALLEL_THRESHOLD {
        return process_chunked(data, line_delim, out, |chunk, buf| {
            bytes_complement_mid_chunk(chunk, prefix_bytes, skip_end, line_delim, buf);
        });
    }

    let chunks = split_for_scope(data, line_delim);
    let mut results: Vec<Vec<u8>> = vec![Vec::new(); chunks.len()];
    rayon::scope(|scope| {
        for (piece, sink) in chunks.iter().zip(results.iter_mut()) {
            scope.spawn(move |_| {
                sink.reserve(piece.len());
                bytes_complement_mid_chunk(piece, prefix_bytes, skip_end, line_delim, sink);
            });
        }
    });

    let slices: Vec<IoSlice> = results
        .iter()
        .filter(|r| !r.is_empty())
        .map(|r| IoSlice::new(r))
        .collect();
    write_ioslices(out, &slices)
}
/// For every line of `data`, appends its first `prefix_bytes` bytes and the
/// bytes from offset `skip_end` onwards to `buf`, followed by `line_delim`
/// (also for an unterminated last line). In other words the window
/// `prefix_bytes..skip_end` is removed from each line.
///
/// Output is written through raw pointers into capacity reserved up front and
/// the vector length is set once at the end.
///
/// Fixes over the previous version:
/// * one spare byte is reserved: an unterminated last line that is no longer
///   than `prefix_bytes` is copied whole and gains a terminator, so the output
///   can be `data.len() + 1` bytes;
/// * the suffix never starts before the end of the prefix, so an inverted
///   window (`skip_end < prefix_bytes`) can no longer duplicate bytes and
///   overrun the reservation.
#[inline]
fn bytes_complement_mid_chunk(
    data: &[u8],
    prefix_bytes: usize,
    skip_end: usize,
    line_delim: u8,
    buf: &mut Vec<u8>,
) {
    buf.reserve(data.len() + 1);
    // For well-formed windows this equals `skip_end`.
    let suffix_from = skip_end.max(prefix_bytes);
    let src = data.as_ptr();
    let dst_base = buf.as_mut_ptr();
    let mut wp = buf.len();
    let mut start = 0;
    for pos in memchr_iter(line_delim, data) {
        let line_len = pos - start;
        let take_prefix = prefix_bytes.min(line_len);
        if take_prefix > 0 {
            // SAFETY: the prefix lies inside the line; the destination stays
            // within the reserved capacity.
            unsafe {
                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
            }
            wp += take_prefix;
        }
        if line_len > suffix_from {
            let suffix_len = line_len - suffix_from;
            // SAFETY: the suffix lies inside the line and does not overlap the
            // prefix, so prefix + suffix never exceed the line length.
            unsafe {
                std::ptr::copy_nonoverlapping(
                    src.add(start + suffix_from),
                    dst_base.add(wp),
                    suffix_len,
                );
            }
            wp += suffix_len;
        }
        // SAFETY: one terminator per consumed terminator.
        unsafe {
            *dst_base.add(wp) = line_delim;
        }
        wp += 1;
        start = pos + 1;
    }
    if start < data.len() {
        let line_len = data.len() - start;
        let take_prefix = prefix_bytes.min(line_len);
        if take_prefix > 0 {
            // SAFETY: as above.
            unsafe {
                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
            }
            wp += take_prefix;
        }
        if line_len > suffix_from {
            let suffix_len = line_len - suffix_from;
            // SAFETY: as above.
            unsafe {
                std::ptr::copy_nonoverlapping(
                    src.add(start + suffix_from),
                    dst_base.add(wp),
                    suffix_len,
                );
            }
            wp += suffix_len;
        }
        // SAFETY: the added terminator is covered by the `+ 1` reservation.
        unsafe {
            *dst_base.add(wp) = line_delim;
        }
        wp += 1;
    }
    // SAFETY: bytes up to `wp` were initialised above and `wp <= capacity`.
    unsafe { buf.set_len(wp) };
}
/// Byte/character-mode entry point: picks the cheapest strategy for `cfg`.
///
/// A single range with no output delimiter is dispatched to a specialised
/// routine:
///
/// | complement | range      | routine                                   |
/// |------------|------------|-------------------------------------------|
/// | no         | `1-N`      | `process_bytes_from_start(N)`             |
/// | no         | `M-` (M>1) | `process_bytes_from_offset(M-1)`          |
/// | no         | `M-N` (M>1)| `process_bytes_mid_range(M, N)`           |
/// | yes        | `1-N`      | `process_bytes_from_offset(N)`            |
/// | yes        | `M-` (M>1) | `process_bytes_from_start(M-1)`           |
/// | yes        | `M-N` (M>1)| `process_bytes_complement_mid(M, N)`      |
///
/// Everything else goes through the general per-line cutter, in parallel for
/// inputs of at least `PARALLEL_THRESHOLD` bytes and chunked otherwise.
fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
    let (line_delim, ranges, complement, output_delim) =
        (cfg.line_delim, cfg.ranges, cfg.complement, cfg.output_delim);

    if ranges.len() == 1 && output_delim.is_empty() {
        let (first, last) = (ranges[0].start, ranges[0].end);
        let bounded = last < usize::MAX;
        match (complement, first, bounded) {
            (false, 1, true) => {
                return process_bytes_from_start(data, last, line_delim, out);
            }
            (false, s, false) if s > 1 => {
                return process_bytes_from_offset(data, s - 1, line_delim, out);
            }
            (false, s, true) if s > 1 => {
                return process_bytes_mid_range(data, s, last, line_delim, out);
            }
            (true, 1, true) => {
                return process_bytes_from_offset(data, last, line_delim, out);
            }
            (true, s, false) if s > 1 => {
                return process_bytes_from_start(data, s - 1, line_delim, out);
            }
            (true, s, true) if s > 1 => {
                return process_bytes_complement_mid(data, s, last, line_delim, out);
            }
            // e.g. the whole-line range `1-`: handled by the general path.
            _ => {}
        }
    }

    if data.len() < PARALLEL_THRESHOLD {
        return process_chunked(data, line_delim, out, |chunk, buf| {
            process_bytes_chunk(chunk, ranges, complement, output_delim, line_delim, buf);
        });
    }

    let chunks = split_for_scope(data, line_delim);
    let mut results: Vec<Vec<u8>> = vec![Vec::new(); chunks.len()];
    rayon::scope(|scope| {
        for (piece, sink) in chunks.iter().zip(results.iter_mut()) {
            scope.spawn(move |_| {
                sink.reserve(piece.len() + 1);
                process_bytes_chunk(piece, ranges, complement, output_delim, line_delim, sink);
            });
        }
    });

    let slices: Vec<IoSlice> = results
        .iter()
        .filter(|r| !r.is_empty())
        .map(|r| IoSlice::new(r))
        .collect();
    write_ioslices(out, &slices)
}
/// Runs the general byte cutter over every line of `data`, appending to `buf`.
///
/// Each line (terminated or not) is passed to `cut_bytes_to_buf` and then
/// terminated with `line_delim`. `cut_bytes_to_buf` reserves room for the line
/// plus one byte, which is what makes the unchecked terminator push valid.
fn process_bytes_chunk(
    data: &[u8],
    ranges: &[Range],
    complement: bool,
    output_delim: &[u8],
    line_delim: u8,
    buf: &mut Vec<u8>,
) {
    buf.reserve(data.len());

    let mut line_start = 0;
    for newline in memchr_iter(line_delim, data) {
        cut_bytes_to_buf(
            &data[line_start..newline],
            ranges,
            complement,
            output_delim,
            buf,
        );
        unsafe { buf_push(buf, line_delim) };
        line_start = newline + 1;
    }

    // Trailing line without a terminator.
    if line_start < data.len() {
        cut_bytes_to_buf(&data[line_start..], ranges, complement, output_delim, buf);
        unsafe { buf_push(buf, line_delim) };
    }
}
/// Appends the selected bytes of one line to `buf` (no terminator is added).
///
/// `ranges` are 1-based inclusive byte ranges. With `complement` the bytes
/// *outside* the ranges are emitted. Consecutive output pieces are joined by
/// `output_delim` when it is non-empty.
///
/// Room for the line, one `output_delim` per range and one extra byte is
/// reserved first; that extra byte is what the caller's unchecked terminator
/// push relies on.
///
/// Fix over the previous version: in complement mode the gap in front of a
/// range was copied with length `range.start - pos` without clamping to the
/// line length. A range starting beyond the end of the line therefore read
/// past the line and wrote more than was reserved. Gaps are now clamped to the
/// line.
#[inline(always)]
fn cut_bytes_to_buf(
    line: &[u8],
    ranges: &[Range],
    complement: bool,
    output_delim: &[u8],
    buf: &mut Vec<u8>,
) {
    let len = line.len();
    let mut first_range = true;

    let needed = len + output_delim.len() * ranges.len() + 1;
    if buf.capacity() - buf.len() < needed {
        buf.reserve(needed);
    }

    if complement {
        // `pos` is the 1-based position of the next byte not yet decided.
        let mut pos: usize = 1;
        for r in ranges {
            // Last byte (1-based) of the gap before this range, clamped to the line.
            let gap_end = r.start.saturating_sub(1).min(len);
            if pos <= gap_end {
                if !first_range && !output_delim.is_empty() {
                    unsafe { buf_extend(buf, output_delim) };
                }
                unsafe { buf_extend(buf, &line[pos - 1..gap_end]) };
                first_range = false;
            }
            pos = r.end.min(len) + 1;
            if pos > len {
                break;
            }
        }
        // Bytes after the last range.
        if pos <= len {
            if !first_range && !output_delim.is_empty() {
                unsafe { buf_extend(buf, output_delim) };
            }
            unsafe { buf_extend(buf, &line[pos - 1..]) };
        }
    } else if output_delim.is_empty() && ranges.len() == 1 {
        // Common case: one range, nothing to join.
        let start = ranges[0].start.saturating_sub(1);
        let end = ranges[0].end.min(len);
        if start < len {
            unsafe { buf_extend(buf, &line[start..end]) };
        }
    } else {
        for r in ranges {
            let start = r.start.saturating_sub(1);
            let end = r.end.min(len);
            // Ranges starting past the line select nothing.
            if start >= len {
                break;
            }
            if !first_range && !output_delim.is_empty() {
                unsafe { buf_extend(buf, output_delim) };
            }
            unsafe { buf_extend(buf, &line[start..end]) };
            first_range = false;
        }
    }
}
/// Writes the selected fields of a single line to `out`.
///
/// Fields are separated by `delim` and numbered from 1; a field is selected
/// when `in_ranges(ranges, n) != complement`. Selected fields are joined with
/// `output_delim`. No line terminator is written.
///
/// Returns `Ok(false)` only when the line has no delimiter and
/// `suppress_no_delim` is set (nothing was written); otherwise `Ok(true)`.
/// A line without delimiter is otherwise written unchanged.
#[inline]
pub fn cut_fields(
    line: &[u8],
    delim: u8,
    ranges: &[Range],
    complement: bool,
    output_delim: &[u8],
    suppress_no_delim: bool,
    out: &mut impl Write,
) -> io::Result<bool> {
    if memchr::memchr(delim, line).is_none() {
        if suppress_no_delim {
            return Ok(false);
        }
        out.write_all(line)?;
        return Ok(true);
    }

    // Treat the end of the line as one final field boundary so the last field
    // is handled by the same loop as the others.
    let boundaries = memchr_iter(delim, line).chain(std::iter::once(line.len()));
    let mut wrote_any = false;
    let mut field_start = 0usize;
    for (index, field_end) in boundaries.enumerate() {
        let field_num = index + 1;
        if in_ranges(ranges, field_num) != complement {
            if wrote_any {
                out.write_all(output_delim)?;
            }
            out.write_all(&line[field_start..field_end])?;
            wrote_any = true;
        }
        field_start = field_end + 1;
    }
    Ok(true)
}
/// Writes the selected bytes of a single line to `out` and returns `Ok(true)`.
///
/// `ranges` are 1-based inclusive byte ranges. With `complement` the bytes
/// *outside* the ranges are written. Consecutive output pieces are joined by
/// `output_delim` when it is non-empty. No line terminator is written.
///
/// Fix over the previous version: in complement mode the gap in front of a
/// range was recorded as `(pos, range.start - 1)` without clamping to the
/// line length, so a range starting beyond the end of the line made the
/// subsequent slice panic. Gaps are now clamped to the line, and the pieces
/// are written directly instead of via a temporary vector.
#[inline]
pub fn cut_bytes(
    line: &[u8],
    ranges: &[Range],
    complement: bool,
    output_delim: &[u8],
    out: &mut impl Write,
) -> io::Result<bool> {
    let len = line.len();
    let mut first_range = true;

    if complement {
        // `pos` is the 1-based position of the next byte not yet decided.
        let mut pos: usize = 1;
        for r in ranges {
            // Last byte (1-based) of the gap before this range, clamped to the line.
            let gap_end = r.start.saturating_sub(1).min(len);
            if pos <= gap_end {
                if !first_range && !output_delim.is_empty() {
                    out.write_all(output_delim)?;
                }
                out.write_all(&line[pos - 1..gap_end])?;
                first_range = false;
            }
            pos = r.end.min(len) + 1;
            if pos > len {
                break;
            }
        }
        // Bytes after the last range.
        if pos <= len {
            if !first_range && !output_delim.is_empty() {
                out.write_all(output_delim)?;
            }
            out.write_all(&line[pos - 1..])?;
        }
    } else {
        for r in ranges {
            let start = r.start.saturating_sub(1);
            let end = r.end.min(len);
            // Ranges starting past the line select nothing.
            if start >= len {
                break;
            }
            if !first_range && !output_delim.is_empty() {
                out.write_all(output_delim)?;
            }
            out.write_all(&line[start..end])?;
            first_range = false;
        }
    }
    Ok(true)
}
/// Rewrites `data` in place so that it holds only the first `delim`-separated
/// field of every line, and returns the length of the rewritten prefix.
///
/// * Lines containing `delim` are reduced to their first field plus
///   `line_delim` (a terminator is written even if the line had none).
/// * Lines without `delim` are kept as they are, or dropped when `suppress`
///   is set.
///
/// The write position never overtakes the read position, so compaction is
/// done with `copy_within` on the same buffer.
pub fn cut_field1_inplace(data: &mut [u8], delim: u8, line_delim: u8, suppress: bool) -> usize {
    let total = data.len();
    let mut write: usize = 0;
    let mut read: usize = 0;

    while read < total {
        // Whichever comes first: the field delimiter or the end of the line.
        let hit = match memchr::memchr2(delim, line_delim, &data[read..]) {
            Some(offset) => read + offset,
            None => {
                // Unterminated tail without delimiter.
                if !suppress {
                    if write != read {
                        data.copy_within(read..total, write);
                    }
                    write += total - read;
                }
                break;
            }
        };

        if data[hit] == line_delim {
            // Delimiter-free line: keep it whole (terminator included) or skip it.
            if !suppress {
                if write != read {
                    data.copy_within(read..=hit, write);
                }
                write += hit + 1 - read;
            }
            read = hit + 1;
            continue;
        }

        // Keep the first field and terminate it.
        let field_len = hit - read;
        if write != read && field_len > 0 {
            data.copy_within(read..hit, write);
        }
        write += field_len;
        data[write] = line_delim;
        write += 1;

        // Skip the remainder of the line.
        read = match memchr::memchr(line_delim, &data[hit + 1..]) {
            Some(newline_offset) => hit + 1 + newline_offset + 1,
            None => total,
        };
    }
    write
}
/// Cuts an in-memory buffer according to `cfg` and writes the result to `out`.
///
/// Field mode is handled by `process_fields_fast`; byte mode and character
/// mode share `process_bytes_fast`.
pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
    match cfg.mode {
        CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
        CutMode::Fields => process_fields_fast(data, cfg, out),
    }
}
/// Cuts everything readable from `reader` according to `cfg`, writing to `out`.
///
/// Input is read in chunks of up to 16 MiB. After each read, everything up to
/// and including the last `line_delim` in the buffer is processed with
/// [`process_cut_data`]; the unfinished tail is moved to the front of the
/// buffer and completed by the next read. If a chunk contains no `line_delim`
/// at all, more data is read before processing. At end of input whatever is
/// left (an unterminated last line) is processed as is.
///
/// Fix over the previous version: the read window was created with
/// `Vec::set_len`, handing uninitialised memory to `Read::read`. The `Read`
/// documentation states that calling `read` with an uninitialised buffer can
/// lead to undefined behaviour, so the window is now zero-filled with
/// `resize`.
///
/// # Errors
///
/// Returns any I/O error raised while reading from `reader` or writing to
/// `out`.
pub fn process_cut_reader<R: BufRead>(
    mut reader: R,
    cfg: &CutConfig,
    out: &mut impl Write,
) -> io::Result<()> {
    const CHUNK_SIZE: usize = 16 * 1024 * 1024;
    let line_delim = cfg.line_delim;
    let mut buf: Vec<u8> = Vec::with_capacity(CHUNK_SIZE + 4096);
    loop {
        // Open an initialised window of CHUNK_SIZE bytes after the leftover.
        let read_start = buf.len();
        buf.resize(read_start + CHUNK_SIZE, 0);
        let n = read_fully(&mut reader, &mut buf[read_start..])?;
        buf.truncate(read_start + n);

        if buf.is_empty() {
            break;
        }
        if n == 0 {
            // End of input: flush the remaining (unterminated) data.
            process_cut_data(&buf, cfg, out)?;
            break;
        }

        // Only complete lines are processed; without any terminator keep reading.
        let process_end = match memchr::memrchr(line_delim, &buf) {
            Some(pos) => pos + 1,
            None => continue,
        };
        process_cut_data(&buf[..process_end], cfg, out)?;

        // Carry the partial last line over to the next iteration.
        let leftover_len = buf.len() - process_end;
        if leftover_len > 0 {
            buf.copy_within(process_end.., 0);
        }
        buf.truncate(leftover_len);
    }
    Ok(())
}
/// Reads from `reader` until `buf` is full or end of input is reached.
///
/// Returns the number of bytes placed into `buf`; a value smaller than
/// `buf.len()` means end of input was hit.
///
/// Fix over the previous version: `ErrorKind::Interrupted` was retried for
/// every read except the first one, which propagated it as an error. All
/// reads now go through the same retry loop.
///
/// # Errors
///
/// Returns the first I/O error other than `ErrorKind::Interrupted`.
#[inline]
fn read_fully<R: BufRead>(reader: &mut R, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break,
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
/// Tries to cut `data` in place and returns the length of the result.
///
/// Returns `None` (leaving the caller to use another path) when in-place
/// processing is not attempted:
/// * `cfg.complement` is set;
/// * `data` is empty or does not end with `cfg.line_delim`;
/// * field mode with an output delimiter other than the single byte
///   `cfg.delim`, or with `cfg.delim == cfg.line_delim`;
/// * byte/character mode with a non-empty output delimiter.
///
/// On `Some(n)` the cut output occupies `data[..n]`.
pub fn process_cut_data_mut(data: &mut [u8], cfg: &CutConfig) -> Option<usize> {
    if cfg.complement || data.last() != Some(&cfg.line_delim) {
        return None;
    }

    match cfg.mode {
        CutMode::Fields => {
            // The in-place writer re-uses the input delimiter as separator.
            let same_delim = matches!(cfg.output_delim, [d] if *d == cfg.delim);
            if !same_delim || cfg.delim == cfg.line_delim {
                return None;
            }
            let written = cut_fields_inplace_general(
                data,
                cfg.delim,
                cfg.line_delim,
                cfg.ranges,
                cfg.suppress_no_delim,
            );
            Some(written)
        }
        CutMode::Bytes | CutMode::Characters => {
            if cfg.output_delim.is_empty() {
                Some(cut_bytes_inplace_general(data, cfg.line_delim, cfg.ranges))
            } else {
                None
            }
        }
    }
}
/// Cuts the fields selected by `ranges` from every line of `data`, in place,
/// and returns the length of the rewritten prefix.
///
/// Selected fields are joined with `delim` (the input delimiter doubles as
/// output delimiter). Lines without `delim` are kept unchanged, or dropped
/// when `suppress` is set. A line with delimiters but no selected field
/// becomes an empty line. The single-range `1-1` case is delegated to
/// [`cut_field1_inplace`].
///
/// Writes always trail the read position, so compaction uses `copy_within`
/// on the same buffer.
///
/// Fix over the previous version: delimiter positions were stored in a fixed
/// 128-entry array, so at most 129 fields per line were distinguishable.
/// Selecting a field beyond that (for example `130-131` on a 200-field line)
/// produced an empty or merged result. Positions are now kept in a reusable
/// vector bounded only by the highest requested field.
fn cut_fields_inplace_general(
    data: &mut [u8],
    delim: u8,
    line_delim: u8,
    ranges: &[Range],
    suppress: bool,
) -> usize {
    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end == 1 {
        return cut_field1_inplace(data, delim, line_delim, suppress);
    }
    let len = data.len();
    if len == 0 {
        return 0;
    }

    // Delimiters beyond the highest requested field never matter.
    let max_field = ranges.last().map_or(0, |r| r.end);
    // Offsets (relative to the line start) of the delimiters of the current
    // line; reused across lines to avoid reallocating.
    let mut delim_pos: Vec<usize> = Vec::new();
    let mut wp: usize = 0;
    let mut rp: usize = 0;

    while rp < len {
        let line_end = memchr::memchr(line_delim, &data[rp..])
            .map(|p| rp + p)
            .unwrap_or(len);
        let line_len = line_end - rp;

        delim_pos.clear();
        delim_pos.extend(memchr_iter(delim, &data[rp..line_end]).take(max_field));
        let num_delims = delim_pos.len();

        if num_delims == 0 {
            // No delimiter: keep the line as is unless suppressed.
            if !suppress {
                if wp != rp {
                    data.copy_within(rp..line_end, wp);
                }
                wp += line_len;
                if line_end < len {
                    data[wp] = line_delim;
                    wp += 1;
                }
            }
        } else {
            let total_fields = num_delims + 1;
            let mut first_output = true;
            for r in ranges {
                let range_start = r.start;
                let range_end = r.end.min(total_fields);
                if range_start > total_fields {
                    break;
                }
                for field_num in range_start..=range_end {
                    if field_num > total_fields {
                        break;
                    }
                    // Field N starts after delimiter N-1 and ends at delimiter N
                    // (or at the end of the line for the last known field).
                    let field_start = if field_num == 1 {
                        0
                    } else if field_num - 2 < num_delims {
                        delim_pos[field_num - 2] + 1
                    } else {
                        continue;
                    };
                    let field_end = if field_num <= num_delims {
                        delim_pos[field_num - 1]
                    } else {
                        line_len
                    };
                    if !first_output {
                        data[wp] = delim;
                        wp += 1;
                    }
                    let flen = field_end - field_start;
                    if flen > 0 {
                        data.copy_within(rp + field_start..rp + field_start + flen, wp);
                        wp += flen;
                    }
                    first_output = false;
                }
            }
            // Terminate the output line whether or not a field was selected.
            if line_end < len {
                data[wp] = line_delim;
                wp += 1;
            }
        }
        rp = if line_end < len { line_end + 1 } else { len };
    }
    wp
}
/// Cuts the byte `ranges` from every line of `data`, in place, and returns the
/// length of the rewritten prefix.
///
/// A single range starting at byte 1 is special-cased: the open-ended form
/// leaves `data` untouched, the bounded form is delegated to
/// [`cut_bytes_from_start_inplace`]. Otherwise the selected spans of each line
/// are compacted towards the front with `copy_within`. A terminator is written
/// only for lines that had one.
fn cut_bytes_inplace_general(data: &mut [u8], line_delim: u8, ranges: &[Range]) -> usize {
    let total = data.len();
    if total == 0 {
        return 0;
    }
    if let [only] = ranges {
        if only.start == 1 {
            return if only.end == usize::MAX {
                total
            } else {
                cut_bytes_from_start_inplace(data, line_delim, only.end)
            };
        }
    }

    let mut write: usize = 0;
    let mut read: usize = 0;
    while read < total {
        let terminator = memchr::memchr(line_delim, &data[read..]).map(|offset| read + offset);
        let line_end = terminator.unwrap_or(total);
        let line_len = line_end - read;

        for r in ranges {
            let from = r.start.saturating_sub(1);
            // Ranges starting past the line select nothing.
            if from >= line_len {
                break;
            }
            let to = r.end.min(line_len);
            let span = to - from;
            if span > 0 {
                data.copy_within(read + from..read + to, write);
                write += span;
            }
        }

        match terminator {
            Some(_) => {
                data[write] = line_delim;
                write += 1;
                read = line_end + 1;
            }
            None => read = total,
        }
    }
    write
}
/// Truncates every line of `data` to at most `max_bytes` bytes, in place, and
/// returns the length of the rewritten prefix.
///
/// A first pass checks whether any line (including an unterminated tail) is
/// longer than `max_bytes`; if none is, `data` is left untouched and its full
/// length is returned. Otherwise lines are compacted towards the front with
/// `copy_within`. A terminator is written only for lines that had one.
fn cut_bytes_from_start_inplace(data: &mut [u8], line_delim: u8, max_bytes: usize) -> usize {
    let total = data.len();

    // Pass 1: is there anything to truncate at all?
    let mut line_start = 0;
    let mut needs_truncation = false;
    for newline in memchr_iter(line_delim, data) {
        if newline - line_start > max_bytes {
            needs_truncation = true;
            break;
        }
        line_start = newline + 1;
    }
    if !needs_truncation && line_start < total && total - line_start > max_bytes {
        needs_truncation = true;
    }
    if !needs_truncation {
        return total;
    }

    // Pass 2: compact the kept prefix of every line.
    let mut write: usize = 0;
    let mut read: usize = 0;
    while read < total {
        let terminator = memchr::memchr(line_delim, &data[read..]).map(|offset| read + offset);
        let line_end = terminator.unwrap_or(total);
        let keep = (line_end - read).min(max_bytes);
        if keep > 0 && write != read {
            data.copy_within(read..read + keep, write);
        }
        write += keep;

        match terminator {
            Some(_) => {
                data[write] = line_delim;
                write += 1;
                read = line_end + 1;
            }
            None => read = total,
        }
    }
    write
}
/// Selects what the ranges of a cut operation refer to.
///
/// `process_cut_data` and `process_cut_data_mut` route `Bytes` and
/// `Characters` to the same byte-oriented routines; `Fields` goes to the
/// delimiter-based routines.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum CutMode {
/// Ranges are byte positions within each line.
Bytes,
/// Ranges are character positions; handled exactly like `Bytes` by the
/// dispatchers in this file.
Characters,
/// Ranges are delimiter-separated field numbers.
Fields,
}