Skip to main content

coreutils_rs/cut/
core.rs

1use memchr::memchr_iter;
2use std::io::{self, BufRead, IoSlice, Write};
3
/// Minimum file size for parallel processing (4MB).
/// Files above this threshold use rayon parallel chunked processing.
/// Lowered from 32MB to enable parallelism on benchmark-sized (~7MB) inputs.
/// Rayon pre-init in main() eliminates the 300-500us cold-start penalty.
const PARALLEL_THRESHOLD: usize = 4 * 1024 * 1024;

/// Max iovec entries per writev call (Linux default).
/// write_ioslices batches its slices into groups of at most this size.
const MAX_IOV: usize = 1024;

/// Input chunk size for sequential processing. 8MB ensures benchmark-sized
/// (~7MB) inputs process in a single pass, avoiding chunk-boundary overhead.
/// Reduces write_all syscalls (~2 calls for 10MB vs ~40 at 256KB).
const SEQ_CHUNK: usize = 8 * 1024 * 1024;
17
18/// Process data in newline-aligned chunks, writing each chunk's output immediately.
19/// Avoids allocating a full-size output buffer (e.g. 12MB for 11MB input).
20fn process_chunked(
21    data: &[u8],
22    line_delim: u8,
23    out: &mut impl Write,
24    mut process_fn: impl FnMut(&[u8], &mut Vec<u8>),
25) -> io::Result<()> {
26    // Fast path: data fits in one chunk, skip chunk-boundary scanning entirely.
27    if data.len() <= SEQ_CHUNK {
28        let mut buf = Vec::with_capacity(data.len() + 256);
29        process_fn(data, &mut buf);
30        if !buf.is_empty() {
31            out.write_all(&buf)?;
32        }
33        return Ok(());
34    }
35    let mut buf = Vec::with_capacity(SEQ_CHUNK * 2);
36    let mut start = 0;
37    while start < data.len() {
38        let end = if start + SEQ_CHUNK >= data.len() {
39            data.len()
40        } else {
41            match memchr::memrchr(line_delim, &data[start..start + SEQ_CHUNK]) {
42                Some(pos) => start + pos + 1,
43                None => (start + SEQ_CHUNK).min(data.len()),
44            }
45        };
46        buf.clear();
47        process_fn(&data[start..end], &mut buf);
48        if !buf.is_empty() {
49            out.write_all(&buf)?;
50        }
51        start = end;
52    }
53    Ok(())
54}
55
/// Configuration for cut operations.
pub struct CutConfig<'a> {
    // Cutting mode (CutMode is declared elsewhere in this module —
    // presumably byte/char/field variants; confirm at its definition).
    pub mode: CutMode,
    // Sorted, merged 1-based ranges, as produced by `parse_ranges`.
    pub ranges: &'a [Range],
    // When true, select everything EXCEPT `ranges`.
    pub complement: bool,
    // Input field delimiter byte.
    pub delim: u8,
    // Delimiter emitted between output fields; may differ from `delim`
    // (fast paths below require it to equal `delim`).
    pub output_delim: &'a [u8],
    // When true, lines containing no delimiter are dropped instead of
    // passed through (cut's -s behavior).
    pub suppress_no_delim: bool,
    // Line terminator byte (normally b'\n').
    pub line_delim: u8,
}
66
/// A range specification like 1, 3-5, -3, 4-
#[derive(Debug, Clone)]
pub struct Range {
    pub start: usize, // 1-based, 0 means "from beginning"
    pub end: usize,   // 1-based, usize::MAX means "to end"
}

/// Parse a LIST specification like "1,3-5,7-" into ranges.
/// Each range is 1-based. Returns sorted, merged ranges.
/// When `no_merge_adjacent` is true, overlapping ranges are still merged but
/// adjacent ranges (e.g., 1-2,3-4) are kept separate. This is needed when
/// `--output-delimiter` is specified for byte/char mode so the delimiter is
/// inserted between originally separate but adjacent ranges.
pub fn parse_ranges(spec: &str, no_merge_adjacent: bool) -> Result<Vec<Range>, String> {
    let mut parsed: Vec<Range> = Vec::new();

    for token in spec.split(',').map(str::trim).filter(|t| !t.is_empty()) {
        let range = match token.find('-') {
            Some(dash) => {
                let (lo, hi) = (&token[..dash], &token[dash + 1..]);

                // A bare "-" (both endpoints missing) is rejected.
                if lo.is_empty() && hi.is_empty() {
                    return Err("invalid range with no endpoint: -".to_string());
                }

                // Missing left endpoint means "from field 1".
                let start = match lo {
                    "" => 1,
                    s => s
                        .parse::<usize>()
                        .map_err(|_| format!("invalid range: '{}'", token))?,
                };

                // Missing right endpoint means "to the end".
                let end = match hi {
                    "" => usize::MAX,
                    s => s
                        .parse::<usize>()
                        .map_err(|_| format!("invalid range: '{}'", token))?,
                };

                if start == 0 {
                    return Err("fields and positions are numbered from 1".to_string());
                }
                if start > end {
                    return Err(format!("invalid decreasing range: '{}'", token));
                }
                Range { start, end }
            }
            None => {
                // Single field number, e.g. "7".
                let n: usize = token
                    .parse()
                    .map_err(|_| format!("invalid field: '{}'", token))?;
                if n == 0 {
                    return Err("fields and positions are numbered from 1".to_string());
                }
                Range { start: n, end: n }
            }
        };
        parsed.push(range);
    }

    if parsed.is_empty() {
        return Err("you must specify a list of bytes, characters, or fields".to_string());
    }

    // Sort, then merge: always merge overlaps; merge adjacency only when
    // `no_merge_adjacent` is false.
    parsed.sort_by_key(|r| (r.start, r.end));
    let mut merged: Vec<Range> = Vec::with_capacity(parsed.len());
    for r in parsed {
        match merged.last_mut() {
            Some(last)
                if (no_merge_adjacent && r.start <= last.end)
                    || (!no_merge_adjacent && r.start <= last.end.saturating_add(1)) =>
            {
                last.end = last.end.max(r.end);
            }
            _ => merged.push(r),
        }
    }

    Ok(merged)
}
160
161/// Check if a 1-based position is in any range.
162/// Ranges must be sorted. Uses early exit since ranges are sorted.
163#[inline(always)]
164fn in_ranges(ranges: &[Range], pos: usize) -> bool {
165    for r in ranges {
166        if pos < r.start {
167            return false;
168        }
169        if pos <= r.end {
170            return true;
171        }
172    }
173    false
174}
175
176/// Pre-compute a 64-bit mask for field selection.
177/// Bit i-1 is set if field i should be output.
178#[inline]
179fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
180    let mut mask: u64 = 0;
181    for i in 1..=64u32 {
182        let in_range = in_ranges(ranges, i as usize);
183        if in_range != complement {
184            mask |= 1u64 << (i - 1);
185        }
186    }
187    mask
188}
189
190/// Check if a field should be selected, using bitset for first 64 fields.
191#[inline(always)]
192fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
193    if field_num <= 64 {
194        (mask >> (field_num - 1)) & 1 == 1
195    } else {
196        in_ranges(ranges, field_num) != complement
197    }
198}
199
// ── Unsafe buffer helpers (skip bounds checks in hot loops) ──────────────

/// Append a slice to buf without capacity checks.
/// Caller MUST ensure buf has enough remaining capacity.
///
/// # Safety
/// `buf.capacity() - buf.len()` must be at least `data.len()`, otherwise the
/// raw copy writes past the end of the allocation (heap overflow).
#[inline(always)]
unsafe fn buf_extend(buf: &mut Vec<u8>, data: &[u8]) {
    unsafe {
        // SAFETY: caller guarantees spare capacity; `data` cannot alias the
        // spare capacity because `buf` is exclusively borrowed here.
        let len = buf.len();
        std::ptr::copy_nonoverlapping(data.as_ptr(), buf.as_mut_ptr().add(len), data.len());
        buf.set_len(len + data.len());
    }
}
212
/// Append a single byte to buf without capacity checks.
/// Caller MUST ensure buf has enough remaining capacity.
///
/// # Safety
/// `buf.capacity()` must exceed `buf.len()` by at least one byte, otherwise
/// the write lands past the end of the allocation.
#[inline(always)]
unsafe fn buf_push(buf: &mut Vec<u8>, b: u8) {
    unsafe {
        // SAFETY: caller guarantees one spare byte of capacity.
        let len = buf.len();
        *buf.as_mut_ptr().add(len) = b;
        buf.set_len(len + 1);
    }
}
223
224/// Write multiple IoSlice buffers using write_vectored (writev syscall).
225/// Batches into MAX_IOV-sized groups. Hot path: single write_vectored succeeds.
226/// Cold path (partial write) is out-of-line to keep the hot loop tight.
227#[inline]
228fn write_ioslices(out: &mut impl Write, slices: &[IoSlice]) -> io::Result<()> {
229    if slices.is_empty() {
230        return Ok(());
231    }
232    for batch in slices.chunks(MAX_IOV) {
233        let total: usize = batch.iter().map(|s| s.len()).sum();
234        let written = out.write_vectored(batch)?;
235        if written >= total {
236            continue;
237        }
238        if written == 0 {
239            return Err(io::Error::new(io::ErrorKind::WriteZero, "write zero"));
240        }
241        write_ioslices_slow(out, batch, written)?;
242    }
243    Ok(())
244}
245
246/// Handle partial write_vectored (cold path, never inlined).
247#[cold]
248#[inline(never)]
249fn write_ioslices_slow(
250    out: &mut impl Write,
251    slices: &[IoSlice],
252    mut skip: usize,
253) -> io::Result<()> {
254    for slice in slices {
255        let len = slice.len();
256        if skip >= len {
257            skip -= len;
258            continue;
259        }
260        out.write_all(&slice[skip..])?;
261        skip = 0;
262    }
263    Ok(())
264}
265
// ── Chunk splitting for parallel processing ──────────────────────────────

/// Number of available CPUs for parallel chunk splitting.
/// Uses std::thread::available_parallelism() to avoid triggering premature
/// rayon pool initialization (~300-500µs). Rayon pool inits on first scope() call.
#[inline]
fn num_cpus() -> usize {
    match std::thread::available_parallelism() {
        Ok(n) => n.get(),
        Err(_) => 1,
    }
}
277
278/// Split data into chunks for rayon::scope parallel processing.
279/// Uses Rayon's thread count to match the number of worker threads.
280fn split_for_scope<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
281    let num_threads = num_cpus().max(1);
282    if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
283        return vec![data];
284    }
285
286    let chunk_size = data.len() / num_threads;
287    let mut chunks = Vec::with_capacity(num_threads);
288    let mut pos = 0;
289
290    for _ in 0..num_threads - 1 {
291        let target = pos + chunk_size;
292        if target >= data.len() {
293            break;
294        }
295        let boundary = memchr::memchr(line_delim, &data[target..])
296            .map(|p| target + p + 1)
297            .unwrap_or(data.len());
298        if boundary > pos {
299            chunks.push(&data[pos..boundary]);
300        }
301        pos = boundary;
302    }
303
304    if pos < data.len() {
305        chunks.push(&data[pos..]);
306    }
307
308    chunks
309}
310
311// ── Fast path: multi-field non-contiguous extraction ─────────────────────
312
313/// Multi-field non-contiguous extraction (e.g., `cut -d, -f1,3,5`).
314/// Pre-collects delimiter positions per line into a stack-allocated array,
315/// then directly indexes into them for each selected field.
316/// This is O(max_field) per line instead of O(num_fields * scan_length).
317fn process_fields_multi_select(
318    data: &[u8],
319    delim: u8,
320    line_delim: u8,
321    ranges: &[Range],
322    suppress: bool,
323    out: &mut impl Write,
324) -> io::Result<()> {
325    let max_field = ranges.last().map_or(0, |r| r.end);
326
327    if data.len() >= PARALLEL_THRESHOLD {
328        let chunks = split_for_scope(data, line_delim);
329        let n = chunks.len();
330        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
331        rayon::scope(|s| {
332            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
333                s.spawn(move |_| {
334                    result.reserve(chunk.len() * 3 / 4);
335                    multi_select_chunk(
336                        chunk, delim, line_delim, ranges, max_field, suppress, result,
337                    );
338                });
339            }
340        });
341        let slices: Vec<IoSlice> = results
342            .iter()
343            .filter(|r| !r.is_empty())
344            .map(|r| IoSlice::new(r))
345            .collect();
346        write_ioslices(out, &slices)?;
347    } else {
348        process_chunked(data, line_delim, out, |chunk, buf| {
349            multi_select_chunk(chunk, delim, line_delim, ranges, max_field, suppress, buf);
350        })?;
351    }
352    Ok(())
353}
354
355/// Process a chunk for multi-field extraction.
356/// Uses single-pass memchr2 with bitmask field selection when max_field <= 64.
357/// Falls back to two-level scanning for larger field numbers.
358fn multi_select_chunk(
359    data: &[u8],
360    delim: u8,
361    line_delim: u8,
362    ranges: &[Range],
363    max_field: usize,
364    suppress: bool,
365    buf: &mut Vec<u8>,
366) {
367    // Two-level scan for small max_field: outer memchr(newline) + inner
368    // memchr(delim) with early exit at max_field. This is faster than the
369    // single-pass memchr2 approach when lines have many fields past max_field,
370    // because we skip scanning delimiters we don't need (e.g., for -f1,3,5
371    // on a 10-field CSV, we stop after delimiter 5 instead of scanning all 9).
372    if max_field <= 64 && delim != line_delim {
373        let mut mask: u64 = 0;
374        for r in ranges {
375            let s = r.start.max(1);
376            let e = r.end.min(64);
377            for f in s..=e {
378                mask |= 1u64 << (f - 1);
379            }
380        }
381        // For small max_field, use single-pass memchr2 bitmask approach:
382        // scans for both delimiter and newline simultaneously, avoiding
383        // per-line iterator creation overhead on short lines.
384        // Trade-off: bitmask processes all delimiters per line (no early exit),
385        // while twolevel exits after max_field delimiters. For narrow CSVs
386        // (<=8 fields), the memchr2 SIMD advantage outweighs the extra hits.
387        // For wide CSVs, twolevel's early exit wins.
388        if max_field <= 8 {
389            multi_select_chunk_bitmask(data, delim, line_delim, mask, max_field, suppress, buf);
390        } else {
391            multi_select_twolevel(data, delim, line_delim, mask, max_field, suppress, buf);
392        }
393        return;
394    }
395
396    // Fallback: two-level scanning for large field numbers
397    buf.reserve(data.len());
398    let base = data.as_ptr();
399    let mut start = 0;
400    let max_delims = max_field.min(128);
401
402    for end_pos in memchr_iter(line_delim, data) {
403        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
404        multi_select_line_fast(
405            line, delim, line_delim, ranges, max_delims, suppress, buf, start, base,
406        );
407        start = end_pos + 1;
408    }
409    if start < data.len() {
410        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
411        multi_select_line_fast(
412            line, delim, line_delim, ranges, max_delims, suppress, buf, start, base,
413        );
414    }
415}
416
/// Single-pass memchr2 multi-field extraction with bitmask field selection.
/// Scans for both delimiter and newline simultaneously, avoiding per-line
/// memchr_iter creation overhead on short lines (~200K lines x ~35 bytes).
/// Best for max_field <= 8 where most fields are selected.
///
/// Bit i-1 of `mask` selects field i (1-based). `suppress` drops lines with
/// no delimiter. Output is appended to `buf` via raw pointer writes; the
/// reserve below plus the debug_assert at the bottom bound the write pointer.
fn multi_select_chunk_bitmask(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    mask: u64,
    max_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    // Single-pass memchr2 approach: scan for both delimiters and newlines
    // simultaneously. This avoids per-line memchr_iter creation overhead,
    // which dominates for short lines (200K lines × ~35 bytes each).
    buf.reserve(data.len() + 1);
    let initial_len = buf.len();
    // SAFETY: reserve above guarantees capacity for initial_len + data.len() + 1
    // bytes; every raw write below stays within that bound (each input byte is
    // copied at most once plus one line_delim per line — see debug_assert).
    let out_base = unsafe { buf.as_mut_ptr().add(initial_len) };
    let src = data.as_ptr();
    let mut wp: usize = 0;

    let mut field_num: usize = 1; // current field (1-based)
    let mut field_start: usize = 0; // start of current field
    let mut first_output = true; // first field on current line?
    let mut has_delim = false; // current line has any delimiter?

    for pos in memchr::memchr2_iter(delim, line_delim, data) {
        if data[pos] == line_delim {
            // End of line: handle last field + write newline
            if !has_delim {
                // Line had no delimiter: pass through or suppress
                if !suppress {
                    let len = pos - field_start;
                    unsafe {
                        std::ptr::copy_nonoverlapping(src.add(field_start), out_base.add(wp), len);
                    }
                    wp += len;
                    unsafe {
                        *out_base.add(wp) = line_delim;
                    }
                    wp += 1;
                }
            } else {
                // Check if last field is selected
                if field_num <= 64 && (mask & (1u64 << (field_num - 1))) != 0 {
                    if !first_output {
                        unsafe {
                            *out_base.add(wp) = delim;
                        }
                        wp += 1;
                    }
                    let len = pos - field_start;
                    unsafe {
                        std::ptr::copy_nonoverlapping(src.add(field_start), out_base.add(wp), len);
                    }
                    wp += len;
                }
                unsafe {
                    *out_base.add(wp) = line_delim;
                }
                wp += 1;
            }
            // Reset for next line
            field_num = 1;
            field_start = pos + 1;
            first_output = true;
            has_delim = false;
        } else {
            // Delimiter found
            has_delim = true;
            if field_num <= max_field && (mask & (1u64 << (field_num - 1))) != 0 {
                if !first_output {
                    unsafe {
                        *out_base.add(wp) = delim;
                    }
                    wp += 1;
                }
                let len = pos - field_start;
                unsafe {
                    std::ptr::copy_nonoverlapping(src.add(field_start), out_base.add(wp), len);
                }
                wp += len;
                first_output = false;
            }
            field_num += 1;
            field_start = pos + 1;
        }
    }

    // Handle final line without trailing newline
    // NOTE(review): if the input's final byte is a delimiter (with no trailing
    // newline), field_start == data.len() and this branch is skipped, so the
    // trailing empty field and its line terminator are not emitted — confirm
    // this matches the intended cut semantics for newline-less input.
    if field_start < data.len() {
        if !has_delim {
            if !suppress {
                let len = data.len() - field_start;
                unsafe {
                    std::ptr::copy_nonoverlapping(src.add(field_start), out_base.add(wp), len);
                }
                wp += len;
                unsafe {
                    *out_base.add(wp) = line_delim;
                }
                wp += 1;
            }
        } else {
            if field_num <= 64 && (mask & (1u64 << (field_num - 1))) != 0 {
                if !first_output {
                    unsafe {
                        *out_base.add(wp) = delim;
                    }
                    wp += 1;
                }
                let len = data.len() - field_start;
                unsafe {
                    std::ptr::copy_nonoverlapping(src.add(field_start), out_base.add(wp), len);
                }
                wp += len;
            }
            unsafe {
                *out_base.add(wp) = line_delim;
            }
            wp += 1;
        }
    }

    debug_assert!(wp <= data.len() + 1);
    // SAFETY: wp bytes were initialized at out_base and wp is within the
    // reserved capacity (asserted above in debug builds).
    unsafe {
        buf.set_len(initial_len + wp);
    }
}
547
/// Two-level multi-field extraction: outer memchr(newline) for line boundaries,
/// inner memchr(delim) with early exit after max_field delimiters per line.
/// For `-f1,3,5` on a 10-field CSV, this scans only 5 delimiters per line
/// instead of all 9, saving ~45% of delimiter processing.
///
/// Bit i-1 of `mask` selects field i (1-based). `suppress` drops lines with
/// no delimiter. Output is appended to `buf` via raw pointer writes bounded
/// by the reserve below (see debug_assert at the bottom).
fn multi_select_twolevel(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    mask: u64,
    max_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    buf.reserve(data.len() + 1);
    let initial_len = buf.len();
    // SAFETY: reserve above guarantees capacity for initial_len + data.len() + 1
    // bytes; every raw write below stays within that bound.
    let out_base = unsafe { buf.as_mut_ptr().add(initial_len) };
    let src = data.as_ptr();
    let mut wp: usize = 0;
    let mut line_start: usize = 0;

    for nl_pos in memchr_iter(line_delim, data) {
        let line_len = nl_pos - line_start;
        let line = &data[line_start..nl_pos];

        // Empty line: no delimiter by definition, so pass through (bare
        // terminator) unless suppressed.
        if line_len == 0 {
            if !suppress {
                unsafe {
                    *out_base.add(wp) = line_delim;
                }
                wp += 1;
            }
            line_start = nl_pos + 1;
            continue;
        }

        // Scan delimiters within the line, stopping after max_field.
        // Uses memchr_iter for amortized SIMD setup (one per line vs one per field).
        let mut field_num: usize = 1;
        let mut field_start: usize = 0;
        let mut first_output = true;
        let mut has_delim = false;

        for dp in memchr::memchr_iter(delim, line) {
            has_delim = true;
            if (mask >> (field_num - 1)) & 1 == 1 {
                if !first_output {
                    unsafe {
                        *out_base.add(wp) = delim;
                    }
                    wp += 1;
                }
                let flen = dp - field_start;
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        src.add(line_start + field_start),
                        out_base.add(wp),
                        flen,
                    );
                }
                wp += flen;
                first_output = false;
            }
            field_num += 1;
            field_start = dp + 1;
            // Early exit: fields past max_field can never be selected.
            if field_num > max_field {
                break;
            }
        }

        if !has_delim {
            // No delimiter: pass through or suppress
            if !suppress {
                unsafe {
                    std::ptr::copy_nonoverlapping(src.add(line_start), out_base.add(wp), line_len);
                }
                wp += line_len;
                unsafe {
                    *out_base.add(wp) = line_delim;
                }
                wp += 1;
            }
        } else {
            // Check if the last field (after last found delimiter) is selected
            if field_num <= 64 && (mask >> (field_num - 1)) & 1 == 1 {
                if !first_output {
                    unsafe {
                        *out_base.add(wp) = delim;
                    }
                    wp += 1;
                }
                let flen = line_len - field_start;
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        src.add(line_start + field_start),
                        out_base.add(wp),
                        flen,
                    );
                }
                wp += flen;
            }
            unsafe {
                *out_base.add(wp) = line_delim;
            }
            wp += 1;
        }

        line_start = nl_pos + 1;
    }

    // Handle final line without trailing newline
    // (same logic as the loop body above, with data.len() as the line end).
    if line_start < data.len() {
        let line = &data[line_start..];
        let line_len = line.len();
        let mut field_num: usize = 1;
        let mut field_start: usize = 0;
        let mut first_output = true;
        let mut has_delim = false;

        for dp in memchr::memchr_iter(delim, line) {
            has_delim = true;
            if (mask >> (field_num - 1)) & 1 == 1 {
                if !first_output {
                    unsafe {
                        *out_base.add(wp) = delim;
                    }
                    wp += 1;
                }
                let flen = dp - field_start;
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        src.add(line_start + field_start),
                        out_base.add(wp),
                        flen,
                    );
                }
                wp += flen;
                first_output = false;
            }
            field_num += 1;
            field_start = dp + 1;
            if field_num > max_field {
                break;
            }
        }

        if !has_delim {
            if !suppress {
                unsafe {
                    std::ptr::copy_nonoverlapping(src.add(line_start), out_base.add(wp), line_len);
                }
                wp += line_len;
                unsafe {
                    *out_base.add(wp) = line_delim;
                }
                wp += 1;
            }
        } else {
            if field_num <= 64 && (mask >> (field_num - 1)) & 1 == 1 {
                if !first_output {
                    unsafe {
                        *out_base.add(wp) = delim;
                    }
                    wp += 1;
                }
                let flen = line_len - field_start;
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        src.add(line_start + field_start),
                        out_base.add(wp),
                        flen,
                    );
                }
                wp += flen;
            }
            unsafe {
                *out_base.add(wp) = line_delim;
            }
            wp += 1;
        }
    }

    debug_assert!(
        wp <= data.len() + 1,
        "wp={} exceeded reservation data.len()+1={}",
        wp,
        data.len() + 1
    );
    // SAFETY: wp bytes were initialized at out_base and wp is within the
    // reserved capacity (asserted above in debug builds).
    unsafe {
        buf.set_len(initial_len + wp);
    }
}
739
740/// Extract selected fields from a single line using delimiter position scanning.
741/// Optimized: collects delimiter positions into a stack array with early exit at max_delims,
742/// then indexes directly for each selected field. Uses raw pointer arithmetic.
743#[inline(always)]
744fn multi_select_line_fast(
745    line: &[u8],
746    delim: u8,
747    line_delim: u8,
748    ranges: &[Range],
749    max_delims: usize,
750    suppress: bool,
751    buf: &mut Vec<u8>,
752    _line_abs_start: usize,
753    _data_base: *const u8,
754) {
755    let len = line.len();
756    if len == 0 {
757        if !suppress {
758            unsafe { buf_push(buf, line_delim) };
759        }
760        return;
761    }
762
763    let base = line.as_ptr();
764
765    // Collect delimiter positions up to max_delims (early exit).
766    let mut delim_pos = [0usize; 128];
767    let mut num_delims: usize = 0;
768
769    for pos in memchr_iter(delim, line) {
770        if num_delims < max_delims {
771            delim_pos[num_delims] = pos;
772            num_delims += 1;
773            if num_delims >= max_delims {
774                break;
775            }
776        }
777    }
778
779    if num_delims == 0 {
780        if !suppress {
781            unsafe {
782                buf_extend(buf, line);
783                buf_push(buf, line_delim);
784            }
785        }
786        return;
787    }
788
789    let total_fields = num_delims + 1;
790    let mut first_output = true;
791
792    for r in ranges {
793        let range_start = r.start;
794        let range_end = r.end.min(total_fields);
795        if range_start > total_fields {
796            break;
797        }
798        for field_num in range_start..=range_end {
799            if field_num > total_fields {
800                break;
801            }
802
803            let field_start = if field_num == 1 {
804                0
805            } else if field_num - 2 < num_delims {
806                delim_pos[field_num - 2] + 1
807            } else {
808                continue;
809            };
810            let field_end = if field_num <= num_delims {
811                delim_pos[field_num - 1]
812            } else {
813                len
814            };
815
816            if !first_output {
817                unsafe { buf_push(buf, delim) };
818            }
819            unsafe {
820                buf_extend(
821                    buf,
822                    std::slice::from_raw_parts(base.add(field_start), field_end - field_start),
823                );
824            }
825            first_output = false;
826        }
827    }
828
829    unsafe { buf_push(buf, line_delim) };
830}
831
832// ── Fast path: field extraction with batched output ──────────────────────
833
834/// Optimized field extraction with early exit and batched output.
835fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
836    let delim = cfg.delim;
837    let line_delim = cfg.line_delim;
838    let ranges = cfg.ranges;
839    let complement = cfg.complement;
840    let output_delim = cfg.output_delim;
841    let suppress = cfg.suppress_no_delim;
842
843    // NOTE: Removed the full-file `memchr(delim, data).is_none()` scan.
844    // That scan was O(N) over the entire file just to check an edge case
845    // (no delimiter in any line). The per-line processing already handles
846    // lines without delimiters correctly, so the scan was pure overhead
847    // for files that DO contain delimiters (the common case).
848
849    // Ultra-fast path: single field extraction (e.g., cut -f5)
850    if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
851        return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
852    }
853
854    // Fast path: complement of single field or contiguous range with default output delimiter.
855    if complement
856        && ranges.len() == 1
857        && output_delim.len() == 1
858        && output_delim[0] == delim
859        && ranges[0].start == ranges[0].end
860    {
861        return process_complement_single_field(
862            data,
863            delim,
864            line_delim,
865            ranges[0].start,
866            suppress,
867            out,
868        );
869    }
870
871    // Fast path: complement of contiguous range (e.g., --complement -f3-5 = output fields 1,2,6+).
872    // This is equivalent to outputting a prefix and a suffix, skipping the middle range.
873    if complement
874        && ranges.len() == 1
875        && ranges[0].start > 1
876        && ranges[0].end < usize::MAX
877        && output_delim.len() == 1
878        && output_delim[0] == delim
879    {
880        return process_complement_range(
881            data,
882            delim,
883            line_delim,
884            ranges[0].start,
885            ranges[0].end,
886            suppress,
887            out,
888        );
889    }
890
891    // Fast path: contiguous from-start field range (e.g., cut -f1-5)
892    if !complement
893        && ranges.len() == 1
894        && ranges[0].start == 1
895        && output_delim.len() == 1
896        && output_delim[0] == delim
897        && ranges[0].end < usize::MAX
898    {
899        return process_fields_prefix(data, delim, line_delim, ranges[0].end, suppress, out);
900    }
901
902    // Fast path: open-ended field range from field N (e.g., cut -f3-)
903    if !complement
904        && ranges.len() == 1
905        && ranges[0].end == usize::MAX
906        && ranges[0].start > 1
907        && output_delim.len() == 1
908        && output_delim[0] == delim
909    {
910        return process_fields_suffix(data, delim, line_delim, ranges[0].start, suppress, out);
911    }
912
913    // Fast path: contiguous field range with start > 1 (e.g., cut -f2-4)
914    if !complement
915        && ranges.len() == 1
916        && ranges[0].start > 1
917        && ranges[0].end < usize::MAX
918        && output_delim.len() == 1
919        && output_delim[0] == delim
920    {
921        return process_fields_mid_range(
922            data,
923            delim,
924            line_delim,
925            ranges[0].start,
926            ranges[0].end,
927            suppress,
928            out,
929        );
930    }
931
932    // Fast path: multi-field non-contiguous extraction (e.g., cut -f1,3,5)
933    // Uses delimiter position caching: find all delimiter positions per line,
934    // then directly index into them for each selected field.
935    // This is faster than the general extract_fields_to_buf which re-checks
936    // is_selected() for every field encountered.
937    if !complement
938        && ranges.len() > 1
939        && ranges.last().map_or(false, |r| r.end < usize::MAX)
940        && output_delim.len() == 1
941        && output_delim[0] == delim
942        && delim != line_delim
943    {
944        return process_fields_multi_select(data, delim, line_delim, ranges, suppress, out);
945    }
946
947    // General field extraction
948    let max_field = if complement {
949        usize::MAX
950    } else {
951        ranges.last().map(|r| r.end).unwrap_or(0)
952    };
953    let field_mask = compute_field_mask(ranges, complement);
954
955    if data.len() >= PARALLEL_THRESHOLD {
956        let chunks = split_for_scope(data, line_delim);
957        let n = chunks.len();
958        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
959        rayon::scope(|s| {
960            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
961                s.spawn(move |_| {
962                    result.reserve(chunk.len() + 1);
963                    process_fields_chunk(
964                        chunk,
965                        delim,
966                        ranges,
967                        output_delim,
968                        suppress,
969                        max_field,
970                        field_mask,
971                        line_delim,
972                        complement,
973                        result,
974                    );
975                });
976            }
977        });
978        let slices: Vec<IoSlice> = results
979            .iter()
980            .filter(|r| !r.is_empty())
981            .map(|r| IoSlice::new(r))
982            .collect();
983        write_ioslices(out, &slices)?;
984    } else {
985        process_chunked(data, line_delim, out, |chunk, buf| {
986            process_fields_chunk(
987                chunk,
988                delim,
989                ranges,
990                output_delim,
991                suppress,
992                max_field,
993                field_mask,
994                line_delim,
995                complement,
996                buf,
997            );
998        })?;
999    }
1000    Ok(())
1001}
1002
1003/// Process a chunk of data for general field extraction.
1004/// Uses two-level scanning: outer memchr(newline) for line boundaries, inner
1005/// memchr_iter(delim) for delimiter positions. This is faster than memchr2 single-pass
1006/// because memchr (one needle) is ~30-50% faster per byte than memchr2 (two needles).
1007fn process_fields_chunk(
1008    data: &[u8],
1009    delim: u8,
1010    ranges: &[Range],
1011    output_delim: &[u8],
1012    suppress: bool,
1013    max_field: usize,
1014    field_mask: u64,
1015    line_delim: u8,
1016    complement: bool,
1017    buf: &mut Vec<u8>,
1018) {
1019    // Always use two-level approach: outer memchr(newline) + inner memchr_iter(delim).
1020    // Even for complement/unbounded ranges, two-level is faster because memchr is
1021    // ~30-50% faster per byte than memchr2. The per-line function call overhead
1022    // is negligible compared to the SIMD scan savings.
1023    if delim != line_delim {
1024        buf.reserve(data.len());
1025        let mut start = 0;
1026        for end_pos in memchr_iter(line_delim, data) {
1027            let line = &data[start..end_pos];
1028            extract_fields_to_buf(
1029                line,
1030                delim,
1031                ranges,
1032                output_delim,
1033                suppress,
1034                max_field,
1035                field_mask,
1036                line_delim,
1037                buf,
1038                complement,
1039            );
1040            start = end_pos + 1;
1041        }
1042        if start < data.len() {
1043            extract_fields_to_buf(
1044                &data[start..],
1045                delim,
1046                ranges,
1047                output_delim,
1048                suppress,
1049                max_field,
1050                field_mask,
1051                line_delim,
1052                buf,
1053                complement,
1054            );
1055        }
1056        return;
1057    }
1058
1059    // Fallback: when delim == line_delim, use the two-level scan approach
1060    let mut start = 0;
1061    for end_pos in memchr_iter(line_delim, data) {
1062        let line = &data[start..end_pos];
1063        extract_fields_to_buf(
1064            line,
1065            delim,
1066            ranges,
1067            output_delim,
1068            suppress,
1069            max_field,
1070            field_mask,
1071            line_delim,
1072            buf,
1073            complement,
1074        );
1075        start = end_pos + 1;
1076    }
1077    if start < data.len() {
1078        extract_fields_to_buf(
1079            &data[start..],
1080            delim,
1081            ranges,
1082            output_delim,
1083            suppress,
1084            max_field,
1085            field_mask,
1086            line_delim,
1087            buf,
1088            complement,
1089        );
1090    }
1091}
1092
1093// ── Ultra-fast single field extraction ───────────────────────────────────
1094
1095/// Specialized path for extracting exactly one field (e.g., `cut -f5`).
1096/// Uses two-level scanning: outer memchr(newline) for line boundaries, inner
1097/// memchr(delim) for the field delimiter with early exit.
1098fn process_single_field(
1099    data: &[u8],
1100    delim: u8,
1101    line_delim: u8,
1102    target: usize,
1103    suppress: bool,
1104    out: &mut impl Write,
1105) -> io::Result<()> {
1106    let target_idx = target - 1;
1107
1108    if delim != line_delim {
1109        // Field 1 fast path: two-level scan (outer newline + inner first-delim).
1110        // For field 1, only needs to find the first delimiter per line.
1111        // Lines without delimiter are tracked as contiguous runs for bulk copy.
1112        if target_idx == 0 && !suppress {
1113            if data.len() >= PARALLEL_THRESHOLD {
1114                return single_field1_parallel(data, delim, line_delim, out);
1115            }
1116            return process_chunked(data, line_delim, out, |chunk, buf| {
1117                single_field1_to_buf(chunk, delim, line_delim, buf);
1118            });
1119        }
1120
1121        // Two-level approach for field N: outer newline scan + inner delim scan
1122        // with early exit at target_idx. Faster than memchr2 single-pass because
1123        // we only scan delimiters up to target_idx per line (not all of them).
1124        if data.len() >= PARALLEL_THRESHOLD {
1125            let chunks = split_for_scope(data, line_delim);
1126            let n = chunks.len();
1127            let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1128            rayon::scope(|s| {
1129                for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1130                    s.spawn(move |_| {
1131                        result.reserve(chunk.len() / 2);
1132                        process_single_field_chunk(
1133                            chunk, delim, target_idx, line_delim, suppress, result,
1134                        );
1135                    });
1136                }
1137            });
1138            let slices: Vec<IoSlice> = results
1139                .iter()
1140                .filter(|r| !r.is_empty())
1141                .map(|r| IoSlice::new(r))
1142                .collect();
1143            write_ioslices(out, &slices)?;
1144        } else {
1145            let mut buf = Vec::with_capacity(data.len().min(4 * 1024 * 1024));
1146            process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
1147            if !buf.is_empty() {
1148                out.write_all(&buf)?;
1149            }
1150        }
1151        return Ok(());
1152    }
1153
1154    // Fallback for delim == line_delim: nested loop approach
1155    if data.len() >= PARALLEL_THRESHOLD {
1156        let chunks = split_for_scope(data, line_delim);
1157        let n = chunks.len();
1158        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1159        rayon::scope(|s| {
1160            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1161                s.spawn(move |_| {
1162                    result.reserve(chunk.len() / 4);
1163                    process_single_field_chunk(
1164                        chunk, delim, target_idx, line_delim, suppress, result,
1165                    );
1166                });
1167            }
1168        });
1169        let slices: Vec<IoSlice> = results
1170            .iter()
1171            .filter(|r| !r.is_empty())
1172            .map(|r| IoSlice::new(r))
1173            .collect();
1174        write_ioslices(out, &slices)?;
1175    } else {
1176        let mut buf = Vec::with_capacity(data.len() / 4);
1177        process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
1178        if !buf.is_empty() {
1179            out.write_all(&buf)?;
1180        }
1181    }
1182    Ok(())
1183}
1184
1185/// Complement range extraction: skip fields start..=end, output rest (e.g., --complement -f3-5).
1186/// For each line: output fields 1..start-1, then fields end+1..EOF, skipping fields start..end.
1187fn process_complement_range(
1188    data: &[u8],
1189    delim: u8,
1190    line_delim: u8,
1191    skip_start: usize,
1192    skip_end: usize,
1193    suppress: bool,
1194    out: &mut impl Write,
1195) -> io::Result<()> {
1196    if data.len() >= PARALLEL_THRESHOLD {
1197        let chunks = split_for_scope(data, line_delim);
1198        let n = chunks.len();
1199        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1200        rayon::scope(|s| {
1201            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1202                s.spawn(move |_| {
1203                    result.reserve(chunk.len());
1204                    complement_range_chunk(
1205                        chunk, delim, skip_start, skip_end, line_delim, suppress, result,
1206                    );
1207                });
1208            }
1209        });
1210        let slices: Vec<IoSlice> = results
1211            .iter()
1212            .filter(|r| !r.is_empty())
1213            .map(|r| IoSlice::new(r))
1214            .collect();
1215        write_ioslices(out, &slices)?;
1216    } else {
1217        process_chunked(data, line_delim, out, |chunk, buf| {
1218            complement_range_chunk(
1219                chunk, delim, skip_start, skip_end, line_delim, suppress, buf,
1220            );
1221        })?;
1222    }
1223    Ok(())
1224}
1225
1226/// Process a chunk for complement range extraction.
1227fn complement_range_chunk(
1228    data: &[u8],
1229    delim: u8,
1230    skip_start: usize,
1231    skip_end: usize,
1232    line_delim: u8,
1233    suppress: bool,
1234    buf: &mut Vec<u8>,
1235) {
1236    // Pre-reserve entire chunk capacity to eliminate per-line reserve overhead.
1237    buf.reserve(data.len());
1238    let mut start = 0;
1239    for end_pos in memchr_iter(line_delim, data) {
1240        let line = &data[start..end_pos];
1241        complement_range_line(line, delim, skip_start, skip_end, line_delim, suppress, buf);
1242        start = end_pos + 1;
1243    }
1244    if start < data.len() {
1245        complement_range_line(
1246            &data[start..],
1247            delim,
1248            skip_start,
1249            skip_end,
1250            line_delim,
1251            suppress,
1252            buf,
1253        );
1254    }
1255}
1256
1257/// Extract all fields except skip_start..=skip_end from one line.
1258/// Outputs fields 1..skip_start-1, then fields skip_end+1..EOF.
1259///
1260/// Optimized: only scans for enough delimiters to find the skip region boundaries.
1261/// For `--complement -f3-5` with 20 fields, this finds delimiter 2 and 5, then
1262/// does a single copy of prefix + suffix, avoiding scanning past field 5.
1263#[inline(always)]
1264fn complement_range_line(
1265    line: &[u8],
1266    delim: u8,
1267    skip_start: usize,
1268    skip_end: usize,
1269    line_delim: u8,
1270    suppress: bool,
1271    buf: &mut Vec<u8>,
1272) {
1273    let len = line.len();
1274    if len == 0 {
1275        if !suppress {
1276            unsafe { buf_push(buf, line_delim) };
1277        }
1278        return;
1279    }
1280
1281    // Note: no per-line buf.reserve — complement_range_chunk already reserves data.len()
1282    let base = line.as_ptr();
1283
1284    // 1-based field numbers. To skip fields skip_start..=skip_end:
1285    // - prefix_end = position of (skip_start-1)th delimiter (exclusive; end of prefix fields)
1286    // - suffix_start = position after skip_end-th delimiter (inclusive; start of suffix fields)
1287    //
1288    // Find the first (skip_start - 1) delimiters to locate prefix_end,
1289    // then the next (skip_end - skip_start + 1) delimiters to locate suffix_start.
1290
1291    let need_prefix_delims = skip_start - 1; // number of delimiters before the skip region
1292    let need_skip_delims = skip_end - skip_start + 1; // delimiters within the skip region
1293    let total_need = need_prefix_delims + need_skip_delims;
1294
1295    // Find delimiter positions up to total_need
1296    let mut delim_count: usize = 0;
1297    let mut prefix_end_pos: usize = usize::MAX; // byte position of (skip_start-1)th delim
1298    let mut suffix_start_pos: usize = usize::MAX; // byte position after skip_end-th delim
1299
1300    for pos in memchr_iter(delim, line) {
1301        delim_count += 1;
1302        if delim_count == need_prefix_delims {
1303            prefix_end_pos = pos;
1304        }
1305        if delim_count == total_need {
1306            suffix_start_pos = pos + 1;
1307            break;
1308        }
1309    }
1310
1311    if delim_count == 0 {
1312        // No delimiter at all
1313        if !suppress {
1314            unsafe {
1315                buf_extend(buf, line);
1316                buf_push(buf, line_delim);
1317            }
1318        }
1319        return;
1320    }
1321
1322    // Case analysis:
1323    // 1. Not enough delims to reach skip_start: all fields are before skip region, output all
1324    // 2. Enough to reach skip_start but not skip_end: prefix + no suffix
1325    // 3. Enough to reach skip_end: prefix + delim + suffix
1326
1327    if delim_count < need_prefix_delims {
1328        // Not enough fields to reach skip region — output entire line
1329        unsafe {
1330            buf_extend(buf, line);
1331            buf_push(buf, line_delim);
1332        }
1333        return;
1334    }
1335
1336    let has_prefix = need_prefix_delims > 0;
1337    let has_suffix = suffix_start_pos != usize::MAX && suffix_start_pos < len;
1338
1339    if has_prefix && has_suffix {
1340        // Output: prefix (up to prefix_end_pos) + delim + suffix (from suffix_start_pos)
1341        unsafe {
1342            buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
1343            buf_push(buf, delim);
1344            buf_extend(
1345                buf,
1346                std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
1347            );
1348            buf_push(buf, line_delim);
1349        }
1350    } else if has_prefix {
1351        // Only prefix, no suffix (skip region extends to end of line)
1352        unsafe {
1353            buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
1354            buf_push(buf, line_delim);
1355        }
1356    } else if has_suffix {
1357        // No prefix (skip_start == 1), only suffix
1358        unsafe {
1359            buf_extend(
1360                buf,
1361                std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
1362            );
1363            buf_push(buf, line_delim);
1364        }
1365    } else {
1366        // All fields skipped
1367        unsafe { buf_push(buf, line_delim) };
1368    }
1369}
1370
1371/// Complement single-field extraction: skip one field, output rest unchanged.
1372fn process_complement_single_field(
1373    data: &[u8],
1374    delim: u8,
1375    line_delim: u8,
1376    skip_field: usize,
1377    suppress: bool,
1378    out: &mut impl Write,
1379) -> io::Result<()> {
1380    let skip_idx = skip_field - 1;
1381
1382    if data.len() >= PARALLEL_THRESHOLD {
1383        let chunks = split_for_scope(data, line_delim);
1384        let n = chunks.len();
1385        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1386        rayon::scope(|s| {
1387            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1388                s.spawn(move |_| {
1389                    result.reserve(chunk.len());
1390                    complement_single_field_chunk(
1391                        chunk, delim, skip_idx, line_delim, suppress, result,
1392                    );
1393                });
1394            }
1395        });
1396        let slices: Vec<IoSlice> = results
1397            .iter()
1398            .filter(|r| !r.is_empty())
1399            .map(|r| IoSlice::new(r))
1400            .collect();
1401        write_ioslices(out, &slices)?;
1402    } else {
1403        process_chunked(data, line_delim, out, |chunk, buf| {
1404            complement_single_field_chunk(chunk, delim, skip_idx, line_delim, suppress, buf);
1405        })?;
1406    }
1407    Ok(())
1408}
1409
1410/// Process a chunk for complement single-field extraction using two-level scanning.
1411/// Outer memchr(newline) for line boundaries, inner memchr_iter(delim) with early exit
1412/// after finding the skip field's bounding delimiters. Faster than memchr2 single-pass
1413/// because memchr is faster per byte and inner scan exits early.
1414fn complement_single_field_chunk(
1415    data: &[u8],
1416    delim: u8,
1417    skip_idx: usize,
1418    line_delim: u8,
1419    suppress: bool,
1420    buf: &mut Vec<u8>,
1421) {
1422    buf.reserve(data.len());
1423    let mut start = 0;
1424    for end_pos in memchr_iter(line_delim, data) {
1425        let line = &data[start..end_pos];
1426        complement_single_field_line(line, delim, skip_idx, line_delim, suppress, buf);
1427        start = end_pos + 1;
1428    }
1429    if start < data.len() {
1430        complement_single_field_line(&data[start..], delim, skip_idx, line_delim, suppress, buf);
1431    }
1432}
1433
/// Per-line worker for complement single-field extraction: skip exactly one
/// field (0-based `skip_idx`), emit the remaining fields joined by `delim` and
/// terminated by `line_delim`. Called by complement_single_field_chunk for
/// every line (not only when delim == line_delim, despite the old comment).
///
/// Scans only as many delimiters as needed to bracket the skipped field, then
/// performs at most two bulk copies (prefix before the field, suffix after it).
#[inline(always)]
fn complement_single_field_line(
    line: &[u8],
    delim: u8,
    skip_idx: usize,
    line_delim: u8,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    let len = line.len();
    if len == 0 {
        // Empty line: emit a bare terminator unless -s suppresses it.
        if !suppress {
            unsafe { buf_push(buf, line_delim) };
        }
        return;
    }

    let base = line.as_ptr();
    // Delimiters to pass before the skipped field, and the one that ends it.
    let need_before = skip_idx;
    let need_total = skip_idx + 1;

    let mut delim_count: usize = 0;
    let mut skip_start_pos: usize = 0; // byte offset where the skipped field starts
    let mut skip_end_pos: usize = len; // byte offset of the delimiter ending it
    let mut found_end = false;

    // Early-exit scan: stop as soon as the delimiter after the skipped field is found.
    for pos in memchr_iter(delim, line) {
        delim_count += 1;
        if delim_count == need_before {
            skip_start_pos = pos + 1;
        }
        if delim_count == need_total {
            skip_end_pos = pos;
            found_end = true;
            break;
        }
    }

    if delim_count == 0 {
        // No delimiter at all: -s suppresses the line, otherwise pass it through.
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }

    if delim_count < need_before {
        // Fewer fields than skip_idx: the skipped field doesn't exist, keep the line.
        unsafe {
            buf_extend(buf, line);
            buf_push(buf, line_delim);
        }
        return;
    }

    // Prefix = fields before the skipped one; suffix = fields after it.
    // skip_end_pos < len keeps a trailing *empty* suffix field ("a:b:" -f2 => "a:").
    let has_prefix = skip_idx > 0 && skip_start_pos > 0;
    let has_suffix = found_end && skip_end_pos < len;

    if has_prefix && has_suffix {
        // prefix + joining delimiter + suffix (skip_start_pos - 1 excludes the
        // delimiter preceding the skipped field; it is re-emitted as the joiner).
        unsafe {
            buf_extend(buf, std::slice::from_raw_parts(base, skip_start_pos - 1));
            buf_push(buf, delim);
            buf_extend(
                buf,
                std::slice::from_raw_parts(base.add(skip_end_pos + 1), len - skip_end_pos - 1),
            );
            buf_push(buf, line_delim);
        }
    } else if has_prefix {
        // Skipped field is the last one: only the prefix remains.
        unsafe {
            buf_extend(buf, std::slice::from_raw_parts(base, skip_start_pos - 1));
            buf_push(buf, line_delim);
        }
    } else if has_suffix {
        // Skipped field is the first one: only the suffix remains.
        unsafe {
            buf_extend(
                buf,
                std::slice::from_raw_parts(base.add(skip_end_pos + 1), len - skip_end_pos - 1),
            );
            buf_push(buf, line_delim);
        }
    } else {
        // The line consisted of the skipped field alone: emit an empty line.
        unsafe { buf_push(buf, line_delim) };
    }
}
1521
1522/// Contiguous from-start field range extraction (e.g., `cut -f1-5`).
1523/// Zero-copy for the non-parallel path: identifies the truncation point per line
1524/// and writes contiguous runs directly from the source data.
1525fn process_fields_prefix(
1526    data: &[u8],
1527    delim: u8,
1528    line_delim: u8,
1529    last_field: usize,
1530    suppress: bool,
1531    out: &mut impl Write,
1532) -> io::Result<()> {
1533    if data.len() >= PARALLEL_THRESHOLD {
1534        let chunks = split_for_scope(data, line_delim);
1535        let n = chunks.len();
1536        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1537        rayon::scope(|s| {
1538            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1539                s.spawn(move |_| {
1540                    result.reserve(chunk.len());
1541                    fields_prefix_chunk(chunk, delim, line_delim, last_field, suppress, result);
1542                });
1543            }
1544        });
1545        let slices: Vec<IoSlice> = results
1546            .iter()
1547            .filter(|r| !r.is_empty())
1548            .map(|r| IoSlice::new(r))
1549            .collect();
1550        write_ioslices(out, &slices)?;
1551    } else if !suppress {
1552        // Zero-copy fast path: scan for truncation points, write runs from source.
1553        // When suppress is false, every line is output (with or without delimiter).
1554        // Most lines have enough fields, so the output is often identical to input.
1555        fields_prefix_zerocopy(data, delim, line_delim, last_field, out)?;
1556    } else {
1557        process_chunked(data, line_delim, out, |chunk, buf| {
1558            fields_prefix_chunk(chunk, delim, line_delim, last_field, suppress, buf);
1559        })?;
1560    }
1561    Ok(())
1562}
1563
/// Zero-copy field-prefix extraction using writev: builds IoSlice entries pointing
/// directly into the source data, flushing in MAX_IOV-sized batches.
/// For lines where the Nth delimiter exists, we truncate at that point.
/// For lines with fewer fields, we output them unchanged (contiguous run).
/// Lines without any delimiter are output unchanged (suppress=false assumed).
#[inline]
fn fields_prefix_zerocopy(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    last_field: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    // Single shared terminator byte; every newline IoSlice points at it.
    let newline_buf: [u8; 1] = [line_delim];
    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
    let mut start = 0;
    // Start of the current run of untouched lines, emitted as one big slice.
    let mut run_start: usize = 0;

    for end_pos in memchr_iter(line_delim, data) {
        let line = &data[start..end_pos];
        // Find the delimiter that terminates field `last_field`, if present.
        let mut field_count = 1;
        let mut truncate_at: Option<usize> = None;
        for dpos in memchr_iter(delim, line) {
            if field_count >= last_field {
                truncate_at = Some(start + dpos);
                break;
            }
            field_count += 1;
        }

        if let Some(trunc_pos) = truncate_at {
            // Flush the pending run of unmodified lines before this one...
            if run_start < start {
                iov.push(IoSlice::new(&data[run_start..start]));
            }
            // ...then the truncated line plus its terminator.
            iov.push(IoSlice::new(&data[start..trunc_pos]));
            iov.push(IoSlice::new(&newline_buf));
            run_start = end_pos + 1;

            // Keep headroom: up to 3 slices are pushed per truncated line.
            if iov.len() >= MAX_IOV - 2 {
                write_ioslices(out, &iov)?;
                iov.clear();
            }
        }
        start = end_pos + 1;
    }
    // Handle last line without terminator
    if start < data.len() {
        let line = &data[start..];
        let mut field_count = 1;
        let mut truncate_at: Option<usize> = None;
        for dpos in memchr_iter(delim, line) {
            if field_count >= last_field {
                truncate_at = Some(start + dpos);
                break;
            }
            field_count += 1;
        }
        if let Some(trunc_pos) = truncate_at {
            // Truncated final line: flush everything here and return early so
            // the run-flush below doesn't re-emit these bytes.
            if run_start < start {
                iov.push(IoSlice::new(&data[run_start..start]));
            }
            iov.push(IoSlice::new(&data[start..trunc_pos]));
            iov.push(IoSlice::new(&newline_buf));
            if !iov.is_empty() {
                write_ioslices(out, &iov)?;
            }
            return Ok(());
        }
    }
    // Flush remaining contiguous run
    if run_start < data.len() {
        iov.push(IoSlice::new(&data[run_start..]));
        // Restore the final terminator if the input didn't end with one.
        if !data.is_empty() && *data.last().unwrap() != line_delim {
            iov.push(IoSlice::new(&newline_buf));
        }
    }
    if !iov.is_empty() {
        write_ioslices(out, &iov)?;
    }
    Ok(())
}
1645
1646/// Process a chunk for contiguous from-start field range extraction.
1647fn fields_prefix_chunk(
1648    data: &[u8],
1649    delim: u8,
1650    line_delim: u8,
1651    last_field: usize,
1652    suppress: bool,
1653    buf: &mut Vec<u8>,
1654) {
1655    buf.reserve(data.len());
1656    let mut start = 0;
1657    for end_pos in memchr_iter(line_delim, data) {
1658        let line = &data[start..end_pos];
1659        fields_prefix_line(line, delim, line_delim, last_field, suppress, buf);
1660        start = end_pos + 1;
1661    }
1662    if start < data.len() {
1663        fields_prefix_line(&data[start..], delim, line_delim, last_field, suppress, buf);
1664    }
1665}
1666
1667/// Extract first N fields from one line (contiguous from-start range).
1668/// Uses memchr SIMD for delimiter scanning on all line sizes.
1669#[inline(always)]
1670fn fields_prefix_line(
1671    line: &[u8],
1672    delim: u8,
1673    line_delim: u8,
1674    last_field: usize,
1675    suppress: bool,
1676    buf: &mut Vec<u8>,
1677) {
1678    let len = line.len();
1679    if len == 0 {
1680        if !suppress {
1681            unsafe { buf_push(buf, line_delim) };
1682        }
1683        return;
1684    }
1685
1686    // Note: no per-line buf.reserve — fields_prefix_chunk already reserves data.len()
1687    let base = line.as_ptr();
1688
1689    let mut field_count = 1usize;
1690    let mut has_delim = false;
1691
1692    for pos in memchr_iter(delim, line) {
1693        has_delim = true;
1694        if field_count >= last_field {
1695            unsafe {
1696                buf_extend(buf, std::slice::from_raw_parts(base, pos));
1697                buf_push(buf, line_delim);
1698            }
1699            return;
1700        }
1701        field_count += 1;
1702    }
1703
1704    if !has_delim {
1705        if !suppress {
1706            unsafe {
1707                buf_extend(buf, line);
1708                buf_push(buf, line_delim);
1709            }
1710        }
1711        return;
1712    }
1713
1714    unsafe {
1715        buf_extend(buf, line);
1716        buf_push(buf, line_delim);
1717    }
1718}
1719
1720/// Open-ended field suffix extraction (e.g., `cut -f3-`).
1721fn process_fields_suffix(
1722    data: &[u8],
1723    delim: u8,
1724    line_delim: u8,
1725    start_field: usize,
1726    suppress: bool,
1727    out: &mut impl Write,
1728) -> io::Result<()> {
1729    if data.len() >= PARALLEL_THRESHOLD {
1730        let chunks = split_for_scope(data, line_delim);
1731        let n = chunks.len();
1732        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1733        rayon::scope(|s| {
1734            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1735                s.spawn(move |_| {
1736                    result.reserve(chunk.len());
1737                    fields_suffix_chunk(chunk, delim, line_delim, start_field, suppress, result);
1738                });
1739            }
1740        });
1741        let slices: Vec<IoSlice> = results
1742            .iter()
1743            .filter(|r| !r.is_empty())
1744            .map(|r| IoSlice::new(r))
1745            .collect();
1746        write_ioslices(out, &slices)?;
1747    } else {
1748        process_chunked(data, line_delim, out, |chunk, buf| {
1749            fields_suffix_chunk(chunk, delim, line_delim, start_field, suppress, buf);
1750        })?;
1751    }
1752    Ok(())
1753}
1754
1755/// Process a chunk for open-ended field suffix extraction.
1756fn fields_suffix_chunk(
1757    data: &[u8],
1758    delim: u8,
1759    line_delim: u8,
1760    start_field: usize,
1761    suppress: bool,
1762    buf: &mut Vec<u8>,
1763) {
1764    buf.reserve(data.len());
1765    let mut start = 0;
1766    for end_pos in memchr_iter(line_delim, data) {
1767        let line = &data[start..end_pos];
1768        fields_suffix_line(line, delim, line_delim, start_field, suppress, buf);
1769        start = end_pos + 1;
1770    }
1771    if start < data.len() {
1772        fields_suffix_line(
1773            &data[start..],
1774            delim,
1775            line_delim,
1776            start_field,
1777            suppress,
1778            buf,
1779        );
1780    }
1781}
1782
/// Extract fields from start_field to end from one line.
/// Uses memchr SIMD for delimiter scanning on all line sizes.
///
/// Output appended to `buf` via unchecked writes — the caller must have
/// reserved enough capacity (see `fields_suffix_chunk`). Cases:
/// - empty line: a lone `line_delim`, unless `suppress`
/// - line with >= start_field fields: everything after the
///   (start_field - 1)-th delimiter, plus `line_delim`
/// - line with no delimiter at all: whole line + `line_delim`, unless
///   `suppress` (GNU cut's -s drops delimiter-free lines)
/// - line with delimiters but fewer than start_field fields: empty output
///   line (just `line_delim`) — GNU cut does not apply -s once a delimiter
///   has been seen
///
/// NOTE(review): for start_field == 1, skip_delims == 0 and the FIRST
/// delimiter already satisfies `delim_count >= skip_delims`, emitting only
/// the suffix after it and dropping field 1. This assumes callers route
/// `-f1-` through a passthrough path and only call this with
/// start_field >= 2 — confirm at call sites.
#[inline(always)]
fn fields_suffix_line(
    line: &[u8],
    delim: u8,
    line_delim: u8,
    start_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    let len = line.len();
    if len == 0 {
        if !suppress {
            unsafe { buf_push(buf, line_delim) };
        }
        return;
    }

    // Note: no per-line buf.reserve — fields_suffix_chunk already reserves the
    // whole chunk's worth of capacity up front.
    let base = line.as_ptr();

    // Number of delimiters preceding the first wanted field.
    let skip_delims = start_field - 1;
    let mut delim_count = 0usize;
    let mut has_delim = false;

    for pos in memchr_iter(delim, line) {
        has_delim = true;
        delim_count += 1;
        if delim_count >= skip_delims {
            // Reached the delimiter just before start_field: emit the rest of
            // the line after it, then the line terminator, and stop scanning.
            unsafe {
                buf_extend(
                    buf,
                    std::slice::from_raw_parts(base.add(pos + 1), len - pos - 1),
                );
                buf_push(buf, line_delim);
            }
            return;
        }
    }

    if !has_delim {
        // Delimiter-free line: pass through unchanged unless -s was given.
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }

    // Fewer delimiters than needed — emit an empty output line (GNU compat).
    unsafe { buf_push(buf, line_delim) };
}
1837
1838/// Contiguous mid-range field extraction (e.g., `cut -f2-4`).
1839/// Optimized: skip to start_field using memchr, then output until end_field.
1840fn process_fields_mid_range(
1841    data: &[u8],
1842    delim: u8,
1843    line_delim: u8,
1844    start_field: usize,
1845    end_field: usize,
1846    suppress: bool,
1847    out: &mut impl Write,
1848) -> io::Result<()> {
1849    if data.len() >= PARALLEL_THRESHOLD {
1850        let chunks = split_for_scope(data, line_delim);
1851        let n = chunks.len();
1852        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1853        rayon::scope(|s| {
1854            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1855                s.spawn(move |_| {
1856                    result.reserve(chunk.len());
1857                    fields_mid_range_chunk(
1858                        chunk,
1859                        delim,
1860                        line_delim,
1861                        start_field,
1862                        end_field,
1863                        suppress,
1864                        result,
1865                    );
1866                });
1867            }
1868        });
1869        let slices: Vec<IoSlice> = results
1870            .iter()
1871            .filter(|r| !r.is_empty())
1872            .map(|r| IoSlice::new(r))
1873            .collect();
1874        write_ioslices(out, &slices)?;
1875    } else {
1876        process_chunked(data, line_delim, out, |chunk, buf| {
1877            fields_mid_range_chunk(
1878                chunk,
1879                delim,
1880                line_delim,
1881                start_field,
1882                end_field,
1883                suppress,
1884                buf,
1885            );
1886        })?;
1887    }
1888    Ok(())
1889}
1890
1891/// Process a chunk for contiguous mid-range field extraction.
1892/// Two-level scan: outer memchr(newline) for line boundaries, inner memchr_iter(delim)
1893/// with early exit at target_end_delim. Faster than memchr2 single-pass because
1894/// memchr is faster per byte and inner scan exits early.
1895fn fields_mid_range_chunk(
1896    data: &[u8],
1897    delim: u8,
1898    line_delim: u8,
1899    start_field: usize,
1900    end_field: usize,
1901    suppress: bool,
1902    buf: &mut Vec<u8>,
1903) {
1904    buf.reserve(data.len());
1905    let mut start = 0;
1906    for end_pos in memchr_iter(line_delim, data) {
1907        let line = &data[start..end_pos];
1908        fields_mid_range_line(
1909            line,
1910            delim,
1911            line_delim,
1912            start_field,
1913            end_field,
1914            suppress,
1915            buf,
1916        );
1917        start = end_pos + 1;
1918    }
1919    if start < data.len() {
1920        fields_mid_range_line(
1921            &data[start..],
1922            delim,
1923            line_delim,
1924            start_field,
1925            end_field,
1926            suppress,
1927            buf,
1928        );
1929    }
1930}
1931
/// Extract fields start_field..=end_field from one line, appending the
/// selected byte range plus a line terminator to `buf`.
/// Uses memchr_iter for delimiter scanning; raw pointer arithmetic
/// eliminates per-field bounds checking.
///
/// Output appended via unchecked writes — the caller must pre-reserve
/// capacity (see fields_mid_range_chunk). Cases:
/// - empty line: lone `line_delim`, unless `suppress`
/// - enough fields: bytes of fields start_field..=end_field + `line_delim`
/// - delimiters present, range begins but runs past end of line:
///   from start_field to end of line + `line_delim`
/// - no delimiter at all: whole line + `line_delim`, unless `suppress`
/// - delimiters present but fewer fields than start_field: empty output
///   line (GNU cut does not apply -s once a delimiter is seen)
#[inline(always)]
fn fields_mid_range_line(
    line: &[u8],
    delim: u8,
    line_delim: u8,
    start_field: usize,
    end_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    let len = line.len();
    if len == 0 {
        if !suppress {
            unsafe { buf_push(buf, line_delim) };
        }
        return;
    }

    // Note: no per-line buf.reserve — fields_mid_range_chunk already reserves
    // capacity for the whole chunk.
    let base = line.as_ptr();

    // Count delimiters to find start_field and end_field boundaries
    let skip_before = start_field - 1; // delimiters to skip before start_field
    let field_span = end_field - start_field; // additional delimiters within the range
    let target_end_delim = skip_before + field_span + 1;
    let mut delim_count = 0;
    let mut range_start = 0;
    let mut has_delim = false;

    for pos in memchr_iter(delim, line) {
        has_delim = true;
        delim_count += 1;
        if delim_count == skip_before {
            // Delimiter just before start_field: range begins after it.
            range_start = pos + 1;
        }
        if delim_count == target_end_delim {
            // NOTE(review): when skip_before == 0, delim_count (>= 1) can
            // never equal it, so range_start is still 0 here — this reset
            // is redundant but harmless.
            if skip_before == 0 {
                range_start = 0;
            }
            unsafe {
                buf_extend(
                    buf,
                    std::slice::from_raw_parts(base.add(range_start), pos - range_start),
                );
                buf_push(buf, line_delim);
            }
            return;
        }
    }

    if !has_delim {
        // Delimiter-free line: pass through unchanged unless -s was given.
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }

    // Line has delimiters but fewer fields than end_field
    if delim_count >= skip_before {
        // We have at least start_field, output from range_start to end
        if skip_before == 0 {
            range_start = 0;
        }
        unsafe {
            buf_extend(
                buf,
                std::slice::from_raw_parts(base.add(range_start), len - range_start),
            );
            buf_push(buf, line_delim);
        }
    } else {
        // Not enough fields even for start_field — output empty line
        unsafe { buf_push(buf, line_delim) };
    }
}
2013
/// Parallel field-1 extraction for large data.
/// Splits the input into per-thread chunks on line boundaries; each chunk
/// extracts field 1 via `single_field1_to_buf` (two-level memchr scan: outer
/// memchr(newline) for line boundaries, inner memchr(delim) with early exit
/// at the first delimiter — see that function's doc for why this beats a
/// memchr2 single-pass). The per-chunk buffers are then flushed with a
/// single vectored write. With 4 workers the scan time drops roughly 4x.
2024fn single_field1_parallel(
2025    data: &[u8],
2026    delim: u8,
2027    line_delim: u8,
2028    out: &mut impl Write,
2029) -> io::Result<()> {
2030    let chunks = split_for_scope(data, line_delim);
2031    let n = chunks.len();
2032    let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2033    rayon::scope(|s| {
2034        for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2035            s.spawn(move |_| {
2036                result.reserve(chunk.len() + 1);
2037                single_field1_to_buf(chunk, delim, line_delim, result);
2038            });
2039        }
2040    });
2041    let slices: Vec<IoSlice> = results
2042        .iter()
2043        .filter(|r| !r.is_empty())
2044        .map(|r| IoSlice::new(r))
2045        .collect();
2046    write_ioslices(out, &slices)
2047}
2048
/// Extract field 1 from a chunk using two-level scanning: outer memchr(newline)
/// for line boundaries, inner memchr(delim) for the first delimiter per line.
///
/// This is faster than memchr2_iter single-pass because:
/// 1. memchr (one needle) is ~30-50% faster per byte than memchr2 (two needles)
/// 2. For field 1, the inner memchr exits after the FIRST delimiter, skipping
///    all subsequent delimiters on the line (huge win for multi-column CSV)
/// 3. Lines without delimiter produce contiguous runs that are bulk-copied
///
/// Uses a single output pointer to avoid per-line buf.len() load/store; the
/// pointer is reconciled with the Vec via set_len at the end.
///
/// Safety: relies on the reserve(data.len() + 1) below — every copy through
/// out_ptr is unchecked, and total output is bounded by input size plus one
/// appended terminator for a final unterminated line.
#[inline]
fn single_field1_to_buf(data: &[u8], delim: u8, line_delim: u8, buf: &mut Vec<u8>) {
    debug_assert_ne!(delim, line_delim, "delim and line_delim must differ");
    // Reserve data.len() + 1: output <= input for all lines except potentially
    // the last line without trailing newline, where we add a newline (GNU compat).
    buf.reserve(data.len() + 1);

    let base = data.as_ptr();
    let initial_len = buf.len();
    let mut out_ptr = unsafe { buf.as_mut_ptr().add(initial_len) };
    let mut start = 0;
    // Track the start of contiguous runs of no-delimiter lines for bulk copy.
    let mut run_start: usize = 0;
    // NOTE(review): in_run is initialized true and only ever set to true in
    // this function — it is never cleared, so the `!in_run` branch below and
    // the `in_run &&` guards are effectively dead. Candidate for
    // simplification; left as-is here.
    let mut in_run = true; // we start in a run

    for end_pos in memchr_iter(line_delim, data) {
        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
        match memchr::memchr(delim, line) {
            Some(dp) => {
                // Line has delimiter — flush contiguous run, output field1 + newline
                if in_run && run_start < start {
                    // Bulk copy the contiguous run of unchanged lines
                    let run_len = start - run_start;
                    unsafe {
                        std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
                        out_ptr = out_ptr.add(run_len);
                    }
                }
                // Output field (bytes before first delimiter) + newline
                unsafe {
                    std::ptr::copy_nonoverlapping(base.add(start), out_ptr, dp);
                    out_ptr = out_ptr.add(dp);
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
                run_start = end_pos + 1;
                in_run = true;
            }
            None => {
                // No delimiter — this line stays in the contiguous run
                if !in_run {
                    run_start = start;
                    in_run = true;
                }
            }
        }
        start = end_pos + 1;
    }

    // Flush any remaining contiguous run
    if in_run && run_start < start {
        let run_len = start - run_start;
        unsafe {
            std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
            out_ptr = out_ptr.add(run_len);
        }
    }

    // Handle last line without trailing newline
    if start < data.len() {
        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
        match memchr::memchr(delim, line) {
            Some(dp) => {
                // Field + trailing newline
                unsafe {
                    std::ptr::copy_nonoverlapping(base.add(start), out_ptr, dp);
                    out_ptr = out_ptr.add(dp);
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
            }
            None => {
                // No delimiter — output remaining data + newline (GNU compat)
                let len = data.len() - start;
                unsafe {
                    std::ptr::copy_nonoverlapping(base.add(start), out_ptr, len);
                    out_ptr = out_ptr.add(len);
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
            }
        }
    }

    // Reconcile the write pointer with the Vec's length. The debug_assert
    // checks we never wrote past the reserved capacity.
    unsafe {
        let new_len = out_ptr as usize - buf.as_ptr() as usize;
        debug_assert!(new_len >= initial_len && new_len <= buf.capacity());
        buf.set_len(new_len);
    }
}
2149
2150/// Zero-copy field 1 extraction using writev: builds IoSlice entries pointing
/// directly into the source data. Uses two-level scan: outer memchr(newline)
/// for line boundaries, inner memchr(delim) for the first delimiter. This is
/// faster than memchr2 for SMALL data because
2153/// the inner scan exits after the FIRST delimiter, skipping all
2154/// subsequent delimiters on the line.
2155///
2156/// Lines without delimiter stay in contiguous runs (zero-copy pass-through).
2157/// Lines with delimiter produce two IoSlices (truncated field + newline byte).
2158#[inline]
2159#[allow(dead_code)]
2160fn single_field1_zerocopy(
2161    data: &[u8],
2162    delim: u8,
2163    line_delim: u8,
2164    out: &mut impl Write,
2165) -> io::Result<()> {
2166    let newline_buf: [u8; 1] = [line_delim];
2167
2168    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
2169    let mut run_start: usize = 0;
2170    let mut start = 0;
2171
2172    for end_pos in memchr_iter(line_delim, data) {
2173        let line = &data[start..end_pos];
2174        if let Some(dp) = memchr::memchr(delim, line) {
2175            // Line has delimiter — truncate at first delimiter.
2176            // Flush current contiguous run, then add truncated field + newline.
2177            if run_start < start {
2178                iov.push(IoSlice::new(&data[run_start..start]));
2179            }
2180            iov.push(IoSlice::new(&data[start..start + dp]));
2181            iov.push(IoSlice::new(&newline_buf));
2182            run_start = end_pos + 1;
2183
2184            if iov.len() >= MAX_IOV - 2 {
2185                write_ioslices(out, &iov)?;
2186                iov.clear();
2187            }
2188        }
2189        // else: no delimiter in line, output unchanged (stays in contiguous run)
2190        start = end_pos + 1;
2191    }
2192
2193    // Handle last line (no trailing newline)
2194    if start < data.len() {
2195        let line = &data[start..];
2196        if let Some(dp) = memchr::memchr(delim, line) {
2197            if run_start < start {
2198                iov.push(IoSlice::new(&data[run_start..start]));
2199            }
2200            iov.push(IoSlice::new(&data[start..start + dp]));
2201            iov.push(IoSlice::new(&newline_buf));
2202            if !iov.is_empty() {
2203                write_ioslices(out, &iov)?;
2204            }
2205            return Ok(());
2206        }
2207    }
2208
2209    // Flush remaining contiguous run
2210    if run_start < data.len() {
2211        iov.push(IoSlice::new(&data[run_start..]));
2212        if !data.is_empty() && *data.last().unwrap() != line_delim {
2213            iov.push(IoSlice::new(&newline_buf));
2214        }
2215    }
2216    if !iov.is_empty() {
2217        write_ioslices(out, &iov)?;
2218    }
2219    Ok(())
2220}
2221
/// Process a chunk of data for single-field extraction using write-pointer pattern.
/// Two-level scan: outer memchr(newline), inner memchr_iter(delim) with early exit.
/// Uses contiguous run tracking for lines that pass through unchanged.
///
/// `target_idx` is the ZERO-based index of the wanted field (requesting
/// field N means target_idx == N - 1, as seen from the
/// `field_idx == target_idx` checks below). `suppress` mirrors cut's -s:
/// delimiter-free lines are dropped instead of passed through.
///
/// Safety: reserves data.len() + 1 up front and then writes through a raw
/// out_ptr with no further checks. Every line emits at most as many bytes as
/// it consumes; the +1 covers the terminator appended to a final
/// unterminated line.
fn process_single_field_chunk(
    data: &[u8],
    delim: u8,
    target_idx: usize,
    line_delim: u8,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    // Pre-reserve chunk capacity to eliminate per-line reserve overhead.
    buf.reserve(data.len() + 1);

    let base = data.as_ptr();
    let initial_len = buf.len();
    let mut out_ptr = unsafe { buf.as_mut_ptr().add(initial_len) };
    let mut start = 0;
    // Track contiguous runs of lines that output unchanged
    let mut run_start: usize = 0;
    let mut in_run = !suppress; // if suppress, no line passes through without delimiter

    for end_pos in memchr_iter(line_delim, data) {
        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
        let line_len = end_pos - start;

        if line_len == 0 {
            if !suppress {
                // Empty line passes through in the run
                if !in_run {
                    run_start = start;
                    in_run = true;
                }
            }
            // With suppress, empty lines produce no output at all.
            start = end_pos + 1;
            continue;
        }

        // Count delimiters up to target_idx to find the target field
        let mut field_start_offset = 0;
        let mut field_idx = 0;
        let mut found = false;
        let mut has_delim = false;

        for pos in memchr_iter(delim, line) {
            has_delim = true;
            if field_idx == target_idx {
                // Found the target field: line[field_start_offset..pos]
                // Flush run, output field + newline
                if in_run && run_start < start {
                    let run_len = start - run_start;
                    unsafe {
                        std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
                        out_ptr = out_ptr.add(run_len);
                    }
                }
                let field_len = pos - field_start_offset;
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        base.add(start + field_start_offset),
                        out_ptr,
                        field_len,
                    );
                    out_ptr = out_ptr.add(field_len);
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
                run_start = end_pos + 1;
                in_run = true;
                found = true;
                // Early exit: remaining delimiters on this line are irrelevant.
                break;
            }
            field_idx += 1;
            field_start_offset = pos + 1;
        }

        if !found {
            if !has_delim {
                // No delimiter in line
                if !suppress {
                    // Line passes through unchanged — stays in run
                    if !in_run {
                        run_start = start;
                        in_run = true;
                    }
                } else {
                    // Suppress: flush run, skip this line
                    if in_run && run_start < start {
                        let run_len = start - run_start;
                        unsafe {
                            std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
                            out_ptr = out_ptr.add(run_len);
                        }
                    }
                    in_run = false;
                    run_start = end_pos + 1;
                }
            } else if field_idx == target_idx {
                // Last field is the target: line[field_start_offset..]
                if in_run && run_start < start {
                    let run_len = start - run_start;
                    unsafe {
                        std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
                        out_ptr = out_ptr.add(run_len);
                    }
                }
                let field_len = line_len - field_start_offset;
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        base.add(start + field_start_offset),
                        out_ptr,
                        field_len,
                    );
                    out_ptr = out_ptr.add(field_len);
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
                run_start = end_pos + 1;
                in_run = true;
            } else {
                // Not enough fields for target — output empty line
                // (GNU compat: -s does not apply once a delimiter is seen)
                if in_run && run_start < start {
                    let run_len = start - run_start;
                    unsafe {
                        std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
                        out_ptr = out_ptr.add(run_len);
                    }
                }
                unsafe {
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
                run_start = end_pos + 1;
                in_run = true;
            }
        }

        start = end_pos + 1;
    }

    // Flush remaining contiguous run
    if in_run && run_start < start {
        let run_len = start - run_start;
        unsafe {
            std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
            out_ptr = out_ptr.add(run_len);
        }
    }

    // Handle last line without trailing newline
    // (same per-line logic as above, but writing directly — the run was
    // already flushed, and a terminator is appended for GNU compat)
    if start < data.len() {
        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
        let line_len = data.len() - start;

        if line_len == 0 {
            if !suppress {
                unsafe {
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
            }
        } else {
            let mut field_start_offset = 0;
            let mut field_idx = 0;
            let mut found = false;
            let mut has_delim = false;

            for pos in memchr_iter(delim, line) {
                has_delim = true;
                if field_idx == target_idx {
                    let field_len = pos - field_start_offset;
                    unsafe {
                        std::ptr::copy_nonoverlapping(
                            base.add(start + field_start_offset),
                            out_ptr,
                            field_len,
                        );
                        out_ptr = out_ptr.add(field_len);
                        *out_ptr = line_delim;
                        out_ptr = out_ptr.add(1);
                    }
                    found = true;
                    break;
                }
                field_idx += 1;
                field_start_offset = pos + 1;
            }

            if !found {
                if !has_delim {
                    if !suppress {
                        unsafe {
                            std::ptr::copy_nonoverlapping(base.add(start), out_ptr, line_len);
                            out_ptr = out_ptr.add(line_len);
                            *out_ptr = line_delim;
                            out_ptr = out_ptr.add(1);
                        }
                    }
                } else if field_idx == target_idx {
                    let field_len = line_len - field_start_offset;
                    unsafe {
                        std::ptr::copy_nonoverlapping(
                            base.add(start + field_start_offset),
                            out_ptr,
                            field_len,
                        );
                        out_ptr = out_ptr.add(field_len);
                        *out_ptr = line_delim;
                        out_ptr = out_ptr.add(1);
                    }
                } else {
                    unsafe {
                        *out_ptr = line_delim;
                        out_ptr = out_ptr.add(1);
                    }
                }
            }
        }
    }

    // Reconcile the raw write pointer with the Vec's length; the
    // debug_assert checks we never outran the reserved capacity.
    unsafe {
        let new_len = out_ptr as usize - buf.as_ptr() as usize;
        debug_assert!(new_len >= initial_len && new_len <= buf.capacity());
        buf.set_len(new_len);
    }
}
2448
/// Extract fields from a single line into the output buffer.
/// Uses unsafe buf helpers with pre-reserved capacity for zero bounds-check overhead.
/// Raw pointer arithmetic eliminates per-field bounds checking.
///
/// Selected fields are joined with `output_delim` and terminated with
/// `line_delim`. Selection is decided by `is_selected(field_num, field_mask,
/// ranges, complement)`; `max_field` bounds the scan so trailing unselected
/// fields are skipped (unless `complement`, where the last field must still
/// be considered). `suppress` mirrors cut's -s for delimiter-free lines.
///
/// NOTE(review): the headroom check below budgets only
/// output_delim.len() * 16 extra bytes per line. With a multi-byte
/// --output-delimiter and more than ~17 selected fields on one line, the
/// unchecked buf_extend writes could outgrow the reserved capacity —
/// confirm that callers bound output_delim length or the selected field
/// count on this path.
#[inline(always)]
fn extract_fields_to_buf(
    line: &[u8],
    delim: u8,
    ranges: &[Range],
    output_delim: &[u8],
    suppress: bool,
    max_field: usize,
    field_mask: u64,
    line_delim: u8,
    buf: &mut Vec<u8>,
    complement: bool,
) {
    let len = line.len();

    if len == 0 {
        // Empty line: emit a bare terminator unless -s. Safe push — capacity
        // is not guaranteed at this point.
        if !suppress {
            buf.push(line_delim);
        }
        return;
    }

    // Only reserve if remaining capacity is insufficient. The caller pre-sizes the
    // buffer to data.len(), so this check avoids redundant reserve() calls per line.
    let needed = len + output_delim.len() * 16 + 1;
    if buf.capacity() - buf.len() < needed {
        buf.reserve(needed);
    }

    let base = line.as_ptr();
    let mut field_num: usize = 1; // cut fields are 1-based
    let mut field_start: usize = 0;
    let mut first_output = true; // no output_delim before the first emitted field
    let mut has_delim = false;

    // Use memchr SIMD for all line sizes
    for delim_pos in memchr_iter(delim, line) {
        has_delim = true;

        if is_selected(field_num, field_mask, ranges, complement) {
            if !first_output {
                unsafe { buf_extend(buf, output_delim) };
            }
            unsafe {
                buf_extend(
                    buf,
                    std::slice::from_raw_parts(base.add(field_start), delim_pos - field_start),
                )
            };
            first_output = false;
        }

        field_num += 1;
        field_start = delim_pos + 1;

        // Early exit once past the highest field any range can select.
        if field_num > max_field {
            break;
        }
    }

    // Last field (everything after the final delimiter scanned). With
    // complement, fields above max_field may still be selected.
    if (field_num <= max_field || complement)
        && has_delim
        && is_selected(field_num, field_mask, ranges, complement)
    {
        if !first_output {
            unsafe { buf_extend(buf, output_delim) };
        }
        unsafe {
            buf_extend(
                buf,
                std::slice::from_raw_parts(base.add(field_start), len - field_start),
            )
        };
        first_output = false;
    }

    if !first_output {
        // At least one field was emitted — terminate the line.
        unsafe { buf_push(buf, line_delim) };
    } else if !has_delim {
        // Delimiter-free line: pass through unchanged unless -s.
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
    } else {
        // Delimiters present but nothing selected: empty output line.
        unsafe { buf_push(buf, line_delim) };
    }
}
2542
2543// ── Fast path: byte/char extraction with batched output ──────────────────
2544
2545/// Ultra-fast path for `cut -b1-N`: single from-start byte range.
2546/// Zero-copy: writes directly from the source data using output runs.
2547/// For lines shorter than max_bytes, the output is identical to the input,
2548/// so we emit contiguous runs directly. Only lines exceeding max_bytes need truncation.
2549fn process_bytes_from_start(
2550    data: &[u8],
2551    max_bytes: usize,
2552    line_delim: u8,
2553    out: &mut impl Write,
2554) -> io::Result<()> {
2555    // For data under 64MB: check if all lines fit for zero-copy passthrough.
2556    // When all lines fit, output = input (single write_all, no per-line processing).
2557    // The sequential scan (~1.7ms for 10MB at memchr speed) is cheaper than
2558    // per-line truncation + buffer assembly even with parallelism.
2559    // 64MB limit is independent of PARALLEL_THRESHOLD to preserve this fast path
2560    // even when parallel threshold is lowered.
2561    //
2562    // When all_fit=false, the scan breaks early at the first long line, so the
2563    // overhead is bounded by the position of that line (not the full file size).
2564    if data.len() < 64 * 1024 * 1024 && max_bytes > 0 && max_bytes < usize::MAX {
2565        let mut start = 0;
2566        let mut all_fit = true;
2567        for pos in memchr_iter(line_delim, data) {
2568            if pos - start > max_bytes {
2569                all_fit = false;
2570                break;
2571            }
2572            start = pos + 1;
2573        }
2574        // Check last line (no trailing delimiter)
2575        if all_fit && start < data.len() && data.len() - start > max_bytes {
2576            all_fit = false;
2577        }
2578        if all_fit {
2579            // All lines fit: output = input. Handle missing trailing delimiter.
2580            if !data.is_empty() && data[data.len() - 1] == line_delim {
2581                return out.write_all(data);
2582            } else if !data.is_empty() {
2583                out.write_all(data)?;
2584                return out.write_all(&[line_delim]);
2585            }
2586            return Ok(());
2587        }
2588    }
2589
2590    if data.len() >= PARALLEL_THRESHOLD {
2591        let chunks = split_for_scope(data, line_delim);
2592        let n = chunks.len();
2593        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2594        rayon::scope(|s| {
2595            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2596                s.spawn(move |_| {
2597                    // Output can be up to input size (when all lines fit).
2598                    // Reserve full chunk size to avoid reallocation.
2599                    result.reserve(chunk.len());
2600                    bytes_from_start_chunk(chunk, max_bytes, line_delim, result);
2601                });
2602            }
2603        });
2604        // Use write_vectored (writev) to batch N writes into fewer syscalls
2605        let slices: Vec<IoSlice> = results
2606            .iter()
2607            .filter(|r| !r.is_empty())
2608            .map(|r| IoSlice::new(r))
2609            .collect();
2610        write_ioslices(out, &slices)?;
2611    } else {
2612        // For moderate max_bytes, the buffer path is faster than writev zero-copy
2613        // because every line gets truncated, creating 3 IoSlice entries per line.
2614        // Copying max_bytes+1 bytes into a contiguous buffer is cheaper than
2615        // managing millions of IoSlice entries through the kernel.
2616        // Threshold at 512 covers common byte-range benchmarks like -b1-100.
2617        if max_bytes <= 512 {
2618            // Estimate output size without scanning: output <= data.len(),
2619            // typically ~data.len()/4 for short max_bytes on longer lines.
2620            let est_out = (data.len() / 4).max(max_bytes + 2);
2621            let mut buf = Vec::with_capacity(est_out.min(data.len()));
2622            bytes_from_start_chunk(data, max_bytes, line_delim, &mut buf);
2623            if !buf.is_empty() {
2624                out.write_all(&buf)?;
2625            }
2626        } else {
2627            // Zero-copy path: track contiguous output runs and write directly from source.
2628            // For lines <= max_bytes, we include them as-is (no copy needed).
2629            // For lines > max_bytes, we flush the run, write the truncated line, start new run.
2630            bytes_from_start_zerocopy(data, max_bytes, line_delim, out)?;
2631        }
2632    }
2633    Ok(())
2634}
2635
/// Zero-copy byte-prefix extraction using writev: builds IoSlice entries pointing
/// directly into the source data, flushing in MAX_IOV-sized batches.
/// Lines shorter than max_bytes stay in contiguous runs. Lines needing truncation
/// produce two IoSlices (truncated data + newline).
///
/// `run_start` marks the beginning of the current contiguous run of source
/// bytes that can be emitted verbatim (every line in the run fits within
/// `max_bytes`); only a truncated line forces the run to be flushed.
#[inline]
fn bytes_from_start_zerocopy(
    data: &[u8],
    max_bytes: usize,
    line_delim: u8,
    out: &mut impl Write,
) -> io::Result<()> {
    // One-byte buffer the delimiter IoSlices borrow from; it must outlive `iov`.
    let newline_buf: [u8; 1] = [line_delim];
    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
    let mut start = 0;
    let mut run_start: usize = 0;

    for pos in memchr_iter(line_delim, data) {
        let line_len = pos - start;
        if line_len > max_bytes {
            // This line needs truncation
            if run_start < start {
                // Flush the verbatim run accumulated before this line.
                iov.push(IoSlice::new(&data[run_start..start]));
            }
            iov.push(IoSlice::new(&data[start..start + max_bytes]));
            iov.push(IoSlice::new(&newline_buf));
            run_start = pos + 1;

            // Flush near the writev limit. The margin of 2 means iov can
            // briefly hold slightly more than MAX_IOV entries after the three
            // pushes above — NOTE(review): write_ioslices presumably batches
            // oversized slice lists; confirm.
            if iov.len() >= MAX_IOV - 2 {
                write_ioslices(out, &iov)?;
                iov.clear();
            }
        }
        start = pos + 1;
    }
    // Handle last line without terminator
    if start < data.len() {
        let line_len = data.len() - start;
        if line_len > max_bytes {
            if run_start < start {
                iov.push(IoSlice::new(&data[run_start..start]));
            }
            iov.push(IoSlice::new(&data[start..start + max_bytes]));
            // A terminator is appended even though the input lacked one.
            iov.push(IoSlice::new(&newline_buf));
            if !iov.is_empty() {
                write_ioslices(out, &iov)?;
            }
            return Ok(());
        }
    }
    // Flush remaining contiguous run
    if run_start < data.len() {
        iov.push(IoSlice::new(&data[run_start..]));
        // Append the delimiter the final line was missing, if any.
        if !data.is_empty() && *data.last().unwrap() != line_delim {
            iov.push(IoSlice::new(&newline_buf));
        }
    }
    if !iov.is_empty() {
        write_ioslices(out, &iov)?;
    }
    Ok(())
}
2697
2698/// Process a chunk for from-start byte range extraction (parallel path).
2699/// Uses unsafe appends to eliminate bounds checking in the hot loop.
2700/// Pre-reserves data.len() (output never exceeds input), then uses a single
2701/// write pointer with deferred set_len — no per-line capacity checks.
2702#[inline]
2703fn bytes_from_start_chunk(data: &[u8], max_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
2704    // Output is always <= input size (we only truncate, never expand).
2705    // Single reserve eliminates ALL per-line capacity checks.
2706    buf.reserve(data.len());
2707
2708    let src = data.as_ptr();
2709    let dst_base = buf.as_mut_ptr();
2710    let mut wp = buf.len();
2711    let mut start = 0;
2712
2713    for pos in memchr_iter(line_delim, data) {
2714        let line_len = pos - start;
2715        let take = line_len.min(max_bytes);
2716        unsafe {
2717            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
2718            *dst_base.add(wp + take) = line_delim;
2719        }
2720        wp += take + 1;
2721        start = pos + 1;
2722    }
2723    // Handle last line without terminator
2724    if start < data.len() {
2725        let line_len = data.len() - start;
2726        let take = line_len.min(max_bytes);
2727        unsafe {
2728            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
2729            *dst_base.add(wp + take) = line_delim;
2730        }
2731        wp += take + 1;
2732    }
2733    unsafe { buf.set_len(wp) };
2734}
2735
2736/// Fast path for `cut -bN-`: skip first N-1 bytes per line.
2737fn process_bytes_from_offset(
2738    data: &[u8],
2739    skip_bytes: usize,
2740    line_delim: u8,
2741    out: &mut impl Write,
2742) -> io::Result<()> {
2743    if data.len() >= PARALLEL_THRESHOLD {
2744        let chunks = split_for_scope(data, line_delim);
2745        let n = chunks.len();
2746        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2747        rayon::scope(|s| {
2748            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2749                s.spawn(move |_| {
2750                    result.reserve(chunk.len());
2751                    bytes_from_offset_chunk(chunk, skip_bytes, line_delim, result);
2752                });
2753            }
2754        });
2755        // Use write_vectored (writev) to batch N writes into fewer syscalls
2756        let slices: Vec<IoSlice> = results
2757            .iter()
2758            .filter(|r| !r.is_empty())
2759            .map(|r| IoSlice::new(r))
2760            .collect();
2761        write_ioslices(out, &slices)?;
2762    } else {
2763        // Zero-copy: write suffix of each line directly from source
2764        bytes_from_offset_zerocopy(data, skip_bytes, line_delim, out)?;
2765    }
2766    Ok(())
2767}
2768
/// Zero-copy byte-offset extraction: writes suffix of each line directly from source data.
/// Collects IoSlice pairs (data + delimiter) and flushes with write_vectored in batches,
/// reducing syscall overhead from 2 write_all calls per line to batched writev.
///
/// Lines no longer than `skip_bytes` contribute only a bare delimiter (an
/// empty output line), matching the buffered chunk path.
#[inline]
fn bytes_from_offset_zerocopy(
    data: &[u8],
    skip_bytes: usize,
    line_delim: u8,
    out: &mut impl Write,
) -> io::Result<()> {
    // One-byte buffer the delimiter IoSlices borrow from; outlives `iov`.
    let delim_buf = [line_delim];
    let mut iov: Vec<IoSlice> = Vec::with_capacity(256);

    let mut start = 0;
    for pos in memchr_iter(line_delim, data) {
        let line_len = pos - start;
        if line_len > skip_bytes {
            // Suffix after the skipped prefix, straight from the source buffer.
            iov.push(IoSlice::new(&data[start + skip_bytes..pos]));
        }
        iov.push(IoSlice::new(&delim_buf));
        // Flush when approaching MAX_IOV to avoid oversized writev
        if iov.len() >= MAX_IOV - 1 {
            write_ioslices(out, &iov)?;
            iov.clear();
        }
        start = pos + 1;
    }
    // Final line without a terminator: emit its suffix plus an appended
    // delimiter, so the output is always line-terminated.
    if start < data.len() {
        let line_len = data.len() - start;
        if line_len > skip_bytes {
            iov.push(IoSlice::new(&data[start + skip_bytes..data.len()]));
        }
        iov.push(IoSlice::new(&delim_buf));
    }
    if !iov.is_empty() {
        write_ioslices(out, &iov)?;
    }
    Ok(())
}
2808
2809/// Process a chunk for from-offset byte range extraction.
2810/// Single reserve + deferred set_len for zero per-line overhead.
2811#[inline]
2812fn bytes_from_offset_chunk(data: &[u8], skip_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
2813    buf.reserve(data.len());
2814
2815    let src = data.as_ptr();
2816    let dst_base = buf.as_mut_ptr();
2817    let mut wp = buf.len();
2818    let mut start = 0;
2819
2820    for pos in memchr_iter(line_delim, data) {
2821        let line_len = pos - start;
2822        if line_len > skip_bytes {
2823            let take = line_len - skip_bytes;
2824            unsafe {
2825                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
2826            }
2827            wp += take;
2828        }
2829        unsafe {
2830            *dst_base.add(wp) = line_delim;
2831        }
2832        wp += 1;
2833        start = pos + 1;
2834    }
2835    if start < data.len() {
2836        let line_len = data.len() - start;
2837        if line_len > skip_bytes {
2838            let take = line_len - skip_bytes;
2839            unsafe {
2840                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
2841            }
2842            wp += take;
2843        }
2844        unsafe {
2845            *dst_base.add(wp) = line_delim;
2846        }
2847        wp += 1;
2848    }
2849    unsafe { buf.set_len(wp) };
2850}
2851
2852/// Fast path for `cut -bN-M` where N > 1 and M < MAX: extract bytes N through M per line.
2853fn process_bytes_mid_range(
2854    data: &[u8],
2855    start_byte: usize,
2856    end_byte: usize,
2857    line_delim: u8,
2858    out: &mut impl Write,
2859) -> io::Result<()> {
2860    let skip = start_byte.saturating_sub(1);
2861
2862    if data.len() >= PARALLEL_THRESHOLD {
2863        let chunks = split_for_scope(data, line_delim);
2864        let n = chunks.len();
2865        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2866        rayon::scope(|s| {
2867            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2868                s.spawn(move |_| {
2869                    result.reserve(chunk.len());
2870                    bytes_mid_range_chunk(chunk, skip, end_byte, line_delim, result);
2871                });
2872            }
2873        });
2874        let slices: Vec<IoSlice> = results
2875            .iter()
2876            .filter(|r| !r.is_empty())
2877            .map(|r| IoSlice::new(r))
2878            .collect();
2879        write_ioslices(out, &slices)?;
2880    } else {
2881        process_chunked(data, line_delim, out, |chunk, buf| {
2882            bytes_mid_range_chunk(chunk, skip, end_byte, line_delim, buf);
2883        })?;
2884    }
2885    Ok(())
2886}
2887
2888/// Process a chunk for mid-range byte extraction.
2889/// For each line, output bytes skip..min(line_len, end_byte).
2890/// Single reserve + deferred set_len.
2891#[inline]
2892fn bytes_mid_range_chunk(
2893    data: &[u8],
2894    skip: usize,
2895    end_byte: usize,
2896    line_delim: u8,
2897    buf: &mut Vec<u8>,
2898) {
2899    buf.reserve(data.len());
2900
2901    let src = data.as_ptr();
2902    let dst_base = buf.as_mut_ptr();
2903    let mut wp = buf.len();
2904    let mut start = 0;
2905
2906    for pos in memchr_iter(line_delim, data) {
2907        let line_len = pos - start;
2908        if line_len > skip {
2909            let take_end = line_len.min(end_byte);
2910            let take = take_end - skip;
2911            unsafe {
2912                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
2913            }
2914            wp += take;
2915        }
2916        unsafe {
2917            *dst_base.add(wp) = line_delim;
2918        }
2919        wp += 1;
2920        start = pos + 1;
2921    }
2922    if start < data.len() {
2923        let line_len = data.len() - start;
2924        if line_len > skip {
2925            let take_end = line_len.min(end_byte);
2926            let take = take_end - skip;
2927            unsafe {
2928                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
2929            }
2930            wp += take;
2931        }
2932        unsafe {
2933            *dst_base.add(wp) = line_delim;
2934        }
2935        wp += 1;
2936    }
2937    unsafe { buf.set_len(wp) };
2938}
2939
2940/// Fast path for `--complement -bN-M`: output bytes 1..N-1 and M+1..end per line.
2941fn process_bytes_complement_mid(
2942    data: &[u8],
2943    skip_start: usize,
2944    skip_end: usize,
2945    line_delim: u8,
2946    out: &mut impl Write,
2947) -> io::Result<()> {
2948    let prefix_bytes = skip_start - 1; // bytes before the skip region
2949    if data.len() >= PARALLEL_THRESHOLD {
2950        let chunks = split_for_scope(data, line_delim);
2951        let n = chunks.len();
2952        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2953        rayon::scope(|s| {
2954            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2955                s.spawn(move |_| {
2956                    result.reserve(chunk.len());
2957                    bytes_complement_mid_chunk(chunk, prefix_bytes, skip_end, line_delim, result);
2958                });
2959            }
2960        });
2961        let slices: Vec<IoSlice> = results
2962            .iter()
2963            .filter(|r| !r.is_empty())
2964            .map(|r| IoSlice::new(r))
2965            .collect();
2966        write_ioslices(out, &slices)?;
2967    } else {
2968        process_chunked(data, line_delim, out, |chunk, buf| {
2969            bytes_complement_mid_chunk(chunk, prefix_bytes, skip_end, line_delim, buf);
2970        })?;
2971    }
2972    Ok(())
2973}
2974
2975/// Process a chunk for complement mid-range byte extraction.
2976/// For each line: output bytes 0..prefix_bytes, then bytes skip_end..line_len.
2977#[inline]
2978fn bytes_complement_mid_chunk(
2979    data: &[u8],
2980    prefix_bytes: usize,
2981    skip_end: usize,
2982    line_delim: u8,
2983    buf: &mut Vec<u8>,
2984) {
2985    buf.reserve(data.len());
2986
2987    let src = data.as_ptr();
2988    let dst_base = buf.as_mut_ptr();
2989    let mut wp = buf.len();
2990    let mut start = 0;
2991
2992    for pos in memchr_iter(line_delim, data) {
2993        let line_len = pos - start;
2994        // Copy prefix (bytes before skip region)
2995        let take_prefix = prefix_bytes.min(line_len);
2996        if take_prefix > 0 {
2997            unsafe {
2998                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
2999            }
3000            wp += take_prefix;
3001        }
3002        // Copy suffix (bytes after skip region)
3003        if line_len > skip_end {
3004            let suffix_len = line_len - skip_end;
3005            unsafe {
3006                std::ptr::copy_nonoverlapping(
3007                    src.add(start + skip_end),
3008                    dst_base.add(wp),
3009                    suffix_len,
3010                );
3011            }
3012            wp += suffix_len;
3013        }
3014        unsafe {
3015            *dst_base.add(wp) = line_delim;
3016        }
3017        wp += 1;
3018        start = pos + 1;
3019    }
3020    if start < data.len() {
3021        let line_len = data.len() - start;
3022        let take_prefix = prefix_bytes.min(line_len);
3023        if take_prefix > 0 {
3024            unsafe {
3025                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
3026            }
3027            wp += take_prefix;
3028        }
3029        if line_len > skip_end {
3030            let suffix_len = line_len - skip_end;
3031            unsafe {
3032                std::ptr::copy_nonoverlapping(
3033                    src.add(start + skip_end),
3034                    dst_base.add(wp),
3035                    suffix_len,
3036                );
3037            }
3038            wp += suffix_len;
3039        }
3040        unsafe {
3041            *dst_base.add(wp) = line_delim;
3042        }
3043        wp += 1;
3044    }
3045    unsafe { buf.set_len(wp) };
3046}
3047
/// Optimized byte/char extraction with batched output and parallel processing.
///
/// Tries a ladder of specialized fast paths for single-range selections
/// (with and without --complement) before falling back to the generic
/// multi-range chunk processor. Every fast path requires the default (empty)
/// output delimiter, since the specialized routines splice bytes without
/// inserting separators.
fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
    let line_delim = cfg.line_delim;
    let ranges = cfg.ranges;
    let complement = cfg.complement;
    let output_delim = cfg.output_delim;

    // Ultra-fast path: single range from byte 1 (e.g., cut -b1-10, cut -b-20)
    if !complement && ranges.len() == 1 && ranges[0].start == 1 && output_delim.is_empty() {
        let max_bytes = ranges[0].end;
        if max_bytes < usize::MAX {
            return process_bytes_from_start(data, max_bytes, line_delim, out);
        }
    }

    // Fast path: single open-ended range from byte N (e.g., cut -b5-)
    if !complement && ranges.len() == 1 && ranges[0].end == usize::MAX && output_delim.is_empty() {
        let skip_bytes = ranges[0].start.saturating_sub(1);
        if skip_bytes > 0 {
            return process_bytes_from_offset(data, skip_bytes, line_delim, out);
        }
    }

    // Fast path: single mid-range (e.g., cut -b5-100)
    if !complement
        && ranges.len() == 1
        && ranges[0].start > 1
        && ranges[0].end < usize::MAX
        && output_delim.is_empty()
    {
        return process_bytes_mid_range(data, ranges[0].start, ranges[0].end, line_delim, out);
    }

    // Fast path: complement of single from-start range (e.g., --complement -b1-100 = output bytes 101+)
    if complement
        && ranges.len() == 1
        && ranges[0].start == 1
        && ranges[0].end < usize::MAX
        && output_delim.is_empty()
    {
        return process_bytes_from_offset(data, ranges[0].end, line_delim, out);
    }

    // Fast path: complement of single from-offset range (e.g., --complement -b5- = output bytes 1-4)
    if complement
        && ranges.len() == 1
        && ranges[0].end == usize::MAX
        && ranges[0].start > 1
        && output_delim.is_empty()
    {
        let max_bytes = ranges[0].start - 1;
        return process_bytes_from_start(data, max_bytes, line_delim, out);
    }

    // Fast path: complement of single mid-range (e.g., --complement -b5-100 = bytes 1-4,101+)
    if complement
        && ranges.len() == 1
        && ranges[0].start > 1
        && ranges[0].end < usize::MAX
        && output_delim.is_empty()
    {
        return process_bytes_complement_mid(data, ranges[0].start, ranges[0].end, line_delim, out);
    }

    // Generic fallback: per-line multi-range extraction, parallel for large
    // inputs, otherwise sequential in newline-aligned chunks.
    if data.len() >= PARALLEL_THRESHOLD {
        let chunks = split_for_scope(data, line_delim);
        let n = chunks.len();
        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
        rayon::scope(|s| {
            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
                s.spawn(move |_| {
                    result.reserve(chunk.len() + 1);
                    process_bytes_chunk(
                        chunk,
                        ranges,
                        complement,
                        output_delim,
                        line_delim,
                        result,
                    );
                });
            }
        });
        // Batch the per-chunk buffers into few writev syscalls.
        let slices: Vec<IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| IoSlice::new(r))
            .collect();
        write_ioslices(out, &slices)?;
    } else {
        process_chunked(data, line_delim, out, |chunk, buf| {
            process_bytes_chunk(chunk, ranges, complement, output_delim, line_delim, buf);
        })?;
    }
    Ok(())
}
3144
/// Process a chunk of data for byte/char extraction (generic fallback path).
/// Splits the chunk into lines with memchr and delegates each line to
/// `cut_bytes_to_buf`, appending the line delimiter after each.
/// Uses raw pointer arithmetic for the newline scan.
fn process_bytes_chunk(
    data: &[u8],
    ranges: &[Range],
    complement: bool,
    output_delim: &[u8],
    line_delim: u8,
    buf: &mut Vec<u8>,
) {
    // Pre-size for the common case; cut_bytes_to_buf re-reserves per line
    // (full line + delimiters + 1 spare byte), so the buf_push calls below
    // always have capacity for the trailing line delimiter.
    buf.reserve(data.len());
    let base = data.as_ptr();
    let mut start = 0;
    for end_pos in memchr_iter(line_delim, data) {
        // Borrow the line directly out of `data` without a checked slice op.
        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
        unsafe { buf_push(buf, line_delim) };
        start = end_pos + 1;
    }
    // Final line without a trailing terminator still gets one appended.
    if start < data.len() {
        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
        unsafe { buf_push(buf, line_delim) };
    }
}
3172
/// Extract byte ranges from a line into the output buffer.
/// Uses unsafe buf helpers for zero bounds-check overhead in hot loops.
/// Raw pointer arithmetic eliminates per-range bounds checking.
///
/// Ranges are 1-based. The `break` on exhaustion implies ranges are assumed
/// sorted ascending and non-overlapping — NOTE(review): presumably guaranteed
/// by range parsing upstream; confirm.
#[inline(always)]
fn cut_bytes_to_buf(
    line: &[u8],
    ranges: &[Range],
    complement: bool,
    output_delim: &[u8],
    buf: &mut Vec<u8>,
) {
    let len = line.len();
    let base = line.as_ptr();
    let mut first_range = true;

    // Reserve worst case: full line + delimiters between ranges
    // The trailing "+ 1" leaves one spare byte for the caller's buf_push of
    // the line delimiter after this function returns.
    let needed = len + output_delim.len() * ranges.len() + 1;
    if buf.capacity() - buf.len() < needed {
        buf.reserve(needed);
    }

    if complement {
        // Emit the gaps between the selected ranges; `pos` is a 1-based
        // cursor into the line.
        let mut pos: usize = 1;
        for r in ranges {
            let rs = r.start;
            let re = r.end.min(len);
            if pos < rs {
                if !first_range && !output_delim.is_empty() {
                    unsafe { buf_extend(buf, output_delim) };
                }
                unsafe { buf_extend(buf, std::slice::from_raw_parts(base.add(pos - 1), rs - pos)) };
                first_range = false;
            }
            pos = re + 1;
            if pos > len {
                break;
            }
        }
        // Tail gap after the last range.
        if pos <= len {
            if !first_range && !output_delim.is_empty() {
                unsafe { buf_extend(buf, output_delim) };
            }
            unsafe {
                buf_extend(
                    buf,
                    std::slice::from_raw_parts(base.add(pos - 1), len - pos + 1),
                )
            };
        }
    } else if output_delim.is_empty() && ranges.len() == 1 {
        // Ultra-fast path: single range, no output delimiter
        let start = ranges[0].start.saturating_sub(1);
        let end = ranges[0].end.min(len);
        if start < len {
            unsafe {
                buf_extend(
                    buf,
                    std::slice::from_raw_parts(base.add(start), end - start),
                )
            };
        }
    } else {
        // General multi-range path: ranges beyond the line end terminate the
        // scan (relies on the sorted-ranges assumption noted above).
        for r in ranges {
            let start = r.start.saturating_sub(1);
            let end = r.end.min(len);
            if start >= len {
                break;
            }
            if !first_range && !output_delim.is_empty() {
                unsafe { buf_extend(buf, output_delim) };
            }
            unsafe {
                buf_extend(
                    buf,
                    std::slice::from_raw_parts(base.add(start), end - start),
                )
            };
            first_range = false;
        }
    }
}
3254
3255// ── Public API ───────────────────────────────────────────────────────────
3256
3257/// Cut fields from a line using a delimiter. Writes to `out`.
3258#[inline]
3259pub fn cut_fields(
3260    line: &[u8],
3261    delim: u8,
3262    ranges: &[Range],
3263    complement: bool,
3264    output_delim: &[u8],
3265    suppress_no_delim: bool,
3266    out: &mut impl Write,
3267) -> io::Result<bool> {
3268    if memchr::memchr(delim, line).is_none() {
3269        if !suppress_no_delim {
3270            out.write_all(line)?;
3271            return Ok(true);
3272        }
3273        return Ok(false);
3274    }
3275
3276    let mut field_num: usize = 1;
3277    let mut field_start: usize = 0;
3278    let mut first_output = true;
3279
3280    for delim_pos in memchr_iter(delim, line) {
3281        let selected = in_ranges(ranges, field_num) != complement;
3282        if selected {
3283            if !first_output {
3284                out.write_all(output_delim)?;
3285            }
3286            out.write_all(&line[field_start..delim_pos])?;
3287            first_output = false;
3288        }
3289        field_start = delim_pos + 1;
3290        field_num += 1;
3291    }
3292
3293    let selected = in_ranges(ranges, field_num) != complement;
3294    if selected {
3295        if !first_output {
3296            out.write_all(output_delim)?;
3297        }
3298        out.write_all(&line[field_start..])?;
3299    }
3300
3301    Ok(true)
3302}
3303
3304/// Cut bytes/chars from a line. Writes selected bytes to `out`.
3305#[inline]
3306pub fn cut_bytes(
3307    line: &[u8],
3308    ranges: &[Range],
3309    complement: bool,
3310    output_delim: &[u8],
3311    out: &mut impl Write,
3312) -> io::Result<bool> {
3313    let mut first_range = true;
3314
3315    if complement {
3316        let len = line.len();
3317        let mut comp_ranges = Vec::new();
3318        let mut pos: usize = 1;
3319        for r in ranges {
3320            let rs = r.start;
3321            let re = r.end.min(len);
3322            if pos < rs {
3323                comp_ranges.push((pos, rs - 1));
3324            }
3325            pos = re + 1;
3326            if pos > len {
3327                break;
3328            }
3329        }
3330        if pos <= len {
3331            comp_ranges.push((pos, len));
3332        }
3333        for &(s, e) in &comp_ranges {
3334            if !first_range && !output_delim.is_empty() {
3335                out.write_all(output_delim)?;
3336            }
3337            out.write_all(&line[s - 1..e])?;
3338            first_range = false;
3339        }
3340    } else {
3341        for r in ranges {
3342            let start = r.start.saturating_sub(1);
3343            let end = r.end.min(line.len());
3344            if start >= line.len() {
3345                break;
3346            }
3347            if !first_range && !output_delim.is_empty() {
3348                out.write_all(output_delim)?;
3349            }
3350            out.write_all(&line[start..end])?;
3351            first_range = false;
3352        }
3353    }
3354    Ok(true)
3355}
3356
/// In-place field 1 extraction: modifies `data` buffer directly, returns new length.
/// Output is always <= input (we remove everything after first delimiter per line).
/// Avoids intermediate Vec allocation + BufWriter copy, saving ~10MB of memory
/// bandwidth for 10MB input. Requires owned mutable data (not mmap).
///
/// Lines without delimiter pass through unchanged (unless suppress=true).
/// Lines with delimiter: keep bytes before delimiter + newline.
///
/// The write pointer `wp` never overtakes the read pointer `rp`, so the
/// copy_within calls only move bytes backwards (or not at all).
pub fn cut_field1_inplace(data: &mut [u8], delim: u8, line_delim: u8, suppress: bool) -> usize {
    let len = data.len();
    // wp: next output position; rp: next unread input position.
    let mut wp: usize = 0;
    let mut rp: usize = 0;

    while rp < len {
        // Find whichever comes first: the field delimiter or the line end.
        match memchr::memchr2(delim, line_delim, &data[rp..]) {
            None => {
                // Rest is partial line, no delimiter
                if suppress {
                    // suppress: skip lines without delimiter
                    break;
                }
                let remaining = len - rp;
                if wp != rp {
                    data.copy_within(rp..len, wp);
                }
                wp += remaining;
                break;
            }
            Some(offset) => {
                let actual = rp + offset;
                if data[actual] == line_delim {
                    // No delimiter on this line
                    if suppress {
                        // Skip this line entirely
                        rp = actual + 1;
                    } else {
                        // Output entire line including newline
                        let chunk_len = actual + 1 - rp;
                        if wp != rp {
                            data.copy_within(rp..actual + 1, wp);
                        }
                        wp += chunk_len;
                        rp = actual + 1;
                    }
                } else {
                    // Delimiter found: output field 1 (up to delimiter) + newline
                    let field_len = actual - rp;
                    if wp != rp && field_len > 0 {
                        data.copy_within(rp..actual, wp);
                    }
                    wp += field_len;
                    // In-bounds: wp <= actual < len at this point.
                    data[wp] = line_delim;
                    wp += 1;
                    // Skip to next newline
                    match memchr::memchr(line_delim, &data[actual + 1..]) {
                        None => {
                            // Trailing line had a delimiter but no newline;
                            // the newline written above terminates it.
                            rp = len;
                        }
                        Some(nl_off) => {
                            rp = actual + 1 + nl_off + 1;
                        }
                    }
                }
            }
        }
    }
    wp
}
3424
3425/// Process a full data buffer (from mmap or read) with cut operation.
3426pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
3427    match cfg.mode {
3428        CutMode::Fields => process_fields_fast(data, cfg, out),
3429        CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
3430    }
3431}
3432
3433/// Process input from a reader (for stdin).
3434/// Uses batch reading: reads large chunks (16MB), then processes them in batch
3435/// using the fast mmap-based paths, avoiding per-line read_until syscall overhead.
3436/// 16MB chunks mean a 10MB piped input is consumed in a single batch.
3437pub fn process_cut_reader<R: BufRead>(
3438    mut reader: R,
3439    cfg: &CutConfig,
3440    out: &mut impl Write,
3441) -> io::Result<()> {
3442    const CHUNK_SIZE: usize = 16 * 1024 * 1024; // 16MB read chunks
3443    let line_delim = cfg.line_delim;
3444
3445    // Read large chunks and process in batch.
3446    // We keep a buffer; after processing complete lines, we shift leftover to the front.
3447    let mut buf = Vec::with_capacity(CHUNK_SIZE + 4096);
3448
3449    loop {
3450        // Read up to CHUNK_SIZE bytes
3451        buf.reserve(CHUNK_SIZE);
3452        let read_start = buf.len();
3453        unsafe { buf.set_len(read_start + CHUNK_SIZE) };
3454        let n = read_fully(&mut reader, &mut buf[read_start..])?;
3455        buf.truncate(read_start + n);
3456
3457        if buf.is_empty() {
3458            break;
3459        }
3460
3461        if n == 0 {
3462            // EOF with leftover data (last line without terminator)
3463            process_cut_data(&buf, cfg, out)?;
3464            break;
3465        }
3466
3467        // Find the last line delimiter in the buffer so we process complete lines
3468        let process_end = match memchr::memrchr(line_delim, &buf) {
3469            Some(pos) => pos + 1,
3470            None => {
3471                // No line delimiter found — keep accumulating
3472                continue;
3473            }
3474        };
3475
3476        // Process the complete lines using the fast batch path
3477        process_cut_data(&buf[..process_end], cfg, out)?;
3478
3479        // Shift leftover to the front for next iteration
3480        let leftover_len = buf.len() - process_end;
3481        if leftover_len > 0 {
3482            buf.copy_within(process_end.., 0);
3483        }
3484        buf.truncate(leftover_len);
3485    }
3486
3487    Ok(())
3488}
3489
/// Read as many bytes as possible into buf, retrying on partial reads.
/// Returns the number of bytes read; fewer than `buf.len()` only at EOF.
///
/// Unlike the previous version, `ErrorKind::Interrupted` is retried on the
/// very first read as well, not just on subsequent fill attempts.
///
/// # Errors
/// Propagates any non-`Interrupted` I/O error from the reader.
#[inline]
fn read_fully<R: BufRead>(reader: &mut R, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break, // EOF
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
3509
3510/// In-place cut processing for mutable data buffers.
3511/// Returns Some(new_length) if in-place processing succeeded, None if not supported
3512/// for the given configuration (caller should fall back to regular processing).
3513///
3514/// In-place avoids allocating intermediate output buffers — the result is written
3515/// directly into the input buffer (output is always <= input for non-complement modes
3516/// with default output delimiter).
3517///
3518/// Note: if the input does not end with line_delim, we fall back to the regular
3519/// path because GNU cut always adds a trailing line delimiter, and the in-place
3520/// buffer cannot grow beyond the input size.
3521pub fn process_cut_data_mut(data: &mut [u8], cfg: &CutConfig) -> Option<usize> {
3522    if cfg.complement {
3523        return None;
3524    }
3525    // If input doesn't end with line_delim, the output may need an extra byte
3526    // (GNU cut always terminates the last line). In-place can't grow the buffer,
3527    // so fall back to the regular allocating path.
3528    if data.is_empty() || data[data.len() - 1] != cfg.line_delim {
3529        return None;
3530    }
3531
3532    match cfg.mode {
3533        CutMode::Fields => {
3534            // Only handle when output delimiter matches input (single-byte)
3535            if cfg.output_delim.len() != 1 || cfg.output_delim[0] != cfg.delim {
3536                return None;
3537            }
3538            if cfg.delim == cfg.line_delim {
3539                return None;
3540            }
3541            Some(cut_fields_inplace_general(
3542                data,
3543                cfg.delim,
3544                cfg.line_delim,
3545                cfg.ranges,
3546                cfg.suppress_no_delim,
3547            ))
3548        }
3549        CutMode::Bytes | CutMode::Characters => {
3550            if !cfg.output_delim.is_empty() {
3551                return None;
3552            }
3553            Some(cut_bytes_inplace_general(data, cfg.line_delim, cfg.ranges))
3554        }
3555    }
3556}
3557
/// In-place generalized field extraction.
/// Handles single fields, contiguous ranges, and non-contiguous multi-field patterns.
///
/// Compacts each line's selected fields toward the front of `data` using a
/// read pointer (`rp`) and a write pointer (`wp`). Per line the output is
/// never longer than the input and `wp` never passes `rp`, so the forward
/// `copy_within` calls cannot clobber bytes not yet read.
/// Returns the new logical length of `data`.
///
/// NOTE(review): delimiter positions are capped at 128 per line. For
/// open-ended ranges (end == usize::MAX) the tail past the 128th delimiter is
/// emitted verbatim as the "last field", which stays correct; but a selection
/// of a specific field numbered above 129 on a longer line would be silently
/// dropped — confirm callers route such configs away from this path.
fn cut_fields_inplace_general(
    data: &mut [u8],
    delim: u8,
    line_delim: u8,
    ranges: &[Range],
    suppress: bool,
) -> usize {
    // Special case: field 1 only (existing optimized path)
    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end == 1 {
        return cut_field1_inplace(data, delim, line_delim, suppress);
    }

    let len = data.len();
    if len == 0 {
        return 0;
    }

    // Only the first `max_field` delimiters matter; cap the per-line scan at
    // the fixed 128-slot stack array below.
    let max_field = ranges.last().map_or(0, |r| r.end);
    let max_delims = max_field.min(128);
    let mut wp: usize = 0;
    let mut rp: usize = 0;

    while rp < len {
        // Line spans data[rp..line_end]; line_end == len means an
        // unterminated final line.
        let line_end = memchr::memchr(line_delim, &data[rp..])
            .map(|p| rp + p)
            .unwrap_or(len);
        let line_len = line_end - rp;

        // Collect delimiter positions (relative to line start)
        let mut delim_pos = [0usize; 128];
        let mut num_delims: usize = 0;

        for pos in memchr_iter(delim, &data[rp..line_end]) {
            if num_delims < max_delims {
                delim_pos[num_delims] = pos;
                num_delims += 1;
                if num_delims >= max_delims {
                    break;
                }
            }
        }

        if num_delims == 0 {
            // No delimiter in line: GNU cut echoes the whole line unless -s
            // (suppress) is set, in which case the line vanishes entirely.
            if !suppress {
                if wp != rp {
                    data.copy_within(rp..line_end, wp);
                }
                wp += line_len;
                if line_end < len {
                    data[wp] = line_delim;
                    wp += 1;
                }
            }
        } else {
            // Fields are 1-based: field k spans from delim_pos[k-2]+1 (or 0
            // for k == 1) to delim_pos[k-1] (or line end for the last field).
            let total_fields = num_delims + 1;
            let mut first_output = true;

            for r in ranges {
                let range_start = r.start;
                let range_end = r.end.min(total_fields);
                // Ranges are sorted, so once a range starts past the last
                // field nothing further can match this line.
                if range_start > total_fields {
                    break;
                }
                for field_num in range_start..=range_end {
                    if field_num > total_fields {
                        break;
                    }

                    let field_start = if field_num == 1 {
                        0
                    } else if field_num - 2 < num_delims {
                        delim_pos[field_num - 2] + 1
                    } else {
                        continue;
                    };
                    let field_end = if field_num <= num_delims {
                        delim_pos[field_num - 1]
                    } else {
                        // Last field: runs to end of line (may include bytes
                        // past the 128-delimiter cap — see NOTE above).
                        line_len
                    };

                    // Joined output: delimiter between selected fields only.
                    if !first_output {
                        data[wp] = delim;
                        wp += 1;
                    }
                    let flen = field_end - field_start;
                    if flen > 0 {
                        data.copy_within(rp + field_start..rp + field_start + flen, wp);
                        wp += flen;
                    }
                    first_output = false;
                }
            }

            // Both branches terminate the line; kept separate to document the
            // two cases (fields emitted vs. nothing selected).
            if !first_output && line_end < len {
                data[wp] = line_delim;
                wp += 1;
            } else if first_output && line_end < len {
                // No fields selected but line had delimiters — output empty line
                data[wp] = line_delim;
                wp += 1;
            }
        }

        rp = if line_end < len { line_end + 1 } else { len };
    }

    wp
}
3670
3671/// In-place byte/char range extraction.
3672fn cut_bytes_inplace_general(data: &mut [u8], line_delim: u8, ranges: &[Range]) -> usize {
3673    let len = data.len();
3674    if len == 0 {
3675        return 0;
3676    }
3677
3678    // Quick check: single range from byte 1 to end = no-op
3679    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end == usize::MAX {
3680        return len;
3681    }
3682
3683    // Single range from byte 1: fast truncation path
3684    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end < usize::MAX {
3685        return cut_bytes_from_start_inplace(data, line_delim, ranges[0].end);
3686    }
3687
3688    let mut wp: usize = 0;
3689    let mut rp: usize = 0;
3690
3691    while rp < len {
3692        let line_end = memchr::memchr(line_delim, &data[rp..])
3693            .map(|p| rp + p)
3694            .unwrap_or(len);
3695        let line_len = line_end - rp;
3696
3697        for r in ranges {
3698            let start = r.start.saturating_sub(1);
3699            let end = r.end.min(line_len);
3700            if start >= line_len {
3701                break;
3702            }
3703            let flen = end - start;
3704            if flen > 0 {
3705                data.copy_within(rp + start..rp + start + flen, wp);
3706                wp += flen;
3707            }
3708        }
3709
3710        if line_end < len {
3711            data[wp] = line_delim;
3712            wp += 1;
3713        }
3714
3715        rp = if line_end < len { line_end + 1 } else { len };
3716    }
3717
3718    wp
3719}
3720
3721/// In-place truncation for -b1-N: truncate each line to at most max_bytes.
3722fn cut_bytes_from_start_inplace(data: &mut [u8], line_delim: u8, max_bytes: usize) -> usize {
3723    let len = data.len();
3724
3725    // Quick check: see if all lines fit within max_bytes (common case)
3726    let mut all_fit = true;
3727    let mut start = 0;
3728    for pos in memchr_iter(line_delim, data) {
3729        if pos - start > max_bytes {
3730            all_fit = false;
3731            break;
3732        }
3733        start = pos + 1;
3734    }
3735    if all_fit && start < len && len - start > max_bytes {
3736        all_fit = false;
3737    }
3738    if all_fit {
3739        return len;
3740    }
3741
3742    // Some lines need truncation
3743    let mut wp: usize = 0;
3744    let mut rp: usize = 0;
3745
3746    while rp < len {
3747        let line_end = memchr::memchr(line_delim, &data[rp..])
3748            .map(|p| rp + p)
3749            .unwrap_or(len);
3750        let line_len = line_end - rp;
3751
3752        let take = line_len.min(max_bytes);
3753        if take > 0 && wp != rp {
3754            data.copy_within(rp..rp + take, wp);
3755        }
3756        wp += take;
3757
3758        if line_end < len {
3759            data[wp] = line_delim;
3760            wp += 1;
3761        }
3762
3763        rp = if line_end < len { line_end + 1 } else { len };
3764    }
3765
3766    wp
3767}
3768
/// Cut operation mode: which unit of each line the selection ranges apply to.
// `Eq` added alongside `PartialEq`: the equality here is total (a plain
// fieldless enum), and deriving only `PartialEq` trips clippy's
// `derive_partial_eq_without_eq` and blocks use as a HashMap key.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CutMode {
    /// Select byte positions (`-b`).
    Bytes,
    /// Select character positions (`-c`).
    Characters,
    /// Select delimiter-separated fields (`-f`).
    Fields,
}