//! coreutils_rs/cut/core.rs — core range parsing and field-extraction engine
//! for the `cut` utility.
1use memchr::memchr_iter;
2use std::io::{self, BufRead, IoSlice, Write};
3
/// Minimum file size for parallel processing (32MB).
/// Files above this threshold use rayon parallel chunked processing.
/// 32MB balances rayon init overhead + buffer allocation against parallel benefits.
/// For 10MB files, sequential is faster due to thread coordination + memory overhead.
const PARALLEL_THRESHOLD: usize = 32 * 1024 * 1024;

/// Max iovec entries per write_vectored call (Linux default IOV_MAX).
const MAX_IOV: usize = 1024;

/// Input chunk size for sequential processing. 4MB reduces write_all syscalls
/// (~3 calls for 10MB vs ~40 at 256KB). May exceed L2/L3 on smaller cores;
/// the primary benefit is syscall reduction rather than cache residency.
const SEQ_CHUNK: usize = 4 * 1024 * 1024;
17
18/// Process data in newline-aligned chunks, writing each chunk's output immediately.
19/// Avoids allocating a full-size output buffer (e.g. 12MB for 11MB input).
20fn process_chunked(
21    data: &[u8],
22    line_delim: u8,
23    out: &mut impl Write,
24    mut process_fn: impl FnMut(&[u8], &mut Vec<u8>),
25) -> io::Result<()> {
26    let mut buf = Vec::with_capacity(SEQ_CHUNK * 2);
27    let mut start = 0;
28    while start < data.len() {
29        let end = if start + SEQ_CHUNK >= data.len() {
30            data.len()
31        } else {
32            match memchr::memrchr(line_delim, &data[start..start + SEQ_CHUNK]) {
33                Some(pos) => start + pos + 1,
34                None => (start + SEQ_CHUNK).min(data.len()),
35            }
36        };
37        buf.clear();
38        process_fn(&data[start..end], &mut buf);
39        if !buf.is_empty() {
40            out.write_all(&buf)?;
41        }
42        start = end;
43    }
44    Ok(())
45}
46
/// Configuration for cut operations.
pub struct CutConfig<'a> {
    /// Selection mode (bytes/chars/fields); `CutMode` is declared elsewhere in the crate.
    pub mode: CutMode,
    /// Sorted, merged 1-based selection ranges (as produced by `parse_ranges`).
    pub ranges: &'a [Range],
    /// Invert the selection (`--complement`).
    pub complement: bool,
    /// Input field delimiter byte (`-d`).
    pub delim: u8,
    /// Bytes inserted between selected output fields (`--output-delimiter`).
    pub output_delim: &'a [u8],
    /// When true, lines containing no delimiter are dropped (`-s`).
    pub suppress_no_delim: bool,
    /// Line terminator byte (normally `\n`; presumably NUL with `-z` — set by caller).
    pub line_delim: u8,
}
57
/// A range specification like 1, 3-5, -3, 4-
#[derive(Debug, Clone)]
pub struct Range {
    pub start: usize, // 1-based, 0 means "from beginning"
    pub end: usize,   // 1-based, usize::MAX means "to end"
}

/// Parse a LIST specification like "1,3-5,7-" into ranges.
/// Each range is 1-based. Returns sorted, merged ranges.
/// When `no_merge_adjacent` is true, overlapping ranges are still merged but
/// adjacent ranges (e.g., 1-2,3-4) are kept separate. This is needed when
/// `--output-delimiter` is specified for byte/char mode so the delimiter is
/// inserted between originally separate but adjacent ranges.
pub fn parse_ranges(spec: &str, no_merge_adjacent: bool) -> Result<Vec<Range>, String> {
    let mut parsed: Vec<Range> = Vec::new();

    for raw in spec.split(',') {
        let token = raw.trim();
        if token.is_empty() {
            continue;
        }

        let range = match token.find('-') {
            None => {
                // Single field number, e.g. "7".
                let n: usize = token
                    .parse()
                    .map_err(|_| format!("invalid field: '{}'", token))?;
                if n == 0 {
                    return Err("fields and positions are numbered from 1".to_string());
                }
                Range { start: n, end: n }
            }
            Some(dash) => {
                let (lhs, rhs) = (&token[..dash], &token[dash + 1..]);

                // A bare "-" has no endpoint at all.
                if lhs.is_empty() && rhs.is_empty() {
                    return Err("invalid range with no endpoint: -".to_string());
                }

                // "-N" means "1-N"; "N-" means "N-∞".
                let lo = if lhs.is_empty() {
                    1
                } else {
                    lhs.parse::<usize>()
                        .map_err(|_| format!("invalid range: '{}'", token))?
                };
                let hi = if rhs.is_empty() {
                    usize::MAX
                } else {
                    rhs.parse::<usize>()
                        .map_err(|_| format!("invalid range: '{}'", token))?
                };

                if lo == 0 {
                    return Err("fields and positions are numbered from 1".to_string());
                }
                if lo > hi {
                    return Err(format!("invalid decreasing range: '{}'", token));
                }
                Range { start: lo, end: hi }
            }
        };
        parsed.push(range);
    }

    if parsed.is_empty() {
        return Err("you must specify a list of bytes, characters, or fields".to_string());
    }

    // Sort, then coalesce. Overlaps always merge; adjacency merges only when
    // no_merge_adjacent is false. saturating_add guards end == usize::MAX.
    parsed.sort_by_key(|r| (r.start, r.end));
    let mut merged: Vec<Range> = Vec::with_capacity(parsed.len());
    for r in parsed {
        let absorb = match merged.last() {
            Some(prev) if no_merge_adjacent => r.start <= prev.end,
            Some(prev) => r.start <= prev.end.saturating_add(1),
            None => false,
        };
        if absorb {
            let prev = merged.last_mut().unwrap();
            prev.end = prev.end.max(r.end);
        } else {
            merged.push(r);
        }
    }

    Ok(merged)
}
151
152/// Check if a 1-based position is in any range.
153/// Ranges must be sorted. Uses early exit since ranges are sorted.
154#[inline(always)]
155fn in_ranges(ranges: &[Range], pos: usize) -> bool {
156    for r in ranges {
157        if pos < r.start {
158            return false;
159        }
160        if pos <= r.end {
161            return true;
162        }
163    }
164    false
165}
166
167/// Pre-compute a 64-bit mask for field selection.
168/// Bit i-1 is set if field i should be output.
169#[inline]
170fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
171    let mut mask: u64 = 0;
172    for i in 1..=64u32 {
173        let in_range = in_ranges(ranges, i as usize);
174        if in_range != complement {
175            mask |= 1u64 << (i - 1);
176        }
177    }
178    mask
179}
180
181/// Check if a field should be selected, using bitset for first 64 fields.
182#[inline(always)]
183fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
184    if field_num <= 64 {
185        (mask >> (field_num - 1)) & 1 == 1
186    } else {
187        in_ranges(ranges, field_num) != complement
188    }
189}
190
191// ── Unsafe buffer helpers (skip bounds checks in hot loops) ──────────────
192
/// Append a slice to buf without capacity checks.
///
/// # Safety
/// Caller MUST ensure buf has at least `data.len()` bytes of spare capacity.
#[inline(always)]
unsafe fn buf_extend(buf: &mut Vec<u8>, data: &[u8]) {
    // SAFETY: per the contract above, len + data.len() <= capacity, and the
    // source slice cannot overlap the Vec's spare tail.
    unsafe {
        let old_len = buf.len();
        let dst = buf.as_mut_ptr().add(old_len);
        std::ptr::copy_nonoverlapping(data.as_ptr(), dst, data.len());
        buf.set_len(old_len + data.len());
    }
}
203
/// Append a single byte to buf without capacity checks.
///
/// # Safety
/// Caller MUST ensure buf has at least one byte of spare capacity.
#[inline(always)]
unsafe fn buf_push(buf: &mut Vec<u8>, b: u8) {
    // SAFETY: per the contract above, len < capacity.
    unsafe {
        let old_len = buf.len();
        buf.as_mut_ptr().add(old_len).write(b);
        buf.set_len(old_len + 1);
    }
}
214
215/// Write multiple IoSlice buffers using write_vectored (writev syscall).
216/// Batches into MAX_IOV-sized groups. Hot path: single write_vectored succeeds.
217/// Cold path (partial write) is out-of-line to keep the hot loop tight.
218#[inline]
219fn write_ioslices(out: &mut impl Write, slices: &[IoSlice]) -> io::Result<()> {
220    if slices.is_empty() {
221        return Ok(());
222    }
223    for batch in slices.chunks(MAX_IOV) {
224        let total: usize = batch.iter().map(|s| s.len()).sum();
225        let written = out.write_vectored(batch)?;
226        if written >= total {
227            continue;
228        }
229        if written == 0 {
230            return Err(io::Error::new(io::ErrorKind::WriteZero, "write zero"));
231        }
232        write_ioslices_slow(out, batch, written)?;
233    }
234    Ok(())
235}
236
/// Handle partial write_vectored (cold path, never inlined).
/// `skip` counts bytes the kernel already accepted; those are dropped from
/// the front of the slice list, then the remainder goes out via write_all.
#[cold]
#[inline(never)]
fn write_ioslices_slow(
    out: &mut impl Write,
    slices: &[IoSlice],
    mut skip: usize,
) -> io::Result<()> {
    for s in slices {
        if skip >= s.len() {
            // This slice was fully consumed by the partial write.
            skip -= s.len();
        } else {
            out.write_all(&s[skip..])?;
            skip = 0;
        }
    }
    Ok(())
}
256
257// ── Chunk splitting for parallel processing ──────────────────────────────
258
/// Number of available CPUs for parallel chunk splitting.
/// Uses std::thread::available_parallelism() to avoid triggering premature
/// rayon pool initialization (~300-500µs). Rayon pool inits on first scope() call.
#[inline]
fn num_cpus() -> usize {
    match std::thread::available_parallelism() {
        Ok(n) => n.get(),
        Err(_) => 1,
    }
}
268
269/// Split data into chunks for rayon::scope parallel processing.
270/// Uses Rayon's thread count to match the number of worker threads.
271fn split_for_scope<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
272    let num_threads = num_cpus().max(1);
273    if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
274        return vec![data];
275    }
276
277    let chunk_size = data.len() / num_threads;
278    let mut chunks = Vec::with_capacity(num_threads);
279    let mut pos = 0;
280
281    for _ in 0..num_threads - 1 {
282        let target = pos + chunk_size;
283        if target >= data.len() {
284            break;
285        }
286        let boundary = memchr::memchr(line_delim, &data[target..])
287            .map(|p| target + p + 1)
288            .unwrap_or(data.len());
289        if boundary > pos {
290            chunks.push(&data[pos..boundary]);
291        }
292        pos = boundary;
293    }
294
295    if pos < data.len() {
296        chunks.push(&data[pos..]);
297    }
298
299    chunks
300}
301
302// ── Fast path: multi-field non-contiguous extraction ─────────────────────
303
304/// Multi-field non-contiguous extraction (e.g., `cut -d, -f1,3,5`).
305/// Pre-collects delimiter positions per line into a stack-allocated array,
306/// then directly indexes into them for each selected field.
307/// This is O(max_field) per line instead of O(num_fields * scan_length).
308fn process_fields_multi_select(
309    data: &[u8],
310    delim: u8,
311    line_delim: u8,
312    ranges: &[Range],
313    suppress: bool,
314    out: &mut impl Write,
315) -> io::Result<()> {
316    let max_field = ranges.last().map_or(0, |r| r.end);
317
318    if data.len() >= PARALLEL_THRESHOLD {
319        let chunks = split_for_scope(data, line_delim);
320        let n = chunks.len();
321        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
322        rayon::scope(|s| {
323            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
324                s.spawn(move |_| {
325                    result.reserve(chunk.len() * 3 / 4);
326                    multi_select_chunk(
327                        chunk, delim, line_delim, ranges, max_field, suppress, result,
328                    );
329                });
330            }
331        });
332        let slices: Vec<IoSlice> = results
333            .iter()
334            .filter(|r| !r.is_empty())
335            .map(|r| IoSlice::new(r))
336            .collect();
337        write_ioslices(out, &slices)?;
338    } else {
339        process_chunked(data, line_delim, out, |chunk, buf| {
340            multi_select_chunk(chunk, delim, line_delim, ranges, max_field, suppress, buf);
341        })?;
342    }
343    Ok(())
344}
345
346/// Process a chunk for multi-field extraction.
347/// Uses single-pass memchr2 with bitmask field selection when max_field <= 64.
348/// Falls back to two-level scanning for larger field numbers.
349fn multi_select_chunk(
350    data: &[u8],
351    delim: u8,
352    line_delim: u8,
353    ranges: &[Range],
354    max_field: usize,
355    suppress: bool,
356    buf: &mut Vec<u8>,
357) {
358    // Two-level scan for small max_field: outer memchr(newline) + inner
359    // memchr(delim) with early exit at max_field. This is faster than the
360    // single-pass memchr2 approach when lines have many fields past max_field,
361    // because we skip scanning delimiters we don't need (e.g., for -f1,3,5
362    // on a 10-field CSV, we stop after delimiter 5 instead of scanning all 9).
363    if max_field <= 64 && delim != line_delim {
364        let mut mask: u64 = 0;
365        for r in ranges {
366            let s = r.start.max(1);
367            let e = r.end.min(64);
368            for f in s..=e {
369                mask |= 1u64 << (f - 1);
370            }
371        }
372        multi_select_twolevel(data, delim, line_delim, mask, max_field, suppress, buf);
373        return;
374    }
375
376    // Fallback: two-level scanning for large field numbers
377    buf.reserve(data.len());
378    let base = data.as_ptr();
379    let mut start = 0;
380    let max_delims = max_field.min(128);
381
382    for end_pos in memchr_iter(line_delim, data) {
383        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
384        multi_select_line_fast(
385            line, delim, line_delim, ranges, max_delims, suppress, buf, start, base,
386        );
387        start = end_pos + 1;
388    }
389    if start < data.len() {
390        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
391        multi_select_line_fast(
392            line, delim, line_delim, ranges, max_delims, suppress, buf, start, base,
393        );
394    }
395}
396
/// Single-pass multi-field extraction over a whole chunk: one memchr2 scan
/// finds both field delimiters and line delimiters, and a 64-bit `mask`
/// (bit i-1 set => field i selected) drives selection. Currently unused —
/// `multi_select_twolevel` serves as the <=64-field path — kept for
/// reference/benchmarking.
///
/// NOTE(review): the shift `1u64 << (field_num - 1)` is guarded only by
/// `field_num <= max_field` on the delimiter branch, so callers must ensure
/// max_field <= 64 or the shift amount overflows — confirm before re-enabling.
#[allow(dead_code)]
fn multi_select_chunk_bitmask(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    mask: u64,
    max_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    // Single-pass memchr2 approach: scan for both delimiters and newlines
    // simultaneously. This avoids per-line memchr_iter creation overhead,
    // which dominates for short lines (200K lines × ~35 bytes each).
    //
    // All output goes through raw pointer writes at out_base + wp. The
    // reserve(data.len() + 1) below guarantees capacity: output never exceeds
    // input except for one newline appended to an unterminated final line.
    buf.reserve(data.len() + 1);
    let initial_len = buf.len();
    let out_base = unsafe { buf.as_mut_ptr().add(initial_len) };
    let src = data.as_ptr();
    let mut wp: usize = 0;

    let mut field_num: usize = 1; // current field (1-based)
    let mut field_start: usize = 0; // start of current field
    let mut first_output = true; // first field on current line?
    let mut has_delim = false; // current line has any delimiter?

    for pos in memchr::memchr2_iter(delim, line_delim, data) {
        if data[pos] == line_delim {
            // End of line: handle last field + write newline
            if !has_delim {
                // Line had no delimiter: pass through or suppress
                if !suppress {
                    let len = pos - field_start;
                    unsafe {
                        std::ptr::copy_nonoverlapping(src.add(field_start), out_base.add(wp), len);
                    }
                    wp += len;
                    unsafe {
                        *out_base.add(wp) = line_delim;
                    }
                    wp += 1;
                }
            } else {
                // Check if last field is selected
                if field_num <= 64 && (mask & (1u64 << (field_num - 1))) != 0 {
                    if !first_output {
                        unsafe {
                            *out_base.add(wp) = delim;
                        }
                        wp += 1;
                    }
                    let len = pos - field_start;
                    unsafe {
                        std::ptr::copy_nonoverlapping(src.add(field_start), out_base.add(wp), len);
                    }
                    wp += len;
                }
                unsafe {
                    *out_base.add(wp) = line_delim;
                }
                wp += 1;
            }
            // Reset for next line
            field_num = 1;
            field_start = pos + 1;
            first_output = true;
            has_delim = false;
        } else {
            // Delimiter found
            has_delim = true;
            if field_num <= max_field && (mask & (1u64 << (field_num - 1))) != 0 {
                if !first_output {
                    unsafe {
                        *out_base.add(wp) = delim;
                    }
                    wp += 1;
                }
                let len = pos - field_start;
                unsafe {
                    std::ptr::copy_nonoverlapping(src.add(field_start), out_base.add(wp), len);
                }
                wp += len;
                first_output = false;
            }
            field_num += 1;
            field_start = pos + 1;
        }
    }

    // Handle final line without trailing newline
    if field_start < data.len() {
        if !has_delim {
            if !suppress {
                let len = data.len() - field_start;
                unsafe {
                    std::ptr::copy_nonoverlapping(src.add(field_start), out_base.add(wp), len);
                }
                wp += len;
                unsafe {
                    *out_base.add(wp) = line_delim;
                }
                wp += 1;
            }
        } else {
            if field_num <= 64 && (mask & (1u64 << (field_num - 1))) != 0 {
                if !first_output {
                    unsafe {
                        *out_base.add(wp) = delim;
                    }
                    wp += 1;
                }
                let len = data.len() - field_start;
                unsafe {
                    std::ptr::copy_nonoverlapping(src.add(field_start), out_base.add(wp), len);
                }
                wp += len;
            }
            unsafe {
                *out_base.add(wp) = line_delim;
            }
            wp += 1;
        }
    }

    // SAFETY: wp <= data.len() + 1 (output bounded by input plus one newline),
    // which the reserve above covered.
    unsafe {
        buf.set_len(initial_len + wp);
    }
}
526
/// Two-level multi-field extraction: outer memchr(newline) for line boundaries,
/// inner memchr(delim) with early exit after max_field delimiters per line.
/// For `-f1,3,5` on a 10-field CSV, this scans only 5 delimiters per line
/// instead of all 9, saving ~45% of delimiter processing.
///
/// Output is written via raw pointer at out_base + wp; the up-front
/// reserve(data.len() + 1) guarantees capacity because output never exceeds
/// input except for one newline appended to an unterminated final line
/// (checked by the debug_assert at the end). `mask` bit i-1 selects field i;
/// callers supply max_field <= 64.
fn multi_select_twolevel(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    mask: u64,
    max_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    buf.reserve(data.len() + 1);
    let initial_len = buf.len();
    let out_base = unsafe { buf.as_mut_ptr().add(initial_len) };
    let src = data.as_ptr();
    let mut wp: usize = 0;
    let mut line_start: usize = 0;

    for nl_pos in memchr_iter(line_delim, data) {
        let line_len = nl_pos - line_start;
        let line = &data[line_start..nl_pos];

        if line_len == 0 {
            // Empty line: no delimiter, so pass through unless -s suppresses it.
            if !suppress {
                unsafe {
                    *out_base.add(wp) = line_delim;
                }
                wp += 1;
            }
            line_start = nl_pos + 1;
            continue;
        }

        // Scan delimiters within the line, stopping after max_field.
        // Uses memchr_iter for amortized SIMD setup (one per line vs one per field).
        let mut field_num: usize = 1;
        let mut field_start: usize = 0;
        let mut first_output = true;
        let mut has_delim = false;

        for dp in memchr::memchr_iter(delim, line) {
            has_delim = true;
            if (mask >> (field_num - 1)) & 1 == 1 {
                if !first_output {
                    unsafe {
                        *out_base.add(wp) = delim;
                    }
                    wp += 1;
                }
                let flen = dp - field_start;
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        src.add(line_start + field_start),
                        out_base.add(wp),
                        flen,
                    );
                }
                wp += flen;
                first_output = false;
            }
            field_num += 1;
            field_start = dp + 1;
            // Fields past max_field can never be selected; stop scanning.
            if field_num > max_field {
                break;
            }
        }

        if !has_delim {
            // No delimiter: pass through or suppress
            if !suppress {
                unsafe {
                    std::ptr::copy_nonoverlapping(src.add(line_start), out_base.add(wp), line_len);
                }
                wp += line_len;
                unsafe {
                    *out_base.add(wp) = line_delim;
                }
                wp += 1;
            }
        } else {
            // Check if the last field (after last found delimiter) is selected.
            // The `<= 64` guard also keeps the shift amount in range when the
            // scan ran past max_field.
            if field_num <= 64 && (mask >> (field_num - 1)) & 1 == 1 {
                if !first_output {
                    unsafe {
                        *out_base.add(wp) = delim;
                    }
                    wp += 1;
                }
                let flen = line_len - field_start;
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        src.add(line_start + field_start),
                        out_base.add(wp),
                        flen,
                    );
                }
                wp += flen;
            }
            unsafe {
                *out_base.add(wp) = line_delim;
            }
            wp += 1;
        }

        line_start = nl_pos + 1;
    }

    // Handle final line without trailing newline (same logic as above, but the
    // line ends at data.len() and the output newline is synthesized).
    if line_start < data.len() {
        let line = &data[line_start..];
        let line_len = line.len();
        let mut field_num: usize = 1;
        let mut field_start: usize = 0;
        let mut first_output = true;
        let mut has_delim = false;

        for dp in memchr::memchr_iter(delim, line) {
            has_delim = true;
            if (mask >> (field_num - 1)) & 1 == 1 {
                if !first_output {
                    unsafe {
                        *out_base.add(wp) = delim;
                    }
                    wp += 1;
                }
                let flen = dp - field_start;
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        src.add(line_start + field_start),
                        out_base.add(wp),
                        flen,
                    );
                }
                wp += flen;
                first_output = false;
            }
            field_num += 1;
            field_start = dp + 1;
            if field_num > max_field {
                break;
            }
        }

        if !has_delim {
            if !suppress {
                unsafe {
                    std::ptr::copy_nonoverlapping(src.add(line_start), out_base.add(wp), line_len);
                }
                wp += line_len;
                unsafe {
                    *out_base.add(wp) = line_delim;
                }
                wp += 1;
            }
        } else {
            if field_num <= 64 && (mask >> (field_num - 1)) & 1 == 1 {
                if !first_output {
                    unsafe {
                        *out_base.add(wp) = delim;
                    }
                    wp += 1;
                }
                let flen = line_len - field_start;
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        src.add(line_start + field_start),
                        out_base.add(wp),
                        flen,
                    );
                }
                wp += flen;
            }
            unsafe {
                *out_base.add(wp) = line_delim;
            }
            wp += 1;
        }
    }

    debug_assert!(
        wp <= data.len() + 1,
        "wp={} exceeded reservation data.len()+1={}",
        wp,
        data.len() + 1
    );
    // SAFETY: exactly wp bytes past initial_len were initialized above, and
    // wp <= data.len() + 1 which the reserve covered.
    unsafe {
        buf.set_len(initial_len + wp);
    }
}
718
/// Extract selected fields from a single line using delimiter position scanning.
/// Collects delimiter positions into a stack array with early exit at
/// max_delims, then indexes directly into them for each selected range.
///
/// Caller contract: `buf` must have enough spare capacity for this line's
/// output (line length + 1), since the unchecked buf_extend/buf_push helpers
/// are used; `max_delims` must be <= 128 (the delim_pos array size).
/// NOTE(review): because collection stops at max_delims, any field past the
/// max_delims-th delimiter is absorbed into the final field — callers must
/// not request fields beyond max_delims; confirm for max_field > 128.
#[inline(always)]
fn multi_select_line_fast(
    line: &[u8],
    delim: u8,
    line_delim: u8,
    ranges: &[Range],
    max_delims: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
    _line_abs_start: usize,
    _data_base: *const u8,
) {
    let len = line.len();
    if len == 0 {
        // Empty line counts as delimiter-free: emit bare newline unless -s.
        if !suppress {
            unsafe { buf_push(buf, line_delim) };
        }
        return;
    }

    let base = line.as_ptr();

    // Collect delimiter positions up to max_delims (early exit).
    let mut delim_pos = [0usize; 128];
    let mut num_delims: usize = 0;

    for pos in memchr_iter(delim, line) {
        if num_delims < max_delims {
            delim_pos[num_delims] = pos;
            num_delims += 1;
            if num_delims >= max_delims {
                break;
            }
        }
    }

    if num_delims == 0 {
        // No delimiter on the line: pass the whole line through or suppress.
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }

    let total_fields = num_delims + 1;
    let mut first_output = true;

    // Ranges are sorted, so once one starts past total_fields, the rest do too.
    for r in ranges {
        let range_start = r.start;
        let range_end = r.end.min(total_fields);
        if range_start > total_fields {
            break;
        }
        for field_num in range_start..=range_end {
            if field_num > total_fields {
                break;
            }

            // Field i spans (delim_pos[i-2] + 1)..delim_pos[i-1]; field 1
            // starts at 0 and the last field ends at line end.
            let field_start = if field_num == 1 {
                0
            } else if field_num - 2 < num_delims {
                delim_pos[field_num - 2] + 1
            } else {
                continue;
            };
            let field_end = if field_num <= num_delims {
                delim_pos[field_num - 1]
            } else {
                len
            };

            if !first_output {
                unsafe { buf_push(buf, delim) };
            }
            // SAFETY: field_start <= field_end <= len, so the slice is within
            // `line`; capacity is guaranteed by the caller's reservation.
            unsafe {
                buf_extend(
                    buf,
                    std::slice::from_raw_parts(base.add(field_start), field_end - field_start),
                );
            }
            first_output = false;
        }
    }

    unsafe { buf_push(buf, line_delim) };
}
810
811// ── Fast path: field extraction with batched output ──────────────────────
812
813/// Optimized field extraction with early exit and batched output.
814fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
815    let delim = cfg.delim;
816    let line_delim = cfg.line_delim;
817    let ranges = cfg.ranges;
818    let complement = cfg.complement;
819    let output_delim = cfg.output_delim;
820    let suppress = cfg.suppress_no_delim;
821
822    // NOTE: Removed the full-file `memchr(delim, data).is_none()` scan.
823    // That scan was O(N) over the entire file just to check an edge case
824    // (no delimiter in any line). The per-line processing already handles
825    // lines without delimiters correctly, so the scan was pure overhead
826    // for files that DO contain delimiters (the common case).
827
828    // Ultra-fast path: single field extraction (e.g., cut -f5)
829    if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
830        return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
831    }
832
833    // Fast path: complement of single field or contiguous range with default output delimiter.
834    if complement
835        && ranges.len() == 1
836        && output_delim.len() == 1
837        && output_delim[0] == delim
838        && ranges[0].start == ranges[0].end
839    {
840        return process_complement_single_field(
841            data,
842            delim,
843            line_delim,
844            ranges[0].start,
845            suppress,
846            out,
847        );
848    }
849
850    // Fast path: complement of contiguous range (e.g., --complement -f3-5 = output fields 1,2,6+).
851    // This is equivalent to outputting a prefix and a suffix, skipping the middle range.
852    if complement
853        && ranges.len() == 1
854        && ranges[0].start > 1
855        && ranges[0].end < usize::MAX
856        && output_delim.len() == 1
857        && output_delim[0] == delim
858    {
859        return process_complement_range(
860            data,
861            delim,
862            line_delim,
863            ranges[0].start,
864            ranges[0].end,
865            suppress,
866            out,
867        );
868    }
869
870    // Fast path: contiguous from-start field range (e.g., cut -f1-5)
871    if !complement
872        && ranges.len() == 1
873        && ranges[0].start == 1
874        && output_delim.len() == 1
875        && output_delim[0] == delim
876        && ranges[0].end < usize::MAX
877    {
878        return process_fields_prefix(data, delim, line_delim, ranges[0].end, suppress, out);
879    }
880
881    // Fast path: open-ended field range from field N (e.g., cut -f3-)
882    if !complement
883        && ranges.len() == 1
884        && ranges[0].end == usize::MAX
885        && ranges[0].start > 1
886        && output_delim.len() == 1
887        && output_delim[0] == delim
888    {
889        return process_fields_suffix(data, delim, line_delim, ranges[0].start, suppress, out);
890    }
891
892    // Fast path: contiguous field range with start > 1 (e.g., cut -f2-4)
893    if !complement
894        && ranges.len() == 1
895        && ranges[0].start > 1
896        && ranges[0].end < usize::MAX
897        && output_delim.len() == 1
898        && output_delim[0] == delim
899    {
900        return process_fields_mid_range(
901            data,
902            delim,
903            line_delim,
904            ranges[0].start,
905            ranges[0].end,
906            suppress,
907            out,
908        );
909    }
910
911    // Fast path: multi-field non-contiguous extraction (e.g., cut -f1,3,5)
912    // Uses delimiter position caching: find all delimiter positions per line,
913    // then directly index into them for each selected field.
914    // This is faster than the general extract_fields_to_buf which re-checks
915    // is_selected() for every field encountered.
916    if !complement
917        && ranges.len() > 1
918        && ranges.last().map_or(false, |r| r.end < usize::MAX)
919        && output_delim.len() == 1
920        && output_delim[0] == delim
921        && delim != line_delim
922    {
923        return process_fields_multi_select(data, delim, line_delim, ranges, suppress, out);
924    }
925
926    // General field extraction
927    let max_field = if complement {
928        usize::MAX
929    } else {
930        ranges.last().map(|r| r.end).unwrap_or(0)
931    };
932    let field_mask = compute_field_mask(ranges, complement);
933
934    if data.len() >= PARALLEL_THRESHOLD {
935        let chunks = split_for_scope(data, line_delim);
936        let n = chunks.len();
937        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
938        rayon::scope(|s| {
939            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
940                s.spawn(move |_| {
941                    result.reserve(chunk.len() + 1);
942                    process_fields_chunk(
943                        chunk,
944                        delim,
945                        ranges,
946                        output_delim,
947                        suppress,
948                        max_field,
949                        field_mask,
950                        line_delim,
951                        complement,
952                        result,
953                    );
954                });
955            }
956        });
957        let slices: Vec<IoSlice> = results
958            .iter()
959            .filter(|r| !r.is_empty())
960            .map(|r| IoSlice::new(r))
961            .collect();
962        write_ioslices(out, &slices)?;
963    } else {
964        process_chunked(data, line_delim, out, |chunk, buf| {
965            process_fields_chunk(
966                chunk,
967                delim,
968                ranges,
969                output_delim,
970                suppress,
971                max_field,
972                field_mask,
973                line_delim,
974                complement,
975                buf,
976            );
977        })?;
978    }
979    Ok(())
980}
981
982/// Process a chunk of data for general field extraction.
983/// Uses two-level scanning: outer memchr(newline) for line boundaries, inner
984/// memchr_iter(delim) for delimiter positions. This is faster than memchr2 single-pass
985/// because memchr (one needle) is ~30-50% faster per byte than memchr2 (two needles).
986fn process_fields_chunk(
987    data: &[u8],
988    delim: u8,
989    ranges: &[Range],
990    output_delim: &[u8],
991    suppress: bool,
992    max_field: usize,
993    field_mask: u64,
994    line_delim: u8,
995    complement: bool,
996    buf: &mut Vec<u8>,
997) {
998    // Always use two-level approach: outer memchr(newline) + inner memchr_iter(delim).
999    // Even for complement/unbounded ranges, two-level is faster because memchr is
1000    // ~30-50% faster per byte than memchr2. The per-line function call overhead
1001    // is negligible compared to the SIMD scan savings.
1002    if delim != line_delim {
1003        buf.reserve(data.len());
1004        let mut start = 0;
1005        for end_pos in memchr_iter(line_delim, data) {
1006            let line = &data[start..end_pos];
1007            extract_fields_to_buf(
1008                line,
1009                delim,
1010                ranges,
1011                output_delim,
1012                suppress,
1013                max_field,
1014                field_mask,
1015                line_delim,
1016                buf,
1017                complement,
1018            );
1019            start = end_pos + 1;
1020        }
1021        if start < data.len() {
1022            extract_fields_to_buf(
1023                &data[start..],
1024                delim,
1025                ranges,
1026                output_delim,
1027                suppress,
1028                max_field,
1029                field_mask,
1030                line_delim,
1031                buf,
1032                complement,
1033            );
1034        }
1035        return;
1036    }
1037
1038    // Fallback: when delim == line_delim, use the two-level scan approach
1039    let mut start = 0;
1040    for end_pos in memchr_iter(line_delim, data) {
1041        let line = &data[start..end_pos];
1042        extract_fields_to_buf(
1043            line,
1044            delim,
1045            ranges,
1046            output_delim,
1047            suppress,
1048            max_field,
1049            field_mask,
1050            line_delim,
1051            buf,
1052            complement,
1053        );
1054        start = end_pos + 1;
1055    }
1056    if start < data.len() {
1057        extract_fields_to_buf(
1058            &data[start..],
1059            delim,
1060            ranges,
1061            output_delim,
1062            suppress,
1063            max_field,
1064            field_mask,
1065            line_delim,
1066            buf,
1067            complement,
1068        );
1069    }
1070}
1071
// ── Ultra-fast single field extraction ───────────────────────────────────

/// Specialized path for extracting exactly one field (e.g., `cut -f5`).
/// Uses two-level scanning: outer memchr(newline) for line boundaries, inner
/// memchr(delim) for the field delimiter with early exit.
///
/// `target` is the 1-based field number; `suppress` mirrors `cut -s`
/// (lines without any delimiter are dropped).
fn process_single_field(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    target: usize,
    suppress: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    // Convert the 1-based field number to a 0-based index once.
    let target_idx = target - 1;

    // Single-field extraction parallelizes from 16MB — half of the general
    // PARALLEL_THRESHOLD (32MB) — presumably because per-line work here is
    // lighter, so thread coordination amortizes sooner.
    // NOTE(review): the original comment claimed this "matches
    // PARALLEL_THRESHOLD", which it does not; confirm intended threshold.
    const FIELD_PARALLEL_MIN: usize = 16 * 1024 * 1024;

    if delim != line_delim {
        // Field 1 fast path: two-level scan (outer newline + inner first-delim).
        // For field 1, only needs to find the first delimiter per line.
        // Lines without delimiter are tracked as contiguous runs for bulk copy.
        // Requires !suppress: with -s, delimiter-free lines must be dropped,
        // which the contiguous-run copy does not handle.
        if target_idx == 0 && !suppress {
            if data.len() >= FIELD_PARALLEL_MIN {
                return single_field1_parallel(data, delim, line_delim, out);
            }
            return process_chunked(data, line_delim, out, |chunk, buf| {
                single_field1_to_buf(chunk, delim, line_delim, buf);
            });
        }

        // Two-level approach for field N: outer newline scan + inner delim scan
        // with early exit at target_idx. Faster than memchr2 single-pass because
        // we only scan delimiters up to target_idx per line (not all of them).
        if data.len() >= FIELD_PARALLEL_MIN {
            // Scatter: one output buffer per line-aligned chunk.
            let chunks = split_for_scope(data, line_delim);
            let n = chunks.len();
            let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
            rayon::scope(|s| {
                for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
                    s.spawn(move |_| {
                        // Capacity heuristic: one field per line is usually
                        // well under half the input size.
                        result.reserve(chunk.len() / 2);
                        process_single_field_chunk(
                            chunk, delim, target_idx, line_delim, suppress, result,
                        );
                    });
                }
            });
            // Gather: writev the non-empty per-chunk buffers in input order.
            let slices: Vec<IoSlice> = results
                .iter()
                .filter(|r| !r.is_empty())
                .map(|r| IoSlice::new(r))
                .collect();
            write_ioslices(out, &slices)?;
        } else {
            let mut buf = Vec::with_capacity(data.len().min(4 * 1024 * 1024));
            process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
            if !buf.is_empty() {
                out.write_all(&buf)?;
            }
        }
        return Ok(());
    }

    // delim == line_delim: same chunk processor as above, only with a smaller
    // capacity heuristic (/4) since each output line collapses to one field.
    // NOTE(review): the original label "nested loop approach" did not match
    // the code — both paths call process_single_field_chunk.
    if data.len() >= FIELD_PARALLEL_MIN {
        let chunks = split_for_scope(data, line_delim);
        let n = chunks.len();
        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
        rayon::scope(|s| {
            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
                s.spawn(move |_| {
                    result.reserve(chunk.len() / 4);
                    process_single_field_chunk(
                        chunk, delim, target_idx, line_delim, suppress, result,
                    );
                });
            }
        });
        let slices: Vec<IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| IoSlice::new(r))
            .collect();
        write_ioslices(out, &slices)?;
    } else {
        let mut buf = Vec::with_capacity(data.len() / 4);
        process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
        if !buf.is_empty() {
            out.write_all(&buf)?;
        }
    }
    Ok(())
}
1166
1167/// Complement range extraction: skip fields start..=end, output rest (e.g., --complement -f3-5).
1168/// For each line: output fields 1..start-1, then fields end+1..EOF, skipping fields start..end.
1169fn process_complement_range(
1170    data: &[u8],
1171    delim: u8,
1172    line_delim: u8,
1173    skip_start: usize,
1174    skip_end: usize,
1175    suppress: bool,
1176    out: &mut impl Write,
1177) -> io::Result<()> {
1178    if data.len() >= PARALLEL_THRESHOLD {
1179        let chunks = split_for_scope(data, line_delim);
1180        let n = chunks.len();
1181        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1182        rayon::scope(|s| {
1183            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1184                s.spawn(move |_| {
1185                    result.reserve(chunk.len());
1186                    complement_range_chunk(
1187                        chunk, delim, skip_start, skip_end, line_delim, suppress, result,
1188                    );
1189                });
1190            }
1191        });
1192        let slices: Vec<IoSlice> = results
1193            .iter()
1194            .filter(|r| !r.is_empty())
1195            .map(|r| IoSlice::new(r))
1196            .collect();
1197        write_ioslices(out, &slices)?;
1198    } else {
1199        process_chunked(data, line_delim, out, |chunk, buf| {
1200            complement_range_chunk(
1201                chunk, delim, skip_start, skip_end, line_delim, suppress, buf,
1202            );
1203        })?;
1204    }
1205    Ok(())
1206}
1207
1208/// Process a chunk for complement range extraction.
1209fn complement_range_chunk(
1210    data: &[u8],
1211    delim: u8,
1212    skip_start: usize,
1213    skip_end: usize,
1214    line_delim: u8,
1215    suppress: bool,
1216    buf: &mut Vec<u8>,
1217) {
1218    // Pre-reserve entire chunk capacity to eliminate per-line reserve overhead.
1219    buf.reserve(data.len());
1220    let mut start = 0;
1221    for end_pos in memchr_iter(line_delim, data) {
1222        let line = &data[start..end_pos];
1223        complement_range_line(line, delim, skip_start, skip_end, line_delim, suppress, buf);
1224        start = end_pos + 1;
1225    }
1226    if start < data.len() {
1227        complement_range_line(
1228            &data[start..],
1229            delim,
1230            skip_start,
1231            skip_end,
1232            line_delim,
1233            suppress,
1234            buf,
1235        );
1236    }
1237}
1238
/// Extract all fields except skip_start..=skip_end from one line.
/// Outputs fields 1..skip_start-1, then fields skip_end+1..EOF.
///
/// Optimized: only scans for enough delimiters to find the skip region boundaries.
/// For `--complement -f3-5` with 20 fields, this finds delimiter 2 and 5, then
/// does a single copy of prefix + suffix, avoiding scanning past field 5.
///
/// The dispatch only routes here with skip_start > 1 and skip_end < usize::MAX,
/// so need_prefix_delims >= 1 and the arithmetic below cannot overflow.
#[inline(always)]
fn complement_range_line(
    line: &[u8],
    delim: u8,
    skip_start: usize,
    skip_end: usize,
    line_delim: u8,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    let len = line.len();
    if len == 0 {
        // Empty line: emit just the terminator unless -s suppresses it.
        if !suppress {
            unsafe { buf_push(buf, line_delim) };
        }
        return;
    }

    // Note: no per-line buf.reserve — complement_range_chunk already reserves data.len()
    // NOTE(review): buf_push/buf_extend appear to rely on that reservation for
    // their safety contract — confirm against their definitions.
    let base = line.as_ptr();

    // 1-based field numbers. To skip fields skip_start..=skip_end:
    // - prefix_end = position of (skip_start-1)th delimiter (exclusive; end of prefix fields)
    // - suffix_start = position after skip_end-th delimiter (inclusive; start of suffix fields)
    //
    // Find the first (skip_start - 1) delimiters to locate prefix_end,
    // then the next (skip_end - skip_start + 1) delimiters to locate suffix_start.

    let need_prefix_delims = skip_start - 1; // number of delimiters before the skip region
    let need_skip_delims = skip_end - skip_start + 1; // delimiters within the skip region
    let total_need = need_prefix_delims + need_skip_delims;

    // Find delimiter positions up to total_need; the break keeps the scan
    // bounded by the skip region rather than the whole line.
    let mut delim_count: usize = 0;
    let mut prefix_end_pos: usize = usize::MAX; // byte position of (skip_start-1)th delim
    let mut suffix_start_pos: usize = usize::MAX; // byte position after skip_end-th delim

    for pos in memchr_iter(delim, line) {
        delim_count += 1;
        if delim_count == need_prefix_delims {
            prefix_end_pos = pos;
        }
        if delim_count == total_need {
            suffix_start_pos = pos + 1;
            break;
        }
    }

    if delim_count == 0 {
        // No delimiter at all: the line passes through unchanged unless -s is set.
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }

    // Case analysis:
    // 1. Not enough delims to reach skip_start: all fields are before skip region, output all
    // 2. Enough to reach skip_start but not skip_end: prefix + no suffix
    // 3. Enough to reach skip_end: prefix + delim + suffix

    if delim_count < need_prefix_delims {
        // Not enough fields to reach skip region — output entire line
        unsafe {
            buf_extend(buf, line);
            buf_push(buf, line_delim);
        }
        return;
    }

    let has_prefix = need_prefix_delims > 0;
    // suffix_start_pos == usize::MAX means the skip region ran past the end of
    // the line (case 2 above); < len means real suffix bytes exist.
    let has_suffix = suffix_start_pos != usize::MAX && suffix_start_pos < len;

    if has_prefix && has_suffix {
        // Output: prefix (up to prefix_end_pos) + delim + suffix (from suffix_start_pos)
        unsafe {
            buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
            buf_push(buf, delim);
            buf_extend(
                buf,
                std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
            );
            buf_push(buf, line_delim);
        }
    } else if has_prefix {
        // Only prefix, no suffix (skip region extends to end of line)
        unsafe {
            buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
            buf_push(buf, line_delim);
        }
    } else if has_suffix {
        // No prefix (skip_start == 1), only suffix
        unsafe {
            buf_extend(
                buf,
                std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
            );
            buf_push(buf, line_delim);
        }
    } else {
        // All fields skipped
        unsafe { buf_push(buf, line_delim) };
    }
}
1352
1353/// Complement single-field extraction: skip one field, output rest unchanged.
1354fn process_complement_single_field(
1355    data: &[u8],
1356    delim: u8,
1357    line_delim: u8,
1358    skip_field: usize,
1359    suppress: bool,
1360    out: &mut impl Write,
1361) -> io::Result<()> {
1362    let skip_idx = skip_field - 1;
1363
1364    if data.len() >= PARALLEL_THRESHOLD {
1365        let chunks = split_for_scope(data, line_delim);
1366        let n = chunks.len();
1367        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1368        rayon::scope(|s| {
1369            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1370                s.spawn(move |_| {
1371                    result.reserve(chunk.len());
1372                    complement_single_field_chunk(
1373                        chunk, delim, skip_idx, line_delim, suppress, result,
1374                    );
1375                });
1376            }
1377        });
1378        let slices: Vec<IoSlice> = results
1379            .iter()
1380            .filter(|r| !r.is_empty())
1381            .map(|r| IoSlice::new(r))
1382            .collect();
1383        write_ioslices(out, &slices)?;
1384    } else {
1385        process_chunked(data, line_delim, out, |chunk, buf| {
1386            complement_single_field_chunk(chunk, delim, skip_idx, line_delim, suppress, buf);
1387        })?;
1388    }
1389    Ok(())
1390}
1391
1392/// Process a chunk for complement single-field extraction using two-level scanning.
1393/// Outer memchr(newline) for line boundaries, inner memchr_iter(delim) with early exit
1394/// after finding the skip field's bounding delimiters. Faster than memchr2 single-pass
1395/// because memchr is faster per byte and inner scan exits early.
1396fn complement_single_field_chunk(
1397    data: &[u8],
1398    delim: u8,
1399    skip_idx: usize,
1400    line_delim: u8,
1401    suppress: bool,
1402    buf: &mut Vec<u8>,
1403) {
1404    buf.reserve(data.len());
1405    let mut start = 0;
1406    for end_pos in memchr_iter(line_delim, data) {
1407        let line = &data[start..end_pos];
1408        complement_single_field_line(line, delim, skip_idx, line_delim, suppress, buf);
1409        start = end_pos + 1;
1410    }
1411    if start < data.len() {
1412        complement_single_field_line(&data[start..], delim, skip_idx, line_delim, suppress, buf);
1413    }
1414}
1415
/// Per-line complement single-field extraction: emit everything on the line
/// except the field at 0-based index `skip_idx`.
///
/// NOTE(review): the original header called this a "fallback … for
/// delim == line_delim", but complement_single_field_chunk routes every line
/// through here regardless of the delimiter byte.
#[inline(always)]
fn complement_single_field_line(
    line: &[u8],
    delim: u8,
    skip_idx: usize,
    line_delim: u8,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    let len = line.len();
    if len == 0 {
        // Empty line: terminator only, unless -s suppresses delimiter-free lines.
        if !suppress {
            unsafe { buf_push(buf, line_delim) };
        }
        return;
    }

    let base = line.as_ptr();
    // The skipped field is bounded by delimiter #skip_idx (before it) and
    // delimiter #(skip_idx + 1) (after it), counting delimiters from 1.
    let need_before = skip_idx;
    let need_total = skip_idx + 1;

    let mut delim_count: usize = 0;
    let mut skip_start_pos: usize = 0; // byte offset where the skipped field starts
    let mut skip_end_pos: usize = len; // byte offset of the delimiter after it
    let mut found_end = false;

    for pos in memchr_iter(delim, line) {
        delim_count += 1;
        if delim_count == need_before {
            skip_start_pos = pos + 1;
        }
        if delim_count == need_total {
            skip_end_pos = pos;
            found_end = true;
            // Early exit: delimiters past the skipped field are irrelevant.
            break;
        }
    }

    if delim_count == 0 {
        // No delimiter: line passes through unchanged unless -s is set.
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }

    if delim_count < need_before {
        // The skipped field does not exist on this line — output it whole.
        unsafe {
            buf_extend(buf, line);
            buf_push(buf, line_delim);
        }
        return;
    }

    let has_prefix = skip_idx > 0 && skip_start_pos > 0;
    let has_suffix = found_end && skip_end_pos < len;

    if has_prefix && has_suffix {
        // prefix (minus its trailing delimiter) + one joining delimiter + suffix.
        unsafe {
            buf_extend(buf, std::slice::from_raw_parts(base, skip_start_pos - 1));
            buf_push(buf, delim);
            buf_extend(
                buf,
                std::slice::from_raw_parts(base.add(skip_end_pos + 1), len - skip_end_pos - 1),
            );
            buf_push(buf, line_delim);
        }
    } else if has_prefix {
        // Skip region extends to end of line: keep everything before it.
        unsafe {
            buf_extend(buf, std::slice::from_raw_parts(base, skip_start_pos - 1));
            buf_push(buf, line_delim);
        }
    } else if has_suffix {
        // Skipped field is the first one: keep everything after its delimiter.
        unsafe {
            buf_extend(
                buf,
                std::slice::from_raw_parts(base.add(skip_end_pos + 1), len - skip_end_pos - 1),
            );
            buf_push(buf, line_delim);
        }
    } else {
        // The line consists solely of the skipped field.
        unsafe { buf_push(buf, line_delim) };
    }
}
1503
1504/// Contiguous from-start field range extraction (e.g., `cut -f1-5`).
1505/// Zero-copy for the non-parallel path: identifies the truncation point per line
1506/// and writes contiguous runs directly from the source data.
1507fn process_fields_prefix(
1508    data: &[u8],
1509    delim: u8,
1510    line_delim: u8,
1511    last_field: usize,
1512    suppress: bool,
1513    out: &mut impl Write,
1514) -> io::Result<()> {
1515    if data.len() >= PARALLEL_THRESHOLD {
1516        let chunks = split_for_scope(data, line_delim);
1517        let n = chunks.len();
1518        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1519        rayon::scope(|s| {
1520            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1521                s.spawn(move |_| {
1522                    result.reserve(chunk.len());
1523                    fields_prefix_chunk(chunk, delim, line_delim, last_field, suppress, result);
1524                });
1525            }
1526        });
1527        let slices: Vec<IoSlice> = results
1528            .iter()
1529            .filter(|r| !r.is_empty())
1530            .map(|r| IoSlice::new(r))
1531            .collect();
1532        write_ioslices(out, &slices)?;
1533    } else if !suppress {
1534        // Zero-copy fast path: scan for truncation points, write runs from source.
1535        // When suppress is false, every line is output (with or without delimiter).
1536        // Most lines have enough fields, so the output is often identical to input.
1537        fields_prefix_zerocopy(data, delim, line_delim, last_field, out)?;
1538    } else {
1539        process_chunked(data, line_delim, out, |chunk, buf| {
1540            fields_prefix_chunk(chunk, delim, line_delim, last_field, suppress, buf);
1541        })?;
1542    }
1543    Ok(())
1544}
1545
/// Zero-copy field-prefix extraction using writev: builds IoSlice entries pointing
/// directly into the source data, flushing in MAX_IOV-sized batches.
/// For lines where the Nth delimiter exists, we truncate at that point.
/// For lines with fewer fields, we output them unchanged (contiguous run).
/// Lines without any delimiter are output unchanged (suppress=false assumed).
#[inline]
fn fields_prefix_zerocopy(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    last_field: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    // Single terminator byte on the stack; every truncated line's IoSlice
    // for the terminator points here.
    let newline_buf: [u8; 1] = [line_delim];
    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
    let mut start = 0;
    // run_start marks the beginning of a contiguous span of untouched lines
    // that can later be emitted as a single IoSlice.
    let mut run_start: usize = 0;

    for end_pos in memchr_iter(line_delim, data) {
        let line = &data[start..end_pos];
        // Locate the delimiter that ends field `last_field`, if any.
        let mut field_count = 1;
        let mut truncate_at: Option<usize> = None;
        for dpos in memchr_iter(delim, line) {
            if field_count >= last_field {
                truncate_at = Some(start + dpos);
                break;
            }
            field_count += 1;
        }

        if let Some(trunc_pos) = truncate_at {
            // Flush the run of unchanged lines preceding this one, then the
            // truncated prefix plus a terminator.
            if run_start < start {
                iov.push(IoSlice::new(&data[run_start..start]));
            }
            iov.push(IoSlice::new(&data[start..trunc_pos]));
            iov.push(IoSlice::new(&newline_buf));
            run_start = end_pos + 1;

            // After this check len <= MAX_IOV - 3; the next iteration pushes
            // at most 3 entries, so iov never exceeds its MAX_IOV capacity.
            if iov.len() >= MAX_IOV - 2 {
                write_ioslices(out, &iov)?;
                iov.clear();
            }
        }
        start = end_pos + 1;
    }
    // Handle last line without terminator
    if start < data.len() {
        let line = &data[start..];
        let mut field_count = 1;
        let mut truncate_at: Option<usize> = None;
        for dpos in memchr_iter(delim, line) {
            if field_count >= last_field {
                truncate_at = Some(start + dpos);
                break;
            }
            field_count += 1;
        }
        if let Some(trunc_pos) = truncate_at {
            // Truncated final line: flush everything and return; the generic
            // tail handling below is only for an untruncated final run.
            if run_start < start {
                iov.push(IoSlice::new(&data[run_start..start]));
            }
            iov.push(IoSlice::new(&data[start..trunc_pos]));
            iov.push(IoSlice::new(&newline_buf));
            if !iov.is_empty() {
                write_ioslices(out, &iov)?;
            }
            return Ok(());
        }
    }
    // Flush remaining contiguous run
    if run_start < data.len() {
        iov.push(IoSlice::new(&data[run_start..]));
        // Input lacked a final terminator: add one so the last output line
        // is terminated like every other line this module emits.
        if !data.is_empty() && *data.last().unwrap() != line_delim {
            iov.push(IoSlice::new(&newline_buf));
        }
    }
    if !iov.is_empty() {
        write_ioslices(out, &iov)?;
    }
    Ok(())
}
1627
1628/// Process a chunk for contiguous from-start field range extraction.
1629fn fields_prefix_chunk(
1630    data: &[u8],
1631    delim: u8,
1632    line_delim: u8,
1633    last_field: usize,
1634    suppress: bool,
1635    buf: &mut Vec<u8>,
1636) {
1637    buf.reserve(data.len());
1638    let mut start = 0;
1639    for end_pos in memchr_iter(line_delim, data) {
1640        let line = &data[start..end_pos];
1641        fields_prefix_line(line, delim, line_delim, last_field, suppress, buf);
1642        start = end_pos + 1;
1643    }
1644    if start < data.len() {
1645        fields_prefix_line(&data[start..], delim, line_delim, last_field, suppress, buf);
1646    }
1647}
1648
/// Extract first N fields from one line (contiguous from-start range).
/// Uses memchr SIMD for delimiter scanning on all line sizes.
#[inline(always)]
fn fields_prefix_line(
    line: &[u8],
    delim: u8,
    line_delim: u8,
    last_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    let len = line.len();
    if len == 0 {
        // Empty line: terminator only, unless -s drops delimiter-free lines.
        if !suppress {
            unsafe { buf_push(buf, line_delim) };
        }
        return;
    }

    // Note: no per-line buf.reserve — fields_prefix_chunk already reserves data.len()
    let base = line.as_ptr();

    let mut field_count = 1usize;
    let mut has_delim = false;

    for pos in memchr_iter(delim, line) {
        has_delim = true;
        if field_count >= last_field {
            // Found the delimiter that ends field `last_field`: copy the
            // prefix up to (not including) it and stop scanning.
            unsafe {
                buf_extend(buf, std::slice::from_raw_parts(base, pos));
                buf_push(buf, line_delim);
            }
            return;
        }
        field_count += 1;
    }

    if !has_delim {
        // No delimiter on this line: print it whole unless -s is set.
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }

    // Line has delimiters but fewer than last_field fields: the whole line is
    // inside the requested range, so it is output regardless of -s.
    unsafe {
        buf_extend(buf, line);
        buf_push(buf, line_delim);
    }
}
1701
1702/// Open-ended field suffix extraction (e.g., `cut -f3-`).
1703fn process_fields_suffix(
1704    data: &[u8],
1705    delim: u8,
1706    line_delim: u8,
1707    start_field: usize,
1708    suppress: bool,
1709    out: &mut impl Write,
1710) -> io::Result<()> {
1711    if data.len() >= PARALLEL_THRESHOLD {
1712        let chunks = split_for_scope(data, line_delim);
1713        let n = chunks.len();
1714        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1715        rayon::scope(|s| {
1716            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1717                s.spawn(move |_| {
1718                    result.reserve(chunk.len());
1719                    fields_suffix_chunk(chunk, delim, line_delim, start_field, suppress, result);
1720                });
1721            }
1722        });
1723        let slices: Vec<IoSlice> = results
1724            .iter()
1725            .filter(|r| !r.is_empty())
1726            .map(|r| IoSlice::new(r))
1727            .collect();
1728        write_ioslices(out, &slices)?;
1729    } else {
1730        process_chunked(data, line_delim, out, |chunk, buf| {
1731            fields_suffix_chunk(chunk, delim, line_delim, start_field, suppress, buf);
1732        })?;
1733    }
1734    Ok(())
1735}
1736
1737/// Process a chunk for open-ended field suffix extraction.
1738fn fields_suffix_chunk(
1739    data: &[u8],
1740    delim: u8,
1741    line_delim: u8,
1742    start_field: usize,
1743    suppress: bool,
1744    buf: &mut Vec<u8>,
1745) {
1746    buf.reserve(data.len());
1747    let mut start = 0;
1748    for end_pos in memchr_iter(line_delim, data) {
1749        let line = &data[start..end_pos];
1750        fields_suffix_line(line, delim, line_delim, start_field, suppress, buf);
1751        start = end_pos + 1;
1752    }
1753    if start < data.len() {
1754        fields_suffix_line(
1755            &data[start..],
1756            delim,
1757            line_delim,
1758            start_field,
1759            suppress,
1760            buf,
1761        );
1762    }
1763}
1764
/// Extract fields from start_field to end from one line.
/// Uses memchr SIMD for delimiter scanning on all line sizes.
#[inline(always)]
fn fields_suffix_line(
    line: &[u8],
    delim: u8,
    line_delim: u8,
    start_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    let len = line.len();
    if len == 0 {
        // Empty line: terminator only, unless -s drops delimiter-free lines.
        if !suppress {
            unsafe { buf_push(buf, line_delim) };
        }
        return;
    }

    // Note: no per-line buf.reserve — fields_suffix_chunk already reserves data.len()
    let base = line.as_ptr();

    // Skip the first (start_field - 1) delimiters; the suffix begins right
    // after the last skipped one. The dispatch routes here only with
    // start_field > 1, so skip_delims >= 1.
    let skip_delims = start_field - 1;
    let mut delim_count = 0usize;
    let mut has_delim = false;

    for pos in memchr_iter(delim, line) {
        has_delim = true;
        delim_count += 1;
        if delim_count >= skip_delims {
            // Copy everything after this delimiter through end of line.
            unsafe {
                buf_extend(
                    buf,
                    std::slice::from_raw_parts(base.add(pos + 1), len - pos - 1),
                );
                buf_push(buf, line_delim);
            }
            return;
        }
    }

    if !has_delim {
        // No delimiter: line passes through unchanged unless -s is set.
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }

    // Fewer delimiters than needed
    // (every field lies before start_field): emit an empty output line.
    unsafe { buf_push(buf, line_delim) };
}
1819
1820/// Contiguous mid-range field extraction (e.g., `cut -f2-4`).
1821/// Optimized: skip to start_field using memchr, then output until end_field.
1822fn process_fields_mid_range(
1823    data: &[u8],
1824    delim: u8,
1825    line_delim: u8,
1826    start_field: usize,
1827    end_field: usize,
1828    suppress: bool,
1829    out: &mut impl Write,
1830) -> io::Result<()> {
1831    if data.len() >= PARALLEL_THRESHOLD {
1832        let chunks = split_for_scope(data, line_delim);
1833        let n = chunks.len();
1834        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1835        rayon::scope(|s| {
1836            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1837                s.spawn(move |_| {
1838                    result.reserve(chunk.len());
1839                    fields_mid_range_chunk(
1840                        chunk,
1841                        delim,
1842                        line_delim,
1843                        start_field,
1844                        end_field,
1845                        suppress,
1846                        result,
1847                    );
1848                });
1849            }
1850        });
1851        let slices: Vec<IoSlice> = results
1852            .iter()
1853            .filter(|r| !r.is_empty())
1854            .map(|r| IoSlice::new(r))
1855            .collect();
1856        write_ioslices(out, &slices)?;
1857    } else {
1858        process_chunked(data, line_delim, out, |chunk, buf| {
1859            fields_mid_range_chunk(
1860                chunk,
1861                delim,
1862                line_delim,
1863                start_field,
1864                end_field,
1865                suppress,
1866                buf,
1867            );
1868        })?;
1869    }
1870    Ok(())
1871}
1872
1873/// Process a chunk for contiguous mid-range field extraction.
1874/// Two-level scan: outer memchr(newline) for line boundaries, inner memchr_iter(delim)
1875/// with early exit at target_end_delim. Faster than memchr2 single-pass because
1876/// memchr is faster per byte and inner scan exits early.
1877fn fields_mid_range_chunk(
1878    data: &[u8],
1879    delim: u8,
1880    line_delim: u8,
1881    start_field: usize,
1882    end_field: usize,
1883    suppress: bool,
1884    buf: &mut Vec<u8>,
1885) {
1886    buf.reserve(data.len());
1887    let mut start = 0;
1888    for end_pos in memchr_iter(line_delim, data) {
1889        let line = &data[start..end_pos];
1890        fields_mid_range_line(
1891            line,
1892            delim,
1893            line_delim,
1894            start_field,
1895            end_field,
1896            suppress,
1897            buf,
1898        );
1899        start = end_pos + 1;
1900    }
1901    if start < data.len() {
1902        fields_mid_range_line(
1903            &data[start..],
1904            delim,
1905            line_delim,
1906            start_field,
1907            end_field,
1908            suppress,
1909            buf,
1910        );
1911    }
1912}
1913
/// Extract fields start_field..=end_field from one line.
/// Uses scalar byte scanning for short lines, memchr_iter for longer.
/// Raw pointer arithmetic to eliminate bounds checking.
///
/// Output cases (matching GNU `cut -fN-M`):
/// - empty line: just `line_delim`, unless `suppress` (-s)
/// - end_field reached: bytes of fields start_field..=end_field + `line_delim`
/// - no delimiter at all: line passes through unchanged, unless `suppress`
/// - delimiters present but line ends before end_field: from start_field to
///   end of line (+ `line_delim`), or an empty line if even start_field is
///   missing
#[inline(always)]
fn fields_mid_range_line(
    line: &[u8],
    delim: u8,
    line_delim: u8,
    start_field: usize,
    end_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    let len = line.len();
    if len == 0 {
        if !suppress {
            // SAFETY: caller pre-reserved chunk capacity.
            unsafe { buf_push(buf, line_delim) };
        }
        return;
    }

    // Note: no per-line buf.reserve — fields_mid_range_chunk reserves capacity
    // for the whole chunk up front.
    let base = line.as_ptr();

    // Count delimiters to find start_field and end_field boundaries
    let skip_before = start_field - 1; // delimiters to skip before start_field
    let field_span = end_field - start_field; // additional delimiters within the range
    let target_end_delim = skip_before + field_span + 1;
    let mut delim_count = 0;
    let mut range_start = 0;
    let mut has_delim = false;

    for pos in memchr_iter(delim, line) {
        has_delim = true;
        delim_count += 1;
        if delim_count == skip_before {
            // This delimiter ends the last skipped field; range starts after it.
            range_start = pos + 1;
        }
        if delim_count == target_end_delim {
            // start_field == 1 skips nothing, so the range starts at byte 0
            // (the delim_count == skip_before branch never fires then).
            if skip_before == 0 {
                range_start = 0;
            }
            // SAFETY: range_start <= pos < len, so the slice is in bounds;
            // capacity was pre-reserved by the caller.
            unsafe {
                buf_extend(
                    buf,
                    std::slice::from_raw_parts(base.add(range_start), pos - range_start),
                );
                buf_push(buf, line_delim);
            }
            return;
        }
    }

    if !has_delim {
        if !suppress {
            // No delimiter: pass the line through unchanged (GNU default).
            // SAFETY: capacity pre-reserved by the caller.
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }

    // Line has delimiters but fewer fields than end_field
    if delim_count >= skip_before {
        // We have at least start_field, output from range_start to end
        if skip_before == 0 {
            range_start = 0;
        }
        // SAFETY: range_start <= len; capacity pre-reserved by the caller.
        unsafe {
            buf_extend(
                buf,
                std::slice::from_raw_parts(base.add(range_start), len - range_start),
            );
            buf_push(buf, line_delim);
        }
    } else {
        // Not enough fields even for start_field — output empty line
        // SAFETY: one byte; capacity pre-reserved by the caller.
        unsafe { buf_push(buf, line_delim) };
    }
}
1995
1996/// Zero-copy field-1 extraction using writev: builds IoSlice entries pointing
1997/// directly into the source data, flushing in MAX_IOV-sized batches.
1998/// For each line: if delimiter exists, output field1 + newline; otherwise pass through.
1999///
2000/// Uses a two-level scan: outer memchr(newline) for line boundaries, inner memchr(delim)
2001/// Parallel field-1 extraction for large data using memchr2 single-pass.
2002/// Splits data into per-thread chunks, each chunk extracts field 1 using
2003/// memchr2(delim, newline) which finds the first special byte in one scan.
2004/// For field 1: first special byte is either the delimiter (field end) or
2005/// newline (no delimiter, output line unchanged). 4 threads cut scan time ~4x.
2006fn single_field1_parallel(
2007    data: &[u8],
2008    delim: u8,
2009    line_delim: u8,
2010    out: &mut impl Write,
2011) -> io::Result<()> {
2012    let chunks = split_for_scope(data, line_delim);
2013    let n = chunks.len();
2014    let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2015    rayon::scope(|s| {
2016        for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2017            s.spawn(move |_| {
2018                result.reserve(chunk.len() + 1);
2019                single_field1_to_buf(chunk, delim, line_delim, result);
2020            });
2021        }
2022    });
2023    let slices: Vec<IoSlice> = results
2024        .iter()
2025        .filter(|r| !r.is_empty())
2026        .map(|r| IoSlice::new(r))
2027        .collect();
2028    write_ioslices(out, &slices)
2029}
2030
/// Extract field 1 from a chunk using two-level scanning: outer memchr(newline)
/// for line boundaries, inner memchr(delim) for the first delimiter per line.
///
/// This is faster than memchr2_iter single-pass because:
/// 1. memchr (one needle) is ~30-50% faster per byte than memchr2 (two needles)
/// 2. For field 1, the inner memchr exits after the FIRST delimiter, skipping
///    all subsequent delimiters on the line (huge win for multi-column CSV)
/// 3. Lines without delimiter produce contiguous runs that are bulk-copied
///
/// Uses a single output pointer to avoid per-line buf.len() load/store.
#[inline]
fn single_field1_to_buf(data: &[u8], delim: u8, line_delim: u8, buf: &mut Vec<u8>) {
    debug_assert_ne!(delim, line_delim, "delim and line_delim must differ");
    // Reserve data.len() + 1: output <= input for all lines except potentially
    // the last line without trailing newline, where we add a newline (GNU compat).
    buf.reserve(data.len() + 1);

    let base = data.as_ptr();
    let initial_len = buf.len();
    // SAFETY: initial_len <= capacity after the reserve above; every write
    // below advances out_ptr by at most data.len() + 1 bytes in total, so the
    // cursor never leaves the reserved region.
    let mut out_ptr = unsafe { buf.as_mut_ptr().add(initial_len) };
    let mut start = 0;
    // Track the start of contiguous runs of no-delimiter lines for bulk copy.
    let mut run_start: usize = 0;
    let mut in_run = true; // we start in a run

    for end_pos in memchr_iter(line_delim, data) {
        // SAFETY: start <= end_pos <= data.len(), so the slice is in bounds.
        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
        match memchr::memchr(delim, line) {
            Some(dp) => {
                // Line has delimiter — flush contiguous run, output field1 + newline
                if in_run && run_start < start {
                    // Bulk copy the contiguous run of unchanged lines
                    let run_len = start - run_start;
                    // SAFETY: [run_start, start) is in bounds of data; at least
                    // run_len bytes of reserved capacity remain at out_ptr.
                    unsafe {
                        std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
                        out_ptr = out_ptr.add(run_len);
                    }
                }
                // Output field (bytes before first delimiter) + newline
                // SAFETY: dp < line length, so [start, start + dp) is in bounds;
                // dp + 1 output bytes replace dp + 1 or more input bytes.
                unsafe {
                    std::ptr::copy_nonoverlapping(base.add(start), out_ptr, dp);
                    out_ptr = out_ptr.add(dp);
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
                run_start = end_pos + 1;
                in_run = true;
            }
            None => {
                // No delimiter — this line stays in the contiguous run
                // (in_run is never cleared in this function; the branch is kept
                // for symmetry with process_single_field_chunk)
                if !in_run {
                    run_start = start;
                    in_run = true;
                }
            }
        }
        start = end_pos + 1;
    }

    // Flush any remaining contiguous run
    if in_run && run_start < start {
        let run_len = start - run_start;
        // SAFETY: same run-copy invariants as above.
        unsafe {
            std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
            out_ptr = out_ptr.add(run_len);
        }
    }

    // Handle last line without trailing newline
    if start < data.len() {
        // SAFETY: start < data.len(), so the slice is in bounds.
        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
        match memchr::memchr(delim, line) {
            Some(dp) => {
                // Field + trailing newline
                // SAFETY: dp + 1 bytes fit within the remaining capacity.
                unsafe {
                    std::ptr::copy_nonoverlapping(base.add(start), out_ptr, dp);
                    out_ptr = out_ptr.add(dp);
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
            }
            None => {
                // No delimiter — output remaining data + newline (GNU compat)
                let len = data.len() - start;
                // SAFETY: len + 1 bytes fit — this is the one case that needs
                // the extra byte from the `data.len() + 1` reserve above.
                unsafe {
                    std::ptr::copy_nonoverlapping(base.add(start), out_ptr, len);
                    out_ptr = out_ptr.add(len);
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
            }
        }
    }

    // SAFETY: out_ptr never advanced past the reserved capacity, and every
    // byte from initial_len up to new_len has been written above.
    unsafe {
        let new_len = out_ptr as usize - buf.as_ptr() as usize;
        debug_assert!(new_len >= initial_len && new_len <= buf.capacity());
        buf.set_len(new_len);
    }
}
2131
2132/// Zero-copy field 1 extraction using writev: builds IoSlice entries pointing
2133/// directly into the source data. Uses two-level scan: outer memchr(newline)
2134/// for the first delimiter. This is faster than memchr2 for SMALL data because
2135/// the inner scan exits after the FIRST delimiter, skipping all
2136/// subsequent delimiters on the line.
2137///
2138/// Lines without delimiter stay in contiguous runs (zero-copy pass-through).
2139/// Lines with delimiter produce two IoSlices (truncated field + newline byte).
2140#[inline]
2141#[allow(dead_code)]
2142fn single_field1_zerocopy(
2143    data: &[u8],
2144    delim: u8,
2145    line_delim: u8,
2146    out: &mut impl Write,
2147) -> io::Result<()> {
2148    let newline_buf: [u8; 1] = [line_delim];
2149
2150    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
2151    let mut run_start: usize = 0;
2152    let mut start = 0;
2153
2154    for end_pos in memchr_iter(line_delim, data) {
2155        let line = &data[start..end_pos];
2156        if let Some(dp) = memchr::memchr(delim, line) {
2157            // Line has delimiter — truncate at first delimiter.
2158            // Flush current contiguous run, then add truncated field + newline.
2159            if run_start < start {
2160                iov.push(IoSlice::new(&data[run_start..start]));
2161            }
2162            iov.push(IoSlice::new(&data[start..start + dp]));
2163            iov.push(IoSlice::new(&newline_buf));
2164            run_start = end_pos + 1;
2165
2166            if iov.len() >= MAX_IOV - 2 {
2167                write_ioslices(out, &iov)?;
2168                iov.clear();
2169            }
2170        }
2171        // else: no delimiter in line, output unchanged (stays in contiguous run)
2172        start = end_pos + 1;
2173    }
2174
2175    // Handle last line (no trailing newline)
2176    if start < data.len() {
2177        let line = &data[start..];
2178        if let Some(dp) = memchr::memchr(delim, line) {
2179            if run_start < start {
2180                iov.push(IoSlice::new(&data[run_start..start]));
2181            }
2182            iov.push(IoSlice::new(&data[start..start + dp]));
2183            iov.push(IoSlice::new(&newline_buf));
2184            if !iov.is_empty() {
2185                write_ioslices(out, &iov)?;
2186            }
2187            return Ok(());
2188        }
2189    }
2190
2191    // Flush remaining contiguous run
2192    if run_start < data.len() {
2193        iov.push(IoSlice::new(&data[run_start..]));
2194        if !data.is_empty() && *data.last().unwrap() != line_delim {
2195            iov.push(IoSlice::new(&newline_buf));
2196        }
2197    }
2198    if !iov.is_empty() {
2199        write_ioslices(out, &iov)?;
2200    }
2201    Ok(())
2202}
2203
2204/// Process a chunk of data for single-field extraction using write-pointer pattern.
2205/// Two-level scan: outer memchr(newline), inner memchr_iter(delim) with early exit.
2206/// Uses contiguous run tracking for lines that pass through unchanged.
2207fn process_single_field_chunk(
2208    data: &[u8],
2209    delim: u8,
2210    target_idx: usize,
2211    line_delim: u8,
2212    suppress: bool,
2213    buf: &mut Vec<u8>,
2214) {
2215    // Pre-reserve chunk capacity to eliminate per-line reserve overhead.
2216    buf.reserve(data.len() + 1);
2217
2218    let base = data.as_ptr();
2219    let initial_len = buf.len();
2220    let mut out_ptr = unsafe { buf.as_mut_ptr().add(initial_len) };
2221    let mut start = 0;
2222    // Track contiguous runs of lines that output unchanged
2223    let mut run_start: usize = 0;
2224    let mut in_run = !suppress; // if suppress, no line passes through without delimiter
2225
2226    for end_pos in memchr_iter(line_delim, data) {
2227        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
2228        let line_len = end_pos - start;
2229
2230        if line_len == 0 {
2231            if !suppress {
2232                // Empty line passes through in the run
2233                if !in_run {
2234                    run_start = start;
2235                    in_run = true;
2236                }
2237            }
2238            start = end_pos + 1;
2239            continue;
2240        }
2241
2242        // Count delimiters up to target_idx to find the target field
2243        let mut field_start_offset = 0;
2244        let mut field_idx = 0;
2245        let mut found = false;
2246        let mut has_delim = false;
2247
2248        for pos in memchr_iter(delim, line) {
2249            has_delim = true;
2250            if field_idx == target_idx {
2251                // Found the target field: line[field_start_offset..pos]
2252                // Flush run, output field + newline
2253                if in_run && run_start < start {
2254                    let run_len = start - run_start;
2255                    unsafe {
2256                        std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
2257                        out_ptr = out_ptr.add(run_len);
2258                    }
2259                }
2260                let field_len = pos - field_start_offset;
2261                unsafe {
2262                    std::ptr::copy_nonoverlapping(
2263                        base.add(start + field_start_offset),
2264                        out_ptr,
2265                        field_len,
2266                    );
2267                    out_ptr = out_ptr.add(field_len);
2268                    *out_ptr = line_delim;
2269                    out_ptr = out_ptr.add(1);
2270                }
2271                run_start = end_pos + 1;
2272                in_run = true;
2273                found = true;
2274                break;
2275            }
2276            field_idx += 1;
2277            field_start_offset = pos + 1;
2278        }
2279
2280        if !found {
2281            if !has_delim {
2282                // No delimiter in line
2283                if !suppress {
2284                    // Line passes through unchanged — stays in run
2285                    if !in_run {
2286                        run_start = start;
2287                        in_run = true;
2288                    }
2289                } else {
2290                    // Suppress: flush run, skip this line
2291                    if in_run && run_start < start {
2292                        let run_len = start - run_start;
2293                        unsafe {
2294                            std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
2295                            out_ptr = out_ptr.add(run_len);
2296                        }
2297                    }
2298                    in_run = false;
2299                    run_start = end_pos + 1;
2300                }
2301            } else if field_idx == target_idx {
2302                // Last field is the target: line[field_start_offset..]
2303                if in_run && run_start < start {
2304                    let run_len = start - run_start;
2305                    unsafe {
2306                        std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
2307                        out_ptr = out_ptr.add(run_len);
2308                    }
2309                }
2310                let field_len = line_len - field_start_offset;
2311                unsafe {
2312                    std::ptr::copy_nonoverlapping(
2313                        base.add(start + field_start_offset),
2314                        out_ptr,
2315                        field_len,
2316                    );
2317                    out_ptr = out_ptr.add(field_len);
2318                    *out_ptr = line_delim;
2319                    out_ptr = out_ptr.add(1);
2320                }
2321                run_start = end_pos + 1;
2322                in_run = true;
2323            } else {
2324                // Not enough fields for target — output empty line
2325                if in_run && run_start < start {
2326                    let run_len = start - run_start;
2327                    unsafe {
2328                        std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
2329                        out_ptr = out_ptr.add(run_len);
2330                    }
2331                }
2332                unsafe {
2333                    *out_ptr = line_delim;
2334                    out_ptr = out_ptr.add(1);
2335                }
2336                run_start = end_pos + 1;
2337                in_run = true;
2338            }
2339        }
2340
2341        start = end_pos + 1;
2342    }
2343
2344    // Flush remaining contiguous run
2345    if in_run && run_start < start {
2346        let run_len = start - run_start;
2347        unsafe {
2348            std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
2349            out_ptr = out_ptr.add(run_len);
2350        }
2351    }
2352
2353    // Handle last line without trailing newline
2354    if start < data.len() {
2355        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
2356        let line_len = data.len() - start;
2357
2358        if line_len == 0 {
2359            if !suppress {
2360                unsafe {
2361                    *out_ptr = line_delim;
2362                    out_ptr = out_ptr.add(1);
2363                }
2364            }
2365        } else {
2366            let mut field_start_offset = 0;
2367            let mut field_idx = 0;
2368            let mut found = false;
2369            let mut has_delim = false;
2370
2371            for pos in memchr_iter(delim, line) {
2372                has_delim = true;
2373                if field_idx == target_idx {
2374                    let field_len = pos - field_start_offset;
2375                    unsafe {
2376                        std::ptr::copy_nonoverlapping(
2377                            base.add(start + field_start_offset),
2378                            out_ptr,
2379                            field_len,
2380                        );
2381                        out_ptr = out_ptr.add(field_len);
2382                        *out_ptr = line_delim;
2383                        out_ptr = out_ptr.add(1);
2384                    }
2385                    found = true;
2386                    break;
2387                }
2388                field_idx += 1;
2389                field_start_offset = pos + 1;
2390            }
2391
2392            if !found {
2393                if !has_delim {
2394                    if !suppress {
2395                        unsafe {
2396                            std::ptr::copy_nonoverlapping(base.add(start), out_ptr, line_len);
2397                            out_ptr = out_ptr.add(line_len);
2398                            *out_ptr = line_delim;
2399                            out_ptr = out_ptr.add(1);
2400                        }
2401                    }
2402                } else if field_idx == target_idx {
2403                    let field_len = line_len - field_start_offset;
2404                    unsafe {
2405                        std::ptr::copy_nonoverlapping(
2406                            base.add(start + field_start_offset),
2407                            out_ptr,
2408                            field_len,
2409                        );
2410                        out_ptr = out_ptr.add(field_len);
2411                        *out_ptr = line_delim;
2412                        out_ptr = out_ptr.add(1);
2413                    }
2414                } else {
2415                    unsafe {
2416                        *out_ptr = line_delim;
2417                        out_ptr = out_ptr.add(1);
2418                    }
2419                }
2420            }
2421        }
2422    }
2423
2424    unsafe {
2425        let new_len = out_ptr as usize - buf.as_ptr() as usize;
2426        debug_assert!(new_len >= initial_len && new_len <= buf.capacity());
2427        buf.set_len(new_len);
2428    }
2429}
2430
2431/// Extract fields from a single line into the output buffer.
2432/// Uses unsafe buf helpers with pre-reserved capacity for zero bounds-check overhead.
2433/// Raw pointer arithmetic eliminates per-field bounds checking.
2434#[inline(always)]
2435fn extract_fields_to_buf(
2436    line: &[u8],
2437    delim: u8,
2438    ranges: &[Range],
2439    output_delim: &[u8],
2440    suppress: bool,
2441    max_field: usize,
2442    field_mask: u64,
2443    line_delim: u8,
2444    buf: &mut Vec<u8>,
2445    complement: bool,
2446) {
2447    let len = line.len();
2448
2449    if len == 0 {
2450        if !suppress {
2451            buf.push(line_delim);
2452        }
2453        return;
2454    }
2455
2456    // Only reserve if remaining capacity is insufficient. The caller pre-sizes the
2457    // buffer to data.len(), so this check avoids redundant reserve() calls per line.
2458    let needed = len + output_delim.len() * 16 + 1;
2459    if buf.capacity() - buf.len() < needed {
2460        buf.reserve(needed);
2461    }
2462
2463    let base = line.as_ptr();
2464    let mut field_num: usize = 1;
2465    let mut field_start: usize = 0;
2466    let mut first_output = true;
2467    let mut has_delim = false;
2468
2469    // Use memchr SIMD for all line sizes
2470    for delim_pos in memchr_iter(delim, line) {
2471        has_delim = true;
2472
2473        if is_selected(field_num, field_mask, ranges, complement) {
2474            if !first_output {
2475                unsafe { buf_extend(buf, output_delim) };
2476            }
2477            unsafe {
2478                buf_extend(
2479                    buf,
2480                    std::slice::from_raw_parts(base.add(field_start), delim_pos - field_start),
2481                )
2482            };
2483            first_output = false;
2484        }
2485
2486        field_num += 1;
2487        field_start = delim_pos + 1;
2488
2489        if field_num > max_field {
2490            break;
2491        }
2492    }
2493
2494    // Last field
2495    if (field_num <= max_field || complement)
2496        && has_delim
2497        && is_selected(field_num, field_mask, ranges, complement)
2498    {
2499        if !first_output {
2500            unsafe { buf_extend(buf, output_delim) };
2501        }
2502        unsafe {
2503            buf_extend(
2504                buf,
2505                std::slice::from_raw_parts(base.add(field_start), len - field_start),
2506            )
2507        };
2508        first_output = false;
2509    }
2510
2511    if !first_output {
2512        unsafe { buf_push(buf, line_delim) };
2513    } else if !has_delim {
2514        if !suppress {
2515            unsafe {
2516                buf_extend(buf, line);
2517                buf_push(buf, line_delim);
2518            }
2519        }
2520    } else {
2521        unsafe { buf_push(buf, line_delim) };
2522    }
2523}
2524
2525// ── Fast path: byte/char extraction with batched output ──────────────────
2526
2527/// Ultra-fast path for `cut -b1-N`: single from-start byte range.
2528/// Zero-copy: writes directly from the source data using output runs.
2529/// For lines shorter than max_bytes, the output is identical to the input,
2530/// so we emit contiguous runs directly. Only lines exceeding max_bytes need truncation.
2531fn process_bytes_from_start(
2532    data: &[u8],
2533    max_bytes: usize,
2534    line_delim: u8,
2535    out: &mut impl Write,
2536) -> io::Result<()> {
2537    // For small data (< PARALLEL_THRESHOLD): check if all lines fit for zero-copy passthrough.
2538    // The sequential scan + write_all is competitive with per-line processing for small data.
2539    //
2540    // For large data (>= PARALLEL_THRESHOLD): skip the all_fit scan entirely.
2541    // The scan is sequential (~1.7ms for 10MB at memchr speed) while parallel per-line
2542    // processing is much faster (~0.5ms for 10MB with 4 threads). Even when all lines fit,
2543    // the parallel copy + write is faster than sequential scan + zero-copy write.
2544    if data.len() < PARALLEL_THRESHOLD && max_bytes > 0 && max_bytes < usize::MAX {
2545        let mut start = 0;
2546        let mut all_fit = true;
2547        for pos in memchr_iter(line_delim, data) {
2548            if pos - start > max_bytes {
2549                all_fit = false;
2550                break;
2551            }
2552            start = pos + 1;
2553        }
2554        // Check last line (no trailing delimiter)
2555        if all_fit && start < data.len() && data.len() - start > max_bytes {
2556            all_fit = false;
2557        }
2558        if all_fit {
2559            // All lines fit: output = input. Handle missing trailing delimiter.
2560            if !data.is_empty() && data[data.len() - 1] == line_delim {
2561                return out.write_all(data);
2562            } else if !data.is_empty() {
2563                out.write_all(data)?;
2564                return out.write_all(&[line_delim]);
2565            }
2566            return Ok(());
2567        }
2568    }
2569
2570    if data.len() >= PARALLEL_THRESHOLD {
2571        let chunks = split_for_scope(data, line_delim);
2572        let n = chunks.len();
2573        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2574        rayon::scope(|s| {
2575            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2576                s.spawn(move |_| {
2577                    // Output can be up to input size (when all lines fit).
2578                    // Reserve full chunk size to avoid reallocation.
2579                    result.reserve(chunk.len());
2580                    bytes_from_start_chunk(chunk, max_bytes, line_delim, result);
2581                });
2582            }
2583        });
2584        // Use write_vectored (writev) to batch N writes into fewer syscalls
2585        let slices: Vec<IoSlice> = results
2586            .iter()
2587            .filter(|r| !r.is_empty())
2588            .map(|r| IoSlice::new(r))
2589            .collect();
2590        write_ioslices(out, &slices)?;
2591    } else {
2592        // For moderate max_bytes, the buffer path is faster than writev zero-copy
2593        // because every line gets truncated, creating 3 IoSlice entries per line.
2594        // Copying max_bytes+1 bytes into a contiguous buffer is cheaper than
2595        // managing millions of IoSlice entries through the kernel.
2596        // Threshold at 512 covers common byte-range benchmarks like -b1-100.
2597        if max_bytes <= 512 {
2598            // Estimate output size without scanning: output <= data.len(),
2599            // typically ~data.len()/4 for short max_bytes on longer lines.
2600            let est_out = (data.len() / 4).max(max_bytes + 2);
2601            let mut buf = Vec::with_capacity(est_out.min(data.len()));
2602            bytes_from_start_chunk(data, max_bytes, line_delim, &mut buf);
2603            if !buf.is_empty() {
2604                out.write_all(&buf)?;
2605            }
2606        } else {
2607            // Zero-copy path: track contiguous output runs and write directly from source.
2608            // For lines <= max_bytes, we include them as-is (no copy needed).
2609            // For lines > max_bytes, we flush the run, write the truncated line, start new run.
2610            bytes_from_start_zerocopy(data, max_bytes, line_delim, out)?;
2611        }
2612    }
2613    Ok(())
2614}
2615
/// Zero-copy byte-prefix extraction using writev: builds IoSlice entries pointing
/// directly into the source data, flushing in MAX_IOV-sized batches.
/// Lines shorter than max_bytes stay in contiguous runs. Lines needing truncation
/// produce two IoSlices (truncated data + newline).
///
/// Output contract: each line contributes at most `max_bytes` bytes followed by
/// `line_delim`; the final line gets a terminator even if the input lacked one.
#[inline]
fn bytes_from_start_zerocopy(
    data: &[u8],
    max_bytes: usize,
    line_delim: u8,
    out: &mut impl Write,
) -> io::Result<()> {
    // One-byte backing store so delimiter IoSlices can borrow from the stack.
    let newline_buf: [u8; 1] = [line_delim];
    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
    // `start`: beginning of the current line. `run_start`: beginning of the
    // contiguous span of source bytes (whole untruncated lines, including
    // their delimiters) not yet queued for output.
    let mut start = 0;
    let mut run_start: usize = 0;

    for pos in memchr_iter(line_delim, data) {
        let line_len = pos - start;
        if line_len > max_bytes {
            // This line needs truncation
            if run_start < start {
                // Queue the pending run of untouched lines before this one.
                iov.push(IoSlice::new(&data[run_start..start]));
            }
            iov.push(IoSlice::new(&data[start..start + max_bytes]));
            iov.push(IoSlice::new(&newline_buf));
            // Resume the run just past this line's delimiter.
            run_start = pos + 1;

            // Flush before the next iteration could push past MAX_IOV entries.
            if iov.len() >= MAX_IOV - 2 {
                write_ioslices(out, &iov)?;
                iov.clear();
            }
        }
        start = pos + 1;
    }
    // Handle last line without terminator
    if start < data.len() {
        let line_len = data.len() - start;
        if line_len > max_bytes {
            if run_start < start {
                iov.push(IoSlice::new(&data[run_start..start]));
            }
            iov.push(IoSlice::new(&data[start..start + max_bytes]));
            // Truncated final line still gets a terminator appended.
            iov.push(IoSlice::new(&newline_buf));
            if !iov.is_empty() {
                write_ioslices(out, &iov)?;
            }
            return Ok(());
        }
        // Untruncated final line is covered by the run flush below.
    }
    // Flush remaining contiguous run
    if run_start < data.len() {
        iov.push(IoSlice::new(&data[run_start..]));
        // Add the terminator when the input did not end with one.
        if !data.is_empty() && *data.last().unwrap() != line_delim {
            iov.push(IoSlice::new(&newline_buf));
        }
    }
    if !iov.is_empty() {
        write_ioslices(out, &iov)?;
    }
    Ok(())
}
2677
2678/// Process a chunk for from-start byte range extraction (parallel path).
2679/// Uses unsafe appends to eliminate bounds checking in the hot loop.
2680/// Pre-reserves data.len() (output never exceeds input), then uses a single
2681/// write pointer with deferred set_len — no per-line capacity checks.
2682#[inline]
2683fn bytes_from_start_chunk(data: &[u8], max_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
2684    // Output is always <= input size (we only truncate, never expand).
2685    // Single reserve eliminates ALL per-line capacity checks.
2686    buf.reserve(data.len());
2687
2688    let src = data.as_ptr();
2689    let dst_base = buf.as_mut_ptr();
2690    let mut wp = buf.len();
2691    let mut start = 0;
2692
2693    for pos in memchr_iter(line_delim, data) {
2694        let line_len = pos - start;
2695        let take = line_len.min(max_bytes);
2696        unsafe {
2697            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
2698            *dst_base.add(wp + take) = line_delim;
2699        }
2700        wp += take + 1;
2701        start = pos + 1;
2702    }
2703    // Handle last line without terminator
2704    if start < data.len() {
2705        let line_len = data.len() - start;
2706        let take = line_len.min(max_bytes);
2707        unsafe {
2708            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
2709            *dst_base.add(wp + take) = line_delim;
2710        }
2711        wp += take + 1;
2712    }
2713    unsafe { buf.set_len(wp) };
2714}
2715
2716/// Fast path for `cut -bN-`: skip first N-1 bytes per line.
2717fn process_bytes_from_offset(
2718    data: &[u8],
2719    skip_bytes: usize,
2720    line_delim: u8,
2721    out: &mut impl Write,
2722) -> io::Result<()> {
2723    if data.len() >= PARALLEL_THRESHOLD {
2724        let chunks = split_for_scope(data, line_delim);
2725        let n = chunks.len();
2726        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2727        rayon::scope(|s| {
2728            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2729                s.spawn(move |_| {
2730                    result.reserve(chunk.len());
2731                    bytes_from_offset_chunk(chunk, skip_bytes, line_delim, result);
2732                });
2733            }
2734        });
2735        // Use write_vectored (writev) to batch N writes into fewer syscalls
2736        let slices: Vec<IoSlice> = results
2737            .iter()
2738            .filter(|r| !r.is_empty())
2739            .map(|r| IoSlice::new(r))
2740            .collect();
2741        write_ioslices(out, &slices)?;
2742    } else {
2743        // Zero-copy: write suffix of each line directly from source
2744        bytes_from_offset_zerocopy(data, skip_bytes, line_delim, out)?;
2745    }
2746    Ok(())
2747}
2748
2749/// Zero-copy byte-offset extraction: writes suffix of each line directly from source data.
2750/// Collects IoSlice pairs (data + delimiter) and flushes with write_vectored in batches,
2751/// reducing syscall overhead from 2 write_all calls per line to batched writev.
2752#[inline]
2753fn bytes_from_offset_zerocopy(
2754    data: &[u8],
2755    skip_bytes: usize,
2756    line_delim: u8,
2757    out: &mut impl Write,
2758) -> io::Result<()> {
2759    let delim_buf = [line_delim];
2760    let mut iov: Vec<IoSlice> = Vec::with_capacity(256);
2761
2762    let mut start = 0;
2763    for pos in memchr_iter(line_delim, data) {
2764        let line_len = pos - start;
2765        if line_len > skip_bytes {
2766            iov.push(IoSlice::new(&data[start + skip_bytes..pos]));
2767        }
2768        iov.push(IoSlice::new(&delim_buf));
2769        // Flush when approaching MAX_IOV to avoid oversized writev
2770        if iov.len() >= MAX_IOV - 1 {
2771            write_ioslices(out, &iov)?;
2772            iov.clear();
2773        }
2774        start = pos + 1;
2775    }
2776    if start < data.len() {
2777        let line_len = data.len() - start;
2778        if line_len > skip_bytes {
2779            iov.push(IoSlice::new(&data[start + skip_bytes..data.len()]));
2780        }
2781        iov.push(IoSlice::new(&delim_buf));
2782    }
2783    if !iov.is_empty() {
2784        write_ioslices(out, &iov)?;
2785    }
2786    Ok(())
2787}
2788
2789/// Process a chunk for from-offset byte range extraction.
2790/// Single reserve + deferred set_len for zero per-line overhead.
2791#[inline]
2792fn bytes_from_offset_chunk(data: &[u8], skip_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
2793    buf.reserve(data.len());
2794
2795    let src = data.as_ptr();
2796    let dst_base = buf.as_mut_ptr();
2797    let mut wp = buf.len();
2798    let mut start = 0;
2799
2800    for pos in memchr_iter(line_delim, data) {
2801        let line_len = pos - start;
2802        if line_len > skip_bytes {
2803            let take = line_len - skip_bytes;
2804            unsafe {
2805                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
2806            }
2807            wp += take;
2808        }
2809        unsafe {
2810            *dst_base.add(wp) = line_delim;
2811        }
2812        wp += 1;
2813        start = pos + 1;
2814    }
2815    if start < data.len() {
2816        let line_len = data.len() - start;
2817        if line_len > skip_bytes {
2818            let take = line_len - skip_bytes;
2819            unsafe {
2820                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
2821            }
2822            wp += take;
2823        }
2824        unsafe {
2825            *dst_base.add(wp) = line_delim;
2826        }
2827        wp += 1;
2828    }
2829    unsafe { buf.set_len(wp) };
2830}
2831
2832/// Fast path for `cut -bN-M` where N > 1 and M < MAX: extract bytes N through M per line.
2833fn process_bytes_mid_range(
2834    data: &[u8],
2835    start_byte: usize,
2836    end_byte: usize,
2837    line_delim: u8,
2838    out: &mut impl Write,
2839) -> io::Result<()> {
2840    let skip = start_byte.saturating_sub(1);
2841
2842    if data.len() >= PARALLEL_THRESHOLD {
2843        let chunks = split_for_scope(data, line_delim);
2844        let n = chunks.len();
2845        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2846        rayon::scope(|s| {
2847            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2848                s.spawn(move |_| {
2849                    result.reserve(chunk.len());
2850                    bytes_mid_range_chunk(chunk, skip, end_byte, line_delim, result);
2851                });
2852            }
2853        });
2854        let slices: Vec<IoSlice> = results
2855            .iter()
2856            .filter(|r| !r.is_empty())
2857            .map(|r| IoSlice::new(r))
2858            .collect();
2859        write_ioslices(out, &slices)?;
2860    } else {
2861        process_chunked(data, line_delim, out, |chunk, buf| {
2862            bytes_mid_range_chunk(chunk, skip, end_byte, line_delim, buf);
2863        })?;
2864    }
2865    Ok(())
2866}
2867
2868/// Process a chunk for mid-range byte extraction.
2869/// For each line, output bytes skip..min(line_len, end_byte).
2870/// Single reserve + deferred set_len.
2871#[inline]
2872fn bytes_mid_range_chunk(
2873    data: &[u8],
2874    skip: usize,
2875    end_byte: usize,
2876    line_delim: u8,
2877    buf: &mut Vec<u8>,
2878) {
2879    buf.reserve(data.len());
2880
2881    let src = data.as_ptr();
2882    let dst_base = buf.as_mut_ptr();
2883    let mut wp = buf.len();
2884    let mut start = 0;
2885
2886    for pos in memchr_iter(line_delim, data) {
2887        let line_len = pos - start;
2888        if line_len > skip {
2889            let take_end = line_len.min(end_byte);
2890            let take = take_end - skip;
2891            unsafe {
2892                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
2893            }
2894            wp += take;
2895        }
2896        unsafe {
2897            *dst_base.add(wp) = line_delim;
2898        }
2899        wp += 1;
2900        start = pos + 1;
2901    }
2902    if start < data.len() {
2903        let line_len = data.len() - start;
2904        if line_len > skip {
2905            let take_end = line_len.min(end_byte);
2906            let take = take_end - skip;
2907            unsafe {
2908                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
2909            }
2910            wp += take;
2911        }
2912        unsafe {
2913            *dst_base.add(wp) = line_delim;
2914        }
2915        wp += 1;
2916    }
2917    unsafe { buf.set_len(wp) };
2918}
2919
2920/// Fast path for `--complement -bN-M`: output bytes 1..N-1 and M+1..end per line.
2921fn process_bytes_complement_mid(
2922    data: &[u8],
2923    skip_start: usize,
2924    skip_end: usize,
2925    line_delim: u8,
2926    out: &mut impl Write,
2927) -> io::Result<()> {
2928    let prefix_bytes = skip_start - 1; // bytes before the skip region
2929    if data.len() >= PARALLEL_THRESHOLD {
2930        let chunks = split_for_scope(data, line_delim);
2931        let n = chunks.len();
2932        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2933        rayon::scope(|s| {
2934            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2935                s.spawn(move |_| {
2936                    result.reserve(chunk.len());
2937                    bytes_complement_mid_chunk(chunk, prefix_bytes, skip_end, line_delim, result);
2938                });
2939            }
2940        });
2941        let slices: Vec<IoSlice> = results
2942            .iter()
2943            .filter(|r| !r.is_empty())
2944            .map(|r| IoSlice::new(r))
2945            .collect();
2946        write_ioslices(out, &slices)?;
2947    } else {
2948        process_chunked(data, line_delim, out, |chunk, buf| {
2949            bytes_complement_mid_chunk(chunk, prefix_bytes, skip_end, line_delim, buf);
2950        })?;
2951    }
2952    Ok(())
2953}
2954
2955/// Process a chunk for complement mid-range byte extraction.
2956/// For each line: output bytes 0..prefix_bytes, then bytes skip_end..line_len.
2957#[inline]
2958fn bytes_complement_mid_chunk(
2959    data: &[u8],
2960    prefix_bytes: usize,
2961    skip_end: usize,
2962    line_delim: u8,
2963    buf: &mut Vec<u8>,
2964) {
2965    buf.reserve(data.len());
2966
2967    let src = data.as_ptr();
2968    let dst_base = buf.as_mut_ptr();
2969    let mut wp = buf.len();
2970    let mut start = 0;
2971
2972    for pos in memchr_iter(line_delim, data) {
2973        let line_len = pos - start;
2974        // Copy prefix (bytes before skip region)
2975        let take_prefix = prefix_bytes.min(line_len);
2976        if take_prefix > 0 {
2977            unsafe {
2978                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
2979            }
2980            wp += take_prefix;
2981        }
2982        // Copy suffix (bytes after skip region)
2983        if line_len > skip_end {
2984            let suffix_len = line_len - skip_end;
2985            unsafe {
2986                std::ptr::copy_nonoverlapping(
2987                    src.add(start + skip_end),
2988                    dst_base.add(wp),
2989                    suffix_len,
2990                );
2991            }
2992            wp += suffix_len;
2993        }
2994        unsafe {
2995            *dst_base.add(wp) = line_delim;
2996        }
2997        wp += 1;
2998        start = pos + 1;
2999    }
3000    if start < data.len() {
3001        let line_len = data.len() - start;
3002        let take_prefix = prefix_bytes.min(line_len);
3003        if take_prefix > 0 {
3004            unsafe {
3005                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
3006            }
3007            wp += take_prefix;
3008        }
3009        if line_len > skip_end {
3010            let suffix_len = line_len - skip_end;
3011            unsafe {
3012                std::ptr::copy_nonoverlapping(
3013                    src.add(start + skip_end),
3014                    dst_base.add(wp),
3015                    suffix_len,
3016                );
3017            }
3018            wp += suffix_len;
3019        }
3020        unsafe {
3021            *dst_base.add(wp) = line_delim;
3022        }
3023        wp += 1;
3024    }
3025    unsafe { buf.set_len(wp) };
3026}
3027
3028/// Optimized byte/char extraction with batched output and parallel processing.
3029fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
3030    let line_delim = cfg.line_delim;
3031    let ranges = cfg.ranges;
3032    let complement = cfg.complement;
3033    let output_delim = cfg.output_delim;
3034
3035    // Ultra-fast path: single range from byte 1 (e.g., cut -b1-10, cut -b-20)
3036    if !complement && ranges.len() == 1 && ranges[0].start == 1 && output_delim.is_empty() {
3037        let max_bytes = ranges[0].end;
3038        if max_bytes < usize::MAX {
3039            return process_bytes_from_start(data, max_bytes, line_delim, out);
3040        }
3041    }
3042
3043    // Fast path: single open-ended range from byte N (e.g., cut -b5-)
3044    if !complement && ranges.len() == 1 && ranges[0].end == usize::MAX && output_delim.is_empty() {
3045        let skip_bytes = ranges[0].start.saturating_sub(1);
3046        if skip_bytes > 0 {
3047            return process_bytes_from_offset(data, skip_bytes, line_delim, out);
3048        }
3049    }
3050
3051    // Fast path: single mid-range (e.g., cut -b5-100)
3052    if !complement
3053        && ranges.len() == 1
3054        && ranges[0].start > 1
3055        && ranges[0].end < usize::MAX
3056        && output_delim.is_empty()
3057    {
3058        return process_bytes_mid_range(data, ranges[0].start, ranges[0].end, line_delim, out);
3059    }
3060
3061    // Fast path: complement of single from-start range (e.g., --complement -b1-100 = output bytes 101+)
3062    if complement
3063        && ranges.len() == 1
3064        && ranges[0].start == 1
3065        && ranges[0].end < usize::MAX
3066        && output_delim.is_empty()
3067    {
3068        return process_bytes_from_offset(data, ranges[0].end, line_delim, out);
3069    }
3070
3071    // Fast path: complement of single from-offset range (e.g., --complement -b5- = output bytes 1-4)
3072    if complement
3073        && ranges.len() == 1
3074        && ranges[0].end == usize::MAX
3075        && ranges[0].start > 1
3076        && output_delim.is_empty()
3077    {
3078        let max_bytes = ranges[0].start - 1;
3079        return process_bytes_from_start(data, max_bytes, line_delim, out);
3080    }
3081
3082    // Fast path: complement of single mid-range (e.g., --complement -b5-100 = bytes 1-4,101+)
3083    if complement
3084        && ranges.len() == 1
3085        && ranges[0].start > 1
3086        && ranges[0].end < usize::MAX
3087        && output_delim.is_empty()
3088    {
3089        return process_bytes_complement_mid(data, ranges[0].start, ranges[0].end, line_delim, out);
3090    }
3091
3092    if data.len() >= PARALLEL_THRESHOLD {
3093        let chunks = split_for_scope(data, line_delim);
3094        let n = chunks.len();
3095        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
3096        rayon::scope(|s| {
3097            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
3098                s.spawn(move |_| {
3099                    result.reserve(chunk.len() + 1);
3100                    process_bytes_chunk(
3101                        chunk,
3102                        ranges,
3103                        complement,
3104                        output_delim,
3105                        line_delim,
3106                        result,
3107                    );
3108                });
3109            }
3110        });
3111        let slices: Vec<IoSlice> = results
3112            .iter()
3113            .filter(|r| !r.is_empty())
3114            .map(|r| IoSlice::new(r))
3115            .collect();
3116        write_ioslices(out, &slices)?;
3117    } else {
3118        process_chunked(data, line_delim, out, |chunk, buf| {
3119            process_bytes_chunk(chunk, ranges, complement, output_delim, line_delim, buf);
3120        })?;
3121    }
3122    Ok(())
3123}
3124
/// Process a chunk of data for byte/char extraction.
/// Splits the chunk on `line_delim` via memchr and runs `cut_bytes_to_buf` on
/// each line, appending a delimiter after every line (including a final line
/// that had no terminator).
fn process_bytes_chunk(
    data: &[u8],
    ranges: &[Range],
    complement: bool,
    output_delim: &[u8],
    line_delim: u8,
    buf: &mut Vec<u8>,
) {
    // Rough pre-size; cut_bytes_to_buf re-reserves per line, including one
    // spare byte that keeps the unchecked buf_push below in bounds.
    buf.reserve(data.len());
    let base = data.as_ptr();
    let mut start = 0;
    for end_pos in memchr_iter(line_delim, data) {
        // SAFETY: start..end_pos lies within data (end_pos comes from memchr).
        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
        // SAFETY: cut_bytes_to_buf reserved at least one spare byte (its
        // `needed` includes + 1) for this delimiter.
        unsafe { buf_push(buf, line_delim) };
        start = end_pos + 1;
    }
    if start < data.len() {
        // Final line without a terminator; emit it with a delimiter appended.
        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
        unsafe { buf_push(buf, line_delim) };
    }
}
3152
3153/// Extract byte ranges from a line into the output buffer.
3154/// Uses unsafe buf helpers for zero bounds-check overhead in hot loops.
3155/// Raw pointer arithmetic eliminates per-range bounds checking.
3156#[inline(always)]
3157fn cut_bytes_to_buf(
3158    line: &[u8],
3159    ranges: &[Range],
3160    complement: bool,
3161    output_delim: &[u8],
3162    buf: &mut Vec<u8>,
3163) {
3164    let len = line.len();
3165    let base = line.as_ptr();
3166    let mut first_range = true;
3167
3168    // Reserve worst case: full line + delimiters between ranges
3169    let needed = len + output_delim.len() * ranges.len() + 1;
3170    if buf.capacity() - buf.len() < needed {
3171        buf.reserve(needed);
3172    }
3173
3174    if complement {
3175        let mut pos: usize = 1;
3176        for r in ranges {
3177            let rs = r.start;
3178            let re = r.end.min(len);
3179            if pos < rs {
3180                if !first_range && !output_delim.is_empty() {
3181                    unsafe { buf_extend(buf, output_delim) };
3182                }
3183                unsafe { buf_extend(buf, std::slice::from_raw_parts(base.add(pos - 1), rs - pos)) };
3184                first_range = false;
3185            }
3186            pos = re + 1;
3187            if pos > len {
3188                break;
3189            }
3190        }
3191        if pos <= len {
3192            if !first_range && !output_delim.is_empty() {
3193                unsafe { buf_extend(buf, output_delim) };
3194            }
3195            unsafe {
3196                buf_extend(
3197                    buf,
3198                    std::slice::from_raw_parts(base.add(pos - 1), len - pos + 1),
3199                )
3200            };
3201        }
3202    } else if output_delim.is_empty() && ranges.len() == 1 {
3203        // Ultra-fast path: single range, no output delimiter
3204        let start = ranges[0].start.saturating_sub(1);
3205        let end = ranges[0].end.min(len);
3206        if start < len {
3207            unsafe {
3208                buf_extend(
3209                    buf,
3210                    std::slice::from_raw_parts(base.add(start), end - start),
3211                )
3212            };
3213        }
3214    } else {
3215        for r in ranges {
3216            let start = r.start.saturating_sub(1);
3217            let end = r.end.min(len);
3218            if start >= len {
3219                break;
3220            }
3221            if !first_range && !output_delim.is_empty() {
3222                unsafe { buf_extend(buf, output_delim) };
3223            }
3224            unsafe {
3225                buf_extend(
3226                    buf,
3227                    std::slice::from_raw_parts(base.add(start), end - start),
3228                )
3229            };
3230            first_range = false;
3231        }
3232    }
3233}
3234
3235// ── Public API ───────────────────────────────────────────────────────────
3236
3237/// Cut fields from a line using a delimiter. Writes to `out`.
3238#[inline]
3239pub fn cut_fields(
3240    line: &[u8],
3241    delim: u8,
3242    ranges: &[Range],
3243    complement: bool,
3244    output_delim: &[u8],
3245    suppress_no_delim: bool,
3246    out: &mut impl Write,
3247) -> io::Result<bool> {
3248    if memchr::memchr(delim, line).is_none() {
3249        if !suppress_no_delim {
3250            out.write_all(line)?;
3251            return Ok(true);
3252        }
3253        return Ok(false);
3254    }
3255
3256    let mut field_num: usize = 1;
3257    let mut field_start: usize = 0;
3258    let mut first_output = true;
3259
3260    for delim_pos in memchr_iter(delim, line) {
3261        let selected = in_ranges(ranges, field_num) != complement;
3262        if selected {
3263            if !first_output {
3264                out.write_all(output_delim)?;
3265            }
3266            out.write_all(&line[field_start..delim_pos])?;
3267            first_output = false;
3268        }
3269        field_start = delim_pos + 1;
3270        field_num += 1;
3271    }
3272
3273    let selected = in_ranges(ranges, field_num) != complement;
3274    if selected {
3275        if !first_output {
3276            out.write_all(output_delim)?;
3277        }
3278        out.write_all(&line[field_start..])?;
3279    }
3280
3281    Ok(true)
3282}
3283
3284/// Cut bytes/chars from a line. Writes selected bytes to `out`.
3285#[inline]
3286pub fn cut_bytes(
3287    line: &[u8],
3288    ranges: &[Range],
3289    complement: bool,
3290    output_delim: &[u8],
3291    out: &mut impl Write,
3292) -> io::Result<bool> {
3293    let mut first_range = true;
3294
3295    if complement {
3296        let len = line.len();
3297        let mut comp_ranges = Vec::new();
3298        let mut pos: usize = 1;
3299        for r in ranges {
3300            let rs = r.start;
3301            let re = r.end.min(len);
3302            if pos < rs {
3303                comp_ranges.push((pos, rs - 1));
3304            }
3305            pos = re + 1;
3306            if pos > len {
3307                break;
3308            }
3309        }
3310        if pos <= len {
3311            comp_ranges.push((pos, len));
3312        }
3313        for &(s, e) in &comp_ranges {
3314            if !first_range && !output_delim.is_empty() {
3315                out.write_all(output_delim)?;
3316            }
3317            out.write_all(&line[s - 1..e])?;
3318            first_range = false;
3319        }
3320    } else {
3321        for r in ranges {
3322            let start = r.start.saturating_sub(1);
3323            let end = r.end.min(line.len());
3324            if start >= line.len() {
3325                break;
3326            }
3327            if !first_range && !output_delim.is_empty() {
3328                out.write_all(output_delim)?;
3329            }
3330            out.write_all(&line[start..end])?;
3331            first_range = false;
3332        }
3333    }
3334    Ok(true)
3335}
3336
/// In-place field 1 extraction: modifies `data` buffer directly, returns new length.
/// Output is always <= input (we remove everything after first delimiter per line).
/// Avoids intermediate Vec allocation + BufWriter copy, saving ~10MB of memory
/// bandwidth for 10MB input. Requires owned mutable data (not mmap).
///
/// Lines without delimiter pass through unchanged (unless suppress=true).
/// Lines with delimiter: keep bytes before delimiter + newline.
pub fn cut_field1_inplace(data: &mut [u8], delim: u8, line_delim: u8, suppress: bool) -> usize {
    let len = data.len();
    // Two-pointer compaction: `rp` reads ahead, `wp` trails writing the kept
    // bytes. Invariant: wp <= rp at every step, so every write lands on bytes
    // already consumed by the read side.
    let mut wp: usize = 0;
    let mut rp: usize = 0;

    while rp < len {
        // Find whichever comes first on this line: field delimiter or EOL.
        match memchr::memchr2(delim, line_delim, &data[rp..]) {
            None => {
                // Rest is partial line, no delimiter
                if suppress {
                    // suppress: skip lines without delimiter
                    break;
                }
                let remaining = len - rp;
                // wp == rp means the data is already in place; skip the copy.
                if wp != rp {
                    data.copy_within(rp..len, wp);
                }
                wp += remaining;
                break;
            }
            Some(offset) => {
                let actual = rp + offset;
                if data[actual] == line_delim {
                    // No delimiter on this line
                    if suppress {
                        // Skip this line entirely
                        rp = actual + 1;
                    } else {
                        // Output entire line including newline
                        let chunk_len = actual + 1 - rp;
                        if wp != rp {
                            data.copy_within(rp..actual + 1, wp);
                        }
                        wp += chunk_len;
                        rp = actual + 1;
                    }
                } else {
                    // Delimiter found: output field 1 (up to delimiter) + newline
                    let field_len = actual - rp;
                    if wp != rp && field_len > 0 {
                        data.copy_within(rp..actual, wp);
                    }
                    wp += field_len;
                    // In bounds: wp <= rp + field_len = actual < len.
                    data[wp] = line_delim;
                    wp += 1;
                    // Skip to next newline
                    match memchr::memchr(line_delim, &data[actual + 1..]) {
                        None => {
                            // Delimiter line ran to EOF without a newline.
                            rp = len;
                        }
                        Some(nl_off) => {
                            rp = actual + 1 + nl_off + 1;
                        }
                    }
                }
            }
        }
    }
    wp
}
3404
3405/// Process a full data buffer (from mmap or read) with cut operation.
3406pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
3407    match cfg.mode {
3408        CutMode::Fields => process_fields_fast(data, cfg, out),
3409        CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
3410    }
3411}
3412
/// Process input from a reader (for stdin).
/// Uses batch reading: reads large chunks (16MB), then processes them in batch
/// using the fast mmap-based paths, avoiding per-line read_until syscall overhead.
/// 16MB chunks mean a 10MB piped input is consumed in a single batch.
pub fn process_cut_reader<R: BufRead>(
    mut reader: R,
    cfg: &CutConfig,
    out: &mut impl Write,
) -> io::Result<()> {
    const CHUNK_SIZE: usize = 16 * 1024 * 1024; // 16MB read chunks
    let line_delim = cfg.line_delim;

    // Read large chunks and process in batch.
    // We keep a buffer; after processing complete lines, we shift leftover to the front.
    let mut buf = Vec::with_capacity(CHUNK_SIZE + 4096);

    loop {
        // Read up to CHUNK_SIZE bytes
        buf.reserve(CHUNK_SIZE);
        let read_start = buf.len();
        // NOTE(review): set_len exposes uninitialized bytes to `reader.read`;
        // sound only if R never inspects the buffer it is handed (true for std
        // readers, not guaranteed for arbitrary R) — consider read_buf /
        // MaybeUninit if R can be user-supplied.
        unsafe { buf.set_len(read_start + CHUNK_SIZE) };
        let n = read_fully(&mut reader, &mut buf[read_start..])?;
        // Drop the tail the reader never filled.
        buf.truncate(read_start + n);

        if buf.is_empty() {
            break;
        }

        if n == 0 {
            // EOF with leftover data (last line without terminator)
            process_cut_data(&buf, cfg, out)?;
            break;
        }

        // Find the last line delimiter in the buffer so we process complete lines
        let process_end = match memchr::memrchr(line_delim, &buf) {
            Some(pos) => pos + 1,
            None => {
                // No line delimiter found — keep accumulating
                continue;
            }
        };

        // Process the complete lines using the fast batch path
        process_cut_data(&buf[..process_end], cfg, out)?;

        // Shift leftover to the front for next iteration
        let leftover_len = buf.len() - process_end;
        if leftover_len > 0 {
            buf.copy_within(process_end.., 0);
        }
        buf.truncate(leftover_len);
    }

    Ok(())
}
3469
/// Read as many bytes as possible into `buf`, retrying on partial reads and
/// on `ErrorKind::Interrupted` (EINTR). Returns the total number of bytes
/// read; a result shorter than `buf.len()` means EOF was reached.
#[inline]
fn read_fully<R: BufRead>(reader: &mut R, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break, // EOF
            Ok(n) => total += n,
            // Retry interrupted syscalls uniformly: the previous version
            // propagated EINTR from the first read as an error while retrying
            // it on subsequent reads.
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
3489
3490/// In-place cut processing for mutable data buffers.
3491/// Returns Some(new_length) if in-place processing succeeded, None if not supported
3492/// for the given configuration (caller should fall back to regular processing).
3493///
3494/// In-place avoids allocating intermediate output buffers — the result is written
3495/// directly into the input buffer (output is always <= input for non-complement modes
3496/// with default output delimiter).
3497///
3498/// Note: if the input does not end with line_delim, we fall back to the regular
3499/// path because GNU cut always adds a trailing line delimiter, and the in-place
3500/// buffer cannot grow beyond the input size.
3501pub fn process_cut_data_mut(data: &mut [u8], cfg: &CutConfig) -> Option<usize> {
3502    if cfg.complement {
3503        return None;
3504    }
3505    // If input doesn't end with line_delim, the output may need an extra byte
3506    // (GNU cut always terminates the last line). In-place can't grow the buffer,
3507    // so fall back to the regular allocating path.
3508    if data.is_empty() || data[data.len() - 1] != cfg.line_delim {
3509        return None;
3510    }
3511
3512    match cfg.mode {
3513        CutMode::Fields => {
3514            // Only handle when output delimiter matches input (single-byte)
3515            if cfg.output_delim.len() != 1 || cfg.output_delim[0] != cfg.delim {
3516                return None;
3517            }
3518            if cfg.delim == cfg.line_delim {
3519                return None;
3520            }
3521            Some(cut_fields_inplace_general(
3522                data,
3523                cfg.delim,
3524                cfg.line_delim,
3525                cfg.ranges,
3526                cfg.suppress_no_delim,
3527            ))
3528        }
3529        CutMode::Bytes | CutMode::Characters => {
3530            if !cfg.output_delim.is_empty() {
3531                return None;
3532            }
3533            Some(cut_bytes_inplace_general(data, cfg.line_delim, cfg.ranges))
3534        }
3535    }
3536}
3537
3538/// In-place generalized field extraction.
3539/// Handles single fields, contiguous ranges, and non-contiguous multi-field patterns.
3540fn cut_fields_inplace_general(
3541    data: &mut [u8],
3542    delim: u8,
3543    line_delim: u8,
3544    ranges: &[Range],
3545    suppress: bool,
3546) -> usize {
3547    // Special case: field 1 only (existing optimized path)
3548    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end == 1 {
3549        return cut_field1_inplace(data, delim, line_delim, suppress);
3550    }
3551
3552    let len = data.len();
3553    if len == 0 {
3554        return 0;
3555    }
3556
3557    let max_field = ranges.last().map_or(0, |r| r.end);
3558    let max_delims = max_field.min(128);
3559    let mut wp: usize = 0;
3560    let mut rp: usize = 0;
3561
3562    while rp < len {
3563        let line_end = memchr::memchr(line_delim, &data[rp..])
3564            .map(|p| rp + p)
3565            .unwrap_or(len);
3566        let line_len = line_end - rp;
3567
3568        // Collect delimiter positions (relative to line start)
3569        let mut delim_pos = [0usize; 128];
3570        let mut num_delims: usize = 0;
3571
3572        for pos in memchr_iter(delim, &data[rp..line_end]) {
3573            if num_delims < max_delims {
3574                delim_pos[num_delims] = pos;
3575                num_delims += 1;
3576                if num_delims >= max_delims {
3577                    break;
3578                }
3579            }
3580        }
3581
3582        if num_delims == 0 {
3583            // No delimiter in line
3584            if !suppress {
3585                if wp != rp {
3586                    data.copy_within(rp..line_end, wp);
3587                }
3588                wp += line_len;
3589                if line_end < len {
3590                    data[wp] = line_delim;
3591                    wp += 1;
3592                }
3593            }
3594        } else {
3595            let total_fields = num_delims + 1;
3596            let mut first_output = true;
3597
3598            for r in ranges {
3599                let range_start = r.start;
3600                let range_end = r.end.min(total_fields);
3601                if range_start > total_fields {
3602                    break;
3603                }
3604                for field_num in range_start..=range_end {
3605                    if field_num > total_fields {
3606                        break;
3607                    }
3608
3609                    let field_start = if field_num == 1 {
3610                        0
3611                    } else if field_num - 2 < num_delims {
3612                        delim_pos[field_num - 2] + 1
3613                    } else {
3614                        continue;
3615                    };
3616                    let field_end = if field_num <= num_delims {
3617                        delim_pos[field_num - 1]
3618                    } else {
3619                        line_len
3620                    };
3621
3622                    if !first_output {
3623                        data[wp] = delim;
3624                        wp += 1;
3625                    }
3626                    let flen = field_end - field_start;
3627                    if flen > 0 {
3628                        data.copy_within(rp + field_start..rp + field_start + flen, wp);
3629                        wp += flen;
3630                    }
3631                    first_output = false;
3632                }
3633            }
3634
3635            if !first_output && line_end < len {
3636                data[wp] = line_delim;
3637                wp += 1;
3638            } else if first_output && line_end < len {
3639                // No fields selected but line had delimiters — output empty line
3640                data[wp] = line_delim;
3641                wp += 1;
3642            }
3643        }
3644
3645        rp = if line_end < len { line_end + 1 } else { len };
3646    }
3647
3648    wp
3649}
3650
3651/// In-place byte/char range extraction.
3652fn cut_bytes_inplace_general(data: &mut [u8], line_delim: u8, ranges: &[Range]) -> usize {
3653    let len = data.len();
3654    if len == 0 {
3655        return 0;
3656    }
3657
3658    // Quick check: single range from byte 1 to end = no-op
3659    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end == usize::MAX {
3660        return len;
3661    }
3662
3663    // Single range from byte 1: fast truncation path
3664    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end < usize::MAX {
3665        return cut_bytes_from_start_inplace(data, line_delim, ranges[0].end);
3666    }
3667
3668    let mut wp: usize = 0;
3669    let mut rp: usize = 0;
3670
3671    while rp < len {
3672        let line_end = memchr::memchr(line_delim, &data[rp..])
3673            .map(|p| rp + p)
3674            .unwrap_or(len);
3675        let line_len = line_end - rp;
3676
3677        for r in ranges {
3678            let start = r.start.saturating_sub(1);
3679            let end = r.end.min(line_len);
3680            if start >= line_len {
3681                break;
3682            }
3683            let flen = end - start;
3684            if flen > 0 {
3685                data.copy_within(rp + start..rp + start + flen, wp);
3686                wp += flen;
3687            }
3688        }
3689
3690        if line_end < len {
3691            data[wp] = line_delim;
3692            wp += 1;
3693        }
3694
3695        rp = if line_end < len { line_end + 1 } else { len };
3696    }
3697
3698    wp
3699}
3700
3701/// In-place truncation for -b1-N: truncate each line to at most max_bytes.
3702fn cut_bytes_from_start_inplace(data: &mut [u8], line_delim: u8, max_bytes: usize) -> usize {
3703    let len = data.len();
3704
3705    // Quick check: see if all lines fit within max_bytes (common case)
3706    let mut all_fit = true;
3707    let mut start = 0;
3708    for pos in memchr_iter(line_delim, data) {
3709        if pos - start > max_bytes {
3710            all_fit = false;
3711            break;
3712        }
3713        start = pos + 1;
3714    }
3715    if all_fit && start < len && len - start > max_bytes {
3716        all_fit = false;
3717    }
3718    if all_fit {
3719        return len;
3720    }
3721
3722    // Some lines need truncation
3723    let mut wp: usize = 0;
3724    let mut rp: usize = 0;
3725
3726    while rp < len {
3727        let line_end = memchr::memchr(line_delim, &data[rp..])
3728            .map(|p| rp + p)
3729            .unwrap_or(len);
3730        let line_len = line_end - rp;
3731
3732        let take = line_len.min(max_bytes);
3733        if take > 0 && wp != rp {
3734            data.copy_within(rp..rp + take, wp);
3735        }
3736        wp += take;
3737
3738        if line_end < len {
3739            data[wp] = line_delim;
3740            wp += 1;
3741        }
3742
3743        rp = if line_end < len { line_end + 1 } else { len };
3744    }
3745
3746    wp
3747}
3748
/// Cut operation mode: which unit of each line `cut` selects.
///
/// Fieldless enum, so `Eq` is derivable alongside `PartialEq`
/// (clippy: `derive_partial_eq_without_eq`); equality is total here.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CutMode {
    /// Select byte positions (`-b`).
    Bytes,
    /// Select character positions (`-c`).
    Characters,
    /// Select delimiter-separated fields (`-f`).
    Fields,
}