Skip to main content

coreutils_rs/cut/
core.rs

1use memchr::memchr_iter;
2use std::io::{self, BufRead, IoSlice, Write};
3
/// Minimum input size, in bytes, for parallel processing (16 MiB).
/// Inputs at or above this threshold are split into chunks and processed with
/// `rayon::scope`; smaller inputs are processed on the calling thread.
/// 16 MiB balances the `split_for_scope` scan overhead against parallel benefits.
const PARALLEL_THRESHOLD: usize = 16 * 1024 * 1024;
8
/// Max iovec entries per writev call (Linux default).
/// `write_ioslices` hands at most this many `IoSlice`s to one `write_vectored` call.
const MAX_IOV: usize = 1024;
11
/// Configuration for cut operations.
pub struct CutConfig<'a> {
    /// Selection mode (`CutMode` is declared outside this part of the file).
    pub mode: CutMode,
    /// Selected 1-based positions; expected to be sorted and non-overlapping,
    /// which is the form `parse_ranges` returns.
    pub ranges: &'a [Range],
    /// When true, select everything that is NOT covered by `ranges`.
    pub complement: bool,
    /// Input field delimiter byte.
    pub delim: u8,
    /// Bytes written between output fields; may differ from `delim`.
    pub output_delim: &'a [u8],
    /// When true, lines that contain no `delim` are dropped instead of echoed.
    pub suppress_no_delim: bool,
    /// Byte that separates input lines and terminates every output line.
    pub line_delim: u8,
}
22
/// A range specification like 1, 3-5, -3, 4-
/// Both bounds are inclusive and 1-based.
#[derive(Debug, Clone)]
pub struct Range {
    /// Lower bound; `parse_ranges` stores 1 when the lower bound is omitted (e.g. `-3`).
    pub start: usize,
    /// Upper bound; `usize::MAX` means "to end" (e.g. `4-`).
    pub end: usize,
}
29
30/// Parse a LIST specification like "1,3-5,7-" into ranges.
31/// Each range is 1-based. Returns sorted, merged ranges.
32pub fn parse_ranges(spec: &str) -> Result<Vec<Range>, String> {
33    let mut ranges = Vec::new();
34
35    for part in spec.split(',') {
36        let part = part.trim();
37        if part.is_empty() {
38            continue;
39        }
40
41        if let Some(idx) = part.find('-') {
42            let left = &part[..idx];
43            let right = &part[idx + 1..];
44
45            let start = if left.is_empty() {
46                1
47            } else {
48                left.parse::<usize>()
49                    .map_err(|_| format!("invalid range: '{}'", part))?
50            };
51
52            let end = if right.is_empty() {
53                usize::MAX
54            } else {
55                right
56                    .parse::<usize>()
57                    .map_err(|_| format!("invalid range: '{}'", part))?
58            };
59
60            if start == 0 {
61                return Err("fields and positions are numbered from 1".to_string());
62            }
63            if start > end {
64                return Err(format!("invalid decreasing range: '{}'", part));
65            }
66
67            ranges.push(Range { start, end });
68        } else {
69            let n = part
70                .parse::<usize>()
71                .map_err(|_| format!("invalid field: '{}'", part))?;
72            if n == 0 {
73                return Err("fields and positions are numbered from 1".to_string());
74            }
75            ranges.push(Range { start: n, end: n });
76        }
77    }
78
79    if ranges.is_empty() {
80        return Err("you must specify a list of bytes, characters, or fields".to_string());
81    }
82
83    // Sort and merge overlapping ranges
84    ranges.sort_by_key(|r| (r.start, r.end));
85    let mut merged = vec![ranges[0].clone()];
86    for r in &ranges[1..] {
87        let last = merged.last_mut().unwrap();
88        if r.start <= last.end.saturating_add(1) {
89            last.end = last.end.max(r.end);
90        } else {
91            merged.push(r.clone());
92        }
93    }
94
95    Ok(merged)
96}
97
98/// Check if a 1-based position is in any range.
99/// Ranges must be sorted. Uses early exit since ranges are sorted.
100#[inline(always)]
101fn in_ranges(ranges: &[Range], pos: usize) -> bool {
102    for r in ranges {
103        if pos < r.start {
104            return false;
105        }
106        if pos <= r.end {
107            return true;
108        }
109    }
110    false
111}
112
113/// Pre-compute a 64-bit mask for field selection.
114/// Bit i-1 is set if field i should be output.
115#[inline]
116fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
117    let mut mask: u64 = 0;
118    for i in 1..=64u32 {
119        let in_range = in_ranges(ranges, i as usize);
120        if in_range != complement {
121            mask |= 1u64 << (i - 1);
122        }
123    }
124    mask
125}
126
127/// Check if a field should be selected, using bitset for first 64 fields.
128#[inline(always)]
129fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
130    if field_num <= 64 {
131        (mask >> (field_num - 1)) & 1 == 1
132    } else {
133        in_ranges(ranges, field_num) != complement
134    }
135}
136
137// ── Unsafe buffer helpers (skip bounds checks in hot loops) ──────────────
138
/// Append `data` to `buf`, growing the buffer only if the spare capacity is too small.
///
/// Hot callers pre-reserve the expected output size, so the `reserve` call is normally
/// a single comparison that falls through. It is kept as a safety net because the
/// output can exceed a caller's reservation (for example when the output delimiter is
/// longer than the input delimiter, or a terminator is appended to an unterminated
/// final line), and writing past the capacity would be undefined behaviour.
///
/// # Safety
/// Still declared `unsafe` so existing call sites compile unchanged; the
/// implementation itself no longer places any requirement on the caller.
#[inline(always)]
unsafe fn buf_extend(buf: &mut Vec<u8>, data: &[u8]) {
    buf.reserve(data.len());
    // SAFETY: `reserve` guarantees at least `data.len()` bytes of spare capacity past
    // `len`, and `data` cannot alias `buf` because `buf` is borrowed exclusively.
    unsafe {
        let len = buf.len();
        std::ptr::copy_nonoverlapping(data.as_ptr(), buf.as_mut_ptr().add(len), data.len());
        buf.set_len(len + data.len());
    }
}
149
/// Append a single byte to `buf`, growing the buffer only if it is already full.
///
/// Hot callers pre-reserve the expected output size, so the `reserve` call is normally
/// a single comparison that falls through. It guards the case where the output
/// outgrows the reservation (e.g. a terminator appended to an unterminated final
/// line), which would otherwise write past the allocation.
///
/// # Safety
/// Still declared `unsafe` so existing call sites compile unchanged; the
/// implementation itself no longer places any requirement on the caller.
#[inline(always)]
unsafe fn buf_push(buf: &mut Vec<u8>, b: u8) {
    buf.reserve(1);
    // SAFETY: `reserve(1)` guarantees one byte of spare capacity at offset `len`.
    unsafe {
        let len = buf.len();
        buf.as_mut_ptr().add(len).write(b);
        buf.set_len(len + 1);
    }
}
160
/// Append `data` followed by the single byte `b` to `buf`.
///
/// Fused operation: one length load/store instead of a separate `buf_extend` +
/// `buf_push`. Intended for copying field content plus its terminator in one call.
/// The buffer grows only if the spare capacity is smaller than `data.len() + 1`;
/// with a pre-reserved buffer the `reserve` call is a single comparison.
///
/// # Safety
/// Still declared `unsafe` so existing call sites compile unchanged; the
/// implementation itself no longer places any requirement on the caller.
#[inline(always)]
unsafe fn buf_extend_byte(buf: &mut Vec<u8>, data: &[u8], b: u8) {
    // A slice never exceeds isize::MAX bytes, so `+ 1` cannot overflow.
    let added = data.len() + 1;
    buf.reserve(added);
    // SAFETY: `reserve` guarantees `added` bytes of spare capacity past `len`, and
    // `data` cannot alias `buf` because `buf` is borrowed exclusively.
    unsafe {
        let len = buf.len();
        let ptr = buf.as_mut_ptr().add(len);
        std::ptr::copy_nonoverlapping(data.as_ptr(), ptr, data.len());
        ptr.add(data.len()).write(b);
        buf.set_len(len + added);
    }
}
175
176/// Write multiple IoSlice buffers using write_vectored (writev syscall).
177/// Batches into MAX_IOV-sized groups. Hot path: single write_vectored succeeds.
178/// Cold path (partial write) is out-of-line to keep the hot loop tight.
179#[inline]
180fn write_ioslices(out: &mut impl Write, slices: &[IoSlice]) -> io::Result<()> {
181    if slices.is_empty() {
182        return Ok(());
183    }
184    for batch in slices.chunks(MAX_IOV) {
185        let total: usize = batch.iter().map(|s| s.len()).sum();
186        let written = out.write_vectored(batch)?;
187        if written >= total {
188            continue;
189        }
190        if written == 0 {
191            return Err(io::Error::new(io::ErrorKind::WriteZero, "write zero"));
192        }
193        write_ioslices_slow(out, batch, written)?;
194    }
195    Ok(())
196}
197
/// Finish a group of slices after a short `write_vectored` (cold path, never inlined).
/// The first `skip` bytes of the group were already written: slices that lie entirely
/// inside that prefix are passed over, the slice straddling it is written from the
/// first unwritten byte, and every later slice is written in full via `write_all`.
#[cold]
#[inline(never)]
fn write_ioslices_slow(
    out: &mut impl Write,
    slices: &[IoSlice],
    mut skip: usize,
) -> io::Result<()> {
    for piece in slices {
        let piece_len = piece.len();
        if skip < piece_len {
            out.write_all(&piece[skip..])?;
            skip = 0;
        } else {
            skip -= piece_len;
        }
    }
    Ok(())
}
217
218// ── Chunk splitting for parallel processing ──────────────────────────────
219
/// Number of CPUs to assume when splitting input into parallel chunks.
/// Asks `std::thread::available_parallelism()` rather than rayon so that the rayon
/// pool is not initialised early (~300-500µs); the pool starts on the first scope() call.
/// Falls back to 1 when the parallelism cannot be determined.
#[inline]
fn num_cpus() -> usize {
    match std::thread::available_parallelism() {
        Ok(count) => count.get(),
        Err(_) => 1,
    }
}
229
230/// Split data into chunks for rayon::scope parallel processing.
231/// Uses Rayon's thread count to match the number of worker threads.
232fn split_for_scope<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
233    let num_threads = num_cpus().max(1);
234    if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
235        return vec![data];
236    }
237
238    let chunk_size = data.len() / num_threads;
239    let mut chunks = Vec::with_capacity(num_threads);
240    let mut pos = 0;
241
242    for _ in 0..num_threads - 1 {
243        let target = pos + chunk_size;
244        if target >= data.len() {
245            break;
246        }
247        let boundary = memchr::memchr(line_delim, &data[target..])
248            .map(|p| target + p + 1)
249            .unwrap_or(data.len());
250        if boundary > pos {
251            chunks.push(&data[pos..boundary]);
252        }
253        pos = boundary;
254    }
255
256    if pos < data.len() {
257        chunks.push(&data[pos..]);
258    }
259
260    chunks
261}
262
263// ── Fast path: multi-field non-contiguous extraction ─────────────────────
264
265/// Multi-field non-contiguous extraction (e.g., `cut -d, -f1,3,5`).
266/// Pre-collects delimiter positions per line into a stack-allocated array,
267/// then directly indexes into them for each selected field.
268/// This is O(max_field) per line instead of O(num_fields * scan_length).
269fn process_fields_multi_select(
270    data: &[u8],
271    delim: u8,
272    line_delim: u8,
273    ranges: &[Range],
274    suppress: bool,
275    out: &mut impl Write,
276) -> io::Result<()> {
277    let max_field = ranges.last().map_or(0, |r| r.end);
278
279    if data.len() >= PARALLEL_THRESHOLD {
280        let chunks = split_for_scope(data, line_delim);
281        let n = chunks.len();
282        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
283        rayon::scope(|s| {
284            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
285                s.spawn(move |_| {
286                    result.reserve(chunk.len() * 3 / 4);
287                    multi_select_chunk(
288                        chunk, delim, line_delim, ranges, max_field, suppress, result,
289                    );
290                });
291            }
292        });
293        let slices: Vec<IoSlice> = results
294            .iter()
295            .filter(|r| !r.is_empty())
296            .map(|r| IoSlice::new(r))
297            .collect();
298        write_ioslices(out, &slices)?;
299    } else {
300        let mut buf = Vec::with_capacity(data.len() * 3 / 4);
301        multi_select_chunk(
302            data, delim, line_delim, ranges, max_field, suppress, &mut buf,
303        );
304        if !buf.is_empty() {
305            out.write_all(&buf)?;
306        }
307    }
308    Ok(())
309}
310
311/// Process a chunk for multi-field extraction using a single-pass memchr2 scan.
312/// Scans for both delimiter and line_delim in one SIMD pass over the entire chunk,
313/// eliminating per-line memchr_iter setup overhead (significant for short lines).
314/// Delimiter positions are collected in a stack array per line.
315/// When max_field is reached on a line, remaining delimiters are ignored.
316fn multi_select_chunk(
317    data: &[u8],
318    delim: u8,
319    line_delim: u8,
320    ranges: &[Range],
321    max_field: usize,
322    suppress: bool,
323    buf: &mut Vec<u8>,
324) {
325    // When delim == line_delim, fall back to two-level approach
326    if delim == line_delim {
327        buf.reserve(data.len());
328        let base = data.as_ptr();
329        let mut start = 0;
330        for end_pos in memchr_iter(line_delim, data) {
331            let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
332            multi_select_line(line, delim, line_delim, ranges, max_field, suppress, buf);
333            start = end_pos + 1;
334        }
335        if start < data.len() {
336            let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
337            multi_select_line(line, delim, line_delim, ranges, max_field, suppress, buf);
338        }
339        return;
340    }
341
342    buf.reserve(data.len());
343    let base = data.as_ptr();
344    let data_len = data.len();
345
346    // Per-line state
347    let mut line_start: usize = 0;
348    let mut delim_pos = [0usize; 64];
349    let mut num_delims: usize = 0;
350    let max_delims = max_field.min(64);
351    let mut at_max = false;
352
353    // Single-pass scan using memchr2 for both delimiter and newline
354    for pos in memchr::memchr2_iter(delim, line_delim, data) {
355        let byte = unsafe { *base.add(pos) };
356
357        if byte == line_delim {
358            // End of line: extract fields from collected positions
359            let line_len = pos - line_start;
360            if num_delims == 0 {
361                // No delimiter in line
362                if !suppress {
363                    unsafe {
364                        buf_extend(
365                            buf,
366                            std::slice::from_raw_parts(base.add(line_start), line_len),
367                        );
368                        buf_push(buf, line_delim);
369                    }
370                }
371            } else {
372                // Extract fields using collected delimiter positions
373                let total_fields = num_delims + 1;
374                let mut first_output = true;
375
376                for r in ranges {
377                    let range_start = r.start;
378                    let range_end = r.end.min(total_fields);
379                    if range_start > total_fields {
380                        break;
381                    }
382                    for field_num in range_start..=range_end {
383                        if field_num > total_fields {
384                            break;
385                        }
386
387                        let field_start = if field_num == 1 {
388                            line_start
389                        } else if field_num - 2 < num_delims {
390                            delim_pos[field_num - 2] + 1
391                        } else {
392                            continue;
393                        };
394                        let field_end = if field_num <= num_delims {
395                            delim_pos[field_num - 1]
396                        } else {
397                            pos
398                        };
399
400                        if !first_output {
401                            unsafe { buf_push(buf, delim) };
402                        }
403                        unsafe {
404                            buf_extend(
405                                buf,
406                                std::slice::from_raw_parts(
407                                    base.add(field_start),
408                                    field_end - field_start,
409                                ),
410                            );
411                        }
412                        first_output = false;
413                    }
414                }
415
416                unsafe { buf_push(buf, line_delim) };
417            }
418
419            // Reset for next line
420            line_start = pos + 1;
421            num_delims = 0;
422            at_max = false;
423        } else {
424            // Delimiter found: collect position (up to max_field)
425            if !at_max && num_delims < max_delims {
426                delim_pos[num_delims] = pos;
427                num_delims += 1;
428                if num_delims >= max_delims {
429                    at_max = true;
430                }
431            }
432        }
433    }
434
435    // Handle last line without trailing line_delim
436    if line_start < data_len {
437        if num_delims == 0 {
438            if !suppress {
439                unsafe {
440                    buf_extend(
441                        buf,
442                        std::slice::from_raw_parts(base.add(line_start), data_len - line_start),
443                    );
444                    buf_push(buf, line_delim);
445                }
446            }
447        } else {
448            let total_fields = num_delims + 1;
449            let mut first_output = true;
450
451            for r in ranges {
452                let range_start = r.start;
453                let range_end = r.end.min(total_fields);
454                if range_start > total_fields {
455                    break;
456                }
457                for field_num in range_start..=range_end {
458                    if field_num > total_fields {
459                        break;
460                    }
461
462                    let field_start = if field_num == 1 {
463                        line_start
464                    } else if field_num - 2 < num_delims {
465                        delim_pos[field_num - 2] + 1
466                    } else {
467                        continue;
468                    };
469                    let field_end = if field_num <= num_delims {
470                        delim_pos[field_num - 1]
471                    } else {
472                        data_len
473                    };
474
475                    if !first_output {
476                        unsafe { buf_push(buf, delim) };
477                    }
478                    unsafe {
479                        buf_extend(
480                            buf,
481                            std::slice::from_raw_parts(
482                                base.add(field_start),
483                                field_end - field_start,
484                            ),
485                        );
486                    }
487                    first_output = false;
488                }
489            }
490
491            unsafe { buf_push(buf, line_delim) };
492        }
493    }
494}
495
496/// Extract selected fields from a single line using delimiter position scanning.
497/// Scans delimiters only up to max_field (early exit), then extracts selected fields
498/// by indexing directly into the collected positions. Since ranges are pre-sorted and
499/// non-overlapping, every field within a range is selected — no is_selected check needed.
500#[inline(always)]
501fn multi_select_line(
502    line: &[u8],
503    delim: u8,
504    line_delim: u8,
505    ranges: &[Range],
506    max_field: usize,
507    suppress: bool,
508    buf: &mut Vec<u8>,
509) {
510    let len = line.len();
511    if len == 0 {
512        if !suppress {
513            unsafe { buf_push(buf, line_delim) };
514        }
515        return;
516    }
517
518    // Note: no per-line buf.reserve — multi_select_chunk already reserves data.len()
519    let base = line.as_ptr();
520
521    // Collect delimiter positions up to max_field (early exit).
522    // Stack array for up to 64 delimiter positions.
523    let mut delim_pos = [0usize; 64];
524    let mut num_delims: usize = 0;
525    let max_delims = max_field.min(64);
526
527    for pos in memchr_iter(delim, line) {
528        if num_delims < max_delims {
529            delim_pos[num_delims] = pos;
530            num_delims += 1;
531            if num_delims >= max_delims {
532                break;
533            }
534        }
535    }
536
537    if num_delims == 0 {
538        if !suppress {
539            unsafe {
540                buf_extend(buf, line);
541                buf_push(buf, line_delim);
542            }
543        }
544        return;
545    }
546
547    // Extract selected fields using delimiter positions.
548    // Ranges are pre-sorted and non-overlapping, so every field_num within a range
549    // is selected — skip the is_selected check entirely (saves 1 function call per field).
550    let total_fields = num_delims + 1;
551    let mut first_output = true;
552
553    for r in ranges {
554        let range_start = r.start;
555        let range_end = r.end.min(total_fields);
556        if range_start > total_fields {
557            break;
558        }
559        for field_num in range_start..=range_end {
560            if field_num > total_fields {
561                break;
562            }
563
564            let field_start = if field_num == 1 {
565                0
566            } else if field_num - 2 < num_delims {
567                delim_pos[field_num - 2] + 1
568            } else {
569                continue;
570            };
571            let field_end = if field_num <= num_delims {
572                delim_pos[field_num - 1]
573            } else {
574                len
575            };
576
577            if !first_output {
578                unsafe { buf_push(buf, delim) };
579            }
580            unsafe {
581                buf_extend(
582                    buf,
583                    std::slice::from_raw_parts(base.add(field_start), field_end - field_start),
584                );
585            }
586            first_output = false;
587        }
588    }
589
590    unsafe { buf_push(buf, line_delim) };
591}
592
593// ── Fast path: field extraction with batched output ──────────────────────
594
/// Optimized field extraction with early exit and batched output.
///
/// Dispatches on the shape of the selection in `cfg`: the specialised routines below
/// are tried in order and the first one whose conditions match handles the whole
/// input. Every specialised path except the single-field one requires the output
/// delimiter to be the single input delimiter byte. Anything left falls through to
/// the general extraction at the end, which runs in parallel chunks for inputs of at
/// least `PARALLEL_THRESHOLD` bytes.
fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
    let delim = cfg.delim;
    let line_delim = cfg.line_delim;
    let ranges = cfg.ranges;
    let complement = cfg.complement;
    let output_delim = cfg.output_delim;
    let suppress = cfg.suppress_no_delim;

    // NOTE: Removed the full-file `memchr(delim, data).is_none()` scan.
    // That scan was O(N) over the entire file just to check an edge case
    // (no delimiter in any line). The per-line processing already handles
    // lines without delimiters correctly, so the scan was pure overhead
    // for files that DO contain delimiters (the common case).

    // Ultra-fast path: single field extraction (e.g., cut -f5).
    // No output-delimiter condition is needed: only one field is ever selected.
    if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
        return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
    }

    // Fast path: complement of a single field with default output delimiter.
    if complement
        && ranges.len() == 1
        && output_delim.len() == 1
        && output_delim[0] == delim
        && ranges[0].start == ranges[0].end
    {
        return process_complement_single_field(
            data,
            delim,
            line_delim,
            ranges[0].start,
            suppress,
            out,
        );
    }

    // Fast path: complement of contiguous range (e.g., --complement -f3-5 = output fields 1,2,6+).
    // This is equivalent to outputting a prefix and a suffix, skipping the middle range.
    if complement
        && ranges.len() == 1
        && ranges[0].start > 1
        && ranges[0].end < usize::MAX
        && output_delim.len() == 1
        && output_delim[0] == delim
    {
        return process_complement_range(
            data,
            delim,
            line_delim,
            ranges[0].start,
            ranges[0].end,
            suppress,
            out,
        );
    }

    // Fast path: contiguous from-start field range (e.g., cut -f1-5)
    if !complement
        && ranges.len() == 1
        && ranges[0].start == 1
        && output_delim.len() == 1
        && output_delim[0] == delim
        && ranges[0].end < usize::MAX
    {
        return process_fields_prefix(data, delim, line_delim, ranges[0].end, suppress, out);
    }

    // Fast path: open-ended field range from field N (e.g., cut -f3-)
    if !complement
        && ranges.len() == 1
        && ranges[0].end == usize::MAX
        && ranges[0].start > 1
        && output_delim.len() == 1
        && output_delim[0] == delim
    {
        return process_fields_suffix(data, delim, line_delim, ranges[0].start, suppress, out);
    }

    // Fast path: contiguous field range with start > 1 (e.g., cut -f2-4)
    if !complement
        && ranges.len() == 1
        && ranges[0].start > 1
        && ranges[0].end < usize::MAX
        && output_delim.len() == 1
        && output_delim[0] == delim
    {
        return process_fields_mid_range(
            data,
            delim,
            line_delim,
            ranges[0].start,
            ranges[0].end,
            suppress,
            out,
        );
    }

    // Fast path: multi-field non-contiguous extraction (e.g., cut -f1,3,5)
    // Uses delimiter position caching: find all delimiter positions per line,
    // then directly index into them for each selected field.
    // This is faster than the general extract_fields_to_buf which re-checks
    // is_selected() for every field encountered.
    // Only taken when the last range is bounded and delim differs from line_delim.
    if !complement
        && ranges.len() > 1
        && ranges.last().map_or(false, |r| r.end < usize::MAX)
        && output_delim.len() == 1
        && output_delim[0] == delim
        && delim != line_delim
    {
        return process_fields_multi_select(data, delim, line_delim, ranges, suppress, out);
    }

    // General field extraction.
    // With complement every field may be selected, so there is no upper bound;
    // otherwise the last (sorted) range carries the highest selected field.
    let max_field = if complement {
        usize::MAX
    } else {
        ranges.last().map(|r| r.end).unwrap_or(0)
    };
    // Selection bitmask for fields 1..=64, computed once and shared by all chunks.
    let field_mask = compute_field_mask(ranges, complement);

    if data.len() >= PARALLEL_THRESHOLD {
        // Parallel: one output buffer per line-aligned chunk, filled inside the scope
        // and then written in chunk order with vectored I/O.
        let chunks = split_for_scope(data, line_delim);
        let n = chunks.len();
        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
        rayon::scope(|s| {
            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
                s.spawn(move |_| {
                    result.reserve(chunk.len());
                    process_fields_chunk(
                        chunk,
                        delim,
                        ranges,
                        output_delim,
                        suppress,
                        max_field,
                        field_mask,
                        line_delim,
                        complement,
                        result,
                    );
                });
            }
        });
        // Empty per-chunk buffers are left out of the vectored write.
        let slices: Vec<IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| IoSlice::new(r))
            .collect();
        write_ioslices(out, &slices)?;
    } else {
        // Sequential: process the whole input into one buffer and write it once.
        let mut buf = Vec::with_capacity(data.len());
        process_fields_chunk(
            data,
            delim,
            ranges,
            output_delim,
            suppress,
            max_field,
            field_mask,
            line_delim,
            complement,
            &mut buf,
        );
        if !buf.is_empty() {
            out.write_all(&buf)?;
        }
    }
    Ok(())
}
765
766/// Process a chunk of data for general field extraction.
767/// When `delim != line_delim`, uses a single-pass memchr2_iter scan to find both
768/// delimiters and line terminators in one SIMD pass, eliminating per-line memchr_iter
769/// setup overhead. When `delim == line_delim`, falls back to the two-level approach.
770fn process_fields_chunk(
771    data: &[u8],
772    delim: u8,
773    ranges: &[Range],
774    output_delim: &[u8],
775    suppress: bool,
776    max_field: usize,
777    field_mask: u64,
778    line_delim: u8,
779    complement: bool,
780    buf: &mut Vec<u8>,
781) {
782    // When delim != line_delim and max_field is bounded, use two-level approach:
783    // outer memchr for newlines, inner memchr_iter for delimiters with early exit.
784    // This avoids scanning past max_field on each line (significant for lines with
785    // many columns but small field selection like -f1,3,5 on 20-column CSV).
786    // For complement or unbounded ranges, use single-pass memchr2_iter which
787    // needs to process all delimiters anyway.
788    if delim != line_delim && max_field < usize::MAX && !complement {
789        buf.reserve(data.len());
790        let mut start = 0;
791        for end_pos in memchr_iter(line_delim, data) {
792            let line = &data[start..end_pos];
793            extract_fields_to_buf(
794                line,
795                delim,
796                ranges,
797                output_delim,
798                suppress,
799                max_field,
800                field_mask,
801                line_delim,
802                buf,
803                complement,
804            );
805            start = end_pos + 1;
806        }
807        if start < data.len() {
808            extract_fields_to_buf(
809                &data[start..],
810                delim,
811                ranges,
812                output_delim,
813                suppress,
814                max_field,
815                field_mask,
816                line_delim,
817                buf,
818                complement,
819            );
820        }
821        return;
822    }
823
824    // Single-pass path for complement or unbounded ranges: memchr2_iter for both
825    // delimiter and line_delim in one SIMD scan.
826    // Uses raw pointer arithmetic to eliminate bounds checking in the hot loop.
827    if delim != line_delim {
828        buf.reserve(data.len());
829
830        let data_len = data.len();
831        let base = data.as_ptr();
832        let mut line_start: usize = 0;
833        let mut field_start: usize = 0;
834        let mut field_num: usize = 1;
835        let mut first_output = true;
836        let mut has_delim = false;
837
838        for pos in memchr::memchr2_iter(delim, line_delim, data) {
839            let byte = unsafe { *base.add(pos) };
840
841            if byte == line_delim {
842                // End of line: flush final field and emit line delimiter
843                if (field_num <= max_field || complement)
844                    && has_delim
845                    && is_selected(field_num, field_mask, ranges, complement)
846                {
847                    if !first_output {
848                        unsafe { buf_extend(buf, output_delim) };
849                    }
850                    unsafe {
851                        buf_extend(
852                            buf,
853                            std::slice::from_raw_parts(base.add(field_start), pos - field_start),
854                        )
855                    };
856                    first_output = false;
857                }
858
859                if !first_output {
860                    unsafe { buf_push(buf, line_delim) };
861                } else if !has_delim {
862                    if !suppress {
863                        unsafe {
864                            buf_extend(
865                                buf,
866                                std::slice::from_raw_parts(base.add(line_start), pos - line_start),
867                            );
868                            buf_push(buf, line_delim);
869                        }
870                    }
871                } else {
872                    unsafe { buf_push(buf, line_delim) };
873                }
874
875                // Reset state for next line
876                line_start = pos + 1;
877                field_start = pos + 1;
878                field_num = 1;
879                first_output = true;
880                has_delim = false;
881            } else {
882                // Field delimiter hit
883                has_delim = true;
884
885                if is_selected(field_num, field_mask, ranges, complement) {
886                    if !first_output {
887                        unsafe { buf_extend(buf, output_delim) };
888                    }
889                    unsafe {
890                        buf_extend(
891                            buf,
892                            std::slice::from_raw_parts(base.add(field_start), pos - field_start),
893                        )
894                    };
895                    first_output = false;
896                }
897
898                field_num += 1;
899                field_start = pos + 1;
900            }
901        }
902
903        // Handle last line without trailing line_delim
904        if line_start < data_len {
905            if line_start < data_len {
906                if (field_num <= max_field || complement)
907                    && has_delim
908                    && is_selected(field_num, field_mask, ranges, complement)
909                {
910                    if !first_output {
911                        unsafe { buf_extend(buf, output_delim) };
912                    }
913                    unsafe {
914                        buf_extend(
915                            buf,
916                            std::slice::from_raw_parts(
917                                base.add(field_start),
918                                data_len - field_start,
919                            ),
920                        )
921                    };
922                    first_output = false;
923                }
924
925                if !first_output {
926                    unsafe { buf_push(buf, line_delim) };
927                } else if !has_delim {
928                    if !suppress {
929                        unsafe {
930                            buf_extend(
931                                buf,
932                                std::slice::from_raw_parts(
933                                    base.add(line_start),
934                                    data_len - line_start,
935                                ),
936                            );
937                            buf_push(buf, line_delim);
938                        }
939                    }
940                } else {
941                    unsafe { buf_push(buf, line_delim) };
942                }
943            }
944        }
945
946        return;
947    }
948
949    // Fallback: when delim == line_delim, use the two-level scan approach
950    let mut start = 0;
951    for end_pos in memchr_iter(line_delim, data) {
952        let line = &data[start..end_pos];
953        extract_fields_to_buf(
954            line,
955            delim,
956            ranges,
957            output_delim,
958            suppress,
959            max_field,
960            field_mask,
961            line_delim,
962            buf,
963            complement,
964        );
965        start = end_pos + 1;
966    }
967    if start < data.len() {
968        extract_fields_to_buf(
969            &data[start..],
970            delim,
971            ranges,
972            output_delim,
973            suppress,
974            max_field,
975            field_mask,
976            line_delim,
977            buf,
978            complement,
979        );
980    }
981}
982
983// ── Ultra-fast single field extraction ───────────────────────────────────
984
985/// Specialized path for extracting exactly one field (e.g., `cut -f5`).
986/// Uses combined memchr2_iter SIMD scan when delim != line_delim for a single
987/// pass over the data (vs. nested loops: outer newline scan + inner delim scan).
988fn process_single_field(
989    data: &[u8],
990    delim: u8,
991    line_delim: u8,
992    target: usize,
993    suppress: bool,
994    out: &mut impl Write,
995) -> io::Result<()> {
996    let target_idx = target - 1;
997
998    // For single-field extraction, parallelize at 16MB+ to match PARALLEL_THRESHOLD.
999    const FIELD_PARALLEL_MIN: usize = 16 * 1024 * 1024;
1000
1001    if delim != line_delim {
1002        // Field 1 fast path: memchr2 single-pass scan.
1003        // For field 1, the first delimiter IS the field boundary. Lines without
1004        // delimiter are passed through unchanged.
1005        if target_idx == 0 && !suppress {
1006            if data.len() >= FIELD_PARALLEL_MIN {
1007                return single_field1_parallel(data, delim, line_delim, out);
1008            }
1009            // Sequential: scan with memchr2 into buffer, single write_all.
1010            // Faster than writev/IoSlice for moderate data because it produces
1011            // one contiguous buffer → one write syscall, and avoids IoSlice
1012            // allocation overhead for high-delimiter-density data.
1013            let mut buf = Vec::with_capacity(data.len() + 1);
1014            single_field1_to_buf(data, delim, line_delim, &mut buf);
1015            if !buf.is_empty() {
1016                out.write_all(&buf)?;
1017            }
1018            return Ok(());
1019        }
1020
1021        // Two-level approach for field N: outer newline scan + inner delim scan
1022        // with early exit at target_idx. Faster than memchr2 single-pass because
1023        // we only scan delimiters up to target_idx per line (not all of them).
1024        if data.len() >= FIELD_PARALLEL_MIN {
1025            let chunks = split_for_scope(data, line_delim);
1026            let n = chunks.len();
1027            let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1028            rayon::scope(|s| {
1029                for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1030                    s.spawn(move |_| {
1031                        result.reserve(chunk.len() / 2);
1032                        process_single_field_chunk(
1033                            chunk, delim, target_idx, line_delim, suppress, result,
1034                        );
1035                    });
1036                }
1037            });
1038            let slices: Vec<IoSlice> = results
1039                .iter()
1040                .filter(|r| !r.is_empty())
1041                .map(|r| IoSlice::new(r))
1042                .collect();
1043            write_ioslices(out, &slices)?;
1044        } else {
1045            let mut buf = Vec::with_capacity(data.len().min(4 * 1024 * 1024));
1046            process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
1047            if !buf.is_empty() {
1048                out.write_all(&buf)?;
1049            }
1050        }
1051        return Ok(());
1052    }
1053
1054    // Fallback for delim == line_delim: nested loop approach
1055    if data.len() >= FIELD_PARALLEL_MIN {
1056        let chunks = split_for_scope(data, line_delim);
1057        let n = chunks.len();
1058        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1059        rayon::scope(|s| {
1060            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1061                s.spawn(move |_| {
1062                    result.reserve(chunk.len() / 4);
1063                    process_single_field_chunk(
1064                        chunk, delim, target_idx, line_delim, suppress, result,
1065                    );
1066                });
1067            }
1068        });
1069        let slices: Vec<IoSlice> = results
1070            .iter()
1071            .filter(|r| !r.is_empty())
1072            .map(|r| IoSlice::new(r))
1073            .collect();
1074        write_ioslices(out, &slices)?;
1075    } else {
1076        let mut buf = Vec::with_capacity(data.len() / 4);
1077        process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
1078        if !buf.is_empty() {
1079            out.write_all(&buf)?;
1080        }
1081    }
1082    Ok(())
1083}
1084
1085/// Complement range extraction: skip fields start..=end, output rest (e.g., --complement -f3-5).
1086/// For each line: output fields 1..start-1, then fields end+1..EOF, skipping fields start..end.
1087fn process_complement_range(
1088    data: &[u8],
1089    delim: u8,
1090    line_delim: u8,
1091    skip_start: usize,
1092    skip_end: usize,
1093    suppress: bool,
1094    out: &mut impl Write,
1095) -> io::Result<()> {
1096    if data.len() >= PARALLEL_THRESHOLD {
1097        let chunks = split_for_scope(data, line_delim);
1098        let n = chunks.len();
1099        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1100        rayon::scope(|s| {
1101            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1102                s.spawn(move |_| {
1103                    result.reserve(chunk.len());
1104                    complement_range_chunk(
1105                        chunk, delim, skip_start, skip_end, line_delim, suppress, result,
1106                    );
1107                });
1108            }
1109        });
1110        let slices: Vec<IoSlice> = results
1111            .iter()
1112            .filter(|r| !r.is_empty())
1113            .map(|r| IoSlice::new(r))
1114            .collect();
1115        write_ioslices(out, &slices)?;
1116    } else {
1117        let mut buf = Vec::with_capacity(data.len());
1118        complement_range_chunk(
1119            data, delim, skip_start, skip_end, line_delim, suppress, &mut buf,
1120        );
1121        if !buf.is_empty() {
1122            out.write_all(&buf)?;
1123        }
1124    }
1125    Ok(())
1126}
1127
1128/// Process a chunk for complement range extraction.
1129fn complement_range_chunk(
1130    data: &[u8],
1131    delim: u8,
1132    skip_start: usize,
1133    skip_end: usize,
1134    line_delim: u8,
1135    suppress: bool,
1136    buf: &mut Vec<u8>,
1137) {
1138    // Pre-reserve entire chunk capacity to eliminate per-line reserve overhead.
1139    buf.reserve(data.len());
1140    let mut start = 0;
1141    for end_pos in memchr_iter(line_delim, data) {
1142        let line = &data[start..end_pos];
1143        complement_range_line(line, delim, skip_start, skip_end, line_delim, suppress, buf);
1144        start = end_pos + 1;
1145    }
1146    if start < data.len() {
1147        complement_range_line(
1148            &data[start..],
1149            delim,
1150            skip_start,
1151            skip_end,
1152            line_delim,
1153            suppress,
1154            buf,
1155        );
1156    }
1157}
1158
1159/// Extract all fields except skip_start..=skip_end from one line.
1160/// Outputs fields 1..skip_start-1, then fields skip_end+1..EOF.
1161///
1162/// Optimized: only scans for enough delimiters to find the skip region boundaries.
1163/// For `--complement -f3-5` with 20 fields, this finds delimiter 2 and 5, then
1164/// does a single copy of prefix + suffix, avoiding scanning past field 5.
1165#[inline(always)]
1166fn complement_range_line(
1167    line: &[u8],
1168    delim: u8,
1169    skip_start: usize,
1170    skip_end: usize,
1171    line_delim: u8,
1172    suppress: bool,
1173    buf: &mut Vec<u8>,
1174) {
1175    let len = line.len();
1176    if len == 0 {
1177        if !suppress {
1178            unsafe { buf_push(buf, line_delim) };
1179        }
1180        return;
1181    }
1182
1183    // Note: no per-line buf.reserve — complement_range_chunk already reserves data.len()
1184    let base = line.as_ptr();
1185
1186    // 1-based field numbers. To skip fields skip_start..=skip_end:
1187    // - prefix_end = position of (skip_start-1)th delimiter (exclusive; end of prefix fields)
1188    // - suffix_start = position after skip_end-th delimiter (inclusive; start of suffix fields)
1189    //
1190    // Find the first (skip_start - 1) delimiters to locate prefix_end,
1191    // then the next (skip_end - skip_start + 1) delimiters to locate suffix_start.
1192
1193    let need_prefix_delims = skip_start - 1; // number of delimiters before the skip region
1194    let need_skip_delims = skip_end - skip_start + 1; // delimiters within the skip region
1195    let total_need = need_prefix_delims + need_skip_delims;
1196
1197    // Find delimiter positions up to total_need
1198    let mut delim_count: usize = 0;
1199    let mut prefix_end_pos: usize = usize::MAX; // byte position of (skip_start-1)th delim
1200    let mut suffix_start_pos: usize = usize::MAX; // byte position after skip_end-th delim
1201
1202    for pos in memchr_iter(delim, line) {
1203        delim_count += 1;
1204        if delim_count == need_prefix_delims {
1205            prefix_end_pos = pos;
1206        }
1207        if delim_count == total_need {
1208            suffix_start_pos = pos + 1;
1209            break;
1210        }
1211    }
1212
1213    if delim_count == 0 {
1214        // No delimiter at all
1215        if !suppress {
1216            unsafe {
1217                buf_extend(buf, line);
1218                buf_push(buf, line_delim);
1219            }
1220        }
1221        return;
1222    }
1223
1224    // Case analysis:
1225    // 1. Not enough delims to reach skip_start: all fields are before skip region, output all
1226    // 2. Enough to reach skip_start but not skip_end: prefix + no suffix
1227    // 3. Enough to reach skip_end: prefix + delim + suffix
1228
1229    if delim_count < need_prefix_delims {
1230        // Not enough fields to reach skip region — output entire line
1231        unsafe {
1232            buf_extend(buf, line);
1233            buf_push(buf, line_delim);
1234        }
1235        return;
1236    }
1237
1238    let has_prefix = need_prefix_delims > 0;
1239    let has_suffix = suffix_start_pos != usize::MAX && suffix_start_pos < len;
1240
1241    if has_prefix && has_suffix {
1242        // Output: prefix (up to prefix_end_pos) + delim + suffix (from suffix_start_pos)
1243        unsafe {
1244            buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
1245            buf_push(buf, delim);
1246            buf_extend(
1247                buf,
1248                std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
1249            );
1250            buf_push(buf, line_delim);
1251        }
1252    } else if has_prefix {
1253        // Only prefix, no suffix (skip region extends to end of line)
1254        unsafe {
1255            buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
1256            buf_push(buf, line_delim);
1257        }
1258    } else if has_suffix {
1259        // No prefix (skip_start == 1), only suffix
1260        unsafe {
1261            buf_extend(
1262                buf,
1263                std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
1264            );
1265            buf_push(buf, line_delim);
1266        }
1267    } else {
1268        // All fields skipped
1269        unsafe { buf_push(buf, line_delim) };
1270    }
1271}
1272
1273/// Complement single-field extraction: skip one field, output rest unchanged.
1274fn process_complement_single_field(
1275    data: &[u8],
1276    delim: u8,
1277    line_delim: u8,
1278    skip_field: usize,
1279    suppress: bool,
1280    out: &mut impl Write,
1281) -> io::Result<()> {
1282    let skip_idx = skip_field - 1;
1283
1284    if data.len() >= PARALLEL_THRESHOLD {
1285        let chunks = split_for_scope(data, line_delim);
1286        let n = chunks.len();
1287        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1288        rayon::scope(|s| {
1289            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1290                s.spawn(move |_| {
1291                    result.reserve(chunk.len());
1292                    complement_single_field_chunk(
1293                        chunk, delim, skip_idx, line_delim, suppress, result,
1294                    );
1295                });
1296            }
1297        });
1298        let slices: Vec<IoSlice> = results
1299            .iter()
1300            .filter(|r| !r.is_empty())
1301            .map(|r| IoSlice::new(r))
1302            .collect();
1303        write_ioslices(out, &slices)?;
1304    } else {
1305        let mut buf = Vec::with_capacity(data.len());
1306        complement_single_field_chunk(data, delim, skip_idx, line_delim, suppress, &mut buf);
1307        if !buf.is_empty() {
1308            out.write_all(&buf)?;
1309        }
1310    }
1311    Ok(())
1312}
1313
1314/// Process a chunk for complement single-field extraction using memchr2 single-pass.
1315/// Scans for both delimiter and line_delim in one SIMD pass, tracking delimiter count
1316/// per line. When the skip field's bounding delimiters are found, copies prefix + suffix.
1317/// This eliminates the per-line memchr_iter setup overhead and reduces from two SIMD
1318/// passes (outer newline scan + inner delimiter scan) to one.
1319fn complement_single_field_chunk(
1320    data: &[u8],
1321    delim: u8,
1322    skip_idx: usize,
1323    line_delim: u8,
1324    suppress: bool,
1325    buf: &mut Vec<u8>,
1326) {
1327    // When delim == line_delim, fall back to per-line approach
1328    if delim == line_delim {
1329        buf.reserve(data.len());
1330        let mut start = 0;
1331        for end_pos in memchr_iter(line_delim, data) {
1332            let line = &data[start..end_pos];
1333            complement_single_field_line(line, delim, skip_idx, line_delim, suppress, buf);
1334            start = end_pos + 1;
1335        }
1336        if start < data.len() {
1337            complement_single_field_line(
1338                &data[start..],
1339                delim,
1340                skip_idx,
1341                line_delim,
1342                suppress,
1343                buf,
1344            );
1345        }
1346        return;
1347    }
1348
1349    buf.reserve(data.len());
1350    let base = data.as_ptr();
1351    let data_len = data.len();
1352    let need_before = skip_idx; // delimiters before skip field
1353    let need_total = skip_idx + 1; // delimiters to find end of skip field
1354
1355    // Per-line state
1356    let mut line_start: usize = 0;
1357    let mut delim_count: usize = 0;
1358    let mut skip_start_pos: usize = 0;
1359    let mut skip_end_pos: usize = 0;
1360    let mut found_start = need_before == 0; // skip_idx==0 means skip starts at line start
1361    let mut found_end = false;
1362
1363    for pos in memchr::memchr2_iter(delim, line_delim, data) {
1364        let byte = unsafe { *base.add(pos) };
1365
1366        if byte == line_delim {
1367            // End of line: emit based on what we found
1368            if delim_count == 0 {
1369                // No delimiter in line
1370                if !suppress {
1371                    unsafe {
1372                        buf_extend(
1373                            buf,
1374                            std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1375                        );
1376                        buf_push(buf, line_delim);
1377                    }
1378                }
1379            } else if !found_start || delim_count < need_before {
1380                // Not enough delimiters to reach skip field — output entire line
1381                unsafe {
1382                    buf_extend(
1383                        buf,
1384                        std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1385                    );
1386                    buf_push(buf, line_delim);
1387                }
1388            } else {
1389                let has_prefix = skip_idx > 0;
1390                let has_suffix = found_end && skip_end_pos < pos;
1391
1392                if has_prefix && has_suffix {
1393                    unsafe {
1394                        buf_extend(
1395                            buf,
1396                            std::slice::from_raw_parts(
1397                                base.add(line_start),
1398                                skip_start_pos - 1 - line_start,
1399                            ),
1400                        );
1401                        buf_push(buf, delim);
1402                        buf_extend(
1403                            buf,
1404                            std::slice::from_raw_parts(
1405                                base.add(skip_end_pos + 1),
1406                                pos - skip_end_pos - 1,
1407                            ),
1408                        );
1409                        buf_push(buf, line_delim);
1410                    }
1411                } else if has_prefix {
1412                    unsafe {
1413                        buf_extend(
1414                            buf,
1415                            std::slice::from_raw_parts(
1416                                base.add(line_start),
1417                                skip_start_pos - 1 - line_start,
1418                            ),
1419                        );
1420                        buf_push(buf, line_delim);
1421                    }
1422                } else if has_suffix {
1423                    unsafe {
1424                        buf_extend(
1425                            buf,
1426                            std::slice::from_raw_parts(
1427                                base.add(skip_end_pos + 1),
1428                                pos - skip_end_pos - 1,
1429                            ),
1430                        );
1431                        buf_push(buf, line_delim);
1432                    }
1433                } else {
1434                    unsafe { buf_push(buf, line_delim) };
1435                }
1436            }
1437
1438            // Reset for next line
1439            line_start = pos + 1;
1440            delim_count = 0;
1441            skip_start_pos = 0;
1442            skip_end_pos = 0;
1443            found_start = need_before == 0;
1444            found_end = false;
1445        } else {
1446            // Delimiter found
1447            delim_count += 1;
1448            if delim_count == need_before {
1449                skip_start_pos = pos + 1;
1450                found_start = true;
1451            }
1452            if delim_count == need_total {
1453                skip_end_pos = pos;
1454                found_end = true;
1455            }
1456        }
1457    }
1458
1459    // Handle last line without trailing line_delim
1460    if line_start < data_len {
1461        let pos = data_len;
1462        if delim_count == 0 {
1463            if !suppress {
1464                unsafe {
1465                    buf_extend(
1466                        buf,
1467                        std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1468                    );
1469                    buf_push(buf, line_delim);
1470                }
1471            }
1472        } else if !found_start || delim_count < need_before {
1473            unsafe {
1474                buf_extend(
1475                    buf,
1476                    std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1477                );
1478                buf_push(buf, line_delim);
1479            }
1480        } else {
1481            let has_prefix = skip_idx > 0;
1482            let has_suffix = found_end && skip_end_pos < pos;
1483
1484            if has_prefix && has_suffix {
1485                unsafe {
1486                    buf_extend(
1487                        buf,
1488                        std::slice::from_raw_parts(
1489                            base.add(line_start),
1490                            skip_start_pos - 1 - line_start,
1491                        ),
1492                    );
1493                    buf_push(buf, delim);
1494                    buf_extend(
1495                        buf,
1496                        std::slice::from_raw_parts(
1497                            base.add(skip_end_pos + 1),
1498                            pos - skip_end_pos - 1,
1499                        ),
1500                    );
1501                    buf_push(buf, line_delim);
1502                }
1503            } else if has_prefix {
1504                unsafe {
1505                    buf_extend(
1506                        buf,
1507                        std::slice::from_raw_parts(
1508                            base.add(line_start),
1509                            skip_start_pos - 1 - line_start,
1510                        ),
1511                    );
1512                    buf_push(buf, line_delim);
1513                }
1514            } else if has_suffix {
1515                unsafe {
1516                    buf_extend(
1517                        buf,
1518                        std::slice::from_raw_parts(
1519                            base.add(skip_end_pos + 1),
1520                            pos - skip_end_pos - 1,
1521                        ),
1522                    );
1523                    buf_push(buf, line_delim);
1524                }
1525            } else {
1526                unsafe { buf_push(buf, line_delim) };
1527            }
1528        }
1529    }
1530}
1531
1532/// Fallback per-line complement single-field extraction (for delim == line_delim).
1533#[inline(always)]
1534fn complement_single_field_line(
1535    line: &[u8],
1536    delim: u8,
1537    skip_idx: usize,
1538    line_delim: u8,
1539    suppress: bool,
1540    buf: &mut Vec<u8>,
1541) {
1542    let len = line.len();
1543    if len == 0 {
1544        if !suppress {
1545            unsafe { buf_push(buf, line_delim) };
1546        }
1547        return;
1548    }
1549
1550    let base = line.as_ptr();
1551    let need_before = skip_idx;
1552    let need_total = skip_idx + 1;
1553
1554    let mut delim_count: usize = 0;
1555    let mut skip_start_pos: usize = 0;
1556    let mut skip_end_pos: usize = len;
1557    let mut found_end = false;
1558
1559    for pos in memchr_iter(delim, line) {
1560        delim_count += 1;
1561        if delim_count == need_before {
1562            skip_start_pos = pos + 1;
1563        }
1564        if delim_count == need_total {
1565            skip_end_pos = pos;
1566            found_end = true;
1567            break;
1568        }
1569    }
1570
1571    if delim_count == 0 {
1572        if !suppress {
1573            unsafe {
1574                buf_extend(buf, line);
1575                buf_push(buf, line_delim);
1576            }
1577        }
1578        return;
1579    }
1580
1581    if delim_count < need_before {
1582        unsafe {
1583            buf_extend(buf, line);
1584            buf_push(buf, line_delim);
1585        }
1586        return;
1587    }
1588
1589    let has_prefix = skip_idx > 0 && skip_start_pos > 0;
1590    let has_suffix = found_end && skip_end_pos < len;
1591
1592    if has_prefix && has_suffix {
1593        unsafe {
1594            buf_extend(buf, std::slice::from_raw_parts(base, skip_start_pos - 1));
1595            buf_push(buf, delim);
1596            buf_extend(
1597                buf,
1598                std::slice::from_raw_parts(base.add(skip_end_pos + 1), len - skip_end_pos - 1),
1599            );
1600            buf_push(buf, line_delim);
1601        }
1602    } else if has_prefix {
1603        unsafe {
1604            buf_extend(buf, std::slice::from_raw_parts(base, skip_start_pos - 1));
1605            buf_push(buf, line_delim);
1606        }
1607    } else if has_suffix {
1608        unsafe {
1609            buf_extend(
1610                buf,
1611                std::slice::from_raw_parts(base.add(skip_end_pos + 1), len - skip_end_pos - 1),
1612            );
1613            buf_push(buf, line_delim);
1614        }
1615    } else {
1616        unsafe { buf_push(buf, line_delim) };
1617    }
1618}
1619
1620/// Contiguous from-start field range extraction (e.g., `cut -f1-5`).
1621/// Zero-copy for the non-parallel path: identifies the truncation point per line
1622/// and writes contiguous runs directly from the source data.
1623fn process_fields_prefix(
1624    data: &[u8],
1625    delim: u8,
1626    line_delim: u8,
1627    last_field: usize,
1628    suppress: bool,
1629    out: &mut impl Write,
1630) -> io::Result<()> {
1631    if data.len() >= PARALLEL_THRESHOLD {
1632        let chunks = split_for_scope(data, line_delim);
1633        let n = chunks.len();
1634        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1635        rayon::scope(|s| {
1636            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1637                s.spawn(move |_| {
1638                    result.reserve(chunk.len());
1639                    fields_prefix_chunk(chunk, delim, line_delim, last_field, suppress, result);
1640                });
1641            }
1642        });
1643        let slices: Vec<IoSlice> = results
1644            .iter()
1645            .filter(|r| !r.is_empty())
1646            .map(|r| IoSlice::new(r))
1647            .collect();
1648        write_ioslices(out, &slices)?;
1649    } else if !suppress {
1650        // Zero-copy fast path: scan for truncation points, write runs from source.
1651        // When suppress is false, every line is output (with or without delimiter).
1652        // Most lines have enough fields, so the output is often identical to input.
1653        fields_prefix_zerocopy(data, delim, line_delim, last_field, out)?;
1654    } else {
1655        let mut buf = Vec::with_capacity(data.len());
1656        fields_prefix_chunk(data, delim, line_delim, last_field, suppress, &mut buf);
1657        if !buf.is_empty() {
1658            out.write_all(&buf)?;
1659        }
1660    }
1661    Ok(())
1662}
1663
1664/// Zero-copy field-prefix extraction using writev: builds IoSlice entries pointing
1665/// directly into the source data, flushing in MAX_IOV-sized batches.
1666/// For lines where the Nth delimiter exists, we truncate at that point.
1667/// For lines with fewer fields, we output them unchanged (contiguous run).
1668/// Lines without any delimiter are output unchanged (suppress=false assumed).
1669#[inline]
1670fn fields_prefix_zerocopy(
1671    data: &[u8],
1672    delim: u8,
1673    line_delim: u8,
1674    last_field: usize,
1675    out: &mut impl Write,
1676) -> io::Result<()> {
1677    let newline_buf: [u8; 1] = [line_delim];
1678    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
1679    let mut start = 0;
1680    let mut run_start: usize = 0;
1681
1682    for end_pos in memchr_iter(line_delim, data) {
1683        let line = &data[start..end_pos];
1684        let mut field_count = 1;
1685        let mut truncate_at: Option<usize> = None;
1686        for dpos in memchr_iter(delim, line) {
1687            if field_count >= last_field {
1688                truncate_at = Some(start + dpos);
1689                break;
1690            }
1691            field_count += 1;
1692        }
1693
1694        if let Some(trunc_pos) = truncate_at {
1695            if run_start < start {
1696                iov.push(IoSlice::new(&data[run_start..start]));
1697            }
1698            iov.push(IoSlice::new(&data[start..trunc_pos]));
1699            iov.push(IoSlice::new(&newline_buf));
1700            run_start = end_pos + 1;
1701
1702            if iov.len() >= MAX_IOV - 2 {
1703                write_ioslices(out, &iov)?;
1704                iov.clear();
1705            }
1706        }
1707        start = end_pos + 1;
1708    }
1709    // Handle last line without terminator
1710    if start < data.len() {
1711        let line = &data[start..];
1712        let mut field_count = 1;
1713        let mut truncate_at: Option<usize> = None;
1714        for dpos in memchr_iter(delim, line) {
1715            if field_count >= last_field {
1716                truncate_at = Some(start + dpos);
1717                break;
1718            }
1719            field_count += 1;
1720        }
1721        if let Some(trunc_pos) = truncate_at {
1722            if run_start < start {
1723                iov.push(IoSlice::new(&data[run_start..start]));
1724            }
1725            iov.push(IoSlice::new(&data[start..trunc_pos]));
1726            iov.push(IoSlice::new(&newline_buf));
1727            if !iov.is_empty() {
1728                write_ioslices(out, &iov)?;
1729            }
1730            return Ok(());
1731        }
1732    }
1733    // Flush remaining contiguous run
1734    if run_start < data.len() {
1735        iov.push(IoSlice::new(&data[run_start..]));
1736        if !data.is_empty() && *data.last().unwrap() != line_delim {
1737            iov.push(IoSlice::new(&newline_buf));
1738        }
1739    }
1740    if !iov.is_empty() {
1741        write_ioslices(out, &iov)?;
1742    }
1743    Ok(())
1744}
1745
1746/// Process a chunk for contiguous from-start field range extraction.
1747fn fields_prefix_chunk(
1748    data: &[u8],
1749    delim: u8,
1750    line_delim: u8,
1751    last_field: usize,
1752    suppress: bool,
1753    buf: &mut Vec<u8>,
1754) {
1755    buf.reserve(data.len());
1756    let mut start = 0;
1757    for end_pos in memchr_iter(line_delim, data) {
1758        let line = &data[start..end_pos];
1759        fields_prefix_line(line, delim, line_delim, last_field, suppress, buf);
1760        start = end_pos + 1;
1761    }
1762    if start < data.len() {
1763        fields_prefix_line(&data[start..], delim, line_delim, last_field, suppress, buf);
1764    }
1765}
1766
1767/// Extract first N fields from one line (contiguous from-start range).
1768/// Uses memchr SIMD for delimiter scanning on all line sizes.
1769#[inline(always)]
1770fn fields_prefix_line(
1771    line: &[u8],
1772    delim: u8,
1773    line_delim: u8,
1774    last_field: usize,
1775    suppress: bool,
1776    buf: &mut Vec<u8>,
1777) {
1778    let len = line.len();
1779    if len == 0 {
1780        if !suppress {
1781            unsafe { buf_push(buf, line_delim) };
1782        }
1783        return;
1784    }
1785
1786    // Note: no per-line buf.reserve — fields_prefix_chunk already reserves data.len()
1787    let base = line.as_ptr();
1788
1789    let mut field_count = 1usize;
1790    let mut has_delim = false;
1791
1792    for pos in memchr_iter(delim, line) {
1793        has_delim = true;
1794        if field_count >= last_field {
1795            unsafe {
1796                buf_extend(buf, std::slice::from_raw_parts(base, pos));
1797                buf_push(buf, line_delim);
1798            }
1799            return;
1800        }
1801        field_count += 1;
1802    }
1803
1804    if !has_delim {
1805        if !suppress {
1806            unsafe {
1807                buf_extend(buf, line);
1808                buf_push(buf, line_delim);
1809            }
1810        }
1811        return;
1812    }
1813
1814    unsafe {
1815        buf_extend(buf, line);
1816        buf_push(buf, line_delim);
1817    }
1818}
1819
1820/// Open-ended field suffix extraction (e.g., `cut -f3-`).
1821fn process_fields_suffix(
1822    data: &[u8],
1823    delim: u8,
1824    line_delim: u8,
1825    start_field: usize,
1826    suppress: bool,
1827    out: &mut impl Write,
1828) -> io::Result<()> {
1829    if data.len() >= PARALLEL_THRESHOLD {
1830        let chunks = split_for_scope(data, line_delim);
1831        let n = chunks.len();
1832        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1833        rayon::scope(|s| {
1834            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1835                s.spawn(move |_| {
1836                    result.reserve(chunk.len());
1837                    fields_suffix_chunk(chunk, delim, line_delim, start_field, suppress, result);
1838                });
1839            }
1840        });
1841        let slices: Vec<IoSlice> = results
1842            .iter()
1843            .filter(|r| !r.is_empty())
1844            .map(|r| IoSlice::new(r))
1845            .collect();
1846        write_ioslices(out, &slices)?;
1847    } else {
1848        let mut buf = Vec::with_capacity(data.len());
1849        fields_suffix_chunk(data, delim, line_delim, start_field, suppress, &mut buf);
1850        if !buf.is_empty() {
1851            out.write_all(&buf)?;
1852        }
1853    }
1854    Ok(())
1855}
1856
1857/// Process a chunk for open-ended field suffix extraction.
1858fn fields_suffix_chunk(
1859    data: &[u8],
1860    delim: u8,
1861    line_delim: u8,
1862    start_field: usize,
1863    suppress: bool,
1864    buf: &mut Vec<u8>,
1865) {
1866    buf.reserve(data.len());
1867    let mut start = 0;
1868    for end_pos in memchr_iter(line_delim, data) {
1869        let line = &data[start..end_pos];
1870        fields_suffix_line(line, delim, line_delim, start_field, suppress, buf);
1871        start = end_pos + 1;
1872    }
1873    if start < data.len() {
1874        fields_suffix_line(
1875            &data[start..],
1876            delim,
1877            line_delim,
1878            start_field,
1879            suppress,
1880            buf,
1881        );
1882    }
1883}
1884
1885/// Extract fields from start_field to end from one line.
1886/// Uses memchr SIMD for delimiter scanning on all line sizes.
1887#[inline(always)]
1888fn fields_suffix_line(
1889    line: &[u8],
1890    delim: u8,
1891    line_delim: u8,
1892    start_field: usize,
1893    suppress: bool,
1894    buf: &mut Vec<u8>,
1895) {
1896    let len = line.len();
1897    if len == 0 {
1898        if !suppress {
1899            unsafe { buf_push(buf, line_delim) };
1900        }
1901        return;
1902    }
1903
1904    // Note: no per-line buf.reserve — fields_suffix_chunk already reserves data.len()
1905    let base = line.as_ptr();
1906
1907    let skip_delims = start_field - 1;
1908    let mut delim_count = 0usize;
1909    let mut has_delim = false;
1910
1911    for pos in memchr_iter(delim, line) {
1912        has_delim = true;
1913        delim_count += 1;
1914        if delim_count >= skip_delims {
1915            unsafe {
1916                buf_extend(
1917                    buf,
1918                    std::slice::from_raw_parts(base.add(pos + 1), len - pos - 1),
1919                );
1920                buf_push(buf, line_delim);
1921            }
1922            return;
1923        }
1924    }
1925
1926    if !has_delim {
1927        if !suppress {
1928            unsafe {
1929                buf_extend(buf, line);
1930                buf_push(buf, line_delim);
1931            }
1932        }
1933        return;
1934    }
1935
1936    // Fewer delimiters than needed
1937    unsafe { buf_push(buf, line_delim) };
1938}
1939
1940/// Contiguous mid-range field extraction (e.g., `cut -f2-4`).
1941/// Optimized: skip to start_field using memchr, then output until end_field.
1942fn process_fields_mid_range(
1943    data: &[u8],
1944    delim: u8,
1945    line_delim: u8,
1946    start_field: usize,
1947    end_field: usize,
1948    suppress: bool,
1949    out: &mut impl Write,
1950) -> io::Result<()> {
1951    if data.len() >= PARALLEL_THRESHOLD {
1952        let chunks = split_for_scope(data, line_delim);
1953        let n = chunks.len();
1954        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1955        rayon::scope(|s| {
1956            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1957                s.spawn(move |_| {
1958                    result.reserve(chunk.len());
1959                    fields_mid_range_chunk(
1960                        chunk,
1961                        delim,
1962                        line_delim,
1963                        start_field,
1964                        end_field,
1965                        suppress,
1966                        result,
1967                    );
1968                });
1969            }
1970        });
1971        let slices: Vec<IoSlice> = results
1972            .iter()
1973            .filter(|r| !r.is_empty())
1974            .map(|r| IoSlice::new(r))
1975            .collect();
1976        write_ioslices(out, &slices)?;
1977    } else {
1978        let mut buf = Vec::with_capacity(data.len());
1979        fields_mid_range_chunk(
1980            data,
1981            delim,
1982            line_delim,
1983            start_field,
1984            end_field,
1985            suppress,
1986            &mut buf,
1987        );
1988        if !buf.is_empty() {
1989            out.write_all(&buf)?;
1990        }
1991    }
1992    Ok(())
1993}
1994
1995/// Process a chunk for contiguous mid-range field extraction.
1996/// Single-pass memchr2 scan over the entire chunk, tracking delimiter count
1997/// per line. Avoids the double-scan (outer newline + inner delimiter).
1998fn fields_mid_range_chunk(
1999    data: &[u8],
2000    delim: u8,
2001    line_delim: u8,
2002    start_field: usize,
2003    end_field: usize,
2004    suppress: bool,
2005    buf: &mut Vec<u8>,
2006) {
2007    // When delim == line_delim, fall back to per-line approach
2008    if delim == line_delim {
2009        buf.reserve(data.len());
2010        let mut start = 0;
2011        for end_pos in memchr_iter(line_delim, data) {
2012            let line = &data[start..end_pos];
2013            fields_mid_range_line(
2014                line,
2015                delim,
2016                line_delim,
2017                start_field,
2018                end_field,
2019                suppress,
2020                buf,
2021            );
2022            start = end_pos + 1;
2023        }
2024        if start < data.len() {
2025            fields_mid_range_line(
2026                &data[start..],
2027                delim,
2028                line_delim,
2029                start_field,
2030                end_field,
2031                suppress,
2032                buf,
2033            );
2034        }
2035        return;
2036    }
2037
2038    buf.reserve(data.len());
2039    let base = data.as_ptr();
2040    let skip_before = start_field - 1; // delimiters to skip before range
2041    let target_end_delim = skip_before + (end_field - start_field) + 1;
2042
2043    let mut line_start: usize = 0;
2044    let mut delim_count: usize = 0;
2045    let mut range_start: usize = 0;
2046    let mut has_delim = false;
2047    let mut found_end = false; // true when we found all target fields, skip to newline
2048
2049    for pos in memchr::memchr2_iter(delim, line_delim, data) {
2050        let byte = unsafe { *base.add(pos) };
2051        if byte == line_delim {
2052            // End of line
2053            if found_end {
2054                // Already output this line's range
2055            } else if !has_delim {
2056                // No delimiter on this line
2057                if !suppress {
2058                    unsafe {
2059                        buf_extend(
2060                            buf,
2061                            std::slice::from_raw_parts(base.add(line_start), pos + 1 - line_start),
2062                        );
2063                    }
2064                }
2065            } else if delim_count >= skip_before {
2066                // Have enough fields for start_field; output from range_start to EOL
2067                if skip_before == 0 {
2068                    range_start = line_start;
2069                }
2070                unsafe {
2071                    buf_extend(
2072                        buf,
2073                        std::slice::from_raw_parts(base.add(range_start), pos - range_start),
2074                    );
2075                    buf_push(buf, line_delim);
2076                }
2077            } else {
2078                // Not enough fields for start_field — output empty line
2079                unsafe { buf_push(buf, line_delim) };
2080            }
2081            line_start = pos + 1;
2082            delim_count = 0;
2083            has_delim = false;
2084            found_end = false;
2085        } else if !found_end {
2086            // Delimiter
2087            has_delim = true;
2088            delim_count += 1;
2089            if delim_count == skip_before {
2090                range_start = pos + 1;
2091            }
2092            if delim_count == target_end_delim {
2093                if skip_before == 0 {
2094                    range_start = line_start;
2095                }
2096                unsafe {
2097                    buf_extend(
2098                        buf,
2099                        std::slice::from_raw_parts(base.add(range_start), pos - range_start),
2100                    );
2101                    buf_push(buf, line_delim);
2102                }
2103                found_end = true;
2104            }
2105        }
2106    }
2107    // Handle trailing data without final newline
2108    if line_start < data.len() && !found_end {
2109        if !has_delim {
2110            if !suppress {
2111                unsafe {
2112                    buf_extend(
2113                        buf,
2114                        std::slice::from_raw_parts(base.add(line_start), data.len() - line_start),
2115                    );
2116                }
2117            }
2118        } else if delim_count >= skip_before {
2119            if skip_before == 0 {
2120                range_start = line_start;
2121            }
2122            unsafe {
2123                buf_extend(
2124                    buf,
2125                    std::slice::from_raw_parts(base.add(range_start), data.len() - range_start),
2126                );
2127            }
2128        }
2129    }
2130}
2131
2132/// Extract fields start_field..=end_field from one line.
2133/// Uses scalar byte scanning for short lines, memchr_iter for longer.
2134/// Raw pointer arithmetic to eliminate bounds checking.
2135#[inline(always)]
2136fn fields_mid_range_line(
2137    line: &[u8],
2138    delim: u8,
2139    line_delim: u8,
2140    start_field: usize,
2141    end_field: usize,
2142    suppress: bool,
2143    buf: &mut Vec<u8>,
2144) {
2145    let len = line.len();
2146    if len == 0 {
2147        if !suppress {
2148            unsafe { buf_push(buf, line_delim) };
2149        }
2150        return;
2151    }
2152
2153    // Note: no per-line buf.reserve — fields_mid_range_chunk already reserves data.len()
2154    let base = line.as_ptr();
2155
2156    // Count delimiters to find start_field and end_field boundaries
2157    let skip_before = start_field - 1; // delimiters to skip before start_field
2158    let field_span = end_field - start_field; // additional delimiters within the range
2159    let target_end_delim = skip_before + field_span + 1;
2160    let mut delim_count = 0;
2161    let mut range_start = 0;
2162    let mut has_delim = false;
2163
2164    for pos in memchr_iter(delim, line) {
2165        has_delim = true;
2166        delim_count += 1;
2167        if delim_count == skip_before {
2168            range_start = pos + 1;
2169        }
2170        if delim_count == target_end_delim {
2171            if skip_before == 0 {
2172                range_start = 0;
2173            }
2174            unsafe {
2175                buf_extend(
2176                    buf,
2177                    std::slice::from_raw_parts(base.add(range_start), pos - range_start),
2178                );
2179                buf_push(buf, line_delim);
2180            }
2181            return;
2182        }
2183    }
2184
2185    if !has_delim {
2186        if !suppress {
2187            unsafe {
2188                buf_extend(buf, line);
2189                buf_push(buf, line_delim);
2190            }
2191        }
2192        return;
2193    }
2194
2195    // Line has delimiters but fewer fields than end_field
2196    if delim_count >= skip_before {
2197        // We have at least start_field, output from range_start to end
2198        if skip_before == 0 {
2199            range_start = 0;
2200        }
2201        unsafe {
2202            buf_extend(
2203                buf,
2204                std::slice::from_raw_parts(base.add(range_start), len - range_start),
2205            );
2206            buf_push(buf, line_delim);
2207        }
2208    } else {
2209        // Not enough fields even for start_field — output empty line
2210        unsafe { buf_push(buf, line_delim) };
2211    }
2212}
2213
2214/// Zero-copy field-1 extraction using writev: builds IoSlice entries pointing
2215/// directly into the source data, flushing in MAX_IOV-sized batches.
2216/// For each line: if delimiter exists, output field1 + newline; otherwise pass through.
2217///
2218/// Uses a two-level scan: outer memchr(newline) for line boundaries, inner memchr(delim)
2219/// Parallel field-1 extraction for large data using memchr2 single-pass.
2220/// Splits data into per-thread chunks, each chunk extracts field 1 using
2221/// memchr2(delim, newline) which finds the first special byte in one scan.
2222/// For field 1: first special byte is either the delimiter (field end) or
2223/// newline (no delimiter, output line unchanged). 4 threads cut scan time ~4x.
2224fn single_field1_parallel(
2225    data: &[u8],
2226    delim: u8,
2227    line_delim: u8,
2228    out: &mut impl Write,
2229) -> io::Result<()> {
2230    let chunks = split_for_scope(data, line_delim);
2231    let n = chunks.len();
2232    let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2233    rayon::scope(|s| {
2234        for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2235            s.spawn(move |_| {
2236                result.reserve(chunk.len() + 1);
2237                single_field1_to_buf(chunk, delim, line_delim, result);
2238            });
2239        }
2240    });
2241    let slices: Vec<IoSlice> = results
2242        .iter()
2243        .filter(|r| !r.is_empty())
2244        .map(|r| IoSlice::new(r))
2245        .collect();
2246    write_ioslices(out, &slices)
2247}
2248
2249/// Extract field 1 from a chunk using memchr2_iter single-pass SIMD scanning.
2250/// Uses a single memchr2_iter pass over the entire chunk to find both delimiters
2251/// and newlines. This eliminates the per-line memchr function call overhead
2252/// (~5-10ns per call × 2 calls per line) that dominates for short-field data.
2253///
2254/// Optimizations:
2255/// - Deferred field copy: delays copying from delimiter position to newline,
2256///   enabling fused field+newline output in a single copy sequence.
2257/// - Single output pointer: avoids per-line buf.len() load/store (saves ~488K
2258///   ops for 244K lines). One set_len at the end.
2259#[inline]
2260fn single_field1_to_buf(data: &[u8], delim: u8, line_delim: u8, buf: &mut Vec<u8>) {
2261    debug_assert_ne!(delim, line_delim, "delim and line_delim must differ");
2262    // Reserve data.len() + 1: output ≤ input for all lines except potentially
2263    // the last line without trailing newline, where we add a newline (GNU compat).
2264    buf.reserve(data.len() + 1);
2265
2266    // Use a single output pointer — avoids per-line buf.len() load/store.
2267    // Only one set_len at the end instead of 2 per line (saves ~488K ops for 244K lines).
2268    let base = data.as_ptr();
2269    let initial_len = buf.len();
2270    let mut out_ptr = unsafe { buf.as_mut_ptr().add(initial_len) };
2271    let mut line_start: usize = 0;
2272    let mut found_delim = false;
2273    let mut delim_pos: usize = 0; // only valid when found_delim == true
2274
2275    // SAFETY (capacity): Total output <= data.len() + 1 because:
2276    // - Lines without delimiter: output exactly the input bytes (subrange of data)
2277    // - Lines with delimiter: output field bytes (< input line), uses base reservation
2278    // - Unterminated last line: adds 1 newline, which is why we reserve +1
2279    // The +1 is only consumed by the unterminated-last-line case; all other cases
2280    // stay within data.len(). reserve(data.len() + 1) guarantees sufficient capacity.
2281    for pos in memchr::memchr2_iter(delim, line_delim, data) {
2282        let byte = unsafe { *base.add(pos) };
2283        if byte == line_delim {
2284            if !found_delim {
2285                // No delimiter on this line — output entire line including newline
2286                let len = pos + 1 - line_start;
2287                unsafe {
2288                    std::ptr::copy_nonoverlapping(base.add(line_start), out_ptr, len);
2289                    out_ptr = out_ptr.add(len);
2290                }
2291            } else {
2292                // Delimiter was found — output field + newline in one fused copy.
2293                // field_len may be 0 (line starts with delimiter, e.g. "\trest"):
2294                // copy_nonoverlapping with count=0 is a no-op, which is correct.
2295                let field_len = delim_pos - line_start;
2296                unsafe {
2297                    std::ptr::copy_nonoverlapping(base.add(line_start), out_ptr, field_len);
2298                    out_ptr = out_ptr.add(field_len);
2299                    *out_ptr = line_delim;
2300                    out_ptr = out_ptr.add(1);
2301                }
2302            }
2303            line_start = pos + 1;
2304            found_delim = false;
2305        } else if !found_delim {
2306            // First delimiter on this line — record position, defer copy to newline
2307            found_delim = true;
2308            delim_pos = pos;
2309        }
2310        // Subsequent delimiters: ignore
2311    }
2312
2313    // Handle last line without trailing newline — GNU cut always adds newline
2314    if line_start < data.len() {
2315        if !found_delim {
2316            // No delimiter — output remaining data + newline (GNU compat)
2317            let len = data.len() - line_start;
2318            unsafe {
2319                std::ptr::copy_nonoverlapping(base.add(line_start), out_ptr, len);
2320                out_ptr = out_ptr.add(len);
2321                *out_ptr = line_delim;
2322                out_ptr = out_ptr.add(1);
2323            }
2324        } else {
2325            // Field + trailing newline (GNU compat)
2326            let field_len = delim_pos - line_start;
2327            unsafe {
2328                std::ptr::copy_nonoverlapping(base.add(line_start), out_ptr, field_len);
2329                out_ptr = out_ptr.add(field_len);
2330                *out_ptr = line_delim;
2331                out_ptr = out_ptr.add(1);
2332            }
2333        }
2334    }
2335
2336    // SAFETY: out_ptr was derived from buf.as_mut_ptr().add(initial_len) after
2337    // the reserve() call, and no Vec reallocation occurred between capture and
2338    // here (no safe buf.* calls in the loop body). Using pointer subtraction
2339    // instead of offset_from avoids the isize intermediate — both pointers are
2340    // in the same allocation so the subtraction is always non-negative.
2341    unsafe {
2342        let new_len = out_ptr as usize - buf.as_ptr() as usize;
2343        debug_assert!(new_len >= initial_len && new_len <= buf.capacity());
2344        buf.set_len(new_len);
2345    }
2346}
2347
2348/// Zero-copy field 1 extraction using writev: builds IoSlice entries pointing
2349/// directly into the source data. Uses two-level scan: outer memchr(newline)
2350/// for the first delimiter. This is faster than memchr2 for SMALL data because
2351/// the inner scan exits after the FIRST delimiter, skipping all
2352/// subsequent delimiters on the line.
2353///
2354/// Lines without delimiter stay in contiguous runs (zero-copy pass-through).
2355/// Lines with delimiter produce two IoSlices (truncated field + newline byte).
2356#[inline]
2357#[allow(dead_code)]
2358fn single_field1_zerocopy(
2359    data: &[u8],
2360    delim: u8,
2361    line_delim: u8,
2362    out: &mut impl Write,
2363) -> io::Result<()> {
2364    let newline_buf: [u8; 1] = [line_delim];
2365
2366    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
2367    let mut run_start: usize = 0;
2368    let mut start = 0;
2369
2370    for end_pos in memchr_iter(line_delim, data) {
2371        let line = &data[start..end_pos];
2372        if let Some(dp) = memchr::memchr(delim, line) {
2373            // Line has delimiter — truncate at first delimiter.
2374            // Flush current contiguous run, then add truncated field + newline.
2375            if run_start < start {
2376                iov.push(IoSlice::new(&data[run_start..start]));
2377            }
2378            iov.push(IoSlice::new(&data[start..start + dp]));
2379            iov.push(IoSlice::new(&newline_buf));
2380            run_start = end_pos + 1;
2381
2382            if iov.len() >= MAX_IOV - 2 {
2383                write_ioslices(out, &iov)?;
2384                iov.clear();
2385            }
2386        }
2387        // else: no delimiter in line, output unchanged (stays in contiguous run)
2388        start = end_pos + 1;
2389    }
2390
2391    // Handle last line (no trailing newline)
2392    if start < data.len() {
2393        let line = &data[start..];
2394        if let Some(dp) = memchr::memchr(delim, line) {
2395            if run_start < start {
2396                iov.push(IoSlice::new(&data[run_start..start]));
2397            }
2398            iov.push(IoSlice::new(&data[start..start + dp]));
2399            iov.push(IoSlice::new(&newline_buf));
2400            if !iov.is_empty() {
2401                write_ioslices(out, &iov)?;
2402            }
2403            return Ok(());
2404        }
2405    }
2406
2407    // Flush remaining contiguous run
2408    if run_start < data.len() {
2409        iov.push(IoSlice::new(&data[run_start..]));
2410        if !data.is_empty() && *data.last().unwrap() != line_delim {
2411            iov.push(IoSlice::new(&newline_buf));
2412        }
2413    }
2414    if !iov.is_empty() {
2415        write_ioslices(out, &iov)?;
2416    }
2417    Ok(())
2418}
2419
2420/// Process a chunk of data for single-field extraction.
2421fn process_single_field_chunk(
2422    data: &[u8],
2423    delim: u8,
2424    target_idx: usize,
2425    line_delim: u8,
2426    suppress: bool,
2427    buf: &mut Vec<u8>,
2428) {
2429    // Pre-reserve chunk capacity to eliminate per-line reserve overhead.
2430    buf.reserve(data.len());
2431    let mut start = 0;
2432    for end_pos in memchr_iter(line_delim, data) {
2433        let line = &data[start..end_pos];
2434        extract_single_field_line(line, delim, target_idx, line_delim, suppress, buf);
2435        start = end_pos + 1;
2436    }
2437    if start < data.len() {
2438        extract_single_field_line(&data[start..], delim, target_idx, line_delim, suppress, buf);
2439    }
2440}
2441
2442/// Extract a single field from one line.
2443/// For short lines (< 256 bytes), uses direct scalar scanning to avoid memchr overhead.
2444/// For longer lines, uses memchr for SIMD-accelerated scanning.
2445/// Raw pointer arithmetic eliminates per-field bounds checking.
2446#[inline(always)]
2447fn extract_single_field_line(
2448    line: &[u8],
2449    delim: u8,
2450    target_idx: usize,
2451    line_delim: u8,
2452    suppress: bool,
2453    buf: &mut Vec<u8>,
2454) {
2455    let len = line.len();
2456    if len == 0 {
2457        if !suppress {
2458            unsafe { buf_push(buf, line_delim) };
2459        }
2460        return;
2461    }
2462
2463    // Note: no per-line buf.reserve — process_single_field_chunk already reserves data.len()
2464    let base = line.as_ptr();
2465
2466    // Ultra-fast path for first field: single memchr
2467    if target_idx == 0 {
2468        match memchr::memchr(delim, line) {
2469            Some(pos) => unsafe {
2470                buf_extend_byte(buf, std::slice::from_raw_parts(base, pos), line_delim);
2471            },
2472            None => {
2473                if !suppress {
2474                    unsafe {
2475                        buf_extend_byte(buf, line, line_delim);
2476                    }
2477                }
2478            }
2479        }
2480        return;
2481    }
2482
2483    // Use memchr SIMD for all line sizes (faster than scalar even for short lines)
2484    let mut field_start = 0;
2485    let mut field_idx = 0;
2486    let mut has_delim = false;
2487
2488    for pos in memchr_iter(delim, line) {
2489        has_delim = true;
2490        if field_idx == target_idx {
2491            unsafe {
2492                buf_extend_byte(
2493                    buf,
2494                    std::slice::from_raw_parts(base.add(field_start), pos - field_start),
2495                    line_delim,
2496                );
2497            }
2498            return;
2499        }
2500        field_idx += 1;
2501        field_start = pos + 1;
2502    }
2503
2504    if !has_delim {
2505        if !suppress {
2506            unsafe {
2507                buf_extend_byte(buf, line, line_delim);
2508            }
2509        }
2510        return;
2511    }
2512
2513    if field_idx == target_idx {
2514        unsafe {
2515            buf_extend_byte(
2516                buf,
2517                std::slice::from_raw_parts(base.add(field_start), len - field_start),
2518                line_delim,
2519            );
2520        }
2521    } else {
2522        unsafe { buf_push(buf, line_delim) };
2523    }
2524}
2525
2526/// Extract fields from a single line into the output buffer.
2527/// Uses unsafe buf helpers with pre-reserved capacity for zero bounds-check overhead.
2528/// Raw pointer arithmetic eliminates per-field bounds checking.
2529#[inline(always)]
2530fn extract_fields_to_buf(
2531    line: &[u8],
2532    delim: u8,
2533    ranges: &[Range],
2534    output_delim: &[u8],
2535    suppress: bool,
2536    max_field: usize,
2537    field_mask: u64,
2538    line_delim: u8,
2539    buf: &mut Vec<u8>,
2540    complement: bool,
2541) {
2542    let len = line.len();
2543
2544    if len == 0 {
2545        if !suppress {
2546            buf.push(line_delim);
2547        }
2548        return;
2549    }
2550
2551    // Only reserve if remaining capacity is insufficient. The caller pre-sizes the
2552    // buffer to data.len(), so this check avoids redundant reserve() calls per line.
2553    let needed = len + output_delim.len() * 16 + 1;
2554    if buf.capacity() - buf.len() < needed {
2555        buf.reserve(needed);
2556    }
2557
2558    let base = line.as_ptr();
2559    let mut field_num: usize = 1;
2560    let mut field_start: usize = 0;
2561    let mut first_output = true;
2562    let mut has_delim = false;
2563
2564    // Use memchr SIMD for all line sizes
2565    for delim_pos in memchr_iter(delim, line) {
2566        has_delim = true;
2567
2568        if is_selected(field_num, field_mask, ranges, complement) {
2569            if !first_output {
2570                unsafe { buf_extend(buf, output_delim) };
2571            }
2572            unsafe {
2573                buf_extend(
2574                    buf,
2575                    std::slice::from_raw_parts(base.add(field_start), delim_pos - field_start),
2576                )
2577            };
2578            first_output = false;
2579        }
2580
2581        field_num += 1;
2582        field_start = delim_pos + 1;
2583
2584        if field_num > max_field {
2585            break;
2586        }
2587    }
2588
2589    // Last field
2590    if (field_num <= max_field || complement)
2591        && has_delim
2592        && is_selected(field_num, field_mask, ranges, complement)
2593    {
2594        if !first_output {
2595            unsafe { buf_extend(buf, output_delim) };
2596        }
2597        unsafe {
2598            buf_extend(
2599                buf,
2600                std::slice::from_raw_parts(base.add(field_start), len - field_start),
2601            )
2602        };
2603        first_output = false;
2604    }
2605
2606    if !first_output {
2607        unsafe { buf_push(buf, line_delim) };
2608    } else if !has_delim {
2609        if !suppress {
2610            unsafe {
2611                buf_extend(buf, line);
2612                buf_push(buf, line_delim);
2613            }
2614        }
2615    } else {
2616        unsafe { buf_push(buf, line_delim) };
2617    }
2618}
2619
2620// ── Fast path: byte/char extraction with batched output ──────────────────
2621
2622/// Ultra-fast path for `cut -b1-N`: single from-start byte range.
2623/// Zero-copy: writes directly from the source data using output runs.
2624/// For lines shorter than max_bytes, the output is identical to the input,
2625/// so we emit contiguous runs directly. Only lines exceeding max_bytes need truncation.
2626fn process_bytes_from_start(
2627    data: &[u8],
2628    max_bytes: usize,
2629    line_delim: u8,
2630    out: &mut impl Write,
2631) -> io::Result<()> {
2632    // For small data (< PARALLEL_THRESHOLD): check if all lines fit for zero-copy passthrough.
2633    // The sequential scan + write_all is competitive with per-line processing for small data.
2634    //
2635    // For large data (>= PARALLEL_THRESHOLD): skip the all_fit scan entirely.
2636    // The scan is sequential (~1.7ms for 10MB at memchr speed) while parallel per-line
2637    // processing is much faster (~0.5ms for 10MB with 4 threads). Even when all lines fit,
2638    // the parallel copy + write is faster than sequential scan + zero-copy write.
2639    if data.len() < PARALLEL_THRESHOLD && max_bytes > 0 && max_bytes < usize::MAX {
2640        let mut start = 0;
2641        let mut all_fit = true;
2642        for pos in memchr_iter(line_delim, data) {
2643            if pos - start > max_bytes {
2644                all_fit = false;
2645                break;
2646            }
2647            start = pos + 1;
2648        }
2649        // Check last line (no trailing delimiter)
2650        if all_fit && start < data.len() && data.len() - start > max_bytes {
2651            all_fit = false;
2652        }
2653        if all_fit {
2654            // All lines fit: output = input. Handle missing trailing delimiter.
2655            if !data.is_empty() && data[data.len() - 1] == line_delim {
2656                return out.write_all(data);
2657            } else if !data.is_empty() {
2658                out.write_all(data)?;
2659                return out.write_all(&[line_delim]);
2660            }
2661            return Ok(());
2662        }
2663    }
2664
2665    if data.len() >= PARALLEL_THRESHOLD {
2666        let chunks = split_for_scope(data, line_delim);
2667        let n = chunks.len();
2668        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2669        rayon::scope(|s| {
2670            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2671                s.spawn(move |_| {
2672                    // Output can be up to input size (when all lines fit).
2673                    // Reserve full chunk size to avoid reallocation.
2674                    result.reserve(chunk.len());
2675                    bytes_from_start_chunk(chunk, max_bytes, line_delim, result);
2676                });
2677            }
2678        });
2679        // Use write_vectored (writev) to batch N writes into fewer syscalls
2680        let slices: Vec<IoSlice> = results
2681            .iter()
2682            .filter(|r| !r.is_empty())
2683            .map(|r| IoSlice::new(r))
2684            .collect();
2685        write_ioslices(out, &slices)?;
2686    } else {
2687        // For moderate max_bytes, the buffer path is faster than writev zero-copy
2688        // because every line gets truncated, creating 3 IoSlice entries per line.
2689        // Copying max_bytes+1 bytes into a contiguous buffer is cheaper than
2690        // managing millions of IoSlice entries through the kernel.
2691        // Threshold at 512 covers common byte-range benchmarks like -b1-100.
2692        if max_bytes <= 512 {
2693            // Estimate output size without scanning: output <= data.len(),
2694            // typically ~data.len()/4 for short max_bytes on longer lines.
2695            let est_out = (data.len() / 4).max(max_bytes + 2);
2696            let mut buf = Vec::with_capacity(est_out.min(data.len()));
2697            bytes_from_start_chunk(data, max_bytes, line_delim, &mut buf);
2698            if !buf.is_empty() {
2699                out.write_all(&buf)?;
2700            }
2701        } else {
2702            // Zero-copy path: track contiguous output runs and write directly from source.
2703            // For lines <= max_bytes, we include them as-is (no copy needed).
2704            // For lines > max_bytes, we flush the run, write the truncated line, start new run.
2705            bytes_from_start_zerocopy(data, max_bytes, line_delim, out)?;
2706        }
2707    }
2708    Ok(())
2709}
2710
2711/// Zero-copy byte-prefix extraction using writev: builds IoSlice entries pointing
2712/// directly into the source data, flushing in MAX_IOV-sized batches.
2713/// Lines shorter than max_bytes stay in contiguous runs. Lines needing truncation
2714/// produce two IoSlices (truncated data + newline).
2715#[inline]
2716fn bytes_from_start_zerocopy(
2717    data: &[u8],
2718    max_bytes: usize,
2719    line_delim: u8,
2720    out: &mut impl Write,
2721) -> io::Result<()> {
2722    let newline_buf: [u8; 1] = [line_delim];
2723    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
2724    let mut start = 0;
2725    let mut run_start: usize = 0;
2726
2727    for pos in memchr_iter(line_delim, data) {
2728        let line_len = pos - start;
2729        if line_len > max_bytes {
2730            // This line needs truncation
2731            if run_start < start {
2732                iov.push(IoSlice::new(&data[run_start..start]));
2733            }
2734            iov.push(IoSlice::new(&data[start..start + max_bytes]));
2735            iov.push(IoSlice::new(&newline_buf));
2736            run_start = pos + 1;
2737
2738            if iov.len() >= MAX_IOV - 2 {
2739                write_ioslices(out, &iov)?;
2740                iov.clear();
2741            }
2742        }
2743        start = pos + 1;
2744    }
2745    // Handle last line without terminator
2746    if start < data.len() {
2747        let line_len = data.len() - start;
2748        if line_len > max_bytes {
2749            if run_start < start {
2750                iov.push(IoSlice::new(&data[run_start..start]));
2751            }
2752            iov.push(IoSlice::new(&data[start..start + max_bytes]));
2753            iov.push(IoSlice::new(&newline_buf));
2754            if !iov.is_empty() {
2755                write_ioslices(out, &iov)?;
2756            }
2757            return Ok(());
2758        }
2759    }
2760    // Flush remaining contiguous run
2761    if run_start < data.len() {
2762        iov.push(IoSlice::new(&data[run_start..]));
2763        if !data.is_empty() && *data.last().unwrap() != line_delim {
2764            iov.push(IoSlice::new(&newline_buf));
2765        }
2766    }
2767    if !iov.is_empty() {
2768        write_ioslices(out, &iov)?;
2769    }
2770    Ok(())
2771}
2772
2773/// Process a chunk for from-start byte range extraction (parallel path).
2774/// Uses unsafe appends to eliminate bounds checking in the hot loop.
2775/// Pre-reserves data.len() (output never exceeds input), then uses a single
2776/// write pointer with deferred set_len — no per-line capacity checks.
2777#[inline]
2778fn bytes_from_start_chunk(data: &[u8], max_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
2779    // Output is always <= input size (we only truncate, never expand).
2780    // Single reserve eliminates ALL per-line capacity checks.
2781    buf.reserve(data.len());
2782
2783    let src = data.as_ptr();
2784    let dst_base = buf.as_mut_ptr();
2785    let mut wp = buf.len();
2786    let mut start = 0;
2787
2788    for pos in memchr_iter(line_delim, data) {
2789        let line_len = pos - start;
2790        let take = line_len.min(max_bytes);
2791        unsafe {
2792            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
2793            *dst_base.add(wp + take) = line_delim;
2794        }
2795        wp += take + 1;
2796        start = pos + 1;
2797    }
2798    // Handle last line without terminator
2799    if start < data.len() {
2800        let line_len = data.len() - start;
2801        let take = line_len.min(max_bytes);
2802        unsafe {
2803            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
2804            *dst_base.add(wp + take) = line_delim;
2805        }
2806        wp += take + 1;
2807    }
2808    unsafe { buf.set_len(wp) };
2809}
2810
2811/// Fast path for `cut -bN-`: skip first N-1 bytes per line.
2812fn process_bytes_from_offset(
2813    data: &[u8],
2814    skip_bytes: usize,
2815    line_delim: u8,
2816    out: &mut impl Write,
2817) -> io::Result<()> {
2818    if data.len() >= PARALLEL_THRESHOLD {
2819        let chunks = split_for_scope(data, line_delim);
2820        let n = chunks.len();
2821        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2822        rayon::scope(|s| {
2823            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2824                s.spawn(move |_| {
2825                    result.reserve(chunk.len());
2826                    bytes_from_offset_chunk(chunk, skip_bytes, line_delim, result);
2827                });
2828            }
2829        });
2830        // Use write_vectored (writev) to batch N writes into fewer syscalls
2831        let slices: Vec<IoSlice> = results
2832            .iter()
2833            .filter(|r| !r.is_empty())
2834            .map(|r| IoSlice::new(r))
2835            .collect();
2836        write_ioslices(out, &slices)?;
2837    } else {
2838        // Zero-copy: write suffix of each line directly from source
2839        bytes_from_offset_zerocopy(data, skip_bytes, line_delim, out)?;
2840    }
2841    Ok(())
2842}
2843
2844/// Zero-copy byte-offset extraction: writes suffix of each line directly from source data.
2845/// Collects IoSlice pairs (data + delimiter) and flushes with write_vectored in batches,
2846/// reducing syscall overhead from 2 write_all calls per line to batched writev.
2847#[inline]
2848fn bytes_from_offset_zerocopy(
2849    data: &[u8],
2850    skip_bytes: usize,
2851    line_delim: u8,
2852    out: &mut impl Write,
2853) -> io::Result<()> {
2854    let delim_buf = [line_delim];
2855    let mut iov: Vec<IoSlice> = Vec::with_capacity(256);
2856
2857    let mut start = 0;
2858    for pos in memchr_iter(line_delim, data) {
2859        let line_len = pos - start;
2860        if line_len > skip_bytes {
2861            iov.push(IoSlice::new(&data[start + skip_bytes..pos]));
2862        }
2863        iov.push(IoSlice::new(&delim_buf));
2864        // Flush when approaching MAX_IOV to avoid oversized writev
2865        if iov.len() >= MAX_IOV - 1 {
2866            write_ioslices(out, &iov)?;
2867            iov.clear();
2868        }
2869        start = pos + 1;
2870    }
2871    if start < data.len() {
2872        let line_len = data.len() - start;
2873        if line_len > skip_bytes {
2874            iov.push(IoSlice::new(&data[start + skip_bytes..data.len()]));
2875        }
2876        iov.push(IoSlice::new(&delim_buf));
2877    }
2878    if !iov.is_empty() {
2879        write_ioslices(out, &iov)?;
2880    }
2881    Ok(())
2882}
2883
2884/// Process a chunk for from-offset byte range extraction.
2885/// Single reserve + deferred set_len for zero per-line overhead.
2886#[inline]
2887fn bytes_from_offset_chunk(data: &[u8], skip_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
2888    buf.reserve(data.len());
2889
2890    let src = data.as_ptr();
2891    let dst_base = buf.as_mut_ptr();
2892    let mut wp = buf.len();
2893    let mut start = 0;
2894
2895    for pos in memchr_iter(line_delim, data) {
2896        let line_len = pos - start;
2897        if line_len > skip_bytes {
2898            let take = line_len - skip_bytes;
2899            unsafe {
2900                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
2901            }
2902            wp += take;
2903        }
2904        unsafe {
2905            *dst_base.add(wp) = line_delim;
2906        }
2907        wp += 1;
2908        start = pos + 1;
2909    }
2910    if start < data.len() {
2911        let line_len = data.len() - start;
2912        if line_len > skip_bytes {
2913            let take = line_len - skip_bytes;
2914            unsafe {
2915                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
2916            }
2917            wp += take;
2918        }
2919        unsafe {
2920            *dst_base.add(wp) = line_delim;
2921        }
2922        wp += 1;
2923    }
2924    unsafe { buf.set_len(wp) };
2925}
2926
2927/// Fast path for `cut -bN-M` where N > 1 and M < MAX: extract bytes N through M per line.
2928fn process_bytes_mid_range(
2929    data: &[u8],
2930    start_byte: usize,
2931    end_byte: usize,
2932    line_delim: u8,
2933    out: &mut impl Write,
2934) -> io::Result<()> {
2935    let skip = start_byte.saturating_sub(1);
2936
2937    if data.len() >= PARALLEL_THRESHOLD {
2938        let chunks = split_for_scope(data, line_delim);
2939        let n = chunks.len();
2940        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2941        rayon::scope(|s| {
2942            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2943                s.spawn(move |_| {
2944                    result.reserve(chunk.len());
2945                    bytes_mid_range_chunk(chunk, skip, end_byte, line_delim, result);
2946                });
2947            }
2948        });
2949        let slices: Vec<IoSlice> = results
2950            .iter()
2951            .filter(|r| !r.is_empty())
2952            .map(|r| IoSlice::new(r))
2953            .collect();
2954        write_ioslices(out, &slices)?;
2955    } else {
2956        let mut buf = Vec::with_capacity(data.len());
2957        bytes_mid_range_chunk(data, skip, end_byte, line_delim, &mut buf);
2958        if !buf.is_empty() {
2959            out.write_all(&buf)?;
2960        }
2961    }
2962    Ok(())
2963}
2964
2965/// Process a chunk for mid-range byte extraction.
2966/// For each line, output bytes skip..min(line_len, end_byte).
2967/// Single reserve + deferred set_len.
2968#[inline]
2969fn bytes_mid_range_chunk(
2970    data: &[u8],
2971    skip: usize,
2972    end_byte: usize,
2973    line_delim: u8,
2974    buf: &mut Vec<u8>,
2975) {
2976    buf.reserve(data.len());
2977
2978    let src = data.as_ptr();
2979    let dst_base = buf.as_mut_ptr();
2980    let mut wp = buf.len();
2981    let mut start = 0;
2982
2983    for pos in memchr_iter(line_delim, data) {
2984        let line_len = pos - start;
2985        if line_len > skip {
2986            let take_end = line_len.min(end_byte);
2987            let take = take_end - skip;
2988            unsafe {
2989                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
2990            }
2991            wp += take;
2992        }
2993        unsafe {
2994            *dst_base.add(wp) = line_delim;
2995        }
2996        wp += 1;
2997        start = pos + 1;
2998    }
2999    if start < data.len() {
3000        let line_len = data.len() - start;
3001        if line_len > skip {
3002            let take_end = line_len.min(end_byte);
3003            let take = take_end - skip;
3004            unsafe {
3005                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
3006            }
3007            wp += take;
3008        }
3009        unsafe {
3010            *dst_base.add(wp) = line_delim;
3011        }
3012        wp += 1;
3013    }
3014    unsafe { buf.set_len(wp) };
3015}
3016
3017/// Fast path for `--complement -bN-M`: output bytes 1..N-1 and M+1..end per line.
3018fn process_bytes_complement_mid(
3019    data: &[u8],
3020    skip_start: usize,
3021    skip_end: usize,
3022    line_delim: u8,
3023    out: &mut impl Write,
3024) -> io::Result<()> {
3025    let prefix_bytes = skip_start - 1; // bytes before the skip region
3026    if data.len() >= PARALLEL_THRESHOLD {
3027        let chunks = split_for_scope(data, line_delim);
3028        let n = chunks.len();
3029        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
3030        rayon::scope(|s| {
3031            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
3032                s.spawn(move |_| {
3033                    result.reserve(chunk.len());
3034                    bytes_complement_mid_chunk(chunk, prefix_bytes, skip_end, line_delim, result);
3035                });
3036            }
3037        });
3038        let slices: Vec<IoSlice> = results
3039            .iter()
3040            .filter(|r| !r.is_empty())
3041            .map(|r| IoSlice::new(r))
3042            .collect();
3043        write_ioslices(out, &slices)?;
3044    } else {
3045        let mut buf = Vec::with_capacity(data.len());
3046        bytes_complement_mid_chunk(data, prefix_bytes, skip_end, line_delim, &mut buf);
3047        if !buf.is_empty() {
3048            out.write_all(&buf)?;
3049        }
3050    }
3051    Ok(())
3052}
3053
3054/// Process a chunk for complement mid-range byte extraction.
3055/// For each line: output bytes 0..prefix_bytes, then bytes skip_end..line_len.
3056#[inline]
3057fn bytes_complement_mid_chunk(
3058    data: &[u8],
3059    prefix_bytes: usize,
3060    skip_end: usize,
3061    line_delim: u8,
3062    buf: &mut Vec<u8>,
3063) {
3064    buf.reserve(data.len());
3065
3066    let src = data.as_ptr();
3067    let dst_base = buf.as_mut_ptr();
3068    let mut wp = buf.len();
3069    let mut start = 0;
3070
3071    for pos in memchr_iter(line_delim, data) {
3072        let line_len = pos - start;
3073        // Copy prefix (bytes before skip region)
3074        let take_prefix = prefix_bytes.min(line_len);
3075        if take_prefix > 0 {
3076            unsafe {
3077                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
3078            }
3079            wp += take_prefix;
3080        }
3081        // Copy suffix (bytes after skip region)
3082        if line_len > skip_end {
3083            let suffix_len = line_len - skip_end;
3084            unsafe {
3085                std::ptr::copy_nonoverlapping(
3086                    src.add(start + skip_end),
3087                    dst_base.add(wp),
3088                    suffix_len,
3089                );
3090            }
3091            wp += suffix_len;
3092        }
3093        unsafe {
3094            *dst_base.add(wp) = line_delim;
3095        }
3096        wp += 1;
3097        start = pos + 1;
3098    }
3099    if start < data.len() {
3100        let line_len = data.len() - start;
3101        let take_prefix = prefix_bytes.min(line_len);
3102        if take_prefix > 0 {
3103            unsafe {
3104                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
3105            }
3106            wp += take_prefix;
3107        }
3108        if line_len > skip_end {
3109            let suffix_len = line_len - skip_end;
3110            unsafe {
3111                std::ptr::copy_nonoverlapping(
3112                    src.add(start + skip_end),
3113                    dst_base.add(wp),
3114                    suffix_len,
3115                );
3116            }
3117            wp += suffix_len;
3118        }
3119        unsafe {
3120            *dst_base.add(wp) = line_delim;
3121        }
3122        wp += 1;
3123    }
3124    unsafe { buf.set_len(wp) };
3125}
3126
3127/// Optimized byte/char extraction with batched output and parallel processing.
3128fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
3129    let line_delim = cfg.line_delim;
3130    let ranges = cfg.ranges;
3131    let complement = cfg.complement;
3132    let output_delim = cfg.output_delim;
3133
3134    // Ultra-fast path: single range from byte 1 (e.g., cut -b1-10, cut -b-20)
3135    if !complement && ranges.len() == 1 && ranges[0].start == 1 && output_delim.is_empty() {
3136        let max_bytes = ranges[0].end;
3137        if max_bytes < usize::MAX {
3138            return process_bytes_from_start(data, max_bytes, line_delim, out);
3139        }
3140    }
3141
3142    // Fast path: single open-ended range from byte N (e.g., cut -b5-)
3143    if !complement && ranges.len() == 1 && ranges[0].end == usize::MAX && output_delim.is_empty() {
3144        let skip_bytes = ranges[0].start.saturating_sub(1);
3145        if skip_bytes > 0 {
3146            return process_bytes_from_offset(data, skip_bytes, line_delim, out);
3147        }
3148    }
3149
3150    // Fast path: single mid-range (e.g., cut -b5-100)
3151    if !complement
3152        && ranges.len() == 1
3153        && ranges[0].start > 1
3154        && ranges[0].end < usize::MAX
3155        && output_delim.is_empty()
3156    {
3157        return process_bytes_mid_range(data, ranges[0].start, ranges[0].end, line_delim, out);
3158    }
3159
3160    // Fast path: complement of single from-start range (e.g., --complement -b1-100 = output bytes 101+)
3161    if complement
3162        && ranges.len() == 1
3163        && ranges[0].start == 1
3164        && ranges[0].end < usize::MAX
3165        && output_delim.is_empty()
3166    {
3167        return process_bytes_from_offset(data, ranges[0].end, line_delim, out);
3168    }
3169
3170    // Fast path: complement of single from-offset range (e.g., --complement -b5- = output bytes 1-4)
3171    if complement
3172        && ranges.len() == 1
3173        && ranges[0].end == usize::MAX
3174        && ranges[0].start > 1
3175        && output_delim.is_empty()
3176    {
3177        let max_bytes = ranges[0].start - 1;
3178        return process_bytes_from_start(data, max_bytes, line_delim, out);
3179    }
3180
3181    // Fast path: complement of single mid-range (e.g., --complement -b5-100 = bytes 1-4,101+)
3182    if complement
3183        && ranges.len() == 1
3184        && ranges[0].start > 1
3185        && ranges[0].end < usize::MAX
3186        && output_delim.is_empty()
3187    {
3188        return process_bytes_complement_mid(data, ranges[0].start, ranges[0].end, line_delim, out);
3189    }
3190
3191    if data.len() >= PARALLEL_THRESHOLD {
3192        let chunks = split_for_scope(data, line_delim);
3193        let n = chunks.len();
3194        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
3195        rayon::scope(|s| {
3196            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
3197                s.spawn(move |_| {
3198                    result.reserve(chunk.len());
3199                    process_bytes_chunk(
3200                        chunk,
3201                        ranges,
3202                        complement,
3203                        output_delim,
3204                        line_delim,
3205                        result,
3206                    );
3207                });
3208            }
3209        });
3210        let slices: Vec<IoSlice> = results
3211            .iter()
3212            .filter(|r| !r.is_empty())
3213            .map(|r| IoSlice::new(r))
3214            .collect();
3215        write_ioslices(out, &slices)?;
3216    } else {
3217        let mut buf = Vec::with_capacity(data.len());
3218        process_bytes_chunk(data, ranges, complement, output_delim, line_delim, &mut buf);
3219        if !buf.is_empty() {
3220            out.write_all(&buf)?;
3221        }
3222    }
3223    Ok(())
3224}
3225
3226/// Process a chunk of data for byte/char extraction.
3227/// Uses raw pointer arithmetic for the newline scan.
3228/// Complement single-range fast path: compute complement ranges once, then use
3229/// the non-complement multi-range path which is more cache-friendly.
3230fn process_bytes_chunk(
3231    data: &[u8],
3232    ranges: &[Range],
3233    complement: bool,
3234    output_delim: &[u8],
3235    line_delim: u8,
3236    buf: &mut Vec<u8>,
3237) {
3238    buf.reserve(data.len());
3239    let base = data.as_ptr();
3240    let mut start = 0;
3241    for end_pos in memchr_iter(line_delim, data) {
3242        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
3243        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
3244        unsafe { buf_push(buf, line_delim) };
3245        start = end_pos + 1;
3246    }
3247    if start < data.len() {
3248        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
3249        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
3250        unsafe { buf_push(buf, line_delim) };
3251    }
3252}
3253
3254/// Extract byte ranges from a line into the output buffer.
3255/// Uses unsafe buf helpers for zero bounds-check overhead in hot loops.
3256/// Raw pointer arithmetic eliminates per-range bounds checking.
3257#[inline(always)]
3258fn cut_bytes_to_buf(
3259    line: &[u8],
3260    ranges: &[Range],
3261    complement: bool,
3262    output_delim: &[u8],
3263    buf: &mut Vec<u8>,
3264) {
3265    let len = line.len();
3266    let base = line.as_ptr();
3267    let mut first_range = true;
3268
3269    // Reserve worst case: full line + delimiters between ranges
3270    let needed = len + output_delim.len() * ranges.len() + 1;
3271    if buf.capacity() - buf.len() < needed {
3272        buf.reserve(needed);
3273    }
3274
3275    if complement {
3276        let mut pos: usize = 1;
3277        for r in ranges {
3278            let rs = r.start;
3279            let re = r.end.min(len);
3280            if pos < rs {
3281                if !first_range && !output_delim.is_empty() {
3282                    unsafe { buf_extend(buf, output_delim) };
3283                }
3284                unsafe { buf_extend(buf, std::slice::from_raw_parts(base.add(pos - 1), rs - pos)) };
3285                first_range = false;
3286            }
3287            pos = re + 1;
3288            if pos > len {
3289                break;
3290            }
3291        }
3292        if pos <= len {
3293            if !first_range && !output_delim.is_empty() {
3294                unsafe { buf_extend(buf, output_delim) };
3295            }
3296            unsafe {
3297                buf_extend(
3298                    buf,
3299                    std::slice::from_raw_parts(base.add(pos - 1), len - pos + 1),
3300                )
3301            };
3302        }
3303    } else if output_delim.is_empty() && ranges.len() == 1 {
3304        // Ultra-fast path: single range, no output delimiter
3305        let start = ranges[0].start.saturating_sub(1);
3306        let end = ranges[0].end.min(len);
3307        if start < len {
3308            unsafe {
3309                buf_extend(
3310                    buf,
3311                    std::slice::from_raw_parts(base.add(start), end - start),
3312                )
3313            };
3314        }
3315    } else {
3316        for r in ranges {
3317            let start = r.start.saturating_sub(1);
3318            let end = r.end.min(len);
3319            if start >= len {
3320                break;
3321            }
3322            if !first_range && !output_delim.is_empty() {
3323                unsafe { buf_extend(buf, output_delim) };
3324            }
3325            unsafe {
3326                buf_extend(
3327                    buf,
3328                    std::slice::from_raw_parts(base.add(start), end - start),
3329                )
3330            };
3331            first_range = false;
3332        }
3333    }
3334}
3335
3336// ── Public API ───────────────────────────────────────────────────────────
3337
3338/// Cut fields from a line using a delimiter. Writes to `out`.
3339#[inline]
3340pub fn cut_fields(
3341    line: &[u8],
3342    delim: u8,
3343    ranges: &[Range],
3344    complement: bool,
3345    output_delim: &[u8],
3346    suppress_no_delim: bool,
3347    out: &mut impl Write,
3348) -> io::Result<bool> {
3349    if memchr::memchr(delim, line).is_none() {
3350        if !suppress_no_delim {
3351            out.write_all(line)?;
3352            return Ok(true);
3353        }
3354        return Ok(false);
3355    }
3356
3357    let mut field_num: usize = 1;
3358    let mut field_start: usize = 0;
3359    let mut first_output = true;
3360
3361    for delim_pos in memchr_iter(delim, line) {
3362        let selected = in_ranges(ranges, field_num) != complement;
3363        if selected {
3364            if !first_output {
3365                out.write_all(output_delim)?;
3366            }
3367            out.write_all(&line[field_start..delim_pos])?;
3368            first_output = false;
3369        }
3370        field_start = delim_pos + 1;
3371        field_num += 1;
3372    }
3373
3374    let selected = in_ranges(ranges, field_num) != complement;
3375    if selected {
3376        if !first_output {
3377            out.write_all(output_delim)?;
3378        }
3379        out.write_all(&line[field_start..])?;
3380    }
3381
3382    Ok(true)
3383}
3384
3385/// Cut bytes/chars from a line. Writes selected bytes to `out`.
3386#[inline]
3387pub fn cut_bytes(
3388    line: &[u8],
3389    ranges: &[Range],
3390    complement: bool,
3391    output_delim: &[u8],
3392    out: &mut impl Write,
3393) -> io::Result<bool> {
3394    let mut first_range = true;
3395
3396    if complement {
3397        let len = line.len();
3398        let mut comp_ranges = Vec::new();
3399        let mut pos: usize = 1;
3400        for r in ranges {
3401            let rs = r.start;
3402            let re = r.end.min(len);
3403            if pos < rs {
3404                comp_ranges.push((pos, rs - 1));
3405            }
3406            pos = re + 1;
3407            if pos > len {
3408                break;
3409            }
3410        }
3411        if pos <= len {
3412            comp_ranges.push((pos, len));
3413        }
3414        for &(s, e) in &comp_ranges {
3415            if !first_range && !output_delim.is_empty() {
3416                out.write_all(output_delim)?;
3417            }
3418            out.write_all(&line[s - 1..e])?;
3419            first_range = false;
3420        }
3421    } else {
3422        for r in ranges {
3423            let start = r.start.saturating_sub(1);
3424            let end = r.end.min(line.len());
3425            if start >= line.len() {
3426                break;
3427            }
3428            if !first_range && !output_delim.is_empty() {
3429                out.write_all(output_delim)?;
3430            }
3431            out.write_all(&line[start..end])?;
3432            first_range = false;
3433        }
3434    }
3435    Ok(true)
3436}
3437
3438/// In-place field 1 extraction: modifies `data` buffer directly, returns new length.
3439/// Output is always <= input (we remove everything after first delimiter per line).
3440/// Avoids intermediate Vec allocation + BufWriter copy, saving ~10MB of memory
3441/// bandwidth for 10MB input. Requires owned mutable data (not mmap).
3442///
3443/// Lines without delimiter pass through unchanged (unless suppress=true).
3444/// Lines with delimiter: keep bytes before delimiter + newline.
3445pub fn cut_field1_inplace(data: &mut [u8], delim: u8, line_delim: u8, suppress: bool) -> usize {
3446    let len = data.len();
3447    let mut wp: usize = 0;
3448    let mut rp: usize = 0;
3449
3450    while rp < len {
3451        match memchr::memchr2(delim, line_delim, &data[rp..]) {
3452            None => {
3453                // Rest is partial line, no delimiter
3454                if suppress {
3455                    // suppress: skip lines without delimiter
3456                    break;
3457                }
3458                let remaining = len - rp;
3459                if wp != rp {
3460                    data.copy_within(rp..len, wp);
3461                }
3462                wp += remaining;
3463                break;
3464            }
3465            Some(offset) => {
3466                let actual = rp + offset;
3467                if data[actual] == line_delim {
3468                    // No delimiter on this line
3469                    if suppress {
3470                        // Skip this line entirely
3471                        rp = actual + 1;
3472                    } else {
3473                        // Output entire line including newline
3474                        let chunk_len = actual + 1 - rp;
3475                        if wp != rp {
3476                            data.copy_within(rp..actual + 1, wp);
3477                        }
3478                        wp += chunk_len;
3479                        rp = actual + 1;
3480                    }
3481                } else {
3482                    // Delimiter found: output field 1 (up to delimiter) + newline
3483                    let field_len = actual - rp;
3484                    if wp != rp && field_len > 0 {
3485                        data.copy_within(rp..actual, wp);
3486                    }
3487                    wp += field_len;
3488                    data[wp] = line_delim;
3489                    wp += 1;
3490                    // Skip to next newline
3491                    match memchr::memchr(line_delim, &data[actual + 1..]) {
3492                        None => {
3493                            rp = len;
3494                        }
3495                        Some(nl_off) => {
3496                            rp = actual + 1 + nl_off + 1;
3497                        }
3498                    }
3499                }
3500            }
3501        }
3502    }
3503    wp
3504}
3505
3506/// Process a full data buffer (from mmap or read) with cut operation.
3507pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
3508    match cfg.mode {
3509        CutMode::Fields => process_fields_fast(data, cfg, out),
3510        CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
3511    }
3512}
3513
3514/// Process input from a reader (for stdin).
3515/// Uses batch reading: reads large chunks (16MB), then processes them in batch
3516/// using the fast mmap-based paths, avoiding per-line read_until syscall overhead.
3517/// 16MB chunks mean a 10MB piped input is consumed in a single batch.
3518pub fn process_cut_reader<R: BufRead>(
3519    mut reader: R,
3520    cfg: &CutConfig,
3521    out: &mut impl Write,
3522) -> io::Result<()> {
3523    const CHUNK_SIZE: usize = 16 * 1024 * 1024; // 16MB read chunks
3524    let line_delim = cfg.line_delim;
3525
3526    // Read large chunks and process in batch.
3527    // We keep a buffer; after processing complete lines, we shift leftover to the front.
3528    let mut buf = Vec::with_capacity(CHUNK_SIZE + 4096);
3529
3530    loop {
3531        // Read up to CHUNK_SIZE bytes
3532        buf.reserve(CHUNK_SIZE);
3533        let read_start = buf.len();
3534        unsafe { buf.set_len(read_start + CHUNK_SIZE) };
3535        let n = read_fully(&mut reader, &mut buf[read_start..])?;
3536        buf.truncate(read_start + n);
3537
3538        if buf.is_empty() {
3539            break;
3540        }
3541
3542        if n == 0 {
3543            // EOF with leftover data (last line without terminator)
3544            process_cut_data(&buf, cfg, out)?;
3545            break;
3546        }
3547
3548        // Find the last line delimiter in the buffer so we process complete lines
3549        let process_end = match memchr::memrchr(line_delim, &buf) {
3550            Some(pos) => pos + 1,
3551            None => {
3552                // No line delimiter found — keep accumulating
3553                continue;
3554            }
3555        };
3556
3557        // Process the complete lines using the fast batch path
3558        process_cut_data(&buf[..process_end], cfg, out)?;
3559
3560        // Shift leftover to the front for next iteration
3561        let leftover_len = buf.len() - process_end;
3562        if leftover_len > 0 {
3563            buf.copy_within(process_end.., 0);
3564        }
3565        buf.truncate(leftover_len);
3566    }
3567
3568    Ok(())
3569}
3570
/// Reads from `reader` until `buf` is full or the reader reports end of input.
///
/// Returns the number of bytes stored at the front of `buf`; a value smaller
/// than `buf.len()` means end of input was reached. Reads that fail with
/// [`io::ErrorKind::Interrupted`] are retried, including the very first one.
///
/// # Errors
///
/// Any other read error is returned immediately; bytes already copied into
/// `buf` by earlier successful reads are not reported in that case.
#[inline]
fn read_fully<R: BufRead>(reader: &mut R, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            // A zero-length read into a non-empty slice signals end of input.
            Ok(0) => break,
            Ok(n) => total += n,
            // A signal interrupted the call before any data moved: try again.
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
3590
3591/// In-place cut processing for mutable data buffers.
3592/// Returns Some(new_length) if in-place processing succeeded, None if not supported
3593/// for the given configuration (caller should fall back to regular processing).
3594///
3595/// In-place avoids allocating intermediate output buffers — the result is written
3596/// directly into the input buffer (output is always <= input for non-complement modes
3597/// with default output delimiter).
3598pub fn process_cut_data_mut(data: &mut [u8], cfg: &CutConfig) -> Option<usize> {
3599    if cfg.complement {
3600        return None;
3601    }
3602
3603    match cfg.mode {
3604        CutMode::Fields => {
3605            // Only handle when output delimiter matches input (single-byte)
3606            if cfg.output_delim.len() != 1 || cfg.output_delim[0] != cfg.delim {
3607                return None;
3608            }
3609            if cfg.delim == cfg.line_delim {
3610                return None;
3611            }
3612            Some(cut_fields_inplace_general(
3613                data,
3614                cfg.delim,
3615                cfg.line_delim,
3616                cfg.ranges,
3617                cfg.suppress_no_delim,
3618            ))
3619        }
3620        CutMode::Bytes | CutMode::Characters => {
3621            if !cfg.output_delim.is_empty() {
3622                return None;
3623            }
3624            Some(cut_bytes_inplace_general(data, cfg.line_delim, cfg.ranges))
3625        }
3626    }
3627}
3628
3629/// In-place generalized field extraction.
3630/// Handles single fields, contiguous ranges, and non-contiguous multi-field patterns.
3631fn cut_fields_inplace_general(
3632    data: &mut [u8],
3633    delim: u8,
3634    line_delim: u8,
3635    ranges: &[Range],
3636    suppress: bool,
3637) -> usize {
3638    // Special case: field 1 only (existing optimized path)
3639    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end == 1 {
3640        return cut_field1_inplace(data, delim, line_delim, suppress);
3641    }
3642
3643    let len = data.len();
3644    if len == 0 {
3645        return 0;
3646    }
3647
3648    let max_field = ranges.last().map_or(0, |r| r.end);
3649    let max_delims = max_field.min(64);
3650    let mut wp: usize = 0;
3651    let mut rp: usize = 0;
3652
3653    while rp < len {
3654        let line_end = memchr::memchr(line_delim, &data[rp..])
3655            .map(|p| rp + p)
3656            .unwrap_or(len);
3657        let line_len = line_end - rp;
3658
3659        // Collect delimiter positions (relative to line start)
3660        let mut delim_pos = [0usize; 64];
3661        let mut num_delims: usize = 0;
3662
3663        for pos in memchr_iter(delim, &data[rp..line_end]) {
3664            if num_delims < max_delims {
3665                delim_pos[num_delims] = pos;
3666                num_delims += 1;
3667                if num_delims >= max_delims {
3668                    break;
3669                }
3670            }
3671        }
3672
3673        if num_delims == 0 {
3674            // No delimiter in line
3675            if !suppress {
3676                if wp != rp {
3677                    data.copy_within(rp..line_end, wp);
3678                }
3679                wp += line_len;
3680                if line_end < len {
3681                    data[wp] = line_delim;
3682                    wp += 1;
3683                }
3684            }
3685        } else {
3686            let total_fields = num_delims + 1;
3687            let mut first_output = true;
3688
3689            for r in ranges {
3690                let range_start = r.start;
3691                let range_end = r.end.min(total_fields);
3692                if range_start > total_fields {
3693                    break;
3694                }
3695                for field_num in range_start..=range_end {
3696                    if field_num > total_fields {
3697                        break;
3698                    }
3699
3700                    let field_start = if field_num == 1 {
3701                        0
3702                    } else if field_num - 2 < num_delims {
3703                        delim_pos[field_num - 2] + 1
3704                    } else {
3705                        continue;
3706                    };
3707                    let field_end = if field_num <= num_delims {
3708                        delim_pos[field_num - 1]
3709                    } else {
3710                        line_len
3711                    };
3712
3713                    if !first_output {
3714                        data[wp] = delim;
3715                        wp += 1;
3716                    }
3717                    let flen = field_end - field_start;
3718                    if flen > 0 {
3719                        data.copy_within(rp + field_start..rp + field_start + flen, wp);
3720                        wp += flen;
3721                    }
3722                    first_output = false;
3723                }
3724            }
3725
3726            if !first_output && line_end < len {
3727                data[wp] = line_delim;
3728                wp += 1;
3729            } else if first_output && line_end < len {
3730                // No fields selected but line had delimiters — output empty line
3731                data[wp] = line_delim;
3732                wp += 1;
3733            }
3734        }
3735
3736        rp = if line_end < len { line_end + 1 } else { len };
3737    }
3738
3739    wp
3740}
3741
3742/// In-place byte/char range extraction.
3743fn cut_bytes_inplace_general(data: &mut [u8], line_delim: u8, ranges: &[Range]) -> usize {
3744    let len = data.len();
3745    if len == 0 {
3746        return 0;
3747    }
3748
3749    // Quick check: single range from byte 1 to end = no-op
3750    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end == usize::MAX {
3751        return len;
3752    }
3753
3754    // Single range from byte 1: fast truncation path
3755    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end < usize::MAX {
3756        return cut_bytes_from_start_inplace(data, line_delim, ranges[0].end);
3757    }
3758
3759    let mut wp: usize = 0;
3760    let mut rp: usize = 0;
3761
3762    while rp < len {
3763        let line_end = memchr::memchr(line_delim, &data[rp..])
3764            .map(|p| rp + p)
3765            .unwrap_or(len);
3766        let line_len = line_end - rp;
3767
3768        for r in ranges {
3769            let start = r.start.saturating_sub(1);
3770            let end = r.end.min(line_len);
3771            if start >= line_len {
3772                break;
3773            }
3774            let flen = end - start;
3775            if flen > 0 {
3776                data.copy_within(rp + start..rp + start + flen, wp);
3777                wp += flen;
3778            }
3779        }
3780
3781        if line_end < len {
3782            data[wp] = line_delim;
3783            wp += 1;
3784        }
3785
3786        rp = if line_end < len { line_end + 1 } else { len };
3787    }
3788
3789    wp
3790}
3791
3792/// In-place truncation for -b1-N: truncate each line to at most max_bytes.
3793fn cut_bytes_from_start_inplace(data: &mut [u8], line_delim: u8, max_bytes: usize) -> usize {
3794    let len = data.len();
3795
3796    // Quick check: see if all lines fit within max_bytes (common case)
3797    let mut all_fit = true;
3798    let mut start = 0;
3799    for pos in memchr_iter(line_delim, data) {
3800        if pos - start > max_bytes {
3801            all_fit = false;
3802            break;
3803        }
3804        start = pos + 1;
3805    }
3806    if all_fit && start < len && len - start > max_bytes {
3807        all_fit = false;
3808    }
3809    if all_fit {
3810        return len;
3811    }
3812
3813    // Some lines need truncation
3814    let mut wp: usize = 0;
3815    let mut rp: usize = 0;
3816
3817    while rp < len {
3818        let line_end = memchr::memchr(line_delim, &data[rp..])
3819            .map(|p| rp + p)
3820            .unwrap_or(len);
3821        let line_len = line_end - rp;
3822
3823        let take = line_len.min(max_bytes);
3824        if take > 0 && wp != rp {
3825            data.copy_within(rp..rp + take, wp);
3826        }
3827        wp += take;
3828
3829        if line_end < len {
3830            data[wp] = line_delim;
3831            wp += 1;
3832        }
3833
3834        rp = if line_end < len { line_end + 1 } else { len };
3835    }
3836
3837    wp
3838}
3839
/// Cut operation mode: which unit the selection ranges count.
///
/// Within this file `Bytes` and `Characters` are routed to the same
/// byte-oriented code paths; `Fields` uses the delimiter-based paths.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CutMode {
    /// Ranges count bytes within each line.
    Bytes,
    /// Ranges count characters within each line (handled like `Bytes` here).
    Characters,
    /// Ranges count delimiter-separated fields within each line.
    Fields,
}