Skip to main content

coreutils_rs/cut/
core.rs

1use memchr::memchr_iter;
2use std::io::{self, BufRead, IoSlice, Write};
3
/// Minimum input size, in bytes, at which work is split across threads (2 MiB).
///
/// Inputs shorter than this are processed on the calling thread; larger inputs
/// are cut on line boundaries by `split_for_scope` and handed to `rayon::scope`.
/// The original authors estimate the dispatch overhead at ~10µs and the saving
/// at ~3ms for 2 MiB on 4 cores (figures are not verified by anything in this file).
const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;

/// Maximum number of `IoSlice` entries `write_ioslices` passes to a single
/// `write_vectored` call. 1024 is the usual `IOV_MAX` on Linux.
const MAX_IOV: usize = 1024;
11
/// Configuration for cut operations.
///
/// The range list and output delimiter are borrowed from the caller for `'a`.
pub struct CutConfig<'a> {
    /// Selection mode (see `CutMode`).
    pub mode: CutMode,
    /// Selected ranges. The field-extraction paths in this file assume they
    /// are sorted and non-overlapping, which is how `parse_ranges` returns them.
    pub ranges: &'a [Range],
    /// When true, select everything that is *not* covered by `ranges`.
    pub complement: bool,
    /// Input field delimiter byte.
    pub delim: u8,
    /// Bytes written between selected fields on output.
    pub output_delim: &'a [u8],
    /// When true, lines that contain no `delim` are dropped instead of being
    /// echoed unchanged.
    pub suppress_no_delim: bool,
    /// Byte that terminates each line (also appended to every output line).
    pub line_delim: u8,
}
22
/// One inclusive, 1-based selection range from a LIST such as `1`, `3-5`, `-3` or `4-`.
#[derive(Debug, Clone)]
pub struct Range {
    /// First selected position (1-based, inclusive). `parse_ranges` stores 1
    /// for an open start such as `-3` and rejects 0.
    /// NOTE(review): an earlier comment said 0 means "from beginning"; nothing
    /// visible in this file produces 0 — confirm no other constructor relies on it.
    pub start: usize,
    /// Last selected position (1-based, inclusive); `usize::MAX` means "to end of line".
    pub end: usize,
}
29
30/// Parse a LIST specification like "1,3-5,7-" into ranges.
31/// Each range is 1-based. Returns sorted, merged ranges.
32pub fn parse_ranges(spec: &str) -> Result<Vec<Range>, String> {
33    let mut ranges = Vec::new();
34
35    for part in spec.split(',') {
36        let part = part.trim();
37        if part.is_empty() {
38            continue;
39        }
40
41        if let Some(idx) = part.find('-') {
42            let left = &part[..idx];
43            let right = &part[idx + 1..];
44
45            let start = if left.is_empty() {
46                1
47            } else {
48                left.parse::<usize>()
49                    .map_err(|_| format!("invalid range: '{}'", part))?
50            };
51
52            let end = if right.is_empty() {
53                usize::MAX
54            } else {
55                right
56                    .parse::<usize>()
57                    .map_err(|_| format!("invalid range: '{}'", part))?
58            };
59
60            if start == 0 {
61                return Err("fields and positions are numbered from 1".to_string());
62            }
63            if start > end {
64                return Err(format!("invalid decreasing range: '{}'", part));
65            }
66
67            ranges.push(Range { start, end });
68        } else {
69            let n = part
70                .parse::<usize>()
71                .map_err(|_| format!("invalid field: '{}'", part))?;
72            if n == 0 {
73                return Err("fields and positions are numbered from 1".to_string());
74            }
75            ranges.push(Range { start: n, end: n });
76        }
77    }
78
79    if ranges.is_empty() {
80        return Err("you must specify a list of bytes, characters, or fields".to_string());
81    }
82
83    // Sort and merge overlapping ranges
84    ranges.sort_by_key(|r| (r.start, r.end));
85    let mut merged = vec![ranges[0].clone()];
86    for r in &ranges[1..] {
87        let last = merged.last_mut().unwrap();
88        if r.start <= last.end.saturating_add(1) {
89            last.end = last.end.max(r.end);
90        } else {
91            merged.push(r.clone());
92        }
93    }
94
95    Ok(merged)
96}
97
98/// Check if a 1-based position is in any range.
99/// Ranges must be sorted. Uses early exit since ranges are sorted.
100#[inline(always)]
101fn in_ranges(ranges: &[Range], pos: usize) -> bool {
102    for r in ranges {
103        if pos < r.start {
104            return false;
105        }
106        if pos <= r.end {
107            return true;
108        }
109    }
110    false
111}
112
113/// Pre-compute a 64-bit mask for field selection.
114/// Bit i-1 is set if field i should be output.
115#[inline]
116fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
117    let mut mask: u64 = 0;
118    for i in 1..=64u32 {
119        let in_range = in_ranges(ranges, i as usize);
120        if in_range != complement {
121            mask |= 1u64 << (i - 1);
122        }
123    }
124    mask
125}
126
127/// Check if a field should be selected, using bitset for first 64 fields.
128#[inline(always)]
129fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
130    if field_num <= 64 {
131        (mask >> (field_num - 1)) & 1 == 1
132    } else {
133        in_ranges(ranges, field_num) != complement
134    }
135}
136
137// ── Unsafe buffer helpers (skip bounds checks in hot loops) ──────────────
138
/// Append `data` to `buf` without growing it.
///
/// This skips the capacity check that `Vec::extend_from_slice` performs, so
/// it is only meant for hot loops that have reserved their output up front.
/// Debug builds assert the capacity contract; release builds do not check it.
///
/// # Safety
/// `buf.capacity() - buf.len()` must be at least `data.len()`. Violating this
/// writes past the end of the allocation.
#[inline(always)]
unsafe fn buf_extend(buf: &mut Vec<u8>, data: &[u8]) {
    let len = buf.len();
    debug_assert!(
        buf.capacity() - len >= data.len(),
        "buf_extend: {} spare bytes, {} needed",
        buf.capacity() - len,
        data.len()
    );
    // SAFETY: the caller guarantees `data.len()` spare bytes after `len`, so
    // the destination is inside the allocation. `data` is a shared borrow and
    // `buf` an exclusive one, so the two regions cannot overlap. After the
    // copy the first `len + data.len()` bytes are initialized.
    unsafe {
        std::ptr::copy_nonoverlapping(data.as_ptr(), buf.as_mut_ptr().add(len), data.len());
        buf.set_len(len + data.len());
    }
}
149
/// Append the single byte `b` to `buf` without growing it.
///
/// This skips the capacity check that `Vec::push` performs. Debug builds
/// assert the capacity contract; release builds do not check it.
///
/// # Safety
/// `buf` must have at least one byte of spare capacity
/// (`buf.len() < buf.capacity()`). Violating this writes past the end of the
/// allocation.
#[inline(always)]
unsafe fn buf_push(buf: &mut Vec<u8>, b: u8) {
    let len = buf.len();
    debug_assert!(len < buf.capacity(), "buf_push: no spare capacity");
    // SAFETY: the caller guarantees one spare byte, so index `len` is inside
    // the allocation; after the write the first `len + 1` bytes are initialized.
    unsafe {
        buf.as_mut_ptr().add(len).write(b);
        buf.set_len(len + 1);
    }
}
160
161/// Write multiple IoSlice buffers using write_vectored (writev syscall).
162/// Batches into MAX_IOV-sized groups. Hot path: single write_vectored succeeds.
163/// Cold path (partial write) is out-of-line to keep the hot loop tight.
164#[inline]
165fn write_ioslices(out: &mut impl Write, slices: &[IoSlice]) -> io::Result<()> {
166    if slices.is_empty() {
167        return Ok(());
168    }
169    for batch in slices.chunks(MAX_IOV) {
170        let total: usize = batch.iter().map(|s| s.len()).sum();
171        let written = out.write_vectored(batch)?;
172        if written >= total {
173            continue;
174        }
175        if written == 0 {
176            return Err(io::Error::new(io::ErrorKind::WriteZero, "write zero"));
177        }
178        write_ioslices_slow(out, batch, written)?;
179    }
180    Ok(())
181}
182
/// Finish a batch after a short `write_vectored` (cold path, never inlined).
///
/// `skip` is the number of bytes of `slices` that were already written. Whole
/// slices covered by `skip` are passed over, the first partially written
/// slice is resumed at the right offset, and every later slice is written in
/// full with `write_all`.
#[cold]
#[inline(never)]
fn write_ioslices_slow(
    out: &mut impl Write,
    slices: &[IoSlice],
    mut skip: usize,
) -> io::Result<()> {
    for slice in slices {
        let bytes: &[u8] = slice;
        match bytes.get(skip..) {
            // Something of this slice is still unwritten: emit it and stop skipping.
            Some(rest) if !rest.is_empty() => {
                out.write_all(rest)?;
                skip = 0;
            }
            // Slice fully covered by the bytes already written.
            _ => skip -= bytes.len(),
        }
    }
    Ok(())
}
202
203// ── Chunk splitting for parallel processing ──────────────────────────────
204
/// Number of worker threads to split input for.
///
/// Asks `std::thread::available_parallelism()` rather than Rayon, and falls
/// back to 1 when the platform cannot report a value. The original author
/// notes this avoids initializing the Rayon pool before it is needed.
#[inline]
fn num_cpus() -> usize {
    match std::thread::available_parallelism() {
        Ok(count) => count.get(),
        Err(_) => 1,
    }
}
214
215/// Split data into chunks for rayon::scope parallel processing.
216/// Uses Rayon's thread count to match the number of worker threads.
217fn split_for_scope<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
218    let num_threads = num_cpus().max(1);
219    if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
220        return vec![data];
221    }
222
223    let chunk_size = data.len() / num_threads;
224    let mut chunks = Vec::with_capacity(num_threads);
225    let mut pos = 0;
226
227    for _ in 0..num_threads - 1 {
228        let target = pos + chunk_size;
229        if target >= data.len() {
230            break;
231        }
232        let boundary = memchr::memchr(line_delim, &data[target..])
233            .map(|p| target + p + 1)
234            .unwrap_or(data.len());
235        if boundary > pos {
236            chunks.push(&data[pos..boundary]);
237        }
238        pos = boundary;
239    }
240
241    if pos < data.len() {
242        chunks.push(&data[pos..]);
243    }
244
245    chunks
246}
247
248// ── Fast path: multi-field non-contiguous extraction ─────────────────────
249
250/// Multi-field non-contiguous extraction (e.g., `cut -d, -f1,3,5`).
251/// Pre-collects delimiter positions per line into a stack-allocated array,
252/// then directly indexes into them for each selected field.
253/// This is O(max_field) per line instead of O(num_fields * scan_length).
254fn process_fields_multi_select(
255    data: &[u8],
256    delim: u8,
257    line_delim: u8,
258    ranges: &[Range],
259    suppress: bool,
260    out: &mut impl Write,
261) -> io::Result<()> {
262    let max_field = ranges.last().map_or(0, |r| r.end);
263
264    if data.len() >= PARALLEL_THRESHOLD {
265        let chunks = split_for_scope(data, line_delim);
266        let n = chunks.len();
267        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
268        rayon::scope(|s| {
269            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
270                s.spawn(move |_| {
271                    result.reserve(chunk.len() * 3 / 4);
272                    multi_select_chunk(
273                        chunk, delim, line_delim, ranges, max_field, suppress, result,
274                    );
275                });
276            }
277        });
278        let slices: Vec<IoSlice> = results
279            .iter()
280            .filter(|r| !r.is_empty())
281            .map(|r| IoSlice::new(r))
282            .collect();
283        write_ioslices(out, &slices)?;
284    } else {
285        let mut buf = Vec::with_capacity(data.len() * 3 / 4);
286        multi_select_chunk(
287            data, delim, line_delim, ranges, max_field, suppress, &mut buf,
288        );
289        if !buf.is_empty() {
290            out.write_all(&buf)?;
291        }
292    }
293    Ok(())
294}
295
296/// Process a chunk for multi-field extraction using a single-pass memchr2 scan.
297/// Scans for both delimiter and line_delim in one SIMD pass over the entire chunk,
298/// eliminating per-line memchr_iter setup overhead (significant for short lines).
299/// Delimiter positions are collected in a stack array per line.
300/// When max_field is reached on a line, remaining delimiters are ignored.
301fn multi_select_chunk(
302    data: &[u8],
303    delim: u8,
304    line_delim: u8,
305    ranges: &[Range],
306    max_field: usize,
307    suppress: bool,
308    buf: &mut Vec<u8>,
309) {
310    // When delim == line_delim, fall back to two-level approach
311    if delim == line_delim {
312        buf.reserve(data.len());
313        let base = data.as_ptr();
314        let mut start = 0;
315        for end_pos in memchr_iter(line_delim, data) {
316            let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
317            multi_select_line(line, delim, line_delim, ranges, max_field, suppress, buf);
318            start = end_pos + 1;
319        }
320        if start < data.len() {
321            let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
322            multi_select_line(line, delim, line_delim, ranges, max_field, suppress, buf);
323        }
324        return;
325    }
326
327    buf.reserve(data.len());
328    let base = data.as_ptr();
329    let data_len = data.len();
330
331    // Per-line state
332    let mut line_start: usize = 0;
333    let mut delim_pos = [0usize; 64];
334    let mut num_delims: usize = 0;
335    let max_delims = max_field.min(64);
336    let mut at_max = false;
337
338    // Single-pass scan using memchr2 for both delimiter and newline
339    for pos in memchr::memchr2_iter(delim, line_delim, data) {
340        let byte = unsafe { *base.add(pos) };
341
342        if byte == line_delim {
343            // End of line: extract fields from collected positions
344            let line_len = pos - line_start;
345            if num_delims == 0 {
346                // No delimiter in line
347                if !suppress {
348                    unsafe {
349                        buf_extend(
350                            buf,
351                            std::slice::from_raw_parts(base.add(line_start), line_len),
352                        );
353                        buf_push(buf, line_delim);
354                    }
355                }
356            } else {
357                // Extract fields using collected delimiter positions
358                let total_fields = num_delims + 1;
359                let mut first_output = true;
360
361                for r in ranges {
362                    let range_start = r.start;
363                    let range_end = r.end.min(total_fields);
364                    if range_start > total_fields {
365                        break;
366                    }
367                    for field_num in range_start..=range_end {
368                        if field_num > total_fields {
369                            break;
370                        }
371
372                        let field_start = if field_num == 1 {
373                            line_start
374                        } else if field_num - 2 < num_delims {
375                            delim_pos[field_num - 2] + 1
376                        } else {
377                            continue;
378                        };
379                        let field_end = if field_num <= num_delims {
380                            delim_pos[field_num - 1]
381                        } else {
382                            pos
383                        };
384
385                        if !first_output {
386                            unsafe { buf_push(buf, delim) };
387                        }
388                        unsafe {
389                            buf_extend(
390                                buf,
391                                std::slice::from_raw_parts(
392                                    base.add(field_start),
393                                    field_end - field_start,
394                                ),
395                            );
396                        }
397                        first_output = false;
398                    }
399                }
400
401                unsafe { buf_push(buf, line_delim) };
402            }
403
404            // Reset for next line
405            line_start = pos + 1;
406            num_delims = 0;
407            at_max = false;
408        } else {
409            // Delimiter found: collect position (up to max_field)
410            if !at_max && num_delims < max_delims {
411                delim_pos[num_delims] = pos;
412                num_delims += 1;
413                if num_delims >= max_delims {
414                    at_max = true;
415                }
416            }
417        }
418    }
419
420    // Handle last line without trailing line_delim
421    if line_start < data_len {
422        if num_delims == 0 {
423            if !suppress {
424                unsafe {
425                    buf_extend(
426                        buf,
427                        std::slice::from_raw_parts(base.add(line_start), data_len - line_start),
428                    );
429                    buf_push(buf, line_delim);
430                }
431            }
432        } else {
433            let total_fields = num_delims + 1;
434            let mut first_output = true;
435
436            for r in ranges {
437                let range_start = r.start;
438                let range_end = r.end.min(total_fields);
439                if range_start > total_fields {
440                    break;
441                }
442                for field_num in range_start..=range_end {
443                    if field_num > total_fields {
444                        break;
445                    }
446
447                    let field_start = if field_num == 1 {
448                        line_start
449                    } else if field_num - 2 < num_delims {
450                        delim_pos[field_num - 2] + 1
451                    } else {
452                        continue;
453                    };
454                    let field_end = if field_num <= num_delims {
455                        delim_pos[field_num - 1]
456                    } else {
457                        data_len
458                    };
459
460                    if !first_output {
461                        unsafe { buf_push(buf, delim) };
462                    }
463                    unsafe {
464                        buf_extend(
465                            buf,
466                            std::slice::from_raw_parts(
467                                base.add(field_start),
468                                field_end - field_start,
469                            ),
470                        );
471                    }
472                    first_output = false;
473                }
474            }
475
476            unsafe { buf_push(buf, line_delim) };
477        }
478    }
479}
480
481/// Extract selected fields from a single line using delimiter position scanning.
482/// Scans delimiters only up to max_field (early exit), then extracts selected fields
483/// by indexing directly into the collected positions. Since ranges are pre-sorted and
484/// non-overlapping, every field within a range is selected — no is_selected check needed.
485#[inline(always)]
486fn multi_select_line(
487    line: &[u8],
488    delim: u8,
489    line_delim: u8,
490    ranges: &[Range],
491    max_field: usize,
492    suppress: bool,
493    buf: &mut Vec<u8>,
494) {
495    let len = line.len();
496    if len == 0 {
497        if !suppress {
498            unsafe { buf_push(buf, line_delim) };
499        }
500        return;
501    }
502
503    // Note: no per-line buf.reserve — multi_select_chunk already reserves data.len()
504    let base = line.as_ptr();
505
506    // Collect delimiter positions up to max_field (early exit).
507    // Stack array for up to 64 delimiter positions.
508    let mut delim_pos = [0usize; 64];
509    let mut num_delims: usize = 0;
510    let max_delims = max_field.min(64);
511
512    for pos in memchr_iter(delim, line) {
513        if num_delims < max_delims {
514            delim_pos[num_delims] = pos;
515            num_delims += 1;
516            if num_delims >= max_delims {
517                break;
518            }
519        }
520    }
521
522    if num_delims == 0 {
523        if !suppress {
524            unsafe {
525                buf_extend(buf, line);
526                buf_push(buf, line_delim);
527            }
528        }
529        return;
530    }
531
532    // Extract selected fields using delimiter positions.
533    // Ranges are pre-sorted and non-overlapping, so every field_num within a range
534    // is selected — skip the is_selected check entirely (saves 1 function call per field).
535    let total_fields = num_delims + 1;
536    let mut first_output = true;
537
538    for r in ranges {
539        let range_start = r.start;
540        let range_end = r.end.min(total_fields);
541        if range_start > total_fields {
542            break;
543        }
544        for field_num in range_start..=range_end {
545            if field_num > total_fields {
546                break;
547            }
548
549            let field_start = if field_num == 1 {
550                0
551            } else if field_num - 2 < num_delims {
552                delim_pos[field_num - 2] + 1
553            } else {
554                continue;
555            };
556            let field_end = if field_num <= num_delims {
557                delim_pos[field_num - 1]
558            } else {
559                len
560            };
561
562            if !first_output {
563                unsafe { buf_push(buf, delim) };
564            }
565            unsafe {
566                buf_extend(
567                    buf,
568                    std::slice::from_raw_parts(base.add(field_start), field_end - field_start),
569                );
570            }
571            first_output = false;
572        }
573    }
574
575    unsafe { buf_push(buf, line_delim) };
576}
577
578// ── Fast path: field extraction with batched output ──────────────────────
579
580/// Optimized field extraction with early exit and batched output.
581fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
582    let delim = cfg.delim;
583    let line_delim = cfg.line_delim;
584    let ranges = cfg.ranges;
585    let complement = cfg.complement;
586    let output_delim = cfg.output_delim;
587    let suppress = cfg.suppress_no_delim;
588
589    // NOTE: Removed the full-file `memchr(delim, data).is_none()` scan.
590    // That scan was O(N) over the entire file just to check an edge case
591    // (no delimiter in any line). The per-line processing already handles
592    // lines without delimiters correctly, so the scan was pure overhead
593    // for files that DO contain delimiters (the common case).
594
595    // Ultra-fast path: single field extraction (e.g., cut -f5)
596    if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
597        return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
598    }
599
600    // Fast path: complement of single field or contiguous range with default output delimiter.
601    if complement
602        && ranges.len() == 1
603        && output_delim.len() == 1
604        && output_delim[0] == delim
605        && ranges[0].start == ranges[0].end
606    {
607        return process_complement_single_field(
608            data,
609            delim,
610            line_delim,
611            ranges[0].start,
612            suppress,
613            out,
614        );
615    }
616
617    // Fast path: complement of contiguous range (e.g., --complement -f3-5 = output fields 1,2,6+).
618    // This is equivalent to outputting a prefix and a suffix, skipping the middle range.
619    if complement
620        && ranges.len() == 1
621        && ranges[0].start > 1
622        && ranges[0].end < usize::MAX
623        && output_delim.len() == 1
624        && output_delim[0] == delim
625    {
626        return process_complement_range(
627            data,
628            delim,
629            line_delim,
630            ranges[0].start,
631            ranges[0].end,
632            suppress,
633            out,
634        );
635    }
636
637    // Fast path: contiguous from-start field range (e.g., cut -f1-5)
638    if !complement
639        && ranges.len() == 1
640        && ranges[0].start == 1
641        && output_delim.len() == 1
642        && output_delim[0] == delim
643        && ranges[0].end < usize::MAX
644    {
645        return process_fields_prefix(data, delim, line_delim, ranges[0].end, suppress, out);
646    }
647
648    // Fast path: open-ended field range from field N (e.g., cut -f3-)
649    if !complement
650        && ranges.len() == 1
651        && ranges[0].end == usize::MAX
652        && ranges[0].start > 1
653        && output_delim.len() == 1
654        && output_delim[0] == delim
655    {
656        return process_fields_suffix(data, delim, line_delim, ranges[0].start, suppress, out);
657    }
658
659    // Fast path: contiguous field range with start > 1 (e.g., cut -f2-4)
660    if !complement
661        && ranges.len() == 1
662        && ranges[0].start > 1
663        && ranges[0].end < usize::MAX
664        && output_delim.len() == 1
665        && output_delim[0] == delim
666    {
667        return process_fields_mid_range(
668            data,
669            delim,
670            line_delim,
671            ranges[0].start,
672            ranges[0].end,
673            suppress,
674            out,
675        );
676    }
677
678    // Fast path: multi-field non-contiguous extraction (e.g., cut -f1,3,5)
679    // Uses delimiter position caching: find all delimiter positions per line,
680    // then directly index into them for each selected field.
681    // This is faster than the general extract_fields_to_buf which re-checks
682    // is_selected() for every field encountered.
683    if !complement
684        && ranges.len() > 1
685        && ranges.last().map_or(false, |r| r.end < usize::MAX)
686        && output_delim.len() == 1
687        && output_delim[0] == delim
688        && delim != line_delim
689    {
690        return process_fields_multi_select(data, delim, line_delim, ranges, suppress, out);
691    }
692
693    // General field extraction
694    let max_field = if complement {
695        usize::MAX
696    } else {
697        ranges.last().map(|r| r.end).unwrap_or(0)
698    };
699    let field_mask = compute_field_mask(ranges, complement);
700
701    if data.len() >= PARALLEL_THRESHOLD {
702        let chunks = split_for_scope(data, line_delim);
703        let n = chunks.len();
704        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
705        rayon::scope(|s| {
706            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
707                s.spawn(move |_| {
708                    result.reserve(chunk.len());
709                    process_fields_chunk(
710                        chunk,
711                        delim,
712                        ranges,
713                        output_delim,
714                        suppress,
715                        max_field,
716                        field_mask,
717                        line_delim,
718                        complement,
719                        result,
720                    );
721                });
722            }
723        });
724        let slices: Vec<IoSlice> = results
725            .iter()
726            .filter(|r| !r.is_empty())
727            .map(|r| IoSlice::new(r))
728            .collect();
729        write_ioslices(out, &slices)?;
730    } else {
731        let mut buf = Vec::with_capacity(data.len());
732        process_fields_chunk(
733            data,
734            delim,
735            ranges,
736            output_delim,
737            suppress,
738            max_field,
739            field_mask,
740            line_delim,
741            complement,
742            &mut buf,
743        );
744        if !buf.is_empty() {
745            out.write_all(&buf)?;
746        }
747    }
748    Ok(())
749}
750
751/// Process a chunk of data for general field extraction.
752/// When `delim != line_delim`, uses a single-pass memchr2_iter scan to find both
753/// delimiters and line terminators in one SIMD pass, eliminating per-line memchr_iter
754/// setup overhead. When `delim == line_delim`, falls back to the two-level approach.
755fn process_fields_chunk(
756    data: &[u8],
757    delim: u8,
758    ranges: &[Range],
759    output_delim: &[u8],
760    suppress: bool,
761    max_field: usize,
762    field_mask: u64,
763    line_delim: u8,
764    complement: bool,
765    buf: &mut Vec<u8>,
766) {
767    // When delim != line_delim and max_field is bounded, use two-level approach:
768    // outer memchr for newlines, inner memchr_iter for delimiters with early exit.
769    // This avoids scanning past max_field on each line (significant for lines with
770    // many columns but small field selection like -f1,3,5 on 20-column CSV).
771    // For complement or unbounded ranges, use single-pass memchr2_iter which
772    // needs to process all delimiters anyway.
773    if delim != line_delim && max_field < usize::MAX && !complement {
774        buf.reserve(data.len());
775        let mut start = 0;
776        for end_pos in memchr_iter(line_delim, data) {
777            let line = &data[start..end_pos];
778            extract_fields_to_buf(
779                line,
780                delim,
781                ranges,
782                output_delim,
783                suppress,
784                max_field,
785                field_mask,
786                line_delim,
787                buf,
788                complement,
789            );
790            start = end_pos + 1;
791        }
792        if start < data.len() {
793            extract_fields_to_buf(
794                &data[start..],
795                delim,
796                ranges,
797                output_delim,
798                suppress,
799                max_field,
800                field_mask,
801                line_delim,
802                buf,
803                complement,
804            );
805        }
806        return;
807    }
808
809    // Single-pass path for complement or unbounded ranges: memchr2_iter for both
810    // delimiter and line_delim in one SIMD scan.
811    // Uses raw pointer arithmetic to eliminate bounds checking in the hot loop.
812    if delim != line_delim {
813        buf.reserve(data.len());
814
815        let data_len = data.len();
816        let base = data.as_ptr();
817        let mut line_start: usize = 0;
818        let mut field_start: usize = 0;
819        let mut field_num: usize = 1;
820        let mut first_output = true;
821        let mut has_delim = false;
822
823        for pos in memchr::memchr2_iter(delim, line_delim, data) {
824            let byte = unsafe { *base.add(pos) };
825
826            if byte == line_delim {
827                // End of line: flush final field and emit line delimiter
828                if (field_num <= max_field || complement)
829                    && has_delim
830                    && is_selected(field_num, field_mask, ranges, complement)
831                {
832                    if !first_output {
833                        unsafe { buf_extend(buf, output_delim) };
834                    }
835                    unsafe {
836                        buf_extend(
837                            buf,
838                            std::slice::from_raw_parts(base.add(field_start), pos - field_start),
839                        )
840                    };
841                    first_output = false;
842                }
843
844                if !first_output {
845                    unsafe { buf_push(buf, line_delim) };
846                } else if !has_delim {
847                    if !suppress {
848                        unsafe {
849                            buf_extend(
850                                buf,
851                                std::slice::from_raw_parts(base.add(line_start), pos - line_start),
852                            );
853                            buf_push(buf, line_delim);
854                        }
855                    }
856                } else {
857                    unsafe { buf_push(buf, line_delim) };
858                }
859
860                // Reset state for next line
861                line_start = pos + 1;
862                field_start = pos + 1;
863                field_num = 1;
864                first_output = true;
865                has_delim = false;
866            } else {
867                // Field delimiter hit
868                has_delim = true;
869
870                if is_selected(field_num, field_mask, ranges, complement) {
871                    if !first_output {
872                        unsafe { buf_extend(buf, output_delim) };
873                    }
874                    unsafe {
875                        buf_extend(
876                            buf,
877                            std::slice::from_raw_parts(base.add(field_start), pos - field_start),
878                        )
879                    };
880                    first_output = false;
881                }
882
883                field_num += 1;
884                field_start = pos + 1;
885            }
886        }
887
888        // Handle last line without trailing line_delim
889        if line_start < data_len {
890            if line_start < data_len {
891                if (field_num <= max_field || complement)
892                    && has_delim
893                    && is_selected(field_num, field_mask, ranges, complement)
894                {
895                    if !first_output {
896                        unsafe { buf_extend(buf, output_delim) };
897                    }
898                    unsafe {
899                        buf_extend(
900                            buf,
901                            std::slice::from_raw_parts(
902                                base.add(field_start),
903                                data_len - field_start,
904                            ),
905                        )
906                    };
907                    first_output = false;
908                }
909
910                if !first_output {
911                    unsafe { buf_push(buf, line_delim) };
912                } else if !has_delim {
913                    if !suppress {
914                        unsafe {
915                            buf_extend(
916                                buf,
917                                std::slice::from_raw_parts(
918                                    base.add(line_start),
919                                    data_len - line_start,
920                                ),
921                            );
922                            buf_push(buf, line_delim);
923                        }
924                    }
925                } else {
926                    unsafe { buf_push(buf, line_delim) };
927                }
928            }
929        }
930
931        return;
932    }
933
934    // Fallback: when delim == line_delim, use the two-level scan approach
935    let mut start = 0;
936    for end_pos in memchr_iter(line_delim, data) {
937        let line = &data[start..end_pos];
938        extract_fields_to_buf(
939            line,
940            delim,
941            ranges,
942            output_delim,
943            suppress,
944            max_field,
945            field_mask,
946            line_delim,
947            buf,
948            complement,
949        );
950        start = end_pos + 1;
951    }
952    if start < data.len() {
953        extract_fields_to_buf(
954            &data[start..],
955            delim,
956            ranges,
957            output_delim,
958            suppress,
959            max_field,
960            field_mask,
961            line_delim,
962            buf,
963            complement,
964        );
965    }
966}
967
968// ── Ultra-fast single field extraction ───────────────────────────────────
969
970/// Specialized path for extracting exactly one field (e.g., `cut -f5`).
971/// Uses combined memchr2_iter SIMD scan when delim != line_delim for a single
972/// pass over the data (vs. nested loops: outer newline scan + inner delim scan).
973fn process_single_field(
974    data: &[u8],
975    delim: u8,
976    line_delim: u8,
977    target: usize,
978    suppress: bool,
979    out: &mut impl Write,
980) -> io::Result<()> {
981    let target_idx = target - 1;
982
983    // For single-field extraction, parallelize at 2MB+ to match PARALLEL_THRESHOLD.
984    // The 10MB benchmark regressed from ~7x to ~5.3x when this was set to 32MB.
985    const FIELD_PARALLEL_MIN: usize = 2 * 1024 * 1024;
986
987    if delim != line_delim {
988        // Field 1 fast path: memchr2 single-pass scan.
989        // For field 1, the first delimiter IS the field boundary. Lines without
990        // delimiter are passed through unchanged.
991        if target_idx == 0 && !suppress {
992            if data.len() >= FIELD_PARALLEL_MIN {
993                return single_field1_parallel(data, delim, line_delim, out);
994            }
995            // Sequential: scan with memchr2 into buffer, single write_all.
996            // Faster than writev/IoSlice for moderate data because it produces
997            // one contiguous buffer → one write syscall, and avoids IoSlice
998            // allocation overhead for high-delimiter-density data.
999            let mut buf = Vec::with_capacity(data.len());
1000            single_field1_to_buf(data, delim, line_delim, &mut buf);
1001            if !buf.is_empty() {
1002                out.write_all(&buf)?;
1003            }
1004            return Ok(());
1005        }
1006
1007        // Two-level approach for field N: outer newline scan + inner delim scan
1008        // with early exit at target_idx. Faster than memchr2 single-pass because
1009        // we only scan delimiters up to target_idx per line (not all of them).
1010        if data.len() >= FIELD_PARALLEL_MIN {
1011            let chunks = split_for_scope(data, line_delim);
1012            let n = chunks.len();
1013            let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1014            rayon::scope(|s| {
1015                for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1016                    s.spawn(move |_| {
1017                        result.reserve(chunk.len() / 2);
1018                        process_single_field_chunk(
1019                            chunk, delim, target_idx, line_delim, suppress, result,
1020                        );
1021                    });
1022                }
1023            });
1024            let slices: Vec<IoSlice> = results
1025                .iter()
1026                .filter(|r| !r.is_empty())
1027                .map(|r| IoSlice::new(r))
1028                .collect();
1029            write_ioslices(out, &slices)?;
1030        } else {
1031            let mut buf = Vec::with_capacity(data.len().min(4 * 1024 * 1024));
1032            process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
1033            if !buf.is_empty() {
1034                out.write_all(&buf)?;
1035            }
1036        }
1037        return Ok(());
1038    }
1039
1040    // Fallback for delim == line_delim: nested loop approach
1041    if data.len() >= FIELD_PARALLEL_MIN {
1042        let chunks = split_for_scope(data, line_delim);
1043        let n = chunks.len();
1044        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1045        rayon::scope(|s| {
1046            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1047                s.spawn(move |_| {
1048                    result.reserve(chunk.len() / 4);
1049                    process_single_field_chunk(
1050                        chunk, delim, target_idx, line_delim, suppress, result,
1051                    );
1052                });
1053            }
1054        });
1055        let slices: Vec<IoSlice> = results
1056            .iter()
1057            .filter(|r| !r.is_empty())
1058            .map(|r| IoSlice::new(r))
1059            .collect();
1060        write_ioslices(out, &slices)?;
1061    } else {
1062        let mut buf = Vec::with_capacity(data.len() / 4);
1063        process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
1064        if !buf.is_empty() {
1065            out.write_all(&buf)?;
1066        }
1067    }
1068    Ok(())
1069}
1070
1071/// Complement range extraction: skip fields start..=end, output rest (e.g., --complement -f3-5).
1072/// For each line: output fields 1..start-1, then fields end+1..EOF, skipping fields start..end.
1073fn process_complement_range(
1074    data: &[u8],
1075    delim: u8,
1076    line_delim: u8,
1077    skip_start: usize,
1078    skip_end: usize,
1079    suppress: bool,
1080    out: &mut impl Write,
1081) -> io::Result<()> {
1082    if data.len() >= PARALLEL_THRESHOLD {
1083        let chunks = split_for_scope(data, line_delim);
1084        let n = chunks.len();
1085        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1086        rayon::scope(|s| {
1087            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1088                s.spawn(move |_| {
1089                    result.reserve(chunk.len());
1090                    complement_range_chunk(
1091                        chunk, delim, skip_start, skip_end, line_delim, suppress, result,
1092                    );
1093                });
1094            }
1095        });
1096        let slices: Vec<IoSlice> = results
1097            .iter()
1098            .filter(|r| !r.is_empty())
1099            .map(|r| IoSlice::new(r))
1100            .collect();
1101        write_ioslices(out, &slices)?;
1102    } else {
1103        let mut buf = Vec::with_capacity(data.len());
1104        complement_range_chunk(
1105            data, delim, skip_start, skip_end, line_delim, suppress, &mut buf,
1106        );
1107        if !buf.is_empty() {
1108            out.write_all(&buf)?;
1109        }
1110    }
1111    Ok(())
1112}
1113
1114/// Process a chunk for complement range extraction.
1115fn complement_range_chunk(
1116    data: &[u8],
1117    delim: u8,
1118    skip_start: usize,
1119    skip_end: usize,
1120    line_delim: u8,
1121    suppress: bool,
1122    buf: &mut Vec<u8>,
1123) {
1124    // Pre-reserve entire chunk capacity to eliminate per-line reserve overhead.
1125    buf.reserve(data.len());
1126    let mut start = 0;
1127    for end_pos in memchr_iter(line_delim, data) {
1128        let line = &data[start..end_pos];
1129        complement_range_line(line, delim, skip_start, skip_end, line_delim, suppress, buf);
1130        start = end_pos + 1;
1131    }
1132    if start < data.len() {
1133        complement_range_line(
1134            &data[start..],
1135            delim,
1136            skip_start,
1137            skip_end,
1138            line_delim,
1139            suppress,
1140            buf,
1141        );
1142    }
1143}
1144
1145/// Extract all fields except skip_start..=skip_end from one line.
1146/// Outputs fields 1..skip_start-1, then fields skip_end+1..EOF.
1147///
1148/// Optimized: only scans for enough delimiters to find the skip region boundaries.
1149/// For `--complement -f3-5` with 20 fields, this finds delimiter 2 and 5, then
1150/// does a single copy of prefix + suffix, avoiding scanning past field 5.
1151#[inline(always)]
1152fn complement_range_line(
1153    line: &[u8],
1154    delim: u8,
1155    skip_start: usize,
1156    skip_end: usize,
1157    line_delim: u8,
1158    suppress: bool,
1159    buf: &mut Vec<u8>,
1160) {
1161    let len = line.len();
1162    if len == 0 {
1163        if !suppress {
1164            unsafe { buf_push(buf, line_delim) };
1165        }
1166        return;
1167    }
1168
1169    // Note: no per-line buf.reserve — complement_range_chunk already reserves data.len()
1170    let base = line.as_ptr();
1171
1172    // 1-based field numbers. To skip fields skip_start..=skip_end:
1173    // - prefix_end = position of (skip_start-1)th delimiter (exclusive; end of prefix fields)
1174    // - suffix_start = position after skip_end-th delimiter (inclusive; start of suffix fields)
1175    //
1176    // Find the first (skip_start - 1) delimiters to locate prefix_end,
1177    // then the next (skip_end - skip_start + 1) delimiters to locate suffix_start.
1178
1179    let need_prefix_delims = skip_start - 1; // number of delimiters before the skip region
1180    let need_skip_delims = skip_end - skip_start + 1; // delimiters within the skip region
1181    let total_need = need_prefix_delims + need_skip_delims;
1182
1183    // Find delimiter positions up to total_need
1184    let mut delim_count: usize = 0;
1185    let mut prefix_end_pos: usize = usize::MAX; // byte position of (skip_start-1)th delim
1186    let mut suffix_start_pos: usize = usize::MAX; // byte position after skip_end-th delim
1187
1188    for pos in memchr_iter(delim, line) {
1189        delim_count += 1;
1190        if delim_count == need_prefix_delims {
1191            prefix_end_pos = pos;
1192        }
1193        if delim_count == total_need {
1194            suffix_start_pos = pos + 1;
1195            break;
1196        }
1197    }
1198
1199    if delim_count == 0 {
1200        // No delimiter at all
1201        if !suppress {
1202            unsafe {
1203                buf_extend(buf, line);
1204                buf_push(buf, line_delim);
1205            }
1206        }
1207        return;
1208    }
1209
1210    // Case analysis:
1211    // 1. Not enough delims to reach skip_start: all fields are before skip region, output all
1212    // 2. Enough to reach skip_start but not skip_end: prefix + no suffix
1213    // 3. Enough to reach skip_end: prefix + delim + suffix
1214
1215    if delim_count < need_prefix_delims {
1216        // Not enough fields to reach skip region — output entire line
1217        unsafe {
1218            buf_extend(buf, line);
1219            buf_push(buf, line_delim);
1220        }
1221        return;
1222    }
1223
1224    let has_prefix = need_prefix_delims > 0;
1225    let has_suffix = suffix_start_pos != usize::MAX && suffix_start_pos < len;
1226
1227    if has_prefix && has_suffix {
1228        // Output: prefix (up to prefix_end_pos) + delim + suffix (from suffix_start_pos)
1229        unsafe {
1230            buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
1231            buf_push(buf, delim);
1232            buf_extend(
1233                buf,
1234                std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
1235            );
1236            buf_push(buf, line_delim);
1237        }
1238    } else if has_prefix {
1239        // Only prefix, no suffix (skip region extends to end of line)
1240        unsafe {
1241            buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
1242            buf_push(buf, line_delim);
1243        }
1244    } else if has_suffix {
1245        // No prefix (skip_start == 1), only suffix
1246        unsafe {
1247            buf_extend(
1248                buf,
1249                std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
1250            );
1251            buf_push(buf, line_delim);
1252        }
1253    } else {
1254        // All fields skipped
1255        unsafe { buf_push(buf, line_delim) };
1256    }
1257}
1258
1259/// Complement single-field extraction: skip one field, output rest unchanged.
1260fn process_complement_single_field(
1261    data: &[u8],
1262    delim: u8,
1263    line_delim: u8,
1264    skip_field: usize,
1265    suppress: bool,
1266    out: &mut impl Write,
1267) -> io::Result<()> {
1268    let skip_idx = skip_field - 1;
1269
1270    if data.len() >= PARALLEL_THRESHOLD {
1271        let chunks = split_for_scope(data, line_delim);
1272        let n = chunks.len();
1273        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1274        rayon::scope(|s| {
1275            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1276                s.spawn(move |_| {
1277                    result.reserve(chunk.len());
1278                    complement_single_field_chunk(
1279                        chunk, delim, skip_idx, line_delim, suppress, result,
1280                    );
1281                });
1282            }
1283        });
1284        let slices: Vec<IoSlice> = results
1285            .iter()
1286            .filter(|r| !r.is_empty())
1287            .map(|r| IoSlice::new(r))
1288            .collect();
1289        write_ioslices(out, &slices)?;
1290    } else {
1291        let mut buf = Vec::with_capacity(data.len());
1292        complement_single_field_chunk(data, delim, skip_idx, line_delim, suppress, &mut buf);
1293        if !buf.is_empty() {
1294            out.write_all(&buf)?;
1295        }
1296    }
1297    Ok(())
1298}
1299
1300/// Process a chunk for complement single-field extraction using memchr2 single-pass.
1301/// Scans for both delimiter and line_delim in one SIMD pass, tracking delimiter count
1302/// per line. When the skip field's bounding delimiters are found, copies prefix + suffix.
1303/// This eliminates the per-line memchr_iter setup overhead and reduces from two SIMD
1304/// passes (outer newline scan + inner delimiter scan) to one.
1305fn complement_single_field_chunk(
1306    data: &[u8],
1307    delim: u8,
1308    skip_idx: usize,
1309    line_delim: u8,
1310    suppress: bool,
1311    buf: &mut Vec<u8>,
1312) {
1313    // When delim == line_delim, fall back to per-line approach
1314    if delim == line_delim {
1315        buf.reserve(data.len());
1316        let mut start = 0;
1317        for end_pos in memchr_iter(line_delim, data) {
1318            let line = &data[start..end_pos];
1319            complement_single_field_line(line, delim, skip_idx, line_delim, suppress, buf);
1320            start = end_pos + 1;
1321        }
1322        if start < data.len() {
1323            complement_single_field_line(
1324                &data[start..],
1325                delim,
1326                skip_idx,
1327                line_delim,
1328                suppress,
1329                buf,
1330            );
1331        }
1332        return;
1333    }
1334
1335    buf.reserve(data.len());
1336    let base = data.as_ptr();
1337    let data_len = data.len();
1338    let need_before = skip_idx; // delimiters before skip field
1339    let need_total = skip_idx + 1; // delimiters to find end of skip field
1340
1341    // Per-line state
1342    let mut line_start: usize = 0;
1343    let mut delim_count: usize = 0;
1344    let mut skip_start_pos: usize = 0;
1345    let mut skip_end_pos: usize = 0;
1346    let mut found_start = need_before == 0; // skip_idx==0 means skip starts at line start
1347    let mut found_end = false;
1348
1349    for pos in memchr::memchr2_iter(delim, line_delim, data) {
1350        let byte = unsafe { *base.add(pos) };
1351
1352        if byte == line_delim {
1353            // End of line: emit based on what we found
1354            if delim_count == 0 {
1355                // No delimiter in line
1356                if !suppress {
1357                    unsafe {
1358                        buf_extend(
1359                            buf,
1360                            std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1361                        );
1362                        buf_push(buf, line_delim);
1363                    }
1364                }
1365            } else if !found_start || delim_count < need_before {
1366                // Not enough delimiters to reach skip field — output entire line
1367                unsafe {
1368                    buf_extend(
1369                        buf,
1370                        std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1371                    );
1372                    buf_push(buf, line_delim);
1373                }
1374            } else {
1375                let has_prefix = skip_idx > 0;
1376                let has_suffix = found_end && skip_end_pos < pos;
1377
1378                if has_prefix && has_suffix {
1379                    unsafe {
1380                        buf_extend(
1381                            buf,
1382                            std::slice::from_raw_parts(
1383                                base.add(line_start),
1384                                skip_start_pos - 1 - line_start,
1385                            ),
1386                        );
1387                        buf_push(buf, delim);
1388                        buf_extend(
1389                            buf,
1390                            std::slice::from_raw_parts(
1391                                base.add(skip_end_pos + 1),
1392                                pos - skip_end_pos - 1,
1393                            ),
1394                        );
1395                        buf_push(buf, line_delim);
1396                    }
1397                } else if has_prefix {
1398                    unsafe {
1399                        buf_extend(
1400                            buf,
1401                            std::slice::from_raw_parts(
1402                                base.add(line_start),
1403                                skip_start_pos - 1 - line_start,
1404                            ),
1405                        );
1406                        buf_push(buf, line_delim);
1407                    }
1408                } else if has_suffix {
1409                    unsafe {
1410                        buf_extend(
1411                            buf,
1412                            std::slice::from_raw_parts(
1413                                base.add(skip_end_pos + 1),
1414                                pos - skip_end_pos - 1,
1415                            ),
1416                        );
1417                        buf_push(buf, line_delim);
1418                    }
1419                } else {
1420                    unsafe { buf_push(buf, line_delim) };
1421                }
1422            }
1423
1424            // Reset for next line
1425            line_start = pos + 1;
1426            delim_count = 0;
1427            skip_start_pos = 0;
1428            skip_end_pos = 0;
1429            found_start = need_before == 0;
1430            found_end = false;
1431        } else {
1432            // Delimiter found
1433            delim_count += 1;
1434            if delim_count == need_before {
1435                skip_start_pos = pos + 1;
1436                found_start = true;
1437            }
1438            if delim_count == need_total {
1439                skip_end_pos = pos;
1440                found_end = true;
1441            }
1442        }
1443    }
1444
1445    // Handle last line without trailing line_delim
1446    if line_start < data_len {
1447        let pos = data_len;
1448        if delim_count == 0 {
1449            if !suppress {
1450                unsafe {
1451                    buf_extend(
1452                        buf,
1453                        std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1454                    );
1455                    buf_push(buf, line_delim);
1456                }
1457            }
1458        } else if !found_start || delim_count < need_before {
1459            unsafe {
1460                buf_extend(
1461                    buf,
1462                    std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1463                );
1464                buf_push(buf, line_delim);
1465            }
1466        } else {
1467            let has_prefix = skip_idx > 0;
1468            let has_suffix = found_end && skip_end_pos < pos;
1469
1470            if has_prefix && has_suffix {
1471                unsafe {
1472                    buf_extend(
1473                        buf,
1474                        std::slice::from_raw_parts(
1475                            base.add(line_start),
1476                            skip_start_pos - 1 - line_start,
1477                        ),
1478                    );
1479                    buf_push(buf, delim);
1480                    buf_extend(
1481                        buf,
1482                        std::slice::from_raw_parts(
1483                            base.add(skip_end_pos + 1),
1484                            pos - skip_end_pos - 1,
1485                        ),
1486                    );
1487                    buf_push(buf, line_delim);
1488                }
1489            } else if has_prefix {
1490                unsafe {
1491                    buf_extend(
1492                        buf,
1493                        std::slice::from_raw_parts(
1494                            base.add(line_start),
1495                            skip_start_pos - 1 - line_start,
1496                        ),
1497                    );
1498                    buf_push(buf, line_delim);
1499                }
1500            } else if has_suffix {
1501                unsafe {
1502                    buf_extend(
1503                        buf,
1504                        std::slice::from_raw_parts(
1505                            base.add(skip_end_pos + 1),
1506                            pos - skip_end_pos - 1,
1507                        ),
1508                    );
1509                    buf_push(buf, line_delim);
1510                }
1511            } else {
1512                unsafe { buf_push(buf, line_delim) };
1513            }
1514        }
1515    }
1516}
1517
1518/// Fallback per-line complement single-field extraction (for delim == line_delim).
1519#[inline(always)]
1520fn complement_single_field_line(
1521    line: &[u8],
1522    delim: u8,
1523    skip_idx: usize,
1524    line_delim: u8,
1525    suppress: bool,
1526    buf: &mut Vec<u8>,
1527) {
1528    let len = line.len();
1529    if len == 0 {
1530        if !suppress {
1531            unsafe { buf_push(buf, line_delim) };
1532        }
1533        return;
1534    }
1535
1536    let base = line.as_ptr();
1537    let need_before = skip_idx;
1538    let need_total = skip_idx + 1;
1539
1540    let mut delim_count: usize = 0;
1541    let mut skip_start_pos: usize = 0;
1542    let mut skip_end_pos: usize = len;
1543    let mut found_end = false;
1544
1545    for pos in memchr_iter(delim, line) {
1546        delim_count += 1;
1547        if delim_count == need_before {
1548            skip_start_pos = pos + 1;
1549        }
1550        if delim_count == need_total {
1551            skip_end_pos = pos;
1552            found_end = true;
1553            break;
1554        }
1555    }
1556
1557    if delim_count == 0 {
1558        if !suppress {
1559            unsafe {
1560                buf_extend(buf, line);
1561                buf_push(buf, line_delim);
1562            }
1563        }
1564        return;
1565    }
1566
1567    if delim_count < need_before {
1568        unsafe {
1569            buf_extend(buf, line);
1570            buf_push(buf, line_delim);
1571        }
1572        return;
1573    }
1574
1575    let has_prefix = skip_idx > 0 && skip_start_pos > 0;
1576    let has_suffix = found_end && skip_end_pos < len;
1577
1578    if has_prefix && has_suffix {
1579        unsafe {
1580            buf_extend(buf, std::slice::from_raw_parts(base, skip_start_pos - 1));
1581            buf_push(buf, delim);
1582            buf_extend(
1583                buf,
1584                std::slice::from_raw_parts(base.add(skip_end_pos + 1), len - skip_end_pos - 1),
1585            );
1586            buf_push(buf, line_delim);
1587        }
1588    } else if has_prefix {
1589        unsafe {
1590            buf_extend(buf, std::slice::from_raw_parts(base, skip_start_pos - 1));
1591            buf_push(buf, line_delim);
1592        }
1593    } else if has_suffix {
1594        unsafe {
1595            buf_extend(
1596                buf,
1597                std::slice::from_raw_parts(base.add(skip_end_pos + 1), len - skip_end_pos - 1),
1598            );
1599            buf_push(buf, line_delim);
1600        }
1601    } else {
1602        unsafe { buf_push(buf, line_delim) };
1603    }
1604}
1605
1606/// Contiguous from-start field range extraction (e.g., `cut -f1-5`).
1607/// Zero-copy for the non-parallel path: identifies the truncation point per line
1608/// and writes contiguous runs directly from the source data.
1609fn process_fields_prefix(
1610    data: &[u8],
1611    delim: u8,
1612    line_delim: u8,
1613    last_field: usize,
1614    suppress: bool,
1615    out: &mut impl Write,
1616) -> io::Result<()> {
1617    if data.len() >= PARALLEL_THRESHOLD {
1618        let chunks = split_for_scope(data, line_delim);
1619        let n = chunks.len();
1620        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1621        rayon::scope(|s| {
1622            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1623                s.spawn(move |_| {
1624                    result.reserve(chunk.len());
1625                    fields_prefix_chunk(chunk, delim, line_delim, last_field, suppress, result);
1626                });
1627            }
1628        });
1629        let slices: Vec<IoSlice> = results
1630            .iter()
1631            .filter(|r| !r.is_empty())
1632            .map(|r| IoSlice::new(r))
1633            .collect();
1634        write_ioslices(out, &slices)?;
1635    } else if !suppress {
1636        // Zero-copy fast path: scan for truncation points, write runs from source.
1637        // When suppress is false, every line is output (with or without delimiter).
1638        // Most lines have enough fields, so the output is often identical to input.
1639        fields_prefix_zerocopy(data, delim, line_delim, last_field, out)?;
1640    } else {
1641        let mut buf = Vec::with_capacity(data.len());
1642        fields_prefix_chunk(data, delim, line_delim, last_field, suppress, &mut buf);
1643        if !buf.is_empty() {
1644            out.write_all(&buf)?;
1645        }
1646    }
1647    Ok(())
1648}
1649
1650/// Zero-copy field-prefix extraction using writev: builds IoSlice entries pointing
1651/// directly into the source data, flushing in MAX_IOV-sized batches.
1652/// For lines where the Nth delimiter exists, we truncate at that point.
1653/// For lines with fewer fields, we output them unchanged (contiguous run).
1654/// Lines without any delimiter are output unchanged (suppress=false assumed).
1655#[inline]
1656fn fields_prefix_zerocopy(
1657    data: &[u8],
1658    delim: u8,
1659    line_delim: u8,
1660    last_field: usize,
1661    out: &mut impl Write,
1662) -> io::Result<()> {
1663    let newline_buf: [u8; 1] = [line_delim];
1664    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
1665    let mut start = 0;
1666    let mut run_start: usize = 0;
1667
1668    for end_pos in memchr_iter(line_delim, data) {
1669        let line = &data[start..end_pos];
1670        let mut field_count = 1;
1671        let mut truncate_at: Option<usize> = None;
1672        for dpos in memchr_iter(delim, line) {
1673            if field_count >= last_field {
1674                truncate_at = Some(start + dpos);
1675                break;
1676            }
1677            field_count += 1;
1678        }
1679
1680        if let Some(trunc_pos) = truncate_at {
1681            if run_start < start {
1682                iov.push(IoSlice::new(&data[run_start..start]));
1683            }
1684            iov.push(IoSlice::new(&data[start..trunc_pos]));
1685            iov.push(IoSlice::new(&newline_buf));
1686            run_start = end_pos + 1;
1687
1688            if iov.len() >= MAX_IOV - 2 {
1689                write_ioslices(out, &iov)?;
1690                iov.clear();
1691            }
1692        }
1693        start = end_pos + 1;
1694    }
1695    // Handle last line without terminator
1696    if start < data.len() {
1697        let line = &data[start..];
1698        let mut field_count = 1;
1699        let mut truncate_at: Option<usize> = None;
1700        for dpos in memchr_iter(delim, line) {
1701            if field_count >= last_field {
1702                truncate_at = Some(start + dpos);
1703                break;
1704            }
1705            field_count += 1;
1706        }
1707        if let Some(trunc_pos) = truncate_at {
1708            if run_start < start {
1709                iov.push(IoSlice::new(&data[run_start..start]));
1710            }
1711            iov.push(IoSlice::new(&data[start..trunc_pos]));
1712            iov.push(IoSlice::new(&newline_buf));
1713            if !iov.is_empty() {
1714                write_ioslices(out, &iov)?;
1715            }
1716            return Ok(());
1717        }
1718    }
1719    // Flush remaining contiguous run
1720    if run_start < data.len() {
1721        iov.push(IoSlice::new(&data[run_start..]));
1722        if !data.is_empty() && *data.last().unwrap() != line_delim {
1723            iov.push(IoSlice::new(&newline_buf));
1724        }
1725    }
1726    if !iov.is_empty() {
1727        write_ioslices(out, &iov)?;
1728    }
1729    Ok(())
1730}
1731
1732/// Process a chunk for contiguous from-start field range extraction.
1733fn fields_prefix_chunk(
1734    data: &[u8],
1735    delim: u8,
1736    line_delim: u8,
1737    last_field: usize,
1738    suppress: bool,
1739    buf: &mut Vec<u8>,
1740) {
1741    buf.reserve(data.len());
1742    let mut start = 0;
1743    for end_pos in memchr_iter(line_delim, data) {
1744        let line = &data[start..end_pos];
1745        fields_prefix_line(line, delim, line_delim, last_field, suppress, buf);
1746        start = end_pos + 1;
1747    }
1748    if start < data.len() {
1749        fields_prefix_line(&data[start..], delim, line_delim, last_field, suppress, buf);
1750    }
1751}
1752
1753/// Extract first N fields from one line (contiguous from-start range).
1754/// Uses memchr SIMD for delimiter scanning on all line sizes.
1755#[inline(always)]
1756fn fields_prefix_line(
1757    line: &[u8],
1758    delim: u8,
1759    line_delim: u8,
1760    last_field: usize,
1761    suppress: bool,
1762    buf: &mut Vec<u8>,
1763) {
1764    let len = line.len();
1765    if len == 0 {
1766        if !suppress {
1767            unsafe { buf_push(buf, line_delim) };
1768        }
1769        return;
1770    }
1771
1772    // Note: no per-line buf.reserve — fields_prefix_chunk already reserves data.len()
1773    let base = line.as_ptr();
1774
1775    let mut field_count = 1usize;
1776    let mut has_delim = false;
1777
1778    for pos in memchr_iter(delim, line) {
1779        has_delim = true;
1780        if field_count >= last_field {
1781            unsafe {
1782                buf_extend(buf, std::slice::from_raw_parts(base, pos));
1783                buf_push(buf, line_delim);
1784            }
1785            return;
1786        }
1787        field_count += 1;
1788    }
1789
1790    if !has_delim {
1791        if !suppress {
1792            unsafe {
1793                buf_extend(buf, line);
1794                buf_push(buf, line_delim);
1795            }
1796        }
1797        return;
1798    }
1799
1800    unsafe {
1801        buf_extend(buf, line);
1802        buf_push(buf, line_delim);
1803    }
1804}
1805
1806/// Open-ended field suffix extraction (e.g., `cut -f3-`).
1807fn process_fields_suffix(
1808    data: &[u8],
1809    delim: u8,
1810    line_delim: u8,
1811    start_field: usize,
1812    suppress: bool,
1813    out: &mut impl Write,
1814) -> io::Result<()> {
1815    if data.len() >= PARALLEL_THRESHOLD {
1816        let chunks = split_for_scope(data, line_delim);
1817        let n = chunks.len();
1818        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1819        rayon::scope(|s| {
1820            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1821                s.spawn(move |_| {
1822                    result.reserve(chunk.len());
1823                    fields_suffix_chunk(chunk, delim, line_delim, start_field, suppress, result);
1824                });
1825            }
1826        });
1827        let slices: Vec<IoSlice> = results
1828            .iter()
1829            .filter(|r| !r.is_empty())
1830            .map(|r| IoSlice::new(r))
1831            .collect();
1832        write_ioslices(out, &slices)?;
1833    } else {
1834        let mut buf = Vec::with_capacity(data.len());
1835        fields_suffix_chunk(data, delim, line_delim, start_field, suppress, &mut buf);
1836        if !buf.is_empty() {
1837            out.write_all(&buf)?;
1838        }
1839    }
1840    Ok(())
1841}
1842
1843/// Process a chunk for open-ended field suffix extraction.
1844fn fields_suffix_chunk(
1845    data: &[u8],
1846    delim: u8,
1847    line_delim: u8,
1848    start_field: usize,
1849    suppress: bool,
1850    buf: &mut Vec<u8>,
1851) {
1852    buf.reserve(data.len());
1853    let mut start = 0;
1854    for end_pos in memchr_iter(line_delim, data) {
1855        let line = &data[start..end_pos];
1856        fields_suffix_line(line, delim, line_delim, start_field, suppress, buf);
1857        start = end_pos + 1;
1858    }
1859    if start < data.len() {
1860        fields_suffix_line(
1861            &data[start..],
1862            delim,
1863            line_delim,
1864            start_field,
1865            suppress,
1866            buf,
1867        );
1868    }
1869}
1870
1871/// Extract fields from start_field to end from one line.
1872/// Uses memchr SIMD for delimiter scanning on all line sizes.
1873#[inline(always)]
1874fn fields_suffix_line(
1875    line: &[u8],
1876    delim: u8,
1877    line_delim: u8,
1878    start_field: usize,
1879    suppress: bool,
1880    buf: &mut Vec<u8>,
1881) {
1882    let len = line.len();
1883    if len == 0 {
1884        if !suppress {
1885            unsafe { buf_push(buf, line_delim) };
1886        }
1887        return;
1888    }
1889
1890    // Note: no per-line buf.reserve — fields_suffix_chunk already reserves data.len()
1891    let base = line.as_ptr();
1892
1893    let skip_delims = start_field - 1;
1894    let mut delim_count = 0usize;
1895    let mut has_delim = false;
1896
1897    for pos in memchr_iter(delim, line) {
1898        has_delim = true;
1899        delim_count += 1;
1900        if delim_count >= skip_delims {
1901            unsafe {
1902                buf_extend(
1903                    buf,
1904                    std::slice::from_raw_parts(base.add(pos + 1), len - pos - 1),
1905                );
1906                buf_push(buf, line_delim);
1907            }
1908            return;
1909        }
1910    }
1911
1912    if !has_delim {
1913        if !suppress {
1914            unsafe {
1915                buf_extend(buf, line);
1916                buf_push(buf, line_delim);
1917            }
1918        }
1919        return;
1920    }
1921
1922    // Fewer delimiters than needed
1923    unsafe { buf_push(buf, line_delim) };
1924}
1925
1926/// Contiguous mid-range field extraction (e.g., `cut -f2-4`).
1927/// Optimized: skip to start_field using memchr, then output until end_field.
1928fn process_fields_mid_range(
1929    data: &[u8],
1930    delim: u8,
1931    line_delim: u8,
1932    start_field: usize,
1933    end_field: usize,
1934    suppress: bool,
1935    out: &mut impl Write,
1936) -> io::Result<()> {
1937    if data.len() >= PARALLEL_THRESHOLD {
1938        let chunks = split_for_scope(data, line_delim);
1939        let n = chunks.len();
1940        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1941        rayon::scope(|s| {
1942            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1943                s.spawn(move |_| {
1944                    result.reserve(chunk.len());
1945                    fields_mid_range_chunk(
1946                        chunk,
1947                        delim,
1948                        line_delim,
1949                        start_field,
1950                        end_field,
1951                        suppress,
1952                        result,
1953                    );
1954                });
1955            }
1956        });
1957        let slices: Vec<IoSlice> = results
1958            .iter()
1959            .filter(|r| !r.is_empty())
1960            .map(|r| IoSlice::new(r))
1961            .collect();
1962        write_ioslices(out, &slices)?;
1963    } else {
1964        let mut buf = Vec::with_capacity(data.len());
1965        fields_mid_range_chunk(
1966            data,
1967            delim,
1968            line_delim,
1969            start_field,
1970            end_field,
1971            suppress,
1972            &mut buf,
1973        );
1974        if !buf.is_empty() {
1975            out.write_all(&buf)?;
1976        }
1977    }
1978    Ok(())
1979}
1980
1981/// Process a chunk for contiguous mid-range field extraction.
1982/// Single-pass memchr2 scan over the entire chunk, tracking delimiter count
1983/// per line. Avoids the double-scan (outer newline + inner delimiter).
1984fn fields_mid_range_chunk(
1985    data: &[u8],
1986    delim: u8,
1987    line_delim: u8,
1988    start_field: usize,
1989    end_field: usize,
1990    suppress: bool,
1991    buf: &mut Vec<u8>,
1992) {
1993    // When delim == line_delim, fall back to per-line approach
1994    if delim == line_delim {
1995        buf.reserve(data.len());
1996        let mut start = 0;
1997        for end_pos in memchr_iter(line_delim, data) {
1998            let line = &data[start..end_pos];
1999            fields_mid_range_line(
2000                line,
2001                delim,
2002                line_delim,
2003                start_field,
2004                end_field,
2005                suppress,
2006                buf,
2007            );
2008            start = end_pos + 1;
2009        }
2010        if start < data.len() {
2011            fields_mid_range_line(
2012                &data[start..],
2013                delim,
2014                line_delim,
2015                start_field,
2016                end_field,
2017                suppress,
2018                buf,
2019            );
2020        }
2021        return;
2022    }
2023
2024    buf.reserve(data.len());
2025    let base = data.as_ptr();
2026    let skip_before = start_field - 1; // delimiters to skip before range
2027    let target_end_delim = skip_before + (end_field - start_field) + 1;
2028
2029    let mut line_start: usize = 0;
2030    let mut delim_count: usize = 0;
2031    let mut range_start: usize = 0;
2032    let mut has_delim = false;
2033    let mut found_end = false; // true when we found all target fields, skip to newline
2034
2035    for pos in memchr::memchr2_iter(delim, line_delim, data) {
2036        let byte = unsafe { *base.add(pos) };
2037        if byte == line_delim {
2038            // End of line
2039            if found_end {
2040                // Already output this line's range
2041            } else if !has_delim {
2042                // No delimiter on this line
2043                if !suppress {
2044                    unsafe {
2045                        buf_extend(
2046                            buf,
2047                            std::slice::from_raw_parts(base.add(line_start), pos + 1 - line_start),
2048                        );
2049                    }
2050                }
2051            } else if delim_count >= skip_before {
2052                // Have enough fields for start_field; output from range_start to EOL
2053                if skip_before == 0 {
2054                    range_start = line_start;
2055                }
2056                unsafe {
2057                    buf_extend(
2058                        buf,
2059                        std::slice::from_raw_parts(base.add(range_start), pos - range_start),
2060                    );
2061                    buf_push(buf, line_delim);
2062                }
2063            } else {
2064                // Not enough fields for start_field — output empty line
2065                unsafe { buf_push(buf, line_delim) };
2066            }
2067            line_start = pos + 1;
2068            delim_count = 0;
2069            has_delim = false;
2070            found_end = false;
2071        } else if !found_end {
2072            // Delimiter
2073            has_delim = true;
2074            delim_count += 1;
2075            if delim_count == skip_before {
2076                range_start = pos + 1;
2077            }
2078            if delim_count == target_end_delim {
2079                if skip_before == 0 {
2080                    range_start = line_start;
2081                }
2082                unsafe {
2083                    buf_extend(
2084                        buf,
2085                        std::slice::from_raw_parts(base.add(range_start), pos - range_start),
2086                    );
2087                    buf_push(buf, line_delim);
2088                }
2089                found_end = true;
2090            }
2091        }
2092    }
2093    // Handle trailing data without final newline
2094    if line_start < data.len() && !found_end {
2095        if !has_delim {
2096            if !suppress {
2097                unsafe {
2098                    buf_extend(
2099                        buf,
2100                        std::slice::from_raw_parts(base.add(line_start), data.len() - line_start),
2101                    );
2102                }
2103            }
2104        } else if delim_count >= skip_before {
2105            if skip_before == 0 {
2106                range_start = line_start;
2107            }
2108            unsafe {
2109                buf_extend(
2110                    buf,
2111                    std::slice::from_raw_parts(base.add(range_start), data.len() - range_start),
2112                );
2113            }
2114        }
2115    }
2116}
2117
2118/// Extract fields start_field..=end_field from one line.
2119/// Uses scalar byte scanning for short lines, memchr_iter for longer.
2120/// Raw pointer arithmetic to eliminate bounds checking.
2121#[inline(always)]
2122fn fields_mid_range_line(
2123    line: &[u8],
2124    delim: u8,
2125    line_delim: u8,
2126    start_field: usize,
2127    end_field: usize,
2128    suppress: bool,
2129    buf: &mut Vec<u8>,
2130) {
2131    let len = line.len();
2132    if len == 0 {
2133        if !suppress {
2134            unsafe { buf_push(buf, line_delim) };
2135        }
2136        return;
2137    }
2138
2139    // Note: no per-line buf.reserve — fields_mid_range_chunk already reserves data.len()
2140    let base = line.as_ptr();
2141
2142    // Count delimiters to find start_field and end_field boundaries
2143    let skip_before = start_field - 1; // delimiters to skip before start_field
2144    let field_span = end_field - start_field; // additional delimiters within the range
2145    let target_end_delim = skip_before + field_span + 1;
2146    let mut delim_count = 0;
2147    let mut range_start = 0;
2148    let mut has_delim = false;
2149
2150    for pos in memchr_iter(delim, line) {
2151        has_delim = true;
2152        delim_count += 1;
2153        if delim_count == skip_before {
2154            range_start = pos + 1;
2155        }
2156        if delim_count == target_end_delim {
2157            if skip_before == 0 {
2158                range_start = 0;
2159            }
2160            unsafe {
2161                buf_extend(
2162                    buf,
2163                    std::slice::from_raw_parts(base.add(range_start), pos - range_start),
2164                );
2165                buf_push(buf, line_delim);
2166            }
2167            return;
2168        }
2169    }
2170
2171    if !has_delim {
2172        if !suppress {
2173            unsafe {
2174                buf_extend(buf, line);
2175                buf_push(buf, line_delim);
2176            }
2177        }
2178        return;
2179    }
2180
2181    // Line has delimiters but fewer fields than end_field
2182    if delim_count >= skip_before {
2183        // We have at least start_field, output from range_start to end
2184        if skip_before == 0 {
2185            range_start = 0;
2186        }
2187        unsafe {
2188            buf_extend(
2189                buf,
2190                std::slice::from_raw_parts(base.add(range_start), len - range_start),
2191            );
2192            buf_push(buf, line_delim);
2193        }
2194    } else {
2195        // Not enough fields even for start_field — output empty line
2196        unsafe { buf_push(buf, line_delim) };
2197    }
2198}
2199
2200/// Zero-copy field-1 extraction using writev: builds IoSlice entries pointing
2201/// directly into the source data, flushing in MAX_IOV-sized batches.
2202/// For each line: if delimiter exists, output field1 + newline; otherwise pass through.
2203///
2204/// Uses a two-level scan: outer memchr(newline) for line boundaries, inner memchr(delim)
2205/// Parallel field-1 extraction for large data using memchr2 single-pass.
2206/// Splits data into per-thread chunks, each chunk extracts field 1 using
2207/// memchr2(delim, newline) which finds the first special byte in one scan.
2208/// For field 1: first special byte is either the delimiter (field end) or
2209/// newline (no delimiter, output line unchanged). 4 threads cut scan time ~4x.
2210fn single_field1_parallel(
2211    data: &[u8],
2212    delim: u8,
2213    line_delim: u8,
2214    out: &mut impl Write,
2215) -> io::Result<()> {
2216    let chunks = split_for_scope(data, line_delim);
2217    let n = chunks.len();
2218    let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2219    rayon::scope(|s| {
2220        for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2221            s.spawn(move |_| {
2222                result.reserve(chunk.len());
2223                single_field1_to_buf(chunk, delim, line_delim, result);
2224            });
2225        }
2226    });
2227    let slices: Vec<IoSlice> = results
2228        .iter()
2229        .filter(|r| !r.is_empty())
2230        .map(|r| IoSlice::new(r))
2231        .collect();
2232    write_ioslices(out, &slices)
2233}
2234
2235/// Extract field 1 from a chunk using memchr2_iter single-pass SIMD scanning.
2236/// Uses a single memchr2_iter pass over the entire chunk to find both delimiters
2237/// and newlines. This eliminates the per-line memchr function call overhead
2238/// (~5-10ns per call × 2 calls per line) that dominates for short-field data.
2239/// For 100MB with 1M lines: saves ~10-20ms of function call setup overhead.
2240#[inline]
2241fn single_field1_to_buf(data: &[u8], delim: u8, line_delim: u8, buf: &mut Vec<u8>) {
2242    buf.reserve(data.len());
2243    let base = data.as_ptr();
2244    let mut line_start: usize = 0;
2245    let mut found_delim = false;
2246
2247    for pos in memchr::memchr2_iter(delim, line_delim, data) {
2248        let byte = unsafe { *base.add(pos) };
2249        if byte == line_delim {
2250            if !found_delim {
2251                // No delimiter on this line — output entire line including newline
2252                unsafe {
2253                    buf_extend(
2254                        buf,
2255                        std::slice::from_raw_parts(base.add(line_start), pos + 1 - line_start),
2256                    );
2257                }
2258            } else {
2259                // Delimiter was found earlier — just add the line terminator
2260                unsafe { buf_push(buf, line_delim) };
2261            }
2262            line_start = pos + 1;
2263            found_delim = false;
2264        } else if !found_delim {
2265            // First delimiter on this line — output field 1
2266            unsafe {
2267                buf_extend(
2268                    buf,
2269                    std::slice::from_raw_parts(base.add(line_start), pos - line_start),
2270                );
2271            }
2272            found_delim = true;
2273        }
2274        // Subsequent delimiters on same line: ignore
2275    }
2276    // Handle trailing data without final line_delim
2277    if line_start < data.len() {
2278        if !found_delim {
2279            unsafe {
2280                buf_extend(
2281                    buf,
2282                    std::slice::from_raw_parts(base.add(line_start), data.len() - line_start),
2283                );
2284            }
2285        }
2286    }
2287}
2288
2289/// Zero-copy field 1 extraction using writev: builds IoSlice entries pointing
2290/// directly into the source data. Uses two-level scan: outer memchr(newline)
2291/// for the first delimiter. This is faster than memchr2 for SMALL data because
2292/// the inner scan exits after the FIRST delimiter, skipping all
2293/// subsequent delimiters on the line.
2294///
2295/// Lines without delimiter stay in contiguous runs (zero-copy pass-through).
2296/// Lines with delimiter produce two IoSlices (truncated field + newline byte).
2297#[inline]
2298#[allow(dead_code)]
2299fn single_field1_zerocopy(
2300    data: &[u8],
2301    delim: u8,
2302    line_delim: u8,
2303    out: &mut impl Write,
2304) -> io::Result<()> {
2305    let newline_buf: [u8; 1] = [line_delim];
2306
2307    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
2308    let mut run_start: usize = 0;
2309    let mut start = 0;
2310
2311    for end_pos in memchr_iter(line_delim, data) {
2312        let line = &data[start..end_pos];
2313        if let Some(dp) = memchr::memchr(delim, line) {
2314            // Line has delimiter — truncate at first delimiter.
2315            // Flush current contiguous run, then add truncated field + newline.
2316            if run_start < start {
2317                iov.push(IoSlice::new(&data[run_start..start]));
2318            }
2319            iov.push(IoSlice::new(&data[start..start + dp]));
2320            iov.push(IoSlice::new(&newline_buf));
2321            run_start = end_pos + 1;
2322
2323            if iov.len() >= MAX_IOV - 2 {
2324                write_ioslices(out, &iov)?;
2325                iov.clear();
2326            }
2327        }
2328        // else: no delimiter in line, output unchanged (stays in contiguous run)
2329        start = end_pos + 1;
2330    }
2331
2332    // Handle last line (no trailing newline)
2333    if start < data.len() {
2334        let line = &data[start..];
2335        if let Some(dp) = memchr::memchr(delim, line) {
2336            if run_start < start {
2337                iov.push(IoSlice::new(&data[run_start..start]));
2338            }
2339            iov.push(IoSlice::new(&data[start..start + dp]));
2340            iov.push(IoSlice::new(&newline_buf));
2341            if !iov.is_empty() {
2342                write_ioslices(out, &iov)?;
2343            }
2344            return Ok(());
2345        }
2346    }
2347
2348    // Flush remaining contiguous run
2349    if run_start < data.len() {
2350        iov.push(IoSlice::new(&data[run_start..]));
2351        if !data.is_empty() && *data.last().unwrap() != line_delim {
2352            iov.push(IoSlice::new(&newline_buf));
2353        }
2354    }
2355    if !iov.is_empty() {
2356        write_ioslices(out, &iov)?;
2357    }
2358    Ok(())
2359}
2360
2361/// Process a chunk of data for single-field extraction.
2362fn process_single_field_chunk(
2363    data: &[u8],
2364    delim: u8,
2365    target_idx: usize,
2366    line_delim: u8,
2367    suppress: bool,
2368    buf: &mut Vec<u8>,
2369) {
2370    // Pre-reserve chunk capacity to eliminate per-line reserve overhead.
2371    buf.reserve(data.len());
2372    let mut start = 0;
2373    for end_pos in memchr_iter(line_delim, data) {
2374        let line = &data[start..end_pos];
2375        extract_single_field_line(line, delim, target_idx, line_delim, suppress, buf);
2376        start = end_pos + 1;
2377    }
2378    if start < data.len() {
2379        extract_single_field_line(&data[start..], delim, target_idx, line_delim, suppress, buf);
2380    }
2381}
2382
2383/// Extract a single field from one line.
2384/// For short lines (< 256 bytes), uses direct scalar scanning to avoid memchr overhead.
2385/// For longer lines, uses memchr for SIMD-accelerated scanning.
2386/// Raw pointer arithmetic eliminates per-field bounds checking.
2387#[inline(always)]
2388fn extract_single_field_line(
2389    line: &[u8],
2390    delim: u8,
2391    target_idx: usize,
2392    line_delim: u8,
2393    suppress: bool,
2394    buf: &mut Vec<u8>,
2395) {
2396    let len = line.len();
2397    if len == 0 {
2398        if !suppress {
2399            unsafe { buf_push(buf, line_delim) };
2400        }
2401        return;
2402    }
2403
2404    // Note: no per-line buf.reserve — process_single_field_chunk already reserves data.len()
2405    let base = line.as_ptr();
2406
2407    // Ultra-fast path for first field: single memchr
2408    if target_idx == 0 {
2409        match memchr::memchr(delim, line) {
2410            Some(pos) => unsafe {
2411                buf_extend(buf, std::slice::from_raw_parts(base, pos));
2412                buf_push(buf, line_delim);
2413            },
2414            None => {
2415                if !suppress {
2416                    unsafe {
2417                        buf_extend(buf, line);
2418                        buf_push(buf, line_delim);
2419                    }
2420                }
2421            }
2422        }
2423        return;
2424    }
2425
2426    // Use memchr SIMD for all line sizes (faster than scalar even for short lines)
2427    let mut field_start = 0;
2428    let mut field_idx = 0;
2429    let mut has_delim = false;
2430
2431    for pos in memchr_iter(delim, line) {
2432        has_delim = true;
2433        if field_idx == target_idx {
2434            unsafe {
2435                buf_extend(
2436                    buf,
2437                    std::slice::from_raw_parts(base.add(field_start), pos - field_start),
2438                );
2439                buf_push(buf, line_delim);
2440            }
2441            return;
2442        }
2443        field_idx += 1;
2444        field_start = pos + 1;
2445    }
2446
2447    if !has_delim {
2448        if !suppress {
2449            unsafe {
2450                buf_extend(buf, line);
2451                buf_push(buf, line_delim);
2452            }
2453        }
2454        return;
2455    }
2456
2457    if field_idx == target_idx {
2458        unsafe {
2459            buf_extend(
2460                buf,
2461                std::slice::from_raw_parts(base.add(field_start), len - field_start),
2462            );
2463            buf_push(buf, line_delim);
2464        }
2465    } else {
2466        unsafe { buf_push(buf, line_delim) };
2467    }
2468}
2469
2470/// Extract fields from a single line into the output buffer.
2471/// Uses unsafe buf helpers with pre-reserved capacity for zero bounds-check overhead.
2472/// Raw pointer arithmetic eliminates per-field bounds checking.
2473#[inline(always)]
2474fn extract_fields_to_buf(
2475    line: &[u8],
2476    delim: u8,
2477    ranges: &[Range],
2478    output_delim: &[u8],
2479    suppress: bool,
2480    max_field: usize,
2481    field_mask: u64,
2482    line_delim: u8,
2483    buf: &mut Vec<u8>,
2484    complement: bool,
2485) {
2486    let len = line.len();
2487
2488    if len == 0 {
2489        if !suppress {
2490            buf.push(line_delim);
2491        }
2492        return;
2493    }
2494
2495    // Only reserve if remaining capacity is insufficient. The caller pre-sizes the
2496    // buffer to data.len(), so this check avoids redundant reserve() calls per line.
2497    let needed = len + output_delim.len() * 16 + 1;
2498    if buf.capacity() - buf.len() < needed {
2499        buf.reserve(needed);
2500    }
2501
2502    let base = line.as_ptr();
2503    let mut field_num: usize = 1;
2504    let mut field_start: usize = 0;
2505    let mut first_output = true;
2506    let mut has_delim = false;
2507
2508    // Use memchr SIMD for all line sizes
2509    for delim_pos in memchr_iter(delim, line) {
2510        has_delim = true;
2511
2512        if is_selected(field_num, field_mask, ranges, complement) {
2513            if !first_output {
2514                unsafe { buf_extend(buf, output_delim) };
2515            }
2516            unsafe {
2517                buf_extend(
2518                    buf,
2519                    std::slice::from_raw_parts(base.add(field_start), delim_pos - field_start),
2520                )
2521            };
2522            first_output = false;
2523        }
2524
2525        field_num += 1;
2526        field_start = delim_pos + 1;
2527
2528        if field_num > max_field {
2529            break;
2530        }
2531    }
2532
2533    // Last field
2534    if (field_num <= max_field || complement)
2535        && has_delim
2536        && is_selected(field_num, field_mask, ranges, complement)
2537    {
2538        if !first_output {
2539            unsafe { buf_extend(buf, output_delim) };
2540        }
2541        unsafe {
2542            buf_extend(
2543                buf,
2544                std::slice::from_raw_parts(base.add(field_start), len - field_start),
2545            )
2546        };
2547        first_output = false;
2548    }
2549
2550    if !first_output {
2551        unsafe { buf_push(buf, line_delim) };
2552    } else if !has_delim {
2553        if !suppress {
2554            unsafe {
2555                buf_extend(buf, line);
2556                buf_push(buf, line_delim);
2557            }
2558        }
2559    } else {
2560        unsafe { buf_push(buf, line_delim) };
2561    }
2562}
2563
2564// ── Fast path: byte/char extraction with batched output ──────────────────
2565
2566/// Ultra-fast path for `cut -b1-N`: single from-start byte range.
2567/// Zero-copy: writes directly from the source data using output runs.
2568/// For lines shorter than max_bytes, the output is identical to the input,
2569/// so we emit contiguous runs directly. Only lines exceeding max_bytes need truncation.
2570fn process_bytes_from_start(
2571    data: &[u8],
2572    max_bytes: usize,
2573    line_delim: u8,
2574    out: &mut impl Write,
2575) -> io::Result<()> {
2576    // For small data (< PARALLEL_THRESHOLD): check if all lines fit for zero-copy passthrough.
2577    // The sequential scan + write_all is competitive with per-line processing for small data.
2578    //
2579    // For large data (>= PARALLEL_THRESHOLD): skip the all_fit scan entirely.
2580    // The scan is sequential (~1.7ms for 10MB at memchr speed) while parallel per-line
2581    // processing is much faster (~0.5ms for 10MB with 4 threads). Even when all lines fit,
2582    // the parallel copy + write is faster than sequential scan + zero-copy write.
2583    if data.len() < PARALLEL_THRESHOLD && max_bytes > 0 && max_bytes < usize::MAX {
2584        let mut start = 0;
2585        let mut all_fit = true;
2586        for pos in memchr_iter(line_delim, data) {
2587            if pos - start > max_bytes {
2588                all_fit = false;
2589                break;
2590            }
2591            start = pos + 1;
2592        }
2593        // Check last line (no trailing delimiter)
2594        if all_fit && start < data.len() && data.len() - start > max_bytes {
2595            all_fit = false;
2596        }
2597        if all_fit {
2598            // All lines fit: output = input. Handle missing trailing delimiter.
2599            if !data.is_empty() && data[data.len() - 1] == line_delim {
2600                return out.write_all(data);
2601            } else if !data.is_empty() {
2602                out.write_all(data)?;
2603                return out.write_all(&[line_delim]);
2604            }
2605            return Ok(());
2606        }
2607    }
2608
2609    if data.len() >= PARALLEL_THRESHOLD {
2610        let chunks = split_for_scope(data, line_delim);
2611        let n = chunks.len();
2612        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2613        rayon::scope(|s| {
2614            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2615                s.spawn(move |_| {
2616                    // Output can be up to input size (when all lines fit).
2617                    // Reserve full chunk size to avoid reallocation.
2618                    result.reserve(chunk.len());
2619                    bytes_from_start_chunk(chunk, max_bytes, line_delim, result);
2620                });
2621            }
2622        });
2623        // Use write_vectored (writev) to batch N writes into fewer syscalls
2624        let slices: Vec<IoSlice> = results
2625            .iter()
2626            .filter(|r| !r.is_empty())
2627            .map(|r| IoSlice::new(r))
2628            .collect();
2629        write_ioslices(out, &slices)?;
2630    } else {
2631        // For moderate max_bytes, the buffer path is faster than writev zero-copy
2632        // because every line gets truncated, creating 3 IoSlice entries per line.
2633        // Copying max_bytes+1 bytes into a contiguous buffer is cheaper than
2634        // managing millions of IoSlice entries through the kernel.
2635        // Threshold at 512 covers common byte-range benchmarks like -b1-100.
2636        if max_bytes <= 512 {
2637            // Estimate output size without scanning: output <= data.len(),
2638            // typically ~data.len()/4 for short max_bytes on longer lines.
2639            let est_out = (data.len() / 4).max(max_bytes + 2);
2640            let mut buf = Vec::with_capacity(est_out.min(data.len()));
2641            bytes_from_start_chunk(data, max_bytes, line_delim, &mut buf);
2642            if !buf.is_empty() {
2643                out.write_all(&buf)?;
2644            }
2645        } else {
2646            // Zero-copy path: track contiguous output runs and write directly from source.
2647            // For lines <= max_bytes, we include them as-is (no copy needed).
2648            // For lines > max_bytes, we flush the run, write the truncated line, start new run.
2649            bytes_from_start_zerocopy(data, max_bytes, line_delim, out)?;
2650        }
2651    }
2652    Ok(())
2653}
2654
2655/// Zero-copy byte-prefix extraction using writev: builds IoSlice entries pointing
2656/// directly into the source data, flushing in MAX_IOV-sized batches.
2657/// Lines shorter than max_bytes stay in contiguous runs. Lines needing truncation
2658/// produce two IoSlices (truncated data + newline).
2659#[inline]
2660fn bytes_from_start_zerocopy(
2661    data: &[u8],
2662    max_bytes: usize,
2663    line_delim: u8,
2664    out: &mut impl Write,
2665) -> io::Result<()> {
2666    let newline_buf: [u8; 1] = [line_delim];
2667    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
2668    let mut start = 0;
2669    let mut run_start: usize = 0;
2670
2671    for pos in memchr_iter(line_delim, data) {
2672        let line_len = pos - start;
2673        if line_len > max_bytes {
2674            // This line needs truncation
2675            if run_start < start {
2676                iov.push(IoSlice::new(&data[run_start..start]));
2677            }
2678            iov.push(IoSlice::new(&data[start..start + max_bytes]));
2679            iov.push(IoSlice::new(&newline_buf));
2680            run_start = pos + 1;
2681
2682            if iov.len() >= MAX_IOV - 2 {
2683                write_ioslices(out, &iov)?;
2684                iov.clear();
2685            }
2686        }
2687        start = pos + 1;
2688    }
2689    // Handle last line without terminator
2690    if start < data.len() {
2691        let line_len = data.len() - start;
2692        if line_len > max_bytes {
2693            if run_start < start {
2694                iov.push(IoSlice::new(&data[run_start..start]));
2695            }
2696            iov.push(IoSlice::new(&data[start..start + max_bytes]));
2697            iov.push(IoSlice::new(&newline_buf));
2698            if !iov.is_empty() {
2699                write_ioslices(out, &iov)?;
2700            }
2701            return Ok(());
2702        }
2703    }
2704    // Flush remaining contiguous run
2705    if run_start < data.len() {
2706        iov.push(IoSlice::new(&data[run_start..]));
2707        if !data.is_empty() && *data.last().unwrap() != line_delim {
2708            iov.push(IoSlice::new(&newline_buf));
2709        }
2710    }
2711    if !iov.is_empty() {
2712        write_ioslices(out, &iov)?;
2713    }
2714    Ok(())
2715}
2716
2717/// Process a chunk for from-start byte range extraction (parallel path).
2718/// Uses unsafe appends to eliminate bounds checking in the hot loop.
2719/// Pre-reserves data.len() (output never exceeds input), then uses a single
2720/// write pointer with deferred set_len — no per-line capacity checks.
2721#[inline]
2722fn bytes_from_start_chunk(data: &[u8], max_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
2723    // Output is always <= input size (we only truncate, never expand).
2724    // Single reserve eliminates ALL per-line capacity checks.
2725    buf.reserve(data.len());
2726
2727    let src = data.as_ptr();
2728    let dst_base = buf.as_mut_ptr();
2729    let mut wp = buf.len();
2730    let mut start = 0;
2731
2732    for pos in memchr_iter(line_delim, data) {
2733        let line_len = pos - start;
2734        let take = line_len.min(max_bytes);
2735        unsafe {
2736            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
2737            *dst_base.add(wp + take) = line_delim;
2738        }
2739        wp += take + 1;
2740        start = pos + 1;
2741    }
2742    // Handle last line without terminator
2743    if start < data.len() {
2744        let line_len = data.len() - start;
2745        let take = line_len.min(max_bytes);
2746        unsafe {
2747            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
2748            *dst_base.add(wp + take) = line_delim;
2749        }
2750        wp += take + 1;
2751    }
2752    unsafe { buf.set_len(wp) };
2753}
2754
2755/// Fast path for `cut -bN-`: skip first N-1 bytes per line.
2756fn process_bytes_from_offset(
2757    data: &[u8],
2758    skip_bytes: usize,
2759    line_delim: u8,
2760    out: &mut impl Write,
2761) -> io::Result<()> {
2762    if data.len() >= PARALLEL_THRESHOLD {
2763        let chunks = split_for_scope(data, line_delim);
2764        let n = chunks.len();
2765        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2766        rayon::scope(|s| {
2767            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2768                s.spawn(move |_| {
2769                    result.reserve(chunk.len());
2770                    bytes_from_offset_chunk(chunk, skip_bytes, line_delim, result);
2771                });
2772            }
2773        });
2774        // Use write_vectored (writev) to batch N writes into fewer syscalls
2775        let slices: Vec<IoSlice> = results
2776            .iter()
2777            .filter(|r| !r.is_empty())
2778            .map(|r| IoSlice::new(r))
2779            .collect();
2780        write_ioslices(out, &slices)?;
2781    } else {
2782        // Zero-copy: write suffix of each line directly from source
2783        bytes_from_offset_zerocopy(data, skip_bytes, line_delim, out)?;
2784    }
2785    Ok(())
2786}
2787
2788/// Zero-copy byte-offset extraction: writes suffix of each line directly from source data.
2789/// Collects IoSlice pairs (data + delimiter) and flushes with write_vectored in batches,
2790/// reducing syscall overhead from 2 write_all calls per line to batched writev.
2791#[inline]
2792fn bytes_from_offset_zerocopy(
2793    data: &[u8],
2794    skip_bytes: usize,
2795    line_delim: u8,
2796    out: &mut impl Write,
2797) -> io::Result<()> {
2798    let delim_buf = [line_delim];
2799    let mut iov: Vec<IoSlice> = Vec::with_capacity(256);
2800
2801    let mut start = 0;
2802    for pos in memchr_iter(line_delim, data) {
2803        let line_len = pos - start;
2804        if line_len > skip_bytes {
2805            iov.push(IoSlice::new(&data[start + skip_bytes..pos]));
2806        }
2807        iov.push(IoSlice::new(&delim_buf));
2808        // Flush when approaching MAX_IOV to avoid oversized writev
2809        if iov.len() >= MAX_IOV - 1 {
2810            write_ioslices(out, &iov)?;
2811            iov.clear();
2812        }
2813        start = pos + 1;
2814    }
2815    if start < data.len() {
2816        let line_len = data.len() - start;
2817        if line_len > skip_bytes {
2818            iov.push(IoSlice::new(&data[start + skip_bytes..data.len()]));
2819        }
2820        iov.push(IoSlice::new(&delim_buf));
2821    }
2822    if !iov.is_empty() {
2823        write_ioslices(out, &iov)?;
2824    }
2825    Ok(())
2826}
2827
2828/// Process a chunk for from-offset byte range extraction.
2829/// Single reserve + deferred set_len for zero per-line overhead.
2830#[inline]
2831fn bytes_from_offset_chunk(data: &[u8], skip_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
2832    buf.reserve(data.len());
2833
2834    let src = data.as_ptr();
2835    let dst_base = buf.as_mut_ptr();
2836    let mut wp = buf.len();
2837    let mut start = 0;
2838
2839    for pos in memchr_iter(line_delim, data) {
2840        let line_len = pos - start;
2841        if line_len > skip_bytes {
2842            let take = line_len - skip_bytes;
2843            unsafe {
2844                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
2845            }
2846            wp += take;
2847        }
2848        unsafe {
2849            *dst_base.add(wp) = line_delim;
2850        }
2851        wp += 1;
2852        start = pos + 1;
2853    }
2854    if start < data.len() {
2855        let line_len = data.len() - start;
2856        if line_len > skip_bytes {
2857            let take = line_len - skip_bytes;
2858            unsafe {
2859                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
2860            }
2861            wp += take;
2862        }
2863        unsafe {
2864            *dst_base.add(wp) = line_delim;
2865        }
2866        wp += 1;
2867    }
2868    unsafe { buf.set_len(wp) };
2869}
2870
2871/// Fast path for `cut -bN-M` where N > 1 and M < MAX: extract bytes N through M per line.
2872fn process_bytes_mid_range(
2873    data: &[u8],
2874    start_byte: usize,
2875    end_byte: usize,
2876    line_delim: u8,
2877    out: &mut impl Write,
2878) -> io::Result<()> {
2879    let skip = start_byte.saturating_sub(1);
2880
2881    if data.len() >= PARALLEL_THRESHOLD {
2882        let chunks = split_for_scope(data, line_delim);
2883        let n = chunks.len();
2884        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2885        rayon::scope(|s| {
2886            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2887                s.spawn(move |_| {
2888                    result.reserve(chunk.len());
2889                    bytes_mid_range_chunk(chunk, skip, end_byte, line_delim, result);
2890                });
2891            }
2892        });
2893        let slices: Vec<IoSlice> = results
2894            .iter()
2895            .filter(|r| !r.is_empty())
2896            .map(|r| IoSlice::new(r))
2897            .collect();
2898        write_ioslices(out, &slices)?;
2899    } else {
2900        let mut buf = Vec::with_capacity(data.len());
2901        bytes_mid_range_chunk(data, skip, end_byte, line_delim, &mut buf);
2902        if !buf.is_empty() {
2903            out.write_all(&buf)?;
2904        }
2905    }
2906    Ok(())
2907}
2908
2909/// Process a chunk for mid-range byte extraction.
2910/// For each line, output bytes skip..min(line_len, end_byte).
2911/// Single reserve + deferred set_len.
2912#[inline]
2913fn bytes_mid_range_chunk(
2914    data: &[u8],
2915    skip: usize,
2916    end_byte: usize,
2917    line_delim: u8,
2918    buf: &mut Vec<u8>,
2919) {
2920    buf.reserve(data.len());
2921
2922    let src = data.as_ptr();
2923    let dst_base = buf.as_mut_ptr();
2924    let mut wp = buf.len();
2925    let mut start = 0;
2926
2927    for pos in memchr_iter(line_delim, data) {
2928        let line_len = pos - start;
2929        if line_len > skip {
2930            let take_end = line_len.min(end_byte);
2931            let take = take_end - skip;
2932            unsafe {
2933                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
2934            }
2935            wp += take;
2936        }
2937        unsafe {
2938            *dst_base.add(wp) = line_delim;
2939        }
2940        wp += 1;
2941        start = pos + 1;
2942    }
2943    if start < data.len() {
2944        let line_len = data.len() - start;
2945        if line_len > skip {
2946            let take_end = line_len.min(end_byte);
2947            let take = take_end - skip;
2948            unsafe {
2949                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
2950            }
2951            wp += take;
2952        }
2953        unsafe {
2954            *dst_base.add(wp) = line_delim;
2955        }
2956        wp += 1;
2957    }
2958    unsafe { buf.set_len(wp) };
2959}
2960
2961/// Fast path for `--complement -bN-M`: output bytes 1..N-1 and M+1..end per line.
2962fn process_bytes_complement_mid(
2963    data: &[u8],
2964    skip_start: usize,
2965    skip_end: usize,
2966    line_delim: u8,
2967    out: &mut impl Write,
2968) -> io::Result<()> {
2969    let prefix_bytes = skip_start - 1; // bytes before the skip region
2970    if data.len() >= PARALLEL_THRESHOLD {
2971        let chunks = split_for_scope(data, line_delim);
2972        let n = chunks.len();
2973        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2974        rayon::scope(|s| {
2975            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2976                s.spawn(move |_| {
2977                    result.reserve(chunk.len());
2978                    bytes_complement_mid_chunk(chunk, prefix_bytes, skip_end, line_delim, result);
2979                });
2980            }
2981        });
2982        let slices: Vec<IoSlice> = results
2983            .iter()
2984            .filter(|r| !r.is_empty())
2985            .map(|r| IoSlice::new(r))
2986            .collect();
2987        write_ioslices(out, &slices)?;
2988    } else {
2989        let mut buf = Vec::with_capacity(data.len());
2990        bytes_complement_mid_chunk(data, prefix_bytes, skip_end, line_delim, &mut buf);
2991        if !buf.is_empty() {
2992            out.write_all(&buf)?;
2993        }
2994    }
2995    Ok(())
2996}
2997
2998/// Process a chunk for complement mid-range byte extraction.
2999/// For each line: output bytes 0..prefix_bytes, then bytes skip_end..line_len.
3000#[inline]
3001fn bytes_complement_mid_chunk(
3002    data: &[u8],
3003    prefix_bytes: usize,
3004    skip_end: usize,
3005    line_delim: u8,
3006    buf: &mut Vec<u8>,
3007) {
3008    buf.reserve(data.len());
3009
3010    let src = data.as_ptr();
3011    let dst_base = buf.as_mut_ptr();
3012    let mut wp = buf.len();
3013    let mut start = 0;
3014
3015    for pos in memchr_iter(line_delim, data) {
3016        let line_len = pos - start;
3017        // Copy prefix (bytes before skip region)
3018        let take_prefix = prefix_bytes.min(line_len);
3019        if take_prefix > 0 {
3020            unsafe {
3021                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
3022            }
3023            wp += take_prefix;
3024        }
3025        // Copy suffix (bytes after skip region)
3026        if line_len > skip_end {
3027            let suffix_len = line_len - skip_end;
3028            unsafe {
3029                std::ptr::copy_nonoverlapping(
3030                    src.add(start + skip_end),
3031                    dst_base.add(wp),
3032                    suffix_len,
3033                );
3034            }
3035            wp += suffix_len;
3036        }
3037        unsafe {
3038            *dst_base.add(wp) = line_delim;
3039        }
3040        wp += 1;
3041        start = pos + 1;
3042    }
3043    if start < data.len() {
3044        let line_len = data.len() - start;
3045        let take_prefix = prefix_bytes.min(line_len);
3046        if take_prefix > 0 {
3047            unsafe {
3048                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
3049            }
3050            wp += take_prefix;
3051        }
3052        if line_len > skip_end {
3053            let suffix_len = line_len - skip_end;
3054            unsafe {
3055                std::ptr::copy_nonoverlapping(
3056                    src.add(start + skip_end),
3057                    dst_base.add(wp),
3058                    suffix_len,
3059                );
3060            }
3061            wp += suffix_len;
3062        }
3063        unsafe {
3064            *dst_base.add(wp) = line_delim;
3065        }
3066        wp += 1;
3067    }
3068    unsafe { buf.set_len(wp) };
3069}
3070
3071/// Optimized byte/char extraction with batched output and parallel processing.
3072fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
3073    let line_delim = cfg.line_delim;
3074    let ranges = cfg.ranges;
3075    let complement = cfg.complement;
3076    let output_delim = cfg.output_delim;
3077
3078    // Ultra-fast path: single range from byte 1 (e.g., cut -b1-10, cut -b-20)
3079    if !complement && ranges.len() == 1 && ranges[0].start == 1 && output_delim.is_empty() {
3080        let max_bytes = ranges[0].end;
3081        if max_bytes < usize::MAX {
3082            return process_bytes_from_start(data, max_bytes, line_delim, out);
3083        }
3084    }
3085
3086    // Fast path: single open-ended range from byte N (e.g., cut -b5-)
3087    if !complement && ranges.len() == 1 && ranges[0].end == usize::MAX && output_delim.is_empty() {
3088        let skip_bytes = ranges[0].start.saturating_sub(1);
3089        if skip_bytes > 0 {
3090            return process_bytes_from_offset(data, skip_bytes, line_delim, out);
3091        }
3092    }
3093
3094    // Fast path: single mid-range (e.g., cut -b5-100)
3095    if !complement
3096        && ranges.len() == 1
3097        && ranges[0].start > 1
3098        && ranges[0].end < usize::MAX
3099        && output_delim.is_empty()
3100    {
3101        return process_bytes_mid_range(data, ranges[0].start, ranges[0].end, line_delim, out);
3102    }
3103
3104    // Fast path: complement of single from-start range (e.g., --complement -b1-100 = output bytes 101+)
3105    if complement
3106        && ranges.len() == 1
3107        && ranges[0].start == 1
3108        && ranges[0].end < usize::MAX
3109        && output_delim.is_empty()
3110    {
3111        return process_bytes_from_offset(data, ranges[0].end, line_delim, out);
3112    }
3113
3114    // Fast path: complement of single from-offset range (e.g., --complement -b5- = output bytes 1-4)
3115    if complement
3116        && ranges.len() == 1
3117        && ranges[0].end == usize::MAX
3118        && ranges[0].start > 1
3119        && output_delim.is_empty()
3120    {
3121        let max_bytes = ranges[0].start - 1;
3122        return process_bytes_from_start(data, max_bytes, line_delim, out);
3123    }
3124
3125    // Fast path: complement of single mid-range (e.g., --complement -b5-100 = bytes 1-4,101+)
3126    if complement
3127        && ranges.len() == 1
3128        && ranges[0].start > 1
3129        && ranges[0].end < usize::MAX
3130        && output_delim.is_empty()
3131    {
3132        return process_bytes_complement_mid(data, ranges[0].start, ranges[0].end, line_delim, out);
3133    }
3134
3135    if data.len() >= PARALLEL_THRESHOLD {
3136        let chunks = split_for_scope(data, line_delim);
3137        let n = chunks.len();
3138        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
3139        rayon::scope(|s| {
3140            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
3141                s.spawn(move |_| {
3142                    result.reserve(chunk.len());
3143                    process_bytes_chunk(
3144                        chunk,
3145                        ranges,
3146                        complement,
3147                        output_delim,
3148                        line_delim,
3149                        result,
3150                    );
3151                });
3152            }
3153        });
3154        let slices: Vec<IoSlice> = results
3155            .iter()
3156            .filter(|r| !r.is_empty())
3157            .map(|r| IoSlice::new(r))
3158            .collect();
3159        write_ioslices(out, &slices)?;
3160    } else {
3161        let mut buf = Vec::with_capacity(data.len());
3162        process_bytes_chunk(data, ranges, complement, output_delim, line_delim, &mut buf);
3163        if !buf.is_empty() {
3164            out.write_all(&buf)?;
3165        }
3166    }
3167    Ok(())
3168}
3169
3170/// Process a chunk of data for byte/char extraction.
3171/// Uses raw pointer arithmetic for the newline scan.
3172/// Complement single-range fast path: compute complement ranges once, then use
3173/// the non-complement multi-range path which is more cache-friendly.
3174fn process_bytes_chunk(
3175    data: &[u8],
3176    ranges: &[Range],
3177    complement: bool,
3178    output_delim: &[u8],
3179    line_delim: u8,
3180    buf: &mut Vec<u8>,
3181) {
3182    buf.reserve(data.len());
3183    let base = data.as_ptr();
3184    let mut start = 0;
3185    for end_pos in memchr_iter(line_delim, data) {
3186        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
3187        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
3188        unsafe { buf_push(buf, line_delim) };
3189        start = end_pos + 1;
3190    }
3191    if start < data.len() {
3192        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
3193        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
3194        unsafe { buf_push(buf, line_delim) };
3195    }
3196}
3197
3198/// Extract byte ranges from a line into the output buffer.
3199/// Uses unsafe buf helpers for zero bounds-check overhead in hot loops.
3200/// Raw pointer arithmetic eliminates per-range bounds checking.
3201#[inline(always)]
3202fn cut_bytes_to_buf(
3203    line: &[u8],
3204    ranges: &[Range],
3205    complement: bool,
3206    output_delim: &[u8],
3207    buf: &mut Vec<u8>,
3208) {
3209    let len = line.len();
3210    let base = line.as_ptr();
3211    let mut first_range = true;
3212
3213    // Reserve worst case: full line + delimiters between ranges
3214    let needed = len + output_delim.len() * ranges.len() + 1;
3215    if buf.capacity() - buf.len() < needed {
3216        buf.reserve(needed);
3217    }
3218
3219    if complement {
3220        let mut pos: usize = 1;
3221        for r in ranges {
3222            let rs = r.start;
3223            let re = r.end.min(len);
3224            if pos < rs {
3225                if !first_range && !output_delim.is_empty() {
3226                    unsafe { buf_extend(buf, output_delim) };
3227                }
3228                unsafe { buf_extend(buf, std::slice::from_raw_parts(base.add(pos - 1), rs - pos)) };
3229                first_range = false;
3230            }
3231            pos = re + 1;
3232            if pos > len {
3233                break;
3234            }
3235        }
3236        if pos <= len {
3237            if !first_range && !output_delim.is_empty() {
3238                unsafe { buf_extend(buf, output_delim) };
3239            }
3240            unsafe {
3241                buf_extend(
3242                    buf,
3243                    std::slice::from_raw_parts(base.add(pos - 1), len - pos + 1),
3244                )
3245            };
3246        }
3247    } else if output_delim.is_empty() && ranges.len() == 1 {
3248        // Ultra-fast path: single range, no output delimiter
3249        let start = ranges[0].start.saturating_sub(1);
3250        let end = ranges[0].end.min(len);
3251        if start < len {
3252            unsafe {
3253                buf_extend(
3254                    buf,
3255                    std::slice::from_raw_parts(base.add(start), end - start),
3256                )
3257            };
3258        }
3259    } else {
3260        for r in ranges {
3261            let start = r.start.saturating_sub(1);
3262            let end = r.end.min(len);
3263            if start >= len {
3264                break;
3265            }
3266            if !first_range && !output_delim.is_empty() {
3267                unsafe { buf_extend(buf, output_delim) };
3268            }
3269            unsafe {
3270                buf_extend(
3271                    buf,
3272                    std::slice::from_raw_parts(base.add(start), end - start),
3273                )
3274            };
3275            first_range = false;
3276        }
3277    }
3278}
3279
3280// ── Public API ───────────────────────────────────────────────────────────
3281
3282/// Cut fields from a line using a delimiter. Writes to `out`.
3283#[inline]
3284pub fn cut_fields(
3285    line: &[u8],
3286    delim: u8,
3287    ranges: &[Range],
3288    complement: bool,
3289    output_delim: &[u8],
3290    suppress_no_delim: bool,
3291    out: &mut impl Write,
3292) -> io::Result<bool> {
3293    if memchr::memchr(delim, line).is_none() {
3294        if !suppress_no_delim {
3295            out.write_all(line)?;
3296            return Ok(true);
3297        }
3298        return Ok(false);
3299    }
3300
3301    let mut field_num: usize = 1;
3302    let mut field_start: usize = 0;
3303    let mut first_output = true;
3304
3305    for delim_pos in memchr_iter(delim, line) {
3306        let selected = in_ranges(ranges, field_num) != complement;
3307        if selected {
3308            if !first_output {
3309                out.write_all(output_delim)?;
3310            }
3311            out.write_all(&line[field_start..delim_pos])?;
3312            first_output = false;
3313        }
3314        field_start = delim_pos + 1;
3315        field_num += 1;
3316    }
3317
3318    let selected = in_ranges(ranges, field_num) != complement;
3319    if selected {
3320        if !first_output {
3321            out.write_all(output_delim)?;
3322        }
3323        out.write_all(&line[field_start..])?;
3324    }
3325
3326    Ok(true)
3327}
3328
3329/// Cut bytes/chars from a line. Writes selected bytes to `out`.
3330#[inline]
3331pub fn cut_bytes(
3332    line: &[u8],
3333    ranges: &[Range],
3334    complement: bool,
3335    output_delim: &[u8],
3336    out: &mut impl Write,
3337) -> io::Result<bool> {
3338    let mut first_range = true;
3339
3340    if complement {
3341        let len = line.len();
3342        let mut comp_ranges = Vec::new();
3343        let mut pos: usize = 1;
3344        for r in ranges {
3345            let rs = r.start;
3346            let re = r.end.min(len);
3347            if pos < rs {
3348                comp_ranges.push((pos, rs - 1));
3349            }
3350            pos = re + 1;
3351            if pos > len {
3352                break;
3353            }
3354        }
3355        if pos <= len {
3356            comp_ranges.push((pos, len));
3357        }
3358        for &(s, e) in &comp_ranges {
3359            if !first_range && !output_delim.is_empty() {
3360                out.write_all(output_delim)?;
3361            }
3362            out.write_all(&line[s - 1..e])?;
3363            first_range = false;
3364        }
3365    } else {
3366        for r in ranges {
3367            let start = r.start.saturating_sub(1);
3368            let end = r.end.min(line.len());
3369            if start >= line.len() {
3370                break;
3371            }
3372            if !first_range && !output_delim.is_empty() {
3373                out.write_all(output_delim)?;
3374            }
3375            out.write_all(&line[start..end])?;
3376            first_range = false;
3377        }
3378    }
3379    Ok(true)
3380}
3381
3382/// In-place field 1 extraction: modifies `data` buffer directly, returns new length.
3383/// Output is always <= input (we remove everything after first delimiter per line).
3384/// Avoids intermediate Vec allocation + BufWriter copy, saving ~10MB of memory
3385/// bandwidth for 10MB input. Requires owned mutable data (not mmap).
3386///
3387/// Lines without delimiter pass through unchanged (unless suppress=true).
3388/// Lines with delimiter: keep bytes before delimiter + newline.
3389pub fn cut_field1_inplace(data: &mut [u8], delim: u8, line_delim: u8, suppress: bool) -> usize {
3390    let len = data.len();
3391    let mut wp: usize = 0;
3392    let mut rp: usize = 0;
3393
3394    while rp < len {
3395        match memchr::memchr2(delim, line_delim, &data[rp..]) {
3396            None => {
3397                // Rest is partial line, no delimiter
3398                if suppress {
3399                    // suppress: skip lines without delimiter
3400                    break;
3401                }
3402                let remaining = len - rp;
3403                if wp != rp {
3404                    data.copy_within(rp..len, wp);
3405                }
3406                wp += remaining;
3407                break;
3408            }
3409            Some(offset) => {
3410                let actual = rp + offset;
3411                if data[actual] == line_delim {
3412                    // No delimiter on this line
3413                    if suppress {
3414                        // Skip this line entirely
3415                        rp = actual + 1;
3416                    } else {
3417                        // Output entire line including newline
3418                        let chunk_len = actual + 1 - rp;
3419                        if wp != rp {
3420                            data.copy_within(rp..actual + 1, wp);
3421                        }
3422                        wp += chunk_len;
3423                        rp = actual + 1;
3424                    }
3425                } else {
3426                    // Delimiter found: output field 1 (up to delimiter) + newline
3427                    let field_len = actual - rp;
3428                    if wp != rp && field_len > 0 {
3429                        data.copy_within(rp..actual, wp);
3430                    }
3431                    wp += field_len;
3432                    data[wp] = line_delim;
3433                    wp += 1;
3434                    // Skip to next newline
3435                    match memchr::memchr(line_delim, &data[actual + 1..]) {
3436                        None => {
3437                            rp = len;
3438                        }
3439                        Some(nl_off) => {
3440                            rp = actual + 1 + nl_off + 1;
3441                        }
3442                    }
3443                }
3444            }
3445        }
3446    }
3447    wp
3448}
3449
3450/// Process a full data buffer (from mmap or read) with cut operation.
3451pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
3452    match cfg.mode {
3453        CutMode::Fields => process_fields_fast(data, cfg, out),
3454        CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
3455    }
3456}
3457
3458/// Process input from a reader (for stdin).
3459/// Uses batch reading: reads large chunks (16MB), then processes them in batch
3460/// using the fast mmap-based paths, avoiding per-line read_until syscall overhead.
3461/// 16MB chunks mean a 10MB piped input is consumed in a single batch.
3462pub fn process_cut_reader<R: BufRead>(
3463    mut reader: R,
3464    cfg: &CutConfig,
3465    out: &mut impl Write,
3466) -> io::Result<()> {
3467    const CHUNK_SIZE: usize = 16 * 1024 * 1024; // 16MB read chunks
3468    let line_delim = cfg.line_delim;
3469
3470    // Read large chunks and process in batch.
3471    // We keep a buffer; after processing complete lines, we shift leftover to the front.
3472    let mut buf = Vec::with_capacity(CHUNK_SIZE + 4096);
3473
3474    loop {
3475        // Read up to CHUNK_SIZE bytes
3476        buf.reserve(CHUNK_SIZE);
3477        let read_start = buf.len();
3478        unsafe { buf.set_len(read_start + CHUNK_SIZE) };
3479        let n = read_fully(&mut reader, &mut buf[read_start..])?;
3480        buf.truncate(read_start + n);
3481
3482        if buf.is_empty() {
3483            break;
3484        }
3485
3486        if n == 0 {
3487            // EOF with leftover data (last line without terminator)
3488            process_cut_data(&buf, cfg, out)?;
3489            break;
3490        }
3491
3492        // Find the last line delimiter in the buffer so we process complete lines
3493        let process_end = match memchr::memrchr(line_delim, &buf) {
3494            Some(pos) => pos + 1,
3495            None => {
3496                // No line delimiter found — keep accumulating
3497                continue;
3498            }
3499        };
3500
3501        // Process the complete lines using the fast batch path
3502        process_cut_data(&buf[..process_end], cfg, out)?;
3503
3504        // Shift leftover to the front for next iteration
3505        let leftover_len = buf.len() - process_end;
3506        if leftover_len > 0 {
3507            buf.copy_within(process_end.., 0);
3508        }
3509        buf.truncate(leftover_len);
3510    }
3511
3512    Ok(())
3513}
3514
/// Read as many bytes as possible into `buf`, retrying on partial reads.
///
/// Keeps calling `read` until `buf` is full or the reader reports EOF (`Ok(0)`).
/// `ErrorKind::Interrupted` is retried on every call, including the first one;
/// any other error is returned immediately.
///
/// Returns the number of bytes written to the front of `buf`; a value smaller than
/// `buf.len()` means EOF was reached.
#[inline]
fn read_fully<R: BufRead>(reader: &mut R, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break,
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
3534
3535/// In-place cut processing for mutable data buffers.
3536/// Returns Some(new_length) if in-place processing succeeded, None if not supported
3537/// for the given configuration (caller should fall back to regular processing).
3538///
3539/// In-place avoids allocating intermediate output buffers — the result is written
3540/// directly into the input buffer (output is always <= input for non-complement modes
3541/// with default output delimiter).
3542pub fn process_cut_data_mut(data: &mut [u8], cfg: &CutConfig) -> Option<usize> {
3543    if cfg.complement {
3544        return None;
3545    }
3546
3547    match cfg.mode {
3548        CutMode::Fields => {
3549            // Only handle when output delimiter matches input (single-byte)
3550            if cfg.output_delim.len() != 1 || cfg.output_delim[0] != cfg.delim {
3551                return None;
3552            }
3553            if cfg.delim == cfg.line_delim {
3554                return None;
3555            }
3556            Some(cut_fields_inplace_general(
3557                data,
3558                cfg.delim,
3559                cfg.line_delim,
3560                cfg.ranges,
3561                cfg.suppress_no_delim,
3562            ))
3563        }
3564        CutMode::Bytes | CutMode::Characters => {
3565            if !cfg.output_delim.is_empty() {
3566                return None;
3567            }
3568            Some(cut_bytes_inplace_general(data, cfg.line_delim, cfg.ranges))
3569        }
3570    }
3571}
3572
3573/// In-place generalized field extraction.
3574/// Handles single fields, contiguous ranges, and non-contiguous multi-field patterns.
3575fn cut_fields_inplace_general(
3576    data: &mut [u8],
3577    delim: u8,
3578    line_delim: u8,
3579    ranges: &[Range],
3580    suppress: bool,
3581) -> usize {
3582    // Special case: field 1 only (existing optimized path)
3583    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end == 1 {
3584        return cut_field1_inplace(data, delim, line_delim, suppress);
3585    }
3586
3587    let len = data.len();
3588    if len == 0 {
3589        return 0;
3590    }
3591
3592    let max_field = ranges.last().map_or(0, |r| r.end);
3593    let max_delims = max_field.min(64);
3594    let mut wp: usize = 0;
3595    let mut rp: usize = 0;
3596
3597    while rp < len {
3598        let line_end = memchr::memchr(line_delim, &data[rp..])
3599            .map(|p| rp + p)
3600            .unwrap_or(len);
3601        let line_len = line_end - rp;
3602
3603        // Collect delimiter positions (relative to line start)
3604        let mut delim_pos = [0usize; 64];
3605        let mut num_delims: usize = 0;
3606
3607        for pos in memchr_iter(delim, &data[rp..line_end]) {
3608            if num_delims < max_delims {
3609                delim_pos[num_delims] = pos;
3610                num_delims += 1;
3611                if num_delims >= max_delims {
3612                    break;
3613                }
3614            }
3615        }
3616
3617        if num_delims == 0 {
3618            // No delimiter in line
3619            if !suppress {
3620                if wp != rp {
3621                    data.copy_within(rp..line_end, wp);
3622                }
3623                wp += line_len;
3624                if line_end < len {
3625                    data[wp] = line_delim;
3626                    wp += 1;
3627                }
3628            }
3629        } else {
3630            let total_fields = num_delims + 1;
3631            let mut first_output = true;
3632
3633            for r in ranges {
3634                let range_start = r.start;
3635                let range_end = r.end.min(total_fields);
3636                if range_start > total_fields {
3637                    break;
3638                }
3639                for field_num in range_start..=range_end {
3640                    if field_num > total_fields {
3641                        break;
3642                    }
3643
3644                    let field_start = if field_num == 1 {
3645                        0
3646                    } else if field_num - 2 < num_delims {
3647                        delim_pos[field_num - 2] + 1
3648                    } else {
3649                        continue;
3650                    };
3651                    let field_end = if field_num <= num_delims {
3652                        delim_pos[field_num - 1]
3653                    } else {
3654                        line_len
3655                    };
3656
3657                    if !first_output {
3658                        data[wp] = delim;
3659                        wp += 1;
3660                    }
3661                    let flen = field_end - field_start;
3662                    if flen > 0 {
3663                        data.copy_within(rp + field_start..rp + field_start + flen, wp);
3664                        wp += flen;
3665                    }
3666                    first_output = false;
3667                }
3668            }
3669
3670            if !first_output && line_end < len {
3671                data[wp] = line_delim;
3672                wp += 1;
3673            } else if first_output && line_end < len {
3674                // No fields selected but line had delimiters — output empty line
3675                data[wp] = line_delim;
3676                wp += 1;
3677            }
3678        }
3679
3680        rp = if line_end < len { line_end + 1 } else { len };
3681    }
3682
3683    wp
3684}
3685
3686/// In-place byte/char range extraction.
3687fn cut_bytes_inplace_general(data: &mut [u8], line_delim: u8, ranges: &[Range]) -> usize {
3688    let len = data.len();
3689    if len == 0 {
3690        return 0;
3691    }
3692
3693    // Quick check: single range from byte 1 to end = no-op
3694    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end == usize::MAX {
3695        return len;
3696    }
3697
3698    // Single range from byte 1: fast truncation path
3699    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end < usize::MAX {
3700        return cut_bytes_from_start_inplace(data, line_delim, ranges[0].end);
3701    }
3702
3703    let mut wp: usize = 0;
3704    let mut rp: usize = 0;
3705
3706    while rp < len {
3707        let line_end = memchr::memchr(line_delim, &data[rp..])
3708            .map(|p| rp + p)
3709            .unwrap_or(len);
3710        let line_len = line_end - rp;
3711
3712        for r in ranges {
3713            let start = r.start.saturating_sub(1);
3714            let end = r.end.min(line_len);
3715            if start >= line_len {
3716                break;
3717            }
3718            let flen = end - start;
3719            if flen > 0 {
3720                data.copy_within(rp + start..rp + start + flen, wp);
3721                wp += flen;
3722            }
3723        }
3724
3725        if line_end < len {
3726            data[wp] = line_delim;
3727            wp += 1;
3728        }
3729
3730        rp = if line_end < len { line_end + 1 } else { len };
3731    }
3732
3733    wp
3734}
3735
3736/// In-place truncation for -b1-N: truncate each line to at most max_bytes.
3737fn cut_bytes_from_start_inplace(data: &mut [u8], line_delim: u8, max_bytes: usize) -> usize {
3738    let len = data.len();
3739
3740    // Quick check: see if all lines fit within max_bytes (common case)
3741    let mut all_fit = true;
3742    let mut start = 0;
3743    for pos in memchr_iter(line_delim, data) {
3744        if pos - start > max_bytes {
3745            all_fit = false;
3746            break;
3747        }
3748        start = pos + 1;
3749    }
3750    if all_fit && start < len && len - start > max_bytes {
3751        all_fit = false;
3752    }
3753    if all_fit {
3754        return len;
3755    }
3756
3757    // Some lines need truncation
3758    let mut wp: usize = 0;
3759    let mut rp: usize = 0;
3760
3761    while rp < len {
3762        let line_end = memchr::memchr(line_delim, &data[rp..])
3763            .map(|p| rp + p)
3764            .unwrap_or(len);
3765        let line_len = line_end - rp;
3766
3767        let take = line_len.min(max_bytes);
3768        if take > 0 && wp != rp {
3769            data.copy_within(rp..rp + take, wp);
3770        }
3771        wp += take;
3772
3773        if line_end < len {
3774            data[wp] = line_delim;
3775            wp += 1;
3776        }
3777
3778        rp = if line_end < len { line_end + 1 } else { len };
3779    }
3780
3781    wp
3782}
3783
/// Cut operation mode: which unit the selected ranges refer to.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum CutMode {
    /// Ranges select byte positions within each line.
    Bytes,
    /// Ranges select character positions; the in-place path (`process_cut_data_mut`)
    /// handles this mode exactly like `Bytes`.
    Characters,
    /// Ranges select delimiter-separated fields within each line.
    Fields,
}