coreutils_rs/cut/
core.rs

1use memchr::memchr_iter;
2use rayon::prelude::*;
3use std::io::{self, BufRead, Write};
4
5/// Minimum file size for parallel processing (1MB).
6/// Rayon overhead is ~5-10μs per task; at 1MB per chunk,
7/// each chunk takes ~100μs+ to process, so overhead is < 10%.
8const PARALLEL_THRESHOLD: usize = 1024 * 1024;
9
10/// Configuration for cut operations.
11pub struct CutConfig<'a> {
12    pub mode: CutMode,
13    pub ranges: &'a [Range],
14    pub complement: bool,
15    pub delim: u8,
16    pub output_delim: &'a [u8],
17    pub suppress_no_delim: bool,
18    pub line_delim: u8,
19}
20
21/// A range specification like 1, 3-5, -3, 4-
22#[derive(Debug, Clone)]
23pub struct Range {
24    pub start: usize, // 1-based, 0 means "from beginning"
25    pub end: usize,   // 1-based, usize::MAX means "to end"
26}
27
28/// Parse a LIST specification like "1,3-5,7-" into ranges.
29/// Each range is 1-based. Returns sorted, merged ranges.
30pub fn parse_ranges(spec: &str) -> Result<Vec<Range>, String> {
31    let mut ranges = Vec::new();
32
33    for part in spec.split(',') {
34        let part = part.trim();
35        if part.is_empty() {
36            continue;
37        }
38
39        if let Some(idx) = part.find('-') {
40            let left = &part[..idx];
41            let right = &part[idx + 1..];
42
43            let start = if left.is_empty() {
44                1
45            } else {
46                left.parse::<usize>()
47                    .map_err(|_| format!("invalid range: '{}'", part))?
48            };
49
50            let end = if right.is_empty() {
51                usize::MAX
52            } else {
53                right
54                    .parse::<usize>()
55                    .map_err(|_| format!("invalid range: '{}'", part))?
56            };
57
58            if start == 0 {
59                return Err("fields and positions are numbered from 1".to_string());
60            }
61            if start > end {
62                return Err(format!("invalid decreasing range: '{}'", part));
63            }
64
65            ranges.push(Range { start, end });
66        } else {
67            let n = part
68                .parse::<usize>()
69                .map_err(|_| format!("invalid field: '{}'", part))?;
70            if n == 0 {
71                return Err("fields and positions are numbered from 1".to_string());
72            }
73            ranges.push(Range { start: n, end: n });
74        }
75    }
76
77    if ranges.is_empty() {
78        return Err("you must specify a list of bytes, characters, or fields".to_string());
79    }
80
81    // Sort and merge overlapping ranges
82    ranges.sort_by_key(|r| (r.start, r.end));
83    let mut merged = vec![ranges[0].clone()];
84    for r in &ranges[1..] {
85        let last = merged.last_mut().unwrap();
86        if r.start <= last.end.saturating_add(1) {
87            last.end = last.end.max(r.end);
88        } else {
89            merged.push(r.clone());
90        }
91    }
92
93    Ok(merged)
94}
95
96/// Check if a 1-based position is in any range.
97/// Ranges must be sorted. Uses early exit since ranges are sorted.
98#[inline(always)]
99fn in_ranges(ranges: &[Range], pos: usize) -> bool {
100    for r in ranges {
101        if pos < r.start {
102            return false; // ranges are sorted, no point checking further
103        }
104        if pos <= r.end {
105            return true;
106        }
107    }
108    false
109}
110
111/// Pre-compute a 64-bit mask for field selection.
112/// Bit i-1 is set if field i should be output.
113#[inline]
114fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
115    let mut mask: u64 = 0;
116    for i in 1..=64u32 {
117        let in_range = in_ranges(ranges, i as usize);
118        if in_range != complement {
119            mask |= 1u64 << (i - 1);
120        }
121    }
122    mask
123}
124
125/// Check if a field should be selected, using bitset for first 64 fields.
126#[inline(always)]
127fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
128    if field_num <= 64 {
129        (mask >> (field_num - 1)) & 1 == 1
130    } else {
131        in_ranges(ranges, field_num) != complement
132    }
133}
134
135// ── Chunk splitting for parallel processing ──────────────────────────────
136
137/// Split data into chunks aligned to line boundaries for parallel processing.
138/// Returns slices that each end on a line_delim boundary (except possibly the last).
139fn split_into_chunks<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
140    let num_threads = rayon::current_num_threads().max(1);
141    if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
142        return vec![data];
143    }
144
145    let chunk_size = data.len() / num_threads;
146    let mut chunks = Vec::with_capacity(num_threads);
147    let mut pos = 0;
148
149    for _ in 0..num_threads - 1 {
150        let target = pos + chunk_size;
151        if target >= data.len() {
152            break;
153        }
154        // Align to next line boundary
155        let boundary = memchr::memchr(line_delim, &data[target..])
156            .map(|p| target + p + 1)
157            .unwrap_or(data.len());
158        if boundary > pos {
159            chunks.push(&data[pos..boundary]);
160        }
161        pos = boundary;
162    }
163
164    // Last chunk gets the remainder
165    if pos < data.len() {
166        chunks.push(&data[pos..]);
167    }
168
169    chunks
170}
171
172// ── Fast path: field extraction with batched output ──────────────────────
173
174/// Optimized field extraction with early exit and batched output.
175/// Uses SIMD newline scanning + inline byte scan per line.
176fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
177    let delim = cfg.delim;
178    let line_delim = cfg.line_delim;
179    let ranges = cfg.ranges;
180    let complement = cfg.complement;
181    let output_delim = cfg.output_delim;
182    let suppress = cfg.suppress_no_delim;
183
184    // Ultra-fast path: single field extraction (e.g., cut -f5)
185    if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
186        return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
187    }
188
189    // Pre-compute for general field extraction
190    let max_field = if complement {
191        usize::MAX
192    } else {
193        ranges.last().map(|r| r.end).unwrap_or(0)
194    };
195    let field_mask = compute_field_mask(ranges, complement);
196
197    if data.len() >= PARALLEL_THRESHOLD {
198        // Parallel path
199        let chunks = split_into_chunks(data, line_delim);
200        let results: Vec<Vec<u8>> = chunks
201            .par_iter()
202            .map(|chunk| {
203                let mut buf = Vec::with_capacity(chunk.len() / 2);
204                process_fields_chunk(
205                    chunk,
206                    delim,
207                    ranges,
208                    output_delim,
209                    suppress,
210                    max_field,
211                    field_mask,
212                    line_delim,
213                    complement,
214                    &mut buf,
215                );
216                buf
217            })
218            .collect();
219        for result in &results {
220            if !result.is_empty() {
221                out.write_all(result)?;
222            }
223        }
224    } else {
225        // Sequential path
226        let mut buf = Vec::with_capacity(data.len() / 2);
227        process_fields_chunk(
228            data,
229            delim,
230            ranges,
231            output_delim,
232            suppress,
233            max_field,
234            field_mask,
235            line_delim,
236            complement,
237            &mut buf,
238        );
239        if !buf.is_empty() {
240            out.write_all(&buf)?;
241        }
242    }
243    Ok(())
244}
245
246/// Process a chunk of data for general field extraction.
247fn process_fields_chunk(
248    data: &[u8],
249    delim: u8,
250    ranges: &[Range],
251    output_delim: &[u8],
252    suppress: bool,
253    max_field: usize,
254    field_mask: u64,
255    line_delim: u8,
256    complement: bool,
257    buf: &mut Vec<u8>,
258) {
259    let mut start = 0;
260    for end_pos in memchr_iter(line_delim, data) {
261        let line = &data[start..end_pos];
262        extract_fields_to_buf(
263            line,
264            delim,
265            ranges,
266            output_delim,
267            suppress,
268            max_field,
269            field_mask,
270            line_delim,
271            buf,
272            complement,
273        );
274        start = end_pos + 1;
275    }
276    if start < data.len() {
277        extract_fields_to_buf(
278            &data[start..],
279            delim,
280            ranges,
281            output_delim,
282            suppress,
283            max_field,
284            field_mask,
285            line_delim,
286            buf,
287            complement,
288        );
289    }
290}
291
292// ── Ultra-fast single field extraction ───────────────────────────────────
293
294/// Specialized path for extracting exactly one field (e.g., `cut -f5`).
295/// Uses SIMD memchr for newline scanning + minimal inline byte scan per line.
296/// For large files, splits work across multiple threads with rayon.
297fn process_single_field(
298    data: &[u8],
299    delim: u8,
300    line_delim: u8,
301    target: usize,
302    suppress: bool,
303    out: &mut impl Write,
304) -> io::Result<()> {
305    let target_idx = target - 1;
306
307    if data.len() >= PARALLEL_THRESHOLD {
308        // Parallel path: split into chunks aligned to line boundaries
309        let chunks = split_into_chunks(data, line_delim);
310        let results: Vec<Vec<u8>> = chunks
311            .par_iter()
312            .map(|chunk| {
313                let mut buf = Vec::with_capacity(chunk.len() / 4);
314                process_single_field_chunk(
315                    chunk, delim, target_idx, line_delim, suppress, &mut buf,
316                );
317                buf
318            })
319            .collect();
320        for result in &results {
321            if !result.is_empty() {
322                out.write_all(result)?;
323            }
324        }
325    } else {
326        // Sequential path for small data
327        let mut buf = Vec::with_capacity(data.len() / 4);
328        process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
329        if !buf.is_empty() {
330            out.write_all(&buf)?;
331        }
332    }
333    Ok(())
334}
335
336/// Process a chunk of data for single-field extraction.
337fn process_single_field_chunk(
338    data: &[u8],
339    delim: u8,
340    target_idx: usize,
341    line_delim: u8,
342    suppress: bool,
343    buf: &mut Vec<u8>,
344) {
345    let mut start = 0;
346    for end_pos in memchr_iter(line_delim, data) {
347        let line = &data[start..end_pos];
348        extract_single_field_line(line, delim, target_idx, line_delim, suppress, buf);
349        start = end_pos + 1;
350    }
351    // Handle final line without terminator
352    if start < data.len() {
353        extract_single_field_line(&data[start..], delim, target_idx, line_delim, suppress, buf);
354    }
355}
356
357/// Extract a single field from one line using SIMD memchr_iter with early exit.
358/// Tracks field boundaries directly — no stack array needed.
359#[inline(always)]
360fn extract_single_field_line(
361    line: &[u8],
362    delim: u8,
363    target_idx: usize,
364    line_delim: u8,
365    suppress: bool,
366    buf: &mut Vec<u8>,
367) {
368    if line.is_empty() {
369        if !suppress {
370            buf.push(line_delim);
371        }
372        return;
373    }
374
375    let mut field_start = 0;
376    let mut field_idx = 0;
377    let mut has_delim = false;
378
379    for pos in memchr_iter(delim, line) {
380        has_delim = true;
381        if field_idx == target_idx {
382            // Found end of target field — output and return
383            buf.extend_from_slice(&line[field_start..pos]);
384            buf.push(line_delim);
385            return;
386        }
387        field_idx += 1;
388        field_start = pos + 1;
389    }
390
391    if !has_delim {
392        // No delimiters found — output whole line or suppress
393        if !suppress {
394            buf.extend_from_slice(line);
395            buf.push(line_delim);
396        }
397        return;
398    }
399
400    if field_idx == target_idx {
401        // Target is the last field (no trailing delimiter)
402        buf.extend_from_slice(&line[field_start..]);
403        buf.push(line_delim);
404    } else {
405        // Not enough fields — output empty line
406        buf.push(line_delim);
407    }
408}
409
410/// Extract fields from a single line into the output buffer.
411/// Uses inline byte scanning with early exit for maximum performance.
412#[inline(always)]
413fn extract_fields_to_buf(
414    line: &[u8],
415    delim: u8,
416    ranges: &[Range],
417    output_delim: &[u8],
418    suppress: bool,
419    max_field: usize,
420    field_mask: u64,
421    line_delim: u8,
422    buf: &mut Vec<u8>,
423    complement: bool,
424) {
425    let len = line.len();
426
427    // Empty line: no delimiter possible
428    if len == 0 {
429        if !suppress {
430            buf.push(line_delim);
431        }
432        return;
433    }
434
435    let mut field_num: usize = 1;
436    let mut field_start: usize = 0;
437    let mut first_output = true;
438    let mut has_delim = false;
439
440    // SIMD-accelerated delimiter scanning with early exit
441    for delim_pos in memchr_iter(delim, line) {
442        has_delim = true;
443
444        if is_selected(field_num, field_mask, ranges, complement) {
445            if !first_output {
446                buf.extend_from_slice(output_delim);
447            }
448            buf.extend_from_slice(&line[field_start..delim_pos]);
449            first_output = false;
450        }
451
452        field_num += 1;
453        field_start = delim_pos + 1;
454
455        // Early exit: past the last needed field
456        if field_num > max_field {
457            break;
458        }
459    }
460
461    // Last field (only if we didn't early-exit past it)
462    if (field_num <= max_field || complement)
463        && has_delim
464        && is_selected(field_num, field_mask, ranges, complement)
465    {
466        if !first_output {
467            buf.extend_from_slice(output_delim);
468        }
469        buf.extend_from_slice(&line[field_start..len]);
470        first_output = false;
471    }
472
473    // Output line terminator
474    if !first_output {
475        // Had output — add line delimiter
476        buf.push(line_delim);
477    } else if !has_delim {
478        // No delimiter found — output whole line or suppress
479        if !suppress {
480            buf.extend_from_slice(line);
481            buf.push(line_delim);
482        }
483    } else {
484        // Had delimiter but no selected field — output empty line (GNU compat)
485        buf.push(line_delim);
486    }
487}
488
489// ── Fast path: byte/char extraction with batched output ──────────────────
490
491/// Optimized byte/char extraction with batched output and parallel processing.
492fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
493    let line_delim = cfg.line_delim;
494    let ranges = cfg.ranges;
495    let complement = cfg.complement;
496    let output_delim = cfg.output_delim;
497
498    if data.len() >= PARALLEL_THRESHOLD {
499        let chunks = split_into_chunks(data, line_delim);
500        let results: Vec<Vec<u8>> = chunks
501            .par_iter()
502            .map(|chunk| {
503                let mut buf = Vec::with_capacity(chunk.len() / 2);
504                process_bytes_chunk(
505                    chunk,
506                    ranges,
507                    complement,
508                    output_delim,
509                    line_delim,
510                    &mut buf,
511                );
512                buf
513            })
514            .collect();
515        for result in &results {
516            if !result.is_empty() {
517                out.write_all(result)?;
518            }
519        }
520    } else {
521        let mut buf = Vec::with_capacity(data.len() / 2);
522        process_bytes_chunk(data, ranges, complement, output_delim, line_delim, &mut buf);
523        if !buf.is_empty() {
524            out.write_all(&buf)?;
525        }
526    }
527    Ok(())
528}
529
530/// Process a chunk of data for byte/char extraction.
531fn process_bytes_chunk(
532    data: &[u8],
533    ranges: &[Range],
534    complement: bool,
535    output_delim: &[u8],
536    line_delim: u8,
537    buf: &mut Vec<u8>,
538) {
539    let mut start = 0;
540    for end_pos in memchr_iter(line_delim, data) {
541        let line = &data[start..end_pos];
542        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
543        buf.push(line_delim);
544        start = end_pos + 1;
545    }
546    if start < data.len() {
547        cut_bytes_to_buf(&data[start..], ranges, complement, output_delim, buf);
548        buf.push(line_delim);
549    }
550}
551
552/// Extract byte ranges from a line into the output buffer.
553/// For the common non-complement case with contiguous ranges, uses bulk copy.
554#[inline(always)]
555fn cut_bytes_to_buf(
556    line: &[u8],
557    ranges: &[Range],
558    complement: bool,
559    output_delim: &[u8],
560    buf: &mut Vec<u8>,
561) {
562    let len = line.len();
563    let mut first_range = true;
564
565    if complement {
566        let mut pos: usize = 1;
567        for r in ranges {
568            let rs = r.start;
569            let re = r.end.min(len);
570            if pos < rs {
571                if !first_range && !output_delim.is_empty() {
572                    buf.extend_from_slice(output_delim);
573                }
574                buf.extend_from_slice(&line[pos - 1..rs - 1]);
575                first_range = false;
576            }
577            pos = re + 1;
578            if pos > len {
579                break;
580            }
581        }
582        if pos <= len {
583            if !first_range && !output_delim.is_empty() {
584                buf.extend_from_slice(output_delim);
585            }
586            buf.extend_from_slice(&line[pos - 1..len]);
587        }
588    } else if output_delim.is_empty() && ranges.len() == 1 {
589        // Ultra-fast path: single range, no output delimiter
590        let start = ranges[0].start.saturating_sub(1);
591        let end = ranges[0].end.min(len);
592        if start < len {
593            buf.extend_from_slice(&line[start..end]);
594        }
595    } else {
596        for r in ranges {
597            let start = r.start.saturating_sub(1);
598            let end = r.end.min(len);
599            if start >= len {
600                break;
601            }
602            if !first_range && !output_delim.is_empty() {
603                buf.extend_from_slice(output_delim);
604            }
605            buf.extend_from_slice(&line[start..end]);
606            first_range = false;
607        }
608    }
609}
610
611// ── Public API ───────────────────────────────────────────────────────────
612
613/// Cut fields from a line using a delimiter. Writes to `out`.
614/// Returns true if any output was written, false if suppressed.
615/// Used by process_cut_reader (stdin path) and unit tests.
616#[inline]
617pub fn cut_fields(
618    line: &[u8],
619    delim: u8,
620    ranges: &[Range],
621    complement: bool,
622    output_delim: &[u8],
623    suppress_no_delim: bool,
624    out: &mut impl Write,
625) -> io::Result<bool> {
626    // Check if line contains delimiter at all
627    if memchr::memchr(delim, line).is_none() {
628        if !suppress_no_delim {
629            out.write_all(line)?;
630            return Ok(true);
631        }
632        return Ok(false); // suppressed
633    }
634
635    // Walk through fields using memchr, output selected ones
636    let mut field_num: usize = 1;
637    let mut field_start: usize = 0;
638    let mut first_output = true;
639
640    for delim_pos in memchr_iter(delim, line) {
641        let selected = in_ranges(ranges, field_num) != complement;
642        if selected {
643            if !first_output {
644                out.write_all(output_delim)?;
645            }
646            out.write_all(&line[field_start..delim_pos])?;
647            first_output = false;
648        }
649        field_start = delim_pos + 1;
650        field_num += 1;
651    }
652
653    // Last field (after last delimiter)
654    let selected = in_ranges(ranges, field_num) != complement;
655    if selected {
656        if !first_output {
657            out.write_all(output_delim)?;
658        }
659        out.write_all(&line[field_start..])?;
660    }
661
662    Ok(true)
663}
664
665/// Cut bytes/chars from a line. Writes selected bytes to `out`.
666/// Used by process_cut_reader (stdin path) and unit tests.
667#[inline]
668pub fn cut_bytes(
669    line: &[u8],
670    ranges: &[Range],
671    complement: bool,
672    output_delim: &[u8],
673    out: &mut impl Write,
674) -> io::Result<bool> {
675    let mut first_range = true;
676
677    if complement {
678        let len = line.len();
679        let mut comp_ranges = Vec::new();
680        let mut pos: usize = 1;
681        for r in ranges {
682            let rs = r.start;
683            let re = r.end.min(len);
684            if pos < rs {
685                comp_ranges.push((pos, rs - 1));
686            }
687            pos = re + 1;
688            if pos > len {
689                break;
690            }
691        }
692        if pos <= len {
693            comp_ranges.push((pos, len));
694        }
695        for &(s, e) in &comp_ranges {
696            if !first_range && !output_delim.is_empty() {
697                out.write_all(output_delim)?;
698            }
699            out.write_all(&line[s - 1..e])?;
700            first_range = false;
701        }
702    } else {
703        for r in ranges {
704            let start = r.start.saturating_sub(1);
705            let end = r.end.min(line.len());
706            if start >= line.len() {
707                break;
708            }
709            if !first_range && !output_delim.is_empty() {
710                out.write_all(output_delim)?;
711            }
712            out.write_all(&line[start..end])?;
713            first_range = false;
714        }
715    }
716    Ok(true)
717}
718
719/// Process a full data buffer (from mmap or read) with cut operation.
720/// Dispatches to optimized fast paths for field and byte modes.
721pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
722    match cfg.mode {
723        CutMode::Fields => process_fields_fast(data, cfg, out),
724        CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
725    }
726}
727
728/// Process input from a reader (for stdin).
729pub fn process_cut_reader<R: BufRead>(
730    mut reader: R,
731    cfg: &CutConfig,
732    out: &mut impl Write,
733) -> io::Result<()> {
734    let mut buf = Vec::new();
735
736    loop {
737        buf.clear();
738        let n = reader.read_until(cfg.line_delim, &mut buf)?;
739        if n == 0 {
740            break;
741        }
742
743        let has_line_delim = buf.last() == Some(&cfg.line_delim);
744        let line = if has_line_delim {
745            &buf[..buf.len() - 1]
746        } else {
747            &buf[..]
748        };
749
750        let wrote = process_one_line(line, cfg, out)?;
751
752        // GNU always terminates output lines, even if input had no trailing delimiter
753        if wrote {
754            out.write_all(&[cfg.line_delim])?;
755        }
756    }
757
758    Ok(())
759}
760
761/// Process one line according to the cut config (used by stdin reader path).
762#[inline]
763fn process_one_line(line: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<bool> {
764    match cfg.mode {
765        CutMode::Fields => cut_fields(
766            line,
767            cfg.delim,
768            cfg.ranges,
769            cfg.complement,
770            cfg.output_delim,
771            cfg.suppress_no_delim,
772            out,
773        ),
774        CutMode::Bytes | CutMode::Characters => {
775            cut_bytes(line, cfg.ranges, cfg.complement, cfg.output_delim, out)
776        }
777    }
778}
779
780/// Cut operation mode
781#[derive(Debug, Clone, Copy, PartialEq)]
782pub enum CutMode {
783    Bytes,
784    Characters,
785    Fields,
786}
coreutils_rs/cut/core.rs

coreutils_rs/cut/
core.rs