Skip to main content

coreutils_rs/cut/
core.rs

1use memchr::memchr_iter;
2use rayon::prelude::*;
3use std::io::{self, BufRead, Write};
4
5/// Minimum file size for parallel processing (1MB).
6/// Rayon overhead is ~5-10μs per task; at 1MB per chunk,
7/// each chunk takes ~100μs+ to process, so overhead is < 10%.
8const PARALLEL_THRESHOLD: usize = 1024 * 1024;
9
10/// Configuration for cut operations.
11pub struct CutConfig<'a> {
12    pub mode: CutMode,
13    pub ranges: &'a [Range],
14    pub complement: bool,
15    pub delim: u8,
16    pub output_delim: &'a [u8],
17    pub suppress_no_delim: bool,
18    pub line_delim: u8,
19}
20
21/// A range specification like 1, 3-5, -3, 4-
22#[derive(Debug, Clone)]
23pub struct Range {
24    pub start: usize, // 1-based, 0 means "from beginning"
25    pub end: usize,   // 1-based, usize::MAX means "to end"
26}
27
28/// Parse a LIST specification like "1,3-5,7-" into ranges.
29/// Each range is 1-based. Returns sorted, merged ranges.
30pub fn parse_ranges(spec: &str) -> Result<Vec<Range>, String> {
31    let mut ranges = Vec::new();
32
33    for part in spec.split(',') {
34        let part = part.trim();
35        if part.is_empty() {
36            continue;
37        }
38
39        if let Some(idx) = part.find('-') {
40            let left = &part[..idx];
41            let right = &part[idx + 1..];
42
43            let start = if left.is_empty() {
44                1
45            } else {
46                left.parse::<usize>()
47                    .map_err(|_| format!("invalid range: '{}'", part))?
48            };
49
50            let end = if right.is_empty() {
51                usize::MAX
52            } else {
53                right
54                    .parse::<usize>()
55                    .map_err(|_| format!("invalid range: '{}'", part))?
56            };
57
58            if start == 0 {
59                return Err("fields and positions are numbered from 1".to_string());
60            }
61            if start > end {
62                return Err(format!("invalid decreasing range: '{}'", part));
63            }
64
65            ranges.push(Range { start, end });
66        } else {
67            let n = part
68                .parse::<usize>()
69                .map_err(|_| format!("invalid field: '{}'", part))?;
70            if n == 0 {
71                return Err("fields and positions are numbered from 1".to_string());
72            }
73            ranges.push(Range { start: n, end: n });
74        }
75    }
76
77    if ranges.is_empty() {
78        return Err("you must specify a list of bytes, characters, or fields".to_string());
79    }
80
81    // Sort and merge overlapping ranges
82    ranges.sort_by_key(|r| (r.start, r.end));
83    let mut merged = vec![ranges[0].clone()];
84    for r in &ranges[1..] {
85        let last = merged.last_mut().unwrap();
86        if r.start <= last.end.saturating_add(1) {
87            last.end = last.end.max(r.end);
88        } else {
89            merged.push(r.clone());
90        }
91    }
92
93    Ok(merged)
94}
95
96/// Check if a 1-based position is in any range.
97/// Ranges must be sorted. Uses early exit since ranges are sorted.
98#[inline(always)]
99fn in_ranges(ranges: &[Range], pos: usize) -> bool {
100    for r in ranges {
101        if pos < r.start {
102            return false; // ranges are sorted, no point checking further
103        }
104        if pos <= r.end {
105            return true;
106        }
107    }
108    false
109}
110
111/// Pre-compute a 64-bit mask for field selection.
112/// Bit i-1 is set if field i should be output.
113#[inline]
114fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
115    let mut mask: u64 = 0;
116    for i in 1..=64u32 {
117        let in_range = in_ranges(ranges, i as usize);
118        if in_range != complement {
119            mask |= 1u64 << (i - 1);
120        }
121    }
122    mask
123}
124
125/// Check if a field should be selected, using bitset for first 64 fields.
126#[inline(always)]
127fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
128    if field_num <= 64 {
129        (mask >> (field_num - 1)) & 1 == 1
130    } else {
131        in_ranges(ranges, field_num) != complement
132    }
133}
134
135// ── Chunk splitting for parallel processing ──────────────────────────────
136
137/// Split data into chunks aligned to line boundaries for parallel processing.
138/// Returns slices that each end on a line_delim boundary (except possibly the last).
139fn split_into_chunks<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
140    let num_threads = rayon::current_num_threads().max(1);
141    if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
142        return vec![data];
143    }
144
145    let chunk_size = data.len() / num_threads;
146    let mut chunks = Vec::with_capacity(num_threads);
147    let mut pos = 0;
148
149    for _ in 0..num_threads - 1 {
150        let target = pos + chunk_size;
151        if target >= data.len() {
152            break;
153        }
154        // Align to next line boundary
155        let boundary = memchr::memchr(line_delim, &data[target..])
156            .map(|p| target + p + 1)
157            .unwrap_or(data.len());
158        if boundary > pos {
159            chunks.push(&data[pos..boundary]);
160        }
161        pos = boundary;
162    }
163
164    // Last chunk gets the remainder
165    if pos < data.len() {
166        chunks.push(&data[pos..]);
167    }
168
169    chunks
170}
171
172// ── Fast path: field extraction with batched output ──────────────────────
173
174/// Optimized field extraction with early exit and batched output.
175/// Uses SIMD newline scanning + inline byte scan per line.
176fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
177    let delim = cfg.delim;
178    let line_delim = cfg.line_delim;
179    let ranges = cfg.ranges;
180    let complement = cfg.complement;
181    let output_delim = cfg.output_delim;
182    let suppress = cfg.suppress_no_delim;
183
184    // Ultra-fast path: single field extraction (e.g., cut -f5)
185    if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
186        return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
187    }
188
189    // Pre-compute for general field extraction
190    let max_field = if complement {
191        usize::MAX
192    } else {
193        ranges.last().map(|r| r.end).unwrap_or(0)
194    };
195    let field_mask = compute_field_mask(ranges, complement);
196
197    if data.len() >= PARALLEL_THRESHOLD {
198        // Parallel path
199        let chunks = split_into_chunks(data, line_delim);
200        let results: Vec<Vec<u8>> = chunks
201            .par_iter()
202            .map(|chunk| {
203                let mut buf = Vec::with_capacity(chunk.len() / 2);
204                process_fields_chunk(
205                    chunk,
206                    delim,
207                    ranges,
208                    output_delim,
209                    suppress,
210                    max_field,
211                    field_mask,
212                    line_delim,
213                    complement,
214                    &mut buf,
215                );
216                buf
217            })
218            .collect();
219        for result in &results {
220            if !result.is_empty() {
221                out.write_all(result)?;
222            }
223        }
224    } else {
225        // Sequential path
226        let mut buf = Vec::with_capacity(data.len() / 2);
227        process_fields_chunk(
228            data,
229            delim,
230            ranges,
231            output_delim,
232            suppress,
233            max_field,
234            field_mask,
235            line_delim,
236            complement,
237            &mut buf,
238        );
239        if !buf.is_empty() {
240            out.write_all(&buf)?;
241        }
242    }
243    Ok(())
244}
245
246/// Process a chunk of data for general field extraction.
247fn process_fields_chunk(
248    data: &[u8],
249    delim: u8,
250    ranges: &[Range],
251    output_delim: &[u8],
252    suppress: bool,
253    max_field: usize,
254    field_mask: u64,
255    line_delim: u8,
256    complement: bool,
257    buf: &mut Vec<u8>,
258) {
259    let mut start = 0;
260    for end_pos in memchr_iter(line_delim, data) {
261        let line = &data[start..end_pos];
262        extract_fields_to_buf(
263            line,
264            delim,
265            ranges,
266            output_delim,
267            suppress,
268            max_field,
269            field_mask,
270            line_delim,
271            buf,
272            complement,
273        );
274        start = end_pos + 1;
275    }
276    if start < data.len() {
277        extract_fields_to_buf(
278            &data[start..],
279            delim,
280            ranges,
281            output_delim,
282            suppress,
283            max_field,
284            field_mask,
285            line_delim,
286            buf,
287            complement,
288        );
289    }
290}
291
292// ── Ultra-fast single field extraction ───────────────────────────────────
293
294/// Specialized path for extracting exactly one field (e.g., `cut -f5`).
295/// Uses SIMD memchr for newline scanning + minimal inline byte scan per line.
296/// For large files, splits work across multiple threads with rayon.
297fn process_single_field(
298    data: &[u8],
299    delim: u8,
300    line_delim: u8,
301    target: usize,
302    suppress: bool,
303    out: &mut impl Write,
304) -> io::Result<()> {
305    let target_idx = target - 1;
306
307    if data.len() >= PARALLEL_THRESHOLD {
308        // Parallel path: split into chunks aligned to line boundaries
309        let chunks = split_into_chunks(data, line_delim);
310        let results: Vec<Vec<u8>> = chunks
311            .par_iter()
312            .map(|chunk| {
313                let mut buf = Vec::with_capacity(chunk.len() / 4);
314                process_single_field_chunk(
315                    chunk, delim, target_idx, line_delim, suppress, &mut buf,
316                );
317                buf
318            })
319            .collect();
320        for result in &results {
321            if !result.is_empty() {
322                out.write_all(result)?;
323            }
324        }
325    } else {
326        // Sequential path for small data
327        let mut buf = Vec::with_capacity(data.len() / 4);
328        process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
329        if !buf.is_empty() {
330            out.write_all(&buf)?;
331        }
332    }
333    Ok(())
334}
335
336/// Process a chunk of data for single-field extraction.
337fn process_single_field_chunk(
338    data: &[u8],
339    delim: u8,
340    target_idx: usize,
341    line_delim: u8,
342    suppress: bool,
343    buf: &mut Vec<u8>,
344) {
345    let mut start = 0;
346    for end_pos in memchr_iter(line_delim, data) {
347        let line = &data[start..end_pos];
348        extract_single_field_line(line, delim, target_idx, line_delim, suppress, buf);
349        start = end_pos + 1;
350    }
351    // Handle final line without terminator
352    if start < data.len() {
353        extract_single_field_line(&data[start..], delim, target_idx, line_delim, suppress, buf);
354    }
355}
356
357/// Extract a single field from one line using a single SIMD memchr_iter pass.
358/// Collects all delimiter positions in one scan, then indexes directly.
359#[inline(always)]
360fn extract_single_field_line(
361    line: &[u8],
362    delim: u8,
363    target_idx: usize,
364    line_delim: u8,
365    suppress: bool,
366    buf: &mut Vec<u8>,
367) {
368    if line.is_empty() {
369        if !suppress {
370            buf.push(line_delim);
371        }
372        return;
373    }
374
375    // Single SIMD pass to find ALL delimiter positions
376    // This reduces N SIMD scans to 1 for field N
377    let mut delim_positions: [usize; 64] = [0; 64];
378    let mut num_delims: usize = 0;
379
380    for pos in memchr_iter(delim, line) {
381        if num_delims < 64 {
382            delim_positions[num_delims] = pos;
383        }
384        num_delims += 1;
385        // Early exit: we have enough delimiters to extract our target field
386        if num_delims > target_idx {
387            break;
388        }
389    }
390
391    // No delimiters found — output whole line or suppress
392    if num_delims == 0 {
393        if !suppress {
394            buf.extend_from_slice(line);
395            buf.push(line_delim);
396        }
397        return;
398    }
399
400    // Fast path: target is field 1 (cut -f1)
401    if target_idx == 0 {
402        buf.extend_from_slice(&line[..delim_positions[0]]);
403        buf.push(line_delim);
404        return;
405    }
406
407    // Not enough fields for target
408    if num_delims < target_idx {
409        buf.push(line_delim);
410        return;
411    }
412
413    // Field start is right after the (target_idx-1)th delimiter
414    let field_start = delim_positions[target_idx - 1] + 1;
415
416    // Field end is the target_idx-th delimiter (or end of line)
417    if num_delims > target_idx && target_idx < 64 {
418        buf.extend_from_slice(&line[field_start..delim_positions[target_idx]]);
419    } else {
420        buf.extend_from_slice(&line[field_start..]);
421    }
422    buf.push(line_delim);
423}
424
425/// Extract fields from a single line into the output buffer.
426/// Uses inline byte scanning with early exit for maximum performance.
427#[inline(always)]
428fn extract_fields_to_buf(
429    line: &[u8],
430    delim: u8,
431    ranges: &[Range],
432    output_delim: &[u8],
433    suppress: bool,
434    max_field: usize,
435    field_mask: u64,
436    line_delim: u8,
437    buf: &mut Vec<u8>,
438    complement: bool,
439) {
440    let len = line.len();
441
442    // Empty line: no delimiter possible
443    if len == 0 {
444        if !suppress {
445            buf.push(line_delim);
446        }
447        return;
448    }
449
450    let mut field_num: usize = 1;
451    let mut field_start: usize = 0;
452    let mut first_output = true;
453    let mut has_delim = false;
454
455    // SIMD-accelerated delimiter scanning with early exit
456    for delim_pos in memchr_iter(delim, line) {
457        has_delim = true;
458
459        if is_selected(field_num, field_mask, ranges, complement) {
460            if !first_output {
461                buf.extend_from_slice(output_delim);
462            }
463            buf.extend_from_slice(&line[field_start..delim_pos]);
464            first_output = false;
465        }
466
467        field_num += 1;
468        field_start = delim_pos + 1;
469
470        // Early exit: past the last needed field
471        if field_num > max_field {
472            break;
473        }
474    }
475
476    // Last field (only if we didn't early-exit past it)
477    if (field_num <= max_field || complement)
478        && has_delim
479        && is_selected(field_num, field_mask, ranges, complement)
480    {
481        if !first_output {
482            buf.extend_from_slice(output_delim);
483        }
484        buf.extend_from_slice(&line[field_start..len]);
485        first_output = false;
486    }
487
488    // Output line terminator
489    if !first_output {
490        // Had output — add line delimiter
491        buf.push(line_delim);
492    } else if !has_delim {
493        // No delimiter found — output whole line or suppress
494        if !suppress {
495            buf.extend_from_slice(line);
496            buf.push(line_delim);
497        }
498    } else {
499        // Had delimiter but no selected field — output empty line (GNU compat)
500        buf.push(line_delim);
501    }
502}
503
504// ── Fast path: byte/char extraction with batched output ──────────────────
505
506/// Optimized byte/char extraction with batched output and parallel processing.
507fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
508    let line_delim = cfg.line_delim;
509    let ranges = cfg.ranges;
510    let complement = cfg.complement;
511    let output_delim = cfg.output_delim;
512
513    if data.len() >= PARALLEL_THRESHOLD {
514        let chunks = split_into_chunks(data, line_delim);
515        let results: Vec<Vec<u8>> = chunks
516            .par_iter()
517            .map(|chunk| {
518                let mut buf = Vec::with_capacity(chunk.len() / 2);
519                process_bytes_chunk(
520                    chunk,
521                    ranges,
522                    complement,
523                    output_delim,
524                    line_delim,
525                    &mut buf,
526                );
527                buf
528            })
529            .collect();
530        for result in &results {
531            if !result.is_empty() {
532                out.write_all(result)?;
533            }
534        }
535    } else {
536        let mut buf = Vec::with_capacity(data.len() / 2);
537        process_bytes_chunk(data, ranges, complement, output_delim, line_delim, &mut buf);
538        if !buf.is_empty() {
539            out.write_all(&buf)?;
540        }
541    }
542    Ok(())
543}
544
545/// Process a chunk of data for byte/char extraction.
546fn process_bytes_chunk(
547    data: &[u8],
548    ranges: &[Range],
549    complement: bool,
550    output_delim: &[u8],
551    line_delim: u8,
552    buf: &mut Vec<u8>,
553) {
554    let mut start = 0;
555    for end_pos in memchr_iter(line_delim, data) {
556        let line = &data[start..end_pos];
557        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
558        buf.push(line_delim);
559        start = end_pos + 1;
560    }
561    if start < data.len() {
562        cut_bytes_to_buf(&data[start..], ranges, complement, output_delim, buf);
563        buf.push(line_delim);
564    }
565}
566
567/// Extract byte ranges from a line into the output buffer.
568/// For the common non-complement case with contiguous ranges, uses bulk copy.
569#[inline(always)]
570fn cut_bytes_to_buf(
571    line: &[u8],
572    ranges: &[Range],
573    complement: bool,
574    output_delim: &[u8],
575    buf: &mut Vec<u8>,
576) {
577    let len = line.len();
578    let mut first_range = true;
579
580    if complement {
581        let mut pos: usize = 1;
582        for r in ranges {
583            let rs = r.start;
584            let re = r.end.min(len);
585            if pos < rs {
586                if !first_range && !output_delim.is_empty() {
587                    buf.extend_from_slice(output_delim);
588                }
589                buf.extend_from_slice(&line[pos - 1..rs - 1]);
590                first_range = false;
591            }
592            pos = re + 1;
593            if pos > len {
594                break;
595            }
596        }
597        if pos <= len {
598            if !first_range && !output_delim.is_empty() {
599                buf.extend_from_slice(output_delim);
600            }
601            buf.extend_from_slice(&line[pos - 1..len]);
602        }
603    } else if output_delim.is_empty() && ranges.len() == 1 {
604        // Ultra-fast path: single range, no output delimiter
605        let start = ranges[0].start.saturating_sub(1);
606        let end = ranges[0].end.min(len);
607        if start < len {
608            buf.extend_from_slice(&line[start..end]);
609        }
610    } else {
611        for r in ranges {
612            let start = r.start.saturating_sub(1);
613            let end = r.end.min(len);
614            if start >= len {
615                break;
616            }
617            if !first_range && !output_delim.is_empty() {
618                buf.extend_from_slice(output_delim);
619            }
620            buf.extend_from_slice(&line[start..end]);
621            first_range = false;
622        }
623    }
624}
625
626// ── Public API ───────────────────────────────────────────────────────────
627
628/// Cut fields from a line using a delimiter. Writes to `out`.
629/// Returns true if any output was written, false if suppressed.
630/// Used by process_cut_reader (stdin path) and unit tests.
631#[inline]
632pub fn cut_fields(
633    line: &[u8],
634    delim: u8,
635    ranges: &[Range],
636    complement: bool,
637    output_delim: &[u8],
638    suppress_no_delim: bool,
639    out: &mut impl Write,
640) -> io::Result<bool> {
641    // Check if line contains delimiter at all
642    if memchr::memchr(delim, line).is_none() {
643        if !suppress_no_delim {
644            out.write_all(line)?;
645            return Ok(true);
646        }
647        return Ok(false); // suppressed
648    }
649
650    // Walk through fields using memchr, output selected ones
651    let mut field_num: usize = 1;
652    let mut field_start: usize = 0;
653    let mut first_output = true;
654
655    for delim_pos in memchr_iter(delim, line) {
656        let selected = in_ranges(ranges, field_num) != complement;
657        if selected {
658            if !first_output {
659                out.write_all(output_delim)?;
660            }
661            out.write_all(&line[field_start..delim_pos])?;
662            first_output = false;
663        }
664        field_start = delim_pos + 1;
665        field_num += 1;
666    }
667
668    // Last field (after last delimiter)
669    let selected = in_ranges(ranges, field_num) != complement;
670    if selected {
671        if !first_output {
672            out.write_all(output_delim)?;
673        }
674        out.write_all(&line[field_start..])?;
675    }
676
677    Ok(true)
678}
679
680/// Cut bytes/chars from a line. Writes selected bytes to `out`.
681/// Used by process_cut_reader (stdin path) and unit tests.
682#[inline]
683pub fn cut_bytes(
684    line: &[u8],
685    ranges: &[Range],
686    complement: bool,
687    output_delim: &[u8],
688    out: &mut impl Write,
689) -> io::Result<bool> {
690    let mut first_range = true;
691
692    if complement {
693        let len = line.len();
694        let mut comp_ranges = Vec::new();
695        let mut pos: usize = 1;
696        for r in ranges {
697            let rs = r.start;
698            let re = r.end.min(len);
699            if pos < rs {
700                comp_ranges.push((pos, rs - 1));
701            }
702            pos = re + 1;
703            if pos > len {
704                break;
705            }
706        }
707        if pos <= len {
708            comp_ranges.push((pos, len));
709        }
710        for &(s, e) in &comp_ranges {
711            if !first_range && !output_delim.is_empty() {
712                out.write_all(output_delim)?;
713            }
714            out.write_all(&line[s - 1..e])?;
715            first_range = false;
716        }
717    } else {
718        for r in ranges {
719            let start = r.start.saturating_sub(1);
720            let end = r.end.min(line.len());
721            if start >= line.len() {
722                break;
723            }
724            if !first_range && !output_delim.is_empty() {
725                out.write_all(output_delim)?;
726            }
727            out.write_all(&line[start..end])?;
728            first_range = false;
729        }
730    }
731    Ok(true)
732}
733
734/// Process a full data buffer (from mmap or read) with cut operation.
735/// Dispatches to optimized fast paths for field and byte modes.
736pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
737    match cfg.mode {
738        CutMode::Fields => process_fields_fast(data, cfg, out),
739        CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
740    }
741}
742
743/// Process input from a reader (for stdin).
744pub fn process_cut_reader<R: BufRead>(
745    mut reader: R,
746    cfg: &CutConfig,
747    out: &mut impl Write,
748) -> io::Result<()> {
749    let mut buf = Vec::new();
750
751    loop {
752        buf.clear();
753        let n = reader.read_until(cfg.line_delim, &mut buf)?;
754        if n == 0 {
755            break;
756        }
757
758        let has_line_delim = buf.last() == Some(&cfg.line_delim);
759        let line = if has_line_delim {
760            &buf[..buf.len() - 1]
761        } else {
762            &buf[..]
763        };
764
765        let wrote = process_one_line(line, cfg, out)?;
766
767        // GNU always terminates output lines, even if input had no trailing delimiter
768        if wrote {
769            out.write_all(&[cfg.line_delim])?;
770        }
771    }
772
773    Ok(())
774}
775
776/// Process one line according to the cut config (used by stdin reader path).
777#[inline]
778fn process_one_line(line: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<bool> {
779    match cfg.mode {
780        CutMode::Fields => cut_fields(
781            line,
782            cfg.delim,
783            cfg.ranges,
784            cfg.complement,
785            cfg.output_delim,
786            cfg.suppress_no_delim,
787            out,
788        ),
789        CutMode::Bytes | CutMode::Characters => {
790            cut_bytes(line, cfg.ranges, cfg.complement, cfg.output_delim, out)
791        }
792    }
793}
794
795/// Cut operation mode
796#[derive(Debug, Clone, Copy, PartialEq)]
797pub enum CutMode {
798    Bytes,
799    Characters,
800    Fields,
801}