1use memchr::memchr_iter;
2use rayon::prelude::*;
3use std::io::{self, BufRead, IoSlice, Write};
4
/// Input size in bytes (2 MiB) at or above which the processing functions in
/// this file split the data into chunks and process them in parallel.
const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;

/// Maximum number of `IoSlice`s handed to a single `write_vectored` call.
const MAX_IOV: usize = 1024;
10
/// Settings shared by the cut fast paths in this file.
pub struct CutConfig<'a> {
    /// Selection mode. NOTE(review): `CutMode` is declared outside this view;
    /// presumably it distinguishes byte/character/field modes — confirm.
    pub mode: CutMode,
    /// Selected ranges; the helpers here scan them in ascending order, which is
    /// the form `parse_ranges` returns.
    pub ranges: &'a [Range],
    /// When true, the selection is inverted (everything *not* in `ranges`).
    pub complement: bool,
    /// Input field delimiter byte.
    pub delim: u8,
    /// Bytes written between selected fields in the output.
    pub output_delim: &'a [u8],
    /// When true, lines that contain no delimiter are dropped instead of being
    /// passed through unchanged.
    pub suppress_no_delim: bool,
    /// Byte that terminates each input line and each output line.
    pub line_delim: u8,
}
21
/// An inclusive, 1-based range of field or byte positions.
///
/// `parse_ranges` uses `end == usize::MAX` for an open-ended range such as `3-`.
#[derive(Debug, Clone)]
pub struct Range {
    /// First selected position (1-based, inclusive).
    pub start: usize,
    /// Last selected position (inclusive); `usize::MAX` means "to end of line".
    pub end: usize,
}
28
29pub fn parse_ranges(spec: &str) -> Result<Vec<Range>, String> {
32 let mut ranges = Vec::new();
33
34 for part in spec.split(',') {
35 let part = part.trim();
36 if part.is_empty() {
37 continue;
38 }
39
40 if let Some(idx) = part.find('-') {
41 let left = &part[..idx];
42 let right = &part[idx + 1..];
43
44 let start = if left.is_empty() {
45 1
46 } else {
47 left.parse::<usize>()
48 .map_err(|_| format!("invalid range: '{}'", part))?
49 };
50
51 let end = if right.is_empty() {
52 usize::MAX
53 } else {
54 right
55 .parse::<usize>()
56 .map_err(|_| format!("invalid range: '{}'", part))?
57 };
58
59 if start == 0 {
60 return Err("fields and positions are numbered from 1".to_string());
61 }
62 if start > end {
63 return Err(format!("invalid decreasing range: '{}'", part));
64 }
65
66 ranges.push(Range { start, end });
67 } else {
68 let n = part
69 .parse::<usize>()
70 .map_err(|_| format!("invalid field: '{}'", part))?;
71 if n == 0 {
72 return Err("fields and positions are numbered from 1".to_string());
73 }
74 ranges.push(Range { start: n, end: n });
75 }
76 }
77
78 if ranges.is_empty() {
79 return Err("you must specify a list of bytes, characters, or fields".to_string());
80 }
81
82 ranges.sort_by_key(|r| (r.start, r.end));
84 let mut merged = vec![ranges[0].clone()];
85 for r in &ranges[1..] {
86 let last = merged.last_mut().unwrap();
87 if r.start <= last.end.saturating_add(1) {
88 last.end = last.end.max(r.end);
89 } else {
90 merged.push(r.clone());
91 }
92 }
93
94 Ok(merged)
95}
96
97#[inline(always)]
100fn in_ranges(ranges: &[Range], pos: usize) -> bool {
101 for r in ranges {
102 if pos < r.start {
103 return false;
104 }
105 if pos <= r.end {
106 return true;
107 }
108 }
109 false
110}
111
112#[inline]
115fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
116 let mut mask: u64 = 0;
117 for i in 1..=64u32 {
118 let in_range = in_ranges(ranges, i as usize);
119 if in_range != complement {
120 mask |= 1u64 << (i - 1);
121 }
122 }
123 mask
124}
125
126#[inline(always)]
128fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
129 if field_num <= 64 {
130 (mask >> (field_num - 1)) & 1 == 1
131 } else {
132 in_ranges(ranges, field_num) != complement
133 }
134}
135
/// Appends `data` to `buf` without growing the allocation.
///
/// # Safety
///
/// The caller must guarantee that `buf` already has at least `data.len()`
/// bytes of spare capacity (`buf.capacity() - buf.len() >= data.len()`), and
/// that `data` does not alias `buf`'s own allocation. Violating this writes
/// past the end of the allocation.
#[inline(always)]
unsafe fn buf_extend(buf: &mut Vec<u8>, data: &[u8]) {
    let len = buf.len();
    // Catch under-reserved buffers in debug builds instead of corrupting the heap.
    debug_assert!(
        buf.capacity() - len >= data.len(),
        "buf_extend: insufficient spare capacity"
    );
    // SAFETY: the caller guarantees `data.len()` bytes of spare capacity, so the
    // destination range lies inside the allocation, and the copied bytes are
    // initialized before `set_len` exposes them.
    unsafe {
        std::ptr::copy_nonoverlapping(data.as_ptr(), buf.as_mut_ptr().add(len), data.len());
        buf.set_len(len + data.len());
    }
}
148
/// Appends the single byte `b` to `buf` without growing the allocation.
///
/// # Safety
///
/// The caller must guarantee that `buf` has at least one byte of spare
/// capacity (`buf.len() < buf.capacity()`). Violating this writes past the end
/// of the allocation.
#[inline(always)]
unsafe fn buf_push(buf: &mut Vec<u8>, b: u8) {
    let len = buf.len();
    // Catch under-reserved buffers in debug builds instead of corrupting the heap.
    debug_assert!(len < buf.capacity(), "buf_push: insufficient spare capacity");
    // SAFETY: the caller guarantees one byte of spare capacity, so `len` is a
    // valid offset inside the allocation, and the byte is written before
    // `set_len` exposes it.
    unsafe {
        *buf.as_mut_ptr().add(len) = b;
        buf.set_len(len + 1);
    }
}
159
160#[inline]
163fn write_ioslices(out: &mut impl Write, slices: &[IoSlice]) -> io::Result<()> {
164 if slices.is_empty() {
165 return Ok(());
166 }
167 for batch in slices.chunks(MAX_IOV) {
168 let total: usize = batch.iter().map(|s| s.len()).sum();
169 match out.write_vectored(batch) {
170 Ok(n) if n >= total => continue,
171 Ok(mut written) => {
172 for slice in batch {
174 let slen = slice.len();
175 if written >= slen {
176 written -= slen;
177 continue;
178 }
179 if written > 0 {
180 out.write_all(&slice[written..])?;
181 written = 0;
182 } else {
183 out.write_all(slice)?;
184 }
185 }
186 }
187 Err(e) => return Err(e),
188 }
189 }
190 Ok(())
191}
192
193fn split_into_chunks<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
197 let num_threads = rayon::current_num_threads().max(1);
198 if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
199 return vec![data];
200 }
201
202 let chunk_size = data.len() / num_threads;
203 let mut chunks = Vec::with_capacity(num_threads);
204 let mut pos = 0;
205
206 for _ in 0..num_threads - 1 {
207 let target = pos + chunk_size;
208 if target >= data.len() {
209 break;
210 }
211 let boundary = memchr::memchr(line_delim, &data[target..])
212 .map(|p| target + p + 1)
213 .unwrap_or(data.len());
214 if boundary > pos {
215 chunks.push(&data[pos..boundary]);
216 }
217 pos = boundary;
218 }
219
220 if pos < data.len() {
221 chunks.push(&data[pos..]);
222 }
223
224 chunks
225}
226
227fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
231 let delim = cfg.delim;
232 let line_delim = cfg.line_delim;
233 let ranges = cfg.ranges;
234 let complement = cfg.complement;
235 let output_delim = cfg.output_delim;
236 let suppress = cfg.suppress_no_delim;
237
238 if !complement && memchr::memchr(delim, data).is_none() {
240 if suppress {
241 return Ok(());
242 }
243 out.write_all(data)?;
244 if !data.is_empty() && *data.last().unwrap() != line_delim {
245 out.write_all(&[line_delim])?;
246 }
247 return Ok(());
248 }
249
250 if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
252 return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
253 }
254
255 if complement
257 && ranges.len() == 1
258 && ranges[0].start == ranges[0].end
259 && output_delim.len() == 1
260 && output_delim[0] == delim
261 {
262 return process_complement_single_field(
263 data,
264 delim,
265 line_delim,
266 ranges[0].start,
267 suppress,
268 out,
269 );
270 }
271
272 if !complement
274 && ranges.len() == 1
275 && ranges[0].start == 1
276 && output_delim.len() == 1
277 && output_delim[0] == delim
278 && ranges[0].end < usize::MAX
279 {
280 return process_fields_prefix(data, delim, line_delim, ranges[0].end, suppress, out);
281 }
282
283 if !complement
285 && ranges.len() == 1
286 && ranges[0].end == usize::MAX
287 && ranges[0].start > 1
288 && output_delim.len() == 1
289 && output_delim[0] == delim
290 {
291 return process_fields_suffix(data, delim, line_delim, ranges[0].start, suppress, out);
292 }
293
294 let max_field = if complement {
296 usize::MAX
297 } else {
298 ranges.last().map(|r| r.end).unwrap_or(0)
299 };
300 let field_mask = compute_field_mask(ranges, complement);
301
302 if data.len() >= PARALLEL_THRESHOLD {
303 let chunks = split_into_chunks(data, line_delim);
304 let results: Vec<Vec<u8>> = chunks
305 .par_iter()
306 .map(|chunk| {
307 let mut buf = Vec::with_capacity(chunk.len());
308 process_fields_chunk(
309 chunk,
310 delim,
311 ranges,
312 output_delim,
313 suppress,
314 max_field,
315 field_mask,
316 line_delim,
317 complement,
318 &mut buf,
319 );
320 buf
321 })
322 .collect();
323 let slices: Vec<IoSlice> = results
325 .iter()
326 .filter(|r| !r.is_empty())
327 .map(|r| IoSlice::new(r))
328 .collect();
329 write_ioslices(out, &slices)?;
330 } else {
331 let mut buf = Vec::with_capacity(data.len());
332 process_fields_chunk(
333 data,
334 delim,
335 ranges,
336 output_delim,
337 suppress,
338 max_field,
339 field_mask,
340 line_delim,
341 complement,
342 &mut buf,
343 );
344 if !buf.is_empty() {
345 out.write_all(&buf)?;
346 }
347 }
348 Ok(())
349}
350
351fn process_fields_chunk(
356 data: &[u8],
357 delim: u8,
358 ranges: &[Range],
359 output_delim: &[u8],
360 suppress: bool,
361 max_field: usize,
362 field_mask: u64,
363 line_delim: u8,
364 complement: bool,
365 buf: &mut Vec<u8>,
366) {
367 if delim != line_delim {
371 buf.reserve(data.len());
372
373 let mut line_start: usize = 0;
374 let mut field_start: usize = 0;
375 let mut field_num: usize = 1;
376 let mut first_output = true;
377 let mut has_delim = false;
378
379 for pos in memchr::memchr2_iter(delim, line_delim, data) {
380 let byte = unsafe { *data.get_unchecked(pos) };
381
382 if byte == line_delim {
383 if (field_num <= max_field || complement)
385 && has_delim
386 && is_selected(field_num, field_mask, ranges, complement)
387 {
388 if !first_output {
389 unsafe { buf_extend(buf, output_delim) };
390 }
391 unsafe { buf_extend(buf, &data[field_start..pos]) };
392 first_output = false;
393 }
394
395 if !first_output {
396 unsafe { buf_push(buf, line_delim) };
397 } else if !has_delim {
398 if !suppress {
399 unsafe {
400 buf_extend(buf, &data[line_start..pos]);
401 buf_push(buf, line_delim);
402 }
403 }
404 } else {
405 unsafe { buf_push(buf, line_delim) };
406 }
407
408 line_start = pos + 1;
410 field_start = pos + 1;
411 field_num = 1;
412 first_output = true;
413 has_delim = false;
414 } else {
415 has_delim = true;
417
418 if is_selected(field_num, field_mask, ranges, complement) {
419 if !first_output {
420 unsafe { buf_extend(buf, output_delim) };
421 }
422 unsafe { buf_extend(buf, &data[field_start..pos]) };
423 first_output = false;
424 }
425
426 field_num += 1;
427 field_start = pos + 1;
428
429 if field_num > max_field && !complement {
430 }
433 }
434 }
435
436 if line_start < data.len() {
438 let line = &data[line_start..];
439 if !line.is_empty() {
440 if (field_num <= max_field || complement)
441 && has_delim
442 && is_selected(field_num, field_mask, ranges, complement)
443 {
444 if !first_output {
445 unsafe { buf_extend(buf, output_delim) };
446 }
447 unsafe { buf_extend(buf, &data[field_start..data.len()]) };
448 first_output = false;
449 }
450
451 if !first_output {
452 unsafe { buf_push(buf, line_delim) };
453 } else if !has_delim {
454 if !suppress {
455 unsafe {
456 buf_extend(buf, &data[line_start..data.len()]);
457 buf_push(buf, line_delim);
458 }
459 }
460 } else {
461 unsafe { buf_push(buf, line_delim) };
462 }
463 }
464 }
465
466 return;
467 }
468
469 let mut start = 0;
471 for end_pos in memchr_iter(line_delim, data) {
472 let line = &data[start..end_pos];
473 extract_fields_to_buf(
474 line,
475 delim,
476 ranges,
477 output_delim,
478 suppress,
479 max_field,
480 field_mask,
481 line_delim,
482 buf,
483 complement,
484 );
485 start = end_pos + 1;
486 }
487 if start < data.len() {
488 extract_fields_to_buf(
489 &data[start..],
490 delim,
491 ranges,
492 output_delim,
493 suppress,
494 max_field,
495 field_mask,
496 line_delim,
497 buf,
498 complement,
499 );
500 }
501}
502
503fn process_single_field(
509 data: &[u8],
510 delim: u8,
511 line_delim: u8,
512 target: usize,
513 suppress: bool,
514 out: &mut impl Write,
515) -> io::Result<()> {
516 let target_idx = target - 1;
517
518 if delim != line_delim {
520 if data.len() >= PARALLEL_THRESHOLD {
521 let chunks = split_into_chunks(data, line_delim);
522 let results: Vec<Vec<u8>> = chunks
523 .par_iter()
524 .map(|chunk| {
525 let mut buf = Vec::with_capacity(chunk.len());
526 process_nth_field_combined(
527 chunk, delim, line_delim, target_idx, suppress, &mut buf,
528 );
529 buf
530 })
531 .collect();
532 for result in &results {
533 if !result.is_empty() {
534 out.write_all(result)?;
535 }
536 }
537 } else if target_idx == 0 && !suppress {
538 single_field1_zerocopy(data, delim, line_delim, out)?;
543 } else {
544 let mut buf = Vec::with_capacity(data.len());
545 process_nth_field_combined(data, delim, line_delim, target_idx, suppress, &mut buf);
546 if !buf.is_empty() {
547 out.write_all(&buf)?;
548 }
549 }
550 return Ok(());
551 }
552
553 if data.len() >= PARALLEL_THRESHOLD {
555 let chunks = split_into_chunks(data, line_delim);
556 let results: Vec<Vec<u8>> = chunks
557 .par_iter()
558 .map(|chunk| {
559 let mut buf = Vec::with_capacity(chunk.len() / 4);
560 process_single_field_chunk(
561 chunk, delim, target_idx, line_delim, suppress, &mut buf,
562 );
563 buf
564 })
565 .collect();
566 let slices: Vec<IoSlice> = results
568 .iter()
569 .filter(|r| !r.is_empty())
570 .map(|r| IoSlice::new(r))
571 .collect();
572 write_ioslices(out, &slices)?;
573 } else {
574 let mut buf = Vec::with_capacity(data.len() / 4);
575 process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
576 if !buf.is_empty() {
577 out.write_all(&buf)?;
578 }
579 }
580 Ok(())
581}
582
583fn process_complement_single_field(
585 data: &[u8],
586 delim: u8,
587 line_delim: u8,
588 skip_field: usize,
589 suppress: bool,
590 out: &mut impl Write,
591) -> io::Result<()> {
592 let skip_idx = skip_field - 1;
593
594 if data.len() >= PARALLEL_THRESHOLD {
595 let chunks = split_into_chunks(data, line_delim);
596 let results: Vec<Vec<u8>> = chunks
597 .par_iter()
598 .map(|chunk| {
599 let mut buf = Vec::with_capacity(chunk.len());
600 complement_single_field_chunk(
601 chunk, delim, skip_idx, line_delim, suppress, &mut buf,
602 );
603 buf
604 })
605 .collect();
606 let slices: Vec<IoSlice> = results
608 .iter()
609 .filter(|r| !r.is_empty())
610 .map(|r| IoSlice::new(r))
611 .collect();
612 write_ioslices(out, &slices)?;
613 } else {
614 let mut buf = Vec::with_capacity(data.len());
615 complement_single_field_chunk(data, delim, skip_idx, line_delim, suppress, &mut buf);
616 if !buf.is_empty() {
617 out.write_all(&buf)?;
618 }
619 }
620 Ok(())
621}
622
623fn complement_single_field_chunk(
625 data: &[u8],
626 delim: u8,
627 skip_idx: usize,
628 line_delim: u8,
629 suppress: bool,
630 buf: &mut Vec<u8>,
631) {
632 let mut start = 0;
633 for end_pos in memchr_iter(line_delim, data) {
634 let line = &data[start..end_pos];
635 complement_single_field_line(line, delim, skip_idx, line_delim, suppress, buf);
636 start = end_pos + 1;
637 }
638 if start < data.len() {
639 complement_single_field_line(&data[start..], delim, skip_idx, line_delim, suppress, buf);
640 }
641}
642
643#[inline(always)]
645fn complement_single_field_line(
646 line: &[u8],
647 delim: u8,
648 skip_idx: usize,
649 line_delim: u8,
650 suppress: bool,
651 buf: &mut Vec<u8>,
652) {
653 if line.is_empty() {
654 if !suppress {
655 buf.push(line_delim);
656 }
657 return;
658 }
659
660 buf.reserve(line.len() + 1);
661
662 let mut field_idx = 0;
663 let mut field_start = 0;
664 let mut first_output = true;
665 let mut has_delim = false;
666
667 for pos in memchr_iter(delim, line) {
668 has_delim = true;
669 if field_idx != skip_idx {
670 if !first_output {
671 unsafe { buf_push(buf, delim) };
672 }
673 unsafe { buf_extend(buf, &line[field_start..pos]) };
674 first_output = false;
675 }
676 field_idx += 1;
677 field_start = pos + 1;
678 }
679
680 if !has_delim {
681 if !suppress {
682 unsafe {
683 buf_extend(buf, line);
684 buf_push(buf, line_delim);
685 }
686 }
687 return;
688 }
689
690 if field_idx != skip_idx {
692 if !first_output {
693 unsafe { buf_push(buf, delim) };
694 }
695 unsafe { buf_extend(buf, &line[field_start..]) };
696 }
697
698 unsafe { buf_push(buf, line_delim) };
699}
700
701fn process_fields_prefix(
705 data: &[u8],
706 delim: u8,
707 line_delim: u8,
708 last_field: usize,
709 suppress: bool,
710 out: &mut impl Write,
711) -> io::Result<()> {
712 if data.len() >= PARALLEL_THRESHOLD {
713 let chunks = split_into_chunks(data, line_delim);
714 let results: Vec<Vec<u8>> = chunks
715 .par_iter()
716 .map(|chunk| {
717 let mut buf = Vec::with_capacity(chunk.len());
718 fields_prefix_chunk(chunk, delim, line_delim, last_field, suppress, &mut buf);
719 buf
720 })
721 .collect();
722 let slices: Vec<IoSlice> = results
724 .iter()
725 .filter(|r| !r.is_empty())
726 .map(|r| IoSlice::new(r))
727 .collect();
728 write_ioslices(out, &slices)?;
729 } else if !suppress {
730 fields_prefix_zerocopy(data, delim, line_delim, last_field, out)?;
734 } else {
735 let mut buf = Vec::with_capacity(data.len());
736 fields_prefix_chunk(data, delim, line_delim, last_field, suppress, &mut buf);
737 if !buf.is_empty() {
738 out.write_all(&buf)?;
739 }
740 }
741 Ok(())
742}
743
744#[inline]
749fn fields_prefix_zerocopy(
750 data: &[u8],
751 delim: u8,
752 line_delim: u8,
753 last_field: usize,
754 out: &mut impl Write,
755) -> io::Result<()> {
756 let mut start = 0;
757 let mut run_start: usize = 0;
758
759 for end_pos in memchr_iter(line_delim, data) {
760 let line = &data[start..end_pos];
761 let mut field_count = 1;
763 let mut truncate_at: Option<usize> = None;
764 for dpos in memchr_iter(delim, line) {
765 if field_count >= last_field {
766 truncate_at = Some(start + dpos);
767 break;
768 }
769 field_count += 1;
770 }
771
772 if let Some(trunc_pos) = truncate_at {
773 if run_start < start {
775 out.write_all(&data[run_start..start])?;
776 }
777 out.write_all(&data[start..trunc_pos])?;
778 out.write_all(&[line_delim])?;
779 run_start = end_pos + 1;
780 }
781 start = end_pos + 1;
783 }
784 if start < data.len() {
786 let line = &data[start..];
787 let mut field_count = 1;
788 let mut truncate_at: Option<usize> = None;
789 for dpos in memchr_iter(delim, line) {
790 if field_count >= last_field {
791 truncate_at = Some(start + dpos);
792 break;
793 }
794 field_count += 1;
795 }
796 if let Some(trunc_pos) = truncate_at {
797 if run_start < start {
798 out.write_all(&data[run_start..start])?;
799 }
800 out.write_all(&data[start..trunc_pos])?;
801 out.write_all(&[line_delim])?;
802 return Ok(());
803 }
804 }
805 if run_start < data.len() {
807 out.write_all(&data[run_start..])?;
808 if !data.is_empty() && *data.last().unwrap() != line_delim {
809 out.write_all(&[line_delim])?;
810 }
811 }
812 Ok(())
813}
814
815fn fields_prefix_chunk(
817 data: &[u8],
818 delim: u8,
819 line_delim: u8,
820 last_field: usize,
821 suppress: bool,
822 buf: &mut Vec<u8>,
823) {
824 let mut start = 0;
825 for end_pos in memchr_iter(line_delim, data) {
826 let line = &data[start..end_pos];
827 fields_prefix_line(line, delim, line_delim, last_field, suppress, buf);
828 start = end_pos + 1;
829 }
830 if start < data.len() {
831 fields_prefix_line(&data[start..], delim, line_delim, last_field, suppress, buf);
832 }
833}
834
835#[inline(always)]
837fn fields_prefix_line(
838 line: &[u8],
839 delim: u8,
840 line_delim: u8,
841 last_field: usize,
842 suppress: bool,
843 buf: &mut Vec<u8>,
844) {
845 if line.is_empty() {
846 if !suppress {
847 buf.push(line_delim);
848 }
849 return;
850 }
851
852 buf.reserve(line.len() + 1);
853
854 let mut field_count = 1;
855 let mut has_delim = false;
856
857 for pos in memchr_iter(delim, line) {
858 has_delim = true;
859 if field_count >= last_field {
860 unsafe {
861 buf_extend(buf, &line[..pos]);
862 buf_push(buf, line_delim);
863 }
864 return;
865 }
866 field_count += 1;
867 }
868
869 if !has_delim {
870 if !suppress {
871 unsafe {
872 buf_extend(buf, line);
873 buf_push(buf, line_delim);
874 }
875 }
876 return;
877 }
878
879 unsafe {
880 buf_extend(buf, line);
881 buf_push(buf, line_delim);
882 }
883}
884
885fn process_fields_suffix(
887 data: &[u8],
888 delim: u8,
889 line_delim: u8,
890 start_field: usize,
891 suppress: bool,
892 out: &mut impl Write,
893) -> io::Result<()> {
894 if data.len() >= PARALLEL_THRESHOLD {
895 let chunks = split_into_chunks(data, line_delim);
896 let results: Vec<Vec<u8>> = chunks
897 .par_iter()
898 .map(|chunk| {
899 let mut buf = Vec::with_capacity(chunk.len());
900 fields_suffix_chunk(chunk, delim, line_delim, start_field, suppress, &mut buf);
901 buf
902 })
903 .collect();
904 let slices: Vec<IoSlice> = results
906 .iter()
907 .filter(|r| !r.is_empty())
908 .map(|r| IoSlice::new(r))
909 .collect();
910 write_ioslices(out, &slices)?;
911 } else {
912 let mut buf = Vec::with_capacity(data.len());
913 fields_suffix_chunk(data, delim, line_delim, start_field, suppress, &mut buf);
914 if !buf.is_empty() {
915 out.write_all(&buf)?;
916 }
917 }
918 Ok(())
919}
920
921fn fields_suffix_chunk(
923 data: &[u8],
924 delim: u8,
925 line_delim: u8,
926 start_field: usize,
927 suppress: bool,
928 buf: &mut Vec<u8>,
929) {
930 let mut start = 0;
931 for end_pos in memchr_iter(line_delim, data) {
932 let line = &data[start..end_pos];
933 fields_suffix_line(line, delim, line_delim, start_field, suppress, buf);
934 start = end_pos + 1;
935 }
936 if start < data.len() {
937 fields_suffix_line(
938 &data[start..],
939 delim,
940 line_delim,
941 start_field,
942 suppress,
943 buf,
944 );
945 }
946}
947
948#[inline(always)]
950fn fields_suffix_line(
951 line: &[u8],
952 delim: u8,
953 line_delim: u8,
954 start_field: usize,
955 suppress: bool,
956 buf: &mut Vec<u8>,
957) {
958 if line.is_empty() {
959 if !suppress {
960 buf.push(line_delim);
961 }
962 return;
963 }
964
965 buf.reserve(line.len() + 1);
966
967 let skip_delims = start_field - 1;
968 let mut delim_count = 0;
969 let mut has_delim = false;
970
971 for pos in memchr_iter(delim, line) {
972 has_delim = true;
973 delim_count += 1;
974 if delim_count >= skip_delims {
975 unsafe {
976 buf_extend(buf, &line[pos + 1..]);
977 buf_push(buf, line_delim);
978 }
979 return;
980 }
981 }
982
983 if !has_delim {
984 if !suppress {
985 unsafe {
986 buf_extend(buf, line);
987 buf_push(buf, line_delim);
988 }
989 }
990 return;
991 }
992
993 unsafe { buf_push(buf, line_delim) };
995}
996
997fn process_nth_field_combined(
1002 data: &[u8],
1003 delim: u8,
1004 line_delim: u8,
1005 target_idx: usize,
1006 suppress: bool,
1007 buf: &mut Vec<u8>,
1008) {
1009 buf.reserve(data.len());
1010
1011 let mut line_start: usize = 0;
1012 let mut field_start: usize = 0;
1013 let mut field_idx: usize = 0;
1014 let mut has_delim = false;
1015 let mut emitted = false;
1016
1017 for pos in memchr::memchr2_iter(delim, line_delim, data) {
1018 let byte = unsafe { *data.get_unchecked(pos) };
1019
1020 if byte == line_delim {
1021 if !emitted {
1023 if has_delim && field_idx == target_idx {
1024 unsafe {
1026 buf_extend(buf, &data[field_start..pos]);
1027 buf_push(buf, line_delim);
1028 }
1029 } else if has_delim {
1030 unsafe {
1032 buf_push(buf, line_delim);
1033 }
1034 } else if !suppress {
1035 unsafe {
1037 buf_extend(buf, &data[line_start..pos]);
1038 buf_push(buf, line_delim);
1039 }
1040 }
1041 }
1042 line_start = pos + 1;
1044 field_start = pos + 1;
1045 field_idx = 0;
1046 has_delim = false;
1047 emitted = false;
1048 } else {
1049 has_delim = true;
1051 if field_idx == target_idx {
1052 unsafe {
1053 buf_extend(buf, &data[field_start..pos]);
1054 buf_push(buf, line_delim);
1055 }
1056 emitted = true;
1057 }
1058 field_idx += 1;
1059 field_start = pos + 1;
1060 }
1061 }
1062
1063 if line_start < data.len() && !emitted {
1065 if has_delim && field_idx == target_idx {
1066 unsafe {
1067 buf_extend(buf, &data[field_start..data.len()]);
1068 buf_push(buf, line_delim);
1069 }
1070 } else if has_delim {
1071 unsafe {
1072 buf_push(buf, line_delim);
1073 }
1074 } else if !suppress {
1075 unsafe {
1076 buf_extend(buf, &data[line_start..data.len()]);
1077 buf_push(buf, line_delim);
1078 }
1079 }
1080 }
1081}
1082
1083#[inline]
1087fn single_field1_zerocopy(
1088 data: &[u8],
1089 delim: u8,
1090 line_delim: u8,
1091 out: &mut impl Write,
1092) -> io::Result<()> {
1093 let mut line_start: usize = 0;
1094 let mut run_start: usize = 0;
1095 let mut first_delim: Option<usize> = None;
1096
1097 for pos in memchr::memchr2_iter(delim, line_delim, data) {
1098 let byte = unsafe { *data.get_unchecked(pos) };
1099
1100 if byte == line_delim {
1101 if let Some(dp) = first_delim {
1103 if run_start < line_start {
1106 out.write_all(&data[run_start..line_start])?;
1107 }
1108 out.write_all(&data[line_start..dp])?;
1109 out.write_all(&[line_delim])?;
1110 run_start = pos + 1;
1111 }
1112 line_start = pos + 1;
1114 first_delim = None;
1115 } else {
1116 if first_delim.is_none() {
1118 first_delim = Some(pos);
1119 }
1120 }
1121 }
1122
1123 if line_start < data.len() {
1125 if let Some(dp) = first_delim {
1126 if run_start < line_start {
1127 out.write_all(&data[run_start..line_start])?;
1128 }
1129 out.write_all(&data[line_start..dp])?;
1130 out.write_all(&[line_delim])?;
1131 return Ok(());
1132 }
1133 }
1134
1135 if run_start < data.len() {
1137 out.write_all(&data[run_start..])?;
1138 if !data.is_empty() && *data.last().unwrap() != line_delim {
1139 out.write_all(&[line_delim])?;
1140 }
1141 }
1142 Ok(())
1143}
1144
1145fn process_single_field_chunk(
1147 data: &[u8],
1148 delim: u8,
1149 target_idx: usize,
1150 line_delim: u8,
1151 suppress: bool,
1152 buf: &mut Vec<u8>,
1153) {
1154 let mut start = 0;
1155 for end_pos in memchr_iter(line_delim, data) {
1156 let line = &data[start..end_pos];
1157 extract_single_field_line(line, delim, target_idx, line_delim, suppress, buf);
1158 start = end_pos + 1;
1159 }
1160 if start < data.len() {
1161 extract_single_field_line(&data[start..], delim, target_idx, line_delim, suppress, buf);
1162 }
1163}
1164
1165#[inline(always)]
1168fn extract_single_field_line(
1169 line: &[u8],
1170 delim: u8,
1171 target_idx: usize,
1172 line_delim: u8,
1173 suppress: bool,
1174 buf: &mut Vec<u8>,
1175) {
1176 if line.is_empty() {
1177 if !suppress {
1178 buf.push(line_delim);
1179 }
1180 return;
1181 }
1182
1183 buf.reserve(line.len() + 1);
1185
1186 if target_idx == 0 {
1188 match memchr::memchr(delim, line) {
1189 Some(pos) => unsafe {
1190 buf_extend(buf, &line[..pos]);
1191 buf_push(buf, line_delim);
1192 },
1193 None => {
1194 if !suppress {
1195 unsafe {
1196 buf_extend(buf, line);
1197 buf_push(buf, line_delim);
1198 }
1199 }
1200 }
1201 }
1202 return;
1203 }
1204
1205 let mut field_start = 0;
1206 let mut field_idx = 0;
1207 let mut has_delim = false;
1208
1209 for pos in memchr_iter(delim, line) {
1210 has_delim = true;
1211 if field_idx == target_idx {
1212 unsafe {
1213 buf_extend(buf, &line[field_start..pos]);
1214 buf_push(buf, line_delim);
1215 }
1216 return;
1217 }
1218 field_idx += 1;
1219 field_start = pos + 1;
1220 }
1221
1222 if !has_delim {
1223 if !suppress {
1224 unsafe {
1225 buf_extend(buf, line);
1226 buf_push(buf, line_delim);
1227 }
1228 }
1229 return;
1230 }
1231
1232 if field_idx == target_idx {
1233 unsafe {
1234 buf_extend(buf, &line[field_start..]);
1235 buf_push(buf, line_delim);
1236 }
1237 } else {
1238 unsafe { buf_push(buf, line_delim) };
1239 }
1240}
1241
1242#[inline(always)]
1245fn extract_fields_to_buf(
1246 line: &[u8],
1247 delim: u8,
1248 ranges: &[Range],
1249 output_delim: &[u8],
1250 suppress: bool,
1251 max_field: usize,
1252 field_mask: u64,
1253 line_delim: u8,
1254 buf: &mut Vec<u8>,
1255 complement: bool,
1256) {
1257 let len = line.len();
1258
1259 if len == 0 {
1260 if !suppress {
1261 buf.push(line_delim);
1262 }
1263 return;
1264 }
1265
1266 let needed = len + output_delim.len() * 16 + 1;
1269 if buf.capacity() - buf.len() < needed {
1270 buf.reserve(needed);
1271 }
1272
1273 let mut field_num: usize = 1;
1274 let mut field_start: usize = 0;
1275 let mut first_output = true;
1276 let mut has_delim = false;
1277
1278 for delim_pos in memchr_iter(delim, line) {
1279 has_delim = true;
1280
1281 if is_selected(field_num, field_mask, ranges, complement) {
1282 if !first_output {
1283 unsafe { buf_extend(buf, output_delim) };
1284 }
1285 unsafe { buf_extend(buf, &line[field_start..delim_pos]) };
1286 first_output = false;
1287 }
1288
1289 field_num += 1;
1290 field_start = delim_pos + 1;
1291
1292 if field_num > max_field {
1293 break;
1294 }
1295 }
1296
1297 if (field_num <= max_field || complement)
1299 && has_delim
1300 && is_selected(field_num, field_mask, ranges, complement)
1301 {
1302 if !first_output {
1303 unsafe { buf_extend(buf, output_delim) };
1304 }
1305 unsafe { buf_extend(buf, &line[field_start..len]) };
1306 first_output = false;
1307 }
1308
1309 if !first_output {
1310 unsafe { buf_push(buf, line_delim) };
1311 } else if !has_delim {
1312 if !suppress {
1313 unsafe {
1314 buf_extend(buf, line);
1315 buf_push(buf, line_delim);
1316 }
1317 }
1318 } else {
1319 unsafe { buf_push(buf, line_delim) };
1320 }
1321}
1322
1323fn process_bytes_from_start(
1330 data: &[u8],
1331 max_bytes: usize,
1332 line_delim: u8,
1333 out: &mut impl Write,
1334) -> io::Result<()> {
1335 if data.len() >= PARALLEL_THRESHOLD {
1336 let chunks = split_into_chunks(data, line_delim);
1337 let results: Vec<Vec<u8>> = chunks
1338 .par_iter()
1339 .map(|chunk| {
1340 let mut buf = Vec::with_capacity(chunk.len());
1341 bytes_from_start_chunk(chunk, max_bytes, line_delim, &mut buf);
1342 buf
1343 })
1344 .collect();
1345 let slices: Vec<IoSlice> = results
1347 .iter()
1348 .filter(|r| !r.is_empty())
1349 .map(|r| IoSlice::new(r))
1350 .collect();
1351 write_ioslices(out, &slices)?;
1352 } else {
1353 bytes_from_start_zerocopy(data, max_bytes, line_delim, out)?;
1357 }
1358 Ok(())
1359}
1360
1361#[inline]
1364fn bytes_from_start_zerocopy(
1365 data: &[u8],
1366 max_bytes: usize,
1367 line_delim: u8,
1368 out: &mut impl Write,
1369) -> io::Result<()> {
1370 let mut start = 0;
1371 let mut run_start: usize = 0;
1372
1373 for pos in memchr_iter(line_delim, data) {
1374 let line_len = pos - start;
1375 if line_len > max_bytes {
1376 if run_start < start {
1378 out.write_all(&data[run_start..start])?;
1379 }
1380 out.write_all(&data[start..start + max_bytes])?;
1381 out.write_all(&[line_delim])?;
1382 run_start = pos + 1;
1383 }
1384 start = pos + 1;
1386 }
1387 if start < data.len() {
1389 let line_len = data.len() - start;
1390 if line_len > max_bytes {
1391 if run_start < start {
1392 out.write_all(&data[run_start..start])?;
1393 }
1394 out.write_all(&data[start..start + max_bytes])?;
1395 out.write_all(&[line_delim])?;
1396 return Ok(());
1397 }
1398 }
1399 if run_start < data.len() {
1401 out.write_all(&data[run_start..])?;
1402 if !data.is_empty() && *data.last().unwrap() != line_delim {
1404 out.write_all(&[line_delim])?;
1405 }
1406 }
1407 Ok(())
1408}
1409
1410#[inline]
1413fn bytes_from_start_chunk(data: &[u8], max_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
1414 buf.reserve(data.len());
1416
1417 let mut start = 0;
1418 for pos in memchr_iter(line_delim, data) {
1419 let line_len = pos - start;
1420 let take = line_len.min(max_bytes);
1421 unsafe {
1422 buf_extend(buf, &data[start..start + take]);
1423 buf_push(buf, line_delim);
1424 }
1425 start = pos + 1;
1426 }
1427 if start < data.len() {
1429 let line_len = data.len() - start;
1430 let take = line_len.min(max_bytes);
1431 unsafe {
1432 buf_extend(buf, &data[start..start + take]);
1433 buf_push(buf, line_delim);
1434 }
1435 }
1436}
1437
1438fn process_bytes_from_offset(
1440 data: &[u8],
1441 skip_bytes: usize,
1442 line_delim: u8,
1443 out: &mut impl Write,
1444) -> io::Result<()> {
1445 if data.len() >= PARALLEL_THRESHOLD {
1446 let chunks = split_into_chunks(data, line_delim);
1447 let results: Vec<Vec<u8>> = chunks
1448 .par_iter()
1449 .map(|chunk| {
1450 let mut buf = Vec::with_capacity(chunk.len());
1451 bytes_from_offset_chunk(chunk, skip_bytes, line_delim, &mut buf);
1452 buf
1453 })
1454 .collect();
1455 let slices: Vec<IoSlice> = results
1457 .iter()
1458 .filter(|r| !r.is_empty())
1459 .map(|r| IoSlice::new(r))
1460 .collect();
1461 write_ioslices(out, &slices)?;
1462 } else {
1463 bytes_from_offset_zerocopy(data, skip_bytes, line_delim, out)?;
1465 }
1466 Ok(())
1467}
1468
1469#[inline]
1473fn bytes_from_offset_zerocopy(
1474 data: &[u8],
1475 skip_bytes: usize,
1476 line_delim: u8,
1477 out: &mut impl Write,
1478) -> io::Result<()> {
1479 let delim_buf = [line_delim];
1480 let mut iov: Vec<IoSlice> = Vec::with_capacity(256);
1481
1482 let mut start = 0;
1483 for pos in memchr_iter(line_delim, data) {
1484 let line_len = pos - start;
1485 if line_len > skip_bytes {
1486 iov.push(IoSlice::new(&data[start + skip_bytes..pos]));
1487 }
1488 iov.push(IoSlice::new(&delim_buf));
1489 if iov.len() >= MAX_IOV - 1 {
1491 write_ioslices(out, &iov)?;
1492 iov.clear();
1493 }
1494 start = pos + 1;
1495 }
1496 if start < data.len() {
1497 let line_len = data.len() - start;
1498 if line_len > skip_bytes {
1499 iov.push(IoSlice::new(&data[start + skip_bytes..data.len()]));
1500 }
1501 iov.push(IoSlice::new(&delim_buf));
1502 }
1503 if !iov.is_empty() {
1504 write_ioslices(out, &iov)?;
1505 }
1506 Ok(())
1507}
1508
1509#[inline]
1512fn bytes_from_offset_chunk(data: &[u8], skip_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
1513 buf.reserve(data.len());
1514
1515 let mut start = 0;
1516 for pos in memchr_iter(line_delim, data) {
1517 let line_len = pos - start;
1518 if line_len > skip_bytes {
1519 unsafe {
1520 buf_extend(buf, &data[start + skip_bytes..pos]);
1521 }
1522 }
1523 unsafe {
1524 buf_push(buf, line_delim);
1525 }
1526 start = pos + 1;
1527 }
1528 if start < data.len() {
1529 let line_len = data.len() - start;
1530 if line_len > skip_bytes {
1531 unsafe {
1532 buf_extend(buf, &data[start + skip_bytes..data.len()]);
1533 }
1534 }
1535 unsafe {
1536 buf_push(buf, line_delim);
1537 }
1538 }
1539}
1540
1541fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
1543 let line_delim = cfg.line_delim;
1544 let ranges = cfg.ranges;
1545 let complement = cfg.complement;
1546 let output_delim = cfg.output_delim;
1547
1548 if !complement && ranges.len() == 1 && ranges[0].start == 1 && output_delim.is_empty() {
1550 let max_bytes = ranges[0].end;
1551 if max_bytes < usize::MAX {
1552 return process_bytes_from_start(data, max_bytes, line_delim, out);
1553 }
1554 }
1555
1556 if !complement && ranges.len() == 1 && ranges[0].end == usize::MAX && output_delim.is_empty() {
1558 let skip_bytes = ranges[0].start.saturating_sub(1);
1559 if skip_bytes > 0 {
1560 return process_bytes_from_offset(data, skip_bytes, line_delim, out);
1561 }
1562 }
1563
1564 if data.len() >= PARALLEL_THRESHOLD {
1565 let chunks = split_into_chunks(data, line_delim);
1566 let results: Vec<Vec<u8>> = chunks
1567 .par_iter()
1568 .map(|chunk| {
1569 let mut buf = Vec::with_capacity(chunk.len());
1570 process_bytes_chunk(
1571 chunk,
1572 ranges,
1573 complement,
1574 output_delim,
1575 line_delim,
1576 &mut buf,
1577 );
1578 buf
1579 })
1580 .collect();
1581 let slices: Vec<IoSlice> = results
1583 .iter()
1584 .filter(|r| !r.is_empty())
1585 .map(|r| IoSlice::new(r))
1586 .collect();
1587 write_ioslices(out, &slices)?;
1588 } else {
1589 let mut buf = Vec::with_capacity(data.len());
1590 process_bytes_chunk(data, ranges, complement, output_delim, line_delim, &mut buf);
1591 if !buf.is_empty() {
1592 out.write_all(&buf)?;
1593 }
1594 }
1595 Ok(())
1596}
1597
1598fn process_bytes_chunk(
1600 data: &[u8],
1601 ranges: &[Range],
1602 complement: bool,
1603 output_delim: &[u8],
1604 line_delim: u8,
1605 buf: &mut Vec<u8>,
1606) {
1607 let mut start = 0;
1608 for end_pos in memchr_iter(line_delim, data) {
1609 let line = &data[start..end_pos];
1610 cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
1611 buf.push(line_delim);
1612 start = end_pos + 1;
1613 }
1614 if start < data.len() {
1615 cut_bytes_to_buf(&data[start..], ranges, complement, output_delim, buf);
1616 buf.push(line_delim);
1617 }
1618}
1619
1620#[inline(always)]
1623fn cut_bytes_to_buf(
1624 line: &[u8],
1625 ranges: &[Range],
1626 complement: bool,
1627 output_delim: &[u8],
1628 buf: &mut Vec<u8>,
1629) {
1630 let len = line.len();
1631 let mut first_range = true;
1632
1633 buf.reserve(len + output_delim.len() * ranges.len() + 1);
1635
1636 if complement {
1637 let mut pos: usize = 1;
1638 for r in ranges {
1639 let rs = r.start;
1640 let re = r.end.min(len);
1641 if pos < rs {
1642 if !first_range && !output_delim.is_empty() {
1643 unsafe { buf_extend(buf, output_delim) };
1644 }
1645 unsafe { buf_extend(buf, &line[pos - 1..rs - 1]) };
1646 first_range = false;
1647 }
1648 pos = re + 1;
1649 if pos > len {
1650 break;
1651 }
1652 }
1653 if pos <= len {
1654 if !first_range && !output_delim.is_empty() {
1655 unsafe { buf_extend(buf, output_delim) };
1656 }
1657 unsafe { buf_extend(buf, &line[pos - 1..len]) };
1658 }
1659 } else if output_delim.is_empty() && ranges.len() == 1 {
1660 let start = ranges[0].start.saturating_sub(1);
1662 let end = ranges[0].end.min(len);
1663 if start < len {
1664 unsafe { buf_extend(buf, &line[start..end]) };
1665 }
1666 } else {
1667 for r in ranges {
1668 let start = r.start.saturating_sub(1);
1669 let end = r.end.min(len);
1670 if start >= len {
1671 break;
1672 }
1673 if !first_range && !output_delim.is_empty() {
1674 unsafe { buf_extend(buf, output_delim) };
1675 }
1676 unsafe { buf_extend(buf, &line[start..end]) };
1677 first_range = false;
1678 }
1679 }
1680}
1681
1682#[inline]
1686pub fn cut_fields(
1687 line: &[u8],
1688 delim: u8,
1689 ranges: &[Range],
1690 complement: bool,
1691 output_delim: &[u8],
1692 suppress_no_delim: bool,
1693 out: &mut impl Write,
1694) -> io::Result<bool> {
1695 if memchr::memchr(delim, line).is_none() {
1696 if !suppress_no_delim {
1697 out.write_all(line)?;
1698 return Ok(true);
1699 }
1700 return Ok(false);
1701 }
1702
1703 let mut field_num: usize = 1;
1704 let mut field_start: usize = 0;
1705 let mut first_output = true;
1706
1707 for delim_pos in memchr_iter(delim, line) {
1708 let selected = in_ranges(ranges, field_num) != complement;
1709 if selected {
1710 if !first_output {
1711 out.write_all(output_delim)?;
1712 }
1713 out.write_all(&line[field_start..delim_pos])?;
1714 first_output = false;
1715 }
1716 field_start = delim_pos + 1;
1717 field_num += 1;
1718 }
1719
1720 let selected = in_ranges(ranges, field_num) != complement;
1721 if selected {
1722 if !first_output {
1723 out.write_all(output_delim)?;
1724 }
1725 out.write_all(&line[field_start..])?;
1726 }
1727
1728 Ok(true)
1729}
1730
1731#[inline]
1733pub fn cut_bytes(
1734 line: &[u8],
1735 ranges: &[Range],
1736 complement: bool,
1737 output_delim: &[u8],
1738 out: &mut impl Write,
1739) -> io::Result<bool> {
1740 let mut first_range = true;
1741
1742 if complement {
1743 let len = line.len();
1744 let mut comp_ranges = Vec::new();
1745 let mut pos: usize = 1;
1746 for r in ranges {
1747 let rs = r.start;
1748 let re = r.end.min(len);
1749 if pos < rs {
1750 comp_ranges.push((pos, rs - 1));
1751 }
1752 pos = re + 1;
1753 if pos > len {
1754 break;
1755 }
1756 }
1757 if pos <= len {
1758 comp_ranges.push((pos, len));
1759 }
1760 for &(s, e) in &comp_ranges {
1761 if !first_range && !output_delim.is_empty() {
1762 out.write_all(output_delim)?;
1763 }
1764 out.write_all(&line[s - 1..e])?;
1765 first_range = false;
1766 }
1767 } else {
1768 for r in ranges {
1769 let start = r.start.saturating_sub(1);
1770 let end = r.end.min(line.len());
1771 if start >= line.len() {
1772 break;
1773 }
1774 if !first_range && !output_delim.is_empty() {
1775 out.write_all(output_delim)?;
1776 }
1777 out.write_all(&line[start..end])?;
1778 first_range = false;
1779 }
1780 }
1781 Ok(true)
1782}
1783
1784pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
1786 match cfg.mode {
1787 CutMode::Fields => process_fields_fast(data, cfg, out),
1788 CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
1789 }
1790}
1791
1792pub fn process_cut_reader<R: BufRead>(
1796 mut reader: R,
1797 cfg: &CutConfig,
1798 out: &mut impl Write,
1799) -> io::Result<()> {
1800 const CHUNK_SIZE: usize = 4 * 1024 * 1024; let line_delim = cfg.line_delim;
1802
1803 let mut buf = Vec::with_capacity(CHUNK_SIZE + 4096);
1806
1807 loop {
1808 buf.reserve(CHUNK_SIZE);
1810 let read_start = buf.len();
1811 unsafe { buf.set_len(read_start + CHUNK_SIZE) };
1812 let n = read_fully(&mut reader, &mut buf[read_start..])?;
1813 buf.truncate(read_start + n);
1814
1815 if buf.is_empty() {
1816 break;
1817 }
1818
1819 if n == 0 {
1820 process_cut_data(&buf, cfg, out)?;
1822 break;
1823 }
1824
1825 let process_end = match memchr::memrchr(line_delim, &buf) {
1827 Some(pos) => pos + 1,
1828 None => {
1829 continue;
1831 }
1832 };
1833
1834 process_cut_data(&buf[..process_end], cfg, out)?;
1836
1837 let leftover_len = buf.len() - process_end;
1839 if leftover_len > 0 {
1840 buf.copy_within(process_end.., 0);
1841 }
1842 buf.truncate(leftover_len);
1843 }
1844
1845 Ok(())
1846}
1847
/// Reads from `reader` until `buf` is full or the reader reports end of
/// input, returning the number of bytes placed in `buf`.
///
/// A return value smaller than `buf.len()` means end of input was reached;
/// `0` means nothing at all could be read (or `buf` is empty).
///
/// # Errors
/// Returns the first read error other than `ErrorKind::Interrupted`.
///
/// Fix over the previous version: an `Interrupted` error on the *first* read
/// used to be returned to the caller, while the same error on later reads was
/// retried. All reads are now retried uniformly.
#[inline]
fn read_fully<R: BufRead>(reader: &mut R, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            // End of input.
            Ok(0) => break,
            Ok(n) => total += n,
            // Interrupted reads transfer no data and are safe to retry.
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
1867
/// Selection mode of a cut operation.
///
/// `Eq` is derived alongside `PartialEq`: equality on this field-less enum is
/// total, and `Eq` lets the type be used where full equivalence is required.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CutMode {
    /// Select by byte position.
    Bytes,
    /// Select by character position; `process_cut_data` routes this through
    /// the same code path as [`CutMode::Bytes`].
    Characters,
    /// Select delimiter-separated fields.
    Fields,
}