1use memchr::memchr_iter;
2use rayon::prelude::*;
3use std::io::{self, BufRead, Write};
4
5const PARALLEL_THRESHOLD: usize = 512 * 1024;
8
9pub struct CutConfig<'a> {
11 pub mode: CutMode,
12 pub ranges: &'a [Range],
13 pub complement: bool,
14 pub delim: u8,
15 pub output_delim: &'a [u8],
16 pub suppress_no_delim: bool,
17 pub line_delim: u8,
18}
19
20#[derive(Debug, Clone)]
22pub struct Range {
23 pub start: usize, pub end: usize, }
26
27pub fn parse_ranges(spec: &str) -> Result<Vec<Range>, String> {
30 let mut ranges = Vec::new();
31
32 for part in spec.split(',') {
33 let part = part.trim();
34 if part.is_empty() {
35 continue;
36 }
37
38 if let Some(idx) = part.find('-') {
39 let left = &part[..idx];
40 let right = &part[idx + 1..];
41
42 let start = if left.is_empty() {
43 1
44 } else {
45 left.parse::<usize>()
46 .map_err(|_| format!("invalid range: '{}'", part))?
47 };
48
49 let end = if right.is_empty() {
50 usize::MAX
51 } else {
52 right
53 .parse::<usize>()
54 .map_err(|_| format!("invalid range: '{}'", part))?
55 };
56
57 if start == 0 {
58 return Err("fields and positions are numbered from 1".to_string());
59 }
60 if start > end {
61 return Err(format!("invalid decreasing range: '{}'", part));
62 }
63
64 ranges.push(Range { start, end });
65 } else {
66 let n = part
67 .parse::<usize>()
68 .map_err(|_| format!("invalid field: '{}'", part))?;
69 if n == 0 {
70 return Err("fields and positions are numbered from 1".to_string());
71 }
72 ranges.push(Range { start: n, end: n });
73 }
74 }
75
76 if ranges.is_empty() {
77 return Err("you must specify a list of bytes, characters, or fields".to_string());
78 }
79
80 ranges.sort_by_key(|r| (r.start, r.end));
82 let mut merged = vec![ranges[0].clone()];
83 for r in &ranges[1..] {
84 let last = merged.last_mut().unwrap();
85 if r.start <= last.end.saturating_add(1) {
86 last.end = last.end.max(r.end);
87 } else {
88 merged.push(r.clone());
89 }
90 }
91
92 Ok(merged)
93}
94
95#[inline(always)]
98fn in_ranges(ranges: &[Range], pos: usize) -> bool {
99 for r in ranges {
100 if pos < r.start {
101 return false; }
103 if pos <= r.end {
104 return true;
105 }
106 }
107 false
108}
109
110#[inline]
113fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
114 let mut mask: u64 = 0;
115 for i in 1..=64u32 {
116 let in_range = in_ranges(ranges, i as usize);
117 if in_range != complement {
118 mask |= 1u64 << (i - 1);
119 }
120 }
121 mask
122}
123
124#[inline(always)]
126fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
127 if field_num <= 64 {
128 (mask >> (field_num - 1)) & 1 == 1
129 } else {
130 in_ranges(ranges, field_num) != complement
131 }
132}
133
134fn split_into_chunks<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
139 let num_threads = rayon::current_num_threads().max(1);
140 if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
141 return vec![data];
142 }
143
144 let chunk_size = data.len() / num_threads;
145 let mut chunks = Vec::with_capacity(num_threads);
146 let mut pos = 0;
147
148 for _ in 0..num_threads - 1 {
149 let target = pos + chunk_size;
150 if target >= data.len() {
151 break;
152 }
153 let boundary = memchr::memchr(line_delim, &data[target..])
155 .map(|p| target + p + 1)
156 .unwrap_or(data.len());
157 if boundary > pos {
158 chunks.push(&data[pos..boundary]);
159 }
160 pos = boundary;
161 }
162
163 if pos < data.len() {
165 chunks.push(&data[pos..]);
166 }
167
168 chunks
169}
170
171fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
176 let delim = cfg.delim;
177 let line_delim = cfg.line_delim;
178 let ranges = cfg.ranges;
179 let complement = cfg.complement;
180 let output_delim = cfg.output_delim;
181 let suppress = cfg.suppress_no_delim;
182
183 if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
185 return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
186 }
187
188 let max_field = if complement {
190 usize::MAX
191 } else {
192 ranges.last().map(|r| r.end).unwrap_or(0)
193 };
194 let field_mask = compute_field_mask(ranges, complement);
195
196 if data.len() >= PARALLEL_THRESHOLD {
197 let chunks = split_into_chunks(data, line_delim);
199 let results: Vec<Vec<u8>> = chunks
200 .par_iter()
201 .map(|chunk| {
202 let mut buf = Vec::with_capacity(chunk.len() / 2);
203 process_fields_chunk(
204 chunk,
205 delim,
206 ranges,
207 output_delim,
208 suppress,
209 max_field,
210 field_mask,
211 line_delim,
212 complement,
213 &mut buf,
214 );
215 buf
216 })
217 .collect();
218 for result in &results {
219 if !result.is_empty() {
220 out.write_all(result)?;
221 }
222 }
223 } else {
224 let mut buf = Vec::with_capacity(data.len() / 2);
226 process_fields_chunk(
227 data,
228 delim,
229 ranges,
230 output_delim,
231 suppress,
232 max_field,
233 field_mask,
234 line_delim,
235 complement,
236 &mut buf,
237 );
238 if !buf.is_empty() {
239 out.write_all(&buf)?;
240 }
241 }
242 Ok(())
243}
244
245fn process_fields_chunk(
247 data: &[u8],
248 delim: u8,
249 ranges: &[Range],
250 output_delim: &[u8],
251 suppress: bool,
252 max_field: usize,
253 field_mask: u64,
254 line_delim: u8,
255 complement: bool,
256 buf: &mut Vec<u8>,
257) {
258 let mut start = 0;
259 for end_pos in memchr_iter(line_delim, data) {
260 let line = &data[start..end_pos];
261 extract_fields_to_buf(
262 line,
263 delim,
264 ranges,
265 output_delim,
266 suppress,
267 max_field,
268 field_mask,
269 line_delim,
270 buf,
271 complement,
272 );
273 start = end_pos + 1;
274 }
275 if start < data.len() {
276 extract_fields_to_buf(
277 &data[start..],
278 delim,
279 ranges,
280 output_delim,
281 suppress,
282 max_field,
283 field_mask,
284 line_delim,
285 buf,
286 complement,
287 );
288 }
289}
290
291fn process_single_field(
297 data: &[u8],
298 delim: u8,
299 line_delim: u8,
300 target: usize,
301 suppress: bool,
302 out: &mut impl Write,
303) -> io::Result<()> {
304 let target_idx = target - 1;
305
306 if data.len() >= PARALLEL_THRESHOLD {
307 let chunks = split_into_chunks(data, line_delim);
309 let results: Vec<Vec<u8>> = chunks
310 .par_iter()
311 .map(|chunk| {
312 let mut buf = Vec::with_capacity(chunk.len() / 4);
313 process_single_field_chunk(
314 chunk, delim, target_idx, line_delim, suppress, &mut buf,
315 );
316 buf
317 })
318 .collect();
319 for result in &results {
320 if !result.is_empty() {
321 out.write_all(result)?;
322 }
323 }
324 } else {
325 let mut buf = Vec::with_capacity(data.len() / 4);
327 process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
328 if !buf.is_empty() {
329 out.write_all(&buf)?;
330 }
331 }
332 Ok(())
333}
334
335fn process_single_field_chunk(
337 data: &[u8],
338 delim: u8,
339 target_idx: usize,
340 line_delim: u8,
341 suppress: bool,
342 buf: &mut Vec<u8>,
343) {
344 let mut start = 0;
345 for end_pos in memchr_iter(line_delim, data) {
346 let line = &data[start..end_pos];
347 extract_single_field_line(line, delim, target_idx, line_delim, suppress, buf);
348 start = end_pos + 1;
349 }
350 if start < data.len() {
352 extract_single_field_line(&data[start..], delim, target_idx, line_delim, suppress, buf);
353 }
354}
355
356#[inline(always)]
359fn extract_single_field_line(
360 line: &[u8],
361 delim: u8,
362 target_idx: usize,
363 line_delim: u8,
364 suppress: bool,
365 buf: &mut Vec<u8>,
366) {
367 if line.is_empty() {
368 if !suppress {
369 buf.push(line_delim);
370 }
371 return;
372 }
373
374 let mut delim_positions: [usize; 64] = [0; 64];
377 let mut num_delims: usize = 0;
378
379 for pos in memchr_iter(delim, line) {
380 if num_delims < 64 {
381 delim_positions[num_delims] = pos;
382 }
383 num_delims += 1;
384 if num_delims > target_idx {
386 break;
387 }
388 }
389
390 if num_delims == 0 {
392 if !suppress {
393 buf.extend_from_slice(line);
394 buf.push(line_delim);
395 }
396 return;
397 }
398
399 if target_idx == 0 {
401 buf.extend_from_slice(&line[..delim_positions[0]]);
402 buf.push(line_delim);
403 return;
404 }
405
406 if num_delims < target_idx {
408 buf.push(line_delim);
409 return;
410 }
411
412 let field_start = delim_positions[target_idx - 1] + 1;
414
415 if num_delims > target_idx && target_idx < 64 {
417 buf.extend_from_slice(&line[field_start..delim_positions[target_idx]]);
418 } else {
419 buf.extend_from_slice(&line[field_start..]);
420 }
421 buf.push(line_delim);
422}
423
424#[inline(always)]
427fn extract_fields_to_buf(
428 line: &[u8],
429 delim: u8,
430 ranges: &[Range],
431 output_delim: &[u8],
432 suppress: bool,
433 max_field: usize,
434 field_mask: u64,
435 line_delim: u8,
436 buf: &mut Vec<u8>,
437 complement: bool,
438) {
439 let len = line.len();
440
441 if len == 0 {
443 if !suppress {
444 buf.push(line_delim);
445 }
446 return;
447 }
448
449 let mut field_num: usize = 1;
450 let mut field_start: usize = 0;
451 let mut first_output = true;
452 let mut has_delim = false;
453
454 for delim_pos in memchr_iter(delim, line) {
456 has_delim = true;
457
458 if is_selected(field_num, field_mask, ranges, complement) {
459 if !first_output {
460 buf.extend_from_slice(output_delim);
461 }
462 buf.extend_from_slice(&line[field_start..delim_pos]);
463 first_output = false;
464 }
465
466 field_num += 1;
467 field_start = delim_pos + 1;
468
469 if field_num > max_field {
471 break;
472 }
473 }
474
475 if (field_num <= max_field || complement)
477 && has_delim
478 && is_selected(field_num, field_mask, ranges, complement)
479 {
480 if !first_output {
481 buf.extend_from_slice(output_delim);
482 }
483 buf.extend_from_slice(&line[field_start..len]);
484 first_output = false;
485 }
486
487 if !first_output {
489 buf.push(line_delim);
491 } else if !has_delim {
492 if !suppress {
494 buf.extend_from_slice(line);
495 buf.push(line_delim);
496 }
497 } else {
498 buf.push(line_delim);
500 }
501}
502
503fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
507 let line_delim = cfg.line_delim;
508 let ranges = cfg.ranges;
509 let complement = cfg.complement;
510 let output_delim = cfg.output_delim;
511
512 if data.len() >= PARALLEL_THRESHOLD {
513 let chunks = split_into_chunks(data, line_delim);
514 let results: Vec<Vec<u8>> = chunks
515 .par_iter()
516 .map(|chunk| {
517 let mut buf = Vec::with_capacity(chunk.len() / 2);
518 process_bytes_chunk(
519 chunk,
520 ranges,
521 complement,
522 output_delim,
523 line_delim,
524 &mut buf,
525 );
526 buf
527 })
528 .collect();
529 for result in &results {
530 if !result.is_empty() {
531 out.write_all(result)?;
532 }
533 }
534 } else {
535 let mut buf = Vec::with_capacity(data.len() / 2);
536 process_bytes_chunk(data, ranges, complement, output_delim, line_delim, &mut buf);
537 if !buf.is_empty() {
538 out.write_all(&buf)?;
539 }
540 }
541 Ok(())
542}
543
544fn process_bytes_chunk(
546 data: &[u8],
547 ranges: &[Range],
548 complement: bool,
549 output_delim: &[u8],
550 line_delim: u8,
551 buf: &mut Vec<u8>,
552) {
553 let mut start = 0;
554 for end_pos in memchr_iter(line_delim, data) {
555 let line = &data[start..end_pos];
556 cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
557 buf.push(line_delim);
558 start = end_pos + 1;
559 }
560 if start < data.len() {
561 cut_bytes_to_buf(&data[start..], ranges, complement, output_delim, buf);
562 buf.push(line_delim);
563 }
564}
565
566#[inline(always)]
568fn cut_bytes_to_buf(
569 line: &[u8],
570 ranges: &[Range],
571 complement: bool,
572 output_delim: &[u8],
573 buf: &mut Vec<u8>,
574) {
575 let mut first_range = true;
576
577 if complement {
578 let len = line.len();
579 let mut pos: usize = 1;
580 for r in ranges {
581 let rs = r.start;
582 let re = r.end.min(len);
583 if pos < rs {
584 if !first_range && !output_delim.is_empty() {
585 buf.extend_from_slice(output_delim);
586 }
587 buf.extend_from_slice(&line[pos - 1..rs - 1]);
588 first_range = false;
589 }
590 pos = re + 1;
591 if pos > len {
592 break;
593 }
594 }
595 if pos <= len {
596 if !first_range && !output_delim.is_empty() {
597 buf.extend_from_slice(output_delim);
598 }
599 buf.extend_from_slice(&line[pos - 1..len]);
600 }
601 } else {
602 for r in ranges {
603 let start = r.start.saturating_sub(1);
604 let end = r.end.min(line.len());
605 if start >= line.len() {
606 break;
607 }
608 if !first_range && !output_delim.is_empty() {
609 buf.extend_from_slice(output_delim);
610 }
611 buf.extend_from_slice(&line[start..end]);
612 first_range = false;
613 }
614 }
615}
616
617#[inline]
623pub fn cut_fields(
624 line: &[u8],
625 delim: u8,
626 ranges: &[Range],
627 complement: bool,
628 output_delim: &[u8],
629 suppress_no_delim: bool,
630 out: &mut impl Write,
631) -> io::Result<bool> {
632 if memchr::memchr(delim, line).is_none() {
634 if !suppress_no_delim {
635 out.write_all(line)?;
636 return Ok(true);
637 }
638 return Ok(false); }
640
641 let mut field_num: usize = 1;
643 let mut field_start: usize = 0;
644 let mut first_output = true;
645
646 for delim_pos in memchr_iter(delim, line) {
647 let selected = in_ranges(ranges, field_num) != complement;
648 if selected {
649 if !first_output {
650 out.write_all(output_delim)?;
651 }
652 out.write_all(&line[field_start..delim_pos])?;
653 first_output = false;
654 }
655 field_start = delim_pos + 1;
656 field_num += 1;
657 }
658
659 let selected = in_ranges(ranges, field_num) != complement;
661 if selected {
662 if !first_output {
663 out.write_all(output_delim)?;
664 }
665 out.write_all(&line[field_start..])?;
666 }
667
668 Ok(true)
669}
670
671#[inline]
674pub fn cut_bytes(
675 line: &[u8],
676 ranges: &[Range],
677 complement: bool,
678 output_delim: &[u8],
679 out: &mut impl Write,
680) -> io::Result<bool> {
681 let mut first_range = true;
682
683 if complement {
684 let len = line.len();
685 let mut comp_ranges = Vec::new();
686 let mut pos: usize = 1;
687 for r in ranges {
688 let rs = r.start;
689 let re = r.end.min(len);
690 if pos < rs {
691 comp_ranges.push((pos, rs - 1));
692 }
693 pos = re + 1;
694 if pos > len {
695 break;
696 }
697 }
698 if pos <= len {
699 comp_ranges.push((pos, len));
700 }
701 for &(s, e) in &comp_ranges {
702 if !first_range && !output_delim.is_empty() {
703 out.write_all(output_delim)?;
704 }
705 out.write_all(&line[s - 1..e])?;
706 first_range = false;
707 }
708 } else {
709 for r in ranges {
710 let start = r.start.saturating_sub(1);
711 let end = r.end.min(line.len());
712 if start >= line.len() {
713 break;
714 }
715 if !first_range && !output_delim.is_empty() {
716 out.write_all(output_delim)?;
717 }
718 out.write_all(&line[start..end])?;
719 first_range = false;
720 }
721 }
722 Ok(true)
723}
724
725pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
728 match cfg.mode {
729 CutMode::Fields => process_fields_fast(data, cfg, out),
730 CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
731 }
732}
733
734pub fn process_cut_reader<R: BufRead>(
736 mut reader: R,
737 cfg: &CutConfig,
738 out: &mut impl Write,
739) -> io::Result<()> {
740 let mut buf = Vec::new();
741
742 loop {
743 buf.clear();
744 let n = reader.read_until(cfg.line_delim, &mut buf)?;
745 if n == 0 {
746 break;
747 }
748
749 let has_line_delim = buf.last() == Some(&cfg.line_delim);
750 let line = if has_line_delim {
751 &buf[..buf.len() - 1]
752 } else {
753 &buf[..]
754 };
755
756 let wrote = process_one_line(line, cfg, out)?;
757
758 if wrote {
760 out.write_all(&[cfg.line_delim])?;
761 }
762 }
763
764 Ok(())
765}
766
767#[inline]
769fn process_one_line(line: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<bool> {
770 match cfg.mode {
771 CutMode::Fields => cut_fields(
772 line,
773 cfg.delim,
774 cfg.ranges,
775 cfg.complement,
776 cfg.output_delim,
777 cfg.suppress_no_delim,
778 out,
779 ),
780 CutMode::Bytes | CutMode::Characters => {
781 cut_bytes(line, cfg.ranges, cfg.complement, cfg.output_delim, out)
782 }
783 }
784}
785
786#[derive(Debug, Clone, Copy, PartialEq)]
788pub enum CutMode {
789 Bytes,
790 Characters,
791 Fields,
792}