1use memchr::memchr_iter;
2use rayon::prelude::*;
3use std::io::{self, BufRead, Write};
4
5const PARALLEL_THRESHOLD: usize = 1024 * 1024;
9
10pub struct CutConfig<'a> {
12 pub mode: CutMode,
13 pub ranges: &'a [Range],
14 pub complement: bool,
15 pub delim: u8,
16 pub output_delim: &'a [u8],
17 pub suppress_no_delim: bool,
18 pub line_delim: u8,
19}
20
21#[derive(Debug, Clone)]
23pub struct Range {
24 pub start: usize, pub end: usize, }
27
28pub fn parse_ranges(spec: &str) -> Result<Vec<Range>, String> {
31 let mut ranges = Vec::new();
32
33 for part in spec.split(',') {
34 let part = part.trim();
35 if part.is_empty() {
36 continue;
37 }
38
39 if let Some(idx) = part.find('-') {
40 let left = &part[..idx];
41 let right = &part[idx + 1..];
42
43 let start = if left.is_empty() {
44 1
45 } else {
46 left.parse::<usize>()
47 .map_err(|_| format!("invalid range: '{}'", part))?
48 };
49
50 let end = if right.is_empty() {
51 usize::MAX
52 } else {
53 right
54 .parse::<usize>()
55 .map_err(|_| format!("invalid range: '{}'", part))?
56 };
57
58 if start == 0 {
59 return Err("fields and positions are numbered from 1".to_string());
60 }
61 if start > end {
62 return Err(format!("invalid decreasing range: '{}'", part));
63 }
64
65 ranges.push(Range { start, end });
66 } else {
67 let n = part
68 .parse::<usize>()
69 .map_err(|_| format!("invalid field: '{}'", part))?;
70 if n == 0 {
71 return Err("fields and positions are numbered from 1".to_string());
72 }
73 ranges.push(Range { start: n, end: n });
74 }
75 }
76
77 if ranges.is_empty() {
78 return Err("you must specify a list of bytes, characters, or fields".to_string());
79 }
80
81 ranges.sort_by_key(|r| (r.start, r.end));
83 let mut merged = vec![ranges[0].clone()];
84 for r in &ranges[1..] {
85 let last = merged.last_mut().unwrap();
86 if r.start <= last.end.saturating_add(1) {
87 last.end = last.end.max(r.end);
88 } else {
89 merged.push(r.clone());
90 }
91 }
92
93 Ok(merged)
94}
95
96#[inline(always)]
99fn in_ranges(ranges: &[Range], pos: usize) -> bool {
100 for r in ranges {
101 if pos < r.start {
102 return false; }
104 if pos <= r.end {
105 return true;
106 }
107 }
108 false
109}
110
111#[inline]
114fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
115 let mut mask: u64 = 0;
116 for i in 1..=64u32 {
117 let in_range = in_ranges(ranges, i as usize);
118 if in_range != complement {
119 mask |= 1u64 << (i - 1);
120 }
121 }
122 mask
123}
124
125#[inline(always)]
127fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
128 if field_num <= 64 {
129 (mask >> (field_num - 1)) & 1 == 1
130 } else {
131 in_ranges(ranges, field_num) != complement
132 }
133}
134
135fn split_into_chunks<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
140 let num_threads = rayon::current_num_threads().max(1);
141 if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
142 return vec![data];
143 }
144
145 let chunk_size = data.len() / num_threads;
146 let mut chunks = Vec::with_capacity(num_threads);
147 let mut pos = 0;
148
149 for _ in 0..num_threads - 1 {
150 let target = pos + chunk_size;
151 if target >= data.len() {
152 break;
153 }
154 let boundary = memchr::memchr(line_delim, &data[target..])
156 .map(|p| target + p + 1)
157 .unwrap_or(data.len());
158 if boundary > pos {
159 chunks.push(&data[pos..boundary]);
160 }
161 pos = boundary;
162 }
163
164 if pos < data.len() {
166 chunks.push(&data[pos..]);
167 }
168
169 chunks
170}
171
172fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
177 let delim = cfg.delim;
178 let line_delim = cfg.line_delim;
179 let ranges = cfg.ranges;
180 let complement = cfg.complement;
181 let output_delim = cfg.output_delim;
182 let suppress = cfg.suppress_no_delim;
183
184 if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
186 return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
187 }
188
189 let max_field = if complement {
191 usize::MAX
192 } else {
193 ranges.last().map(|r| r.end).unwrap_or(0)
194 };
195 let field_mask = compute_field_mask(ranges, complement);
196
197 if data.len() >= PARALLEL_THRESHOLD {
198 let chunks = split_into_chunks(data, line_delim);
200 let results: Vec<Vec<u8>> = chunks
201 .par_iter()
202 .map(|chunk| {
203 let mut buf = Vec::with_capacity(chunk.len() / 2);
204 process_fields_chunk(
205 chunk,
206 delim,
207 ranges,
208 output_delim,
209 suppress,
210 max_field,
211 field_mask,
212 line_delim,
213 complement,
214 &mut buf,
215 );
216 buf
217 })
218 .collect();
219 for result in &results {
220 if !result.is_empty() {
221 out.write_all(result)?;
222 }
223 }
224 } else {
225 let mut buf = Vec::with_capacity(data.len() / 2);
227 process_fields_chunk(
228 data,
229 delim,
230 ranges,
231 output_delim,
232 suppress,
233 max_field,
234 field_mask,
235 line_delim,
236 complement,
237 &mut buf,
238 );
239 if !buf.is_empty() {
240 out.write_all(&buf)?;
241 }
242 }
243 Ok(())
244}
245
246fn process_fields_chunk(
248 data: &[u8],
249 delim: u8,
250 ranges: &[Range],
251 output_delim: &[u8],
252 suppress: bool,
253 max_field: usize,
254 field_mask: u64,
255 line_delim: u8,
256 complement: bool,
257 buf: &mut Vec<u8>,
258) {
259 let mut start = 0;
260 for end_pos in memchr_iter(line_delim, data) {
261 let line = &data[start..end_pos];
262 extract_fields_to_buf(
263 line,
264 delim,
265 ranges,
266 output_delim,
267 suppress,
268 max_field,
269 field_mask,
270 line_delim,
271 buf,
272 complement,
273 );
274 start = end_pos + 1;
275 }
276 if start < data.len() {
277 extract_fields_to_buf(
278 &data[start..],
279 delim,
280 ranges,
281 output_delim,
282 suppress,
283 max_field,
284 field_mask,
285 line_delim,
286 buf,
287 complement,
288 );
289 }
290}
291
292fn process_single_field(
298 data: &[u8],
299 delim: u8,
300 line_delim: u8,
301 target: usize,
302 suppress: bool,
303 out: &mut impl Write,
304) -> io::Result<()> {
305 let target_idx = target - 1;
306
307 if data.len() >= PARALLEL_THRESHOLD {
308 let chunks = split_into_chunks(data, line_delim);
310 let results: Vec<Vec<u8>> = chunks
311 .par_iter()
312 .map(|chunk| {
313 let mut buf = Vec::with_capacity(chunk.len() / 4);
314 process_single_field_chunk(
315 chunk, delim, target_idx, line_delim, suppress, &mut buf,
316 );
317 buf
318 })
319 .collect();
320 for result in &results {
321 if !result.is_empty() {
322 out.write_all(result)?;
323 }
324 }
325 } else {
326 let mut buf = Vec::with_capacity(data.len() / 4);
328 process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
329 if !buf.is_empty() {
330 out.write_all(&buf)?;
331 }
332 }
333 Ok(())
334}
335
336fn process_single_field_chunk(
338 data: &[u8],
339 delim: u8,
340 target_idx: usize,
341 line_delim: u8,
342 suppress: bool,
343 buf: &mut Vec<u8>,
344) {
345 let mut start = 0;
346 for end_pos in memchr_iter(line_delim, data) {
347 let line = &data[start..end_pos];
348 extract_single_field_line(line, delim, target_idx, line_delim, suppress, buf);
349 start = end_pos + 1;
350 }
351 if start < data.len() {
353 extract_single_field_line(&data[start..], delim, target_idx, line_delim, suppress, buf);
354 }
355}
356
357#[inline(always)]
360fn extract_single_field_line(
361 line: &[u8],
362 delim: u8,
363 target_idx: usize,
364 line_delim: u8,
365 suppress: bool,
366 buf: &mut Vec<u8>,
367) {
368 if line.is_empty() {
369 if !suppress {
370 buf.push(line_delim);
371 }
372 return;
373 }
374
375 let mut delim_positions: [usize; 64] = [0; 64];
378 let mut num_delims: usize = 0;
379
380 for pos in memchr_iter(delim, line) {
381 if num_delims < 64 {
382 delim_positions[num_delims] = pos;
383 }
384 num_delims += 1;
385 if num_delims > target_idx {
387 break;
388 }
389 }
390
391 if num_delims == 0 {
393 if !suppress {
394 buf.extend_from_slice(line);
395 buf.push(line_delim);
396 }
397 return;
398 }
399
400 if target_idx == 0 {
402 buf.extend_from_slice(&line[..delim_positions[0]]);
403 buf.push(line_delim);
404 return;
405 }
406
407 if num_delims < target_idx {
409 buf.push(line_delim);
410 return;
411 }
412
413 let field_start = delim_positions[target_idx - 1] + 1;
415
416 if num_delims > target_idx && target_idx < 64 {
418 buf.extend_from_slice(&line[field_start..delim_positions[target_idx]]);
419 } else {
420 buf.extend_from_slice(&line[field_start..]);
421 }
422 buf.push(line_delim);
423}
424
425#[inline(always)]
428fn extract_fields_to_buf(
429 line: &[u8],
430 delim: u8,
431 ranges: &[Range],
432 output_delim: &[u8],
433 suppress: bool,
434 max_field: usize,
435 field_mask: u64,
436 line_delim: u8,
437 buf: &mut Vec<u8>,
438 complement: bool,
439) {
440 let len = line.len();
441
442 if len == 0 {
444 if !suppress {
445 buf.push(line_delim);
446 }
447 return;
448 }
449
450 let mut field_num: usize = 1;
451 let mut field_start: usize = 0;
452 let mut first_output = true;
453 let mut has_delim = false;
454
455 for delim_pos in memchr_iter(delim, line) {
457 has_delim = true;
458
459 if is_selected(field_num, field_mask, ranges, complement) {
460 if !first_output {
461 buf.extend_from_slice(output_delim);
462 }
463 buf.extend_from_slice(&line[field_start..delim_pos]);
464 first_output = false;
465 }
466
467 field_num += 1;
468 field_start = delim_pos + 1;
469
470 if field_num > max_field {
472 break;
473 }
474 }
475
476 if (field_num <= max_field || complement)
478 && has_delim
479 && is_selected(field_num, field_mask, ranges, complement)
480 {
481 if !first_output {
482 buf.extend_from_slice(output_delim);
483 }
484 buf.extend_from_slice(&line[field_start..len]);
485 first_output = false;
486 }
487
488 if !first_output {
490 buf.push(line_delim);
492 } else if !has_delim {
493 if !suppress {
495 buf.extend_from_slice(line);
496 buf.push(line_delim);
497 }
498 } else {
499 buf.push(line_delim);
501 }
502}
503
504fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
508 let line_delim = cfg.line_delim;
509 let ranges = cfg.ranges;
510 let complement = cfg.complement;
511 let output_delim = cfg.output_delim;
512
513 if data.len() >= PARALLEL_THRESHOLD {
514 let chunks = split_into_chunks(data, line_delim);
515 let results: Vec<Vec<u8>> = chunks
516 .par_iter()
517 .map(|chunk| {
518 let mut buf = Vec::with_capacity(chunk.len() / 2);
519 process_bytes_chunk(
520 chunk,
521 ranges,
522 complement,
523 output_delim,
524 line_delim,
525 &mut buf,
526 );
527 buf
528 })
529 .collect();
530 for result in &results {
531 if !result.is_empty() {
532 out.write_all(result)?;
533 }
534 }
535 } else {
536 let mut buf = Vec::with_capacity(data.len() / 2);
537 process_bytes_chunk(data, ranges, complement, output_delim, line_delim, &mut buf);
538 if !buf.is_empty() {
539 out.write_all(&buf)?;
540 }
541 }
542 Ok(())
543}
544
545fn process_bytes_chunk(
547 data: &[u8],
548 ranges: &[Range],
549 complement: bool,
550 output_delim: &[u8],
551 line_delim: u8,
552 buf: &mut Vec<u8>,
553) {
554 let mut start = 0;
555 for end_pos in memchr_iter(line_delim, data) {
556 let line = &data[start..end_pos];
557 cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
558 buf.push(line_delim);
559 start = end_pos + 1;
560 }
561 if start < data.len() {
562 cut_bytes_to_buf(&data[start..], ranges, complement, output_delim, buf);
563 buf.push(line_delim);
564 }
565}
566
567#[inline(always)]
570fn cut_bytes_to_buf(
571 line: &[u8],
572 ranges: &[Range],
573 complement: bool,
574 output_delim: &[u8],
575 buf: &mut Vec<u8>,
576) {
577 let len = line.len();
578 let mut first_range = true;
579
580 if complement {
581 let mut pos: usize = 1;
582 for r in ranges {
583 let rs = r.start;
584 let re = r.end.min(len);
585 if pos < rs {
586 if !first_range && !output_delim.is_empty() {
587 buf.extend_from_slice(output_delim);
588 }
589 buf.extend_from_slice(&line[pos - 1..rs - 1]);
590 first_range = false;
591 }
592 pos = re + 1;
593 if pos > len {
594 break;
595 }
596 }
597 if pos <= len {
598 if !first_range && !output_delim.is_empty() {
599 buf.extend_from_slice(output_delim);
600 }
601 buf.extend_from_slice(&line[pos - 1..len]);
602 }
603 } else if output_delim.is_empty() && ranges.len() == 1 {
604 let start = ranges[0].start.saturating_sub(1);
606 let end = ranges[0].end.min(len);
607 if start < len {
608 buf.extend_from_slice(&line[start..end]);
609 }
610 } else {
611 for r in ranges {
612 let start = r.start.saturating_sub(1);
613 let end = r.end.min(len);
614 if start >= len {
615 break;
616 }
617 if !first_range && !output_delim.is_empty() {
618 buf.extend_from_slice(output_delim);
619 }
620 buf.extend_from_slice(&line[start..end]);
621 first_range = false;
622 }
623 }
624}
625
626#[inline]
632pub fn cut_fields(
633 line: &[u8],
634 delim: u8,
635 ranges: &[Range],
636 complement: bool,
637 output_delim: &[u8],
638 suppress_no_delim: bool,
639 out: &mut impl Write,
640) -> io::Result<bool> {
641 if memchr::memchr(delim, line).is_none() {
643 if !suppress_no_delim {
644 out.write_all(line)?;
645 return Ok(true);
646 }
647 return Ok(false); }
649
650 let mut field_num: usize = 1;
652 let mut field_start: usize = 0;
653 let mut first_output = true;
654
655 for delim_pos in memchr_iter(delim, line) {
656 let selected = in_ranges(ranges, field_num) != complement;
657 if selected {
658 if !first_output {
659 out.write_all(output_delim)?;
660 }
661 out.write_all(&line[field_start..delim_pos])?;
662 first_output = false;
663 }
664 field_start = delim_pos + 1;
665 field_num += 1;
666 }
667
668 let selected = in_ranges(ranges, field_num) != complement;
670 if selected {
671 if !first_output {
672 out.write_all(output_delim)?;
673 }
674 out.write_all(&line[field_start..])?;
675 }
676
677 Ok(true)
678}
679
680#[inline]
683pub fn cut_bytes(
684 line: &[u8],
685 ranges: &[Range],
686 complement: bool,
687 output_delim: &[u8],
688 out: &mut impl Write,
689) -> io::Result<bool> {
690 let mut first_range = true;
691
692 if complement {
693 let len = line.len();
694 let mut comp_ranges = Vec::new();
695 let mut pos: usize = 1;
696 for r in ranges {
697 let rs = r.start;
698 let re = r.end.min(len);
699 if pos < rs {
700 comp_ranges.push((pos, rs - 1));
701 }
702 pos = re + 1;
703 if pos > len {
704 break;
705 }
706 }
707 if pos <= len {
708 comp_ranges.push((pos, len));
709 }
710 for &(s, e) in &comp_ranges {
711 if !first_range && !output_delim.is_empty() {
712 out.write_all(output_delim)?;
713 }
714 out.write_all(&line[s - 1..e])?;
715 first_range = false;
716 }
717 } else {
718 for r in ranges {
719 let start = r.start.saturating_sub(1);
720 let end = r.end.min(line.len());
721 if start >= line.len() {
722 break;
723 }
724 if !first_range && !output_delim.is_empty() {
725 out.write_all(output_delim)?;
726 }
727 out.write_all(&line[start..end])?;
728 first_range = false;
729 }
730 }
731 Ok(true)
732}
733
734pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
737 match cfg.mode {
738 CutMode::Fields => process_fields_fast(data, cfg, out),
739 CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
740 }
741}
742
743pub fn process_cut_reader<R: BufRead>(
745 mut reader: R,
746 cfg: &CutConfig,
747 out: &mut impl Write,
748) -> io::Result<()> {
749 let mut buf = Vec::new();
750
751 loop {
752 buf.clear();
753 let n = reader.read_until(cfg.line_delim, &mut buf)?;
754 if n == 0 {
755 break;
756 }
757
758 let has_line_delim = buf.last() == Some(&cfg.line_delim);
759 let line = if has_line_delim {
760 &buf[..buf.len() - 1]
761 } else {
762 &buf[..]
763 };
764
765 let wrote = process_one_line(line, cfg, out)?;
766
767 if wrote {
769 out.write_all(&[cfg.line_delim])?;
770 }
771 }
772
773 Ok(())
774}
775
776#[inline]
778fn process_one_line(line: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<bool> {
779 match cfg.mode {
780 CutMode::Fields => cut_fields(
781 line,
782 cfg.delim,
783 cfg.ranges,
784 cfg.complement,
785 cfg.output_delim,
786 cfg.suppress_no_delim,
787 out,
788 ),
789 CutMode::Bytes | CutMode::Characters => {
790 cut_bytes(line, cfg.ranges, cfg.complement, cfg.output_delim, out)
791 }
792 }
793}
794
795#[derive(Debug, Clone, Copy, PartialEq)]
797pub enum CutMode {
798 Bytes,
799 Characters,
800 Fields,
801}