1use memchr::memchr_iter;
2use rayon::prelude::*;
3use std::io::{self, BufRead, Write};
4
/// Input size in bytes (1 MiB) from which the processors in this file split
/// the data into chunks and hand them to the parallel iterator.
const PARALLEL_THRESHOLD: usize = 1 << 20;
9
10pub struct CutConfig<'a> {
12 pub mode: CutMode,
13 pub ranges: &'a [Range],
14 pub complement: bool,
15 pub delim: u8,
16 pub output_delim: &'a [u8],
17 pub suppress_no_delim: bool,
18 pub line_delim: u8,
19}
20
/// An inclusive span of 1-based positions (bytes, characters or fields).
///
/// `parse_ranges` stores an open-ended `N-` as `end == usize::MAX`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Range {
    /// First selected position (1-based).
    pub start: usize,
    /// Last selected position, inclusive.
    pub end: usize,
}
27
28pub fn parse_ranges(spec: &str) -> Result<Vec<Range>, String> {
31 let mut ranges = Vec::new();
32
33 for part in spec.split(',') {
34 let part = part.trim();
35 if part.is_empty() {
36 continue;
37 }
38
39 if let Some(idx) = part.find('-') {
40 let left = &part[..idx];
41 let right = &part[idx + 1..];
42
43 let start = if left.is_empty() {
44 1
45 } else {
46 left.parse::<usize>()
47 .map_err(|_| format!("invalid range: '{}'", part))?
48 };
49
50 let end = if right.is_empty() {
51 usize::MAX
52 } else {
53 right
54 .parse::<usize>()
55 .map_err(|_| format!("invalid range: '{}'", part))?
56 };
57
58 if start == 0 {
59 return Err("fields and positions are numbered from 1".to_string());
60 }
61 if start > end {
62 return Err(format!("invalid decreasing range: '{}'", part));
63 }
64
65 ranges.push(Range { start, end });
66 } else {
67 let n = part
68 .parse::<usize>()
69 .map_err(|_| format!("invalid field: '{}'", part))?;
70 if n == 0 {
71 return Err("fields and positions are numbered from 1".to_string());
72 }
73 ranges.push(Range { start: n, end: n });
74 }
75 }
76
77 if ranges.is_empty() {
78 return Err("you must specify a list of bytes, characters, or fields".to_string());
79 }
80
81 ranges.sort_by_key(|r| (r.start, r.end));
83 let mut merged = vec![ranges[0].clone()];
84 for r in &ranges[1..] {
85 let last = merged.last_mut().unwrap();
86 if r.start <= last.end.saturating_add(1) {
87 last.end = last.end.max(r.end);
88 } else {
89 merged.push(r.clone());
90 }
91 }
92
93 Ok(merged)
94}
95
96#[inline(always)]
99fn in_ranges(ranges: &[Range], pos: usize) -> bool {
100 for r in ranges {
101 if pos < r.start {
102 return false; }
104 if pos <= r.end {
105 return true;
106 }
107 }
108 false
109}
110
111#[inline]
114fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
115 let mut mask: u64 = 0;
116 for i in 1..=64u32 {
117 let in_range = in_ranges(ranges, i as usize);
118 if in_range != complement {
119 mask |= 1u64 << (i - 1);
120 }
121 }
122 mask
123}
124
125#[inline(always)]
127fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
128 if field_num <= 64 {
129 (mask >> (field_num - 1)) & 1 == 1
130 } else {
131 in_ranges(ranges, field_num) != complement
132 }
133}
134
135fn split_into_chunks<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
140 let num_threads = rayon::current_num_threads().max(1);
141 if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
142 return vec![data];
143 }
144
145 let chunk_size = data.len() / num_threads;
146 let mut chunks = Vec::with_capacity(num_threads);
147 let mut pos = 0;
148
149 for _ in 0..num_threads - 1 {
150 let target = pos + chunk_size;
151 if target >= data.len() {
152 break;
153 }
154 let boundary = memchr::memchr(line_delim, &data[target..])
156 .map(|p| target + p + 1)
157 .unwrap_or(data.len());
158 if boundary > pos {
159 chunks.push(&data[pos..boundary]);
160 }
161 pos = boundary;
162 }
163
164 if pos < data.len() {
166 chunks.push(&data[pos..]);
167 }
168
169 chunks
170}
171
172fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
177 let delim = cfg.delim;
178 let line_delim = cfg.line_delim;
179 let ranges = cfg.ranges;
180 let complement = cfg.complement;
181 let output_delim = cfg.output_delim;
182 let suppress = cfg.suppress_no_delim;
183
184 if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
186 return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
187 }
188
189 let max_field = if complement {
191 usize::MAX
192 } else {
193 ranges.last().map(|r| r.end).unwrap_or(0)
194 };
195 let field_mask = compute_field_mask(ranges, complement);
196
197 if data.len() >= PARALLEL_THRESHOLD {
198 let chunks = split_into_chunks(data, line_delim);
200 let results: Vec<Vec<u8>> = chunks
201 .par_iter()
202 .map(|chunk| {
203 let mut buf = Vec::with_capacity(chunk.len() / 2);
204 process_fields_chunk(
205 chunk,
206 delim,
207 ranges,
208 output_delim,
209 suppress,
210 max_field,
211 field_mask,
212 line_delim,
213 complement,
214 &mut buf,
215 );
216 buf
217 })
218 .collect();
219 for result in &results {
220 if !result.is_empty() {
221 out.write_all(result)?;
222 }
223 }
224 } else {
225 let mut buf = Vec::with_capacity(data.len() / 2);
227 process_fields_chunk(
228 data,
229 delim,
230 ranges,
231 output_delim,
232 suppress,
233 max_field,
234 field_mask,
235 line_delim,
236 complement,
237 &mut buf,
238 );
239 if !buf.is_empty() {
240 out.write_all(&buf)?;
241 }
242 }
243 Ok(())
244}
245
246fn process_fields_chunk(
248 data: &[u8],
249 delim: u8,
250 ranges: &[Range],
251 output_delim: &[u8],
252 suppress: bool,
253 max_field: usize,
254 field_mask: u64,
255 line_delim: u8,
256 complement: bool,
257 buf: &mut Vec<u8>,
258) {
259 let mut start = 0;
260 for end_pos in memchr_iter(line_delim, data) {
261 let line = &data[start..end_pos];
262 extract_fields_to_buf(
263 line,
264 delim,
265 ranges,
266 output_delim,
267 suppress,
268 max_field,
269 field_mask,
270 line_delim,
271 buf,
272 complement,
273 );
274 start = end_pos + 1;
275 }
276 if start < data.len() {
277 extract_fields_to_buf(
278 &data[start..],
279 delim,
280 ranges,
281 output_delim,
282 suppress,
283 max_field,
284 field_mask,
285 line_delim,
286 buf,
287 complement,
288 );
289 }
290}
291
292fn process_single_field(
298 data: &[u8],
299 delim: u8,
300 line_delim: u8,
301 target: usize,
302 suppress: bool,
303 out: &mut impl Write,
304) -> io::Result<()> {
305 let target_idx = target - 1;
306
307 if data.len() >= PARALLEL_THRESHOLD {
308 let chunks = split_into_chunks(data, line_delim);
310 let results: Vec<Vec<u8>> = chunks
311 .par_iter()
312 .map(|chunk| {
313 let mut buf = Vec::with_capacity(chunk.len() / 4);
314 process_single_field_chunk(
315 chunk, delim, target_idx, line_delim, suppress, &mut buf,
316 );
317 buf
318 })
319 .collect();
320 for result in &results {
321 if !result.is_empty() {
322 out.write_all(result)?;
323 }
324 }
325 } else {
326 let mut buf = Vec::with_capacity(data.len() / 4);
328 process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
329 if !buf.is_empty() {
330 out.write_all(&buf)?;
331 }
332 }
333 Ok(())
334}
335
336fn process_single_field_chunk(
338 data: &[u8],
339 delim: u8,
340 target_idx: usize,
341 line_delim: u8,
342 suppress: bool,
343 buf: &mut Vec<u8>,
344) {
345 let mut start = 0;
346 for end_pos in memchr_iter(line_delim, data) {
347 let line = &data[start..end_pos];
348 extract_single_field_line(line, delim, target_idx, line_delim, suppress, buf);
349 start = end_pos + 1;
350 }
351 if start < data.len() {
353 extract_single_field_line(&data[start..], delim, target_idx, line_delim, suppress, buf);
354 }
355}
356
357#[inline(always)]
361fn extract_single_field_line(
362 line: &[u8],
363 delim: u8,
364 target_idx: usize,
365 line_delim: u8,
366 suppress: bool,
367 buf: &mut Vec<u8>,
368) {
369 if line.is_empty() {
370 if !suppress {
371 buf.push(line_delim);
372 }
373 return;
374 }
375
376 if target_idx == 0 {
378 match memchr::memchr(delim, line) {
379 Some(pos) => {
380 buf.extend_from_slice(&line[..pos]);
381 buf.push(line_delim);
382 }
383 None => {
384 if !suppress {
386 buf.extend_from_slice(line);
387 buf.push(line_delim);
388 }
389 }
390 }
391 return;
392 }
393
394 let mut field_start = 0;
395 let mut field_idx = 0;
396 let mut has_delim = false;
397
398 for pos in memchr_iter(delim, line) {
399 has_delim = true;
400 if field_idx == target_idx {
401 buf.extend_from_slice(&line[field_start..pos]);
403 buf.push(line_delim);
404 return;
405 }
406 field_idx += 1;
407 field_start = pos + 1;
408 }
409
410 if !has_delim {
411 if !suppress {
413 buf.extend_from_slice(line);
414 buf.push(line_delim);
415 }
416 return;
417 }
418
419 if field_idx == target_idx {
420 buf.extend_from_slice(&line[field_start..]);
422 buf.push(line_delim);
423 } else {
424 buf.push(line_delim);
426 }
427}
428
429#[inline(always)]
432fn extract_fields_to_buf(
433 line: &[u8],
434 delim: u8,
435 ranges: &[Range],
436 output_delim: &[u8],
437 suppress: bool,
438 max_field: usize,
439 field_mask: u64,
440 line_delim: u8,
441 buf: &mut Vec<u8>,
442 complement: bool,
443) {
444 let len = line.len();
445
446 if len == 0 {
448 if !suppress {
449 buf.push(line_delim);
450 }
451 return;
452 }
453
454 let mut field_num: usize = 1;
455 let mut field_start: usize = 0;
456 let mut first_output = true;
457 let mut has_delim = false;
458
459 for delim_pos in memchr_iter(delim, line) {
461 has_delim = true;
462
463 if is_selected(field_num, field_mask, ranges, complement) {
464 if !first_output {
465 buf.extend_from_slice(output_delim);
466 }
467 buf.extend_from_slice(&line[field_start..delim_pos]);
468 first_output = false;
469 }
470
471 field_num += 1;
472 field_start = delim_pos + 1;
473
474 if field_num > max_field {
476 break;
477 }
478 }
479
480 if (field_num <= max_field || complement)
482 && has_delim
483 && is_selected(field_num, field_mask, ranges, complement)
484 {
485 if !first_output {
486 buf.extend_from_slice(output_delim);
487 }
488 buf.extend_from_slice(&line[field_start..len]);
489 first_output = false;
490 }
491
492 if !first_output {
494 buf.push(line_delim);
496 } else if !has_delim {
497 if !suppress {
499 buf.extend_from_slice(line);
500 buf.push(line_delim);
501 }
502 } else {
503 buf.push(line_delim);
505 }
506}
507
508fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
512 let line_delim = cfg.line_delim;
513 let ranges = cfg.ranges;
514 let complement = cfg.complement;
515 let output_delim = cfg.output_delim;
516
517 if data.len() >= PARALLEL_THRESHOLD {
518 let chunks = split_into_chunks(data, line_delim);
519 let results: Vec<Vec<u8>> = chunks
520 .par_iter()
521 .map(|chunk| {
522 let mut buf = Vec::with_capacity(chunk.len() / 2);
523 process_bytes_chunk(
524 chunk,
525 ranges,
526 complement,
527 output_delim,
528 line_delim,
529 &mut buf,
530 );
531 buf
532 })
533 .collect();
534 for result in &results {
535 if !result.is_empty() {
536 out.write_all(result)?;
537 }
538 }
539 } else {
540 let mut buf = Vec::with_capacity(data.len() / 2);
541 process_bytes_chunk(data, ranges, complement, output_delim, line_delim, &mut buf);
542 if !buf.is_empty() {
543 out.write_all(&buf)?;
544 }
545 }
546 Ok(())
547}
548
549fn process_bytes_chunk(
551 data: &[u8],
552 ranges: &[Range],
553 complement: bool,
554 output_delim: &[u8],
555 line_delim: u8,
556 buf: &mut Vec<u8>,
557) {
558 let mut start = 0;
559 for end_pos in memchr_iter(line_delim, data) {
560 let line = &data[start..end_pos];
561 cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
562 buf.push(line_delim);
563 start = end_pos + 1;
564 }
565 if start < data.len() {
566 cut_bytes_to_buf(&data[start..], ranges, complement, output_delim, buf);
567 buf.push(line_delim);
568 }
569}
570
571#[inline(always)]
574fn cut_bytes_to_buf(
575 line: &[u8],
576 ranges: &[Range],
577 complement: bool,
578 output_delim: &[u8],
579 buf: &mut Vec<u8>,
580) {
581 let len = line.len();
582 let mut first_range = true;
583
584 if complement {
585 let mut pos: usize = 1;
586 for r in ranges {
587 let rs = r.start;
588 let re = r.end.min(len);
589 if pos < rs {
590 if !first_range && !output_delim.is_empty() {
591 buf.extend_from_slice(output_delim);
592 }
593 buf.extend_from_slice(&line[pos - 1..rs - 1]);
594 first_range = false;
595 }
596 pos = re + 1;
597 if pos > len {
598 break;
599 }
600 }
601 if pos <= len {
602 if !first_range && !output_delim.is_empty() {
603 buf.extend_from_slice(output_delim);
604 }
605 buf.extend_from_slice(&line[pos - 1..len]);
606 }
607 } else if output_delim.is_empty() && ranges.len() == 1 {
608 let start = ranges[0].start.saturating_sub(1);
610 let end = ranges[0].end.min(len);
611 if start < len {
612 buf.extend_from_slice(&line[start..end]);
613 }
614 } else {
615 for r in ranges {
616 let start = r.start.saturating_sub(1);
617 let end = r.end.min(len);
618 if start >= len {
619 break;
620 }
621 if !first_range && !output_delim.is_empty() {
622 buf.extend_from_slice(output_delim);
623 }
624 buf.extend_from_slice(&line[start..end]);
625 first_range = false;
626 }
627 }
628}
629
630#[inline]
636pub fn cut_fields(
637 line: &[u8],
638 delim: u8,
639 ranges: &[Range],
640 complement: bool,
641 output_delim: &[u8],
642 suppress_no_delim: bool,
643 out: &mut impl Write,
644) -> io::Result<bool> {
645 if memchr::memchr(delim, line).is_none() {
647 if !suppress_no_delim {
648 out.write_all(line)?;
649 return Ok(true);
650 }
651 return Ok(false); }
653
654 let mut field_num: usize = 1;
656 let mut field_start: usize = 0;
657 let mut first_output = true;
658
659 for delim_pos in memchr_iter(delim, line) {
660 let selected = in_ranges(ranges, field_num) != complement;
661 if selected {
662 if !first_output {
663 out.write_all(output_delim)?;
664 }
665 out.write_all(&line[field_start..delim_pos])?;
666 first_output = false;
667 }
668 field_start = delim_pos + 1;
669 field_num += 1;
670 }
671
672 let selected = in_ranges(ranges, field_num) != complement;
674 if selected {
675 if !first_output {
676 out.write_all(output_delim)?;
677 }
678 out.write_all(&line[field_start..])?;
679 }
680
681 Ok(true)
682}
683
684#[inline]
687pub fn cut_bytes(
688 line: &[u8],
689 ranges: &[Range],
690 complement: bool,
691 output_delim: &[u8],
692 out: &mut impl Write,
693) -> io::Result<bool> {
694 let mut first_range = true;
695
696 if complement {
697 let len = line.len();
698 let mut comp_ranges = Vec::new();
699 let mut pos: usize = 1;
700 for r in ranges {
701 let rs = r.start;
702 let re = r.end.min(len);
703 if pos < rs {
704 comp_ranges.push((pos, rs - 1));
705 }
706 pos = re + 1;
707 if pos > len {
708 break;
709 }
710 }
711 if pos <= len {
712 comp_ranges.push((pos, len));
713 }
714 for &(s, e) in &comp_ranges {
715 if !first_range && !output_delim.is_empty() {
716 out.write_all(output_delim)?;
717 }
718 out.write_all(&line[s - 1..e])?;
719 first_range = false;
720 }
721 } else {
722 for r in ranges {
723 let start = r.start.saturating_sub(1);
724 let end = r.end.min(line.len());
725 if start >= line.len() {
726 break;
727 }
728 if !first_range && !output_delim.is_empty() {
729 out.write_all(output_delim)?;
730 }
731 out.write_all(&line[start..end])?;
732 first_range = false;
733 }
734 }
735 Ok(true)
736}
737
738pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
741 match cfg.mode {
742 CutMode::Fields => process_fields_fast(data, cfg, out),
743 CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
744 }
745}
746
747pub fn process_cut_reader<R: BufRead>(
749 mut reader: R,
750 cfg: &CutConfig,
751 out: &mut impl Write,
752) -> io::Result<()> {
753 let mut buf = Vec::new();
754
755 loop {
756 buf.clear();
757 let n = reader.read_until(cfg.line_delim, &mut buf)?;
758 if n == 0 {
759 break;
760 }
761
762 let has_line_delim = buf.last() == Some(&cfg.line_delim);
763 let line = if has_line_delim {
764 &buf[..buf.len() - 1]
765 } else {
766 &buf[..]
767 };
768
769 let wrote = process_one_line(line, cfg, out)?;
770
771 if wrote {
773 out.write_all(&[cfg.line_delim])?;
774 }
775 }
776
777 Ok(())
778}
779
780#[inline]
782fn process_one_line(line: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<bool> {
783 match cfg.mode {
784 CutMode::Fields => cut_fields(
785 line,
786 cfg.delim,
787 cfg.ranges,
788 cfg.complement,
789 cfg.output_delim,
790 cfg.suppress_no_delim,
791 out,
792 ),
793 CutMode::Bytes | CutMode::Characters => {
794 cut_bytes(line, cfg.ranges, cfg.complement, cfg.output_delim, out)
795 }
796 }
797}
798
/// What a position list refers to.
///
/// The processors in this file treat `Bytes` and `Characters` identically.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CutMode {
    /// Select by byte position.
    Bytes,
    /// Select by character position (handled the same way as `Bytes` here).
    Characters,
    /// Select delimiter-separated fields.
    Fields,
}