1use memchr::memchr_iter;
2use rayon::prelude::*;
3use std::io::{self, BufRead, Write};
4
/// Input size in bytes (1 MiB) at or above which the bulk processors split the data
/// into line-aligned chunks and process them in parallel.
const PARALLEL_THRESHOLD: usize = 1024 * 1024;
9
/// Settings shared by the cut entry points in this file.
pub struct CutConfig<'a> {
    /// Selects field-based cutting or byte/character-position cutting.
    pub mode: CutMode,
    /// Selected ranges (1-based, inclusive). The range scans in this file stop at the
    /// first range that starts past the queried position, so they expect the sorted,
    /// non-overlapping order that `parse_ranges` produces.
    pub ranges: &'a [Range],
    /// When true, everything NOT covered by `ranges` is selected instead.
    pub complement: bool,
    /// Input field delimiter (used in field mode).
    pub delim: u8,
    /// Bytes written between selected fields; in byte mode, written between selected
    /// ranges only when non-empty.
    pub output_delim: &'a [u8],
    /// In field mode, drop lines that contain no `delim` instead of echoing them.
    pub suppress_no_delim: bool,
    /// Record terminator used to split the input and to terminate each output line.
    pub line_delim: u8,
}
20
/// An inclusive span of 1-based field or byte positions.
#[derive(Debug, Clone)]
pub struct Range {
    /// First selected position (1-based).
    pub start: usize,
    /// Last selected position (inclusive); `parse_ranges` stores `usize::MAX` here for
    /// an open-ended `N-` range.
    pub end: usize,
}
27
28pub fn parse_ranges(spec: &str) -> Result<Vec<Range>, String> {
31 let mut ranges = Vec::new();
32
33 for part in spec.split(',') {
34 let part = part.trim();
35 if part.is_empty() {
36 continue;
37 }
38
39 if let Some(idx) = part.find('-') {
40 let left = &part[..idx];
41 let right = &part[idx + 1..];
42
43 let start = if left.is_empty() {
44 1
45 } else {
46 left.parse::<usize>()
47 .map_err(|_| format!("invalid range: '{}'", part))?
48 };
49
50 let end = if right.is_empty() {
51 usize::MAX
52 } else {
53 right
54 .parse::<usize>()
55 .map_err(|_| format!("invalid range: '{}'", part))?
56 };
57
58 if start == 0 {
59 return Err("fields and positions are numbered from 1".to_string());
60 }
61 if start > end {
62 return Err(format!("invalid decreasing range: '{}'", part));
63 }
64
65 ranges.push(Range { start, end });
66 } else {
67 let n = part
68 .parse::<usize>()
69 .map_err(|_| format!("invalid field: '{}'", part))?;
70 if n == 0 {
71 return Err("fields and positions are numbered from 1".to_string());
72 }
73 ranges.push(Range { start: n, end: n });
74 }
75 }
76
77 if ranges.is_empty() {
78 return Err("you must specify a list of bytes, characters, or fields".to_string());
79 }
80
81 ranges.sort_by_key(|r| (r.start, r.end));
83 let mut merged = vec![ranges[0].clone()];
84 for r in &ranges[1..] {
85 let last = merged.last_mut().unwrap();
86 if r.start <= last.end.saturating_add(1) {
87 last.end = last.end.max(r.end);
88 } else {
89 merged.push(r.clone());
90 }
91 }
92
93 Ok(merged)
94}
95
96#[inline(always)]
99fn in_ranges(ranges: &[Range], pos: usize) -> bool {
100 for r in ranges {
101 if pos < r.start {
102 return false; }
104 if pos <= r.end {
105 return true;
106 }
107 }
108 false
109}
110
111#[inline]
114fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
115 let mut mask: u64 = 0;
116 for i in 1..=64u32 {
117 let in_range = in_ranges(ranges, i as usize);
118 if in_range != complement {
119 mask |= 1u64 << (i - 1);
120 }
121 }
122 mask
123}
124
125#[inline(always)]
127fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
128 if field_num <= 64 {
129 (mask >> (field_num - 1)) & 1 == 1
130 } else {
131 in_ranges(ranges, field_num) != complement
132 }
133}
134
135fn split_into_chunks<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
140 let num_threads = rayon::current_num_threads().max(1);
141 if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
142 return vec![data];
143 }
144
145 let chunk_size = data.len() / num_threads;
146 let mut chunks = Vec::with_capacity(num_threads);
147 let mut pos = 0;
148
149 for _ in 0..num_threads - 1 {
150 let target = pos + chunk_size;
151 if target >= data.len() {
152 break;
153 }
154 let boundary = memchr::memchr(line_delim, &data[target..])
156 .map(|p| target + p + 1)
157 .unwrap_or(data.len());
158 if boundary > pos {
159 chunks.push(&data[pos..boundary]);
160 }
161 pos = boundary;
162 }
163
164 if pos < data.len() {
166 chunks.push(&data[pos..]);
167 }
168
169 chunks
170}
171
172fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
177 let delim = cfg.delim;
178 let line_delim = cfg.line_delim;
179 let ranges = cfg.ranges;
180 let complement = cfg.complement;
181 let output_delim = cfg.output_delim;
182 let suppress = cfg.suppress_no_delim;
183
184 if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
186 return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
187 }
188
189 let max_field = if complement {
191 usize::MAX
192 } else {
193 ranges.last().map(|r| r.end).unwrap_or(0)
194 };
195 let field_mask = compute_field_mask(ranges, complement);
196
197 if data.len() >= PARALLEL_THRESHOLD {
198 let chunks = split_into_chunks(data, line_delim);
200 let results: Vec<Vec<u8>> = chunks
201 .par_iter()
202 .map(|chunk| {
203 let mut buf = Vec::with_capacity(chunk.len() / 2);
204 process_fields_chunk(
205 chunk,
206 delim,
207 ranges,
208 output_delim,
209 suppress,
210 max_field,
211 field_mask,
212 line_delim,
213 complement,
214 &mut buf,
215 );
216 buf
217 })
218 .collect();
219 for result in &results {
220 if !result.is_empty() {
221 out.write_all(result)?;
222 }
223 }
224 } else {
225 let mut buf = Vec::with_capacity(data.len() / 2);
227 process_fields_chunk(
228 data,
229 delim,
230 ranges,
231 output_delim,
232 suppress,
233 max_field,
234 field_mask,
235 line_delim,
236 complement,
237 &mut buf,
238 );
239 if !buf.is_empty() {
240 out.write_all(&buf)?;
241 }
242 }
243 Ok(())
244}
245
246fn process_fields_chunk(
248 data: &[u8],
249 delim: u8,
250 ranges: &[Range],
251 output_delim: &[u8],
252 suppress: bool,
253 max_field: usize,
254 field_mask: u64,
255 line_delim: u8,
256 complement: bool,
257 buf: &mut Vec<u8>,
258) {
259 let mut start = 0;
260 for end_pos in memchr_iter(line_delim, data) {
261 let line = &data[start..end_pos];
262 extract_fields_to_buf(
263 line,
264 delim,
265 ranges,
266 output_delim,
267 suppress,
268 max_field,
269 field_mask,
270 line_delim,
271 buf,
272 complement,
273 );
274 start = end_pos + 1;
275 }
276 if start < data.len() {
277 extract_fields_to_buf(
278 &data[start..],
279 delim,
280 ranges,
281 output_delim,
282 suppress,
283 max_field,
284 field_mask,
285 line_delim,
286 buf,
287 complement,
288 );
289 }
290}
291
292fn process_single_field(
298 data: &[u8],
299 delim: u8,
300 line_delim: u8,
301 target: usize,
302 suppress: bool,
303 out: &mut impl Write,
304) -> io::Result<()> {
305 let target_idx = target - 1;
306
307 if data.len() >= PARALLEL_THRESHOLD {
308 let chunks = split_into_chunks(data, line_delim);
310 let results: Vec<Vec<u8>> = chunks
311 .par_iter()
312 .map(|chunk| {
313 let mut buf = Vec::with_capacity(chunk.len() / 4);
314 process_single_field_chunk(
315 chunk, delim, target_idx, line_delim, suppress, &mut buf,
316 );
317 buf
318 })
319 .collect();
320 for result in &results {
321 if !result.is_empty() {
322 out.write_all(result)?;
323 }
324 }
325 } else {
326 let mut buf = Vec::with_capacity(data.len() / 4);
328 process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
329 if !buf.is_empty() {
330 out.write_all(&buf)?;
331 }
332 }
333 Ok(())
334}
335
336fn process_single_field_chunk(
338 data: &[u8],
339 delim: u8,
340 target_idx: usize,
341 line_delim: u8,
342 suppress: bool,
343 buf: &mut Vec<u8>,
344) {
345 let mut start = 0;
346 for end_pos in memchr_iter(line_delim, data) {
347 let line = &data[start..end_pos];
348 extract_single_field_line(line, delim, target_idx, line_delim, suppress, buf);
349 start = end_pos + 1;
350 }
351 if start < data.len() {
353 extract_single_field_line(&data[start..], delim, target_idx, line_delim, suppress, buf);
354 }
355}
356
357#[inline(always)]
360fn extract_single_field_line(
361 line: &[u8],
362 delim: u8,
363 target_idx: usize,
364 line_delim: u8,
365 suppress: bool,
366 buf: &mut Vec<u8>,
367) {
368 if line.is_empty() {
369 if !suppress {
370 buf.push(line_delim);
371 }
372 return;
373 }
374
375 let mut field_start = 0;
376 let mut field_idx = 0;
377 let mut has_delim = false;
378
379 for pos in memchr_iter(delim, line) {
380 has_delim = true;
381 if field_idx == target_idx {
382 buf.extend_from_slice(&line[field_start..pos]);
384 buf.push(line_delim);
385 return;
386 }
387 field_idx += 1;
388 field_start = pos + 1;
389 }
390
391 if !has_delim {
392 if !suppress {
394 buf.extend_from_slice(line);
395 buf.push(line_delim);
396 }
397 return;
398 }
399
400 if field_idx == target_idx {
401 buf.extend_from_slice(&line[field_start..]);
403 buf.push(line_delim);
404 } else {
405 buf.push(line_delim);
407 }
408}
409
410#[inline(always)]
413fn extract_fields_to_buf(
414 line: &[u8],
415 delim: u8,
416 ranges: &[Range],
417 output_delim: &[u8],
418 suppress: bool,
419 max_field: usize,
420 field_mask: u64,
421 line_delim: u8,
422 buf: &mut Vec<u8>,
423 complement: bool,
424) {
425 let len = line.len();
426
427 if len == 0 {
429 if !suppress {
430 buf.push(line_delim);
431 }
432 return;
433 }
434
435 let mut field_num: usize = 1;
436 let mut field_start: usize = 0;
437 let mut first_output = true;
438 let mut has_delim = false;
439
440 for delim_pos in memchr_iter(delim, line) {
442 has_delim = true;
443
444 if is_selected(field_num, field_mask, ranges, complement) {
445 if !first_output {
446 buf.extend_from_slice(output_delim);
447 }
448 buf.extend_from_slice(&line[field_start..delim_pos]);
449 first_output = false;
450 }
451
452 field_num += 1;
453 field_start = delim_pos + 1;
454
455 if field_num > max_field {
457 break;
458 }
459 }
460
461 if (field_num <= max_field || complement)
463 && has_delim
464 && is_selected(field_num, field_mask, ranges, complement)
465 {
466 if !first_output {
467 buf.extend_from_slice(output_delim);
468 }
469 buf.extend_from_slice(&line[field_start..len]);
470 first_output = false;
471 }
472
473 if !first_output {
475 buf.push(line_delim);
477 } else if !has_delim {
478 if !suppress {
480 buf.extend_from_slice(line);
481 buf.push(line_delim);
482 }
483 } else {
484 buf.push(line_delim);
486 }
487}
488
489fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
493 let line_delim = cfg.line_delim;
494 let ranges = cfg.ranges;
495 let complement = cfg.complement;
496 let output_delim = cfg.output_delim;
497
498 if data.len() >= PARALLEL_THRESHOLD {
499 let chunks = split_into_chunks(data, line_delim);
500 let results: Vec<Vec<u8>> = chunks
501 .par_iter()
502 .map(|chunk| {
503 let mut buf = Vec::with_capacity(chunk.len() / 2);
504 process_bytes_chunk(
505 chunk,
506 ranges,
507 complement,
508 output_delim,
509 line_delim,
510 &mut buf,
511 );
512 buf
513 })
514 .collect();
515 for result in &results {
516 if !result.is_empty() {
517 out.write_all(result)?;
518 }
519 }
520 } else {
521 let mut buf = Vec::with_capacity(data.len() / 2);
522 process_bytes_chunk(data, ranges, complement, output_delim, line_delim, &mut buf);
523 if !buf.is_empty() {
524 out.write_all(&buf)?;
525 }
526 }
527 Ok(())
528}
529
530fn process_bytes_chunk(
532 data: &[u8],
533 ranges: &[Range],
534 complement: bool,
535 output_delim: &[u8],
536 line_delim: u8,
537 buf: &mut Vec<u8>,
538) {
539 let mut start = 0;
540 for end_pos in memchr_iter(line_delim, data) {
541 let line = &data[start..end_pos];
542 cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
543 buf.push(line_delim);
544 start = end_pos + 1;
545 }
546 if start < data.len() {
547 cut_bytes_to_buf(&data[start..], ranges, complement, output_delim, buf);
548 buf.push(line_delim);
549 }
550}
551
552#[inline(always)]
555fn cut_bytes_to_buf(
556 line: &[u8],
557 ranges: &[Range],
558 complement: bool,
559 output_delim: &[u8],
560 buf: &mut Vec<u8>,
561) {
562 let len = line.len();
563 let mut first_range = true;
564
565 if complement {
566 let mut pos: usize = 1;
567 for r in ranges {
568 let rs = r.start;
569 let re = r.end.min(len);
570 if pos < rs {
571 if !first_range && !output_delim.is_empty() {
572 buf.extend_from_slice(output_delim);
573 }
574 buf.extend_from_slice(&line[pos - 1..rs - 1]);
575 first_range = false;
576 }
577 pos = re + 1;
578 if pos > len {
579 break;
580 }
581 }
582 if pos <= len {
583 if !first_range && !output_delim.is_empty() {
584 buf.extend_from_slice(output_delim);
585 }
586 buf.extend_from_slice(&line[pos - 1..len]);
587 }
588 } else if output_delim.is_empty() && ranges.len() == 1 {
589 let start = ranges[0].start.saturating_sub(1);
591 let end = ranges[0].end.min(len);
592 if start < len {
593 buf.extend_from_slice(&line[start..end]);
594 }
595 } else {
596 for r in ranges {
597 let start = r.start.saturating_sub(1);
598 let end = r.end.min(len);
599 if start >= len {
600 break;
601 }
602 if !first_range && !output_delim.is_empty() {
603 buf.extend_from_slice(output_delim);
604 }
605 buf.extend_from_slice(&line[start..end]);
606 first_range = false;
607 }
608 }
609}
610
611#[inline]
617pub fn cut_fields(
618 line: &[u8],
619 delim: u8,
620 ranges: &[Range],
621 complement: bool,
622 output_delim: &[u8],
623 suppress_no_delim: bool,
624 out: &mut impl Write,
625) -> io::Result<bool> {
626 if memchr::memchr(delim, line).is_none() {
628 if !suppress_no_delim {
629 out.write_all(line)?;
630 return Ok(true);
631 }
632 return Ok(false); }
634
635 let mut field_num: usize = 1;
637 let mut field_start: usize = 0;
638 let mut first_output = true;
639
640 for delim_pos in memchr_iter(delim, line) {
641 let selected = in_ranges(ranges, field_num) != complement;
642 if selected {
643 if !first_output {
644 out.write_all(output_delim)?;
645 }
646 out.write_all(&line[field_start..delim_pos])?;
647 first_output = false;
648 }
649 field_start = delim_pos + 1;
650 field_num += 1;
651 }
652
653 let selected = in_ranges(ranges, field_num) != complement;
655 if selected {
656 if !first_output {
657 out.write_all(output_delim)?;
658 }
659 out.write_all(&line[field_start..])?;
660 }
661
662 Ok(true)
663}
664
665#[inline]
668pub fn cut_bytes(
669 line: &[u8],
670 ranges: &[Range],
671 complement: bool,
672 output_delim: &[u8],
673 out: &mut impl Write,
674) -> io::Result<bool> {
675 let mut first_range = true;
676
677 if complement {
678 let len = line.len();
679 let mut comp_ranges = Vec::new();
680 let mut pos: usize = 1;
681 for r in ranges {
682 let rs = r.start;
683 let re = r.end.min(len);
684 if pos < rs {
685 comp_ranges.push((pos, rs - 1));
686 }
687 pos = re + 1;
688 if pos > len {
689 break;
690 }
691 }
692 if pos <= len {
693 comp_ranges.push((pos, len));
694 }
695 for &(s, e) in &comp_ranges {
696 if !first_range && !output_delim.is_empty() {
697 out.write_all(output_delim)?;
698 }
699 out.write_all(&line[s - 1..e])?;
700 first_range = false;
701 }
702 } else {
703 for r in ranges {
704 let start = r.start.saturating_sub(1);
705 let end = r.end.min(line.len());
706 if start >= line.len() {
707 break;
708 }
709 if !first_range && !output_delim.is_empty() {
710 out.write_all(output_delim)?;
711 }
712 out.write_all(&line[start..end])?;
713 first_range = false;
714 }
715 }
716 Ok(true)
717}
718
719pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
722 match cfg.mode {
723 CutMode::Fields => process_fields_fast(data, cfg, out),
724 CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
725 }
726}
727
728pub fn process_cut_reader<R: BufRead>(
730 mut reader: R,
731 cfg: &CutConfig,
732 out: &mut impl Write,
733) -> io::Result<()> {
734 let mut buf = Vec::new();
735
736 loop {
737 buf.clear();
738 let n = reader.read_until(cfg.line_delim, &mut buf)?;
739 if n == 0 {
740 break;
741 }
742
743 let has_line_delim = buf.last() == Some(&cfg.line_delim);
744 let line = if has_line_delim {
745 &buf[..buf.len() - 1]
746 } else {
747 &buf[..]
748 };
749
750 let wrote = process_one_line(line, cfg, out)?;
751
752 if wrote {
754 out.write_all(&[cfg.line_delim])?;
755 }
756 }
757
758 Ok(())
759}
760
761#[inline]
763fn process_one_line(line: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<bool> {
764 match cfg.mode {
765 CutMode::Fields => cut_fields(
766 line,
767 cfg.delim,
768 cfg.ranges,
769 cfg.complement,
770 cfg.output_delim,
771 cfg.suppress_no_delim,
772 out,
773 ),
774 CutMode::Bytes | CutMode::Characters => {
775 cut_bytes(line, cfg.ranges, cfg.complement, cfg.output_delim, out)
776 }
777 }
778}
779
/// What kind of unit the selected ranges refer to.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum CutMode {
    /// Ranges are byte positions within each line.
    Bytes,
    /// Handled by the same byte-position code paths as `Bytes` in this file.
    Characters,
    /// Ranges are delimiter-separated field numbers.
    Fields,
}