1use memchr::memchr_iter;
2use rayon::prelude::*;
3use std::io::{self, BufRead, Write};
4
/// Input size in bytes (1 MiB) at or above which the bulk processors split the
/// data into line-aligned chunks and process them with rayon.
const PARALLEL_THRESHOLD: usize = 1024 * 1024;
8
/// Settings shared by every cut operation in this module.
pub struct CutConfig<'a> {
    /// Whether selection is by fields or by bytes/characters.
    pub mode: CutMode,
    /// 1-based inclusive selections; the processors walk them in order and
    /// stop at the first one past the current position.
    pub ranges: &'a [Range],
    /// When true, the selection is inverted (everything *not* in `ranges`).
    pub complement: bool,
    /// Byte that separates fields on an input line (field mode).
    pub delim: u8,
    /// Bytes written between selected pieces of a line.
    pub output_delim: &'a [u8],
    /// Field mode only: when true, lines without `delim` produce no output.
    pub suppress_no_delim: bool,
    /// Byte that terminates each input line and each output line.
    pub line_delim: u8,
}
19
/// A 1-based, inclusive span of field or byte positions.
///
/// `parse_ranges` uses `usize::MAX` as `end` for an open-ended range (`N-`).
#[derive(Debug, Clone)]
pub struct Range {
    /// First selected position (1-based).
    pub start: usize,
    /// Last selected position (inclusive).
    pub end: usize,
}
26
27pub fn parse_ranges(spec: &str) -> Result<Vec<Range>, String> {
30 let mut ranges = Vec::new();
31
32 for part in spec.split(',') {
33 let part = part.trim();
34 if part.is_empty() {
35 continue;
36 }
37
38 if let Some(idx) = part.find('-') {
39 let left = &part[..idx];
40 let right = &part[idx + 1..];
41
42 let start = if left.is_empty() {
43 1
44 } else {
45 left.parse::<usize>()
46 .map_err(|_| format!("invalid range: '{}'", part))?
47 };
48
49 let end = if right.is_empty() {
50 usize::MAX
51 } else {
52 right
53 .parse::<usize>()
54 .map_err(|_| format!("invalid range: '{}'", part))?
55 };
56
57 if start == 0 {
58 return Err("fields and positions are numbered from 1".to_string());
59 }
60 if start > end {
61 return Err(format!("invalid decreasing range: '{}'", part));
62 }
63
64 ranges.push(Range { start, end });
65 } else {
66 let n = part
67 .parse::<usize>()
68 .map_err(|_| format!("invalid field: '{}'", part))?;
69 if n == 0 {
70 return Err("fields and positions are numbered from 1".to_string());
71 }
72 ranges.push(Range { start: n, end: n });
73 }
74 }
75
76 if ranges.is_empty() {
77 return Err("you must specify a list of bytes, characters, or fields".to_string());
78 }
79
80 ranges.sort_by_key(|r| (r.start, r.end));
82 let mut merged = vec![ranges[0].clone()];
83 for r in &ranges[1..] {
84 let last = merged.last_mut().unwrap();
85 if r.start <= last.end.saturating_add(1) {
86 last.end = last.end.max(r.end);
87 } else {
88 merged.push(r.clone());
89 }
90 }
91
92 Ok(merged)
93}
94
95#[inline(always)]
98fn in_ranges(ranges: &[Range], pos: usize) -> bool {
99 for r in ranges {
100 if pos < r.start {
101 return false; }
103 if pos <= r.end {
104 return true;
105 }
106 }
107 false
108}
109
110#[inline]
113fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
114 let mut mask: u64 = 0;
115 for i in 1..=64u32 {
116 let in_range = in_ranges(ranges, i as usize);
117 if in_range != complement {
118 mask |= 1u64 << (i - 1);
119 }
120 }
121 mask
122}
123
124#[inline(always)]
126fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
127 if field_num <= 64 {
128 (mask >> (field_num - 1)) & 1 == 1
129 } else {
130 in_ranges(ranges, field_num) != complement
131 }
132}
133
134fn split_into_chunks<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
139 let num_threads = rayon::current_num_threads().max(1);
140 if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
141 return vec![data];
142 }
143
144 let chunk_size = data.len() / num_threads;
145 let mut chunks = Vec::with_capacity(num_threads);
146 let mut pos = 0;
147
148 for _ in 0..num_threads - 1 {
149 let target = pos + chunk_size;
150 if target >= data.len() {
151 break;
152 }
153 let boundary = memchr::memchr(line_delim, &data[target..])
155 .map(|p| target + p + 1)
156 .unwrap_or(data.len());
157 if boundary > pos {
158 chunks.push(&data[pos..boundary]);
159 }
160 pos = boundary;
161 }
162
163 if pos < data.len() {
165 chunks.push(&data[pos..]);
166 }
167
168 chunks
169}
170
171fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
176 let delim = cfg.delim;
177 let line_delim = cfg.line_delim;
178 let ranges = cfg.ranges;
179 let complement = cfg.complement;
180 let output_delim = cfg.output_delim;
181 let suppress = cfg.suppress_no_delim;
182
183 if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
185 return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
186 }
187
188 let max_field = if complement {
190 usize::MAX
191 } else {
192 ranges.last().map(|r| r.end).unwrap_or(0)
193 };
194 let field_mask = compute_field_mask(ranges, complement);
195
196 if data.len() >= PARALLEL_THRESHOLD {
197 let chunks = split_into_chunks(data, line_delim);
199 let results: Vec<Vec<u8>> = chunks
200 .par_iter()
201 .map(|chunk| {
202 let mut buf = Vec::with_capacity(chunk.len() / 2);
203 process_fields_chunk(
204 chunk,
205 delim,
206 ranges,
207 output_delim,
208 suppress,
209 max_field,
210 field_mask,
211 line_delim,
212 complement,
213 &mut buf,
214 );
215 buf
216 })
217 .collect();
218 for result in &results {
219 if !result.is_empty() {
220 out.write_all(result)?;
221 }
222 }
223 } else {
224 let mut buf = Vec::with_capacity(data.len() / 2);
226 process_fields_chunk(
227 data,
228 delim,
229 ranges,
230 output_delim,
231 suppress,
232 max_field,
233 field_mask,
234 line_delim,
235 complement,
236 &mut buf,
237 );
238 if !buf.is_empty() {
239 out.write_all(&buf)?;
240 }
241 }
242 Ok(())
243}
244
245fn process_fields_chunk(
247 data: &[u8],
248 delim: u8,
249 ranges: &[Range],
250 output_delim: &[u8],
251 suppress: bool,
252 max_field: usize,
253 field_mask: u64,
254 line_delim: u8,
255 complement: bool,
256 buf: &mut Vec<u8>,
257) {
258 let mut start = 0;
259 for end_pos in memchr_iter(line_delim, data) {
260 let line = &data[start..end_pos];
261 extract_fields_to_buf(
262 line,
263 delim,
264 ranges,
265 output_delim,
266 suppress,
267 max_field,
268 field_mask,
269 line_delim,
270 buf,
271 complement,
272 );
273 start = end_pos + 1;
274 }
275 if start < data.len() {
276 extract_fields_to_buf(
277 &data[start..],
278 delim,
279 ranges,
280 output_delim,
281 suppress,
282 max_field,
283 field_mask,
284 line_delim,
285 buf,
286 complement,
287 );
288 }
289}
290
291fn process_single_field(
297 data: &[u8],
298 delim: u8,
299 line_delim: u8,
300 target: usize,
301 suppress: bool,
302 out: &mut impl Write,
303) -> io::Result<()> {
304 let target_idx = target - 1;
305
306 if data.len() >= PARALLEL_THRESHOLD {
307 let chunks = split_into_chunks(data, line_delim);
309 let results: Vec<Vec<u8>> = chunks
310 .par_iter()
311 .map(|chunk| {
312 let mut buf = Vec::with_capacity(chunk.len() / 4);
313 process_single_field_chunk(
314 chunk, delim, target_idx, line_delim, suppress, &mut buf,
315 );
316 buf
317 })
318 .collect();
319 for result in &results {
320 if !result.is_empty() {
321 out.write_all(result)?;
322 }
323 }
324 } else {
325 let mut buf = Vec::with_capacity(data.len() / 4);
327 process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
328 if !buf.is_empty() {
329 out.write_all(&buf)?;
330 }
331 }
332 Ok(())
333}
334
335fn process_single_field_chunk(
337 data: &[u8],
338 delim: u8,
339 target_idx: usize,
340 line_delim: u8,
341 suppress: bool,
342 buf: &mut Vec<u8>,
343) {
344 let mut start = 0;
345 for end_pos in memchr_iter(line_delim, data) {
346 let line = &data[start..end_pos];
347 extract_single_field_line(line, delim, target_idx, line_delim, suppress, buf);
348 start = end_pos + 1;
349 }
350 if start < data.len() {
352 extract_single_field_line(&data[start..], delim, target_idx, line_delim, suppress, buf);
353 }
354}
355
/// Appends the field at zero-based index `target_idx` of `line` to `buf`,
/// followed by `line_delim`.
///
/// * A line with no `delim` (including an empty line) is copied whole, or
///   dropped entirely when `suppress` is set.
/// * A line that has delimiters but fewer fields than requested yields just
///   `line_delim`.
#[inline(always)]
fn extract_single_field_line(
    line: &[u8],
    delim: u8,
    target_idx: usize,
    line_delim: u8,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    if line.is_empty() {
        if !suppress {
            buf.push(line_delim);
        }
        return;
    }

    let mut delims_seen: usize = 0;
    let mut field_begin: usize = 0;

    for (pos, &byte) in line.iter().enumerate() {
        if byte != delim {
            continue;
        }
        // This delimiter closes the wanted field: emit it and stop scanning.
        if delims_seen == target_idx {
            buf.extend_from_slice(&line[field_begin..pos]);
            buf.push(line_delim);
            return;
        }
        delims_seen += 1;
        field_begin = pos + 1;
    }

    match delims_seen {
        // No delimiter anywhere on the line.
        0 => {
            if !suppress {
                buf.extend_from_slice(line);
                buf.push(line_delim);
            }
        }
        // The wanted field is the last one, which has no closing delimiter.
        seen if seen == target_idx => {
            buf.extend_from_slice(&line[field_begin..]);
            buf.push(line_delim);
        }
        // The line has fewer fields than requested.
        _ => buf.push(line_delim),
    }
}
406
407#[inline(always)]
410fn extract_fields_to_buf(
411 line: &[u8],
412 delim: u8,
413 ranges: &[Range],
414 output_delim: &[u8],
415 suppress: bool,
416 max_field: usize,
417 field_mask: u64,
418 line_delim: u8,
419 buf: &mut Vec<u8>,
420 complement: bool,
421) {
422 let len = line.len();
423
424 if len == 0 {
426 if !suppress {
427 buf.push(line_delim);
428 }
429 return;
430 }
431
432 let mut field_num: usize = 1;
433 let mut field_start: usize = 0;
434 let mut first_output = true;
435 let mut has_delim = false;
436
437 let mut i = 0;
439 while i < len {
440 if unsafe { *line.get_unchecked(i) } == delim {
442 has_delim = true;
443
444 if is_selected(field_num, field_mask, ranges, complement) {
445 if !first_output {
446 buf.extend_from_slice(output_delim);
447 }
448 buf.extend_from_slice(&line[field_start..i]);
449 first_output = false;
450 }
451
452 field_num += 1;
453 field_start = i + 1;
454
455 if field_num > max_field {
457 break;
458 }
459 }
460 i += 1;
461 }
462
463 if (field_num <= max_field || complement)
465 && has_delim
466 && is_selected(field_num, field_mask, ranges, complement)
467 {
468 if !first_output {
469 buf.extend_from_slice(output_delim);
470 }
471 buf.extend_from_slice(&line[field_start..len]);
472 first_output = false;
473 }
474
475 if !first_output {
477 buf.push(line_delim);
479 } else if !has_delim {
480 if !suppress {
482 buf.extend_from_slice(line);
483 buf.push(line_delim);
484 }
485 } else {
486 buf.push(line_delim);
488 }
489}
490
491fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
495 let line_delim = cfg.line_delim;
496 let ranges = cfg.ranges;
497 let complement = cfg.complement;
498 let output_delim = cfg.output_delim;
499
500 if data.len() >= PARALLEL_THRESHOLD {
501 let chunks = split_into_chunks(data, line_delim);
502 let results: Vec<Vec<u8>> = chunks
503 .par_iter()
504 .map(|chunk| {
505 let mut buf = Vec::with_capacity(chunk.len() / 2);
506 process_bytes_chunk(
507 chunk,
508 ranges,
509 complement,
510 output_delim,
511 line_delim,
512 &mut buf,
513 );
514 buf
515 })
516 .collect();
517 for result in &results {
518 if !result.is_empty() {
519 out.write_all(result)?;
520 }
521 }
522 } else {
523 let mut buf = Vec::with_capacity(data.len() / 2);
524 process_bytes_chunk(data, ranges, complement, output_delim, line_delim, &mut buf);
525 if !buf.is_empty() {
526 out.write_all(&buf)?;
527 }
528 }
529 Ok(())
530}
531
532fn process_bytes_chunk(
534 data: &[u8],
535 ranges: &[Range],
536 complement: bool,
537 output_delim: &[u8],
538 line_delim: u8,
539 buf: &mut Vec<u8>,
540) {
541 let mut start = 0;
542 for end_pos in memchr_iter(line_delim, data) {
543 let line = &data[start..end_pos];
544 cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
545 buf.push(line_delim);
546 start = end_pos + 1;
547 }
548 if start < data.len() {
549 cut_bytes_to_buf(&data[start..], ranges, complement, output_delim, buf);
550 buf.push(line_delim);
551 }
552}
553
554#[inline(always)]
556fn cut_bytes_to_buf(
557 line: &[u8],
558 ranges: &[Range],
559 complement: bool,
560 output_delim: &[u8],
561 buf: &mut Vec<u8>,
562) {
563 let mut first_range = true;
564
565 if complement {
566 let len = line.len();
567 let mut pos: usize = 1;
568 for r in ranges {
569 let rs = r.start;
570 let re = r.end.min(len);
571 if pos < rs {
572 if !first_range && !output_delim.is_empty() {
573 buf.extend_from_slice(output_delim);
574 }
575 buf.extend_from_slice(&line[pos - 1..rs - 1]);
576 first_range = false;
577 }
578 pos = re + 1;
579 if pos > len {
580 break;
581 }
582 }
583 if pos <= len {
584 if !first_range && !output_delim.is_empty() {
585 buf.extend_from_slice(output_delim);
586 }
587 buf.extend_from_slice(&line[pos - 1..len]);
588 }
589 } else {
590 for r in ranges {
591 let start = r.start.saturating_sub(1);
592 let end = r.end.min(line.len());
593 if start >= line.len() {
594 break;
595 }
596 if !first_range && !output_delim.is_empty() {
597 buf.extend_from_slice(output_delim);
598 }
599 buf.extend_from_slice(&line[start..end]);
600 first_range = false;
601 }
602 }
603}
604
605#[inline]
611pub fn cut_fields(
612 line: &[u8],
613 delim: u8,
614 ranges: &[Range],
615 complement: bool,
616 output_delim: &[u8],
617 suppress_no_delim: bool,
618 out: &mut impl Write,
619) -> io::Result<bool> {
620 if memchr::memchr(delim, line).is_none() {
622 if !suppress_no_delim {
623 out.write_all(line)?;
624 return Ok(true);
625 }
626 return Ok(false); }
628
629 let mut field_num: usize = 1;
631 let mut field_start: usize = 0;
632 let mut first_output = true;
633
634 for delim_pos in memchr_iter(delim, line) {
635 let selected = in_ranges(ranges, field_num) != complement;
636 if selected {
637 if !first_output {
638 out.write_all(output_delim)?;
639 }
640 out.write_all(&line[field_start..delim_pos])?;
641 first_output = false;
642 }
643 field_start = delim_pos + 1;
644 field_num += 1;
645 }
646
647 let selected = in_ranges(ranges, field_num) != complement;
649 if selected {
650 if !first_output {
651 out.write_all(output_delim)?;
652 }
653 out.write_all(&line[field_start..])?;
654 }
655
656 Ok(true)
657}
658
659#[inline]
662pub fn cut_bytes(
663 line: &[u8],
664 ranges: &[Range],
665 complement: bool,
666 output_delim: &[u8],
667 out: &mut impl Write,
668) -> io::Result<bool> {
669 let mut first_range = true;
670
671 if complement {
672 let len = line.len();
673 let mut comp_ranges = Vec::new();
674 let mut pos: usize = 1;
675 for r in ranges {
676 let rs = r.start;
677 let re = r.end.min(len);
678 if pos < rs {
679 comp_ranges.push((pos, rs - 1));
680 }
681 pos = re + 1;
682 if pos > len {
683 break;
684 }
685 }
686 if pos <= len {
687 comp_ranges.push((pos, len));
688 }
689 for &(s, e) in &comp_ranges {
690 if !first_range && !output_delim.is_empty() {
691 out.write_all(output_delim)?;
692 }
693 out.write_all(&line[s - 1..e])?;
694 first_range = false;
695 }
696 } else {
697 for r in ranges {
698 let start = r.start.saturating_sub(1);
699 let end = r.end.min(line.len());
700 if start >= line.len() {
701 break;
702 }
703 if !first_range && !output_delim.is_empty() {
704 out.write_all(output_delim)?;
705 }
706 out.write_all(&line[start..end])?;
707 first_range = false;
708 }
709 }
710 Ok(true)
711}
712
713pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
716 match cfg.mode {
717 CutMode::Fields => process_fields_fast(data, cfg, out),
718 CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
719 }
720}
721
722pub fn process_cut_reader<R: BufRead>(
724 mut reader: R,
725 cfg: &CutConfig,
726 out: &mut impl Write,
727) -> io::Result<()> {
728 let mut buf = Vec::new();
729
730 loop {
731 buf.clear();
732 let n = reader.read_until(cfg.line_delim, &mut buf)?;
733 if n == 0 {
734 break;
735 }
736
737 let has_line_delim = buf.last() == Some(&cfg.line_delim);
738 let line = if has_line_delim {
739 &buf[..buf.len() - 1]
740 } else {
741 &buf[..]
742 };
743
744 let wrote = process_one_line(line, cfg, out)?;
745
746 if wrote {
748 out.write_all(&[cfg.line_delim])?;
749 }
750 }
751
752 Ok(())
753}
754
755#[inline]
757fn process_one_line(line: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<bool> {
758 match cfg.mode {
759 CutMode::Fields => cut_fields(
760 line,
761 cfg.delim,
762 cfg.ranges,
763 cfg.complement,
764 cfg.output_delim,
765 cfg.suppress_no_delim,
766 out,
767 ),
768 CutMode::Bytes | CutMode::Characters => {
769 cut_bytes(line, cfg.ranges, cfg.complement, cfg.output_delim, out)
770 }
771 }
772}
773
/// What unit a cut operation selects.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum CutMode {
    /// Select by byte position.
    Bytes,
    /// Select by character position; this module processes it exactly like `Bytes`.
    Characters,
    /// Select delimiter-separated fields.
    Fields,
}