1use std::io::{self, BufRead, BufReader, BufWriter, Read, Write};
2
/// Writes the whole of `buf` to `writer`, propagating any I/O error.
///
/// Thin forwarding helper around [`Write::write_all`].
#[inline]
fn write_all_raw(writer: &mut impl Write, buf: &[u8]) -> io::Result<()> {
    Write::write_all(writer, buf)
}
8
/// How groups are delimited when every line of each repeated group is printed.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum AllRepeatedMethod {
    /// No delimiter is written between groups.
    None,
    /// A terminator byte is written before every printed group.
    Prepend,
    /// A terminator byte is written before every printed group except the first.
    Separate,
}
16
/// Where delimiter lines are placed when all lines are printed in groups.
///
/// In every variant a terminator byte is written between adjacent groups.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum GroupMethod {
    /// Delimiters only between groups.
    Separate,
    /// Additionally write a delimiter before the first group.
    Prepend,
    /// Additionally write a delimiter after the last group.
    Append,
    /// Additionally write a delimiter before the first and after the last group.
    Both,
}
25
26#[derive(Debug, Clone, Copy, PartialEq, Eq)]
28pub enum OutputMode {
29 Default,
31 RepeatedOnly,
33 AllRepeated(AllRepeatedMethod),
35 UniqueOnly,
37 Group(GroupMethod),
39}
40
41#[derive(Debug, Clone)]
43pub struct UniqConfig {
44 pub mode: OutputMode,
45 pub count: bool,
46 pub ignore_case: bool,
47 pub skip_fields: usize,
48 pub skip_chars: usize,
49 pub check_chars: Option<usize>,
50 pub zero_terminated: bool,
51}
52
53impl Default for UniqConfig {
54 fn default() -> Self {
55 Self {
56 mode: OutputMode::Default,
57 count: false,
58 ignore_case: false,
59 skip_fields: 0,
60 skip_chars: 0,
61 check_chars: None,
62 zero_terminated: false,
63 }
64 }
65}
66
67#[inline(always)]
70fn get_compare_slice<'a>(line: &'a [u8], config: &UniqConfig) -> &'a [u8] {
71 let mut start = 0;
72 let len = line.len();
73
74 for _ in 0..config.skip_fields {
76 while start < len && (line[start] == b' ' || line[start] == b'\t') {
78 start += 1;
79 }
80 while start < len && line[start] != b' ' && line[start] != b'\t' {
82 start += 1;
83 }
84 }
85
86 if config.skip_chars > 0 {
88 let remaining = len - start;
89 let skip = config.skip_chars.min(remaining);
90 start += skip;
91 }
92
93 let slice = &line[start..];
94
95 if let Some(w) = config.check_chars {
97 if w < slice.len() {
98 return &slice[..w];
99 }
100 }
101
102 slice
103}
104
105#[inline(always)]
107fn lines_equal(a: &[u8], b: &[u8], config: &UniqConfig) -> bool {
108 let sa = get_compare_slice(a, config);
109 let sb = get_compare_slice(b, config);
110
111 if config.ignore_case {
112 sa.eq_ignore_ascii_case(sb)
113 } else {
114 sa == sb
115 }
116}
117
118#[inline(always)]
120fn needs_key_extraction(config: &UniqConfig) -> bool {
121 config.skip_fields > 0 || config.skip_chars > 0 || config.check_chars.is_some()
122}
123
/// Exact byte-for-byte equality of two lines.
///
/// The length comparison comes first so that lines of different length are
/// rejected before any content is examined.
#[inline(always)]
fn lines_equal_fast(a: &[u8], b: &[u8]) -> bool {
    a.len() == b.len() && a == b
}
135
136#[inline(always)]
140fn write_count_line(out: &mut impl Write, count: u64, line: &[u8], term: u8) -> io::Result<()> {
141 let mut prefix = [b' '; 28]; let digits = itoa_right_aligned_into(&mut prefix, count);
144 let width = digits.max(7); let prefix_len = width + 1; prefix[width] = b' ';
147 out.write_all(&prefix[..prefix_len])?;
149 out.write_all(line)?;
150 out.write_all(&[term])?;
151 Ok(())
152}
153
/// Formats `val` in decimal at the start of `buf`, right-aligned in a column
/// of at least seven bytes, and returns the column width.
///
/// After the call `buf[..width]` holds the number, left-padded with spaces
/// when it has fewer than seven digits. The padding is written here, so the
/// result does not depend on the previous contents of `buf`. Bytes at or
/// beyond `width` are scratch space and may be overwritten.
///
/// A `u64` has at most 20 decimal digits, so the 28-byte buffer always suffices.
#[inline(always)]
fn itoa_right_aligned_into(buf: &mut [u8; 28], mut val: u64) -> usize {
    const MIN_WIDTH: usize = 7;
    const SCRATCH_END: usize = 27;

    // Render the digits right-to-left into the tail of the buffer. The loop
    // body runs at least once so that zero is rendered as "0".
    let mut pos = SCRATCH_END;
    loop {
        pos -= 1;
        buf[pos] = b'0' + (val % 10) as u8;
        val /= 10;
        if val == 0 {
            break;
        }
    }

    let num_digits = SCRATCH_END - pos;
    let pad = MIN_WIDTH.saturating_sub(num_digits);

    // Move the digits to their final position, then blank the padding in front.
    buf.copy_within(pos..SCRATCH_END, pad);
    for slot in &mut buf[..pad] {
        *slot = b' ';
    }

    pad + num_digits
}
182
183pub fn process_uniq_bytes(data: &[u8], output: impl Write, config: &UniqConfig) -> io::Result<()> {
189 let mut writer = BufWriter::with_capacity(8 * 1024 * 1024, output);
190 let term = if config.zero_terminated { b'\0' } else { b'\n' };
191
192 match config.mode {
193 OutputMode::Group(method) => {
194 process_group_bytes(data, &mut writer, config, method, term)?;
195 }
196 OutputMode::AllRepeated(method) => {
197 process_all_repeated_bytes(data, &mut writer, config, method, term)?;
198 }
199 _ => {
200 process_standard_bytes(data, &mut writer, config, term)?;
201 }
202 }
203
204 writer.flush()?;
205 Ok(())
206}
207
208struct LineIter<'a> {
211 data: &'a [u8],
212 pos: usize,
213 term: u8,
214}
215
216impl<'a> LineIter<'a> {
217 #[inline(always)]
218 fn new(data: &'a [u8], term: u8) -> Self {
219 Self { data, pos: 0, term }
220 }
221}
222
223impl<'a> Iterator for LineIter<'a> {
224 type Item = (&'a [u8], &'a [u8]);
226
227 #[inline(always)]
228 fn next(&mut self) -> Option<Self::Item> {
229 if self.pos >= self.data.len() {
230 return None;
231 }
232
233 let remaining = &self.data[self.pos..];
234 match memchr::memchr(self.term, remaining) {
235 Some(idx) => {
236 let line_start = self.pos;
237 let line_end = self.pos + idx; let full_end = self.pos + idx + 1; self.pos = full_end;
240 Some((
241 &self.data[line_start..line_end],
242 &self.data[line_start..full_end],
243 ))
244 }
245 None => {
246 let line_start = self.pos;
248 self.pos = self.data.len();
249 let line = &self.data[line_start..];
250 Some((line, line))
251 }
252 }
253 }
254}
255
256fn process_standard_bytes(
258 data: &[u8],
259 writer: &mut impl Write,
260 config: &UniqConfig,
261 term: u8,
262) -> io::Result<()> {
263 let mut lines = LineIter::new(data, term);
264
265 let (prev_content, prev_full) = match lines.next() {
266 Some(v) => v,
267 None => return Ok(()), };
269
270 let fast = !needs_key_extraction(config) && !config.ignore_case;
271
272 if fast && !config.count && matches!(config.mode, OutputMode::Default) {
275 let data_base = data.as_ptr() as usize;
276 let mut prev_content = prev_content;
277
278 write_all_raw(writer, prev_full)?;
280 if prev_full.len() == prev_content.len() {
281 writer.write_all(&[term])?;
282 }
283
284 let mut span_start: usize = usize::MAX; let mut span_end: usize = 0;
287
288 for (cur_content, cur_full) in lines {
289 if lines_equal_fast(prev_content, cur_content) {
290 if span_start != usize::MAX {
292 write_all_raw(writer, &data[span_start..span_end])?;
293 span_start = usize::MAX;
294 }
295 prev_content = cur_content;
296 continue;
297 }
298
299 let cur_offset = cur_full.as_ptr() as usize - data_base;
300
301 if span_start == usize::MAX {
302 span_start = cur_offset;
304 span_end = cur_offset + cur_full.len();
305 } else if cur_offset == span_end {
306 span_end += cur_full.len();
308 } else {
309 write_all_raw(writer, &data[span_start..span_end])?;
311 span_start = cur_offset;
312 span_end = cur_offset + cur_full.len();
313 }
314
315 if cur_full.len() == cur_content.len() {
317 write_all_raw(writer, &data[span_start..span_end])?;
318 writer.write_all(&[term])?;
319 span_start = usize::MAX;
320 }
321
322 prev_content = cur_content;
323 }
324
325 if span_start != usize::MAX {
327 write_all_raw(writer, &data[span_start..span_end])?;
328 }
329 return Ok(());
330 }
331
332 let mut prev_content = prev_content;
334 let mut prev_full = prev_full;
335 let mut count: u64 = 1;
336
337 for (cur_content, cur_full) in lines {
338 let equal = if fast {
339 lines_equal_fast(prev_content, cur_content)
340 } else {
341 lines_equal(prev_content, cur_content, config)
342 };
343
344 if equal {
345 count += 1;
346 } else {
347 output_group_bytes(writer, prev_content, prev_full, count, config, term)?;
349 prev_content = cur_content;
350 prev_full = cur_full;
351 count = 1;
352 }
353 }
354
355 output_group_bytes(writer, prev_content, prev_full, count, config, term)?;
357 Ok(())
358}
359
360#[inline(always)]
362fn output_group_bytes(
363 writer: &mut impl Write,
364 content: &[u8],
365 full: &[u8],
366 count: u64,
367 config: &UniqConfig,
368 term: u8,
369) -> io::Result<()> {
370 let should_print = match config.mode {
371 OutputMode::Default => true,
372 OutputMode::RepeatedOnly => count > 1,
373 OutputMode::UniqueOnly => count == 1,
374 _ => true,
375 };
376
377 if should_print {
378 if config.count {
379 write_count_line(writer, count, content, term)?;
380 } else {
381 writer.write_all(full)?;
382 if full.len() == content.len() {
384 writer.write_all(&[term])?;
385 }
386 }
387 }
388
389 Ok(())
390}
391
392fn process_all_repeated_bytes(
394 data: &[u8],
395 writer: &mut impl Write,
396 config: &UniqConfig,
397 method: AllRepeatedMethod,
398 term: u8,
399) -> io::Result<()> {
400 let mut lines = LineIter::new(data, term);
401
402 let first = match lines.next() {
403 Some(v) => v,
404 None => return Ok(()),
405 };
406
407 let mut group_lines: Vec<(&[u8], &[u8])> = Vec::with_capacity(64);
410 group_lines.push(first);
411 let mut first_group_printed = false;
412
413 let fast = !needs_key_extraction(config) && !config.ignore_case;
414
415 for (cur_content, cur_full) in lines {
416 let prev_content = group_lines.last().unwrap().0;
417 let equal = if fast {
418 lines_equal_fast(prev_content, cur_content)
419 } else {
420 lines_equal(prev_content, cur_content, config)
421 };
422
423 if equal {
424 group_lines.push((cur_content, cur_full));
425 } else {
426 flush_all_repeated_bytes(writer, &group_lines, method, &mut first_group_printed, term)?;
428 group_lines.clear();
429 group_lines.push((cur_content, cur_full));
430 }
431 }
432
433 flush_all_repeated_bytes(writer, &group_lines, method, &mut first_group_printed, term)?;
435
436 Ok(())
437}
438
439fn flush_all_repeated_bytes(
441 writer: &mut impl Write,
442 group: &[(&[u8], &[u8])],
443 method: AllRepeatedMethod,
444 first_group_printed: &mut bool,
445 term: u8,
446) -> io::Result<()> {
447 if group.len() <= 1 {
448 return Ok(()); }
450
451 match method {
452 AllRepeatedMethod::Prepend => {
453 writer.write_all(&[term])?;
454 }
455 AllRepeatedMethod::Separate => {
456 if *first_group_printed {
457 writer.write_all(&[term])?;
458 }
459 }
460 AllRepeatedMethod::None => {}
461 }
462
463 for &(content, full) in group {
464 writer.write_all(full)?;
465 if full.len() == content.len() {
466 writer.write_all(&[term])?;
467 }
468 }
469
470 *first_group_printed = true;
471 Ok(())
472}
473
474fn process_group_bytes(
476 data: &[u8],
477 writer: &mut impl Write,
478 config: &UniqConfig,
479 method: GroupMethod,
480 term: u8,
481) -> io::Result<()> {
482 let mut lines = LineIter::new(data, term);
483
484 let (prev_content, prev_full) = match lines.next() {
485 Some(v) => v,
486 None => return Ok(()),
487 };
488
489 if matches!(method, GroupMethod::Prepend | GroupMethod::Both) {
491 writer.write_all(&[term])?;
492 }
493
494 writer.write_all(prev_full)?;
496 if prev_full.len() == prev_content.len() {
497 writer.write_all(&[term])?;
498 }
499
500 let mut prev_content = prev_content;
501 let fast = !needs_key_extraction(config) && !config.ignore_case;
502
503 for (cur_content, cur_full) in lines {
504 let equal = if fast {
505 lines_equal_fast(prev_content, cur_content)
506 } else {
507 lines_equal(prev_content, cur_content, config)
508 };
509
510 if !equal {
511 writer.write_all(&[term])?;
513 }
514
515 writer.write_all(cur_full)?;
516 if cur_full.len() == cur_content.len() {
517 writer.write_all(&[term])?;
518 }
519
520 prev_content = cur_content;
521 }
522
523 if matches!(method, GroupMethod::Append | GroupMethod::Both) {
525 writer.write_all(&[term])?;
526 }
527
528 Ok(())
529}
530
531pub fn process_uniq<R: Read, W: Write>(input: R, output: W, config: &UniqConfig) -> io::Result<()> {
538 let reader = BufReader::with_capacity(8 * 1024 * 1024, input);
539 let mut writer = BufWriter::with_capacity(8 * 1024 * 1024, output);
540 let term = if config.zero_terminated { b'\0' } else { b'\n' };
541
542 match config.mode {
543 OutputMode::Group(method) => {
544 process_group_stream(reader, &mut writer, config, method, term)?;
545 }
546 OutputMode::AllRepeated(method) => {
547 process_all_repeated_stream(reader, &mut writer, config, method, term)?;
548 }
549 _ => {
550 process_standard_stream(reader, &mut writer, config, term)?;
551 }
552 }
553
554 writer.flush()?;
555 Ok(())
556}
557
558fn process_standard_stream<R: BufRead, W: Write>(
560 mut reader: R,
561 writer: &mut W,
562 config: &UniqConfig,
563 term: u8,
564) -> io::Result<()> {
565 let mut prev_line: Vec<u8> = Vec::with_capacity(4096);
566 let mut current_line: Vec<u8> = Vec::with_capacity(4096);
567
568 if read_line_term(&mut reader, &mut prev_line, term)? == 0 {
570 return Ok(()); }
572 let mut count: u64 = 1;
573
574 loop {
575 current_line.clear();
576 let bytes_read = read_line_term(&mut reader, &mut current_line, term)?;
577
578 if bytes_read == 0 {
579 output_group_stream(writer, &prev_line, count, config, term)?;
581 break;
582 }
583
584 if compare_lines_stream(&prev_line, ¤t_line, config, term) {
585 count += 1;
586 } else {
587 output_group_stream(writer, &prev_line, count, config, term)?;
588 std::mem::swap(&mut prev_line, &mut current_line);
589 count = 1;
590 }
591 }
592
593 Ok(())
594}
595
596#[inline(always)]
598fn compare_lines_stream(a: &[u8], b: &[u8], config: &UniqConfig, term: u8) -> bool {
599 let a_stripped = strip_term(a, term);
600 let b_stripped = strip_term(b, term);
601 lines_equal(a_stripped, b_stripped, config)
602}
603
/// Returns `line` without its final byte if that byte equals `term`;
/// otherwise returns `line` unchanged. At most one byte is removed.
#[inline(always)]
fn strip_term(line: &[u8], term: u8) -> &[u8] {
    match line.split_last() {
        Some((&last, head)) if last == term => head,
        _ => line,
    }
}
613
614#[inline(always)]
616fn output_group_stream(
617 writer: &mut impl Write,
618 line: &[u8],
619 count: u64,
620 config: &UniqConfig,
621 term: u8,
622) -> io::Result<()> {
623 let should_print = match config.mode {
624 OutputMode::Default => true,
625 OutputMode::RepeatedOnly => count > 1,
626 OutputMode::UniqueOnly => count == 1,
627 _ => true,
628 };
629
630 if should_print {
631 let content = strip_term(line, term);
632 if config.count {
633 write_count_line(writer, count, content, term)?;
634 } else {
635 writer.write_all(content)?;
636 writer.write_all(&[term])?;
637 }
638 }
639
640 Ok(())
641}
642
643fn process_all_repeated_stream<R: BufRead, W: Write>(
645 mut reader: R,
646 writer: &mut W,
647 config: &UniqConfig,
648 method: AllRepeatedMethod,
649 term: u8,
650) -> io::Result<()> {
651 let mut group: Vec<Vec<u8>> = Vec::new();
652 let mut current_line: Vec<u8> = Vec::with_capacity(4096);
653 let mut first_group_printed = false;
654
655 current_line.clear();
656 if read_line_term(&mut reader, &mut current_line, term)? == 0 {
657 return Ok(());
658 }
659 group.push(current_line.clone());
660
661 loop {
662 current_line.clear();
663 let bytes_read = read_line_term(&mut reader, &mut current_line, term)?;
664
665 if bytes_read == 0 {
666 flush_all_repeated_stream(writer, &group, method, &mut first_group_printed, term)?;
667 break;
668 }
669
670 if compare_lines_stream(group.last().unwrap(), ¤t_line, config, term) {
671 group.push(current_line.clone());
672 } else {
673 flush_all_repeated_stream(writer, &group, method, &mut first_group_printed, term)?;
674 group.clear();
675 group.push(current_line.clone());
676 }
677 }
678
679 Ok(())
680}
681
682fn flush_all_repeated_stream(
684 writer: &mut impl Write,
685 group: &[Vec<u8>],
686 method: AllRepeatedMethod,
687 first_group_printed: &mut bool,
688 term: u8,
689) -> io::Result<()> {
690 if group.len() <= 1 {
691 return Ok(());
692 }
693
694 match method {
695 AllRepeatedMethod::Prepend => {
696 writer.write_all(&[term])?;
697 }
698 AllRepeatedMethod::Separate => {
699 if *first_group_printed {
700 writer.write_all(&[term])?;
701 }
702 }
703 AllRepeatedMethod::None => {}
704 }
705
706 for line in group {
707 let content = strip_term(line, term);
708 writer.write_all(content)?;
709 writer.write_all(&[term])?;
710 }
711
712 *first_group_printed = true;
713 Ok(())
714}
715
716fn process_group_stream<R: BufRead, W: Write>(
718 mut reader: R,
719 writer: &mut W,
720 config: &UniqConfig,
721 method: GroupMethod,
722 term: u8,
723) -> io::Result<()> {
724 let mut prev_line: Vec<u8> = Vec::with_capacity(4096);
725 let mut current_line: Vec<u8> = Vec::with_capacity(4096);
726
727 if read_line_term(&mut reader, &mut prev_line, term)? == 0 {
728 return Ok(());
729 }
730
731 if matches!(method, GroupMethod::Prepend | GroupMethod::Both) {
733 writer.write_all(&[term])?;
734 }
735
736 let content = strip_term(&prev_line, term);
737 writer.write_all(content)?;
738 writer.write_all(&[term])?;
739
740 loop {
741 current_line.clear();
742 let bytes_read = read_line_term(&mut reader, &mut current_line, term)?;
743
744 if bytes_read == 0 {
745 if matches!(method, GroupMethod::Append | GroupMethod::Both) {
746 writer.write_all(&[term])?;
747 }
748 break;
749 }
750
751 if !compare_lines_stream(&prev_line, ¤t_line, config, term) {
752 writer.write_all(&[term])?;
753 }
754
755 let content = strip_term(¤t_line, term);
756 writer.write_all(content)?;
757 writer.write_all(&[term])?;
758
759 std::mem::swap(&mut prev_line, &mut current_line);
760 }
761
762 Ok(())
763}
764
/// Appends the next line from `reader` to `buf`, up to and including `term`.
///
/// Returns the number of bytes appended; `0` signals end of input. The last
/// line of the input may lack the terminator. `buf` is not cleared first.
#[inline(always)]
fn read_line_term<R: BufRead>(reader: &mut R, buf: &mut Vec<u8>, term: u8) -> io::Result<usize> {
    let appended = reader.read_until(term, buf)?;
    Ok(appended)
}