1use std::io::{self, BufRead, BufReader, BufWriter, Read, Write};
2
/// Writes the whole of `buf` to `writer`.
///
/// Thin forwarding wrapper around [`Write::write_all`]; any I/O error is
/// handed back to the caller unchanged.
#[inline]
fn write_all_raw(writer: &mut impl Write, buf: &[u8]) -> io::Result<()> {
    Write::write_all(writer, buf)
}
8
/// Delimiter policy used when all lines of repeated groups are printed.
///
/// The delimiter is a single line-terminator byte written in front of a
/// group that is about to be printed.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum AllRepeatedMethod {
    /// Never write a delimiter.
    None,
    /// Write a delimiter before every printed group, including the first.
    Prepend,
    /// Write a delimiter before every printed group except the first.
    Separate,
}
16
/// Delimiter policy for group output, where every input line is printed.
///
/// In all variants a single line-terminator byte is written between two
/// adjacent groups; the variants differ only in what is written at the
/// very start and very end of non-empty output.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum GroupMethod {
    /// Delimiters between groups only.
    Separate,
    /// Additionally write a delimiter before the first group.
    Prepend,
    /// Additionally write a delimiter after the last group.
    Append,
    /// Additionally write a delimiter both before the first and after the last group.
    Both,
}
25
26#[derive(Debug, Clone, Copy, PartialEq, Eq)]
28pub enum OutputMode {
29 Default,
31 RepeatedOnly,
33 AllRepeated(AllRepeatedMethod),
35 UniqueOnly,
37 Group(GroupMethod),
39}
40
41#[derive(Debug, Clone)]
43pub struct UniqConfig {
44 pub mode: OutputMode,
45 pub count: bool,
46 pub ignore_case: bool,
47 pub skip_fields: usize,
48 pub skip_chars: usize,
49 pub check_chars: Option<usize>,
50 pub zero_terminated: bool,
51}
52
53impl Default for UniqConfig {
54 fn default() -> Self {
55 Self {
56 mode: OutputMode::Default,
57 count: false,
58 ignore_case: false,
59 skip_fields: 0,
60 skip_chars: 0,
61 check_chars: None,
62 zero_terminated: false,
63 }
64 }
65}
66
67#[inline(always)]
70fn get_compare_slice<'a>(line: &'a [u8], config: &UniqConfig) -> &'a [u8] {
71 let mut start = 0;
72 let len = line.len();
73
74 for _ in 0..config.skip_fields {
76 while start < len && (line[start] == b' ' || line[start] == b'\t') {
78 start += 1;
79 }
80 while start < len && line[start] != b' ' && line[start] != b'\t' {
82 start += 1;
83 }
84 }
85
86 if config.skip_chars > 0 {
88 let remaining = len - start;
89 let skip = config.skip_chars.min(remaining);
90 start += skip;
91 }
92
93 let slice = &line[start..];
94
95 if let Some(w) = config.check_chars {
97 if w < slice.len() {
98 return &slice[..w];
99 }
100 }
101
102 slice
103}
104
105#[inline(always)]
107fn lines_equal(a: &[u8], b: &[u8], config: &UniqConfig) -> bool {
108 let sa = get_compare_slice(a, config);
109 let sb = get_compare_slice(b, config);
110
111 if config.ignore_case {
112 sa.eq_ignore_ascii_case(sb)
113 } else {
114 sa == sb
115 }
116}
117
118#[inline(always)]
120fn needs_key_extraction(config: &UniqConfig) -> bool {
121 config.skip_fields > 0 || config.skip_chars > 0 || config.check_chars.is_some()
122}
123
/// Byte-for-byte equality of two lines, with cheap early rejection.
///
/// Lines of different length are rejected immediately. For lines of at
/// least 8 bytes the first 8 bytes are compared before the full
/// comparison, so that lines differing near the start are rejected
/// without scanning the rest.
///
/// The prefix check is a plain fixed-width slice comparison; no `unsafe`
/// pointer reads are needed to express it.
#[inline(always)]
fn lines_equal_fast(a: &[u8], b: &[u8]) -> bool {
    if a.len() != b.len() {
        return false;
    }
    // Both slices have the same length here, so `b[..8]` is in bounds
    // whenever `a[..8]` is.
    if a.len() >= 8 && a[..8] != b[..8] {
        return false;
    }
    a == b
}
145
146#[inline(always)]
150fn write_count_line(out: &mut impl Write, count: u64, line: &[u8], term: u8) -> io::Result<()> {
151 let mut prefix = [b' '; 28]; let digits = itoa_right_aligned_into(&mut prefix, count);
154 let width = digits.max(7); let prefix_len = width + 1; prefix[width] = b' ';
157
158 let total = prefix_len + line.len() + 1;
160 if total <= 256 {
161 let mut buf = [0u8; 256];
162 buf[..prefix_len].copy_from_slice(&prefix[..prefix_len]);
163 buf[prefix_len..prefix_len + line.len()].copy_from_slice(line);
164 buf[prefix_len + line.len()] = term;
165 out.write_all(&buf[..total])
166 } else {
167 out.write_all(&prefix[..prefix_len])?;
168 out.write_all(line)?;
169 out.write_all(&[term])
170 }
171}
172
/// Formats `val` in decimal at the start of `buf`, right-aligned in a
/// field of at least 7 bytes, and returns the field width.
///
/// For values with fewer than 7 digits the field is 7 bytes wide and is
/// left-padded with spaces; wider values occupy exactly as many bytes as
/// they have digits. Only `buf[..width]` is written.
///
/// The padding is written explicitly, so the result does not depend on
/// what `buf` contained before the call.
#[inline(always)]
fn itoa_right_aligned_into(buf: &mut [u8; 28], mut val: u64) -> usize {
    // u64::MAX has 20 decimal digits.
    let mut scratch = [0u8; 20];
    let mut pos = scratch.len();
    // do-while shape so that 0 is rendered as a single '0' digit.
    loop {
        pos -= 1;
        scratch[pos] = b'0' + (val % 10) as u8;
        val /= 10;
        if val == 0 {
            break;
        }
    }

    let digits = &scratch[pos..];
    let width = digits.len().max(7);
    let pad = width - digits.len();
    buf[..pad].fill(b' ');
    buf[pad..width].copy_from_slice(digits);
    width
}
201
202pub fn process_uniq_bytes(data: &[u8], output: impl Write, config: &UniqConfig) -> io::Result<()> {
208 let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, output);
210 let term = if config.zero_terminated { b'\0' } else { b'\n' };
211
212 match config.mode {
213 OutputMode::Group(method) => {
214 process_group_bytes(data, &mut writer, config, method, term)?;
215 }
216 OutputMode::AllRepeated(method) => {
217 process_all_repeated_bytes(data, &mut writer, config, method, term)?;
218 }
219 _ => {
220 process_standard_bytes(data, &mut writer, config, term)?;
221 }
222 }
223
224 writer.flush()?;
225 Ok(())
226}
227
228struct LineIter<'a> {
231 data: &'a [u8],
232 pos: usize,
233 term: u8,
234}
235
236impl<'a> LineIter<'a> {
237 #[inline(always)]
238 fn new(data: &'a [u8], term: u8) -> Self {
239 Self { data, pos: 0, term }
240 }
241}
242
243impl<'a> Iterator for LineIter<'a> {
244 type Item = (&'a [u8], &'a [u8]);
246
247 #[inline(always)]
248 fn next(&mut self) -> Option<Self::Item> {
249 if self.pos >= self.data.len() {
250 return None;
251 }
252
253 let remaining = &self.data[self.pos..];
254 match memchr::memchr(self.term, remaining) {
255 Some(idx) => {
256 let line_start = self.pos;
257 let line_end = self.pos + idx; let full_end = self.pos + idx + 1; self.pos = full_end;
260 Some((
261 &self.data[line_start..line_end],
262 &self.data[line_start..full_end],
263 ))
264 }
265 None => {
266 let line_start = self.pos;
268 self.pos = self.data.len();
269 let line = &self.data[line_start..];
270 Some((line, line))
271 }
272 }
273 }
274}
275
276fn process_standard_bytes(
278 data: &[u8],
279 writer: &mut impl Write,
280 config: &UniqConfig,
281 term: u8,
282) -> io::Result<()> {
283 let mut lines = LineIter::new(data, term);
284
285 let (prev_content, prev_full) = match lines.next() {
286 Some(v) => v,
287 None => return Ok(()), };
289
290 let fast = !needs_key_extraction(config) && !config.ignore_case;
291
292 if fast && !config.count && matches!(config.mode, OutputMode::Default) {
295 let data_base = data.as_ptr() as usize;
296 let mut prev_content = prev_content;
297
298 write_all_raw(writer, prev_full)?;
300 if prev_full.len() == prev_content.len() {
301 writer.write_all(&[term])?;
302 }
303
304 let mut span_start: usize = usize::MAX; let mut span_end: usize = 0;
307
308 for (cur_content, cur_full) in lines {
309 if lines_equal_fast(prev_content, cur_content) {
310 if span_start != usize::MAX {
312 write_all_raw(writer, &data[span_start..span_end])?;
313 span_start = usize::MAX;
314 }
315 prev_content = cur_content;
316 continue;
317 }
318
319 let cur_offset = cur_full.as_ptr() as usize - data_base;
320
321 if span_start == usize::MAX {
322 span_start = cur_offset;
324 span_end = cur_offset + cur_full.len();
325 } else if cur_offset == span_end {
326 span_end += cur_full.len();
328 } else {
329 write_all_raw(writer, &data[span_start..span_end])?;
331 span_start = cur_offset;
332 span_end = cur_offset + cur_full.len();
333 }
334
335 if cur_full.len() == cur_content.len() {
337 write_all_raw(writer, &data[span_start..span_end])?;
338 writer.write_all(&[term])?;
339 span_start = usize::MAX;
340 }
341
342 prev_content = cur_content;
343 }
344
345 if span_start != usize::MAX {
347 write_all_raw(writer, &data[span_start..span_end])?;
348 }
349 return Ok(());
350 }
351
352 let mut prev_content = prev_content;
354 let mut prev_full = prev_full;
355 let mut count: u64 = 1;
356
357 for (cur_content, cur_full) in lines {
358 let equal = if fast {
359 lines_equal_fast(prev_content, cur_content)
360 } else {
361 lines_equal(prev_content, cur_content, config)
362 };
363
364 if equal {
365 count += 1;
366 } else {
367 output_group_bytes(writer, prev_content, prev_full, count, config, term)?;
369 prev_content = cur_content;
370 prev_full = cur_full;
371 count = 1;
372 }
373 }
374
375 output_group_bytes(writer, prev_content, prev_full, count, config, term)?;
377 Ok(())
378}
379
380#[inline(always)]
382fn output_group_bytes(
383 writer: &mut impl Write,
384 content: &[u8],
385 full: &[u8],
386 count: u64,
387 config: &UniqConfig,
388 term: u8,
389) -> io::Result<()> {
390 let should_print = match config.mode {
391 OutputMode::Default => true,
392 OutputMode::RepeatedOnly => count > 1,
393 OutputMode::UniqueOnly => count == 1,
394 _ => true,
395 };
396
397 if should_print {
398 if config.count {
399 write_count_line(writer, count, content, term)?;
400 } else {
401 writer.write_all(full)?;
402 if full.len() == content.len() {
404 writer.write_all(&[term])?;
405 }
406 }
407 }
408
409 Ok(())
410}
411
412fn process_all_repeated_bytes(
414 data: &[u8],
415 writer: &mut impl Write,
416 config: &UniqConfig,
417 method: AllRepeatedMethod,
418 term: u8,
419) -> io::Result<()> {
420 let mut lines = LineIter::new(data, term);
421
422 let first = match lines.next() {
423 Some(v) => v,
424 None => return Ok(()),
425 };
426
427 let mut group_lines: Vec<(&[u8], &[u8])> = Vec::with_capacity(64);
430 group_lines.push(first);
431 let mut first_group_printed = false;
432
433 let fast = !needs_key_extraction(config) && !config.ignore_case;
434
435 for (cur_content, cur_full) in lines {
436 let prev_content = group_lines.last().unwrap().0;
437 let equal = if fast {
438 lines_equal_fast(prev_content, cur_content)
439 } else {
440 lines_equal(prev_content, cur_content, config)
441 };
442
443 if equal {
444 group_lines.push((cur_content, cur_full));
445 } else {
446 flush_all_repeated_bytes(writer, &group_lines, method, &mut first_group_printed, term)?;
448 group_lines.clear();
449 group_lines.push((cur_content, cur_full));
450 }
451 }
452
453 flush_all_repeated_bytes(writer, &group_lines, method, &mut first_group_printed, term)?;
455
456 Ok(())
457}
458
459fn flush_all_repeated_bytes(
461 writer: &mut impl Write,
462 group: &[(&[u8], &[u8])],
463 method: AllRepeatedMethod,
464 first_group_printed: &mut bool,
465 term: u8,
466) -> io::Result<()> {
467 if group.len() <= 1 {
468 return Ok(()); }
470
471 match method {
472 AllRepeatedMethod::Prepend => {
473 writer.write_all(&[term])?;
474 }
475 AllRepeatedMethod::Separate => {
476 if *first_group_printed {
477 writer.write_all(&[term])?;
478 }
479 }
480 AllRepeatedMethod::None => {}
481 }
482
483 for &(content, full) in group {
484 writer.write_all(full)?;
485 if full.len() == content.len() {
486 writer.write_all(&[term])?;
487 }
488 }
489
490 *first_group_printed = true;
491 Ok(())
492}
493
494fn process_group_bytes(
496 data: &[u8],
497 writer: &mut impl Write,
498 config: &UniqConfig,
499 method: GroupMethod,
500 term: u8,
501) -> io::Result<()> {
502 let mut lines = LineIter::new(data, term);
503
504 let (prev_content, prev_full) = match lines.next() {
505 Some(v) => v,
506 None => return Ok(()),
507 };
508
509 if matches!(method, GroupMethod::Prepend | GroupMethod::Both) {
511 writer.write_all(&[term])?;
512 }
513
514 writer.write_all(prev_full)?;
516 if prev_full.len() == prev_content.len() {
517 writer.write_all(&[term])?;
518 }
519
520 let mut prev_content = prev_content;
521 let fast = !needs_key_extraction(config) && !config.ignore_case;
522
523 for (cur_content, cur_full) in lines {
524 let equal = if fast {
525 lines_equal_fast(prev_content, cur_content)
526 } else {
527 lines_equal(prev_content, cur_content, config)
528 };
529
530 if !equal {
531 writer.write_all(&[term])?;
533 }
534
535 writer.write_all(cur_full)?;
536 if cur_full.len() == cur_content.len() {
537 writer.write_all(&[term])?;
538 }
539
540 prev_content = cur_content;
541 }
542
543 if matches!(method, GroupMethod::Append | GroupMethod::Both) {
545 writer.write_all(&[term])?;
546 }
547
548 Ok(())
549}
550
551pub fn process_uniq<R: Read, W: Write>(input: R, output: W, config: &UniqConfig) -> io::Result<()> {
558 let reader = BufReader::with_capacity(8 * 1024 * 1024, input);
559 let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, output);
560 let term = if config.zero_terminated { b'\0' } else { b'\n' };
561
562 match config.mode {
563 OutputMode::Group(method) => {
564 process_group_stream(reader, &mut writer, config, method, term)?;
565 }
566 OutputMode::AllRepeated(method) => {
567 process_all_repeated_stream(reader, &mut writer, config, method, term)?;
568 }
569 _ => {
570 process_standard_stream(reader, &mut writer, config, term)?;
571 }
572 }
573
574 writer.flush()?;
575 Ok(())
576}
577
578fn process_standard_stream<R: BufRead, W: Write>(
580 mut reader: R,
581 writer: &mut W,
582 config: &UniqConfig,
583 term: u8,
584) -> io::Result<()> {
585 let mut prev_line: Vec<u8> = Vec::with_capacity(4096);
586 let mut current_line: Vec<u8> = Vec::with_capacity(4096);
587
588 if read_line_term(&mut reader, &mut prev_line, term)? == 0 {
590 return Ok(()); }
592 let mut count: u64 = 1;
593
594 loop {
595 current_line.clear();
596 let bytes_read = read_line_term(&mut reader, &mut current_line, term)?;
597
598 if bytes_read == 0 {
599 output_group_stream(writer, &prev_line, count, config, term)?;
601 break;
602 }
603
604 if compare_lines_stream(&prev_line, ¤t_line, config, term) {
605 count += 1;
606 } else {
607 output_group_stream(writer, &prev_line, count, config, term)?;
608 std::mem::swap(&mut prev_line, &mut current_line);
609 count = 1;
610 }
611 }
612
613 Ok(())
614}
615
616#[inline(always)]
618fn compare_lines_stream(a: &[u8], b: &[u8], config: &UniqConfig, term: u8) -> bool {
619 let a_stripped = strip_term(a, term);
620 let b_stripped = strip_term(b, term);
621 lines_equal(a_stripped, b_stripped, config)
622}
623
/// Returns `line` without its final byte when that byte equals `term`;
/// otherwise returns `line` unchanged. At most one byte is removed.
#[inline(always)]
fn strip_term(line: &[u8], term: u8) -> &[u8] {
    match line.split_last() {
        Some((&last, body)) if last == term => body,
        _ => line,
    }
}
633
634#[inline(always)]
636fn output_group_stream(
637 writer: &mut impl Write,
638 line: &[u8],
639 count: u64,
640 config: &UniqConfig,
641 term: u8,
642) -> io::Result<()> {
643 let should_print = match config.mode {
644 OutputMode::Default => true,
645 OutputMode::RepeatedOnly => count > 1,
646 OutputMode::UniqueOnly => count == 1,
647 _ => true,
648 };
649
650 if should_print {
651 let content = strip_term(line, term);
652 if config.count {
653 write_count_line(writer, count, content, term)?;
654 } else {
655 writer.write_all(content)?;
656 writer.write_all(&[term])?;
657 }
658 }
659
660 Ok(())
661}
662
663fn process_all_repeated_stream<R: BufRead, W: Write>(
665 mut reader: R,
666 writer: &mut W,
667 config: &UniqConfig,
668 method: AllRepeatedMethod,
669 term: u8,
670) -> io::Result<()> {
671 let mut group: Vec<Vec<u8>> = Vec::new();
672 let mut current_line: Vec<u8> = Vec::with_capacity(4096);
673 let mut first_group_printed = false;
674
675 current_line.clear();
676 if read_line_term(&mut reader, &mut current_line, term)? == 0 {
677 return Ok(());
678 }
679 group.push(current_line.clone());
680
681 loop {
682 current_line.clear();
683 let bytes_read = read_line_term(&mut reader, &mut current_line, term)?;
684
685 if bytes_read == 0 {
686 flush_all_repeated_stream(writer, &group, method, &mut first_group_printed, term)?;
687 break;
688 }
689
690 if compare_lines_stream(group.last().unwrap(), ¤t_line, config, term) {
691 group.push(current_line.clone());
692 } else {
693 flush_all_repeated_stream(writer, &group, method, &mut first_group_printed, term)?;
694 group.clear();
695 group.push(current_line.clone());
696 }
697 }
698
699 Ok(())
700}
701
702fn flush_all_repeated_stream(
704 writer: &mut impl Write,
705 group: &[Vec<u8>],
706 method: AllRepeatedMethod,
707 first_group_printed: &mut bool,
708 term: u8,
709) -> io::Result<()> {
710 if group.len() <= 1 {
711 return Ok(());
712 }
713
714 match method {
715 AllRepeatedMethod::Prepend => {
716 writer.write_all(&[term])?;
717 }
718 AllRepeatedMethod::Separate => {
719 if *first_group_printed {
720 writer.write_all(&[term])?;
721 }
722 }
723 AllRepeatedMethod::None => {}
724 }
725
726 for line in group {
727 let content = strip_term(line, term);
728 writer.write_all(content)?;
729 writer.write_all(&[term])?;
730 }
731
732 *first_group_printed = true;
733 Ok(())
734}
735
736fn process_group_stream<R: BufRead, W: Write>(
738 mut reader: R,
739 writer: &mut W,
740 config: &UniqConfig,
741 method: GroupMethod,
742 term: u8,
743) -> io::Result<()> {
744 let mut prev_line: Vec<u8> = Vec::with_capacity(4096);
745 let mut current_line: Vec<u8> = Vec::with_capacity(4096);
746
747 if read_line_term(&mut reader, &mut prev_line, term)? == 0 {
748 return Ok(());
749 }
750
751 if matches!(method, GroupMethod::Prepend | GroupMethod::Both) {
753 writer.write_all(&[term])?;
754 }
755
756 let content = strip_term(&prev_line, term);
757 writer.write_all(content)?;
758 writer.write_all(&[term])?;
759
760 loop {
761 current_line.clear();
762 let bytes_read = read_line_term(&mut reader, &mut current_line, term)?;
763
764 if bytes_read == 0 {
765 if matches!(method, GroupMethod::Append | GroupMethod::Both) {
766 writer.write_all(&[term])?;
767 }
768 break;
769 }
770
771 if !compare_lines_stream(&prev_line, ¤t_line, config, term) {
772 writer.write_all(&[term])?;
773 }
774
775 let content = strip_term(¤t_line, term);
776 writer.write_all(content)?;
777 writer.write_all(&[term])?;
778
779 std::mem::swap(&mut prev_line, &mut current_line);
780 }
781
782 Ok(())
783}
784
/// Appends the next line from `reader` to `buf`, up to and including the
/// `term` byte when one is found.
///
/// Returns the number of bytes appended; `0` means end of input. `buf` is
/// not cleared first. Thin wrapper around [`BufRead::read_until`].
#[inline(always)]
fn read_line_term<R: BufRead>(reader: &mut R, buf: &mut Vec<u8>, term: u8) -> io::Result<usize> {
    BufRead::read_until(reader, term, buf)
}