1use std::io::{self, BufRead, BufReader, BufWriter, Read, Write};
2
3#[derive(Debug, Clone, Copy, PartialEq, Eq)]
5pub enum AllRepeatedMethod {
6 None,
7 Prepend,
8 Separate,
9}
10
11#[derive(Debug, Clone, Copy, PartialEq, Eq)]
13pub enum GroupMethod {
14 Separate,
15 Prepend,
16 Append,
17 Both,
18}
19
20#[derive(Debug, Clone, Copy, PartialEq, Eq)]
22pub enum OutputMode {
23 Default,
25 RepeatedOnly,
27 AllRepeated(AllRepeatedMethod),
29 UniqueOnly,
31 Group(GroupMethod),
33}
34
35#[derive(Debug, Clone)]
37pub struct UniqConfig {
38 pub mode: OutputMode,
39 pub count: bool,
40 pub ignore_case: bool,
41 pub skip_fields: usize,
42 pub skip_chars: usize,
43 pub check_chars: Option<usize>,
44 pub zero_terminated: bool,
45}
46
47impl Default for UniqConfig {
48 fn default() -> Self {
49 Self {
50 mode: OutputMode::Default,
51 count: false,
52 ignore_case: false,
53 skip_fields: 0,
54 skip_chars: 0,
55 check_chars: None,
56 zero_terminated: false,
57 }
58 }
59}
60
61#[inline(always)]
64fn get_compare_slice<'a>(line: &'a [u8], config: &UniqConfig) -> &'a [u8] {
65 let mut start = 0;
66 let len = line.len();
67
68 for _ in 0..config.skip_fields {
70 while start < len && (line[start] == b' ' || line[start] == b'\t') {
72 start += 1;
73 }
74 while start < len && line[start] != b' ' && line[start] != b'\t' {
76 start += 1;
77 }
78 }
79
80 if config.skip_chars > 0 {
82 let remaining = len - start;
83 let skip = config.skip_chars.min(remaining);
84 start += skip;
85 }
86
87 let slice = &line[start..];
88
89 if let Some(w) = config.check_chars {
91 if w < slice.len() {
92 return &slice[..w];
93 }
94 }
95
96 slice
97}
98
99#[inline(always)]
101fn lines_equal(a: &[u8], b: &[u8], config: &UniqConfig) -> bool {
102 let sa = get_compare_slice(a, config);
103 let sb = get_compare_slice(b, config);
104
105 if config.ignore_case {
106 sa.eq_ignore_ascii_case(sb)
107 } else {
108 sa == sb
109 }
110}
111
112#[inline(always)]
114fn needs_key_extraction(config: &UniqConfig) -> bool {
115 config.skip_fields > 0 || config.skip_chars > 0 || config.check_chars.is_some()
116}
117
118#[inline(always)]
120fn lines_equal_fast(a: &[u8], b: &[u8]) -> bool {
121 a == b
122}
123
124#[inline(always)]
127fn write_count_line(out: &mut impl Write, count: u64, line: &[u8], term: u8) -> io::Result<()> {
128 let mut buf = [b' '; 20]; let count_len = {
132 let count_str = itoa_right_aligned(&mut buf, count);
133 count_str.len()
134 };
135 let start = 20 - count_len;
136 if count_len < 7 {
137 let pad = 7 - count_len;
139 out.write_all(&buf[..pad])?; }
141 out.write_all(&buf[start..])?;
142 out.write_all(b" ")?;
143 out.write_all(line)?;
144 out.write_all(&[term])?;
145 Ok(())
146}
147
148#[inline(always)]
151fn itoa_right_aligned(buf: &mut [u8; 20], mut val: u64) -> &[u8] {
152 if val == 0 {
153 buf[19] = b'0';
154 return &buf[19..20];
155 }
156 let mut pos = 20;
157 while val > 0 {
158 pos -= 1;
159 buf[pos] = b'0' + (val % 10) as u8;
160 val /= 10;
161 }
162 &buf[pos..20]
163}
164
165pub fn process_uniq_bytes(data: &[u8], output: impl Write, config: &UniqConfig) -> io::Result<()> {
171 let mut writer = BufWriter::with_capacity(1024 * 1024, output);
172 let term = if config.zero_terminated { b'\0' } else { b'\n' };
173
174 match config.mode {
175 OutputMode::Group(method) => {
176 process_group_bytes(data, &mut writer, config, method, term)?;
177 }
178 OutputMode::AllRepeated(method) => {
179 process_all_repeated_bytes(data, &mut writer, config, method, term)?;
180 }
181 _ => {
182 process_standard_bytes(data, &mut writer, config, term)?;
183 }
184 }
185
186 writer.flush()?;
187 Ok(())
188}
189
190struct LineIter<'a> {
193 data: &'a [u8],
194 pos: usize,
195 term: u8,
196}
197
198impl<'a> LineIter<'a> {
199 #[inline(always)]
200 fn new(data: &'a [u8], term: u8) -> Self {
201 Self { data, pos: 0, term }
202 }
203}
204
205impl<'a> Iterator for LineIter<'a> {
206 type Item = (&'a [u8], &'a [u8]);
208
209 #[inline(always)]
210 fn next(&mut self) -> Option<Self::Item> {
211 if self.pos >= self.data.len() {
212 return None;
213 }
214
215 let remaining = &self.data[self.pos..];
216 match memchr::memchr(self.term, remaining) {
217 Some(idx) => {
218 let line_start = self.pos;
219 let line_end = self.pos + idx; let full_end = self.pos + idx + 1; self.pos = full_end;
222 Some((
223 &self.data[line_start..line_end],
224 &self.data[line_start..full_end],
225 ))
226 }
227 None => {
228 let line_start = self.pos;
230 self.pos = self.data.len();
231 let line = &self.data[line_start..];
232 Some((line, line))
233 }
234 }
235 }
236}
237
238fn process_standard_bytes(
240 data: &[u8],
241 writer: &mut impl Write,
242 config: &UniqConfig,
243 term: u8,
244) -> io::Result<()> {
245 let mut lines = LineIter::new(data, term);
246
247 let (prev_content, prev_full) = match lines.next() {
248 Some(v) => v,
249 None => return Ok(()), };
251
252 let mut prev_content = prev_content;
253 let mut prev_full = prev_full;
254 let mut count: u64 = 1;
255
256 let fast = !needs_key_extraction(config) && !config.ignore_case;
257
258 for (cur_content, cur_full) in lines {
259 let equal = if fast {
260 lines_equal_fast(prev_content, cur_content)
261 } else {
262 lines_equal(prev_content, cur_content, config)
263 };
264
265 if equal {
266 count += 1;
267 } else {
268 output_group_bytes(writer, prev_content, prev_full, count, config, term)?;
270 prev_content = cur_content;
271 prev_full = cur_full;
272 count = 1;
273 }
274 }
275
276 output_group_bytes(writer, prev_content, prev_full, count, config, term)?;
278 Ok(())
279}
280
281#[inline(always)]
283fn output_group_bytes(
284 writer: &mut impl Write,
285 content: &[u8],
286 full: &[u8],
287 count: u64,
288 config: &UniqConfig,
289 term: u8,
290) -> io::Result<()> {
291 let should_print = match config.mode {
292 OutputMode::Default => true,
293 OutputMode::RepeatedOnly => count > 1,
294 OutputMode::UniqueOnly => count == 1,
295 _ => true,
296 };
297
298 if should_print {
299 if config.count {
300 write_count_line(writer, count, content, term)?;
301 } else {
302 writer.write_all(full)?;
303 if full.len() == content.len() {
305 writer.write_all(&[term])?;
306 }
307 }
308 }
309
310 Ok(())
311}
312
313fn process_all_repeated_bytes(
315 data: &[u8],
316 writer: &mut impl Write,
317 config: &UniqConfig,
318 method: AllRepeatedMethod,
319 term: u8,
320) -> io::Result<()> {
321 let mut lines = LineIter::new(data, term);
322
323 let first = match lines.next() {
324 Some(v) => v,
325 None => return Ok(()),
326 };
327
328 let mut group_lines: Vec<(&[u8], &[u8])> = Vec::with_capacity(64);
331 group_lines.push(first);
332 let mut first_group_printed = false;
333
334 let fast = !needs_key_extraction(config) && !config.ignore_case;
335
336 for (cur_content, cur_full) in lines {
337 let prev_content = group_lines.last().unwrap().0;
338 let equal = if fast {
339 lines_equal_fast(prev_content, cur_content)
340 } else {
341 lines_equal(prev_content, cur_content, config)
342 };
343
344 if equal {
345 group_lines.push((cur_content, cur_full));
346 } else {
347 flush_all_repeated_bytes(writer, &group_lines, method, &mut first_group_printed, term)?;
349 group_lines.clear();
350 group_lines.push((cur_content, cur_full));
351 }
352 }
353
354 flush_all_repeated_bytes(writer, &group_lines, method, &mut first_group_printed, term)?;
356
357 Ok(())
358}
359
360fn flush_all_repeated_bytes(
362 writer: &mut impl Write,
363 group: &[(&[u8], &[u8])],
364 method: AllRepeatedMethod,
365 first_group_printed: &mut bool,
366 term: u8,
367) -> io::Result<()> {
368 if group.len() <= 1 {
369 return Ok(()); }
371
372 match method {
373 AllRepeatedMethod::Prepend => {
374 writer.write_all(&[term])?;
375 }
376 AllRepeatedMethod::Separate => {
377 if *first_group_printed {
378 writer.write_all(&[term])?;
379 }
380 }
381 AllRepeatedMethod::None => {}
382 }
383
384 for &(content, full) in group {
385 writer.write_all(full)?;
386 if full.len() == content.len() {
387 writer.write_all(&[term])?;
388 }
389 }
390
391 *first_group_printed = true;
392 Ok(())
393}
394
395fn process_group_bytes(
397 data: &[u8],
398 writer: &mut impl Write,
399 config: &UniqConfig,
400 method: GroupMethod,
401 term: u8,
402) -> io::Result<()> {
403 let mut lines = LineIter::new(data, term);
404
405 let (prev_content, prev_full) = match lines.next() {
406 Some(v) => v,
407 None => return Ok(()),
408 };
409
410 if matches!(method, GroupMethod::Prepend | GroupMethod::Both) {
412 writer.write_all(&[term])?;
413 }
414
415 writer.write_all(prev_full)?;
417 if prev_full.len() == prev_content.len() {
418 writer.write_all(&[term])?;
419 }
420
421 let mut prev_content = prev_content;
422 let fast = !needs_key_extraction(config) && !config.ignore_case;
423
424 for (cur_content, cur_full) in lines {
425 let equal = if fast {
426 lines_equal_fast(prev_content, cur_content)
427 } else {
428 lines_equal(prev_content, cur_content, config)
429 };
430
431 if !equal {
432 writer.write_all(&[term])?;
434 }
435
436 writer.write_all(cur_full)?;
437 if cur_full.len() == cur_content.len() {
438 writer.write_all(&[term])?;
439 }
440
441 prev_content = cur_content;
442 }
443
444 if matches!(method, GroupMethod::Append | GroupMethod::Both) {
446 writer.write_all(&[term])?;
447 }
448
449 Ok(())
450}
451
452pub fn process_uniq<R: Read, W: Write>(input: R, output: W, config: &UniqConfig) -> io::Result<()> {
459 let reader = BufReader::with_capacity(1024 * 1024, input);
460 let mut writer = BufWriter::with_capacity(1024 * 1024, output);
461 let term = if config.zero_terminated { b'\0' } else { b'\n' };
462
463 match config.mode {
464 OutputMode::Group(method) => {
465 process_group_stream(reader, &mut writer, config, method, term)?;
466 }
467 OutputMode::AllRepeated(method) => {
468 process_all_repeated_stream(reader, &mut writer, config, method, term)?;
469 }
470 _ => {
471 process_standard_stream(reader, &mut writer, config, term)?;
472 }
473 }
474
475 writer.flush()?;
476 Ok(())
477}
478
479fn process_standard_stream<R: BufRead, W: Write>(
481 mut reader: R,
482 writer: &mut W,
483 config: &UniqConfig,
484 term: u8,
485) -> io::Result<()> {
486 let mut prev_line: Vec<u8> = Vec::with_capacity(4096);
487 let mut current_line: Vec<u8> = Vec::with_capacity(4096);
488
489 if read_line_term(&mut reader, &mut prev_line, term)? == 0 {
491 return Ok(()); }
493 let mut count: u64 = 1;
494
495 loop {
496 current_line.clear();
497 let bytes_read = read_line_term(&mut reader, &mut current_line, term)?;
498
499 if bytes_read == 0 {
500 output_group_stream(writer, &prev_line, count, config, term)?;
502 break;
503 }
504
505 if compare_lines_stream(&prev_line, ¤t_line, config, term) {
506 count += 1;
507 } else {
508 output_group_stream(writer, &prev_line, count, config, term)?;
509 std::mem::swap(&mut prev_line, &mut current_line);
510 count = 1;
511 }
512 }
513
514 Ok(())
515}
516
517#[inline(always)]
519fn compare_lines_stream(a: &[u8], b: &[u8], config: &UniqConfig, term: u8) -> bool {
520 let a_stripped = strip_term(a, term);
521 let b_stripped = strip_term(b, term);
522 lines_equal(a_stripped, b_stripped, config)
523}
524
525#[inline(always)]
527fn strip_term(line: &[u8], term: u8) -> &[u8] {
528 if line.last() == Some(&term) {
529 &line[..line.len() - 1]
530 } else {
531 line
532 }
533}
534
535#[inline(always)]
537fn output_group_stream(
538 writer: &mut impl Write,
539 line: &[u8],
540 count: u64,
541 config: &UniqConfig,
542 term: u8,
543) -> io::Result<()> {
544 let should_print = match config.mode {
545 OutputMode::Default => true,
546 OutputMode::RepeatedOnly => count > 1,
547 OutputMode::UniqueOnly => count == 1,
548 _ => true,
549 };
550
551 if should_print {
552 let content = strip_term(line, term);
553 if config.count {
554 write_count_line(writer, count, content, term)?;
555 } else {
556 writer.write_all(content)?;
557 writer.write_all(&[term])?;
558 }
559 }
560
561 Ok(())
562}
563
564fn process_all_repeated_stream<R: BufRead, W: Write>(
566 mut reader: R,
567 writer: &mut W,
568 config: &UniqConfig,
569 method: AllRepeatedMethod,
570 term: u8,
571) -> io::Result<()> {
572 let mut group: Vec<Vec<u8>> = Vec::new();
573 let mut current_line: Vec<u8> = Vec::with_capacity(4096);
574 let mut first_group_printed = false;
575
576 current_line.clear();
577 if read_line_term(&mut reader, &mut current_line, term)? == 0 {
578 return Ok(());
579 }
580 group.push(current_line.clone());
581
582 loop {
583 current_line.clear();
584 let bytes_read = read_line_term(&mut reader, &mut current_line, term)?;
585
586 if bytes_read == 0 {
587 flush_all_repeated_stream(writer, &group, method, &mut first_group_printed, term)?;
588 break;
589 }
590
591 if compare_lines_stream(group.last().unwrap(), ¤t_line, config, term) {
592 group.push(current_line.clone());
593 } else {
594 flush_all_repeated_stream(writer, &group, method, &mut first_group_printed, term)?;
595 group.clear();
596 group.push(current_line.clone());
597 }
598 }
599
600 Ok(())
601}
602
603fn flush_all_repeated_stream(
605 writer: &mut impl Write,
606 group: &[Vec<u8>],
607 method: AllRepeatedMethod,
608 first_group_printed: &mut bool,
609 term: u8,
610) -> io::Result<()> {
611 if group.len() <= 1 {
612 return Ok(());
613 }
614
615 match method {
616 AllRepeatedMethod::Prepend => {
617 writer.write_all(&[term])?;
618 }
619 AllRepeatedMethod::Separate => {
620 if *first_group_printed {
621 writer.write_all(&[term])?;
622 }
623 }
624 AllRepeatedMethod::None => {}
625 }
626
627 for line in group {
628 let content = strip_term(line, term);
629 writer.write_all(content)?;
630 writer.write_all(&[term])?;
631 }
632
633 *first_group_printed = true;
634 Ok(())
635}
636
637fn process_group_stream<R: BufRead, W: Write>(
639 mut reader: R,
640 writer: &mut W,
641 config: &UniqConfig,
642 method: GroupMethod,
643 term: u8,
644) -> io::Result<()> {
645 let mut prev_line: Vec<u8> = Vec::with_capacity(4096);
646 let mut current_line: Vec<u8> = Vec::with_capacity(4096);
647
648 if read_line_term(&mut reader, &mut prev_line, term)? == 0 {
649 return Ok(());
650 }
651
652 if matches!(method, GroupMethod::Prepend | GroupMethod::Both) {
654 writer.write_all(&[term])?;
655 }
656
657 let content = strip_term(&prev_line, term);
658 writer.write_all(content)?;
659 writer.write_all(&[term])?;
660
661 loop {
662 current_line.clear();
663 let bytes_read = read_line_term(&mut reader, &mut current_line, term)?;
664
665 if bytes_read == 0 {
666 if matches!(method, GroupMethod::Append | GroupMethod::Both) {
667 writer.write_all(&[term])?;
668 }
669 break;
670 }
671
672 if !compare_lines_stream(&prev_line, ¤t_line, config, term) {
673 writer.write_all(&[term])?;
674 }
675
676 let content = strip_term(¤t_line, term);
677 writer.write_all(content)?;
678 writer.write_all(&[term])?;
679
680 std::mem::swap(&mut prev_line, &mut current_line);
681 }
682
683 Ok(())
684}
685
686#[inline(always)]
689fn read_line_term<R: BufRead>(reader: &mut R, buf: &mut Vec<u8>, term: u8) -> io::Result<usize> {
690 reader.read_until(term, buf)
691}