Skip to main content

coreutils_rs/uniq/
core.rs

1use std::io::{self, BufRead, BufReader, BufWriter, Read, Write};
2
3/// How to delimit groups when using --all-repeated
4#[derive(Debug, Clone, Copy, PartialEq, Eq)]
5pub enum AllRepeatedMethod {
6    None,
7    Prepend,
8    Separate,
9}
10
11/// How to delimit groups when using --group
12#[derive(Debug, Clone, Copy, PartialEq, Eq)]
13pub enum GroupMethod {
14    Separate,
15    Prepend,
16    Append,
17    Both,
18}
19
20/// Output mode for uniq
21#[derive(Debug, Clone, Copy, PartialEq, Eq)]
22pub enum OutputMode {
23    /// Default: print unique lines and first of each duplicate group
24    Default,
25    /// -d: print only first line of duplicate groups
26    RepeatedOnly,
27    /// -D / --all-repeated: print ALL duplicate lines
28    AllRepeated(AllRepeatedMethod),
29    /// -u: print only lines that are NOT duplicated
30    UniqueOnly,
31    /// --group: show all items with group separators
32    Group(GroupMethod),
33}
34
35/// Configuration for uniq processing
36#[derive(Debug, Clone)]
37pub struct UniqConfig {
38    pub mode: OutputMode,
39    pub count: bool,
40    pub ignore_case: bool,
41    pub skip_fields: usize,
42    pub skip_chars: usize,
43    pub check_chars: Option<usize>,
44    pub zero_terminated: bool,
45}
46
47impl Default for UniqConfig {
48    fn default() -> Self {
49        Self {
50            mode: OutputMode::Default,
51            count: false,
52            ignore_case: false,
53            skip_fields: 0,
54            skip_chars: 0,
55            check_chars: None,
56            zero_terminated: false,
57        }
58    }
59}
60
61/// Extract the comparison key from a line according to skip_fields, skip_chars, check_chars.
62/// Matches GNU uniq field-skip semantics exactly: for each field, skip blanks then non-blanks.
63#[inline(always)]
64fn get_compare_slice<'a>(line: &'a [u8], config: &UniqConfig) -> &'a [u8] {
65    let mut start = 0;
66    let len = line.len();
67
68    // Skip N fields (GNU: each field = run of blanks + run of non-blanks)
69    for _ in 0..config.skip_fields {
70        // Skip blanks (space and tab)
71        while start < len && (line[start] == b' ' || line[start] == b'\t') {
72            start += 1;
73        }
74        // Skip non-blanks (field content)
75        while start < len && line[start] != b' ' && line[start] != b'\t' {
76            start += 1;
77        }
78    }
79
80    // Skip N characters
81    if config.skip_chars > 0 {
82        let remaining = len - start;
83        let skip = config.skip_chars.min(remaining);
84        start += skip;
85    }
86
87    let slice = &line[start..];
88
89    // Limit comparison to N characters
90    if let Some(w) = config.check_chars {
91        if w < slice.len() {
92            return &slice[..w];
93        }
94    }
95
96    slice
97}
98
99/// Compare two lines (without terminators) using the config's comparison rules.
100#[inline(always)]
101fn lines_equal(a: &[u8], b: &[u8], config: &UniqConfig) -> bool {
102    let sa = get_compare_slice(a, config);
103    let sb = get_compare_slice(b, config);
104
105    if config.ignore_case {
106        sa.eq_ignore_ascii_case(sb)
107    } else {
108        sa == sb
109    }
110}
111
112/// Check if config requires field/char skipping or char limiting.
113#[inline(always)]
114fn needs_key_extraction(config: &UniqConfig) -> bool {
115    config.skip_fields > 0 || config.skip_chars > 0 || config.check_chars.is_some()
116}
117
118/// Fast path comparison: no field/char extraction needed, no case folding.
119#[inline(always)]
120fn lines_equal_fast(a: &[u8], b: &[u8]) -> bool {
121    a == b
122}
123
124/// Write a count-prefixed line in GNU uniq format.
125/// GNU format: "%7lu " — right-aligned in 7-char field, followed by space.
126#[inline(always)]
127fn write_count_line(out: &mut impl Write, count: u64, line: &[u8], term: u8) -> io::Result<()> {
128    // Fast path for small counts (vast majority of cases)
129    // Manually format to avoid write!() macro overhead
130    let mut buf = [b' '; 20]; // Enough for u64 max
131    let count_len = {
132        let count_str = itoa_right_aligned(&mut buf, count);
133        count_str.len()
134    };
135    let start = 20 - count_len;
136    if count_len < 7 {
137        // Pad with spaces to width 7
138        let pad = 7 - count_len;
139        out.write_all(&buf[..pad])?; // spaces
140    }
141    out.write_all(&buf[start..])?;
142    out.write_all(b" ")?;
143    out.write_all(line)?;
144    out.write_all(&[term])?;
145    Ok(())
146}
147
148/// Convert u64 to decimal string in a stack buffer, right-aligned.
149/// Returns the slice of the buffer containing the decimal digits.
150#[inline(always)]
151fn itoa_right_aligned(buf: &mut [u8; 20], mut val: u64) -> &[u8] {
152    if val == 0 {
153        buf[19] = b'0';
154        return &buf[19..20];
155    }
156    let mut pos = 20;
157    while val > 0 {
158        pos -= 1;
159        buf[pos] = b'0' + (val % 10) as u8;
160        val /= 10;
161    }
162    &buf[pos..20]
163}
164
165// ============================================================================
166// High-performance mmap-based processing (for byte slices, zero-copy)
167// ============================================================================
168
169/// Process uniq from a byte slice (mmap'd file). Zero-copy, no per-line allocation.
170pub fn process_uniq_bytes(data: &[u8], output: impl Write, config: &UniqConfig) -> io::Result<()> {
171    let mut writer = BufWriter::with_capacity(1024 * 1024, output);
172    let term = if config.zero_terminated { b'\0' } else { b'\n' };
173
174    match config.mode {
175        OutputMode::Group(method) => {
176            process_group_bytes(data, &mut writer, config, method, term)?;
177        }
178        OutputMode::AllRepeated(method) => {
179            process_all_repeated_bytes(data, &mut writer, config, method, term)?;
180        }
181        _ => {
182            process_standard_bytes(data, &mut writer, config, term)?;
183        }
184    }
185
186    writer.flush()?;
187    Ok(())
188}
189
190/// Iterator over lines in a byte slice, yielding (line_without_terminator, has_terminator).
191/// Uses memchr for SIMD-accelerated line boundary detection.
192struct LineIter<'a> {
193    data: &'a [u8],
194    pos: usize,
195    term: u8,
196}
197
198impl<'a> LineIter<'a> {
199    #[inline(always)]
200    fn new(data: &'a [u8], term: u8) -> Self {
201        Self { data, pos: 0, term }
202    }
203}
204
205impl<'a> Iterator for LineIter<'a> {
206    /// (line content without terminator, full line including terminator for output)
207    type Item = (&'a [u8], &'a [u8]);
208
209    #[inline(always)]
210    fn next(&mut self) -> Option<Self::Item> {
211        if self.pos >= self.data.len() {
212            return None;
213        }
214
215        let remaining = &self.data[self.pos..];
216        match memchr::memchr(self.term, remaining) {
217            Some(idx) => {
218                let line_start = self.pos;
219                let line_end = self.pos + idx; // without terminator
220                let full_end = self.pos + idx + 1; // with terminator
221                self.pos = full_end;
222                Some((
223                    &self.data[line_start..line_end],
224                    &self.data[line_start..full_end],
225                ))
226            }
227            None => {
228                // Last line without terminator
229                let line_start = self.pos;
230                self.pos = self.data.len();
231                let line = &self.data[line_start..];
232                Some((line, line))
233            }
234        }
235    }
236}
237
238/// Standard processing for Default, RepeatedOnly, UniqueOnly on byte slices.
239fn process_standard_bytes(
240    data: &[u8],
241    writer: &mut impl Write,
242    config: &UniqConfig,
243    term: u8,
244) -> io::Result<()> {
245    let mut lines = LineIter::new(data, term);
246
247    let (prev_content, prev_full) = match lines.next() {
248        Some(v) => v,
249        None => return Ok(()), // empty input
250    };
251
252    let mut prev_content = prev_content;
253    let mut prev_full = prev_full;
254    let mut count: u64 = 1;
255
256    let fast = !needs_key_extraction(config) && !config.ignore_case;
257
258    for (cur_content, cur_full) in lines {
259        let equal = if fast {
260            lines_equal_fast(prev_content, cur_content)
261        } else {
262            lines_equal(prev_content, cur_content, config)
263        };
264
265        if equal {
266            count += 1;
267        } else {
268            // Output previous group
269            output_group_bytes(writer, prev_content, prev_full, count, config, term)?;
270            prev_content = cur_content;
271            prev_full = cur_full;
272            count = 1;
273        }
274    }
275
276    // Output last group
277    output_group_bytes(writer, prev_content, prev_full, count, config, term)?;
278    Ok(())
279}
280
281/// Output a group for standard modes (bytes path).
282#[inline(always)]
283fn output_group_bytes(
284    writer: &mut impl Write,
285    content: &[u8],
286    full: &[u8],
287    count: u64,
288    config: &UniqConfig,
289    term: u8,
290) -> io::Result<()> {
291    let should_print = match config.mode {
292        OutputMode::Default => true,
293        OutputMode::RepeatedOnly => count > 1,
294        OutputMode::UniqueOnly => count == 1,
295        _ => true,
296    };
297
298    if should_print {
299        if config.count {
300            write_count_line(writer, count, content, term)?;
301        } else {
302            writer.write_all(full)?;
303            // Add terminator if the original line didn't have one
304            if full.len() == content.len() {
305                writer.write_all(&[term])?;
306            }
307        }
308    }
309
310    Ok(())
311}
312
313/// Process --all-repeated / -D mode on byte slices.
314fn process_all_repeated_bytes(
315    data: &[u8],
316    writer: &mut impl Write,
317    config: &UniqConfig,
318    method: AllRepeatedMethod,
319    term: u8,
320) -> io::Result<()> {
321    let mut lines = LineIter::new(data, term);
322
323    let first = match lines.next() {
324        Some(v) => v,
325        None => return Ok(()),
326    };
327
328    // Collect groups as (start_offset, line_count, first_line_content, lines_vec)
329    // For all-repeated we need to buffer group lines since we only print if count > 1
330    let mut group_lines: Vec<(&[u8], &[u8])> = Vec::with_capacity(64);
331    group_lines.push(first);
332    let mut first_group_printed = false;
333
334    let fast = !needs_key_extraction(config) && !config.ignore_case;
335
336    for (cur_content, cur_full) in lines {
337        let prev_content = group_lines.last().unwrap().0;
338        let equal = if fast {
339            lines_equal_fast(prev_content, cur_content)
340        } else {
341            lines_equal(prev_content, cur_content, config)
342        };
343
344        if equal {
345            group_lines.push((cur_content, cur_full));
346        } else {
347            // Flush group
348            flush_all_repeated_bytes(writer, &group_lines, method, &mut first_group_printed, term)?;
349            group_lines.clear();
350            group_lines.push((cur_content, cur_full));
351        }
352    }
353
354    // Flush last group
355    flush_all_repeated_bytes(writer, &group_lines, method, &mut first_group_printed, term)?;
356
357    Ok(())
358}
359
360/// Flush a group for --all-repeated mode (bytes path).
361fn flush_all_repeated_bytes(
362    writer: &mut impl Write,
363    group: &[(&[u8], &[u8])],
364    method: AllRepeatedMethod,
365    first_group_printed: &mut bool,
366    term: u8,
367) -> io::Result<()> {
368    if group.len() <= 1 {
369        return Ok(()); // Not a duplicate group
370    }
371
372    match method {
373        AllRepeatedMethod::Prepend => {
374            writer.write_all(&[term])?;
375        }
376        AllRepeatedMethod::Separate => {
377            if *first_group_printed {
378                writer.write_all(&[term])?;
379            }
380        }
381        AllRepeatedMethod::None => {}
382    }
383
384    for &(content, full) in group {
385        writer.write_all(full)?;
386        if full.len() == content.len() {
387            writer.write_all(&[term])?;
388        }
389    }
390
391    *first_group_printed = true;
392    Ok(())
393}
394
395/// Process --group mode on byte slices.
396fn process_group_bytes(
397    data: &[u8],
398    writer: &mut impl Write,
399    config: &UniqConfig,
400    method: GroupMethod,
401    term: u8,
402) -> io::Result<()> {
403    let mut lines = LineIter::new(data, term);
404
405    let (prev_content, prev_full) = match lines.next() {
406        Some(v) => v,
407        None => return Ok(()),
408    };
409
410    // Prepend/Both: separator before first group
411    if matches!(method, GroupMethod::Prepend | GroupMethod::Both) {
412        writer.write_all(&[term])?;
413    }
414
415    // Write first line
416    writer.write_all(prev_full)?;
417    if prev_full.len() == prev_content.len() {
418        writer.write_all(&[term])?;
419    }
420
421    let mut prev_content = prev_content;
422    let fast = !needs_key_extraction(config) && !config.ignore_case;
423
424    for (cur_content, cur_full) in lines {
425        let equal = if fast {
426            lines_equal_fast(prev_content, cur_content)
427        } else {
428            lines_equal(prev_content, cur_content, config)
429        };
430
431        if !equal {
432            // New group — write separator
433            writer.write_all(&[term])?;
434        }
435
436        writer.write_all(cur_full)?;
437        if cur_full.len() == cur_content.len() {
438            writer.write_all(&[term])?;
439        }
440
441        prev_content = cur_content;
442    }
443
444    // Append/Both: separator after last group
445    if matches!(method, GroupMethod::Append | GroupMethod::Both) {
446        writer.write_all(&[term])?;
447    }
448
449    Ok(())
450}
451
452// ============================================================================
453// Streaming processing (for stdin / pipe input)
454// ============================================================================
455
456/// Main streaming uniq processor.
457/// Reads from `input`, writes to `output`.
458pub fn process_uniq<R: Read, W: Write>(input: R, output: W, config: &UniqConfig) -> io::Result<()> {
459    let reader = BufReader::with_capacity(1024 * 1024, input);
460    let mut writer = BufWriter::with_capacity(1024 * 1024, output);
461    let term = if config.zero_terminated { b'\0' } else { b'\n' };
462
463    match config.mode {
464        OutputMode::Group(method) => {
465            process_group_stream(reader, &mut writer, config, method, term)?;
466        }
467        OutputMode::AllRepeated(method) => {
468            process_all_repeated_stream(reader, &mut writer, config, method, term)?;
469        }
470        _ => {
471            process_standard_stream(reader, &mut writer, config, term)?;
472        }
473    }
474
475    writer.flush()?;
476    Ok(())
477}
478
479/// Standard processing for Default, RepeatedOnly, UniqueOnly modes (streaming).
480fn process_standard_stream<R: BufRead, W: Write>(
481    mut reader: R,
482    writer: &mut W,
483    config: &UniqConfig,
484    term: u8,
485) -> io::Result<()> {
486    let mut prev_line: Vec<u8> = Vec::with_capacity(4096);
487    let mut current_line: Vec<u8> = Vec::with_capacity(4096);
488
489    // Read first line
490    if read_line_term(&mut reader, &mut prev_line, term)? == 0 {
491        return Ok(()); // empty input
492    }
493    let mut count: u64 = 1;
494
495    loop {
496        current_line.clear();
497        let bytes_read = read_line_term(&mut reader, &mut current_line, term)?;
498
499        if bytes_read == 0 {
500            // End of input — output the last group
501            output_group_stream(writer, &prev_line, count, config, term)?;
502            break;
503        }
504
505        if compare_lines_stream(&prev_line, &current_line, config, term) {
506            count += 1;
507        } else {
508            output_group_stream(writer, &prev_line, count, config, term)?;
509            std::mem::swap(&mut prev_line, &mut current_line);
510            count = 1;
511        }
512    }
513
514    Ok(())
515}
516
517/// Compare two lines (with terminators) in streaming mode.
518#[inline(always)]
519fn compare_lines_stream(a: &[u8], b: &[u8], config: &UniqConfig, term: u8) -> bool {
520    let a_stripped = strip_term(a, term);
521    let b_stripped = strip_term(b, term);
522    lines_equal(a_stripped, b_stripped, config)
523}
524
525/// Strip terminator from end of line.
526#[inline(always)]
527fn strip_term(line: &[u8], term: u8) -> &[u8] {
528    if line.last() == Some(&term) {
529        &line[..line.len() - 1]
530    } else {
531        line
532    }
533}
534
535/// Output a group in streaming mode.
536#[inline(always)]
537fn output_group_stream(
538    writer: &mut impl Write,
539    line: &[u8],
540    count: u64,
541    config: &UniqConfig,
542    term: u8,
543) -> io::Result<()> {
544    let should_print = match config.mode {
545        OutputMode::Default => true,
546        OutputMode::RepeatedOnly => count > 1,
547        OutputMode::UniqueOnly => count == 1,
548        _ => true,
549    };
550
551    if should_print {
552        let content = strip_term(line, term);
553        if config.count {
554            write_count_line(writer, count, content, term)?;
555        } else {
556            writer.write_all(content)?;
557            writer.write_all(&[term])?;
558        }
559    }
560
561    Ok(())
562}
563
564/// Process --all-repeated / -D mode (streaming).
565fn process_all_repeated_stream<R: BufRead, W: Write>(
566    mut reader: R,
567    writer: &mut W,
568    config: &UniqConfig,
569    method: AllRepeatedMethod,
570    term: u8,
571) -> io::Result<()> {
572    let mut group: Vec<Vec<u8>> = Vec::new();
573    let mut current_line: Vec<u8> = Vec::with_capacity(4096);
574    let mut first_group_printed = false;
575
576    current_line.clear();
577    if read_line_term(&mut reader, &mut current_line, term)? == 0 {
578        return Ok(());
579    }
580    group.push(current_line.clone());
581
582    loop {
583        current_line.clear();
584        let bytes_read = read_line_term(&mut reader, &mut current_line, term)?;
585
586        if bytes_read == 0 {
587            flush_all_repeated_stream(writer, &group, method, &mut first_group_printed, term)?;
588            break;
589        }
590
591        if compare_lines_stream(group.last().unwrap(), &current_line, config, term) {
592            group.push(current_line.clone());
593        } else {
594            flush_all_repeated_stream(writer, &group, method, &mut first_group_printed, term)?;
595            group.clear();
596            group.push(current_line.clone());
597        }
598    }
599
600    Ok(())
601}
602
603/// Flush a group for --all-repeated mode (streaming).
604fn flush_all_repeated_stream(
605    writer: &mut impl Write,
606    group: &[Vec<u8>],
607    method: AllRepeatedMethod,
608    first_group_printed: &mut bool,
609    term: u8,
610) -> io::Result<()> {
611    if group.len() <= 1 {
612        return Ok(());
613    }
614
615    match method {
616        AllRepeatedMethod::Prepend => {
617            writer.write_all(&[term])?;
618        }
619        AllRepeatedMethod::Separate => {
620            if *first_group_printed {
621                writer.write_all(&[term])?;
622            }
623        }
624        AllRepeatedMethod::None => {}
625    }
626
627    for line in group {
628        let content = strip_term(line, term);
629        writer.write_all(content)?;
630        writer.write_all(&[term])?;
631    }
632
633    *first_group_printed = true;
634    Ok(())
635}
636
637/// Process --group mode (streaming).
638fn process_group_stream<R: BufRead, W: Write>(
639    mut reader: R,
640    writer: &mut W,
641    config: &UniqConfig,
642    method: GroupMethod,
643    term: u8,
644) -> io::Result<()> {
645    let mut prev_line: Vec<u8> = Vec::with_capacity(4096);
646    let mut current_line: Vec<u8> = Vec::with_capacity(4096);
647
648    if read_line_term(&mut reader, &mut prev_line, term)? == 0 {
649        return Ok(());
650    }
651
652    // Prepend/Both: separator before first group
653    if matches!(method, GroupMethod::Prepend | GroupMethod::Both) {
654        writer.write_all(&[term])?;
655    }
656
657    let content = strip_term(&prev_line, term);
658    writer.write_all(content)?;
659    writer.write_all(&[term])?;
660
661    loop {
662        current_line.clear();
663        let bytes_read = read_line_term(&mut reader, &mut current_line, term)?;
664
665        if bytes_read == 0 {
666            if matches!(method, GroupMethod::Append | GroupMethod::Both) {
667                writer.write_all(&[term])?;
668            }
669            break;
670        }
671
672        if !compare_lines_stream(&prev_line, &current_line, config, term) {
673            writer.write_all(&[term])?;
674        }
675
676        let content = strip_term(&current_line, term);
677        writer.write_all(content)?;
678        writer.write_all(&[term])?;
679
680        std::mem::swap(&mut prev_line, &mut current_line);
681    }
682
683    Ok(())
684}
685
686/// Read a line terminated by the given byte (newline or NUL).
687/// Returns number of bytes read (0 = EOF).
688#[inline(always)]
689fn read_line_term<R: BufRead>(reader: &mut R, buf: &mut Vec<u8>, term: u8) -> io::Result<usize> {
690    reader.read_until(term, buf)
691}