use std::io::{self, BufRead, BufReader, BufWriter, Read, Write};
/// Writes every byte of `slices` to `writer`, attempting a single vectored
/// write first.
///
/// If the vectored write accepts everything, the function returns
/// immediately. On a short write, the remaining bytes are written slice by
/// slice with `write_all`, resuming inside the first slice that was only
/// partially consumed.
///
/// # Errors
///
/// * `ErrorKind::WriteZero` if the vectored write accepts zero bytes while
///   there is data to write.
/// * Any other error returned by the writer, except `ErrorKind::Interrupted`
///   from the vectored write, which is retried (matching what `write_all`
///   does for the fallback path).
fn write_all_vectored(writer: &mut impl Write, slices: &[io::IoSlice<'_>]) -> io::Result<()> {
    // An interrupted write is documented as non-fatal; retry it rather than
    // failing the whole output.
    let written = loop {
        match writer.write_vectored(slices) {
            Ok(n) => break n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => {}
            Err(e) => return Err(e),
        }
    };
    let expected: usize = slices.iter().map(|s| s.len()).sum();
    if written >= expected {
        return Ok(());
    }
    // `written < expected` from here on, so there was something to write.
    if written == 0 {
        return Err(io::Error::new(
            io::ErrorKind::WriteZero,
            "write_vectored returned 0",
        ));
    }

    // Skip the slices that were fully written, finish the partially written
    // one, then write the rest in full.
    let mut consumed = written;
    for slice in slices {
        if consumed >= slice.len() {
            consumed -= slice.len();
            continue;
        }
        writer.write_all(&slice[consumed..])?;
        consumed = 0;
    }
    Ok(())
}
/// Delimiting method carried by `OutputMode::AllRepeated`.
///
/// NOTE(review): the variant names match GNU `uniq --all-repeated[=METHOD]`
/// (`none`, `prepend`, `separate`). The code that interprets them is not in
/// this part of the file, so confirm the exact semantics there.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AllRepeatedMethod {
None,
Prepend,
Separate,
}
/// Delimiting method carried by `OutputMode::Group`.
///
/// NOTE(review): the variant names match GNU `uniq --group[=METHOD]`
/// (`separate`, `prepend`, `append`, `both`). The code that interprets them
/// is not in this part of the file, so confirm the exact semantics there.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GroupMethod {
Separate,
Prepend,
Append,
Both,
}
/// Selects which lines of each run of equal adjacent lines are written.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OutputMode {
// One line per run of equal adjacent lines.
Default,
// Only runs that contain more than one line (one line per such run).
RepeatedOnly,
// Handled by a dedicated all-repeated pass, parameterised by the method.
AllRepeated(AllRepeatedMethod),
// Only runs that contain exactly one line.
UniqueOnly,
// Handled by a dedicated grouping pass, parameterised by the method.
Group(GroupMethod),
}
/// Options controlling how lines are compared and what is written.
#[derive(Debug, Clone)]
pub struct UniqConfig {
// Which lines of each run are emitted.
pub mode: OutputMode,
// When set, emitted lines are prefixed with the size of their run.
pub count: bool,
// Compare lines ASCII-case-insensitively.
pub ignore_case: bool,
// Number of leading blank-separated fields (space or tab) excluded from the comparison key.
pub skip_fields: usize,
// Number of bytes excluded from the key after the skipped fields.
pub skip_chars: usize,
// When `Some(n)`, at most `n` bytes of the remaining key are compared.
pub check_chars: Option<usize>,
// Use NUL instead of newline as the line terminator.
pub zero_terminated: bool,
}
/// Default configuration: `OutputMode::Default`, no counting, case-sensitive
/// whole-line comparison, newline-terminated lines.
impl Default for UniqConfig {
fn default() -> Self {
Self {
mode: OutputMode::Default,
count: false,
ignore_case: false,
skip_fields: 0,
skip_chars: 0,
check_chars: None,
zero_terminated: false,
}
}
}
/// Returns the part of `line` that takes part in comparisons under `config`.
///
/// The key is derived in three steps:
/// 1. drop `config.skip_fields` leading fields, where a field is a run of
///    blanks (space or tab) followed by a run of non-blanks;
/// 2. drop up to `config.skip_chars` further bytes;
/// 3. if `config.check_chars` is `Some(n)`, keep at most the first `n` bytes.
#[inline(always)]
fn get_compare_slice<'a>(line: &'a [u8], config: &UniqConfig) -> &'a [u8] {
    fn is_blank(byte: u8) -> bool {
        byte == b' ' || byte == b'\t'
    }

    let mut rest = line;
    for _ in 0..config.skip_fields {
        if rest.is_empty() {
            break;
        }
        // Leading blanks belong to the field that follows them.
        let blanks = rest
            .iter()
            .position(|&b| !is_blank(b))
            .unwrap_or(rest.len());
        rest = &rest[blanks..];
        let word = rest
            .iter()
            .position(|&b| is_blank(b))
            .unwrap_or(rest.len());
        rest = &rest[word..];
    }

    let rest = &rest[config.skip_chars.min(rest.len())..];

    match config.check_chars {
        Some(limit) if limit < rest.len() => &rest[..limit],
        _ => rest,
    }
}
/// Compares two lines by their comparison keys (see `get_compare_slice`),
/// ignoring ASCII case when `config.ignore_case` is set.
#[inline(always)]
fn lines_equal(a: &[u8], b: &[u8], config: &UniqConfig) -> bool {
    let key_a = get_compare_slice(a, config);
    let key_b = get_compare_slice(b, config);
    if !config.ignore_case {
        return key_a == key_b;
    }
    key_a.eq_ignore_ascii_case(key_b)
}
/// Returns `true` when `a` and `b` have the same length and are equal byte
/// for byte once ASCII letters are folded to a single case.
#[inline(always)]
fn lines_equal_case_insensitive(a: &[u8], b: &[u8]) -> bool {
    a.len() == b.len()
        && a.iter()
            .zip(b)
            .all(|(left, right)| left.eq_ignore_ascii_case(right))
}
/// Returns `true` when lines cannot be compared as whole byte strings because
/// fields or bytes are skipped, or the compared width is limited.
#[inline(always)]
fn needs_key_extraction(config: &UniqConfig) -> bool {
    let compares_whole_line =
        config.skip_fields == 0 && config.skip_chars == 0 && config.check_chars.is_none();
    !compares_whole_line
}
/// Byte-wise equality of two lines, tuned for short lines.
///
/// Lines of 9..=256 bytes are compared with unaligned 8-byte loads; shorter
/// and longer lines use ordinary slice comparison.
///
/// Every byte is covered by at least one load. For 17..=32 bytes the loads
/// are at offsets `0`, `8`, `len - 16` and `len - 8`; the two trailing loads
/// overlap the leading ones so that no gap is left between offset 16 and
/// `len - 8` (lengths 25..=32 would otherwise have uncompared bytes).
#[inline(always)]
fn lines_equal_fast(a: &[u8], b: &[u8]) -> bool {
    let alen = a.len();
    if alen != b.len() {
        return false;
    }
    if alen <= 8 || alen > 256 {
        return a == b;
    }

    let (ap, bp) = (a.as_ptr(), b.as_ptr());
    let same_word = |off: usize| {
        debug_assert!(off + 8 <= alen);
        // SAFETY: every call below passes `off` with `off + 8 <= alen`, and
        // both slices are exactly `alen` bytes long, so each unaligned 8-byte
        // read stays inside its slice.
        unsafe {
            (ap.add(off) as *const u64).read_unaligned()
                == (bp.add(off) as *const u64).read_unaligned()
        }
    };

    if !same_word(0) {
        return false;
    }
    if alen <= 16 {
        return same_word(alen - 8);
    }
    if alen <= 32 {
        // [0, 16) from the front plus [len - 16, len) from the back covers
        // the whole line because len - 16 <= 16.
        return same_word(8) && same_word(alen - 16) && same_word(alen - 8);
    }

    let mut off = 8usize;
    while off + 32 <= alen {
        if !(same_word(off)
            && same_word(off + 8)
            && same_word(off + 16)
            && same_word(off + 24))
        {
            return false;
        }
        off += 32;
    }
    while off + 8 <= alen {
        if !same_word(off) {
            return false;
        }
        off += 8;
    }
    // Fewer than 8 bytes remain; one overlapping load at the end covers them.
    off == alen || same_word(alen - 8)
}
/// Compares two equal-length lines whose first eight bytes are already known
/// to be equal.
///
/// Preconditions (checked with `debug_assert!` only): `a.len() == b.len()`
/// and `a.len() > 8`. The caller must have compared bytes `0..8` itself;
/// this function does not re-check them.
///
/// Lines up to 256 bytes are compared with unaligned 8-byte loads, longer
/// ones with slice comparison. For 17..=32 bytes the loads are at offsets
/// `8`, `len - 16` and `len - 8`, which together with the caller's prefix
/// check cover every byte (lengths 25..=32 would otherwise leave the bytes
/// between offset 16 and `len - 8` uncompared).
#[inline(always)]
fn lines_equal_after_prefix(a: &[u8], b: &[u8]) -> bool {
    let alen = a.len();
    debug_assert!(alen == b.len());
    debug_assert!(alen > 8);
    if alen > 256 {
        return a[8..] == b[8..];
    }

    let (ap, bp) = (a.as_ptr(), b.as_ptr());
    let same_word = |off: usize| {
        debug_assert!(off + 8 <= alen);
        // SAFETY: every call below passes `off` with `off + 8 <= alen`
        // (given the documented precondition `alen > 8`), and both slices are
        // `alen` bytes long, so each unaligned 8-byte read is in bounds.
        unsafe {
            (ap.add(off) as *const u64).read_unaligned()
                == (bp.add(off) as *const u64).read_unaligned()
        }
    };

    if alen <= 16 {
        return same_word(alen - 8);
    }
    if alen <= 32 {
        // [8, 16) plus [len - 16, len) covers everything after the prefix
        // because len - 16 <= 16.
        return same_word(8) && same_word(alen - 16) && same_word(alen - 8);
    }

    let mut off = 8usize;
    while off + 32 <= alen {
        if !(same_word(off)
            && same_word(off + 8)
            && same_word(off + 16)
            && same_word(off + 24))
        {
            return false;
        }
        off += 32;
    }
    while off + 8 <= alen {
        if !same_word(off) {
            return false;
        }
        off += 8;
    }
    // Fewer than 8 bytes remain; one overlapping load at the end covers them.
    off == alen || same_word(alen - 8)
}
/// Writes one counted line: the count right-aligned in a field of at least
/// seven characters, one space, the line bytes, then `term`.
///
/// Output of up to 256 bytes is assembled in a stack buffer and written with
/// a single `write_all`; longer output is written as three pieces.
///
/// The prefix is derived from `count` for every value, so a count of `0` is
/// formatted like any other number instead of panicking.
///
/// # Errors
///
/// Propagates any error from `out`.
#[inline(always)]
fn write_count_line(out: &mut impl Write, count: u64, line: &[u8], term: u8) -> io::Result<()> {
    let mut prefix = [b' '; 28];
    let prefix_len = if count <= 9 {
        // Single digit: six spaces, the digit, one space.
        prefix[6] = b'0' + count as u8;
        8
    } else {
        let digits = itoa_right_aligned_into(&mut prefix, count);
        let width = digits.max(7);
        prefix[width] = b' ';
        width + 1
    };
    let prefix = &prefix[..prefix_len];

    let total = prefix_len + line.len() + 1;
    if total <= 256 {
        let mut buf = [0u8; 256];
        let (head, tail) = buf[..total].split_at_mut(prefix_len);
        head.copy_from_slice(prefix);
        tail[..line.len()].copy_from_slice(line);
        tail[line.len()] = term;
        out.write_all(&buf[..total])
    } else {
        out.write_all(prefix)?;
        out.write_all(line)?;
        out.write_all(&[term])
    }
}
/// Formats `val` in decimal at the start of `buf`, right-aligned in a field
/// of at least seven bytes, and returns the field width
/// (`max(7, number of digits)`).
///
/// Padding positions are not written: the caller is expected to pass a
/// buffer pre-filled with spaces. The tail of the buffer is used as scratch
/// space and may contain leftover digits after the call.
#[inline(always)]
fn itoa_right_aligned_into(buf: &mut [u8; 28], mut val: u64) -> usize {
    const MIN_WIDTH: usize = 7;
    const SCRATCH_END: usize = 27;

    if val == 0 {
        buf[MIN_WIDTH - 1] = b'0';
        return MIN_WIDTH;
    }

    // Produce the digits back to front in the scratch area.
    let mut cursor = SCRATCH_END;
    loop {
        cursor -= 1;
        buf[cursor] = b'0' + (val % 10) as u8;
        val /= 10;
        if val == 0 {
            break;
        }
    }

    // Move them so the last digit ends at `max(MIN_WIDTH, digit_count)`.
    let digit_count = SCRATCH_END - cursor;
    let dest = MIN_WIDTH.saturating_sub(digit_count);
    buf.copy_within(cursor..SCRATCH_END, dest);
    digit_count.max(MIN_WIDTH)
}
/// Entry point: filters the lines in `data` according to `config` and writes
/// the result to `output`.
///
/// Lines are terminated by NUL when `config.zero_terminated` is set and by
/// newline otherwise.
///
/// Exact whole-line comparison in `Default`, `RepeatedOnly` or `UniqueOnly`
/// mode writes straight to `output`. Every other configuration goes through a
/// 16 MiB `BufWriter`, which is flushed before returning.
///
/// # Errors
///
/// Propagates any I/O error from `output`.
pub fn process_uniq_bytes(
    data: &[u8],
    mut output: impl Write,
    config: &UniqConfig,
) -> io::Result<()> {
    let term = if config.zero_terminated { b'\0' } else { b'\n' };

    let standard_mode = matches!(
        config.mode,
        OutputMode::Default | OutputMode::RepeatedOnly | OutputMode::UniqueOnly
    );
    let exact_whole_line = !config.ignore_case && !needs_key_extraction(config);
    if exact_whole_line && standard_mode {
        return process_standard_bytes(data, &mut output, config, term);
    }

    let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, output);
    match config.mode {
        OutputMode::Group(method) => {
            process_group_bytes(data, &mut writer, config, method, term)?
        }
        OutputMode::AllRepeated(method) => {
            process_all_repeated_bytes(data, &mut writer, config, method, term)?
        }
        OutputMode::Default | OutputMode::RepeatedOnly | OutputMode::UniqueOnly => {
            process_standard_bytes(data, &mut writer, config, term)?
        }
    }
    writer.flush()
}
/// Iterator over the lines of a byte buffer.
///
/// Each item is `(content, full)`: `content` is the line without its
/// terminator and `full` includes the terminator when one is present. A final
/// line without a terminator yields the same slice for both.
struct LineIter<'a> {
    data: &'a [u8],
    pos: usize,
    term: u8,
}

impl<'a> LineIter<'a> {
    /// Creates an iterator over `data`, splitting on `term`.
    #[inline(always)]
    fn new(data: &'a [u8], term: u8) -> Self {
        Self { data, pos: 0, term }
    }
}

impl<'a> Iterator for LineIter<'a> {
    type Item = (&'a [u8], &'a [u8]);

    #[inline(always)]
    fn next(&mut self) -> Option<Self::Item> {
        let start = self.pos;
        // `None` once the cursor has reached (or passed) the end of the data.
        let rest = self.data.get(start..).filter(|r| !r.is_empty())?;
        let (content_len, full_len) = match memchr::memchr(self.term, rest) {
            Some(idx) => (idx, idx + 1),
            None => (rest.len(), rest.len()),
        };
        self.pos = start + full_len;
        Some((&rest[..content_len], &rest[..full_len]))
    }
}
/// Returns line `idx` without its terminator.
///
/// `line_starts` holds the start offset of every line. A line ends one byte
/// before the next line's start (dropping the terminator); the last line ends
/// at `content_end`.
#[inline(always)]
fn line_content_at<'a>(
    data: &'a [u8],
    line_starts: &[usize],
    idx: usize,
    content_end: usize,
) -> &'a [u8] {
    let start = line_starts[idx];
    let end = match line_starts.get(idx + 1) {
        Some(&next_start) => next_start - 1,
        None => content_end,
    };
    &data[start..end]
}
/// Returns line `idx` including its terminator, if it has one.
///
/// The line runs from its own start offset to the next line's start, or to
/// the end of `data` for the last line.
#[inline(always)]
fn line_full_at<'a>(data: &'a [u8], line_starts: &[usize], idx: usize) -> &'a [u8] {
    let start = line_starts[idx];
    let end = line_starts
        .get(idx + 1)
        .copied()
        .unwrap_or(data.len());
    &data[start..end]
}
/// Finds the end of a run of back-to-back copies of a pattern.
///
/// The pattern is `data[pattern_start..pattern_start + pattern_len]` and one
/// copy of it is assumed to start at `dup_start`. Returns the offset just
/// past the last whole copy in the run that begins at `dup_start`.
///
/// If fewer than two whole patterns fit after `dup_start`, or `pattern_len`
/// is zero, the result is `dup_start + min(pattern_len, bytes remaining)`.
///
/// The run is first extended by comparing it against itself in doubling
/// blocks (1, 2, 4, ... copies), then finished one pattern at a time.
#[inline]
fn skip_dup_run(data: &[u8], dup_start: usize, pattern_start: usize, pattern_len: usize) -> usize {
    let total = data.len();
    if pattern_len == 0 || dup_start + 2 * pattern_len > total {
        return dup_start + pattern_len.min(total - dup_start);
    }

    let mut end = dup_start + pattern_len;
    let mut copies = 1usize;
    loop {
        let span = copies * pattern_len;
        if end + span > total {
            // A full block no longer fits; try the largest whole number of
            // patterns that does.
            let tail = ((total - end) / pattern_len) * pattern_len;
            if tail > 0 && data[dup_start..dup_start + tail] == data[end..end + tail] {
                end += tail;
            }
            break;
        }
        if data[dup_start..dup_start + span] != data[end..end + span] {
            break;
        }
        end += span;
        copies *= 2;
    }

    // Pick up any remaining copies after a failed block comparison.
    while end + pattern_len <= total
        && data[end..end + pattern_len] == data[pattern_start..pattern_start + pattern_len]
    {
        end += pattern_len;
    }
    end
}
/// Returns the index of the first line after `group_start` whose content
/// differs from that of line `group_start`.
///
/// When every line up to `num_lines` matches, the result is `num_lines`
/// (or `group_start + 1` if that is larger).
#[inline]
fn linear_scan_group_end(
    data: &[u8],
    line_starts: &[usize],
    group_start: usize,
    num_lines: usize,
    content_end: usize,
) -> usize {
    let key = line_content_at(data, line_starts, group_start, content_end);
    (group_start + 1..num_lines)
        .find(|&idx| {
            let candidate = line_content_at(data, line_starts, idx, content_end);
            candidate.len() != key.len() || !lines_equal_fast(key, candidate)
        })
        .unwrap_or_else(|| num_lines.max(group_start + 1))
}
/// Handles the `Default`, `RepeatedOnly` and `UniqueOnly` modes.
///
/// Whole-line comparisons (no skipped fields or characters, no width limit)
/// are dispatched to the dedicated single-pass routines, case-sensitive or
/// ASCII-case-insensitive. Everything else takes the generic path: index the
/// line starts, group adjacent lines with equal keys, and hand each group to
/// `output_group_bytes` together with its size.
///
/// Empty input produces no output.
///
/// # Errors
///
/// Propagates any I/O error from `writer`.
fn process_standard_bytes(
    data: &[u8],
    writer: &mut impl Write,
    config: &UniqConfig,
    term: u8,
) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    let whole_line = !needs_key_extraction(config);
    let fast = whole_line && !config.ignore_case;
    let fast_ci = whole_line && config.ignore_case;
    let is_default = matches!(config.mode, OutputMode::Default);
    let is_filter = matches!(
        config.mode,
        OutputMode::RepeatedOnly | OutputMode::UniqueOnly
    );

    if fast {
        if config.count {
            return process_count_fast_singlepass(data, writer, config, term);
        }
        if is_default {
            return process_default_fast_singlepass(data, writer, term);
        }
        if is_filter {
            return process_filter_fast_singlepass(data, writer, config, term);
        }
    }
    if fast_ci {
        if config.count {
            return process_count_ci_singlepass(data, writer, config, term);
        }
        if is_default {
            return process_default_ci_singlepass(data, writer, term);
        }
        if is_filter {
            return process_filter_ci_singlepass(data, writer, config, term);
        }
    }

    // Generic path. Line 0 always exists because `data` is non-empty; a
    // terminator that is the very last byte does not start another line.
    let mut line_starts: Vec<usize> = Vec::with_capacity((data.len() / 40).max(64));
    line_starts.push(0);
    line_starts.extend(
        memchr::memchr_iter(term, data)
            .map(|pos| pos + 1)
            .filter(|&start| start < data.len()),
    );
    let num_lines = line_starts.len();
    let content_end = if data.last() == Some(&term) {
        data.len() - 1
    } else {
        data.len()
    };

    let mut i = 0;
    while i < num_lines {
        let content = line_content_at(data, &line_starts, i, content_end);
        let full = line_full_at(data, &line_starts, i);
        let group_end = if fast {
            linear_scan_group_end(data, &line_starts, i, num_lines, content_end)
        } else {
            let mut j = i + 1;
            while j < num_lines
                && lines_equal(
                    content,
                    line_content_at(data, &line_starts, j, content_end),
                    config,
                )
            {
                j += 1;
            }
            j
        };
        let count = (group_end - i) as u64;
        output_group_bytes(writer, content, full, count, config, term)?;
        i = group_end;
    }
    Ok(())
}
/// Default-mode dedup for exact whole-line comparison: inputs of 4 MiB or
/// more go to the parallel implementation, smaller inputs to the sequential
/// one.
fn process_default_fast_singlepass(
    data: &[u8],
    writer: &mut impl Write,
    term: u8,
) -> io::Result<()> {
    const PARALLEL_THRESHOLD: usize = 4 * 1024 * 1024;
    if data.len() < PARALLEL_THRESHOLD {
        process_default_sequential(data, writer, term)
    } else {
        process_default_parallel(data, writer, term)
    }
}
/// Sequential default-mode dedup: writes the first line of every run of
/// identical adjacent lines.
///
/// Output is produced as large contiguous slices of `data` (everything
/// between two dropped duplicate runs), batched into vectored writes. If the
/// last line written has no terminator in the input, one is appended.
///
/// Details that matter for correctness:
///
/// * The end of the current line is first guessed at `cur_start + prev_len`.
///   The guess is only trusted when the candidate equals the previous line,
///   which contains no terminator. Otherwise the real end is searched for, so
///   a guess that happens to land on the terminator of a later line cannot
///   merge several short lines into one.
/// * Candidates are compared with plain slice equality after a cheap 8-byte
///   prefix rejection, so every byte is compared for every length.
/// * A terminator is appended only when the final, unterminated line is
///   actually part of the output, not when it was dropped as a duplicate.
/// * Empty input produces no output.
///
/// # Errors
///
/// Propagates any I/O error from `writer`.
fn process_default_sequential(data: &[u8], writer: &mut impl Write, term: u8) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }
    let data_len = data.len();
    let first_end = match memchr::memchr(term, data) {
        Some(pos) => pos,
        None => {
            // A single unterminated line.
            writer.write_all(data)?;
            return writer.write_all(&[term]);
        }
    };

    // First eight bytes of a line as an integer, or 0 for shorter lines.
    let prefix_of = |start: usize, len: usize| -> u64 {
        if len >= 8 {
            // SAFETY: callers pass a line with `start + len <= data.len()`,
            // so the 8 bytes at `start` are in bounds when `len >= 8`.
            unsafe { (data.as_ptr().add(start) as *const u64).read_unaligned() }
        } else {
            0
        }
    };

    const BATCH: usize = 256;
    let term_byte: [u8; 1] = [term];
    let mut slices: Vec<io::IoSlice<'_>> = Vec::with_capacity(BATCH);

    let mut prev_start = 0usize;
    let mut prev_len = first_end;
    let mut prev_prefix = prefix_of(prev_start, prev_len);
    // Start of the pending output run (input not yet handed to `slices`).
    let mut run_start = 0usize;
    let mut cur_start = first_end + 1;

    while cur_start < data_len {
        let speculative = cur_start + prev_len;
        let speculated = speculative < data_len && data[speculative] == term;
        let mut cur_end = if speculated {
            speculative
        } else {
            memchr::memchr(term, &data[cur_start..]).map_or(data_len, |off| cur_start + off)
        };

        let cur_prefix = prefix_of(cur_start, cur_end - cur_start);
        let is_dup = cur_end - cur_start == prev_len
            && cur_prefix == prev_prefix
            && data[prev_start..prev_start + prev_len] == data[cur_start..cur_end];

        if is_dup {
            if run_start < cur_start {
                slices.push(io::IoSlice::new(&data[run_start..cur_start]));
                if slices.len() >= BATCH {
                    write_all_vectored(writer, &slices)?;
                    slices.clear();
                }
            }
            let skip_end = skip_dup_run(data, cur_start, prev_start, prev_len + 1);
            run_start = skip_end;
            cur_start = skip_end;
            continue;
        }

        if speculated {
            // The guessed end was never validated by a successful comparison;
            // the line may really end earlier.
            if let Some(off) = memchr::memchr(term, &data[cur_start..speculative]) {
                cur_end = cur_start + off;
            }
        }
        prev_start = cur_start;
        prev_len = cur_end - cur_start;
        // `cur_prefix` was read for a length >= the final one, so it is still
        // the right value whenever the final length is >= 8.
        prev_prefix = if prev_len >= 8 { cur_prefix } else { 0 };

        if cur_end >= data_len {
            break;
        }
        cur_start = cur_end + 1;
    }

    if run_start < data_len {
        slices.push(io::IoSlice::new(&data[run_start..]));
        if data[data_len - 1] != term {
            slices.push(io::IoSlice::new(&term_byte));
        }
    }
    if !slices.is_empty() {
        write_all_vectored(writer, &slices)?;
    }
    Ok(())
}
/// Parallel default-mode dedup.
///
/// `data` is split at line boundaries into roughly one chunk per rayon
/// thread. Each chunk is deduplicated independently into a list of byte
/// ranges to emit. The ranges are then written in order, dropping the first
/// line of a chunk when it equals the last line kept from the previous chunk.
/// Falls back to `process_default_sequential` when only one chunk results.
///
/// Details that matter for correctness:
///
/// * Within a chunk, the end of the current line is first guessed at
///   `cur_start + prev_len`. When the candidate is not a duplicate the real
///   end is searched for, so a guess that lands on a later line's terminator
///   cannot merge several short lines into one.
/// * A terminator is appended at the end only when the unterminated final
///   line was actually written, not when it was dropped as a duplicate.
/// * Chunk boundaries are always strictly inside `data`, so no chunk is
///   empty.
///
/// # Errors
///
/// Propagates any I/O error from `writer`.
fn process_default_parallel(data: &[u8], writer: &mut impl Write, term: u8) -> io::Result<()> {
    use rayon::prelude::*;

    /// Ranges to emit for one chunk plus its first line and last kept line
    /// (absolute offsets, terminators excluded from the line ranges).
    struct ChunkResult {
        runs: Vec<(usize, usize)>,
        first_line_start: usize,
        first_line_end: usize,
        last_line_start: usize,
        last_line_end: usize,
    }

    /// Deduplicates `data[chunk_start..chunk_end]` on its own.
    fn scan_chunk(data: &[u8], chunk_start: usize, chunk_end: usize, term: u8) -> ChunkResult {
        let chunk = &data[chunk_start..chunk_end];
        let chunk_len = chunk.len();
        let first_term = match memchr::memchr(term, chunk) {
            Some(pos) => pos,
            None => {
                // The chunk is a single unterminated line.
                return ChunkResult {
                    runs: vec![(chunk_start, chunk_end)],
                    first_line_start: chunk_start,
                    first_line_end: chunk_end,
                    last_line_start: chunk_start,
                    last_line_end: chunk_end,
                };
            }
        };

        // First eight bytes of a line as an integer, or 0 for shorter lines.
        let prefix_of = |start: usize, len: usize| -> u64 {
            if len >= 8 {
                // SAFETY: callers pass a line with `start + len <= chunk.len()`,
                // so the 8 bytes at `start` are in bounds when `len >= 8`.
                unsafe { (chunk.as_ptr().add(start) as *const u64).read_unaligned() }
            } else {
                0
            }
        };

        let mut runs: Vec<(usize, usize)> = Vec::new();
        // Absolute offset where the pending output run starts.
        let mut run_start = chunk_start;
        // Chunk-relative position of the last kept line.
        let mut prev_start = 0usize;
        let mut prev_len = first_term;
        let mut prev_prefix = prefix_of(prev_start, prev_len);
        let mut last_out_start = chunk_start;
        let mut last_out_end = chunk_start + first_term;
        let mut cur_start = first_term + 1;

        while cur_start < chunk_len {
            let speculative = cur_start + prev_len;
            let speculated = speculative < chunk_len && chunk[speculative] == term;
            let mut cur_end = if speculated {
                speculative
            } else {
                memchr::memchr(term, &chunk[cur_start..]).map_or(chunk_len, |off| cur_start + off)
            };
            let cur_len = cur_end - cur_start;
            let cur_prefix = prefix_of(cur_start, cur_len);

            let is_dup = cur_len == prev_len
                && if cur_len >= 8 {
                    cur_prefix == prev_prefix
                        && (cur_len == 8
                            || lines_equal_after_prefix(
                                &chunk[prev_start..prev_start + prev_len],
                                &chunk[cur_start..cur_end],
                            ))
                } else {
                    chunk[prev_start..prev_start + prev_len] == chunk[cur_start..cur_end]
                };

            if is_dup {
                let abs_cur = chunk_start + cur_start;
                if run_start < abs_cur {
                    runs.push((run_start, abs_cur));
                }
                let skip_end = skip_dup_run(chunk, cur_start, prev_start, prev_len + 1);
                run_start = chunk_start + skip_end;
                cur_start = skip_end;
                continue;
            }

            if speculated {
                // The guessed end was never validated by a successful
                // comparison; the line may really end earlier.
                if let Some(off) = memchr::memchr(term, &chunk[cur_start..speculative]) {
                    cur_end = cur_start + off;
                }
            }
            last_out_start = chunk_start + cur_start;
            last_out_end = chunk_start + cur_end;
            prev_start = cur_start;
            prev_len = cur_end - cur_start;
            prev_prefix = if prev_len >= 8 { cur_prefix } else { 0 };

            if cur_end >= chunk_len {
                break;
            }
            cur_start = cur_end + 1;
        }

        if run_start < chunk_end {
            runs.push((run_start, chunk_end));
        }
        ChunkResult {
            runs,
            first_line_start: chunk_start,
            first_line_end: chunk_start + first_term,
            last_line_start: last_out_start,
            last_line_end: last_out_end,
        }
    }

    // Pick chunk boundaries just after a terminator, strictly increasing and
    // strictly inside the data.
    let num_threads = rayon::current_num_threads().max(1);
    let chunk_target = data.len() / num_threads;
    let mut boundaries = Vec::with_capacity(num_threads + 1);
    boundaries.push(0usize);
    let mut last_boundary = 0usize;
    for i in 1..num_threads {
        let target = i * chunk_target;
        if target >= data.len() {
            break;
        }
        if let Some(p) = memchr::memchr(term, &data[target..]) {
            let b = target + p + 1;
            if b > last_boundary && b < data.len() {
                boundaries.push(b);
                last_boundary = b;
            }
        }
    }
    boundaries.push(data.len());
    if boundaries.len() - 1 <= 1 {
        return process_default_sequential(data, writer, term);
    }

    let results: Vec<ChunkResult> = boundaries
        .windows(2)
        .collect::<Vec<_>>()
        .par_iter()
        .map(|w| scan_chunk(data, w[0], w[1], term))
        .collect();

    const BATCH: usize = 256;
    let mut slices: Vec<io::IoSlice<'_>> = Vec::with_capacity(BATCH);
    // End offset of the last range actually queued for output.
    let mut emitted_end = 0usize;
    for (i, result) in results.iter().enumerate() {
        let skip_first = i > 0 && {
            let prev = &results[i - 1];
            lines_equal_fast(
                &data[prev.last_line_start..prev.last_line_end],
                &data[result.first_line_start..result.first_line_end],
            )
        };
        // When the first line repeats the previous chunk's last kept line,
        // start output after that line and its terminator.
        let skip_end = if skip_first {
            result.first_line_end + 1
        } else {
            0
        };
        for &(rs, re) in &result.runs {
            let actual_start = rs.max(skip_end);
            if actual_start < re {
                slices.push(io::IoSlice::new(&data[actual_start..re]));
                emitted_end = re;
                if slices.len() >= BATCH {
                    write_all_vectored(writer, &slices)?;
                    slices.clear();
                }
            }
        }
    }
    if !slices.is_empty() {
        write_all_vectored(writer, &slices)?;
    }
    // Terminate the output only if the unterminated final line was written.
    if emitted_end == data.len() && matches!(data.last(), Some(&last) if last != term) {
        writer.write_all(&[term])?;
    }
    Ok(())
}
/// Single-pass `RepeatedOnly` / `UniqueOnly` filter for exact whole-line
/// comparison.
///
/// For each run of identical adjacent lines, the first line is written
/// (followed by `term`) when the run has more than one line in
/// `RepeatedOnly` mode, or exactly one line otherwise. Output is batched into
/// vectored writes.
///
/// Details that matter for correctness:
///
/// * The end of the current line is first guessed at `cur_start + prev_len`.
///   When the candidate is not a duplicate the real end is searched for, so a
///   guess that lands on a later line's terminator cannot merge several short
///   lines into one.
/// * The number of lines consumed by `skip_dup_run` is rounded up, so a final
///   duplicate without a terminator is still counted.
/// * Empty input produces no output.
///
/// # Errors
///
/// Propagates any I/O error from `writer`.
fn process_filter_fast_singlepass(
    data: &[u8],
    writer: &mut impl Write,
    config: &UniqConfig,
    term: u8,
) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }
    let repeated = matches!(config.mode, OutputMode::RepeatedOnly);
    let should_print = |count: u64| if repeated { count > 1 } else { count == 1 };
    let data_len = data.len();
    let first_term = match memchr::memchr(term, data) {
        Some(pos) => pos,
        None => {
            // A single unterminated line: a run of exactly one.
            if !repeated {
                writer.write_all(data)?;
                writer.write_all(&[term])?;
            }
            return Ok(());
        }
    };

    // First eight bytes of a line as an integer, or 0 for shorter lines.
    let prefix_of = |start: usize, len: usize| -> u64 {
        if len >= 8 {
            // SAFETY: callers pass a line with `start + len <= data.len()`,
            // so the 8 bytes at `start` are in bounds when `len >= 8`.
            unsafe { (data.as_ptr().add(start) as *const u64).read_unaligned() }
        } else {
            0
        }
    };

    const BATCH: usize = 512;
    let term_slice: [u8; 1] = [term];
    let mut slices: Vec<io::IoSlice<'_>> = Vec::with_capacity(BATCH * 2);

    let mut prev_start = 0usize;
    let mut prev_end = first_term;
    let mut prev_len = prev_end;
    let mut prev_prefix = prefix_of(prev_start, prev_len);
    let mut count: u64 = 1;
    let mut cur_start = first_term + 1;

    while cur_start < data_len {
        let speculative = cur_start + prev_len;
        let speculated = speculative < data_len && data[speculative] == term;
        let mut cur_end = if speculated {
            speculative
        } else {
            memchr::memchr(term, &data[cur_start..]).map_or(data_len, |off| cur_start + off)
        };
        let cur_len = cur_end - cur_start;
        let cur_prefix = prefix_of(cur_start, cur_len);

        let is_dup = cur_len == prev_len
            && if cur_len >= 8 {
                cur_prefix == prev_prefix
                    && (cur_len == 8
                        || lines_equal_after_prefix(
                            &data[prev_start..prev_end],
                            &data[cur_start..cur_end],
                        ))
            } else {
                data[prev_start..prev_end] == data[cur_start..cur_end]
            };

        if is_dup {
            let pattern_len = prev_len + 1;
            let skip_end = skip_dup_run(data, cur_start, prev_start, pattern_len);
            // Round up: an unterminated final duplicate is one byte short of
            // a full `line + terminator` pattern but is still a line.
            let skipped = (skip_end - cur_start + pattern_len - 1) / pattern_len;
            count += skipped as u64;
            cur_start = skip_end;
            continue;
        }

        if speculated {
            // The guessed end was never validated by a successful comparison;
            // the line may really end earlier.
            if let Some(off) = memchr::memchr(term, &data[cur_start..speculative]) {
                cur_end = cur_start + off;
            }
        }

        if should_print(count) {
            slices.push(io::IoSlice::new(&data[prev_start..prev_end]));
            slices.push(io::IoSlice::new(&term_slice));
            if slices.len() >= BATCH * 2 {
                write_all_vectored(writer, &slices)?;
                slices.clear();
            }
        }
        prev_start = cur_start;
        prev_end = cur_end;
        prev_len = cur_end - cur_start;
        prev_prefix = if prev_len >= 8 { cur_prefix } else { 0 };
        count = 1;

        if cur_end >= data_len {
            break;
        }
        cur_start = cur_end + 1;
    }

    if should_print(count) {
        slices.push(io::IoSlice::new(&data[prev_start..prev_end]));
        slices.push(io::IoSlice::new(&term_slice));
    }
    if !slices.is_empty() {
        write_all_vectored(writer, &slices)?;
    }
    Ok(())
}
/// Single-pass counting for exact whole-line comparison.
///
/// For each run of identical adjacent lines, writes the run length as a
/// right-aligned prefix followed by the run's first line and `term`.
/// `RepeatedOnly` keeps only runs longer than one line, `UniqueOnly` only
/// runs of exactly one line; any other mode keeps every run. Output is
/// collected in batches of `BATCH` groups and flushed with vectored writes.
///
/// Details that matter for correctness:
///
/// * The end of the current line is first guessed at `cur_start + prev_len`.
///   When the candidate is not a duplicate the real end is searched for, so a
///   guess that lands on a later line's terminator cannot merge several short
///   lines into one.
/// * The number of lines consumed by `skip_dup_run` is rounded up, so a final
///   duplicate without a terminator is still counted.
/// * Empty input produces no output.
///
/// # Errors
///
/// Propagates any I/O error from `writer`.
fn process_count_fast_singlepass(
    data: &[u8],
    writer: &mut impl Write,
    config: &UniqConfig,
    term: u8,
) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }
    let data_len = data.len();
    let should_print = |count: u64| match config.mode {
        OutputMode::RepeatedOnly => count > 1,
        OutputMode::UniqueOnly => count == 1,
        _ => true,
    };
    let first_term = match memchr::memchr(term, data) {
        Some(pos) => pos,
        None => {
            // A single unterminated line: a run of exactly one.
            if should_print(1) {
                write_count_line(writer, 1, data, term)?;
            }
            return Ok(());
        }
    };

    // First eight bytes of a line as an integer, or 0 for shorter lines.
    let prefix_of = |start: usize, len: usize| -> u64 {
        if len >= 8 {
            // SAFETY: callers pass a line with `start + len <= data.len()`,
            // so the 8 bytes at `start` are in bounds when `len >= 8`.
            unsafe { (data.as_ptr().add(start) as *const u64).read_unaligned() }
        } else {
            0
        }
    };

    const BATCH: usize = 340;
    // Bytes reserved per formatted count prefix; must match the slot size
    // assumed by `flush_count_groups`.
    const PREFIX_SLOT: usize = 28;
    let term_slice: [u8; 1] = [term];
    let mut prefix_buf = vec![b' '; BATCH * PREFIX_SLOT];
    // (prefix length, line start, line end) for each pending group.
    let mut groups: Vec<(usize, usize, usize)> = Vec::with_capacity(BATCH);

    let mut prev_start = 0usize;
    let mut prev_end = first_term;
    let mut prev_len = prev_end;
    let mut prev_prefix = prefix_of(prev_start, prev_len);
    let mut count: u64 = 1;
    let mut cur_start = first_term + 1;

    while cur_start < data_len {
        let speculative = cur_start + prev_len;
        let speculated = speculative < data_len && data[speculative] == term;
        let mut cur_end = if speculated {
            speculative
        } else {
            memchr::memchr(term, &data[cur_start..]).map_or(data_len, |off| cur_start + off)
        };
        let cur_len = cur_end - cur_start;
        let cur_prefix = prefix_of(cur_start, cur_len);

        let is_dup = cur_len == prev_len
            && if cur_len >= 8 {
                cur_prefix == prev_prefix
                    && (cur_len == 8
                        || lines_equal_after_prefix(
                            &data[prev_start..prev_end],
                            &data[cur_start..cur_end],
                        ))
            } else {
                data[prev_start..prev_end] == data[cur_start..cur_end]
            };

        if is_dup {
            let pattern_len = prev_len + 1;
            let skip_end = skip_dup_run(data, cur_start, prev_start, pattern_len);
            // Round up: an unterminated final duplicate is one byte short of
            // a full `line + terminator` pattern but is still a line.
            let skipped = (skip_end - cur_start + pattern_len - 1) / pattern_len;
            count += skipped as u64;
            cur_start = skip_end;
            continue;
        }

        if speculated {
            // The guessed end was never validated by a successful comparison;
            // the line may really end earlier.
            if let Some(off) = memchr::memchr(term, &data[cur_start..speculative]) {
                cur_end = cur_start + off;
            }
        }

        if should_print(count) {
            let prefix_off = groups.len() * PREFIX_SLOT;
            let prefix_len = format_count_prefix_into(
                count,
                &mut prefix_buf[prefix_off..prefix_off + PREFIX_SLOT],
            );
            groups.push((prefix_len, prev_start, prev_end));
            if groups.len() >= BATCH {
                flush_count_groups(writer, &prefix_buf, &groups, &term_slice, data)?;
                groups.clear();
                // Restore the space padding the prefix formatter relies on.
                prefix_buf.fill(b' ');
            }
        }
        prev_start = cur_start;
        prev_end = cur_end;
        prev_len = cur_end - cur_start;
        prev_prefix = if prev_len >= 8 { cur_prefix } else { 0 };
        count = 1;

        if cur_end >= data_len {
            break;
        }
        cur_start = cur_end + 1;
    }

    if should_print(count) {
        let prefix_off = groups.len() * PREFIX_SLOT;
        let prefix_len =
            format_count_prefix_into(count, &mut prefix_buf[prefix_off..prefix_off + PREFIX_SLOT]);
        groups.push((prefix_len, prev_start, prev_end));
    }
    if !groups.is_empty() {
        flush_count_groups(writer, &prefix_buf, &groups, &term_slice, data)?;
    }
    Ok(())
}
/// Writes a batch of counted lines with one vectored write.
///
/// Group `i` is `(prefix_len, line_start, line_end)`: its count prefix is the
/// first `prefix_len` bytes of the `i`-th 28-byte slot of `prefix_buf`, and
/// its line is `data[line_start..line_end]`. Each group is emitted as prefix,
/// line, then `term_slice`.
#[inline]
fn flush_count_groups(
    writer: &mut impl Write,
    prefix_buf: &[u8],
    groups: &[(usize, usize, usize)],
    term_slice: &[u8; 1],
    data: &[u8],
) -> io::Result<()> {
    const PREFIX_SLOT: usize = 28;
    let mut iov: Vec<io::IoSlice<'_>> = Vec::with_capacity(3 * groups.len());
    for (slot, group) in groups.iter().enumerate() {
        let (prefix_len, line_start, line_end) = *group;
        let slot_start = slot * PREFIX_SLOT;
        let prefix = &prefix_buf[slot_start..slot_start + prefix_len];
        let line = &data[line_start..line_end];
        iov.push(io::IoSlice::new(prefix));
        iov.push(io::IoSlice::new(line));
        iov.push(io::IoSlice::new(term_slice));
    }
    write_all_vectored(writer, &iov)
}
/// Formats the count prefix (the count right-aligned in at least seven
/// columns, followed by one space) at the start of `buf` and returns its
/// length in bytes.
///
/// The padding is always written, so the result does not depend on what
/// `buf` contained before the call. `buf` must be at least 8 bytes long for
/// single-digit counts and long enough for the formatted number plus one
/// space otherwise (at most 21 bytes for a `u64`).
#[inline(always)]
fn format_count_prefix_into(count: u64, buf: &mut [u8]) -> usize {
    if count <= 9 {
        // Write the padding explicitly rather than relying on the caller
        // having pre-filled the slot with spaces.
        buf[..6].fill(b' ');
        buf[6] = b'0' + count as u8;
        buf[7] = b' ';
        return 8;
    }
    let mut tmp = [b' '; 28];
    let digits = itoa_right_aligned_into(&mut tmp, count);
    let width = digits.max(7);
    tmp[width] = b' ';
    let len = width + 1;
    buf[..len].copy_from_slice(&tmp[..len]);
    len
}
/// Single-pass default-mode dedup with ASCII-case-insensitive whole-line
/// comparison: writes the first line of every run of adjacent lines that are
/// equal ignoring ASCII case.
///
/// Output is produced as contiguous slices of `data` between dropped
/// duplicates, batched into vectored writes. If the last line written has no
/// terminator in the input, one is appended.
///
/// Candidates are rejected early by comparing the first eight bytes with bit
/// 5 of every byte cleared; bytes that are equal ignoring ASCII case are
/// always equal under that mask, so the check never rejects a real match.
///
/// Details that matter for correctness:
///
/// * The end of the current line is first guessed at `cur_start + prev_len`.
///   When the candidate is not a duplicate the real end is searched for, so a
///   guess that lands on a later line's terminator cannot merge several short
///   lines into one.
/// * A terminator is appended only when the final, unterminated line is
///   actually part of the output, not when it was dropped as a duplicate.
/// * Empty input produces no output.
///
/// # Errors
///
/// Propagates any I/O error from `writer`.
fn process_default_ci_singlepass(data: &[u8], writer: &mut impl Write, term: u8) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }
    let data_len = data.len();
    let first_end = match memchr::memchr(term, data) {
        Some(pos) => pos,
        None => {
            // A single unterminated line.
            writer.write_all(data)?;
            return writer.write_all(&[term]);
        }
    };

    const CASE_MASK: u64 = 0xDFDFDFDFDFDFDFDFu64;
    // Case-masked first eight bytes of a line, or 0 for shorter lines.
    let masked_prefix_of = |start: usize, len: usize| -> u64 {
        if len >= 8 {
            // SAFETY: callers pass a line with `start + len <= data.len()`,
            // so the 8 bytes at `start` are in bounds when `len >= 8`.
            unsafe { (data.as_ptr().add(start) as *const u64).read_unaligned() & CASE_MASK }
        } else {
            0
        }
    };

    const BATCH: usize = 256;
    let term_byte: [u8; 1] = [term];
    let mut slices: Vec<io::IoSlice<'_>> = Vec::with_capacity(BATCH);

    let mut prev_start = 0usize;
    let mut prev_len = first_end;
    let mut prev_prefix_upper = masked_prefix_of(prev_start, prev_len);
    // Start of the pending output run (input not yet handed to `slices`).
    let mut run_start = 0usize;
    let mut cur_start = first_end + 1;

    while cur_start < data_len {
        let speculative = cur_start + prev_len;
        let speculated = speculative < data_len && data[speculative] == term;
        let mut cur_end = if speculated {
            speculative
        } else {
            memchr::memchr(term, &data[cur_start..]).map_or(data_len, |off| cur_start + off)
        };
        let cur_prefix_upper = masked_prefix_of(cur_start, cur_end - cur_start);

        let is_dup = cur_end - cur_start == prev_len
            && cur_prefix_upper == prev_prefix_upper
            && lines_equal_case_insensitive(
                &data[prev_start..prev_start + prev_len],
                &data[cur_start..cur_end],
            );

        if is_dup {
            // Emit what is pending and resume the run after this line; the
            // first line of the group stays the comparison reference.
            if run_start < cur_start {
                slices.push(io::IoSlice::new(&data[run_start..cur_start]));
                if slices.len() >= BATCH {
                    write_all_vectored(writer, &slices)?;
                    slices.clear();
                }
            }
            run_start = if cur_end < data_len {
                cur_end + 1
            } else {
                cur_end
            };
        } else {
            if speculated {
                // The guessed end was never validated by a successful
                // comparison; the line may really end earlier.
                if let Some(off) = memchr::memchr(term, &data[cur_start..speculative]) {
                    cur_end = cur_start + off;
                }
            }
            prev_start = cur_start;
            prev_len = cur_end - cur_start;
            prev_prefix_upper = if prev_len >= 8 { cur_prefix_upper } else { 0 };
        }

        if cur_end >= data_len {
            break;
        }
        cur_start = cur_end + 1;
    }

    if run_start < data_len {
        slices.push(io::IoSlice::new(&data[run_start..]));
        if data[data_len - 1] != term {
            slices.push(io::IoSlice::new(&term_byte));
        }
    }
    if !slices.is_empty() {
        write_all_vectored(writer, &slices)?;
    }
    Ok(())
}
/// Emits one representative line per run of adjacent lines that compare equal
/// under `lines_equal_case_insensitive`, filtered by run length.
///
/// With `OutputMode::RepeatedOnly` only runs of two or more lines are printed;
/// every other mode is treated as "unique only" and prints runs of exactly one
/// line. The first line of a run is the one printed, always followed by
/// `term`. Output is batched through `write_all_vectored`.
///
/// Empty input produces no output.
///
/// # Errors
///
/// Returns any I/O error reported by `writer`.
fn process_filter_ci_singlepass(
    data: &[u8],
    writer: &mut impl Write,
    config: &UniqConfig,
    term: u8,
) -> io::Result<()> {
    const BATCH: usize = 512;

    // No lines at all: emit nothing (not even a terminator).
    if data.is_empty() {
        return Ok(());
    }
    let repeated = matches!(config.mode, OutputMode::RepeatedOnly);
    let data_len = data.len();
    let first_term = match memchr::memchr(term, data) {
        Some(pos) => pos,
        None => {
            // A single unterminated line is a run of length one.
            if !repeated {
                writer.write_all(data)?;
                writer.write_all(&[term])?;
            }
            return Ok(());
        }
    };

    let term_slice = [term];
    // Two slices (line + terminator) per printed run.
    let mut slices: Vec<io::IoSlice<'_>> = Vec::with_capacity(BATCH * 2);

    // `prev_*` describe the first line of the current run; `count` its length.
    let mut prev_start: usize = 0;
    let mut prev_end: usize = first_term;
    let mut count: u64 = 1;
    let mut cur_start = first_term + 1;

    while cur_start < data_len {
        // Always locate the real terminator. Probing only the byte at
        // `cur_start + prev_len` would accept a terminator there even when an
        // earlier one exists, printing several lines as one record.
        let cur_end = memchr::memchr(term, &data[cur_start..])
            .map_or(data_len, |offset| cur_start + offset);
        let prev_line = &data[prev_start..prev_end];
        let cur_line = &data[cur_start..cur_end];

        if cur_line.len() == prev_line.len() && lines_equal_case_insensitive(prev_line, cur_line) {
            count += 1;
        } else {
            // `count >= 1`, so "repeated" and "unique" are exact complements.
            if (count > 1) == repeated {
                slices.push(io::IoSlice::new(prev_line));
                slices.push(io::IoSlice::new(&term_slice));
                if slices.len() >= BATCH * 2 {
                    write_all_vectored(writer, &slices)?;
                    slices.clear();
                }
            }
            prev_start = cur_start;
            prev_end = cur_end;
            count = 1;
        }

        if cur_end >= data_len {
            break;
        }
        cur_start = cur_end + 1;
    }

    // The final run is still pending when the input is exhausted.
    if (count > 1) == repeated {
        slices.push(io::IoSlice::new(&data[prev_start..prev_end]));
        slices.push(io::IoSlice::new(&term_slice));
    }
    if !slices.is_empty() {
        write_all_vectored(writer, &slices)?;
    }
    Ok(())
}
/// Emits one count-prefixed line per run of adjacent lines that are equal
/// ignoring ASCII case, filtered according to `config.mode`.
///
/// `OutputMode::RepeatedOnly` prints only runs of two or more lines,
/// `OutputMode::UniqueOnly` only runs of exactly one line, and every other
/// mode prints all runs. The first line of each run is the one printed.
///
/// Prefixes are rendered by `format_count_prefix_into` into fixed-size slots
/// of a shared buffer and handed to `flush_count_groups` in batches together
/// with the `(prefix_len, line_start, line_end)` triples.
///
/// Empty input produces no output.
///
/// # Errors
///
/// Returns any I/O error reported by `writer`.
fn process_count_ci_singlepass(
    data: &[u8],
    writer: &mut impl Write,
    config: &UniqConfig,
    term: u8,
) -> io::Result<()> {
    const BATCH: usize = 340;
    const PREFIX_SLOT: usize = 28;

    // No lines at all: emit nothing (not even a "1" count line).
    if data.is_empty() {
        return Ok(());
    }

    // Whether a finished run of `run_len` lines should be printed.
    let should_print = |run_len: u64| match config.mode {
        OutputMode::RepeatedOnly => run_len > 1,
        OutputMode::UniqueOnly => run_len == 1,
        OutputMode::Default | OutputMode::AllRepeated(_) | OutputMode::Group(_) => true,
    };

    let first_term = match memchr::memchr(term, data) {
        Some(pos) => pos,
        None => {
            // A single unterminated line is a run of length one.
            if should_print(1) {
                write_count_line(writer, 1, data, term)?;
            }
            return Ok(());
        }
    };

    let data_len = data.len();
    let term_slice: [u8; 1] = [term];
    // One `PREFIX_SLOT`-byte slot per queued group, indexed by group position.
    let mut prefix_buf = vec![b' '; BATCH * PREFIX_SLOT];
    let mut groups: Vec<(usize, usize, usize)> = Vec::with_capacity(BATCH);

    // `prev_*` describe the first line of the current run; `count` its length.
    let mut prev_start: usize = 0;
    let mut prev_end: usize = first_term;
    let mut count: u64 = 1;
    let mut cur_start = first_term + 1;

    while cur_start < data_len {
        // Always locate the real terminator. Probing only the byte at
        // `cur_start + prev_len` would accept a terminator there even when an
        // earlier one exists, counting several lines as one record.
        let cur_end = memchr::memchr(term, &data[cur_start..])
            .map_or(data_len, |offset| cur_start + offset);

        // `eq_ignore_ascii_case` already rejects slices of different length.
        if data[prev_start..prev_end].eq_ignore_ascii_case(&data[cur_start..cur_end]) {
            count += 1;
        } else {
            if should_print(count) {
                let slot = groups.len() * PREFIX_SLOT;
                let prefix_len =
                    format_count_prefix_into(count, &mut prefix_buf[slot..slot + PREFIX_SLOT]);
                groups.push((prefix_len, prev_start, prev_end));
                if groups.len() >= BATCH {
                    flush_count_groups(writer, &prefix_buf, &groups, &term_slice, data)?;
                    groups.clear();
                    // Reset the slots to spaces before they are reused.
                    prefix_buf.fill(b' ');
                }
            }
            prev_start = cur_start;
            prev_end = cur_end;
            count = 1;
        }

        if cur_end >= data_len {
            break;
        }
        cur_start = cur_end + 1;
    }

    // The final run is still pending when the input is exhausted. `groups` is
    // always below `BATCH` here, so its slot is in range.
    if should_print(count) {
        let slot = groups.len() * PREFIX_SLOT;
        let prefix_len = format_count_prefix_into(count, &mut prefix_buf[slot..slot + PREFIX_SLOT]);
        groups.push((prefix_len, prev_start, prev_end));
    }
    if !groups.is_empty() {
        flush_count_groups(writer, &prefix_buf, &groups, &term_slice, data)?;
    }
    Ok(())
}
/// Emits the representative line of one finished run, if `config.mode` wants it.
///
/// `content` is the line without its terminator and `full` the line as it
/// appears in the input; the two have equal length exactly when the input line
/// had no terminator, in which case `term` is appended. With `config.count`
/// set the line is written through `write_count_line` instead.
///
/// `OutputMode::RepeatedOnly` prints only when `count > 1`,
/// `OutputMode::UniqueOnly` only when `count == 1`; all other modes always print.
#[inline(always)]
fn output_group_bytes(
    writer: &mut impl Write,
    content: &[u8],
    full: &[u8],
    count: u64,
    config: &UniqConfig,
    term: u8,
) -> io::Result<()> {
    let wanted = match config.mode {
        OutputMode::RepeatedOnly => count > 1,
        OutputMode::UniqueOnly => count == 1,
        OutputMode::Default | OutputMode::AllRepeated(_) | OutputMode::Group(_) => true,
    };
    if !wanted {
        return Ok(());
    }

    if config.count {
        write_count_line(writer, count, content, term)?;
        return Ok(());
    }

    writer.write_all(full)?;
    let missing_terminator = content.len() == full.len();
    if missing_terminator {
        writer.write_all(&[term])?;
    }
    Ok(())
}
/// Prints every line that belongs to a run of two or more adjacent equal
/// lines, reading the lines of `data` through `LineIter`.
///
/// Lines are collected per run and handed to `flush_all_repeated_bytes`, which
/// applies the delimiter policy selected by `method`. Comparison uses
/// `lines_equal_fast` when neither key extraction nor case folding is
/// configured, and `lines_equal` otherwise.
fn process_all_repeated_bytes(
    data: &[u8],
    writer: &mut impl Write,
    config: &UniqConfig,
    method: AllRepeatedMethod,
    term: u8,
) -> io::Result<()> {
    let mut records = LineIter::new(data, term);
    let head = match records.next() {
        Some(record) => record,
        None => return Ok(()),
    };

    // Lines of the run currently being collected, as (content, full) pairs.
    let mut pending: Vec<(&[u8], &[u8])> = Vec::with_capacity(64);
    // Content of the most recently collected line; each new line is compared to it.
    let mut anchor = head.0;
    pending.push(head);

    let mut any_group_written = false;
    let plain_compare = !(needs_key_extraction(config) || config.ignore_case);

    for (content, full) in records {
        let same_run = if plain_compare {
            lines_equal_fast(anchor, content)
        } else {
            lines_equal(anchor, content, config)
        };
        if !same_run {
            // The run ended: emit it (if it qualifies) and start a new one.
            flush_all_repeated_bytes(writer, &pending, method, &mut any_group_written, term)?;
            pending.clear();
        }
        pending.push((content, full));
        anchor = content;
    }

    flush_all_repeated_bytes(writer, &pending, method, &mut any_group_written, term)
}
/// Writes one collected run of `(content, full)` lines if it has at least two
/// members; shorter runs are ignored.
///
/// Before the run, a delimiter (`term`) is written for
/// `AllRepeatedMethod::Prepend` always, and for `AllRepeatedMethod::Separate`
/// only when an earlier run has already been written. Each `full` line is then
/// written as is, with `term` appended when it has the same length as its
/// `content` (i.e. it carried no terminator). `first_group_printed` is set
/// once a run has been written.
fn flush_all_repeated_bytes(
    writer: &mut impl Write,
    group: &[(&[u8], &[u8])],
    method: AllRepeatedMethod,
    first_group_printed: &mut bool,
    term: u8,
) -> io::Result<()> {
    if group.len() < 2 {
        return Ok(());
    }

    let delimiter_first = match method {
        AllRepeatedMethod::Prepend => true,
        AllRepeatedMethod::Separate => *first_group_printed,
        AllRepeatedMethod::None => false,
    };
    if delimiter_first {
        writer.write_all(&[term])?;
    }

    for (content, full) in group.iter().copied() {
        writer.write_all(full)?;
        if content.len() == full.len() {
            writer.write_all(&[term])?;
        }
    }

    *first_group_printed = true;
    Ok(())
}
/// Writes every line of `data`, inserting a delimiter (`term`) between runs of
/// adjacent equal lines.
///
/// `GroupMethod::Prepend` and `GroupMethod::Both` additionally write a
/// delimiter before the first line; `GroupMethod::Append` and
/// `GroupMethod::Both` write one after the last line. Lines come from
/// `LineIter` as `(content, full)` pairs; `term` is appended to a line whose
/// `full` form has the same length as its `content`. Nothing is written for
/// input without lines.
fn process_group_bytes(
    data: &[u8],
    writer: &mut impl Write,
    config: &UniqConfig,
    method: GroupMethod,
    term: u8,
) -> io::Result<()> {
    let mut records = LineIter::new(data, term);
    let (first_content, first_full) = match records.next() {
        Some(record) => record,
        None => return Ok(()),
    };

    let delimiter = [term];

    if matches!(method, GroupMethod::Prepend | GroupMethod::Both) {
        writer.write_all(&delimiter)?;
    }
    writer.write_all(first_full)?;
    if first_content.len() == first_full.len() {
        writer.write_all(&delimiter)?;
    }

    let plain_compare = !(needs_key_extraction(config) || config.ignore_case);
    // Content of the previously written line; decides where a new run starts.
    let mut anchor = first_content;

    for (content, full) in records {
        let same_run = if plain_compare {
            lines_equal_fast(anchor, content)
        } else {
            lines_equal(anchor, content, config)
        };
        if !same_run {
            writer.write_all(&delimiter)?;
        }
        writer.write_all(full)?;
        if content.len() == full.len() {
            writer.write_all(&delimiter)?;
        }
        anchor = content;
    }

    if matches!(method, GroupMethod::Append | GroupMethod::Both) {
        writer.write_all(&delimiter)?;
    }
    Ok(())
}
/// Filters `input` into `output` according to `config`, using the streaming
/// implementations.
///
/// The input is wrapped in an 8 MiB `BufReader` and the output in a 32 MiB
/// `BufWriter`. Lines are delimited by NUL when `config.zero_terminated` is
/// set and by `\n` otherwise. `OutputMode::Group` and
/// `OutputMode::AllRepeated` have dedicated handlers; every other mode goes
/// through `process_standard_stream`. The writer is flushed before returning
/// successfully.
///
/// # Errors
///
/// Returns the first I/O error from reading, writing or the final flush.
pub fn process_uniq<R: Read, W: Write>(input: R, output: W, config: &UniqConfig) -> io::Result<()> {
    const READ_BUFFER_BYTES: usize = 8 * 1024 * 1024;
    const WRITE_BUFFER_BYTES: usize = 32 * 1024 * 1024;

    let source = BufReader::with_capacity(READ_BUFFER_BYTES, input);
    let mut sink = BufWriter::with_capacity(WRITE_BUFFER_BYTES, output);
    let term: u8 = if config.zero_terminated { b'\0' } else { b'\n' };

    let outcome = match config.mode {
        OutputMode::Group(method) => process_group_stream(source, &mut sink, config, method, term),
        OutputMode::AllRepeated(method) => {
            process_all_repeated_stream(source, &mut sink, config, method, term)
        }
        OutputMode::Default | OutputMode::RepeatedOnly | OutputMode::UniqueOnly => {
            process_standard_stream(source, &mut sink, config, term)
        }
    };
    outcome?;

    sink.flush()
}
/// Streaming run detection for the standard output modes.
///
/// Reads `term`-delimited lines from `reader`, counts each run of adjacent
/// lines that `compare_lines_stream` considers equal, and passes the first
/// line of every run together with its length to `output_group_stream`, which
/// decides whether and how to print it. Two line buffers are reused for the
/// whole input. Empty input produces no output.
///
/// # Errors
///
/// Returns the first I/O error from `reader` or `writer`.
fn process_standard_stream<R: BufRead, W: Write>(
    mut reader: R,
    writer: &mut W,
    config: &UniqConfig,
    term: u8,
) -> io::Result<()> {
    let mut prev_line: Vec<u8> = Vec::with_capacity(4096);
    let mut current_line: Vec<u8> = Vec::with_capacity(4096);

    if read_line_term(&mut reader, &mut prev_line, term)? == 0 {
        return Ok(());
    }
    let mut count: u64 = 1;

    loop {
        current_line.clear();
        if read_line_term(&mut reader, &mut current_line, term)? == 0 {
            break;
        }
        if compare_lines_stream(&prev_line, &current_line, config, term) {
            count += 1;
        } else {
            output_group_stream(writer, &prev_line, count, config, term)?;
            // Swap instead of copying so both allocations keep being reused.
            std::mem::swap(&mut prev_line, &mut current_line);
            count = 1;
        }
    }

    // The last run is still pending once the input is exhausted.
    output_group_stream(writer, &prev_line, count, config, term)
}
/// Compares two lines read from a stream with `lines_equal`, ignoring one
/// trailing `term` byte on either side.
#[inline(always)]
fn compare_lines_stream(a: &[u8], b: &[u8], config: &UniqConfig, term: u8) -> bool {
    lines_equal(strip_term(a, term), strip_term(b, term), config)
}
/// Returns `line` without its final byte when that byte equals `term`;
/// otherwise returns `line` unchanged. At most one byte is removed.
#[inline(always)]
fn strip_term(line: &[u8], term: u8) -> &[u8] {
    match line.split_last() {
        Some((&last, head)) if last == term => head,
        _ => line,
    }
}
/// Emits the representative `line` of one finished run, if `config.mode`
/// wants it.
///
/// `OutputMode::RepeatedOnly` prints only when `count > 1`,
/// `OutputMode::UniqueOnly` only when `count == 1`; all other modes always
/// print. The line is written without its trailing `term` (if any) and then
/// terminated with `term`; with `config.count` set it goes through
/// `write_count_line` instead.
#[inline(always)]
fn output_group_stream(
    writer: &mut impl Write,
    line: &[u8],
    count: u64,
    config: &UniqConfig,
    term: u8,
) -> io::Result<()> {
    let wanted = match config.mode {
        OutputMode::RepeatedOnly => count > 1,
        OutputMode::UniqueOnly => count == 1,
        OutputMode::Default | OutputMode::AllRepeated(_) | OutputMode::Group(_) => true,
    };
    if !wanted {
        return Ok(());
    }

    let body = strip_term(line, term);
    if config.count {
        write_count_line(writer, count, body, term)?;
        return Ok(());
    }

    writer.write_all(body)?;
    writer.write_all(&[term])?;
    Ok(())
}
/// Streaming variant of the "all repeated" mode: prints every line that
/// belongs to a run of two or more adjacent equal lines.
///
/// Lines are read from `reader` delimited by `term` and compared with
/// `compare_lines_stream`. The lines of the current run are buffered (a run
/// can only be judged once its second line has been seen) and handed to
/// `flush_all_repeated_stream`, which applies the delimiter policy selected by
/// `method`. Empty input produces no output.
///
/// # Errors
///
/// Returns the first I/O error from `reader` or `writer`.
fn process_all_repeated_stream<R: BufRead, W: Write>(
    mut reader: R,
    writer: &mut W,
    config: &UniqConfig,
    method: AllRepeatedMethod,
    term: u8,
) -> io::Result<()> {
    let mut current_line: Vec<u8> = Vec::with_capacity(4096);
    if read_line_term(&mut reader, &mut current_line, term)? == 0 {
        return Ok(());
    }

    // Owned copies of the lines in the run currently being collected.
    let mut group: Vec<Vec<u8>> = vec![current_line.clone()];
    let mut first_group_printed = false;

    loop {
        current_line.clear();
        if read_line_term(&mut reader, &mut current_line, term)? == 0 {
            break;
        }
        // `group` always holds at least one line here; treating an empty
        // group as "different" keeps this free of a panic path.
        let same_run = group
            .last()
            .map_or(false, |prev| compare_lines_stream(prev, &current_line, config, term));
        if !same_run {
            flush_all_repeated_stream(writer, &group, method, &mut first_group_printed, term)?;
            group.clear();
        }
        group.push(current_line.clone());
    }

    // The last run is still pending once the input is exhausted.
    flush_all_repeated_stream(writer, &group, method, &mut first_group_printed, term)
}
/// Writes one collected run of owned lines if it has at least two members;
/// shorter runs are ignored.
///
/// Before the run, a delimiter (`term`) is written for
/// `AllRepeatedMethod::Prepend` always, and for `AllRepeatedMethod::Separate`
/// only when an earlier run has already been written. Every line is written
/// without its trailing `term` (if any) and then terminated with `term`.
/// `first_group_printed` is set once a run has been written.
fn flush_all_repeated_stream(
    writer: &mut impl Write,
    group: &[Vec<u8>],
    method: AllRepeatedMethod,
    first_group_printed: &mut bool,
    term: u8,
) -> io::Result<()> {
    if group.len() < 2 {
        return Ok(());
    }

    let delimiter = [term];
    let delimiter_first = match method {
        AllRepeatedMethod::Prepend => true,
        AllRepeatedMethod::Separate => *first_group_printed,
        AllRepeatedMethod::None => false,
    };
    if delimiter_first {
        writer.write_all(&delimiter)?;
    }

    for raw in group.iter() {
        writer.write_all(strip_term(raw, term))?;
        writer.write_all(&delimiter)?;
    }

    *first_group_printed = true;
    Ok(())
}
/// Streaming variant of the "group" mode: writes every input line and inserts
/// a delimiter (`term`) between runs of adjacent equal lines.
///
/// `GroupMethod::Prepend` and `GroupMethod::Both` additionally write a
/// delimiter before the first line; `GroupMethod::Append` and
/// `GroupMethod::Both` write one after the last line. Lines are read from
/// `reader` delimited by `term`, compared with `compare_lines_stream`, and
/// always written with exactly one trailing `term`. Empty input produces no
/// output.
///
/// # Errors
///
/// Returns the first I/O error from `reader` or `writer`.
fn process_group_stream<R: BufRead, W: Write>(
    mut reader: R,
    writer: &mut W,
    config: &UniqConfig,
    method: GroupMethod,
    term: u8,
) -> io::Result<()> {
    let mut prev_line: Vec<u8> = Vec::with_capacity(4096);
    let mut current_line: Vec<u8> = Vec::with_capacity(4096);

    if read_line_term(&mut reader, &mut prev_line, term)? == 0 {
        return Ok(());
    }

    let delimiter = [term];

    if matches!(method, GroupMethod::Prepend | GroupMethod::Both) {
        writer.write_all(&delimiter)?;
    }
    writer.write_all(strip_term(&prev_line, term))?;
    writer.write_all(&delimiter)?;

    loop {
        current_line.clear();
        if read_line_term(&mut reader, &mut current_line, term)? == 0 {
            break;
        }
        // A line that differs from its predecessor starts a new group.
        if !compare_lines_stream(&prev_line, &current_line, config, term) {
            writer.write_all(&delimiter)?;
        }
        writer.write_all(strip_term(&current_line, term))?;
        writer.write_all(&delimiter)?;
        // Swap instead of copying so both allocations keep being reused.
        std::mem::swap(&mut prev_line, &mut current_line);
    }

    if matches!(method, GroupMethod::Append | GroupMethod::Both) {
        writer.write_all(&delimiter)?;
    }
    Ok(())
}
/// Appends the next `term`-delimited record from `reader` to `buf`, including
/// the terminator when one was found, and returns the number of bytes read.
/// A return value of `0` means the reader is at end of input.
#[inline(always)]
fn read_line_term<R: BufRead>(reader: &mut R, buf: &mut Vec<u8>, term: u8) -> io::Result<usize> {
    BufRead::read_until(reader, term, buf)
}