use ::std::collections::HashSet;
use ::clap::StructOpt;
use ::log::debug;
use ::regex::Regex;
use crate::common::LineWriter;
use crate::common::VecWriter;
use crate::common::{get_first_match_or_all, LineReader};
// Command-line arguments of the `unique` subcommand.
//
// NOTE(review): plain `//` comments are used here on purpose. clap's derive
// is understood to turn `///` doc comments on the struct and its fields into
// help text, which would change the CLI output -- confirm before converting
// these to doc comments.
#[derive(StructOpt, Debug, Default)]
#[structopt(
name = "unique",
about = "Remove any duplicate lines, keeping the first match and preserving order unless sorting is requested."
)]
pub struct UniqueArgs {
// `-s` / `--sorted`: the boolean flag is converted by `Order::from_is_sorted`.
#[structopt(parse(from_flag = Order::from_is_sorted), short = 's', long = "sorted", )]
pub order: Order,
// `-d` / `--filter-duplicates`: the boolean flag is converted by
// `Keep::from_find_duplicates`. Declared as conflicting with `prefix`.
#[structopt(parse(from_flag = Keep::from_find_duplicates), short = 'd', long = "filter-duplicates", conflicts_with = "prefix")]
pub keep: Keep,
// Optional regex; `unique_nosort` hands it to `get_first_match_or_all` to
// derive the key that lines are compared by.
#[structopt(long)]
pub by: Option<Regex>,
// `-p` / `--prefix`: when set, `unique` routes the input through
// `unique_prefix`. Declared as conflicting with `by`.
#[structopt(short = 'p', long = "prefix", conflicts_with = "by")]
pub prefix: bool,
}
/// Builds the clap command for [`UniqueArgs`] and runs clap's own
/// consistency assertions on the argument definitions.
#[test]
fn test_cli_args() {
    use ::clap::IntoApp;

    let command = UniqueArgs::into_app();
    command.debug_assert();
}
/// Ordering applied to the lines that survive de-duplication.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum Order {
    /// Emit lines in the order in which they were read (the default).
    #[default]
    Preserve,
    /// Emit lines sorted in ascending order.
    SortAscending,
}

impl Order {
    /// Converts the boolean "sorted" command-line flag into an [`Order`]:
    /// `true` selects [`Order::SortAscending`], `false` [`Order::Preserve`].
    fn from_is_sorted(is_sorted: bool) -> Self {
        match is_sorted {
            true => Self::SortAscending,
            false => Self::Preserve,
        }
    }
}
/// Sorts `data` in place in ascending order.
///
/// An unstable sort is used, so the relative order of elements that compare
/// equal is not guaranteed to be kept.
fn order_inplace<T: Ord>(data: &mut [T]) {
    // The only caller is the non-prefix path of `unique`, so the message must
    // not claim that a `unique_prefix` result is being sorted.
    debug!("sorting unique result");
    data.sort_unstable()
}
/// Which occurrences of a repeated line are emitted.
#[derive(Debug, Default, Clone, Copy)]
pub enum Keep {
    /// Emit only the first occurrence of every line (the default).
    #[default]
    First,
    /// Emit only the occurrences after the first one, i.e. the duplicates.
    Subsequent,
}

impl Keep {
    /// Converts the boolean "filter duplicates" command-line flag into a
    /// [`Keep`]: `true` selects [`Keep::Subsequent`], `false` [`Keep::First`].
    fn from_find_duplicates(is_find_duplicates: bool) -> Self {
        match is_find_duplicates {
            true => Self::Subsequent,
            false => Self::First,
        }
    }

    /// Decides whether a line should be emitted.
    ///
    /// `is_first` states whether this is the first time the line was seen.
    /// Returns `true` when that matches what this variant wants to keep.
    fn keep_is_first(&self, is_first: bool) -> bool {
        match (*self, is_first) {
            (Keep::First, true) | (Keep::Subsequent, false) => true,
            (Keep::First, false) | (Keep::Subsequent, true) => false,
        }
    }
}
/// Entry point of the `unique` command: reads lines from `reader`, drops or
/// selects duplicates according to `args`, and writes the result to `writer`.
///
/// Three modes are distinguished:
/// * `args.prefix` set: all input is collected and passed to `unique_prefix`.
/// * sorted output requested: the streaming result is buffered in a
///   `VecWriter`, sorted, and then written out in one go.
/// * otherwise: lines are filtered and written as they are read.
///
/// # Panics
///
/// Panics if both `args.prefix` is set and `args.by` is given.
pub async fn unique(args: UniqueArgs, reader: &mut impl LineReader, writer: &mut impl LineWriter) {
    assert!(
        !args.prefix || args.by.is_none(),
        "cannot use both --prefix and --by"
    );

    if args.prefix {
        let input = reader.collect_all().await;
        for kept in unique_prefix(input, args.order, args.keep) {
            writer.write_line(kept).await
        }
        return;
    }

    match args.order {
        Order::Preserve => unique_nosort(args.keep, &args.by, reader, writer).await,
        Order::SortAscending => {
            // Sorting needs the complete result, so buffer it before writing.
            let mut buffer = VecWriter::new();
            unique_nosort(args.keep, &args.by, reader, &mut buffer).await;
            let mut sorted = buffer.get();
            order_inplace(&mut sorted);
            writer.write_all_lines(sorted.into_iter()).await
        }
    }
}
/// Streams lines from `reader` to `writer`, filtering by whether each line's
/// key has been seen before.
///
/// The key of a line is whatever `get_first_match_or_all` returns for
/// `unique_by_pattern` and the line. `keep` decides whether first
/// occurrences or repeated occurrences of a key are written. Output order
/// follows input order.
async fn unique_nosort(
    keep: Keep,
    unique_by_pattern: &Option<Regex>,
    reader: &mut impl LineReader,
    writer: &mut impl LineWriter,
) {
    let mut seen_keys = HashSet::new();
    while let Some(line) = reader.read_line().await {
        let key = get_first_match_or_all(unique_by_pattern, line);
        // `insert` reports `true` only the first time a key is added.
        let is_first = seen_keys.insert(key.to_owned());
        if keep.keep_is_first(is_first) {
            writer.write_line(line).await
        }
    }
}
/// Removes every item that has another item as a prefix.
///
/// Exact duplicates count as prefixes of each other, so they collapse too.
///
/// * With [`Order::SortAscending`] the surviving items are returned sorted.
/// * With [`Order::Preserve`] the surviving items are returned in input
///   order; `keep` then selects either the first occurrence of each survivor
///   or only its repeated occurrences.
///
/// Empty input is returned unchanged.
///
/// # Panics
///
/// Panics when `order` is [`Order::SortAscending`] and `keep` is
/// [`Keep::Subsequent`], because that combination is not supported.
pub fn unique_prefix(texts: Vec<String>, order: Order, keep: Keep) -> Vec<String> {
    if let (Order::SortAscending, Keep::Subsequent) = (order, keep) {
        panic!("--filter-duplicates, --sorted and --prefix cannot all be used together");
    }
    if texts.is_empty() {
        debug!("empty input while removing items that have other items as prefix");
        return texts;
    }
    match order {
        Order::SortAscending => {
            debug!("removing items that have other items as prefix, sorting ascendingly");
            let mut sorted_roots = Vec::with_capacity(texts.len());
            unique_prefix_sorted(texts, |root| sorted_roots.push(root));
            sorted_roots
        }
        Order::Preserve => {
            debug!("removing items that have other items as prefix, preserving order");
            // First find the surviving items on a sorted copy ...
            let mut roots = HashSet::with_capacity(texts.len());
            unique_prefix_sorted(texts.clone(), |root| {
                roots.insert(root);
            });
            // ... then walk the input in its original order and pick them out.
            let mut already_seen: HashSet<String> = HashSet::default();
            let mut kept = Vec::new();
            for text in texts {
                if !roots.contains(&text) {
                    continue;
                }
                let is_first = already_seen.insert(text.clone());
                if keep.keep_is_first(is_first) {
                    kept.push(text);
                }
            }
            kept
        }
    }
}
/// Sorts `texts` and passes to `collect`, in ascending order, every item
/// that does not start with a previously kept item.
///
/// Because a string is a prefix of itself, exact duplicates are reported
/// only once. Empty input is accepted and results in no calls to `collect`.
fn unique_prefix_sorted(mut texts: Vec<String>, mut collect: impl FnMut(String)) {
    texts.sort_unstable();
    // After sorting, every item that has some other item as a prefix comes
    // after that prefix. `dedup_by` hands the closure the current element and
    // the most recently retained one; returning `true` drops the current
    // element. This keeps the "compare against the last kept item" rule
    // without indexing into the vector or cloning any string.
    texts.dedup_by(|this, prev| this.starts_with(prev.as_str()));
    for text in texts {
        collect(text)
    }
}