rsv_lib/csv/
unique.rs

1use crate::args::Unique;
2use crate::utils::cli_result::CliResult;
3use crate::utils::column::Columns;
4use crate::utils::filename::new_path;
5use crate::utils::writer::Writer;
6use ahash::HashMapExt;
7use std::fs::File;
8use std::io::{BufRead, BufReader, Lines};
9use std::path::Path;
10
11impl Unique {
12    pub fn csv_run(&self) -> CliResult {
13        let path = &self.path();
14        let all_cols = self.cols == "-1";
15
16        // cols
17        let cols = if all_cols {
18            None
19        } else {
20            Some(
21                Columns::new(&self.cols)
22                    .total_col_of(path, self.sep, self.quote)
23                    .parse(),
24            )
25        };
26
27        // wtr and rdr
28        let out = new_path(path, "-drop-duplicates");
29        let mut wtr = Writer::file_or_stdout(self.export, &out)?;
30        let mut rdr = BufReader::new(File::open(path)?).lines();
31
32        // header
33        if !self.no_header {
34            let Some(r) = rdr.next() else { return Ok(()) };
35            wtr.write_str_unchecked(&r?)
36        }
37
38        // read
39        match (self.keep_last, all_cols) {
40            (true, true) => keep_last_and_all_cols(&mut rdr, &mut wtr, path, self.no_header)?,
41            (true, false) => {
42                keep_last_and_partial_cols(self, &mut rdr, &mut wtr, cols.unwrap(), path)?
43            }
44            (false, true) => keep_first_and_all_cols(&mut rdr, &mut wtr)?,
45            (false, false) => keep_first_and_partial_cols(&mut rdr, &mut wtr, cols.unwrap(), self)?,
46        };
47
48        if self.export {
49            println!("\nSaved to file: {}", out.display())
50        }
51
52        Ok(())
53    }
54}
55
56fn keep_first_and_all_cols(rdr: &mut Lines<BufReader<File>>, wtr: &mut Writer) -> CliResult {
57    let mut unique_holder = ahash::HashSet::default();
58    for r in rdr {
59        let r = r?;
60        if !unique_holder.contains(&r) {
61            wtr.write_str_unchecked(&r);
62            unique_holder.insert(r);
63        }
64    }
65
66    Ok(())
67}
68
69fn keep_first_and_partial_cols(
70    rdr: &mut Lines<BufReader<File>>,
71    wtr: &mut Writer,
72    cols: Columns,
73    args: &Unique,
74) -> CliResult {
75    let mut unique_holder = ahash::HashSet::default();
76    for r in rdr {
77        let r = r?;
78        let segs = args.split_row_to_vec(&r);
79        let p = cols.select_owned_string(&segs);
80        if !unique_holder.contains(&p) {
81            wtr.write_str_unchecked(&r);
82            unique_holder.insert(p);
83        }
84    }
85
86    Ok(())
87}
88
89fn keep_last_and_all_cols(
90    rdr: &mut Lines<BufReader<File>>,
91    wtr: &mut Writer,
92    path: &Path,
93    no_header: bool,
94) -> CliResult {
95    let mut unique_n = ahash::HashMap::default();
96
97    // first scan to locate record location
98    let rdr2 = BufReader::new(File::open(path)?).lines();
99    for r in rdr2.skip(1 - (no_header as usize)) {
100        let r = r?;
101        *unique_n.entry(r).or_insert(0) += 1;
102    }
103
104    // second scan
105    for r in rdr {
106        let r = r?;
107        if unique_n[&r] == 1 {
108            wtr.write_str_unchecked(&r);
109        } else {
110            *unique_n.entry(r).or_insert(0) -= 1;
111        }
112    }
113
114    Ok(())
115}
116
117fn keep_last_and_partial_cols(
118    args: &Unique,
119    rdr: &mut Lines<BufReader<File>>,
120    wtr: &mut Writer,
121    cols: Columns,
122    path: &Path,
123) -> CliResult {
124    let mut unique_n = ahash::HashMap::new();
125
126    // first scan to locate record location
127    let rdr2 = BufReader::new(File::open(path)?).lines();
128    for r in rdr2.skip(1 - (args.no_header as usize)) {
129        let r = r?;
130        let segs = args.split_row_to_vec(&r);
131        let p = cols.select_owned_string(&segs);
132        *unique_n.entry(p).or_insert(0) += 1;
133    }
134
135    // second scan
136    for r in rdr {
137        let r = r?;
138        let segs = args.split_row_to_vec(&r);
139        let p = cols.select_owned_string(&segs);
140        if unique_n[&p] == 1 {
141            wtr.write_str_unchecked(&r);
142        } else {
143            *unique_n.entry(p).or_insert(0) -= 1;
144        }
145    }
146
147    Ok(())
148}