rsv_lib/utils/
file.rs

1use std::error::Error;
2use std::io::BufWriter;
3use std::{
4    fs::File,
5    io::{BufRead, BufReader, Write},
6    path::Path,
7};
8
9use super::constants::MB_USIZE;
10use super::row_split::CsvRowSplitter;
11
/// Estimates the average number of bytes per row of the text file at `path`.
///
/// The first line is skipped (presumably a header row) and at most 5001 of the
/// following lines are sampled. Each sampled line contributes its length in
/// bytes plus one byte for the line terminator that `lines()` strips.
///
/// If the file has no lines after the first one, the result is `NaN`
/// (`0.0 / 0.0`); callers that cast the result should account for that.
///
/// # Errors
///
/// Returns an error if the file cannot be opened, or if one of the sampled
/// lines cannot be read (for example because it is not valid UTF-8).
pub fn estimate_row_bytes(path: &Path) -> Result<f64, Box<dyn Error>> {
    // Sampling stops as soon as the row counter exceeds this value.
    const SAMPLE_LIMIT: usize = 5000;

    let mut n: usize = 0;
    let mut bytes: usize = 0;
    let file = File::open(path)?;
    for l in BufReader::new(file).lines().skip(1) {
        // Propagate read errors to the caller instead of panicking.
        bytes += l?.len() + 1;
        n += 1;

        if n > SAMPLE_LIMIT {
            break;
        }
    }

    // Average bytes per sampled row.
    Ok((bytes as f64) / (n as f64))
}
29
30pub fn column_n(path: &Path, sep: char, quote: char) -> Result<Option<usize>, Box<dyn Error>> {
31    // read
32    let rdr = BufReader::new(File::open(path)?);
33    let n = rdr
34        .lines()
35        .next()
36        .map(|i| i.ok())
37        .unwrap_or_default()
38        .map(|i| CsvRowSplitter::new(&i, sep, quote).count());
39
40    Ok(n)
41}
42
43#[allow(dead_code)]
44pub fn estimate_line_count_by_mb(path: &Path, mb: Option<usize>) -> usize {
45    match estimate_row_bytes(path) {
46        // default chunk-size to 200mb or 10_0000 lines
47        Ok(v) => ((mb.unwrap_or(200) * MB_USIZE) as f64 / v) as usize,
48        Err(_) => 100_000,
49    }
50}
51
/// Writes a frequency table to `path` as comma-separated text.
///
/// If `names` is non-empty, its entries are joined with `,` and written as
/// the first line. Each `(key, count)` pair in `freq` is then written on its
/// own line as `key,count`. Keys are written verbatim; no quoting or
/// escaping is applied.
///
/// # Panics
///
/// Panics if the file cannot be created or if any write, including the final
/// flush, fails.
pub fn write_frequency_to_csv(path: &Path, names: &Vec<String>, freq: Vec<(String, usize)>) {
    let mut wtr = BufWriter::new(File::create(path).unwrap());

    // header
    if !names.is_empty() {
        writeln!(wtr, "{}", names.join(",")).unwrap();
    }

    // content
    for (k, v) in freq {
        writeln!(wtr, "{k},{v}").unwrap();
    }

    // Flush explicitly: dropping a `BufWriter` ignores flush errors, which
    // would silently lose buffered rows while every other failure panics.
    wtr.flush().unwrap();
}
65
/// Reports whether the extension of `p` is `xlsx` or `xls`.
///
/// The comparison ignores ASCII case, so `report.XLSX` is recognised as well.
/// Paths without an extension return `false`.
pub fn is_excel(p: &Path) -> bool {
    p.extension()
        .is_some_and(|e| e.eq_ignore_ascii_case("xlsx") || e.eq_ignore_ascii_case("xls"))
}