1use anyhow::Result;
2use rand::Rng;
3use std::fs::File;
4use std::io::{Read, Seek, SeekFrom};
5use std::path::Path;
6
/// Size cutoff equal to 2 GiB (`2 * 1024^3`).
///
/// NOTE(review): presumably a file size in bytes below which callers treat a
/// file as "small"; no use of this constant is visible in this section, so
/// confirm against the callers.
pub const SMALL_FILE_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024;
8
/// Size in bytes (64 KiB) of the fixed read buffer used by
/// `count_lines_from_reader`.
const CHUNK_SIZE: usize = 1 << 16;

/// Tuning parameters consumed by `count_lines_estimate`.
///
/// The planned amount of data sampled is
/// `chunk_size * sample_length * num_samples` bytes.
#[derive(Clone)]
pub struct EstimateOptions<R: Rng> {
    /// Size in bytes of the buffer filled by each individual read.
    pub chunk_size: usize,
    /// Maximum number of consecutive `chunk_size` reads performed per sample.
    pub sample_length: usize,
    /// Number of randomly positioned samples taken from the file.
    pub num_samples: usize,
    /// Random number generator used to pick the start offset of each sample.
    pub rng: R,
}
18
19fn count_lines_from_reader<R: Read>(reader: &mut R) -> Result<u64> {
20 let mut buffer = [0u8; CHUNK_SIZE];
21 let mut count = 0;
22
23 loop {
24 let bytes_read = reader.read(&mut buffer)?;
25 if bytes_read == 0 {
26 break;
27 }
28 count += bytecount::count(&buffer[..bytes_read], b'\n') as u64;
29 }
30
31 Ok(count)
32}
33
34pub fn count_lines_exact(path: &Path) -> Result<u64> {
35 let mut file = File::open(path)?;
36 count_lines_from_reader(&mut file)
37}
38
39pub fn count_lines_exact_reader<R: Read>(reader: &mut R) -> Result<u64> {
40 count_lines_from_reader(reader)
41}
42
43pub fn count_lines_estimate<R: Rng>(path: &Path, opts: EstimateOptions<R>) -> Result<u64> {
44 let EstimateOptions {
45 chunk_size,
46 sample_length,
47 num_samples,
48 mut rng,
49 } = opts;
50
51 let total_bytes = std::fs::metadata(path)?.len();
52 let n_bytes_read = (chunk_size * sample_length * num_samples) as u64;
53
54 if n_bytes_read > total_bytes {
55 return count_lines_exact(path);
56 }
57
58 let mut file = File::open(path)?;
59 let mut newline_count = 0;
60
61 for _ in 0..num_samples {
62 let start_pos = rng.gen_range(0..(total_bytes - (chunk_size * sample_length) as u64));
63 file.seek(SeekFrom::Start(start_pos))?;
64
65 let mut buffer = vec![0u8; chunk_size];
66 for _ in 0..sample_length {
67 let bytes_read = file.read(&mut buffer)?;
68 if bytes_read == 0 {
69 break;
70 }
71 newline_count += bytecount::count(&buffer[..bytes_read], b'\n') as u64;
72 }
73 }
74
75 let bytes_per_line = n_bytes_read as f64 / newline_count as f64;
78 let estimated = (total_bytes as f64 / bytes_per_line).round() as u64;
79
80 Ok(estimated)
81}