count_lines/
lib.rs

1use anyhow::Result;
2use rand::Rng;
3use std::fs::File;
4use std::io::{Read, Seek, SeekFrom};
5use std::path::Path;
6
/// 2 GiB (`2_147_483_648`).
///
/// NOTE(review): nothing in this file reads this constant; presumably it is a file size in
/// bytes below which callers prefer an exact count — confirm against the call sites.
pub const SMALL_FILE_THRESHOLD: u64 = 2 << 30;

/// Size in bytes (64 KiB) of the read buffer used by `count_lines_from_reader`.
const CHUNK_SIZE: usize = 64 * 1024;
10
/// Sampling parameters consumed by `count_lines_estimate`.
///
/// The type parameter `R` is the random number generator used to pick sample offsets.
/// It is intentionally unbounded here; the `Rng` requirement lives on the function that
/// actually draws from it, so this type can be named and constructed without it.
#[derive(Clone, Debug)]
pub struct EstimateOptions<R> {
    /// Size, in bytes, of the buffer filled by each individual read.
    pub chunk_size: usize,
    /// Maximum number of consecutive reads performed for one sample.
    pub sample_length: usize,
    /// Number of random positions in the file that are sampled.
    pub num_samples: usize,
    /// Random number generator used to choose where each sample starts.
    pub rng: R,
}
18
19fn count_lines_from_reader<R: Read>(reader: &mut R) -> Result<u64> {
20    let mut buffer = [0u8; CHUNK_SIZE];
21    let mut count = 0;
22
23    loop {
24        let bytes_read = reader.read(&mut buffer)?;
25        if bytes_read == 0 {
26            break;
27        }
28        count += bytecount::count(&buffer[..bytes_read], b'\n') as u64;
29    }
30
31    Ok(count)
32}
33
34pub fn count_lines_exact(path: &Path) -> Result<u64> {
35    let mut file = File::open(path)?;
36    count_lines_from_reader(&mut file)
37}
38
39pub fn count_lines_exact_reader<R: Read>(reader: &mut R) -> Result<u64> {
40    count_lines_from_reader(reader)
41}
42
43pub fn count_lines_estimate<R: Rng>(path: &Path, opts: EstimateOptions<R>) -> Result<u64> {
44    let EstimateOptions {
45        chunk_size,
46        sample_length,
47        num_samples,
48        mut rng,
49    } = opts;
50
51    let total_bytes = std::fs::metadata(path)?.len();
52    let n_bytes_read = (chunk_size * sample_length * num_samples) as u64;
53
54    if n_bytes_read > total_bytes {
55        return count_lines_exact(path);
56    }
57
58    let mut file = File::open(path)?;
59    let mut newline_count = 0;
60
61    for _ in 0..num_samples {
62        let start_pos = rng.gen_range(0..(total_bytes - (chunk_size * sample_length) as u64));
63        file.seek(SeekFrom::Start(start_pos))?;
64
65        let mut buffer = vec![0u8; chunk_size];
66        for _ in 0..sample_length {
67            let bytes_read = file.read(&mut buffer)?;
68            if bytes_read == 0 {
69                break;
70            }
71            newline_count += bytecount::count(&buffer[..bytes_read], b'\n') as u64;
72        }
73    }
74
75    // Estimate the average number of bytes per line from the sampled data.
76    // Then extrapolate to estimate total lines in the file.
77    let bytes_per_line = n_bytes_read as f64 / newline_count as f64;
78    let estimated = (total_bytes as f64 / bytes_per_line).round() as u64;
79
80    Ok(estimated)
81}