csv_scout/
sample.rs

1use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
2
3use crate::error::Result;
4use crate::sniffer::IS_UTF8;
5
/// Argument used when calling `sample_size` on `Sniffer`.
///
/// Selects how much of the input `SampleIter` yields before it stops.
#[derive(Debug, Clone, Copy)]
pub enum SampleSize {
    /// Use a number of records as the size of the sample to sniff.
    ///
    /// `SampleIter` counts one record per line it reads and stops once that
    /// count exceeds this value.
    Records(usize),
    /// Use a number of bytes as the size of the sample to sniff.
    ///
    /// `SampleIter` adds up the bytes of every line it reads (terminator
    /// included) and stops, without yielding the line, once the running total
    /// exceeds this value.
    Bytes(usize),
    /// Sniff the entire sample.
    All,
}
16
17pub fn take_sample_from_start<R>(
18    reader: &'_ mut R,
19    sample_size: SampleSize,
20) -> Result<SampleIter<'_, R>>
21where
22    R: Read + Seek,
23{
24    reader.seek(SeekFrom::Start(0))?;
25    Ok(SampleIter::new(reader, sample_size))
26}
27
/// Iterator over the lines of a borrowed reader, limited by a [`SampleSize`].
///
/// Each item is one line with leading and trailing `\n` / `\r` characters
/// removed, or the I/O error hit while reading it.
pub struct SampleIter<'a, R: 'a + Read> {
    /// Buffered wrapper around the borrowed reader being sampled.
    reader: BufReader<&'a mut R>,
    /// Limit that decides when iteration stops.
    sample_size: SampleSize,
    /// Bytes accumulated from the lines counted toward the sample.
    n_bytes: usize,
    /// Number of lines counted toward the sample so far.
    n_records: usize,
    /// Set once iteration has finished; every later `next` returns `None`.
    is_done: bool,
}
35
36impl<'a, R: Read> SampleIter<'a, R> {
37    fn new(reader: &'a mut R, sample_size: SampleSize) -> Self {
38        let buf_reader = BufReader::new(reader);
39        SampleIter {
40            reader: buf_reader,
41            sample_size,
42            n_bytes: 0,
43            n_records: 0,
44            is_done: false,
45        }
46    }
47}
48
49impl<R: Read> Iterator for SampleIter<'_, R> {
50    type Item = Result<String>;
51
52    fn next(&mut self) -> Option<Result<String>> {
53        if self.is_done {
54            return None;
55        }
56
57        let mut buf = Vec::new();
58        let n_bytes_read = match self.reader.read_until(b'\n', &mut buf) {
59            Ok(n_bytes_read) => n_bytes_read,
60            Err(e) => {
61                return Some(Err(e.into()));
62            }
63        };
64        if n_bytes_read == 0 {
65            self.is_done = true;
66            return None;
67        }
68
69        let mut output = simdutf8::basic::from_utf8(&buf).map_or_else(
70            |_| {
71                // Its not all utf-8, set IS_UTF8 global to false
72                IS_UTF8.with(|flag| {
73                    *flag.borrow_mut() = false;
74                });
75                String::from_utf8_lossy(&buf).to_string()
76            },
77            std::string::ToString::to_string,
78        );
79
80        let last_byte = (output.as_ref() as &[u8])[output.len() - 1];
81        if last_byte != b'\n' && last_byte != b'\r' {
82            // For quote detection, we need to include lines that don't end with newlines
83            // as they might contain closing quotes. Check if we're at EOF.
84            let mut check_buf = [0u8; 1];
85            match self.reader.read(&mut check_buf) {
86                Ok(0) => {
87                    // EOF reached, include this line even without newline
88                }
89                Ok(_) => {
90                    // More data available, this line was cut off, ignore it
91                    self.is_done = true;
92                    return None;
93                }
94                Err(_) => {
95                    // Error reading, treat as cut off line
96                    self.is_done = true;
97                    return None;
98                }
99            }
100        }
101
102        output = output.trim_matches(|c| c == '\n' || c == '\r').into();
103        self.n_bytes += n_bytes_read;
104        self.n_records += 1;
105        match self.sample_size {
106            SampleSize::Records(max_records) => {
107                if self.n_records > max_records {
108                    self.is_done = true;
109                    return None;
110                }
111            }
112            SampleSize::Bytes(max_bytes) => {
113                if self.n_bytes > max_bytes {
114                    self.is_done = true;
115                    return None;
116                }
117            }
118            SampleSize::All => {}
119        }
120        Some(Ok(output))
121    }
122}