1use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
2
3use crate::error::Result;
4use crate::sniffer::IS_UTF8;
5
/// Upper bound on how much input a sample may cover.
#[derive(Clone, Copy, Debug)]
pub enum SampleSize {
    /// Stop once this many records have been yielded.
    Records(usize),
    /// Stop once the lines read so far exceed this many bytes.
    Bytes(usize),
    /// No limit: keep yielding lines until the input ends.
    All,
}
16
17pub fn take_sample_from_start<R>(
18 reader: &'_ mut R,
19 sample_size: SampleSize,
20) -> Result<SampleIter<'_, R>>
21where
22 R: Read + Seek,
23{
24 reader.seek(SeekFrom::Start(0))?;
25 Ok(SampleIter::new(reader, sample_size))
26}
27
28pub struct SampleIter<'a, R: 'a + Read> {
29 reader: BufReader<&'a mut R>,
30 sample_size: SampleSize,
31 n_bytes: usize,
32 n_records: usize,
33 is_done: bool,
34}
35
36impl<'a, R: Read> SampleIter<'a, R> {
37 fn new(reader: &'a mut R, sample_size: SampleSize) -> Self {
38 let buf_reader = BufReader::new(reader);
39 SampleIter {
40 reader: buf_reader,
41 sample_size,
42 n_bytes: 0,
43 n_records: 0,
44 is_done: false,
45 }
46 }
47}
48
49impl<R: Read> Iterator for SampleIter<'_, R> {
50 type Item = Result<String>;
51
52 fn next(&mut self) -> Option<Result<String>> {
53 if self.is_done {
54 return None;
55 }
56
57 let mut buf = Vec::new();
58 let n_bytes_read = match self.reader.read_until(b'\n', &mut buf) {
59 Ok(n_bytes_read) => n_bytes_read,
60 Err(e) => {
61 return Some(Err(e.into()));
62 }
63 };
64 if n_bytes_read == 0 {
65 self.is_done = true;
66 return None;
67 }
68
69 let mut output = simdutf8::basic::from_utf8(&buf).map_or_else(
70 |_| {
71 IS_UTF8.with(|flag| {
73 *flag.borrow_mut() = false;
74 });
75 String::from_utf8_lossy(&buf).to_string()
76 },
77 std::string::ToString::to_string,
78 );
79
80 let last_byte = (output.as_ref() as &[u8])[output.len() - 1];
81 if last_byte != b'\n' && last_byte != b'\r' {
82 let mut check_buf = [0u8; 1];
85 match self.reader.read(&mut check_buf) {
86 Ok(0) => {
87 }
89 Ok(_) => {
90 self.is_done = true;
92 return None;
93 }
94 Err(_) => {
95 self.is_done = true;
97 return None;
98 }
99 }
100 }
101
102 output = output.trim_matches(|c| c == '\n' || c == '\r').into();
103 self.n_bytes += n_bytes_read;
104 self.n_records += 1;
105 match self.sample_size {
106 SampleSize::Records(max_records) => {
107 if self.n_records > max_records {
108 self.is_done = true;
109 return None;
110 }
111 }
112 SampleSize::Bytes(max_bytes) => {
113 if self.n_bytes > max_bytes {
114 self.is_done = true;
115 return None;
116 }
117 }
118 SampleSize::All => {}
119 }
120 Some(Ok(output))
121 }
122}