Skip to main content

simple_fs/span/
csv_spans.rs

1use crate::spath::SPath;
2use crate::{Error, Result, open_file};
3use std::io::{self, Read};
4
5/// CSV-aware record spans: returns byte ranges [start, end) for each *row*.
6/// - Treats '\n' as a record separator only when **not** inside quotes.
7/// - For CRLF, the '\r' is excluded from the end bound.
8/// - Supports `""` as an escaped quote inside quoted fields.
9/// - Streams in chunks; does *not* read the whole file into memory.
10pub fn csv_row_spans(path: impl AsRef<SPath>) -> Result<Vec<(usize, usize)>> {
11	let path = path.as_ref();
12	let mut f = open_file(path)?;
13	csv_row_spans_from_reader(&mut f).map_err(|err| Error::FileCantRead((path, err).into()))
14}
15
16// region:    --- Support
17
/// Scans `r` until EOF and returns the absolute byte range `[start, end)` of each CSV record.
///
/// - A `'\n'` ends a record only when it is outside a quoted field.
/// - A `'\r'` immediately before a record-ending `'\n'` is excluded from the range.
/// - Inside a quoted field, `""` is an escaped quote and does not close the field.
/// - A non-empty final record without a trailing `'\n'` is still reported, ending at EOF.
/// - Input is consumed in 64 KiB chunks; quote and CR state are carried across chunk
///   boundaries, so the result does not depend on how the reader splits its data.
///
/// # Errors
/// Returns the first I/O error reported by `r`. Reads failing with
/// `io::ErrorKind::Interrupted` are retried instead of being reported.
fn csv_row_spans_from_reader<R: Read>(r: &mut R) -> io::Result<Vec<(usize, usize)>> {
	let mut spans: Vec<(usize, usize)> = Vec::new();

	// 64 KiB chunks: good balance of cacheability vs syscalls.
	let mut buf = [0u8; 64 * 1024];

	// Absolute offset of the first byte of the current chunk.
	let mut file_pos: usize = 0;
	// Absolute offset where the current record starts.
	let mut rec_start: usize = 0;

	// True while the scanner is inside a quoted field.
	let mut in_quotes = false;
	// The previous byte was a '"' seen inside quotes; the next byte decides whether it was
	// a closing quote or the first half of a `""` escape.
	let mut quote_pending = false;
	// The previous byte (possibly the last byte of the previous chunk) was '\r'.
	let mut prev_byte_is_cr = false;

	loop {
		let n = match r.read(&mut buf) {
			Ok(0) => break,
			Ok(n) => n,
			// Interrupted reads are non-fatal; retry rather than aborting the scan.
			Err(err) if err.kind() == io::ErrorKind::Interrupted => continue,
			Err(err) => return Err(err),
		};

		for (i, &b) in buf[..n].iter().enumerate() {
			// Resolve the quote left pending by the previous byte.
			if quote_pending {
				quote_pending = false;
				if b == b'"' {
					// Second half of a `""` escape: still inside the quoted field.
					prev_byte_is_cr = false;
					continue;
				}
				// The pending quote closed the field; handle `b` normally below.
				in_quotes = false;
			}

			match b {
				// Closing quote or start of `""`: defer the decision to the next byte.
				b'"' if in_quotes => quote_pending = true,
				// Opening quote.
				b'"' => in_quotes = true,
				b'\n' if !in_quotes => {
					let abs_nl = file_pos + i;
					// Exclude the '\r' of a CRLF pair; a preceding byte exists whenever
					// `prev_byte_is_cr` is set, so the subtraction cannot underflow.
					let end = if prev_byte_is_cr { abs_nl - 1 } else { abs_nl };
					spans.push((rec_start, end));
					rec_start = abs_nl + 1;
				}
				_ => {}
			}

			prev_byte_is_cr = b == b'\r';
		}

		file_pos += n;
	}

	// Final record when the input does not end with '\n'. A quote still pending at EOF
	// needs no resolution because no further byte can depend on it.
	if rec_start < file_pos {
		spans.push((rec_start, file_pos));
	}

	Ok(spans)
}
120
121// endregion: --- Support
122
123// region:    --- Tests
124
#[cfg(test)]
mod tests {
	use super::*;

	/// Test-local result alias so `?` accepts any boxed error.
	type Result<T> = core::result::Result<T, Box<dyn std::error::Error>>;

	/// Checks the spans computed for `tests-data/example.csv`: four records are expected
	/// (header included), and reading each span back must yield the exact row text,
	/// including a quoted field with an embedded newline.
	#[test]
	fn test_span_csv_row_spans_simple() -> Result<()> {
		// -- Setup & Fixtures
		let csv_path = SPath::from("tests-data/example.csv");
		let expected_rows = [
			"name,age,comment",
			"Alice,30,\"hello, world\"",
			"Bob,25,\"Line with \"\"quote\"\"\"",
			"Carol,28,\"multi\nline with \"\"quotes\"\" inside\"",
		];

		// -- Exec
		let row_spans = csv_row_spans(&csv_path)?;

		// -- Check
		assert_eq!(row_spans.len(), 4, "should find 4 CSV records (including header)");

		for (idx, want) in expected_rows.iter().enumerate() {
			let span = row_spans.get(idx).copied();
			let (start, end) = span.ok_or("missing expected span")?;
			let text = crate::read_span(&csv_path, start, end)?;
			assert_eq!(&text, want);
		}

		Ok(())
	}
}
158
159// endregion: --- Tests