simple_fs/span/
csv_spans.rs

1use crate::spath::SPath;
2use crate::{Error, Result, open_file};
3use std::io::{self, Read};
4
5/// CSV-aware record spans: returns byte ranges [start, end) for each *row*.
6/// - Treats '\n' as a record separator only when **not** inside quotes.
7/// - For CRLF, the '\r' is excluded from the end bound.
8/// - Supports `""` as an escaped quote inside quoted fields.
9/// - Streams in chunks; does *not* read the whole file into memory.
10pub fn csv_row_spans(path: impl AsRef<SPath>) -> Result<Vec<(usize, usize)>> {
11	let path = path.as_ref();
12	let mut f = open_file(path)?;
13	csv_row_spans_from_reader(&mut f).map_err(|err| Error::FileCantRead((path, err).into()))
14}
15
16// region:    --- Support
17
/// Scans a CSV byte stream and returns the `[start, end)` byte range of every record.
///
/// The reader is consumed through a fixed-size buffer, so the input is never held
/// in memory as a whole; only the resulting spans are accumulated.
///
/// Scanning rules:
/// - `'\n'` ends a record only when it is outside a quoted field.
/// - A `'\r'` directly before that `'\n'` is excluded from the span.
/// - Inside a quoted field, `""` is an escaped quote and does not close the field.
/// - Bytes left after the last record delimiter are reported as a final record,
///   even when the input does not end with `'\n'`.
///
/// # Errors
///
/// Returns the first I/O error reported by `r`. `ErrorKind::Interrupted` is not
/// treated as a failure: the read is retried.
fn csv_row_spans_from_reader<R: Read>(r: &mut R) -> io::Result<Vec<(usize, usize)>> {
	let mut spans: Vec<(usize, usize)> = Vec::new();

	// 64 KiB read buffer, reused for every chunk.
	let mut buf = [0u8; 64 * 1024];

	// Absolute offset of `buf[0]` within the stream.
	let mut file_pos: usize = 0;
	// Absolute offset where the current record starts.
	let mut rec_start: usize = 0;

	// All three flags live outside the read loop so they carry across chunk boundaries.
	// Currently inside a quoted field.
	let mut in_quotes = false;
	// The previous byte was a '"' seen inside quotes; the next byte decides whether
	// it closed the field or was the first half of a `""` escape.
	let mut quote_pending = false;
	// The previous byte was '\r' (used to trim CRLF).
	let mut prev_byte_is_cr = false;

	loop {
		let n = match r.read(&mut buf) {
			Ok(0) => break,
			Ok(n) => n,
			// An interrupted read is non-fatal per the `Read` contract: retry it.
			Err(err) if err.kind() == io::ErrorKind::Interrupted => continue,
			Err(err) => return Err(err),
		};

		for (i, &b) in buf[..n].iter().enumerate() {
			// Resolve the quote left undecided by the previous byte.
			if quote_pending {
				quote_pending = false;
				if b == b'"' {
					// `""` escape: a literal quote, the field stays open.
					prev_byte_is_cr = false;
					continue;
				}
				// The previous '"' closed the field; handle `b` normally below.
				in_quotes = false;
			}

			match b {
				b'"' => {
					if in_quotes {
						// Closing quote or first half of `""`: decided on the next byte.
						quote_pending = true;
					} else {
						in_quotes = true;
					}
				}
				b'\n' if !in_quotes => {
					let abs_nl = file_pos + i;
					// `prev_byte_is_cr` implies a previous byte exists, so `abs_nl >= 1`.
					let end = if prev_byte_is_cr { abs_nl - 1 } else { abs_nl };
					spans.push((rec_start, end));
					rec_start = abs_nl + 1;
				}
				_ => {}
			}

			prev_byte_is_cr = b == b'\r';
		}

		file_pos += n;
	}

	// Final record when the input does not end with '\n'.
	if rec_start < file_pos {
		spans.push((rec_start, file_pos));
	}

	Ok(spans)
}
122
123// endregion: --- Support
124
125// region:    --- Tests
126
#[cfg(test)]
mod tests {
	use super::*;

	// Test-local result type so `?` works with any error.
	type Result<T> = core::result::Result<T, Box<dyn std::error::Error>>;

	/// Checks that the fixture CSV is split into the expected rows, including a row
	/// whose quoted field contains a newline and escaped quotes.
	#[test]
	fn test_span_csv_row_spans_simple() -> Result<()> {
		// -- Setup & Fixtures
		let path = SPath::from("tests-data/example.csv");
		let expected = [
			"name,age,comment",
			"Alice,30,\"hello, world\"",
			"Bob,25,\"Line with \"\"quote\"\"\"",
			"Carol,28,\"multi\nline with \"\"quotes\"\" inside\"",
		];

		// -- Exec
		let spans = csv_row_spans(&path)?;

		// -- Check
		assert_eq!(spans.len(), 4, "should find 4 CSV records (including header)");

		for (idx, exp) in expected.iter().enumerate() {
			let &(start, end) = spans.get(idx).ok_or("missing expected span")?;
			let got = crate::read_span(&path, start, end)?;
			assert_eq!(got, *exp);
		}

		Ok(())
	}
}
160
161// endregion: --- Tests