csv_index/simple.rs
1use std::io;
2
3use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
4use csv;
5
/// A simple index for random access to CSV records.
///
/// This index permits seeking to the start of any CSV record with a constant
/// number of operations.
///
/// The format of the index is simplistic and amenable to serializing to disk.
/// It consists of exactly `N+1` 64 bit big-endian integers, where `N` is the
/// number of records in the CSV data that is indexed. Each `i`th integer
/// corresponds to the approximate byte offset where the `i`th record in the
/// CSV data begins. One additional integer is written to the end of the index
/// which indicates the total number of records in the CSV data.
///
/// This indexing format does not store the line numbers of CSV records, so
/// using the positions returned by this index to seek a CSV reader will likely
/// cause any future line numbers reported by that reader to be incorrect.
///
/// This format will never change.
///
/// N.B. The format of this indexing scheme matches the format of the old
/// `csv::Indexed` type in pre-1.0 versions of the `csv` crate.
#[derive(Debug)]
pub struct RandomAccessSimple<R> {
    /// The handle over the serialized index data.
    rdr: R,
    /// The number of records recorded in the index.
    len: u64,
}
30
31impl<W: io::Write> RandomAccessSimple<W> {
32 /// Write a simple index to the given writer for the given CSV reader.
33 ///
34 /// If there was a problem reading CSV records or writing to the given
35 /// writer, then an error is returned.
36 ///
37 /// That the given CSV reader is read as given until EOF. The index
38 /// produced includes all records, including the first record even if the
39 /// CSV reader is configured to interpret the first record as a header
40 /// record.
41 ///
42 /// # Example: in memory index
43 ///
44 /// This example shows how to create a simple random access index, open it
45 /// and query the number of records in the index.
46 ///
47 /// ```
48 /// use std::io;
49 /// use csv_index::RandomAccessSimple;
50 ///
51 /// # fn main() { example().unwrap(); }
52 /// fn example() -> csv::Result<()> {
53 /// let data = "\
54 /// city,country,pop
55 /// Boston,United States,4628910
56 /// Concord,United States,42695
57 /// ";
58 /// let mut rdr = csv::Reader::from_reader(data.as_bytes());
59 /// let mut wtr = io::Cursor::new(vec![]);
60 /// RandomAccessSimple::create(&mut rdr, &mut wtr)?;
61 ///
62 /// let idx = RandomAccessSimple::open(wtr)?;
63 /// assert_eq!(idx.len(), 3);
64 /// Ok(())
65 /// }
66 /// ```
67 ///
68 /// # Example: file backed index
69 ///
70 /// This is like the previous example, but instead of creating the index
71 /// in memory with `std::io::Cursor`, we write the index to a file.
72 ///
73 /// ```no_run
74 /// use std::fs::File;
75 /// use std::io;
76 /// use csv_index::RandomAccessSimple;
77 ///
78 /// # fn main() { example().unwrap(); }
79 /// fn example() -> csv::Result<()> {
80 /// let data = "\
81 /// city,country,pop
82 /// Boston,United States,4628910
83 /// Concord,United States,42695
84 /// ";
85 /// let mut rdr = csv::Reader::from_reader(data.as_bytes());
86 /// let mut wtr = File::create("data.csv.idx")?;
87 /// RandomAccessSimple::create(&mut rdr, &mut wtr)?;
88 ///
89 /// let fileidx = File::open("data.csv.idx")?;
90 /// let idx = RandomAccessSimple::open(fileidx)?;
91 /// assert_eq!(idx.len(), 3);
92 /// Ok(())
93 /// }
94 /// ```
95 pub fn create<R: io::Read>(
96 rdr: &mut csv::Reader<R>,
97 mut wtr: W,
98 ) -> csv::Result<()> {
99 // If the reader is configured to read a header, then read that
100 // first. (The CSV reader otherwise won't yield the header record
101 // when calling `read_byte_record`.)
102 let mut len = 0;
103 if rdr.has_headers() {
104 let header = rdr.byte_headers()?;
105 if !header.is_empty() {
106 let pos = header.position().expect("position on header row");
107 wtr.write_u64::<BigEndian>(pos.byte())?;
108 len += 1;
109 }
110 }
111 let mut record = csv::ByteRecord::new();
112 while rdr.read_byte_record(&mut record)? {
113 let pos = record.position().expect("position on row");
114 wtr.write_u64::<BigEndian>(pos.byte())?;
115 len += 1;
116 }
117 wtr.write_u64::<BigEndian>(len)?;
118 Ok(())
119 }
120}
121
122impl<R: io::Read + io::Seek> RandomAccessSimple<R> {
123 /// Open an existing simple CSV index.
124 ///
125 /// The reader given must be seekable and should contain an index written
126 /// by `RandomAccessSimple::create`.
127 ///
128 /// # Example
129 ///
130 /// This example shows how to create a simple random access index, open it
131 /// and query the number of records in the index.
132 ///
133 /// ```
134 /// use std::io;
135 /// use csv_index::RandomAccessSimple;
136 ///
137 /// # fn main() { example().unwrap(); }
138 /// fn example() -> csv::Result<()> {
139 /// let data = "\
140 /// city,country,pop
141 /// Boston,United States,4628910
142 /// Concord,United States,42695
143 /// ";
144 /// let mut rdr = csv::Reader::from_reader(data.as_bytes());
145 /// let mut wtr = io::Cursor::new(vec![]);
146 /// RandomAccessSimple::create(&mut rdr, &mut wtr)?;
147 ///
148 /// let idx = RandomAccessSimple::open(wtr)?;
149 /// assert_eq!(idx.len(), 3);
150 /// Ok(())
151 /// }
152 /// ```
153 pub fn open(mut rdr: R) -> csv::Result<RandomAccessSimple<R>> {
154 rdr.seek(io::SeekFrom::End(-8))?;
155 let len = rdr.read_u64::<BigEndian>()?;
156 Ok(RandomAccessSimple { rdr: rdr, len: len })
157 }
158
159 /// Get the position of the record at index `i`.
160 ///
161 /// The first record has index `0`.
162 ///
163 /// If the position returned is used to seek the CSV reader that was used
164 /// to create this index, then the next record read by the CSV reader will
165 /// be the `i`th record.
166 ///
167 /// Note that since this index does not store the line number of each
168 /// record, the position returned will always have a line number equivalent
169 /// to `1`. This in turn will cause the CSV reader to report all subsequent
170 /// line numbers incorrectly.
171 ///
172 /// # Example
173 ///
174 /// This example shows how to create a simple random access index, open it
175 /// and use it to seek a CSV reader to read an arbitrary record.
176 ///
177 /// ```
178 /// use std::error::Error;
179 /// use std::io;
180 /// use csv_index::RandomAccessSimple;
181 ///
182 /// # fn main() { example().unwrap(); }
183 /// fn example() -> Result<(), Box<dyn Error>> {
184 /// let data = "\
185 /// city,country,pop
186 /// Boston,United States,4628910
187 /// Concord,United States,42695
188 /// ";
189 /// // Note that we wrap our CSV data in an io::Cursor, which makes it
190 /// // seekable. If you're opening CSV data from a file, then this is
191 /// // not needed since a `File` is already seekable.
192 /// let mut rdr = csv::Reader::from_reader(io::Cursor::new(data));
193 /// let mut wtr = io::Cursor::new(vec![]);
194 /// RandomAccessSimple::create(&mut rdr, &mut wtr)?;
195 ///
196 /// // Open the index we just created, get the position of the last
197 /// // record and seek the CSV reader.
198 /// let mut idx = RandomAccessSimple::open(wtr)?;
199 /// let pos = idx.get(2)?;
200 /// rdr.seek(pos)?;
201 ///
202 /// // Read the next record.
203 /// if let Some(result) = rdr.records().next() {
204 /// let record = result?;
205 /// assert_eq!(record, vec!["Concord", "United States", "42695"]);
206 /// Ok(())
207 /// } else {
208 /// Err(From::from("expected at least one record but got none"))
209 /// }
210 /// }
211 /// ```
212 pub fn get(&mut self, i: u64) -> csv::Result<csv::Position> {
213 if i >= self.len {
214 let msg = format!(
215 "invalid record index {} (there are {} records)",
216 i, self.len
217 );
218 let err = io::Error::new(io::ErrorKind::Other, msg);
219 return Err(csv::Error::from(err));
220 }
221 self.rdr.seek(io::SeekFrom::Start(i * 8))?;
222 let offset = self.rdr.read_u64::<BigEndian>()?;
223 let mut pos = csv::Position::new();
224 pos.set_byte(offset).set_record(i);
225 Ok(pos)
226 }
227
228 /// Return the number of records (including the header record) in this
229 /// index.
230 pub fn len(&self) -> u64 {
231 self.len
232 }
233
234 /// Return true if and only if this index has zero records.
235 pub fn is_empty(&self) -> bool {
236 self.len() == 0
237 }
238}
239
#[cfg(test)]
mod tests {
    use std::io;

    use csv;

    use super::RandomAccessSimple;

    /// Four records with a single field each.
    const ONE_FIELD: &str = "h1\na\nb\nc\n";

    /// Four records with three fields each.
    const MANY_FIELDS: &str = "h1,h2,h3\na,b,c\nd,e,f\ng,h,i\n";

    /// The same four single-field records as `ONE_FIELD`, but surrounded by
    /// and interleaved with runs of empty lines.
    const ONE_FIELD_NEWLINES: &str =
        "\n\n\n\n\nh1\n\na\n\n\nb\n\n\n\n\n\n\nc\n\n\n\n\n\n\n";

    /// A seekable CSV reader bundled with the index built from its data.
    struct Indexed<'a> {
        csv: csv::Reader<io::Cursor<&'a str>>,
        idx: RandomAccessSimple<io::Cursor<Vec<u8>>>,
    }

    impl<'a> Indexed<'a> {
        /// Build an in-memory index over `csv_data` and reopen it for
        /// querying. `headers` controls whether the CSV reader treats the
        /// first record as a header.
        fn new(headers: bool, csv_data: &'a str) -> Indexed<'a> {
            let mut builder = csv::ReaderBuilder::new();
            builder.has_headers(headers);
            let mut reader = builder.from_reader(io::Cursor::new(csv_data));

            let mut buf = io::Cursor::new(vec![]);
            RandomAccessSimple::create(&mut reader, &mut buf).unwrap();
            let idx = RandomAccessSimple::open(buf).unwrap();

            Indexed { csv: reader, idx }
        }

        /// Seek the CSV reader to the indexed position of `record` and
        /// return the record read from there.
        fn read_at(&mut self, record: u64) -> csv::StringRecord {
            let target = self.idx.get(record).unwrap();
            self.csv.seek(target).unwrap();
            let next = self.csv.records().next();
            next.unwrap().unwrap()
        }
    }

    /// Assert that the index holds the four single-field records.
    fn check_one_field(indexed: &mut Indexed) {
        assert_eq!(indexed.idx.len(), 4);
        assert_eq!(indexed.read_at(0), vec!["h1"]);
        assert_eq!(indexed.read_at(1), vec!["a"]);
        assert_eq!(indexed.read_at(2), vec!["b"]);
        assert_eq!(indexed.read_at(3), vec!["c"]);
    }

    /// Assert that the index holds the four three-field records.
    fn check_many_fields(indexed: &mut Indexed) {
        assert_eq!(indexed.idx.len(), 4);
        assert_eq!(indexed.read_at(0), vec!["h1", "h2", "h3"]);
        assert_eq!(indexed.read_at(1), vec!["a", "b", "c"]);
        assert_eq!(indexed.read_at(2), vec!["d", "e", "f"]);
        assert_eq!(indexed.read_at(3), vec!["g", "h", "i"]);
    }

    #[test]
    fn headers_empty() {
        let indexed = Indexed::new(true, "");
        assert_eq!(indexed.idx.len(), 0);
    }

    #[test]
    fn headers_one_field() {
        check_one_field(&mut Indexed::new(true, ONE_FIELD));
    }

    #[test]
    fn headers_many_fields() {
        check_many_fields(&mut Indexed::new(true, MANY_FIELDS));
    }

    #[test]
    fn no_headers_one_field() {
        check_one_field(&mut Indexed::new(false, ONE_FIELD));
    }

    #[test]
    fn no_headers_many_fields() {
        check_many_fields(&mut Indexed::new(false, MANY_FIELDS));
    }

    #[test]
    fn headers_one_field_newlines() {
        check_one_field(&mut Indexed::new(true, ONE_FIELD_NEWLINES));
    }
}
371}