csv_index/
simple.rs

1use std::io;
2
3use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
4use csv;
5
/// A simple index for random access to CSV records.
///
/// This index permits seeking to the start of any CSV record with a constant
/// number of operations.
///
/// The format of the index is simplistic and amenable to serializing to disk.
/// It consists of exactly `N+1` 64 bit big-endian integers, where `N` is the
/// number of records in the CSV data that is indexed. Each `i`th integer
/// corresponds to the approximate byte offset where the `i`th record in the
/// CSV data begins. One additional integer is written to the end of the index
/// which indicates the total number of records in the CSV data.
///
/// This indexing format does not store the line numbers of CSV records, so
/// using the positions returned by this index to seek a CSV reader will likely
/// cause any future line numbers reported by that reader to be incorrect.
///
/// This format will never change.
///
/// N.B. The format of this indexing scheme matches the format of the old
/// `csv::Indexed` type in pre-1.0 versions of the `csv` crate.
#[derive(Debug)]
pub struct RandomAccessSimple<R> {
    /// The underlying stream holding the serialized index.
    rdr: R,
    /// The number of indexed records, as stored in the final integer of the
    /// index.
    len: u64,
}
30
31impl<W: io::Write> RandomAccessSimple<W> {
32    /// Write a simple index to the given writer for the given CSV reader.
33    ///
34    /// If there was a problem reading CSV records or writing to the given
35    /// writer, then an error is returned.
36    ///
37    /// That the given CSV reader is read as given until EOF. The index
38    /// produced includes all records, including the first record even if the
39    /// CSV reader is configured to interpret the first record as a header
40    /// record.
41    ///
42    /// # Example: in memory index
43    ///
44    /// This example shows how to create a simple random access index, open it
45    /// and query the number of records in the index.
46    ///
47    /// ```
48    /// use std::io;
49    /// use csv_index::RandomAccessSimple;
50    ///
51    /// # fn main() { example().unwrap(); }
52    /// fn example() -> csv::Result<()> {
53    ///     let data = "\
54    /// city,country,pop
55    /// Boston,United States,4628910
56    /// Concord,United States,42695
57    /// ";
58    ///     let mut rdr = csv::Reader::from_reader(data.as_bytes());
59    ///     let mut wtr = io::Cursor::new(vec![]);
60    ///     RandomAccessSimple::create(&mut rdr, &mut wtr)?;
61    ///
62    ///     let idx = RandomAccessSimple::open(wtr)?;
63    ///     assert_eq!(idx.len(), 3);
64    ///     Ok(())
65    /// }
66    /// ```
67    ///
68    /// # Example: file backed index
69    ///
70    /// This is like the previous example, but instead of creating the index
71    /// in memory with `std::io::Cursor`, we write the index to a file.
72    ///
73    /// ```no_run
74    /// use std::fs::File;
75    /// use std::io;
76    /// use csv_index::RandomAccessSimple;
77    ///
78    /// # fn main() { example().unwrap(); }
79    /// fn example() -> csv::Result<()> {
80    ///     let data = "\
81    /// city,country,pop
82    /// Boston,United States,4628910
83    /// Concord,United States,42695
84    /// ";
85    ///     let mut rdr = csv::Reader::from_reader(data.as_bytes());
86    ///     let mut wtr = File::create("data.csv.idx")?;
87    ///     RandomAccessSimple::create(&mut rdr, &mut wtr)?;
88    ///
89    ///     let fileidx = File::open("data.csv.idx")?;
90    ///     let idx = RandomAccessSimple::open(fileidx)?;
91    ///     assert_eq!(idx.len(), 3);
92    ///     Ok(())
93    /// }
94    /// ```
95    pub fn create<R: io::Read>(
96        rdr: &mut csv::Reader<R>,
97        mut wtr: W,
98    ) -> csv::Result<()> {
99        // If the reader is configured to read a header, then read that
100        // first. (The CSV reader otherwise won't yield the header record
101        // when calling `read_byte_record`.)
102        let mut len = 0;
103        if rdr.has_headers() {
104            let header = rdr.byte_headers()?;
105            if !header.is_empty() {
106                let pos = header.position().expect("position on header row");
107                wtr.write_u64::<BigEndian>(pos.byte())?;
108                len += 1;
109            }
110        }
111        let mut record = csv::ByteRecord::new();
112        while rdr.read_byte_record(&mut record)? {
113            let pos = record.position().expect("position on row");
114            wtr.write_u64::<BigEndian>(pos.byte())?;
115            len += 1;
116        }
117        wtr.write_u64::<BigEndian>(len)?;
118        Ok(())
119    }
120}
121
122impl<R: io::Read + io::Seek> RandomAccessSimple<R> {
123    /// Open an existing simple CSV index.
124    ///
125    /// The reader given must be seekable and should contain an index written
126    /// by `RandomAccessSimple::create`.
127    ///
128    /// # Example
129    ///
130    /// This example shows how to create a simple random access index, open it
131    /// and query the number of records in the index.
132    ///
133    /// ```
134    /// use std::io;
135    /// use csv_index::RandomAccessSimple;
136    ///
137    /// # fn main() { example().unwrap(); }
138    /// fn example() -> csv::Result<()> {
139    ///     let data = "\
140    /// city,country,pop
141    /// Boston,United States,4628910
142    /// Concord,United States,42695
143    /// ";
144    ///     let mut rdr = csv::Reader::from_reader(data.as_bytes());
145    ///     let mut wtr = io::Cursor::new(vec![]);
146    ///     RandomAccessSimple::create(&mut rdr, &mut wtr)?;
147    ///
148    ///     let idx = RandomAccessSimple::open(wtr)?;
149    ///     assert_eq!(idx.len(), 3);
150    ///     Ok(())
151    /// }
152    /// ```
153    pub fn open(mut rdr: R) -> csv::Result<RandomAccessSimple<R>> {
154        rdr.seek(io::SeekFrom::End(-8))?;
155        let len = rdr.read_u64::<BigEndian>()?;
156        Ok(RandomAccessSimple { rdr: rdr, len: len })
157    }
158
159    /// Get the position of the record at index `i`.
160    ///
161    /// The first record has index `0`.
162    ///
163    /// If the position returned is used to seek the CSV reader that was used
164    /// to create this index, then the next record read by the CSV reader will
165    /// be the `i`th record.
166    ///
167    /// Note that since this index does not store the line number of each
168    /// record, the position returned will always have a line number equivalent
169    /// to `1`. This in turn will cause the CSV reader to report all subsequent
170    /// line numbers incorrectly.
171    ///
172    /// # Example
173    ///
174    /// This example shows how to create a simple random access index, open it
175    /// and use it to seek a CSV reader to read an arbitrary record.
176    ///
177    /// ```
178    /// use std::error::Error;
179    /// use std::io;
180    /// use csv_index::RandomAccessSimple;
181    ///
182    /// # fn main() { example().unwrap(); }
183    /// fn example() -> Result<(), Box<dyn Error>> {
184    ///     let data = "\
185    /// city,country,pop
186    /// Boston,United States,4628910
187    /// Concord,United States,42695
188    /// ";
189    ///     // Note that we wrap our CSV data in an io::Cursor, which makes it
190    ///     // seekable. If you're opening CSV data from a file, then this is
191    ///     // not needed since a `File` is already seekable.
192    ///     let mut rdr = csv::Reader::from_reader(io::Cursor::new(data));
193    ///     let mut wtr = io::Cursor::new(vec![]);
194    ///     RandomAccessSimple::create(&mut rdr, &mut wtr)?;
195    ///
196    ///     // Open the index we just created, get the position of the last
197    ///     // record and seek the CSV reader.
198    ///     let mut idx = RandomAccessSimple::open(wtr)?;
199    ///     let pos = idx.get(2)?;
200    ///     rdr.seek(pos)?;
201    ///
202    ///     // Read the next record.
203    ///     if let Some(result) = rdr.records().next() {
204    ///         let record = result?;
205    ///         assert_eq!(record, vec!["Concord", "United States", "42695"]);
206    ///         Ok(())
207    ///     } else {
208    ///         Err(From::from("expected at least one record but got none"))
209    ///     }
210    /// }
211    /// ```
212    pub fn get(&mut self, i: u64) -> csv::Result<csv::Position> {
213        if i >= self.len {
214            let msg = format!(
215                "invalid record index {} (there are {} records)",
216                i, self.len
217            );
218            let err = io::Error::new(io::ErrorKind::Other, msg);
219            return Err(csv::Error::from(err));
220        }
221        self.rdr.seek(io::SeekFrom::Start(i * 8))?;
222        let offset = self.rdr.read_u64::<BigEndian>()?;
223        let mut pos = csv::Position::new();
224        pos.set_byte(offset).set_record(i);
225        Ok(pos)
226    }
227
228    /// Return the number of records (including the header record) in this
229    /// index.
230    pub fn len(&self) -> u64 {
231        self.len
232    }
233
234    /// Return true if and only if this index has zero records.
235    pub fn is_empty(&self) -> bool {
236        self.len() == 0
237    }
238}
239
#[cfg(test)]
mod tests {
    use std::io;

    use csv;

    use super::RandomAccessSimple;

    /// Single-column fixture: a header line followed by three rows.
    const ONE_FIELD: &str = "h1\na\nb\nc\n";

    /// Three-column fixture: a header line followed by three rows.
    const MANY_FIELDS: &str = "h1,h2,h3\na,b,c\nd,e,f\ng,h,i\n";

    /// A seekable CSV reader bundled with an index built from the same data.
    struct Indexed<'a> {
        csv: csv::Reader<io::Cursor<&'a str>>,
        idx: RandomAccessSimple<io::Cursor<Vec<u8>>>,
    }

    impl<'a> Indexed<'a> {
        /// Build the index in memory by running the reader over `csv_data`,
        /// then reopen that index for lookups.
        fn new(headers: bool, csv_data: &'a str) -> Indexed<'a> {
            let mut reader = csv::ReaderBuilder::new()
                .has_headers(headers)
                .from_reader(io::Cursor::new(csv_data));
            let mut index_bytes = io::Cursor::new(vec![]);
            RandomAccessSimple::create(&mut reader, &mut index_bytes).unwrap();
            let idx = RandomAccessSimple::open(index_bytes).unwrap();
            Indexed { csv: reader, idx }
        }

        /// Seek the CSV reader to the indexed position of `record` and
        /// return the record found there.
        fn read_at(&mut self, record: u64) -> csv::StringRecord {
            let pos = self.idx.get(record).unwrap();
            self.csv.seek(pos).unwrap();
            let mut records = self.csv.records();
            records.next().unwrap().unwrap()
        }
    }

    /// Index `data` and check both the record count and the contents of
    /// every record, looked up in order through the index.
    fn assert_index(headers: bool, data: &str, expected: &[&[&str]]) {
        let mut indexed = Indexed::new(headers, data);
        assert_eq!(indexed.idx.len(), expected.len() as u64);
        for (i, fields) in (0u64..).zip(expected.iter()) {
            assert_eq!(indexed.read_at(i), fields.to_vec());
        }
    }

    #[test]
    fn headers_empty() {
        assert_index(true, "", &[]);
    }

    #[test]
    fn headers_one_field() {
        assert_index(true, ONE_FIELD, &[&["h1"], &["a"], &["b"], &["c"]]);
    }

    #[test]
    fn headers_many_fields() {
        assert_index(
            true,
            MANY_FIELDS,
            &[
                &["h1", "h2", "h3"],
                &["a", "b", "c"],
                &["d", "e", "f"],
                &["g", "h", "i"],
            ],
        );
    }

    #[test]
    fn no_headers_one_field() {
        assert_index(false, ONE_FIELD, &[&["h1"], &["a"], &["b"], &["c"]]);
    }

    #[test]
    fn no_headers_many_fields() {
        assert_index(
            false,
            MANY_FIELDS,
            &[
                &["h1", "h2", "h3"],
                &["a", "b", "c"],
                &["d", "e", "f"],
                &["g", "h", "i"],
            ],
        );
    }

    #[test]
    fn headers_one_field_newlines() {
        // Rows are separated by runs of blank lines, with blank lines before
        // the first row and after the last one.
        let data = concat!(
            "\n\n\n\n\n",
            "h1\n",
            "\n",
            "a\n",
            "\n\n",
            "b\n",
            "\n\n\n\n\n\n",
            "c\n",
            "\n\n\n\n\n\n",
        );
        assert_index(true, data, &[&["h1"], &["a"], &["b"], &["c"]]);
    }
}