1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371
use std::io; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use csv; /// A simple index for random access to CSV records. /// /// This index permits seeking to the start of any CSV record with a constant /// number of operations. /// /// The format of the index is simplistic and amenable to serializing to disk. /// It consists of exactly `N+1` 64 bit big-endian integers, where `N` is the /// number of records in the CSV data that is indexed. Each `i`th integer /// corresponds to the approximate byte offset where the `i`th record in the /// CSV data begins. One additional integer is written to the end of the index /// which indicates the total number of records in the CSV data. /// /// This indexing format does not store the line numbers of CSV records, so /// using the positions returned by this index to seek a CSV reader will likely /// cause any future line numbers reported by that reader to be incorrect. /// /// This format will never change. /// /// N.B. The format of this indexing scheme matches the format of the old the /// `csv::Indexed` type in pre-1.0 versions of the `csv` crate. pub struct RandomAccessSimple<R> { rdr: R, len: u64, } impl<W: io::Write> RandomAccessSimple<W> { /// Write a simple index to the given writer for the given CSV reader. /// /// If there was a problem reading CSV records or writing to the given /// writer, then an error is returned. /// /// That the given CSV reader is read as given until EOF. The index /// produced includes all records, including the first record even if the /// CSV reader is configured to interpret the first record as a header /// record. /// /// # Example: in memory index /// /// This example shows how to create a simple random access index, open it /// and query the number of records in the index. /// /// ``` /// use std::io; /// use csv_index::RandomAccessSimple; /// /// # fn main() { example().unwrap(); } /// fn example() -> csv::Result<()> { /// let data = "\ /// city,country,pop /// Boston,United States,4628910 /// Concord,United States,42695 /// "; /// let mut rdr = csv::Reader::from_reader(data.as_bytes()); /// let mut wtr = io::Cursor::new(vec![]); /// RandomAccessSimple::create(&mut rdr, &mut wtr)?; /// /// let idx = RandomAccessSimple::open(wtr)?; /// assert_eq!(idx.len(), 3); /// Ok(()) /// } /// ``` /// /// # Example: file backed index /// /// This is like the previous example, but instead of creating the index /// in memory with `std::io::Cursor`, we write the index to a file. /// /// ```no_run /// use std::fs::File; /// use std::io; /// use csv_index::RandomAccessSimple; /// /// # fn main() { example().unwrap(); } /// fn example() -> csv::Result<()> { /// let data = "\ /// city,country,pop /// Boston,United States,4628910 /// Concord,United States,42695 /// "; /// let mut rdr = csv::Reader::from_reader(data.as_bytes()); /// let mut wtr = File::create("data.csv.idx")?; /// RandomAccessSimple::create(&mut rdr, &mut wtr)?; /// /// let fileidx = File::open("data.csv.idx")?; /// let idx = RandomAccessSimple::open(fileidx)?; /// assert_eq!(idx.len(), 3); /// Ok(()) /// } /// ``` pub fn create<R: io::Read>( rdr: &mut csv::Reader<R>, mut wtr: W, ) -> csv::Result<()> { // If the reader is configured to read a header, then read that // first. (The CSV reader otherwise won't yield the header record // when calling `read_byte_record`.) let mut len = 0; if rdr.has_headers() { let header = rdr.byte_headers()?; if !header.is_empty() { let pos = header.position().expect("position on header row"); wtr.write_u64::<BigEndian>(pos.byte())?; len += 1; } } let mut record = csv::ByteRecord::new(); while rdr.read_byte_record(&mut record)? { let pos = record.position().expect("position on row"); wtr.write_u64::<BigEndian>(pos.byte())?; len += 1; } wtr.write_u64::<BigEndian>(len)?; Ok(()) } } impl<R: io::Read + io::Seek> RandomAccessSimple<R> { /// Open an existing simple CSV index. /// /// The reader given must be seekable and should contain an index written /// by `RandomAccessSimple::create`. /// /// # Example /// /// This example shows how to create a simple random access index, open it /// and query the number of records in the index. /// /// ``` /// use std::io; /// use csv_index::RandomAccessSimple; /// /// # fn main() { example().unwrap(); } /// fn example() -> csv::Result<()> { /// let data = "\ /// city,country,pop /// Boston,United States,4628910 /// Concord,United States,42695 /// "; /// let mut rdr = csv::Reader::from_reader(data.as_bytes()); /// let mut wtr = io::Cursor::new(vec![]); /// RandomAccessSimple::create(&mut rdr, &mut wtr)?; /// /// let idx = RandomAccessSimple::open(wtr)?; /// assert_eq!(idx.len(), 3); /// Ok(()) /// } /// ``` pub fn open(mut rdr: R) -> csv::Result<RandomAccessSimple<R>> { rdr.seek(io::SeekFrom::End(-8))?; let len = rdr.read_u64::<BigEndian>()?; Ok(RandomAccessSimple { rdr: rdr, len: len }) } /// Get the position of the record at index `i`. /// /// The first record has index `0`. /// /// If the position returned is used to seek the CSV reader that was used /// to create this index, then the next record read by the CSV reader will /// be the `i`th record. /// /// Note that since this index does not store the line number of each /// record, the position returned will always have a line number equivalent /// to `1`. This in turn will cause the CSV reader to report all subsequent /// line numbers incorrectly. /// /// # Example /// /// This example shows how to create a simple random access index, open it /// and use it to seek a CSV reader to read an arbitrary record. /// /// ``` /// use std::error::Error; /// use std::io; /// use csv_index::RandomAccessSimple; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box<dyn Error>> { /// let data = "\ /// city,country,pop /// Boston,United States,4628910 /// Concord,United States,42695 /// "; /// // Note that we wrap our CSV data in an io::Cursor, which makes it /// // seekable. If you're opening CSV data from a file, then this is /// // not needed since a `File` is already seekable. /// let mut rdr = csv::Reader::from_reader(io::Cursor::new(data)); /// let mut wtr = io::Cursor::new(vec![]); /// RandomAccessSimple::create(&mut rdr, &mut wtr)?; /// /// // Open the index we just created, get the position of the last /// // record and seek the CSV reader. /// let mut idx = RandomAccessSimple::open(wtr)?; /// let pos = idx.get(2)?; /// rdr.seek(pos)?; /// /// // Read the next record. /// if let Some(result) = rdr.records().next() { /// let record = result?; /// assert_eq!(record, vec!["Concord", "United States", "42695"]); /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` pub fn get(&mut self, i: u64) -> csv::Result<csv::Position> { if i >= self.len { let msg = format!( "invalid record index {} (there are {} records)", i, self.len ); let err = io::Error::new(io::ErrorKind::Other, msg); return Err(csv::Error::from(err)); } self.rdr.seek(io::SeekFrom::Start(i * 8))?; let offset = self.rdr.read_u64::<BigEndian>()?; let mut pos = csv::Position::new(); pos.set_byte(offset).set_record(i); Ok(pos) } /// Return the number of records (including the header record) in this /// index. pub fn len(&self) -> u64 { self.len } /// Return true if and only if this index has zero records. pub fn is_empty(&self) -> bool { self.len() == 0 } } #[cfg(test)] mod tests { use std::io; use csv; use super::RandomAccessSimple; struct Indexed<'a> { csv: csv::Reader<io::Cursor<&'a str>>, idx: RandomAccessSimple<io::Cursor<Vec<u8>>>, } impl<'a> Indexed<'a> { fn new(headers: bool, csv_data: &'a str) -> Indexed<'a> { let mut rdr = csv::ReaderBuilder::new() .has_headers(headers) .from_reader(io::Cursor::new(csv_data)); let mut idxbuf = io::Cursor::new(vec![]); RandomAccessSimple::create(&mut rdr, &mut idxbuf).unwrap(); Indexed { csv: rdr, idx: RandomAccessSimple::open(idxbuf).unwrap(), } } fn read_at(&mut self, record: u64) -> csv::StringRecord { let pos = self.idx.get(record).unwrap(); self.csv.seek(pos).unwrap(); self.csv.records().next().unwrap().unwrap() } } #[test] fn headers_empty() { let idx = Indexed::new(true, ""); assert_eq!(idx.idx.len(), 0); } #[test] fn headers_one_field() { let mut idx = Indexed::new(true, "h1\na\nb\nc\n"); assert_eq!(idx.idx.len(), 4); assert_eq!(idx.read_at(0), vec!["h1"]); assert_eq!(idx.read_at(1), vec!["a"]); assert_eq!(idx.read_at(2), vec!["b"]); assert_eq!(idx.read_at(3), vec!["c"]); } #[test] fn headers_many_fields() { let mut idx = Indexed::new( true, "\ h1,h2,h3 a,b,c d,e,f g,h,i ", ); assert_eq!(idx.idx.len(), 4); assert_eq!(idx.read_at(0), vec!["h1", "h2", "h3"]); assert_eq!(idx.read_at(1), vec!["a", "b", "c"]); assert_eq!(idx.read_at(2), vec!["d", "e", "f"]); assert_eq!(idx.read_at(3), vec!["g", "h", "i"]); } #[test] fn no_headers_one_field() { let mut idx = Indexed::new(false, "h1\na\nb\nc\n"); assert_eq!(idx.idx.len(), 4); assert_eq!(idx.read_at(0), vec!["h1"]); assert_eq!(idx.read_at(1), vec!["a"]); assert_eq!(idx.read_at(2), vec!["b"]); assert_eq!(idx.read_at(3), vec!["c"]); } #[test] fn no_headers_many_fields() { let mut idx = Indexed::new( false, "\ h1,h2,h3 a,b,c d,e,f g,h,i ", ); assert_eq!(idx.idx.len(), 4); assert_eq!(idx.read_at(0), vec!["h1", "h2", "h3"]); assert_eq!(idx.read_at(1), vec!["a", "b", "c"]); assert_eq!(idx.read_at(2), vec!["d", "e", "f"]); assert_eq!(idx.read_at(3), vec!["g", "h", "i"]); } #[test] fn headers_one_field_newlines() { let mut idx = Indexed::new( true, " h1 a b c ", ); assert_eq!(idx.idx.len(), 4); assert_eq!(idx.read_at(0), vec!["h1"]); assert_eq!(idx.read_at(1), vec!["a"]); assert_eq!(idx.read_at(2), vec!["b"]); assert_eq!(idx.read_at(3), vec!["c"]); } }