#![doc(html_logo_url = "https://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png",
       html_favicon_url = "https://www.rust-lang.org/favicon.ico", html_root_url = ".")]

//! The [`HashMap`][hash-map] and [`BTreeMap`][btree-map] in the standard library
//! offer very good performance for inserting and getting stuff,
//! but they're memory killers. If the "stuff" gets large - say, a trillion
//! (10<sup>12</sup>) entries - then we're in trouble, since holding all of it
//! would demand many gigabytes of RAM.
//!
//! Moreover, once the program quits, all the *hard-earned* stuff gets deallocated,
//! and we'd have to re-insert it all over again. [`HashFile`][hash-file] deals
//! with this specific problem. It makes use of a `BTreeMap` for storing the keys
//! and values. So, until it reaches the defined capacity, it offers the same
//! performance as that of the btree-map. However, once (and whenever) it reaches
//! the capacity, it *flushes* the stuff to a file (the necessary parameters can be
//! defined in its methods).
//!
//! Hence, at any given moment, the upper limit for the memory eaten by this thing
//! is set by its [capacity][capacity]. This gives us good control over the space-time
//! trade-off. The price is paid at flush time: a flush streams through the existing
//! file with iterators and rewrites it, so it takes time proportional to the number
//! of pairs already stored (on top of whatever the processor and I/O speed add).
//!
//! After the [final manual flush][finish], the file can be stored, moved around, and
//! since it makes use of binary search, values can be obtained in O(log-n) time
//! whenever they're required (depending on the seeking speed of the drive). For
//! example, an average seek takes around 0.3 ms, and a file containing a trillion
//! values demands about 40 seeks (in the worst case), which translates to about 12 ms.
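//!
//! A quick back-of-the-envelope check of those numbers (the 0.3 ms figure is just an
//! assumed average seek time, not a measured one):
//!
//! ``` rust
//! let entries: f64 = 1e12;
//! let seeks = entries.log2().ceil();      // ≈ 40 comparisons/seeks in the worst case
//! let latency_ms = seeks * 0.3;           // ≈ 12 ms at 0.3 ms per seek
//! assert_eq!(seeks as u64, 40);
//! assert!((latency_ms - 12.0).abs() < 0.5);
//! ```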
//!
//! This kind of "search and seek" is [already being used][wiki] by databases. But,
//! the system is simply an unnecessary complication if you just want a table with
//! a *zillion* rows and only two columns (a key and a value).
//!
//! [*See the `HashFile` type for more info.*][hash-file]
//!
//! [hash-map]: https://doc.rust-lang.org/std/collections/struct.HashMap.html
//! [btree-map]: https://doc.rust-lang.org/std/collections/struct.BTreeMap.html
//! [finish]: struct.HashFile.html#method.finish
//! [capacity]: struct.HashFile.html#method.set_capacity
//! [hash-file]: struct.HashFile.html
//! [wiki]: https://en.wikipedia.org/wiki/B-tree#B-tree_usage_in_databases
mod helpers;

use helpers::{SEP, create_or_open_file, hash, get_size};
use helpers::{read_one_line, seek_from_start, write_buffer};

use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::error::Error;
use std::fmt::{self, Display};
use std::fs::{self, File};
use std::hash::{Hash, Hasher};
use std::io::{BufRead, BufReader, BufWriter};
use std::mem;
use std::ops::AddAssign;
use std::str::FromStr;

const TEMP_SUFFIX: &'static str = ".hash_file";
const DAT_SUFFIX: &'static str = ".dat";

// FIXME: have a bool for marking key/vals to be removed (required for `remove` method)
struct KeyIndex<K: Display + FromStr + Hash> {
    key: K,
    idx: u64,
    count: usize,
}

impl<K: Display + FromStr + Hash> KeyIndex<K> {
    pub fn new(key: K) -> KeyIndex<K> {
        KeyIndex {
            key: key,
            idx: 0,
            count: 0,
        }
    }
}

// FIXME: This should be changed to serialization
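// Serialized (and on-disk) form: `<key> SEP <idx> SEP <count>`, where `SEP` is the
// separator from `helpers` (a null byte, per the format described in the `HashFile` docs)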
impl<K: Display + FromStr + Hash> Display for KeyIndex<K> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "{}{}{}{}{}", self.key, SEP, self.idx, SEP, self.count)
    }
}

impl<K: Display + FromStr + Hash> FromStr for KeyIndex<K> {
    type Err = String;

    fn from_str(s: &str) -> Result<KeyIndex<K>, String> {
        let mut split = s.split(SEP);
        Ok(KeyIndex {
            key: try!(split.next().unwrap_or("")
                                  .parse::<K>()
                                  .map_err(|_| format!("Cannot parse the key!"))),
            idx: try!(split.next().unwrap_or("")
                                  .parse::<u64>()
                                  .map_err(|_| format!("Cannot parse the index!"))),
            count: try!(split.next().unwrap_or("")
                                    .parse::<usize>()
                                    .map_err(|_| format!("Cannot parse 'overwritten' count"))),
        })
    }
}

impl<K: Display + FromStr + Hash> AddAssign for KeyIndex<K> {
    fn add_assign(&mut self, other: KeyIndex<K>) {
        // take the newly-written value's index, and accumulate the override count
        // (any overrides that happened in memory, plus this one)
        self.idx = other.idx;
        self.count += other.count + 1;
    }
}

impl<K: Display + FromStr + Hash> Hash for KeyIndex<K> {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.key.hash(state);
    }
}

/// An implementation of a "file-based" map which stores key-value pairs in sorted fashion in
/// file(s), and gets them using binary search and file seeking in O(log-n) time.
///
/// # Design
///
/// `HashFile` maintains two files - one for the keys and value indices, and the other
/// for the values themselves. Since it uses padding to maintain constant line length
/// throughout the file, large values can lead to extremely huge and sparse files. Having two
/// files eliminates this problem.
///
/// - During insertion, the hash of the supplied key is obtained (using the built-in
/// [`SipHasher`][hasher]), which acts as the key for sorting.
///
/// - While flushing, two iterators (one for the key/values in the underlying map, another
/// for the keys in the file) and two writers (one for key-index, another for the values)
/// are maintained. Both iterators yield key/values in ascending order; the hashes of the keys
/// are compared, the values are appended to a temporary file, and the keys (along with the
/// value indices) are written to another temporary file. In the end, both files are renamed.
///
/// - Basically, the files are in DSV format - one has keys and value indices separated by a
/// null byte, while the other has values separated by `\n` (see the sketch after this list).
/// Each line in the "key" file is padded with null bytes so that every line has the same
/// length, which is done by calling the [`finish`][finish] method. The method also cleans up
/// the "data" file and gets rid of values that are no longer referenced.
///
/// - While getting, the hash for the given key is computed, and a [binary search][search]
/// is made by seeking through the file. The value index is found in O(log-n) time, and the value
/// is obtained from the "data" file in O(1) time.
///
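/// As a rough sketch of the on-disk layout described above (`NUL` being the null-byte
/// separator, and the padding bringing every key-file line to the same width):
///
/// ``` text
/// key file  :  <key> NUL <value-index> NUL <overwrite-count> NUL ... NUL \n
/// data file :  <value> \n
/// ```
///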
/// # Examples
///
/// Once you've added the package to your `Cargo.toml`
///
/// ``` toml
/// [dependencies]
/// catalog = "0.1.1"
/// ```
///
/// ... it can be used as follows,
///
/// ```
/// extern crate catalog;
///
/// use catalog::HashFile;
///
/// // This will create a new file in the path (if it doesn't exist)
/// let mut hf: HashFile<usize, _> =
///     try!(HashFile::new("/tmp/SAMPLE").map(|hf| hf.set_capacity(100)));
///
/// // We don't have to mention all the types explicitly, leaving it to type inference
/// // But, there's a reason why I mentioned `usize` (which we'll see in a moment).
///
/// // Insert some stuff into the map (in this case, integers and upper case alphabets)
/// for i in 0..1000 {
///     try!(hf.insert(i, format!("{}", (65 + (i % 26)) as u8 as char)));
/// }
///
/// // This flushes the data to the file for every 100 key/value pairs, since
/// // we've set the capacity to 100.
///
/// // Call the finish method once you're done with insertion. It's necessary
/// // because it pads each line and ensures that all the lines have the same length.
/// try!(hf.finish());
///
/// // Now, we're ready to "get" the values.
/// let value = try!(hf.get(&0));
/// assert_eq!(Some(("A".to_owned(), 0)), value);
/// // Note that in addition to the value, there's a number.
///
/// // Let's try it again...
/// try!(hf.insert(0, format!("Z")));
/// // Don't forget to flush it to the file!
/// try!(hf.finish());
///
/// let value = try!(hf.get(&0));
/// assert_eq!(Some(("Z".to_owned(), 1)), value);
///
/// // So, the number is just a counter. HashFile keeps track of the number of
/// // times a value has been overridden (with insertion).
/// ```
///
/// Now, let's have a quick peek inside the generated files.
///
/// ``` bash
/// $ head -5 /tmp/SAMPLE*
/// ==> /tmp/SAMPLE <==
/// 68600
/// 18320
/// 59540
/// 50060
/// 1580
///
/// ==> /tmp/SAMPLE.dat <==
/// K
/// B
/// X
/// G
/// P
/// $ wc -l /tmp/SAMPLE*
///  1000 /tmp/SAMPLE
///  1000 /tmp/SAMPLE.dat
///  2000 total
/// $ ls -l /tmp/SAMPLE*
/// -rw-rw-r-- 1 user user 11000 Jul 09 22:10 /tmp/SAMPLE
/// -rw-rw-r-- 1 user user 2000 Jul 09 22:10 /tmp/SAMPLE.dat
/// ```
///
/// The file sizes will (and should!) always be a multiple of the number of
/// key/value pairs, since each line is padded to the same length (here, 11 bytes
/// per key line and 2 bytes per value line).
/// Now, another program can open the same files and get the key/value pairs.
///
/// ``` rust
/// // This will open the file in the path (if it exists)
/// let mut hf: HashFile<usize, String> = try!(HashFile::new("/tmp/SAMPLE"));
/// ```
///
/// A couple of things to note here. Before getting, we need to mention the types explicitly,
/// because rustc has no way of knowing what types live in the file (and it'll throw an error otherwise).
///
/// Moreover, if we hadn't explicitly mentioned `usize` during insertion,
/// `rustc` would've gone for some default type, and if we mention some other primitive
/// now, the hashes won't match i.e., `hash(0u32) != hash(0usize)`.
///
/// For example, `"2"` can be parsed to all the integer primitives (`u8`, `u64`, `isize`, etc.),
/// but, they all produce different hashes. In such a case, it's more likely that `HashFile`
/// returns `None` while getting the value corresponding to a key, even if it exists in
/// the file. Hence, it's up to the user to handle such cases
/// (by manually denoting the type during insertion and getting).
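///
/// A tiny (illustrative) demonstration of the mismatch, using the same default
/// [`SipHasher`][hasher] mentioned above:
///
/// ``` rust
/// use std::hash::{Hash, Hasher, SipHasher};
///
/// fn hash_of<T: Hash>(value: &T) -> u64 {
///     let mut hasher = SipHasher::new();
///     value.hash(&mut hasher);
///     hasher.finish()
/// }
///
/// // the very same number, hashed as two different primitive types
/// assert!(hash_of(&0u32) != hash_of(&0usize));
/// ```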
///
/// ``` rust
/// // Now, we can get the values...
/// let value = try!(hf.get(&0));
/// assert_eq!(Some(("Z".to_owned(), 1)), value);
///
/// // ... as many as we want!
/// let value = try!(hf.get(&1));
/// assert_eq!(Some(("B".to_owned(), 0)), value);
/// ```
///
/// We've used a lot of `try!` here, because each method invocation makes OS calls to
/// manipulate the underlying file descriptors. Since all the methods return a
/// [`Result<T, E>`][result], `HashFile` doesn't panic along the way - errors are
/// propagated to the caller instead.
///
/// # Advantages:
/// - **Control over memory:** You're planning to put a great deal of "stuff" into a map, but you
/// cannot afford the memory it demands. You want control over how much memory your map can
/// consume, while still having a map that can throw the values for your requested keys in
/// appreciable time.
///
/// - **Long term storage:** You're sure that the large "stuff" won't change in the near future,
/// and so you're not willing to risk the deallocation (when the program quits) or re-insertion
/// (whenever the program starts).
///
/// # Drawbacks:
/// - **Sluggish insertion:** Putting stuff into the usual in-memory maps is lightning fast -
/// the occasional re-allocation is hardly noticeable while a program runs. That's not the
/// case with a file. Every flush rewrites the whole file (making OS calls along the way),
/// so each flush takes time proportional to the number of pairs already stored, and the
/// total insertion time grows quadratically - which is *very* obvious in our case.
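///
/// As a rough, illustrative cost model (the numbers below are made up for the sake of the
/// example): with `n` total inserts and a capacity of `c`, the map gets flushed about
/// `n / c` times, and the i-th flush rewrites roughly `i * c` pairs.
///
/// ``` rust
/// let (n, c) = (1_000_000u64, 10_000u64);
/// let flushes = n / c;                                        // 100 flushes
/// let rewritten: u64 = (1..flushes + 1).map(|i| i * c).sum(); // pairs rewritten in total
/// assert_eq!(rewritten, 50_500_000);                          // ≈ n² / (2 * c)
/// ```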
///
/// [finish]: #method.finish
/// [hasher]: https://doc.rust-lang.org/std/hash/struct.SipHasher.html
/// [result]: https://doc.rust-lang.org/std/result/enum.Result.html
/// [search]: https://en.wikipedia.org/wiki/Binary_search_algorithm
pub struct HashFile<K: Display + FromStr + Hash, V: Display + FromStr> {
    file: File,
    path: String,
    size: u64,
    data_file: File,
    data_path: String,
    data_idx: u64,
    hashed: BTreeMap<u64, (KeyIndex<K>, V)>,
    capacity: usize,
    line_length: usize,
}

impl<K: Display + FromStr + Hash, V: Display + FromStr> HashFile<K, V> {
    /// Create a new `HashFile` for mapping key/value pairs in the given path
    pub fn new(path: &str) -> Result<HashFile<K, V>, String> {
        let mut file = try!(create_or_open_file(&path));
        let file_size = get_size(&file).unwrap_or(0);
        let data_path = format!("{}{}", path, DAT_SUFFIX);

        Ok(HashFile {
            hashed: BTreeMap::new(),
            capacity: 0,
            line_length: if file_size > 0 {
                let line = try!(read_one_line(&mut file));
                line.len()
            } else {
                0
            },
            file: {
                try!(seek_from_start(&mut file, 0));
                file
            },
            data_file: try!(create_or_open_file(&data_path)),
            data_path: data_path,
            data_idx: 0,
            path: path.to_owned(),
            size: file_size,
        })
    }

    /// Set the capacity of the `HashFile` (to flush to the file whenever it exceeds this value)
    pub fn set_capacity(mut self, capacity: usize) -> HashFile<K, V> {
        self.capacity = capacity;
        self
    }

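    // The flush path writes everything to `*.hash_file` temp files first, and only then
    // renames them over the originals, so an interrupted flush leaves the old files untouched.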
    fn rename_temp_file(&mut self, rename_dat: bool) -> Result<(), String> {
        if rename_dat {
            try!(fs::rename(format!("{}{}", &self.data_path, TEMP_SUFFIX), &self.data_path)
                    .map_err(|e| format!("Cannot rename the data file! ({})", e.description())));
            self.data_file = try!(create_or_open_file(&self.data_path));
        }

        try!(fs::rename(format!("{}{}", &self.path, TEMP_SUFFIX), &self.path)
                .map_err(|e| format!("Cannot rename the temp file! ({})", e.description())));
        self.file = try!(create_or_open_file(&self.path));
        self.size = try!(get_size(&self.file));
        Ok(())
    }

    /// Run this finally to flush the values (if any) from the struct to the file
    pub fn finish(&mut self) -> Result<(), String> {
        if self.hashed.len() > 0 {
            // seeking, so that we can ensure that the cursor is at the start while reading,
            // and at the end while writing.
            try!(seek_from_start(&mut self.file, 0));
            try!(seek_from_start(&mut self.data_file, self.data_idx));
            try!(self.flush_map());
        }

        // We need to traverse one last time to ensure that each row has the same length
        // (and to drop values from the data file that are no longer referenced)

        {
            let buf_reader = BufReader::new(&mut self.file);
            let mut out_file = try!(create_or_open_file(&format!("{}{}", &self.path, TEMP_SUFFIX)));
            let mut buf_writer = BufWriter::new(&mut out_file);
            let mut data_file = try!(create_or_open_file(&format!("{}{}", &self.data_path, TEMP_SUFFIX)));
            let mut data_writer = BufWriter::new(&mut data_file);
            self.data_idx = 0;

            for ref line in buf_reader.lines().filter_map(|l| l.ok()) {
                let mut key_index = try!(line.parse::<KeyIndex<K>>());
                try!(seek_from_start(&mut self.data_file, key_index.idx));
                let value = try!(read_one_line(&mut self.data_file));
                key_index.idx = self.data_idx;
                self.data_idx += try!(write_buffer(&mut data_writer, &value, &mut 0));
                // Even though this takes a mutable reference, we can be certain that
                // we've found the maximum row length for this session
                try!(write_buffer(&mut buf_writer, &key_index.to_string(), &mut self.line_length));
            }
        }

        self.rename_temp_file(true)
    }

    fn flush_map(&mut self) -> Result<(), String> {
        let map = mem::replace(&mut self.hashed, BTreeMap::new());

        {
            let buf_reader = BufReader::new(&mut self.file);
            let mut out_file = try!(create_or_open_file(&format!("{}{}", &self.path, TEMP_SUFFIX)));
            let mut buf_writer = BufWriter::new(&mut out_file);
            let mut data_writer = BufWriter::new(&mut self.data_file);

            // both the iterators throw the values in ascending order
            let mut file_iter = buf_reader.lines().filter_map(|l| l.ok()).peekable();
            let mut map_iter = map.into_iter().peekable();

            loop {
                // `None` marks a file line whose key couldn't be parsed
                let compare_result = match (file_iter.peek(), map_iter.peek()) {
                    (Some(file_line), Some(&(btree_key_hash, _))) => {
                        let key = file_line.split(SEP).next().unwrap();
                        match key.parse::<K>() {
                            Ok(k_v) => Some(hash(&k_v).cmp(&btree_key_hash)),
                            // we don't wanna stop the giant build because of one bad line
                            Err(_) => None,
                        }
                    },
                    (Some(_), None) => Some(Ordering::Less),
                    (None, Some(_)) => Some(Ordering::Greater),
                    (None, None) => break,
                };

                let compare_result = match compare_result {
                    Some(ordering) => ordering,
                    None => {
                        // skip the unparseable line - a bare `continue` from inside the
                        // match above would keep peeking at the very same line forever
                        file_iter.next();
                        continue
                    },
                };

                match compare_result {
                    Ordering::Equal => {
                        let file_line = file_iter.next().unwrap();
                        let (_, (mut btree_key, val)) = map_iter.next().unwrap();

                        // parse the existing line *before* writing the value, so that a bad
                        // line doesn't leave an orphaned value behind in the data file
                        let mut file_key = match file_line.parse::<KeyIndex<K>>() {
                            Ok(k_i) => k_i,
                            Err(_) => continue,     // skip on error
                        };

                        btree_key.idx = self.data_idx;
                        self.data_idx += try!(write_buffer(&mut data_writer, &val.to_string(), &mut 0));

                        file_key += btree_key;
                        try!(write_buffer(&mut buf_writer, &file_key.to_string(),
                                          &mut self.line_length));
                    },
                    Ordering::Less => {
                        try!(write_buffer(&mut buf_writer, &file_iter.next().unwrap(),
                                          &mut self.line_length));
                    },
                    Ordering::Greater => {
                        let (_, (mut btree_key, val)) = map_iter.next().unwrap();
                        btree_key.idx = self.data_idx;
                        self.data_idx += try!(write_buffer(&mut data_writer, &val.to_string(), &mut 0));

                        try!(write_buffer(&mut buf_writer, &(btree_key.to_string()),
                                          &mut self.line_length));
                    },
                }
            }
        }

        self.rename_temp_file(false)
    }

    /// Insert a key/value pair
    pub fn insert(&mut self, key: K, value: V) -> Result<(), String> {
        let mut key_idx = KeyIndex::new(key);
        let hashed = hash(&key_idx);
        if let Some(key_val) = self.hashed.get_mut(&hashed) {
            // carry over the previous count, so that repeated inserts of the same key
            // within a single batch keep counting the overrides
            key_idx.count = key_val.0.count + 1;
            *key_val = (key_idx, value);
            return Ok(())
        }

        self.hashed.insert(hashed, (key_idx, value));
        if self.hashed.len() > self.capacity {  // flush to file once the capacity is full
            try!(self.flush_map());
        }

        Ok(())
    }

    /// Get the value corresponding to the key from the file
    pub fn get(&mut self, key: &K) -> Result<Option<(V, usize)>, String> {
        let hashed_key = hash(key);
        if self.size == 0 || try!(get_size(&self.data_file)) == 0 {
            return Ok(None)
        }

        let row_length = (self.line_length + 1) as u64;
        let mut low = 0;
        let mut high = self.size;

        // Binary search and file seeking to find the value(s)
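        // (`low`, `high` and `mid` are byte offsets; `mid` gets aligned down to the start
        // of a line, which works because every key-file line has the same padded length)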

        while low <= high {
            let mid = (low + high) / 2;
            // place the cursor at the start of a line
            let new_line_pos = mid - (mid + row_length) % row_length;
            try!(seek_from_start(&mut self.file, new_line_pos));
            let line = try!(read_one_line(&mut self.file));
            if line.is_empty() {
                // we've seeked past the last line - the key isn't in the file
                break
            }

            // we'll only need the hash of the key
            let mut split = line.split(SEP);
            let key_str = split.next().unwrap();
            let key = try!(key_str.parse::<K>()
                                  .map_err(|_| format!("Cannot parse the key from file!")));
            let hashed = hash(&key);

            if hashed == hashed_key {
                let key_index = try!(line.parse::<KeyIndex<K>>());
                try!(seek_from_start(&mut self.data_file, key_index.idx));
                let line = try!(read_one_line(&mut self.data_file));
                let value = try!(line.parse::<V>()
                                     .map_err(|_| format!("Cannot parse the value from file!")));
                return Ok(Some((value, key_index.count)))
            } else if hashed < hashed_key {
                low = mid + 1;
            } else if mid == 0 {
                // the key would sort before the first line (avoids underflowing `high`)
                break
            } else {
                high = mid - 1;
            }
        }

        Ok(None)
    }
}