1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298

#![feature(btree_range, collections_bound)]
#![deny(missing_docs)]

//! Overview
//! ---------
//! This library contains the internal data structure used by the ironstrom project
//!
//! Design goals
//! ---------------
//! - Lightning fast auto completion / type ahead lookups (~200 microseconds! per lookup)
//! - Not too much searchable text per entry, e.g: street names for locations or movie titles for movies
//! - High number of possible candidates (multiple gigabytes)
//! - It can be recommended, but must not be rquired to fit the whole data set into physical memory
//! - The LookupTable should use virtual memory and OS level optimization to handle larger data sets
//! - Full text search capability
//! - Optimized for hardly ever changing data sets, e.g.: All streets in a country
//! - No mulithreading if not absolutely required => Buy lookup speed with memory, not processing power!
//! - Optimize for returning a small number of matches, e.g: Find first 10 of 2 million movies that contain 'hero'
//! - Only one dimensional coarse sorting required, e.g: Fantasy books should be returnd before science fiction books
//! - Lazy stream/iterator based lookup implementation
//!
//! Accepted drawbacks
//! ------------------
//! - Creating a `LookupTable` for multiple gigabytes of data can take a few minutes
//! - A `LookupTable` can not be modified, only recreated
//! - No fine granular sorting possible: e.g: by lexicographical order
//!
//! Basic Usage
//! -----
//! 1. Create a custom type for the data you want to seacrh for, e.g.: a `Movie` struct
//! 2. Implement the `Lookup` trait for your custom type.
//! 3. Create an `Iterator` that will iterate over all the elements you would like to put into the `LookupTable`
//! 4. Create a new `LookupTable` by calling `LookupTable::from_iter(myMoviesIterator)`
//! 5. Call `myMoviesLookupTable.find("hero")` to get an lazy 'Iterator' over all matching elements
//!
//! Example
//! -------
//!
//! Let's build a `LookupTable` to find restaurants by name.
//!
//! ```rust
//! use std::iter::FromIterator;
//! use ironstorm_lookup::{LookupTable, Lookup, Bucket};
//!
//! // 1. Create a custom struct representing a restaurant
//! struct Restaurant<'a> {
//!     name: &'a str,
//!     cuisine: &'a str
//! }
//!
//! // 2. Implement the `Lookup` trait for `Restaurant` references
//! impl <'a> Lookup for &'a Restaurant<'a> {
//!
//!     // Make the restaurant name searchable
//!     fn searchable_text(&self) -> String {
//!         self.name.to_string()
//!     }
//!
//!     // Decide, based on cuisine, to which `Bucket` a restaurant belongs.
//!     // `Bucket` is just a type alias for an unsigned integer aka usize.
//!     // Matches in lower buckets will be returned before matches in higher buckets.
//!     fn bucket(&self) -> Bucket {
//!         match self.cuisine {
//!             "italian"   => 0,
//!             "german"    => 0,
//!             "chinese"   => 1,
//!             _           => 5
//!         }
//!     }
//! }
//!
//! // 3. Create some restaurants and the according iterator
//! let restaurants = vec![
//!     Restaurant{name:"India Man", cuisine:"indian"},
//!     Restaurant{name:"Ami Guy", cuisine:"american"},
//!     Restaurant{name:"Italiano Pizza", cuisine:"italian"},
//!     Restaurant{name:"Sushi House", cuisine:"chinese"},
//!     Restaurant{name:"Brezel Hut", cuisine:"german"}
//! ];
//! let iter = restaurants.iter();
//!
//! // 4. Create the `LookupTable`
//! let lookup_table = ironstorm_lookup::LookupTable::from_iter(iter);
//!
//! // 5. Find restaurants containing `i`
//!
//!
//! let mut result_iter = lookup_table.find("i");
//!
//! // two times 'Italiano pizza', because it's in the lowest bucket
//! // two times because it has two lower case `i` in the name
//! assert_eq!(result_iter.next().unwrap().name, "Italiano Pizza");
//! assert_eq!(result_iter.next().unwrap().name, "Italiano Pizza");
//!
//! // 'Sushi House', because it's in the second lowest bucket
//! assert_eq!(result_iter.next().unwrap().name, "Sushi House");
//!
//! // 'Ami Guy' or ' India Man'
//! // They are in the same bucket and there is no order within the same bucket
//! let indian_or_american_1 = result_iter.next().unwrap().name;
//! assert!(indian_or_american_1=="India Man" || indian_or_american_1=="Ami Guy");
//!
//! // The other one of 'Ami Guy' or ' India Man'
//! let indian_or_american_2 = result_iter.next().unwrap().name;
//! assert!(indian_or_american_2=="India Man" || indian_or_american_2=="Ami Guy");
//! assert!(indian_or_american_1 != indian_or_american_2);
//!
//! // No more matches
//! // "Brezel Hut" doesn't contain an "i" and was not part of the result.
//! assert!(result_iter.next().is_none());
//! ```

extern crate suffix;
extern crate itertools;

use suffix::SuffixTable;
use std::collections::{BTreeMap};
use std::collections::Bound::{Included, Unbounded};
use std::iter::FromIterator;


/// Every value that is inserted into the lookup table must be assigned to a bucket.
/// Values, assigned to a lower bucket, will be returned before values from a higher bucket.
/// This bucket mechanism is used instead a full blown sorting algorithm to boost performance.
pub type Bucket = usize;

type TextPosition = usize;

const SEPARATOR: &'static str = "\u{FFFF}";

/// Implement this trait for types that are going be put into a `LookupTable`
pub trait Lookup : Sync{

    /// The text that will be looked at when a lookup is executed.
    fn searchable_text(&self) -> String;

    /// The bucket in which this item will be put.
    /// Entries in lower buckets will be returned before entries in higher buckets.
    /// Don't introduce too many buckets per `LookupTable`.
    /// The worst case would be to have one Bucket per LookupTable entry.
    /// `Bucket` is just a type alias for an unsigned integer aka usize.
    fn bucket(&self) -> Bucket;
}

/// This is the actual `LookupTable` that creates the in memory data structure and uses it to perform the lookups.
/// It implements the `FromIterator` trait and its `from_iter(..)` method.
/// To create a new `LookupTable` instance, you first have to create an Iterator over some `Lookup` items.
/// Having that iterator, you can call `LookupTable::from_iter(myLookupItemIterator)``.
pub struct LookupTable<'a, V: 'a>  where V: Lookup{
    suffix_table_map: BTreeMap<Bucket, SuffixTable<'a,'a>>,
    position_map: BTreeMap<(Bucket, TextPosition), V>
}

impl <'a, A: Lookup>FromIterator<A> for LookupTable<'a, A>{

    /// Creates a `LookupTable` from the given Iterator
    fn from_iter<T>(iterator: T) -> Self where T: IntoIterator<Item=A>{
        let mut text_map: BTreeMap<Bucket, String> = BTreeMap::new();
        let mut position_map: BTreeMap<(Bucket, TextPosition), A> = BTreeMap::new();

        for value in iterator {
            let mut text = text_map.entry(value.bucket()).or_insert_with(String::new);
            let pos: TextPosition = text.len();

            text.push_str(&value.searchable_text().as_str());
            text.push_str(SEPARATOR);

            position_map.insert((value.bucket(), pos), value);
        }

        let mut suffix_table_map: BTreeMap<Bucket, SuffixTable> = BTreeMap::new();
        for (bucket, text) in text_map.into_iter(){
            suffix_table_map.insert(bucket, SuffixTable::new(text));
        }

        LookupTable{suffix_table_map: suffix_table_map, position_map: position_map}
    }
}

impl <'a, V>LookupTable<'a, V> where V: Lookup{

    fn get_value_for_position(&self, bucket: Bucket, text_position: TextPosition) -> &V{
        if let Some(value) = self.position_map.range(Unbounded, Included(&(bucket,(text_position as usize)))).rev().next() {
            let (&(_, _), value) = value;
            value
        }else {
            panic!("Could not find at least one value in position map.
                    This must be a bug! Please report it on https://github.com/forgemo/ironstorm_lookup/issues");
        }
    }

    /// Searches for `Lookup` entries with a `serachable_text` that contains the given `search_text`.
    /// If the `search_text` is found multiple times for the same entry, the entry will also be returned multiple times.
    /// If no matches are found, the Iterator will immediately start returning `None`.
    /// Entries in lower buckets will be returned before entries in higher buckets.
    /// The method is case sensitive.
    pub fn find(&'a self, search_text: &'a str) -> Box<Iterator<Item=&V> + Send + 'a> {
        let result_iter = self.suffix_table_map.iter()
        .flat_map(move |(bucket, suffix_table)|{
            suffix_table.positions(&search_text).iter().map(move |text_position|(bucket, text_position))
        })
        .map(move |(bucket, position)|self.get_value_for_position(*bucket, *position as usize));
        return Box::new(result_iter);
    }

    /// Returns the number of values for this `LookupTable`
    pub fn len(&self) -> usize {
        self.position_map.len()
    }

    /// Returns the number of buckets for this `LookupTable`
    pub fn bucket_count(&self) -> usize {
        self.suffix_table_map.len()
    }

}


#[cfg(test)]
mod tests {

    use {Lookup, LookupTable};
    use std::iter::FromIterator;

    impl <'a> Lookup for &'a str {
        fn searchable_text(&self) -> String {
            self.to_string()
        }
        fn bucket(&self) -> usize {
            self.len()
        }
    }

    #[test]
    fn it_works1() {
        let strings = vec!["a","a","a","a","a","a"];
        let t = LookupTable::from_iter(strings.into_iter());
        let len = t.find("a").collect::<Vec<_>>().len();
        assert_eq!(len, 6);
    }

    #[test]
    fn it_works2() {
        let strings = vec!["a","ab","abc","abcd","abcde","abcdef"];
        let t = LookupTable::from_iter(strings.into_iter());
        let mut i = t.find("a");
        assert_eq!(&"a", i.next().unwrap());
        assert_eq!(&"ab", i.next().unwrap());
        assert_eq!(&"abc", i.next().unwrap());
        assert_eq!(&"abcd", i.next().unwrap());
        assert_eq!(&"abcde", i.next().unwrap());
        assert_eq!(&"abcdef", i.next().unwrap());
    }

    #[test]
    fn it_works3() {
        let strings = vec!["ZZZ","ZZ","Z"];
        let t = LookupTable::from_iter(strings.into_iter());
        let mut i = t.find("Z");
        assert_eq!(&"Z", i.next().unwrap());
        assert_eq!(&"ZZ", i.next().unwrap());
        assert_eq!(&"ZZ", i.next().unwrap());
        assert_eq!(&"ZZZ", i.next().unwrap());
        assert_eq!(&"ZZZ", i.next().unwrap());
        assert_eq!(&"ZZZ", i.next().unwrap());
    }

    #[test]
    fn it_works4() {
        let strings = vec!["ZZZZZZZZ","ZZZZZZZZZ"];
        let t = LookupTable::from_iter(strings.into_iter());
        let i = t.find("Z");
        assert_eq!(17, i.count());
    }

    #[test]
    fn it_works5() {
        let strings = vec!["ZZZ","ZZZ","A","ZZZ","B","ZZZ"];
        let t = LookupTable::from_iter(strings.into_iter());
        let i = t.find("A");
        assert_eq!(1, i.count());
        let i = t.find("B");
        assert_eq!(1, i.count());
        let i = t.find("Z");
        assert_eq!(12, i.count());
        let i = t.find("ZZZ");
        assert_eq!(4, i.count());
    }

    #[test]
    fn it_works6() {
        let strings = vec!["A","B","C"];
        let t = LookupTable::from_iter(strings.into_iter());
        let i = t.find("D");
        assert_eq!(0, i.count());
    }
}