ironstorm_lookup/
lib.rs

1
2#![feature(btree_range, collections_bound)]
3#![deny(missing_docs)]
4
5//! Overview
6//! ---------
7//! This library contains the internal data structure used by the ironstrom project
8//!
9//! Design goals
10//! ---------------
11//! - Lightning fast auto completion / type ahead lookups (~200 microseconds! per lookup)
12//! - Not too much searchable text per entry, e.g: street names for locations or movie titles for movies
13//! - High number of possible candidates (multiple gigabytes)
14//! - It can be recommended, but must not be rquired to fit the whole data set into physical memory
15//! - The LookupTable should use virtual memory and OS level optimization to handle larger data sets
16//! - Full text search capability
17//! - Optimized for hardly ever changing data sets, e.g.: All streets in a country
18//! - No mulithreading if not absolutely required => Buy lookup speed with memory, not processing power!
19//! - Optimize for returning a small number of matches, e.g: Find first 10 of 2 million movies that contain 'hero'
20//! - Only one dimensional coarse sorting required, e.g: Fantasy books should be returnd before science fiction books
21//! - Lazy stream/iterator based lookup implementation
22//!
23//! Accepted drawbacks
24//! ------------------
25//! - Creating a `LookupTable` for multiple gigabytes of data can take a few minutes
26//! - A `LookupTable` can not be modified, only recreated
27//! - No fine granular sorting possible: e.g: by lexicographical order
28//!
29//! Basic Usage
30//! -----
31//! 1. Create a custom type for the data you want to seacrh for, e.g.: a `Movie` struct
32//! 2. Implement the `Lookup` trait for your custom type.
33//! 3. Create an `Iterator` that will iterate over all the elements you would like to put into the `LookupTable`
34//! 4. Create a new `LookupTable` by calling `LookupTable::from_iter(myMoviesIterator)`
35//! 5. Call `myMoviesLookupTable.find("hero")` to get an lazy 'Iterator' over all matching elements
36//!
37//! Example
38//! -------
39//!
40//! Let's build a `LookupTable` to find restaurants by name.
41//!
42//! ```rust
43//! use std::iter::FromIterator;
44//! use ironstorm_lookup::{LookupTable, Lookup, Bucket};
45//!
46//! // 1. Create a custom struct representing a restaurant
47//! struct Restaurant<'a> {
48//!     name: &'a str,
49//!     cuisine: &'a str
50//! }
51//!
52//! // 2. Implement the `Lookup` trait for `Restaurant` references
53//! impl <'a> Lookup for &'a Restaurant<'a> {
54//!
55//!     // Make the restaurant name searchable
56//!     fn searchable_text(&self) -> String {
57//!         self.name.to_string()
58//!     }
59//!
60//!     // Decide, based on cuisine, to which `Bucket` a restaurant belongs.
61//!     // `Bucket` is just a type alias for an unsigned integer aka usize.
62//!     // Matches in lower buckets will be returned before matches in higher buckets.
63//!     fn bucket(&self) -> Bucket {
64//!         match self.cuisine {
65//!             "italian"   => 0,
66//!             "german"    => 0,
67//!             "chinese"   => 1,
68//!             _           => 5
69//!         }
70//!     }
71//! }
72//!
73//! // 3. Create some restaurants and the according iterator
74//! let restaurants = vec![
75//!     Restaurant{name:"India Man", cuisine:"indian"},
76//!     Restaurant{name:"Ami Guy", cuisine:"american"},
77//!     Restaurant{name:"Italiano Pizza", cuisine:"italian"},
78//!     Restaurant{name:"Sushi House", cuisine:"chinese"},
79//!     Restaurant{name:"Brezel Hut", cuisine:"german"}
80//! ];
81//! let iter = restaurants.iter();
82//!
83//! // 4. Create the `LookupTable`
84//! let lookup_table = ironstorm_lookup::LookupTable::from_iter(iter);
85//!
86//! // 5. Find restaurants containing `i`
87//!
88//!
89//! let mut result_iter = lookup_table.find("i");
90//!
91//! // two times 'Italiano pizza', because it's in the lowest bucket
92//! // two times because it has two lower case `i` in the name
93//! assert_eq!(result_iter.next().unwrap().name, "Italiano Pizza");
94//! assert_eq!(result_iter.next().unwrap().name, "Italiano Pizza");
95//!
96//! // 'Sushi House', because it's in the second lowest bucket
97//! assert_eq!(result_iter.next().unwrap().name, "Sushi House");
98//!
99//! // 'Ami Guy' or ' India Man'
100//! // They are in the same bucket and there is no order within the same bucket
101//! let indian_or_american_1 = result_iter.next().unwrap().name;
102//! assert!(indian_or_american_1=="India Man" || indian_or_american_1=="Ami Guy");
103//!
104//! // The other one of 'Ami Guy' or ' India Man'
105//! let indian_or_american_2 = result_iter.next().unwrap().name;
106//! assert!(indian_or_american_2=="India Man" || indian_or_american_2=="Ami Guy");
107//! assert!(indian_or_american_1 != indian_or_american_2);
108//!
109//! // No more matches
110//! // "Brezel Hut" doesn't contain an "i" and was not part of the result.
111//! assert!(result_iter.next().is_none());
112//! ```
113
114extern crate suffix;
115extern crate itertools;
116
117use suffix::SuffixTable;
118use std::collections::{BTreeMap};
119use std::collections::Bound::{Included, Unbounded};
120use std::iter::FromIterator;
121
122
123/// Every value that is inserted into the lookup table must be assigned to a bucket.
124/// Values, assigned to a lower bucket, will be returned before values from a higher bucket.
125/// This bucket mechanism is used instead a full blown sorting algorithm to boost performance.
126pub type Bucket = usize;
127
128type TextPosition = usize;
129
130const SEPARATOR: &'static str = "\u{FFFF}";
131
132/// Implement this trait for types that are going be put into a `LookupTable`
133pub trait Lookup : Sync{
134
135    /// The text that will be looked at when a lookup is executed.
136    fn searchable_text(&self) -> String;
137
138    /// The bucket in which this item will be put.
139    /// Entries in lower buckets will be returned before entries in higher buckets.
140    /// Don't introduce too many buckets per `LookupTable`.
141    /// The worst case would be to have one Bucket per LookupTable entry.
142    /// `Bucket` is just a type alias for an unsigned integer aka usize.
143    fn bucket(&self) -> Bucket;
144}
145
146/// This is the actual `LookupTable` that creates the in memory data structure and uses it to perform the lookups.
147/// It implements the `FromIterator` trait and its `from_iter(..)` method.
148/// To create a new `LookupTable` instance, you first have to create an Iterator over some `Lookup` items.
149/// Having that iterator, you can call `LookupTable::from_iter(myLookupItemIterator)``.
150pub struct LookupTable<'a, V: 'a>  where V: Lookup{
151    suffix_table_map: BTreeMap<Bucket, SuffixTable<'a,'a>>,
152    position_map: BTreeMap<(Bucket, TextPosition), V>
153}
154
155impl <'a, A: Lookup>FromIterator<A> for LookupTable<'a, A>{
156
157    /// Creates a `LookupTable` from the given Iterator
158    fn from_iter<T>(iterator: T) -> Self where T: IntoIterator<Item=A>{
159        let mut text_map: BTreeMap<Bucket, String> = BTreeMap::new();
160        let mut position_map: BTreeMap<(Bucket, TextPosition), A> = BTreeMap::new();
161
162        for value in iterator {
163            let mut text = text_map.entry(value.bucket()).or_insert_with(String::new);
164            let pos: TextPosition = text.len();
165
166            text.push_str(&value.searchable_text().as_str());
167            text.push_str(SEPARATOR);
168
169            position_map.insert((value.bucket(), pos), value);
170        }
171
172        let mut suffix_table_map: BTreeMap<Bucket, SuffixTable> = BTreeMap::new();
173        for (bucket, text) in text_map.into_iter(){
174            suffix_table_map.insert(bucket, SuffixTable::new(text));
175        }
176
177        LookupTable{suffix_table_map: suffix_table_map, position_map: position_map}
178    }
179}
180
181impl <'a, V>LookupTable<'a, V> where V: Lookup{
182
183    fn get_value_for_position(&self, bucket: Bucket, text_position: TextPosition) -> &V{
184        if let Some(value) = self.position_map.range(Unbounded, Included(&(bucket,(text_position as usize)))).rev().next() {
185            let (&(_, _), value) = value;
186            value
187        }else {
188            panic!("Could not find at least one value in position map.
189                    This must be a bug! Please report it on https://github.com/forgemo/ironstorm_lookup/issues");
190        }
191    }
192
193    /// Searches for `Lookup` entries with a `serachable_text` that contains the given `search_text`.
194    /// If the `search_text` is found multiple times for the same entry, the entry will also be returned multiple times.
195    /// If no matches are found, the Iterator will immediately start returning `None`.
196    /// Entries in lower buckets will be returned before entries in higher buckets.
197    /// The method is case sensitive.
198    pub fn find(&'a self, search_text: &'a str) -> Box<Iterator<Item=&V> + Send + 'a> {
199        let result_iter = self.suffix_table_map.iter()
200        .flat_map(move |(bucket, suffix_table)|{
201            suffix_table.positions(&search_text).iter().map(move |text_position|(bucket, text_position))
202        })
203        .map(move |(bucket, position)|self.get_value_for_position(*bucket, *position as usize));
204        return Box::new(result_iter);
205    }
206
207    /// Returns the number of values for this `LookupTable`
208    pub fn len(&self) -> usize {
209        self.position_map.len()
210    }
211
212    /// Returns the number of buckets for this `LookupTable`
213    pub fn bucket_count(&self) -> usize {
214        self.suffix_table_map.len()
215    }
216
217}
218
219
220#[cfg(test)]
221mod tests {
222
223    use {Lookup, LookupTable};
224    use std::iter::FromIterator;
225
226    impl <'a> Lookup for &'a str {
227        fn searchable_text(&self) -> String {
228            self.to_string()
229        }
230        fn bucket(&self) -> usize {
231            self.len()
232        }
233    }
234
235    #[test]
236    fn it_works1() {
237        let strings = vec!["a","a","a","a","a","a"];
238        let t = LookupTable::from_iter(strings.into_iter());
239        let len = t.find("a").collect::<Vec<_>>().len();
240        assert_eq!(len, 6);
241    }
242
243    #[test]
244    fn it_works2() {
245        let strings = vec!["a","ab","abc","abcd","abcde","abcdef"];
246        let t = LookupTable::from_iter(strings.into_iter());
247        let mut i = t.find("a");
248        assert_eq!(&"a", i.next().unwrap());
249        assert_eq!(&"ab", i.next().unwrap());
250        assert_eq!(&"abc", i.next().unwrap());
251        assert_eq!(&"abcd", i.next().unwrap());
252        assert_eq!(&"abcde", i.next().unwrap());
253        assert_eq!(&"abcdef", i.next().unwrap());
254    }
255
256    #[test]
257    fn it_works3() {
258        let strings = vec!["ZZZ","ZZ","Z"];
259        let t = LookupTable::from_iter(strings.into_iter());
260        let mut i = t.find("Z");
261        assert_eq!(&"Z", i.next().unwrap());
262        assert_eq!(&"ZZ", i.next().unwrap());
263        assert_eq!(&"ZZ", i.next().unwrap());
264        assert_eq!(&"ZZZ", i.next().unwrap());
265        assert_eq!(&"ZZZ", i.next().unwrap());
266        assert_eq!(&"ZZZ", i.next().unwrap());
267    }
268
269    #[test]
270    fn it_works4() {
271        let strings = vec!["ZZZZZZZZ","ZZZZZZZZZ"];
272        let t = LookupTable::from_iter(strings.into_iter());
273        let i = t.find("Z");
274        assert_eq!(17, i.count());
275    }
276
277    #[test]
278    fn it_works5() {
279        let strings = vec!["ZZZ","ZZZ","A","ZZZ","B","ZZZ"];
280        let t = LookupTable::from_iter(strings.into_iter());
281        let i = t.find("A");
282        assert_eq!(1, i.count());
283        let i = t.find("B");
284        assert_eq!(1, i.count());
285        let i = t.find("Z");
286        assert_eq!(12, i.count());
287        let i = t.find("ZZZ");
288        assert_eq!(4, i.count());
289    }
290
291    #[test]
292    fn it_works6() {
293        let strings = vec!["A","B","C"];
294        let t = LookupTable::from_iter(strings.into_iter());
295        let i = t.find("D");
296        assert_eq!(0, i.count());
297    }
298}