ironstorm_lookup/lib.rs
1
2#![feature(btree_range, collections_bound)]
3#![deny(missing_docs)]
4
5//! Overview
6//! ---------
7//! This library contains the internal data structure used by the ironstrom project
8//!
9//! Design goals
10//! ---------------
11//! - Lightning fast auto completion / type ahead lookups (~200 microseconds! per lookup)
12//! - Not too much searchable text per entry, e.g: street names for locations or movie titles for movies
13//! - High number of possible candidates (multiple gigabytes)
14//! - It can be recommended, but must not be rquired to fit the whole data set into physical memory
15//! - The LookupTable should use virtual memory and OS level optimization to handle larger data sets
16//! - Full text search capability
17//! - Optimized for hardly ever changing data sets, e.g.: All streets in a country
18//! - No mulithreading if not absolutely required => Buy lookup speed with memory, not processing power!
19//! - Optimize for returning a small number of matches, e.g: Find first 10 of 2 million movies that contain 'hero'
20//! - Only one dimensional coarse sorting required, e.g: Fantasy books should be returnd before science fiction books
21//! - Lazy stream/iterator based lookup implementation
22//!
23//! Accepted drawbacks
24//! ------------------
25//! - Creating a `LookupTable` for multiple gigabytes of data can take a few minutes
26//! - A `LookupTable` can not be modified, only recreated
27//! - No fine granular sorting possible: e.g: by lexicographical order
28//!
29//! Basic Usage
30//! -----
31//! 1. Create a custom type for the data you want to seacrh for, e.g.: a `Movie` struct
32//! 2. Implement the `Lookup` trait for your custom type.
33//! 3. Create an `Iterator` that will iterate over all the elements you would like to put into the `LookupTable`
34//! 4. Create a new `LookupTable` by calling `LookupTable::from_iter(myMoviesIterator)`
35//! 5. Call `myMoviesLookupTable.find("hero")` to get an lazy 'Iterator' over all matching elements
36//!
37//! Example
38//! -------
39//!
40//! Let's build a `LookupTable` to find restaurants by name.
41//!
42//! ```rust
43//! use std::iter::FromIterator;
44//! use ironstorm_lookup::{LookupTable, Lookup, Bucket};
45//!
46//! // 1. Create a custom struct representing a restaurant
47//! struct Restaurant<'a> {
48//! name: &'a str,
49//! cuisine: &'a str
50//! }
51//!
52//! // 2. Implement the `Lookup` trait for `Restaurant` references
53//! impl <'a> Lookup for &'a Restaurant<'a> {
54//!
55//! // Make the restaurant name searchable
56//! fn searchable_text(&self) -> String {
57//! self.name.to_string()
58//! }
59//!
60//! // Decide, based on cuisine, to which `Bucket` a restaurant belongs.
61//! // `Bucket` is just a type alias for an unsigned integer aka usize.
62//! // Matches in lower buckets will be returned before matches in higher buckets.
63//! fn bucket(&self) -> Bucket {
64//! match self.cuisine {
65//! "italian" => 0,
66//! "german" => 0,
67//! "chinese" => 1,
68//! _ => 5
69//! }
70//! }
71//! }
72//!
73//! // 3. Create some restaurants and the according iterator
74//! let restaurants = vec![
75//! Restaurant{name:"India Man", cuisine:"indian"},
76//! Restaurant{name:"Ami Guy", cuisine:"american"},
77//! Restaurant{name:"Italiano Pizza", cuisine:"italian"},
78//! Restaurant{name:"Sushi House", cuisine:"chinese"},
79//! Restaurant{name:"Brezel Hut", cuisine:"german"}
80//! ];
81//! let iter = restaurants.iter();
82//!
83//! // 4. Create the `LookupTable`
84//! let lookup_table = ironstorm_lookup::LookupTable::from_iter(iter);
85//!
86//! // 5. Find restaurants containing `i`
87//!
88//!
89//! let mut result_iter = lookup_table.find("i");
90//!
91//! // two times 'Italiano pizza', because it's in the lowest bucket
92//! // two times because it has two lower case `i` in the name
93//! assert_eq!(result_iter.next().unwrap().name, "Italiano Pizza");
94//! assert_eq!(result_iter.next().unwrap().name, "Italiano Pizza");
95//!
96//! // 'Sushi House', because it's in the second lowest bucket
97//! assert_eq!(result_iter.next().unwrap().name, "Sushi House");
98//!
99//! // 'Ami Guy' or ' India Man'
100//! // They are in the same bucket and there is no order within the same bucket
101//! let indian_or_american_1 = result_iter.next().unwrap().name;
102//! assert!(indian_or_american_1=="India Man" || indian_or_american_1=="Ami Guy");
103//!
104//! // The other one of 'Ami Guy' or ' India Man'
105//! let indian_or_american_2 = result_iter.next().unwrap().name;
106//! assert!(indian_or_american_2=="India Man" || indian_or_american_2=="Ami Guy");
107//! assert!(indian_or_american_1 != indian_or_american_2);
108//!
109//! // No more matches
110//! // "Brezel Hut" doesn't contain an "i" and was not part of the result.
111//! assert!(result_iter.next().is_none());
112//! ```
113
114extern crate suffix;
115extern crate itertools;
116
117use suffix::SuffixTable;
118use std::collections::{BTreeMap};
119use std::collections::Bound::{Included, Unbounded};
120use std::iter::FromIterator;
121
122
123/// Every value that is inserted into the lookup table must be assigned to a bucket.
124/// Values, assigned to a lower bucket, will be returned before values from a higher bucket.
125/// This bucket mechanism is used instead a full blown sorting algorithm to boost performance.
126pub type Bucket = usize;
127
128type TextPosition = usize;
129
130const SEPARATOR: &'static str = "\u{FFFF}";
131
132/// Implement this trait for types that are going be put into a `LookupTable`
133pub trait Lookup : Sync{
134
135 /// The text that will be looked at when a lookup is executed.
136 fn searchable_text(&self) -> String;
137
138 /// The bucket in which this item will be put.
139 /// Entries in lower buckets will be returned before entries in higher buckets.
140 /// Don't introduce too many buckets per `LookupTable`.
141 /// The worst case would be to have one Bucket per LookupTable entry.
142 /// `Bucket` is just a type alias for an unsigned integer aka usize.
143 fn bucket(&self) -> Bucket;
144}
145
146/// This is the actual `LookupTable` that creates the in memory data structure and uses it to perform the lookups.
147/// It implements the `FromIterator` trait and its `from_iter(..)` method.
148/// To create a new `LookupTable` instance, you first have to create an Iterator over some `Lookup` items.
149/// Having that iterator, you can call `LookupTable::from_iter(myLookupItemIterator)``.
150pub struct LookupTable<'a, V: 'a> where V: Lookup{
151 suffix_table_map: BTreeMap<Bucket, SuffixTable<'a,'a>>,
152 position_map: BTreeMap<(Bucket, TextPosition), V>
153}
154
155impl <'a, A: Lookup>FromIterator<A> for LookupTable<'a, A>{
156
157 /// Creates a `LookupTable` from the given Iterator
158 fn from_iter<T>(iterator: T) -> Self where T: IntoIterator<Item=A>{
159 let mut text_map: BTreeMap<Bucket, String> = BTreeMap::new();
160 let mut position_map: BTreeMap<(Bucket, TextPosition), A> = BTreeMap::new();
161
162 for value in iterator {
163 let mut text = text_map.entry(value.bucket()).or_insert_with(String::new);
164 let pos: TextPosition = text.len();
165
166 text.push_str(&value.searchable_text().as_str());
167 text.push_str(SEPARATOR);
168
169 position_map.insert((value.bucket(), pos), value);
170 }
171
172 let mut suffix_table_map: BTreeMap<Bucket, SuffixTable> = BTreeMap::new();
173 for (bucket, text) in text_map.into_iter(){
174 suffix_table_map.insert(bucket, SuffixTable::new(text));
175 }
176
177 LookupTable{suffix_table_map: suffix_table_map, position_map: position_map}
178 }
179}
180
181impl <'a, V>LookupTable<'a, V> where V: Lookup{
182
183 fn get_value_for_position(&self, bucket: Bucket, text_position: TextPosition) -> &V{
184 if let Some(value) = self.position_map.range(Unbounded, Included(&(bucket,(text_position as usize)))).rev().next() {
185 let (&(_, _), value) = value;
186 value
187 }else {
188 panic!("Could not find at least one value in position map.
189 This must be a bug! Please report it on https://github.com/forgemo/ironstorm_lookup/issues");
190 }
191 }
192
193 /// Searches for `Lookup` entries with a `serachable_text` that contains the given `search_text`.
194 /// If the `search_text` is found multiple times for the same entry, the entry will also be returned multiple times.
195 /// If no matches are found, the Iterator will immediately start returning `None`.
196 /// Entries in lower buckets will be returned before entries in higher buckets.
197 /// The method is case sensitive.
198 pub fn find(&'a self, search_text: &'a str) -> Box<Iterator<Item=&V> + Send + 'a> {
199 let result_iter = self.suffix_table_map.iter()
200 .flat_map(move |(bucket, suffix_table)|{
201 suffix_table.positions(&search_text).iter().map(move |text_position|(bucket, text_position))
202 })
203 .map(move |(bucket, position)|self.get_value_for_position(*bucket, *position as usize));
204 return Box::new(result_iter);
205 }
206
207 /// Returns the number of values for this `LookupTable`
208 pub fn len(&self) -> usize {
209 self.position_map.len()
210 }
211
212 /// Returns the number of buckets for this `LookupTable`
213 pub fn bucket_count(&self) -> usize {
214 self.suffix_table_map.len()
215 }
216
217}
218
219
220#[cfg(test)]
221mod tests {
222
223 use {Lookup, LookupTable};
224 use std::iter::FromIterator;
225
226 impl <'a> Lookup for &'a str {
227 fn searchable_text(&self) -> String {
228 self.to_string()
229 }
230 fn bucket(&self) -> usize {
231 self.len()
232 }
233 }
234
235 #[test]
236 fn it_works1() {
237 let strings = vec!["a","a","a","a","a","a"];
238 let t = LookupTable::from_iter(strings.into_iter());
239 let len = t.find("a").collect::<Vec<_>>().len();
240 assert_eq!(len, 6);
241 }
242
243 #[test]
244 fn it_works2() {
245 let strings = vec!["a","ab","abc","abcd","abcde","abcdef"];
246 let t = LookupTable::from_iter(strings.into_iter());
247 let mut i = t.find("a");
248 assert_eq!(&"a", i.next().unwrap());
249 assert_eq!(&"ab", i.next().unwrap());
250 assert_eq!(&"abc", i.next().unwrap());
251 assert_eq!(&"abcd", i.next().unwrap());
252 assert_eq!(&"abcde", i.next().unwrap());
253 assert_eq!(&"abcdef", i.next().unwrap());
254 }
255
256 #[test]
257 fn it_works3() {
258 let strings = vec!["ZZZ","ZZ","Z"];
259 let t = LookupTable::from_iter(strings.into_iter());
260 let mut i = t.find("Z");
261 assert_eq!(&"Z", i.next().unwrap());
262 assert_eq!(&"ZZ", i.next().unwrap());
263 assert_eq!(&"ZZ", i.next().unwrap());
264 assert_eq!(&"ZZZ", i.next().unwrap());
265 assert_eq!(&"ZZZ", i.next().unwrap());
266 assert_eq!(&"ZZZ", i.next().unwrap());
267 }
268
269 #[test]
270 fn it_works4() {
271 let strings = vec!["ZZZZZZZZ","ZZZZZZZZZ"];
272 let t = LookupTable::from_iter(strings.into_iter());
273 let i = t.find("Z");
274 assert_eq!(17, i.count());
275 }
276
277 #[test]
278 fn it_works5() {
279 let strings = vec!["ZZZ","ZZZ","A","ZZZ","B","ZZZ"];
280 let t = LookupTable::from_iter(strings.into_iter());
281 let i = t.find("A");
282 assert_eq!(1, i.count());
283 let i = t.find("B");
284 assert_eq!(1, i.count());
285 let i = t.find("Z");
286 assert_eq!(12, i.count());
287 let i = t.find("ZZZ");
288 assert_eq!(4, i.count());
289 }
290
291 #[test]
292 fn it_works6() {
293 let strings = vec!["A","B","C"];
294 let t = LookupTable::from_iter(strings.into_iter());
295 let i = t.find("D");
296 assert_eq!(0, i.count());
297 }
298}