joinkit/lib.rs
1#![warn(missing_docs)]
2#![crate_name="joinkit"]
3
4//! Joinkit provides iterator adaptors for efficient SQL-like joins.
5//!
6//! # Strategies
7//!
8//! There are two join strategies, which fit different scenarios:
9//! - **Hash Join** - a shorter data stream is loaded entirely into memory (`HashMap`), while the
10//! longer can be arbitrarily large and is matched against `HashMap` sequentially. The greatest
11//! advantage is that data do not need to be sorted and it has amortized O(n) complexity, therefore
12//! it is very efficient. This is the right choice if data is not sorted and the smaller stream
13//! fits into memory.
14//! - **Merge Join** - the data streams *must* be sorted, but can be *both* arbitrarily large. This
15//! is the right choice if the data is already sorted, as in this case it is slightly more
16//! efficient than Hash Join.
17//!
18//! To use the iterator adaptors in this crate, import `Joinkit trait`:
19//!
20//! ```
21//! use joinkit::Joinkit;
22//! ```
23//!
24//! The crate contains also 2 binaries `hjoin` and `mjoin`, which can be used to perform `Hash
25//! Join` and `Merge Join` on command line.
26
27#[macro_use]
28extern crate clap;
29extern crate itertools;
30
31use std::iter::{IntoIterator};
32use std::cmp::Ordering;
33use std::hash::Hash;
34
35pub use merge_join::{MergeJoinInner, MergeJoinLeftExcl, MergeJoinLeftOuter, MergeJoinFullOuter};
36pub use hash_join::{HashJoinInner, HashJoinLeftExcl, HashJoinLeftOuter, HashJoinRightExcl,
37HashJoinRightOuter, HashJoinFullOuter};
38
39pub mod util;
40mod merge_join;
41mod hash_join;
42
43/// A value yielded by `merge_join` and `hash_join` outer iterators.
44/// Contains one or two values, depending on which input iterator is exhausted.
45///
46#[derive(Clone, PartialEq, Eq, Debug)]
47pub enum EitherOrBoth<L, R> {
48 /// Neither input iterator is exhausted yet, yielding two values.
49 Both(L, R),
50 /// The parameter iterator is exhausted, only yielding a value from the `self` iterator.
51 Left(L),
52 /// The `self` iterator is exhausted, only yielding a value from the parameter iterator.
53 Right(R),
54}
55
56/// Trait `Joinkit` provides the extra iterator adaptors for efficient SQL-like joins.
57pub trait Joinkit : Iterator {
58 /// Return an iterator adaptor that [inner
59 /// joins](https://en.wikipedia.org/wiki/Join_%28SQL%29#Inner_join) the two input iterators in
60 /// ascending order. The resulting iterator is the intersection of the two input iterators.
61 ///
62 /// The both input iterators must be sorted and unique on the join key (e.g. by
63 /// [grouping](http://bluss.github.io/rust-itertools/doc/itertools/trait.Itertools.html#method.group_by)
64 /// them, if necessary) to produce the correct results.
65 ///
66 /// Iterator element type is `(L::Item, R::Item)`.
67 ///
68 /// ```
69 /// use joinkit::Joinkit;
70 ///
71 /// // tuples of (key, [value,...]), where the key is extracted from the value
72 /// // notice the values are grouped by the key
73 /// let l = vec![("0", vec!["0;A"]), ("1", vec!["1;B"])].into_iter();
74 /// let r = vec![("1", vec!["1;X", "1;Y"]), ("2", vec!["2;Z"])].into_iter();
75 /// let mut it = l.merge_join_inner_by(r, |x, y| Ord::cmp(&x.0, &y.0));
76 ///
77 /// assert_eq!(it.next(), Some((("1", vec!["1;B"]), ("1", vec!["1;X", "1;Y"]))));
78 /// assert_eq!(it.next(), None);
79 /// ```
80 fn merge_join_inner_by<R, F>(self, other: R, cmp: F) -> MergeJoinInner<Self, R::IntoIter, F>
81 where Self: Sized,
82 R: IntoIterator,
83 F: FnMut(&Self::Item, &R::Item) -> Ordering
84 {
85 MergeJoinInner::new(self, other.into_iter(), cmp)
86 }
87
88 /// Return an iterator adaptor that *left exclusive joins* the two input iterators in
89 /// ascending order. The resulting iterator contains only those records from the left input
90 /// iterator, which do not match the right input iterator. There is no direct equivalent in
91 /// SQL.
92 ///
93 /// The both input iterators must be sorted and unique on the join key (e.g. by
94 /// [grouping](http://bluss.github.io/rust-itertools/doc/itertools/trait.Itertools.html#method.group_by)
95 /// them, if necessary) to produce the correct results.
96 ///
97 /// Iterator element type is `L::Item`.
98 ///
99 /// ```
100 /// use joinkit::Joinkit;
101 ///
102 /// // tuples of (key, [value,...]), where the key is extracted from the value
103 /// // notice the values are grouped by the key
104 /// let l = vec![("0", vec!["0;A"]), ("1", vec!["1;B"])].into_iter();
105 /// let r = vec![("1", vec!["1;X", "1;Y"]), ("2", vec!["2;Z"])].into_iter();
106 /// let mut it = l.merge_join_left_excl_by(r, |x, y| Ord::cmp(&x.0, &y.0));
107 ///
108 /// assert_eq!(it.next(), Some(("0", vec!["0;A"])));
109 /// assert_eq!(it.next(), None);
110 /// ```
111 fn merge_join_left_excl_by<R, F>(self, other: R, cmp: F)
112 -> MergeJoinLeftExcl<Self, R::IntoIter, F>
113 where Self: Sized,
114 R: IntoIterator,
115 F: FnMut(&Self::Item, &R::Item) -> Ordering
116 {
117 MergeJoinLeftExcl::new(self, other.into_iter(), cmp)
118 }
119
120 /// Return an iterator adaptor that [left outer
121 /// joins](https://en.wikipedia.org/wiki/Join_%28SQL%29#Left_outer_join) the two input iterators
122 /// in ascending order. The resulting iterator contains all the records from the left input
123 /// iterator, even if they do not match the right input iterator.
124 ///
125 /// The both input iterators must be sorted and unique on the join key (e.g. by
126 /// [grouping](http://bluss.github.io/rust-itertools/doc/itertools/trait.Itertools.html#method.group_by)
127 /// them, if necessary) to produce the correct results.
128 ///
129 /// Iterator element type is [`EitherOrBoth<L::Item, R::Item>`](enum.EitherOrBoth.html).
130 ///
131 /// ```
132 /// use joinkit::Joinkit;
133 /// use joinkit::EitherOrBoth::{Left, Both, Right};
134 ///
135 /// // tuples of (key, [value,...]), where the key is extracted from the value
136 /// // notice the values are grouped by the key
137 /// let l = vec![("0", vec!["0;A"]), ("1", vec!["1;B"])].into_iter();
138 /// let r = vec![("1", vec!["1;X", "1;Y"]), ("2", vec!["2;Z"])].into_iter();
139 /// let mut it = l.merge_join_left_outer_by(r, |x, y| Ord::cmp(&x.0, &y.0));
140 ///
141 /// assert_eq!(it.next(), Some(Left(("0", vec!["0;A"]))));
142 /// assert_eq!(it.next(), Some(Both(("1", vec!["1;B"]), ("1", vec!["1;X", "1;Y"]))));
143 /// assert_eq!(it.next(), None);
144 /// ```
145 fn merge_join_left_outer_by<R, F>(self, other: R, cmp: F)
146 -> MergeJoinLeftOuter<Self, R::IntoIter, F>
147 where Self: Sized,
148 R: IntoIterator,
149 F: FnMut(&Self::Item, &R::Item) -> Ordering
150 {
151 MergeJoinLeftOuter::new(self, other.into_iter(), cmp)
152 }
153
154 /// Return an iterator adaptor that [full outer
155 /// joins](https://en.wikipedia.org/wiki/Join_%28SQL%29#Full_outer_join) the two input iterators
156 /// in ascending order. The resulting iterator contains all the records from the both input
157 /// iterators.
158 ///
159 /// The both input iterators must be sorted and unique on the join key (e.g. by
160 /// [grouping](http://bluss.github.io/rust-itertools/doc/itertools/trait.Itertools.html#method.group_by)
161 /// them, if necessary) to produce the correct results.
162 ///
163 /// Iterator element type is [`EitherOrBoth<L::Item, R::Item>`](enum.EitherOrBoth.html).
164 ///
165 /// ```
166 /// use joinkit::Joinkit;
167 /// use joinkit::EitherOrBoth::{Left, Both, Right};
168 ///
169 ///
170 /// // tuples of (key, [value,...]), where the key is extracted from the value
171 /// // notice the values are grouped by the key
172 /// let l = vec![("0",vec!["0;A"]), ("1", vec!["1;B"])].into_iter();
173 /// let r = vec![("1",vec!["1;X", "1;Y"]), ("2", vec!["2;Z"])].into_iter();
174 /// let mut it = l.merge_join_full_outer_by(r, |x, y| Ord::cmp(&x.0, &y.0));
175 ///
176 /// assert_eq!(it.next(), Some(Left(("0", vec!["0;A"]))));
177 /// assert_eq!(it.next(), Some(Both(("1", vec!["1;B"]), ("1", vec!["1;X", "1;Y"]))));
178 /// assert_eq!(it.next(), Some(Right(("2", vec!["2;Z"]))));
179 /// assert_eq!(it.next(), None);
180 /// ```
181 fn merge_join_full_outer_by<R, F>(self, other: R, cmp: F)
182 -> MergeJoinFullOuter<Self, R::IntoIter, F>
183 where Self: Sized,
184 R: IntoIterator,
185 F: FnMut(&Self::Item, &R::Item) -> Ordering
186 {
187 MergeJoinFullOuter::new(self, other.into_iter(), cmp)
188 }
189
190 /// Return an iterator adaptor that [inner
191 /// joins](https://en.wikipedia.org/wiki/Join_%28SQL%29#Inner_join) the two input iterators in
192 /// ascending order. The resulting iterator is the intersection of the two input iterators.
193 ///
194 /// The input iterators do *not* need to be sorted. The right input iterator is loaded into
195 /// `HashMap` and grouped by the key automatically. Neither the left input iterator need to be
196 /// unique on the key.
197 ///
198 /// The left input iterator element type must be `(K, LV)`, where `K: Hash + Eq`.
199 /// The right input iterator element type must be `(K, RV)`, where `K: Hash + Eq` and `RV:
200 /// Clone`.
201 ///
202 /// When the join adaptor is created, the right iterator is **consumed** into `HashMap`.
203 ///
204 /// Iterator element type is `(LV, vec![RV,...])`.
205 /// The `RV` is cloned from `HashMap` for each joined value. A single `RV` can be expected to
206 /// be joined (and cloned) multiple times to `LV`. To increase performance, consider wrapping
207 /// `RV` into `std::rc::Rc` pointer to avoid unnecessary allocations.
208 ///
209 /// ```
210 /// use joinkit::Joinkit;
211 ///
212 /// // tuples of (key, value), where the key is extracted from the value
213 /// let l = vec![("0", "0;A"), ("1", "1;B")].into_iter();
214 /// let r = vec![("1", "1;X"), ("2", "2;Z"), ("1", "1;Y")].into_iter();
215 /// let mut it = l.hash_join_inner(r);
216 ///
217 /// // notice the grouped right values
218 /// assert_eq!(it.next(), Some(("1;B", vec!["1;X", "1;Y"])));
219 /// assert_eq!(it.next(), None);
220 /// ```
221 fn hash_join_inner<K, RI, RV>(self, other: RI) -> HashJoinInner<Self, K, RV>
222 where Self: Sized,
223 K: Hash + Eq,
224 RV: Clone,
225 RI: IntoIterator<Item=(K, RV)>
226 {
227 HashJoinInner::new(self, other)
228 }
229
230 /// Return an iterator adaptor that *left exclusive joins* the two input iterators. The
231 /// resulting iterator contains only those records from the left input iterator, which do not
232 /// match the right input iterator. There is no direct equivalent in SQL.
233 ///
234 /// The input iterators do *not* need to be sorted. The right input iterator is loaded into
235 /// `HashMap` and grouped by the key automatically. Neither the left input iterator need to be
236 /// unique on the key.
237 ///
238 /// The left input iterator element type must be `(K, LV)`, where `K: Hash + Eq`.
239 /// The right input iterator element type must be `(K, RV)`, where `K: Hash + Eq`.
240 ///
241 /// When the join adaptor is created, the right iterator is **consumed** into `HashMap`.
242 ///
243 /// Iterator element type is `LV`.
244 ///
245 /// ```
246 /// use joinkit::Joinkit;
247 ///
248 /// // tuples of (key, value), where the key is extracted from the value
249 /// let l = vec![("0", "0;A"), ("1", "1;B")].into_iter();
250 /// let r = vec![("1", "1;X"), ("2", "2;Z"), ("1", "1;Y")].into_iter();
251 /// let mut it = l.hash_join_left_excl(r);
252 ///
253 /// assert_eq!(it.next(), Some("0;A"));
254 /// assert_eq!(it.next(), None);
255 /// ```
256 fn hash_join_left_excl<K, RI, RV>(self, other: RI) -> HashJoinLeftExcl<Self, K>
257 where Self: Sized,
258 K: Hash + Eq,
259 RI: IntoIterator<Item=(K, RV)>
260 {
261 HashJoinLeftExcl::new(self, other)
262 }
263
264 /// Return an iterator adaptor that [left outer
265 /// joins](https://en.wikipedia.org/wiki/Join_%28SQL%29#Left_outer_join) the two input
266 /// iterators. The resulting iterator contains all the records from the left input iterator,
267 /// even if they do not match the right input iterator.
268 ///
269 /// The input iterators do *not* need to be sorted. The right input iterator is loaded into
270 /// `HashMap` and grouped by the key automatically. Neither the left input iterator need to be
271 /// unique on the key.
272 ///
273 /// The left input iterator element type must be `(K, LV)`, where `K: Hash + Eq`.
274 /// The right input iterator element type must be `(K, RV)`, where `K: Hash + Eq` and `RV:
275 /// Clone`.
276 ///
277 /// When the join adaptor is created, the right iterator is **consumed** into `HashMap`.
278 ///
279 /// Iterator element type is [`EitherOrBoth<LV, RV>`](enum.EitherOrBoth.html).
280 /// The `RV` is cloned from `HashMap` for each joined value. It is expected a single `RV` will
281 /// be joined (and cloned) multiple times to `LV`. To increase performance, consider wrapping
282 /// `RV` into `std::rc::Rc` pointer to avoid unnecessary allocations.
283 ///
284 /// ```
285 /// use joinkit::Joinkit;
286 /// use joinkit::EitherOrBoth::{Left, Both, Right};
287 ///
288 /// // tuples of (key, value), where the key is extracted from the value
289 /// let l = vec![("0", "0;A"), ("1", "1;B")].into_iter();
290 /// let r = vec![("1", "1;X"), ("2", "2;Z"), ("1", "1;Y")].into_iter();
291 /// let mut it = l.hash_join_left_outer(r);
292 ///
293 /// // notice the grouped right values
294 /// assert_eq!(it.next(), Some(Left("0;A")));
295 /// assert_eq!(it.next(), Some(Both("1;B", vec!["1;X", "1;Y"])));
296 /// assert_eq!(it.next(), None);
297 /// ```
298 fn hash_join_left_outer<K, RI, RV>(self, other: RI) -> HashJoinLeftOuter<Self, K, RV>
299 where Self: Sized,
300 K: Hash + Eq,
301 RV: Clone,
302 RI: IntoIterator<Item=(K, RV)>
303 {
304 HashJoinLeftOuter::new(self, other)
305 }
306
307 /// Return an iterator adaptor that *right exclusive joins* the two input iterators. The resulting
308 /// iterator contains only those records from the right input iterator, which do not match the
309 /// left input iterator. There is no direct equivalent in SQL.
310 ///
311 /// The input iterators do *not* need to be sorted. The right input iterator is loaded into
312 /// `HashMap` and grouped by the key automatically. Neither the left input iterator need to be
313 /// unique on the key.
314 ///
315 /// The left input iterator element type must be `(K, LV)`, where `K: Hash + Eq`.
316 /// The right input iterator element type must be `(K, RV)`, where `K: Hash + Eq`.
317 ///
318 /// When the join adaptor is created, the right iterator is **consumed** into `HashMap`.
319 ///
320 /// Iterator element type is `vec![RV,...]`.
321 ///
322 /// ```
323 /// use joinkit::Joinkit;
324 ///
325 /// // tuples of (key, value), where the key is extracted from the value
326 /// let l = vec![("0", "0;A"), ("1", "1;B")].into_iter();
327 /// let r = vec![("1", "1;X"), ("2", "2;Z"), ("1", "1;Y")].into_iter();
328 /// let mut it = l.hash_join_right_excl(r);
329 ///
330 /// assert_eq!(it.next(), Some(vec!["2;Z"]));
331 /// assert_eq!(it.next(), None);
332 /// ```
333 fn hash_join_right_excl<K, RI, RV>(self, other: RI) -> HashJoinRightExcl<Self, K, RV>
334 where Self: Sized,
335 K: Hash + Eq,
336 RI: IntoIterator<Item=(K, RV)>
337 {
338 HashJoinRightExcl::new(self, other)
339 }
340
341 /// Return an iterator adaptor that [right outer
342 /// joins](https://en.wikipedia.org/wiki/Join_%28SQL%29#Right_outer_join) the two input
343 /// iterators. The resulting iterator contains all the records from the right input iterator,
344 /// even if they do not match the left input iterator.
345 ///
346 /// The input iterators do *not* need to be sorted. The right input iterator is loaded into
347 /// `HashMap` and grouped by the key automatically. Neither the left input iterator need to be
348 /// unique on the key.
349 ///
350 /// The left input iterator element type must be `(K, LV)`, where `K: Hash + Eq`.
351 /// The right input iterator element type must be `(K, RV)`, where `K: Hash + Eq` and `RV:
352 /// Clone`.
353 ///
354 /// When the join adaptor is created, the right iterator is **consumed** into `HashMap`.
355 ///
356 /// Iterator element type is [`EitherOrBoth<LV, RV>`](enum.EitherOrBoth.html).
357 /// The `RV` is cloned from `HashMap` for each joined value. It is expected a single `RV` will
358 /// be joined (and cloned) multiple times to `LV`. To increase performance, consider wrapping
359 /// `RV` into `std::rc::Rc` pointer to avoid unnecessary allocations.
360 ///
361 /// ```
362 /// use joinkit::Joinkit;
363 /// use joinkit::EitherOrBoth::{Left, Both, Right};
364 ///
365 /// // tuples of (key, value), where the key is extracted from the value
366 /// let l = vec![("0", "0;A"), ("1", "1;B")].into_iter();
367 /// let r = vec![("1", "1;X"), ("2", "2;Z"), ("1", "1;Y")].into_iter();
368 /// let mut it = l.hash_join_right_outer(r);
369 ///
370 /// // notice the grouped right values
371 /// assert_eq!(it.next(), Some(Both("1;B", vec!["1;X", "1;Y"])));
372 /// assert_eq!(it.next(), Some(Right(vec!["2;Z"])));
373 /// assert_eq!(it.next(), None);
374 /// ```
375 fn hash_join_right_outer<K, RI, RV>(self, other: RI) -> HashJoinRightOuter<Self, K, RV>
376 where Self: Sized,
377 K: Hash + Eq,
378 RV: Clone,
379 RI: IntoIterator<Item=(K, RV)>
380 {
381 HashJoinRightOuter::new(self, other)
382 }
383
384 /// Return an iterator adaptor that [full outer
385 /// joins](https://en.wikipedia.org/wiki/Join_%28SQL%29#Full_outer_join) the two input
386 /// iterators. The resulting iterator contains all the records from the both input iterators.
387 ///
388 /// The input iterators do *not* need to be sorted. The right input iterator is loaded into
389 /// `HashMap` and grouped by the key automatically. Neither the left input iterator need to be
390 /// unique on the key.
391 ///
392 /// The left input iterator element type must be `(K, LV)`, where `K: Hash + Eq`.
393 /// The right input iterator element type must be `(K, RV)`, where `K: Hash + Eq` and `RV:
394 /// Clone`.
395 ///
396 /// When the join adaptor is created, the right iterator is **consumed** into `HashMap`.
397 ///
398 /// Iterator element type is [`EitherOrBoth<LV, RV>`](enum.EitherOrBoth.html).
399 /// The `RV` is cloned from `HashMap` for each joined value. It is expected a single `RV` will
400 /// be joined (and cloned) multiple times to `LV`. To increase performance, consider wrapping
401 /// `RV` into `std::rc::Rc` pointer to avoid unnecessary allocations.
402 ///
403 /// ```
404 /// use joinkit::Joinkit;
405 /// use joinkit::EitherOrBoth::{Left, Both, Right};
406 ///
407 /// // tuples of (key, value), where the key is extracted from the value
408 /// let l = vec![("0", "0;A"), ("1", "1;B")].into_iter();
409 /// let r = vec![("1", "1;X"), ("2", "2;Z"), ("1", "1;Y")].into_iter();
410 /// let mut it = l.hash_join_full_outer(r);
411 ///
412 /// // notice the grouped right values
413 /// assert_eq!(it.next(), Some(Left("0;A")));
414 /// assert_eq!(it.next(), Some(Both("1;B", vec!["1;X", "1;Y"])));
415 /// assert_eq!(it.next(), Some(Right(vec!["2;Z"])));
416 /// assert_eq!(it.next(), None);
417 /// ```
418 fn hash_join_full_outer<K, RI, RV>(self, other: RI) -> HashJoinFullOuter<Self, K, RV>
419 where Self: Sized,
420 K: Hash + Eq,
421 RV: Clone,
422 RI: IntoIterator<Item=(K, RV)>
423 {
424 HashJoinFullOuter::new(self, other)
425 }
426}
427
428impl<T: ?Sized> Joinkit for T where T: Iterator { }