joinkit/
lib.rs

1#![warn(missing_docs)]
2#![crate_name="joinkit"]
3
4//! Joinkit provides iterator adaptors for efficient SQL-like joins.
5//! 
6//! # Strategies
7//!
8//! There are two join strategies, which fit different scenarios:
9//! - **Hash Join** - a shorter data stream is loaded entirely into memory (`HashMap`), while the
10//! longer can be arbitrarily large and is matched against `HashMap` sequentially. The greatest
11//! advantage is that data do not need to be sorted and it has amortized O(n) complexity, therefore
12//! it is very efficient.  This is the right choice if data is not sorted and the smaller stream
13//! fits into memory. 
14//! - **Merge Join** - the data streams *must* be sorted, but can be *both* arbitrarily large. This
15//! is the right choice if the data is already sorted, as in this case it is slightly more
16//! efficient than Hash Join. 
17//!
18//! To use the iterator adaptors in this crate, import `Joinkit trait`:
19//!
20//! ```
21//! use joinkit::Joinkit;
22//! ```
23//!
24//! The crate contains also 2 binaries `hjoin` and `mjoin`, which can be used to perform `Hash
25//! Join` and `Merge Join` on command line. 
26
27#[macro_use]
28extern crate clap;
29extern crate itertools;
30
31use std::iter::{IntoIterator};
32use std::cmp::Ordering;
33use std::hash::Hash;
34
35pub use merge_join::{MergeJoinInner, MergeJoinLeftExcl, MergeJoinLeftOuter, MergeJoinFullOuter};
36pub use hash_join::{HashJoinInner, HashJoinLeftExcl, HashJoinLeftOuter, HashJoinRightExcl,
37HashJoinRightOuter, HashJoinFullOuter};
38
39pub mod util;
40mod merge_join;
41mod hash_join;
42
43/// A value yielded by `merge_join` and `hash_join` outer iterators.
44/// Contains one or two values, depending on which input iterator is exhausted.
45///
46#[derive(Clone, PartialEq, Eq, Debug)]
47pub enum EitherOrBoth<L, R> {
48    /// Neither input iterator is exhausted yet, yielding two values.
49    Both(L, R),
50    /// The parameter iterator is exhausted, only yielding a value from the `self` iterator.
51    Left(L),
52    /// The `self` iterator is exhausted, only yielding a value from the parameter iterator.
53    Right(R),
54}
55
56/// Trait `Joinkit` provides the extra iterator adaptors for efficient SQL-like joins.
57pub trait Joinkit : Iterator {
58    /// Return an iterator adaptor that [inner
59    /// joins](https://en.wikipedia.org/wiki/Join_%28SQL%29#Inner_join) the two input iterators in
60    /// ascending order. The resulting iterator is the intersection of the two input iterators.
61    ///
62    /// The both input iterators must be sorted and unique on the join key (e.g. by
63    /// [grouping](http://bluss.github.io/rust-itertools/doc/itertools/trait.Itertools.html#method.group_by)
64    /// them, if necessary) to produce the correct results.
65    ///
66    /// Iterator element type is `(L::Item, R::Item)`.
67    ///
68    /// ```
69    /// use joinkit::Joinkit;
70    ///
71    /// // tuples of (key, [value,...]), where the key is extracted from the value
72    /// // notice the values are grouped by the key
73    /// let l = vec![("0", vec!["0;A"]), ("1", vec!["1;B"])].into_iter();
74    /// let r = vec![("1", vec!["1;X", "1;Y"]), ("2", vec!["2;Z"])].into_iter();
75    /// let mut it = l.merge_join_inner_by(r, |x, y| Ord::cmp(&x.0, &y.0));
76    ///
77    /// assert_eq!(it.next(), Some((("1", vec!["1;B"]), ("1", vec!["1;X", "1;Y"]))));
78    /// assert_eq!(it.next(), None);
79    /// ```
80    fn merge_join_inner_by<R, F>(self, other: R, cmp: F) -> MergeJoinInner<Self, R::IntoIter, F> 
81        where Self: Sized,
82              R: IntoIterator,
83              F: FnMut(&Self::Item, &R::Item) -> Ordering
84    {
85        MergeJoinInner::new(self, other.into_iter(), cmp)
86    }
87
88    /// Return an iterator adaptor that *left exclusive joins* the two input iterators in
89    /// ascending order. The resulting iterator contains only those records from the left input
90    /// iterator, which do not match the right input iterator. There is no direct equivalent in
91    /// SQL.
92    /// 
93    /// The both input iterators must be sorted and unique on the join key (e.g. by
94    /// [grouping](http://bluss.github.io/rust-itertools/doc/itertools/trait.Itertools.html#method.group_by)
95    /// them, if necessary) to produce the correct results.
96    ///
97    /// Iterator element type is `L::Item`.
98    ///
99    /// ```
100    /// use joinkit::Joinkit;
101    ///
102    /// // tuples of (key, [value,...]), where the key is extracted from the value
103    /// // notice the values are grouped by the key
104    /// let l = vec![("0", vec!["0;A"]), ("1", vec!["1;B"])].into_iter();
105    /// let r = vec![("1", vec!["1;X", "1;Y"]), ("2", vec!["2;Z"])].into_iter();
106    /// let mut it = l.merge_join_left_excl_by(r, |x, y| Ord::cmp(&x.0, &y.0));
107    ///
108    /// assert_eq!(it.next(), Some(("0", vec!["0;A"])));
109    /// assert_eq!(it.next(), None);
110    /// ```
111    fn merge_join_left_excl_by<R, F>(self, other: R, cmp: F) 
112                                        -> MergeJoinLeftExcl<Self, R::IntoIter, F> 
113        where Self: Sized,
114              R: IntoIterator,
115              F: FnMut(&Self::Item, &R::Item) -> Ordering
116    {
117        MergeJoinLeftExcl::new(self, other.into_iter(), cmp)
118    }
119
120    /// Return an iterator adaptor that [left outer
121    /// joins](https://en.wikipedia.org/wiki/Join_%28SQL%29#Left_outer_join) the two input iterators
122    /// in ascending order. The resulting iterator contains all the records from the left input
123    /// iterator, even if they do not match the right input iterator.
124    ///
125    /// The both input iterators must be sorted and unique on the join key (e.g. by
126    /// [grouping](http://bluss.github.io/rust-itertools/doc/itertools/trait.Itertools.html#method.group_by)
127    /// them, if necessary) to produce the correct results.
128    ///
129    /// Iterator element type is [`EitherOrBoth<L::Item, R::Item>`](enum.EitherOrBoth.html).
130    ///
131    /// ```
132    /// use joinkit::Joinkit;
133    /// use joinkit::EitherOrBoth::{Left, Both, Right};
134    ///
135    /// // tuples of (key, [value,...]), where the key is extracted from the value
136    /// // notice the values are grouped by the key
137    /// let l = vec![("0", vec!["0;A"]), ("1", vec!["1;B"])].into_iter();
138    /// let r = vec![("1", vec!["1;X", "1;Y"]), ("2", vec!["2;Z"])].into_iter();
139    /// let mut it = l.merge_join_left_outer_by(r, |x, y| Ord::cmp(&x.0, &y.0));
140    ///
141    /// assert_eq!(it.next(), Some(Left(("0", vec!["0;A"]))));
142    /// assert_eq!(it.next(), Some(Both(("1", vec!["1;B"]), ("1", vec!["1;X", "1;Y"]))));
143    /// assert_eq!(it.next(), None);
144    /// ```
145    fn merge_join_left_outer_by<R, F>(self, other: R, cmp: F) 
146                                         -> MergeJoinLeftOuter<Self, R::IntoIter, F> 
147        where Self: Sized,
148              R: IntoIterator,
149              F: FnMut(&Self::Item, &R::Item) -> Ordering
150    {
151        MergeJoinLeftOuter::new(self, other.into_iter(), cmp)
152    }
153
154    /// Return an iterator adaptor that [full outer
155    /// joins](https://en.wikipedia.org/wiki/Join_%28SQL%29#Full_outer_join) the two input iterators
156    /// in ascending order. The resulting iterator contains all the records from the both input
157    /// iterators.
158    ///
159    /// The both input iterators must be sorted and unique on the join key (e.g. by
160    /// [grouping](http://bluss.github.io/rust-itertools/doc/itertools/trait.Itertools.html#method.group_by)
161    /// them, if necessary) to produce the correct results.
162    ///
163    /// Iterator element type is [`EitherOrBoth<L::Item, R::Item>`](enum.EitherOrBoth.html).
164    ///
165    /// ```
166    /// use joinkit::Joinkit;
167    /// use joinkit::EitherOrBoth::{Left, Both, Right};
168    ///
169    ///
170    /// // tuples of (key, [value,...]), where the key is extracted from the value
171    /// // notice the values are grouped by the key
172    /// let l = vec![("0",vec!["0;A"]), ("1", vec!["1;B"])].into_iter();
173    /// let r = vec![("1",vec!["1;X", "1;Y"]), ("2", vec!["2;Z"])].into_iter();
174    /// let mut it = l.merge_join_full_outer_by(r, |x, y| Ord::cmp(&x.0, &y.0));
175    ///
176    /// assert_eq!(it.next(), Some(Left(("0", vec!["0;A"]))));
177    /// assert_eq!(it.next(), Some(Both(("1", vec!["1;B"]), ("1", vec!["1;X", "1;Y"]))));
178    /// assert_eq!(it.next(), Some(Right(("2", vec!["2;Z"]))));
179    /// assert_eq!(it.next(), None);
180    /// ```
181    fn merge_join_full_outer_by<R, F>(self, other: R, cmp: F) 
182                                         -> MergeJoinFullOuter<Self, R::IntoIter, F> 
183        where Self: Sized,
184              R: IntoIterator,
185              F: FnMut(&Self::Item, &R::Item) -> Ordering
186    {
187        MergeJoinFullOuter::new(self, other.into_iter(), cmp)
188    }
189
190    /// Return an iterator adaptor that [inner
191    /// joins](https://en.wikipedia.org/wiki/Join_%28SQL%29#Inner_join) the two input iterators in
192    /// ascending order. The resulting iterator is the intersection of the two input iterators.
193    ///
194    /// The input iterators do *not* need to be sorted. The right input iterator is loaded into
195    /// `HashMap` and grouped by the key automatically. Neither the left input iterator need to be
196    /// unique on the key.
197    ///
198    /// The left input iterator element type must be `(K, LV)`, where `K: Hash + Eq`. 
199    /// The right input iterator element type must be `(K, RV)`, where `K: Hash + Eq` and `RV:
200    /// Clone`.
201    ///
202    /// When the join adaptor is created, the right iterator is **consumed** into `HashMap`.
203    ///
204    /// Iterator element type is `(LV, vec![RV,...])`. 
205    /// The `RV` is cloned from `HashMap` for each joined value. A single `RV` can be expected to
206    /// be joined (and cloned) multiple times to `LV`. To increase performance, consider wrapping
207    /// `RV` into `std::rc::Rc` pointer to avoid unnecessary allocations.
208    ///
209    /// ```
210    /// use joinkit::Joinkit;
211    ///
212    /// // tuples of (key, value), where the key is extracted from the value
213    /// let l = vec![("0", "0;A"), ("1", "1;B")].into_iter();
214    /// let r = vec![("1", "1;X"), ("2", "2;Z"), ("1", "1;Y")].into_iter();
215    /// let mut it = l.hash_join_inner(r);
216    ///
217    /// // notice the grouped right values
218    /// assert_eq!(it.next(), Some(("1;B", vec!["1;X", "1;Y"])));
219    /// assert_eq!(it.next(), None);
220    /// ```
221    fn hash_join_inner<K, RI, RV>(self, other: RI) -> HashJoinInner<Self, K, RV> 
222        where Self: Sized,
223              K: Hash + Eq,
224              RV: Clone,
225              RI: IntoIterator<Item=(K, RV)>
226    {
227        HashJoinInner::new(self, other)
228    }
229
230    /// Return an iterator adaptor that *left exclusive joins* the two input iterators. The
231    /// resulting iterator contains only those records from the left input iterator, which do not
232    /// match the right input iterator. There is no direct equivalent in SQL.
233    ///
234    /// The input iterators do *not* need to be sorted. The right input iterator is loaded into
235    /// `HashMap` and grouped by the key automatically. Neither the left input iterator need to be
236    /// unique on the key.
237    ///
238    /// The left input iterator element type must be `(K, LV)`, where `K: Hash + Eq`. 
239    /// The right input iterator element type must be `(K, RV)`, where `K: Hash + Eq`.
240    ///
241    /// When the join adaptor is created, the right iterator is **consumed** into `HashMap`.
242    ///
243    /// Iterator element type is `LV`.
244    ///
245    /// ```
246    /// use joinkit::Joinkit;
247    ///
248    /// // tuples of (key, value), where the key is extracted from the value
249    /// let l = vec![("0", "0;A"), ("1", "1;B")].into_iter();
250    /// let r = vec![("1", "1;X"), ("2", "2;Z"), ("1", "1;Y")].into_iter();
251    /// let mut it = l.hash_join_left_excl(r);
252    ///
253    /// assert_eq!(it.next(), Some("0;A"));
254    /// assert_eq!(it.next(), None);
255    /// ```
256    fn hash_join_left_excl<K, RI, RV>(self, other: RI) -> HashJoinLeftExcl<Self, K> 
257        where Self: Sized,
258              K: Hash + Eq,
259              RI: IntoIterator<Item=(K, RV)>
260    {
261        HashJoinLeftExcl::new(self, other)
262    }
263
264    /// Return an iterator adaptor that [left outer
265    /// joins](https://en.wikipedia.org/wiki/Join_%28SQL%29#Left_outer_join) the two input
266    /// iterators.  The resulting iterator contains all the records from the left input iterator,
267    /// even if they do not match the right input iterator.
268    ///
269    /// The input iterators do *not* need to be sorted. The right input iterator is loaded into
270    /// `HashMap` and grouped by the key automatically. Neither the left input iterator need to be
271    /// unique on the key.
272    ///
273    /// The left input iterator element type must be `(K, LV)`, where `K: Hash + Eq`. 
274    /// The right input iterator element type must be `(K, RV)`, where `K: Hash + Eq` and `RV:
275    /// Clone`.
276    ///
277    /// When the join adaptor is created, the right iterator is **consumed** into `HashMap`.
278    ///
279    /// Iterator element type is [`EitherOrBoth<LV, RV>`](enum.EitherOrBoth.html).
280    /// The `RV` is cloned from `HashMap` for each joined value. It is expected a single `RV` will
281    /// be joined (and cloned) multiple times to `LV`. To increase performance, consider wrapping
282    /// `RV` into `std::rc::Rc` pointer to avoid unnecessary allocations.
283    ///
284    /// ```
285    /// use joinkit::Joinkit;
286    /// use joinkit::EitherOrBoth::{Left, Both, Right};
287    ///
288    /// // tuples of (key, value), where the key is extracted from the value
289    /// let l = vec![("0", "0;A"), ("1", "1;B")].into_iter();
290    /// let r = vec![("1", "1;X"), ("2", "2;Z"), ("1", "1;Y")].into_iter();
291    /// let mut it = l.hash_join_left_outer(r);
292    ///
293    /// // notice the grouped right values
294    /// assert_eq!(it.next(), Some(Left("0;A")));
295    /// assert_eq!(it.next(), Some(Both("1;B", vec!["1;X", "1;Y"])));
296    /// assert_eq!(it.next(), None);
297    /// ```
298    fn hash_join_left_outer<K, RI, RV>(self, other: RI) -> HashJoinLeftOuter<Self, K, RV> 
299        where Self: Sized,
300              K: Hash + Eq,
301              RV: Clone,
302              RI: IntoIterator<Item=(K, RV)>
303    {
304        HashJoinLeftOuter::new(self, other)
305    }
306
307    /// Return an iterator adaptor that *right exclusive joins* the two input iterators. The resulting
308    /// iterator contains only those records from the right input iterator, which do not match the
309    /// left input iterator. There is no direct equivalent in SQL.
310    ///
311    /// The input iterators do *not* need to be sorted. The right input iterator is loaded into
312    /// `HashMap` and grouped by the key automatically. Neither the left input iterator need to be
313    /// unique on the key.
314    ///
315    /// The left input iterator element type must be `(K, LV)`, where `K: Hash + Eq`. 
316    /// The right input iterator element type must be `(K, RV)`, where `K: Hash + Eq`.
317    ///
318    /// When the join adaptor is created, the right iterator is **consumed** into `HashMap`.
319    ///
320    /// Iterator element type is `vec![RV,...]`.
321    ///
322    /// ```
323    /// use joinkit::Joinkit;
324    ///
325    /// // tuples of (key, value), where the key is extracted from the value
326    /// let l = vec![("0", "0;A"), ("1", "1;B")].into_iter();
327    /// let r = vec![("1", "1;X"), ("2", "2;Z"), ("1", "1;Y")].into_iter();
328    /// let mut it = l.hash_join_right_excl(r);
329    ///
330    /// assert_eq!(it.next(), Some(vec!["2;Z"]));
331    /// assert_eq!(it.next(), None);
332    /// ```
333    fn hash_join_right_excl<K, RI, RV>(self, other: RI) -> HashJoinRightExcl<Self, K, RV> 
334        where Self: Sized,
335              K: Hash + Eq,
336              RI: IntoIterator<Item=(K, RV)>
337    {
338        HashJoinRightExcl::new(self, other)
339    }
340
341    /// Return an iterator adaptor that [right outer
342    /// joins](https://en.wikipedia.org/wiki/Join_%28SQL%29#Right_outer_join) the two input
343    /// iterators.  The resulting iterator contains all the records from the right input iterator,
344    /// even if they do not match the left input iterator.
345    ///
346    /// The input iterators do *not* need to be sorted. The right input iterator is loaded into
347    /// `HashMap` and grouped by the key automatically. Neither the left input iterator need to be
348    /// unique on the key.
349    ///
350    /// The left input iterator element type must be `(K, LV)`, where `K: Hash + Eq`. 
351    /// The right input iterator element type must be `(K, RV)`, where `K: Hash + Eq` and `RV:
352    /// Clone`.
353    ///
354    /// When the join adaptor is created, the right iterator is **consumed** into `HashMap`.
355    ///
356    /// Iterator element type is [`EitherOrBoth<LV, RV>`](enum.EitherOrBoth.html).
357    /// The `RV` is cloned from `HashMap` for each joined value. It is expected a single `RV` will
358    /// be joined (and cloned) multiple times to `LV`. To increase performance, consider wrapping
359    /// `RV` into `std::rc::Rc` pointer to avoid unnecessary allocations.
360    ///
361    /// ```
362    /// use joinkit::Joinkit;
363    /// use joinkit::EitherOrBoth::{Left, Both, Right};
364    ///
365    /// // tuples of (key, value), where the key is extracted from the value
366    /// let l = vec![("0", "0;A"), ("1", "1;B")].into_iter();
367    /// let r = vec![("1", "1;X"), ("2", "2;Z"), ("1", "1;Y")].into_iter();
368    /// let mut it = l.hash_join_right_outer(r);
369    ///
370    /// // notice the grouped right values
371    /// assert_eq!(it.next(), Some(Both("1;B", vec!["1;X", "1;Y"])));
372    /// assert_eq!(it.next(), Some(Right(vec!["2;Z"])));
373    /// assert_eq!(it.next(), None);
374    /// ```
375    fn hash_join_right_outer<K, RI, RV>(self, other: RI) -> HashJoinRightOuter<Self, K, RV> 
376        where Self: Sized,
377              K: Hash + Eq,
378              RV: Clone,
379              RI: IntoIterator<Item=(K, RV)>
380    {
381        HashJoinRightOuter::new(self, other)
382    }
383
384    /// Return an iterator adaptor that [full outer
385    /// joins](https://en.wikipedia.org/wiki/Join_%28SQL%29#Full_outer_join) the two input
386    /// iterators.  The resulting iterator contains all the records from the both input iterators.
387    ///
388    /// The input iterators do *not* need to be sorted. The right input iterator is loaded into
389    /// `HashMap` and grouped by the key automatically. Neither the left input iterator need to be
390    /// unique on the key.
391    ///
392    /// The left input iterator element type must be `(K, LV)`, where `K: Hash + Eq`. 
393    /// The right input iterator element type must be `(K, RV)`, where `K: Hash + Eq` and `RV:
394    /// Clone`.
395    ///
396    /// When the join adaptor is created, the right iterator is **consumed** into `HashMap`.
397    ///
398    /// Iterator element type is [`EitherOrBoth<LV, RV>`](enum.EitherOrBoth.html).
399    /// The `RV` is cloned from `HashMap` for each joined value. It is expected a single `RV` will
400    /// be joined (and cloned) multiple times to `LV`. To increase performance, consider wrapping
401    /// `RV` into `std::rc::Rc` pointer to avoid unnecessary allocations.
402    ///
403    /// ```
404    /// use joinkit::Joinkit;
405    /// use joinkit::EitherOrBoth::{Left, Both, Right};
406    ///
407    /// // tuples of (key, value), where the key is extracted from the value
408    /// let l = vec![("0", "0;A"), ("1", "1;B")].into_iter();
409    /// let r = vec![("1", "1;X"), ("2", "2;Z"), ("1", "1;Y")].into_iter();
410    /// let mut it = l.hash_join_full_outer(r);
411    ///
412    /// // notice the grouped right values
413    /// assert_eq!(it.next(), Some(Left("0;A")));
414    /// assert_eq!(it.next(), Some(Both("1;B", vec!["1;X", "1;Y"])));
415    /// assert_eq!(it.next(), Some(Right(vec!["2;Z"])));
416    /// assert_eq!(it.next(), None);
417    /// ```
418    fn hash_join_full_outer<K, RI, RV>(self, other: RI) -> HashJoinFullOuter<Self, K, RV> 
419        where Self: Sized,
420              K: Hash + Eq,
421              RV: Clone,
422              RI: IntoIterator<Item=(K, RV)>
423    {
424        HashJoinFullOuter::new(self, other)
425    }
426}
427
428impl<T: ?Sized> Joinkit for T where T: Iterator { }