// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

//! An extension trait to provide `Iterator` adapters that process items in parallel.
//!
//! The heart of this crate (and what should be considered its entry point) is the [`Polyester`]
//! trait, which is implemented for any `Iterator` that is `Send + 'static` and whose items are
//! `Send + 'static` as well.
//!
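//! For example, with the trait in scope, `par_map` can stand in for `map` on any qualifying
//! iterator (note that results arrive in the order the worker threads finish them, not in the
//! source iterator's order):
//!
//! ```
//! use polyester::Polyester;
//!
//! let squares = (0..100).par_map(|x| x * x).collect::<Vec<_>>();
//! ```
//!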
//! [`Polyester`]: trait.Polyester.html

#![doc(test(attr(allow(unused_variables))))]

extern crate num_cpus;
extern crate coco;
extern crate synchronoise;

use std::sync::{Arc, mpsc};
use std::sync::atomic::{AtomicBool, AtomicUsize, ATOMIC_USIZE_INIT};
use std::sync::atomic::Ordering::SeqCst;
use std::sync::mpsc::channel;
use std::thread;

use coco::deque;
use synchronoise::{SignalEvent, SignalKind};

/// A trait to extend `Iterator`s with consumers that work in parallel.
///
/// This trait is implemented for any iterator that is `Send + 'static` and whose items are
/// `Send + 'static`, allowing the items to be processed by multiple threads. Importing the trait
/// into your code allows these adapters to be used like any other iterator adapter - the only
/// difference is that between the time they start and the time they finish, they will have
/// spawned a number of threads to perform their work.
///
/// # Implementation Note
///
/// Even though these adapters process items in parallel, there is no guarantee that they will be
/// faster than their sequential counterparts. The iterator itself is a bottleneck for the
/// processing, since it needs an exclusive `&mut self` borrow to yield each item. This library
/// attempts to get around that by draining the items in a separate thread into a cache that the
/// workers load from, but the synchronization this requires means that swapping `map` for
/// `par_map` (for example) is not a universal win. In practice, these adapters are only expected
/// to speed up processing when the source iterator yields items quickly and the closures handed
/// to them do the expensive work.
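///
/// For instance, summing plain integers will almost certainly lose more time to synchronization
/// than the parallelism wins back, while a fold whose per-item work dominates can come out ahead.
/// A minimal sketch, where `expensive_computation` is a hypothetical stand-in for real per-item
/// work:
///
/// ```
/// use polyester::Polyester;
/// # fn expensive_computation(x: usize) -> usize { x }
///
/// let total = (0..10_000usize).par_fold(
///     0usize,
///     |acc, x| acc + expensive_computation(x),
///     |left, right| left + right,
/// );
/// ```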
pub trait Polyester<T>
{
    /// Folds the iterator in parallel.
    ///
    /// This method works in two parts:
    ///
    /// 1. Use a set of threads to fold items individually into a per-thread "sub-accumulator"
    ///    using `inner_fold`. Each per-thread sub-accumulator begins with a clone of `seed`.
    /// 2. Once the source iterator is exhausted, combine the sub-accumulators into a final
    ///    accumulator, starting with the first sub-accumulator received and folding each
    ///    additional one in with `outer_fold`.
    ///
    /// If there are no items in the iterator, `seed` is returned untouched.
    ///
    /// # Example
    ///
    /// ```
    /// use polyester::Polyester;
    /// # fn some_expensive_computation(it: usize) -> usize {
    /// #     if it == 7 { std::thread::sleep(std::time::Duration::from_secs(1)); }
    /// #     it
    /// # }
    ///
    /// let my_results = (0..1_000_000).par_fold(
    ///     vec![],
    ///     |mut acc, it| {
    ///         acc.push(some_expensive_computation(it));
    ///         acc
    ///     },
    ///     |mut left, right| {
    ///         left.extend(right);
    ///         left
    ///     }
    /// );
    /// ```
    fn par_fold<Acc, InnerFold, OuterFold>(
        self,
        seed: Acc,
        inner_fold: InnerFold,
        outer_fold: OuterFold,
    ) -> Acc
    where
        Acc: Clone + Send + 'static,
        InnerFold: Fn(Acc, T) -> Acc + Send + Sync + 'static,
        OuterFold: Fn(Acc, Acc) -> Acc;

    /// Maps the given closure onto each element in the iterator, in parallel.
    ///
    /// The `ParMap` adapter returned by this function starts up a thread pool to run `map` on each
    /// item of the iterator. The result of each `map` is then passed back to the calling thread,
    /// where it can be returned by `ParMap`'s `Iterator` implementation. Note that `ParMap` yields
    /// items in the order the *threads* finish them, which may not match the order the *source
    /// iterator* produced them.
    ///
    /// The `ParMap` adapter does not start its thread pool until it is first polled; after that,
    /// each call to `next` blocks until a worker sends the next item, and returns `None` once the
    /// iterator is exhausted and every worker has finished.
    ///
    /// # Example
    ///
    /// ```
    /// use polyester::Polyester;
    /// # fn some_expensive_computation(it: usize) -> usize {
    /// #     if it == 7 { std::thread::sleep(std::time::Duration::from_secs(1)); }
    /// #     it
    /// # }
    ///
    /// let my_results = (0..1_000_000).par_map(|it| some_expensive_computation(it))
    ///                                .collect::<Vec<_>>();
    /// ```
    fn par_map<Map, Out>(self, map: Map) -> ParMap<Self, Map, Out>
    where
        Self: Sized,
        Map: Fn(T) -> Out + Send + Sync + 'static,
        Out: Send + 'static;
}

impl<T, I> Polyester<T> for I
where
    I: Iterator<Item=T> + Send + 'static,
    T: Send + 'static,
{
    fn par_fold<Acc, InnerFold, OuterFold>(
        self,
        seed: Acc,
        inner_fold: InnerFold,
        outer_fold: OuterFold,
    ) -> Acc
    where
        Acc: Clone + Send + 'static,
        InnerFold: Fn(Acc, T) -> Acc + Send + Sync + 'static,
        OuterFold: Fn(Acc, Acc) -> Acc
    {
        let num_jobs = get_thread_count();

        if num_jobs == 1 {
            //it's not worth collecting the items into the hopper and spawning a thread if it's
            //still going to be serial, just fold it inline
            return self.fold(seed, inner_fold);
        }

        let hopper = Hopper::new(self, num_jobs);
        let inner_fold = Arc::new(inner_fold);
        let (report, recv) = channel();

        //launch the workers
        for id in 0..num_jobs {
            let hopper = hopper.clone();
            let acc = seed.clone();
            let inner_fold = inner_fold.clone();
            let report = report.clone();
            thread::spawn(move || {
                let mut acc = acc;
                let mut folded_any = false;

                while let Some(item) = hopper.get_item(id) {
                    acc = inner_fold(acc, item);
                    folded_any = true;
                }

                //only report a sub-accumulator if this worker actually folded something, so that
                //an empty iterator hands `seed` back untouched
                if folded_any {
                    report.send(acc).unwrap();
                }
            });
        }

        //hang up our initial channel so we don't wait for a response from it
        drop(report);

        //collect and fold the workers' results
        let mut acc: Option<Acc> = None;
        for res in recv.iter() {
            acc = Some(match acc {
                Some(acc) => outer_fold(acc, res),
                None => res,
            });
        }

        acc.unwrap_or(seed)
    }

    fn par_map<Map, Out>(self, map: Map) -> ParMap<I, Map, Out>
    where
        Map: Fn(T) -> Out + Send + Sync + 'static,
        Out: Send + 'static
    {
        ParMap {
            iter: Some(self),
            map: Some(map),
            recv: None,
        }
    }
}

/// A parallel `map` adapter, which uses multiple threads to process items.
///
/// This struct is returned by [`Polyester::par_map`]. See that function's documentation for more
/// details.
///
/// [`Polyester::par_map`]: trait.Polyester.html#method.par_map
pub struct ParMap<Iter, Map, T>
{
    iter: Option<Iter>,
    map: Option<Map>,
    recv: Option<mpsc::IntoIter<T>>,
}

impl<Iter, Map, T> Iterator for ParMap<Iter, Map, T>
where
    Iter: Iterator + Send + 'static,
    Iter::Item: Send + 'static,
    Map: Fn(Iter::Item) -> T + Send + Sync + 'static,
    T: Send + 'static,
{
    type Item = T;

    fn next(&mut self) -> Option<Self::Item> {
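        //the first call to `next` takes the iterator and the closure out of their `Option`s to
        //spin up the thread pool; every later call skips this and just drains the channel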
        if let (Some(iter), Some(map)) = (self.iter.take(), self.map.take()) {
            let num_jobs = get_thread_count();

            let hopper = Hopper::new(iter, num_jobs);
            let map = Arc::new(map);
            let (report, recv) = channel();

            //launch the workers
            for id in 0..num_jobs {
                let hopper = hopper.clone();
                let report = report.clone();
                let map = map.clone();
                thread::spawn(move || {
                    while let Some(item) = hopper.get_item(id) {
                        //if the channel has closed, the `ParMap` was dropped and no one is left
                        //to receive results, so stop working early
                        if report.send(map(item)).is_err() {
                            break;
                        }
                    }
                });
            }

            self.recv = Some(recv.into_iter());
        }

        self.recv.as_mut().and_then(|r| r.next())
    }
}

/// A distributed cache of an iterator's items, meant to be filled by a background thread so that
/// worker threads can pull from a per-thread cache.
struct Hopper<T> {
    /// The cache of items, broken down into per-thread sub-caches.
    cache: Vec<deque::Stealer<T>>,
    /// A set of associated `SignalEvent`s for worker threads to wait on while the background
    /// thread adds more items to their caches.
    signals: Vec<SignalEvent>,
    /// A flag that records whether the source iterator has been exhausted.
    done: AtomicBool,
}

impl<T> Hopper<T> {
    /// Creates a new `Hopper` from the given iterator, with the given number of slots, and starts
    /// the background cache-filler thread.
    fn new<I>(iter: I, slots: usize) -> Arc<Hopper<T>>
    where
        I: Iterator<Item=T> + Send + 'static,
        T: Send + 'static,
    {
        //the fillers go into the filler thread, the cache and signals go into the final hopper
        let mut fillers = Vec::with_capacity(slots);
        let mut cache = Vec::with_capacity(slots);
        let mut signals = Vec::<SignalEvent>::with_capacity(slots);

        for _ in 0..slots {
            let (filler, stealer) = deque::new();
            fillers.push(filler);
            cache.push(stealer);
            //start the SignalEvents as "ready" in case the filler thread gets a head start on
            //the workers
            signals.push(SignalEvent::new(true, SignalKind::Auto));
        }

        let ret = Arc::new(Hopper {
            cache,
            signals,
            done: AtomicBool::new(false),
        });

        let hopper = ret.clone();

        thread::spawn(move || {
            let mut current_slot = 0;
            let mut rounds = 0usize;

            for item in iter {
                fillers[current_slot].push(item);

                current_slot = (current_slot + 1) % slots;
                if current_slot == 0 {
                    rounds += 1;

                    //every time we've added (slots * 2) items to each slot, wake up the workers
                    //in case they're waiting on more items
                    if rounds % (slots * 2) == 0 {
                        for signal in &hopper.signals {
                            signal.signal();
                        }
                    }
                }
            }

            //we're out of items, so tell all the workers that we're done
            hopper.done.store(true, SeqCst);

            //...and wake them up so they can react to the "done" signal
            for signal in &hopper.signals {
                signal.signal();
            }
        });

        ret
    }

    /// Loads an item from the given cache slot, potentially blocking while the cache-filler worker
    /// adds more items.
    fn get_item(&self, id: usize) -> Option<T> {
        loop {
            //go pull from our cache to see if we have anything
            if let Some(item) = self.cache[id].steal() {
                return Some(item);
            }

            //...but before we sleep, go check the other caches to see if they still have anything
            let mut current_id = id;
            loop {
                current_id = (current_id + 1) % self.cache.len();
                if current_id == id { break; }

                if let Some(item) = self.cache[current_id].steal() {
                    return Some(item);
                }
            }

            //double-check our cache's length before blocking, in case the filler restocked it in
            //the meantime
            if self.cache[id].len() == 0 {
                //as a final check, see whether the filler thread is finished
                if self.done.load(SeqCst) {
                    return None;
                }

                //otherwise, wait for the cache-filler to get more items
                self.signals[id].wait();
            }
        }
    }
}

static THREAD_COUNT: AtomicUsize = ATOMIC_USIZE_INIT;

/// Sets the number of worker threads started by the [`Polyester`] adapters.
///
/// [`Polyester`]: trait.Polyester.html
///
/// This sets an atomic counter that the [`Polyester`] adapters read to decide how many worker
/// threads to start. Note that one extra thread is always spawned on top of this count, to
/// dispatch the source iterator's items to the workers.
///
/// If this is set to zero (or never set in the first place), the adapters will spawn as many
/// threads as there are logical CPUs on the current system (as reported by `num_cpus`).
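///
/// # Example
///
/// ```
/// // cap the pool at four workers (plus the one dispatch thread)
/// polyester::set_thread_count(4);
/// ```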
pub fn set_thread_count(count: usize) {
    THREAD_COUNT.store(count, SeqCst);
}

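/// Reads the configured thread count, falling back to the number of logical CPUs when the count
/// was never set (or was set to zero).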
fn get_thread_count() -> usize {
    let count = THREAD_COUNT.load(SeqCst);

    if count == 0 {
        num_cpus::get()
    } else {
        count
    }
}

#[cfg(test)]
mod tests {
    use super::Polyester;
    use std::time::{Instant, Duration};

    fn secs_millis(dur: Duration) -> (u64, u32) {
        (dur.as_secs(), dur.subsec_nanos() / 1_000_000)
    }

    #[test]
    fn basic_fold() {
        let before = Instant::now();
        let par = (0..1_000_000).par_fold(0usize, |l,r| l+r, |l,r| l+r);
        let after_par = Instant::now();
        let seq = (0..1_000_000).fold(0usize, |l,r| l+r);
        let after_seq = Instant::now();

        let par_dur = secs_millis(after_par.duration_since(before));
        let seq_dur = secs_millis(after_seq.duration_since(after_par));
        println!("");
        println!("    parallel fold:   {}.{:03}s", par_dur.0, par_dur.1);
        println!("    sequential fold: {}.{:03}s", seq_dur.0, seq_dur.1);

        assert_eq!(par, seq);
    }

    #[test]
    fn basic_map() {
        let before = Instant::now();
        let mut par = (0..1_000_000).par_map(|x| x*x).collect::<Vec<usize>>();
        let after_par = Instant::now();
        let mut seq = (0..1_000_000).map(|x| x*x).collect::<Vec<usize>>();
        let after_seq = Instant::now();

        par.sort();
        seq.sort();

        let par_dur = secs_millis(after_par.duration_since(before));
        let seq_dur = secs_millis(after_seq.duration_since(after_par));
        println!("");
        println!("    parallel map:   {}.{:03}s", par_dur.0, par_dur.1);
        println!("    sequential map: {}.{:03}s", seq_dur.0, seq_dur.1);

        assert_eq!(par, seq);
    }
}