1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
use atomic_option::AtomicOption;
use failure::Error;
use fnv::FnvHashMap;
use proto::{self, Client, ClientOptions, HeartbeatStatus, Reconnect};
use std::error::Error as StdError;
use std::io::prelude::*;
use std::net::TcpStream;
use std::sync::{atomic, Arc};

use proto::{Ack, Fail, Job};

const STATUS_RUNNING: usize = 0;
const STATUS_QUIET: usize = 1;
const STATUS_TERMINATING: usize = 2;

type JobRunner<E> = Fn(Job) -> Result<(), E> + Send + Sync;
type BoxedJobRunner<E> = Box<JobRunner<E>>;

/// `Consumer` is used to run a worker that processes jobs provided by Faktory.
///
/// # Building the worker
///
/// Faktory needs a decent amount of information from its workers, such as a unique worker ID, a
/// hostname for the worker, its process ID, and a set of labels used to identify the worker. In
/// order to enable setting all these, constructing a worker is a two-step process. You first use a
/// [`ConsumerBuilder`](struct.ConsumerBuilder.html) (which conveniently implements a sensible
/// `Default`) to set the worker metadata, as well as to register any job handlers. You then use
/// one of the `connect_*` methods to finalize the worker and connect to the Faktory server.
///
/// In most cases, `ConsumerBuilder::default()` will do what you want. You only need to augment it
/// with calls to [`register`](struct.ConsumerBuilder.html#method.register) to register handlers
/// for each of your job types, and then you can connect. If you have different *types* of workers,
/// you may also want to use [`labels`](struct.ConsumerBuilder.html#method.labels) to distinguish
/// them in the Faktory Web UI. To specify that some jobs should only go to some workers, use
/// different queues.
///
/// ## Handlers
///
/// For each [`Job`](struct.Job.html) that the worker receives, the handler that is registered for
/// that job's type will be called. If a job is received with a type for which no handler exists,
/// the job will be failed and returned to the Faktory server. Similarly, if a handler returns an
/// error response, the job will be failed, and the error reported back to the Faktory server.
///
/// If you are new to Rust, getting the handler types to work out can be a little tricky. If you
/// want to understand why, I highly recommend that you have a look at the chapter on [closures and
/// generic
/// parameters](https://doc.rust-lang.org/book/second-edition/ch13-01-closures.html#using-closures-with-generic-parameters-and-the-fn-traits)
/// in the Rust Book. If you just want it to work, my recommendation is to either use regular
/// functions instead of closures, and giving `&func_name` as the handler, **or** wrapping all your
/// closures in `Box::new()`.
///
/// ## Concurrency
///
/// By default, only a single thread is spun up to process the jobs given to this worker. If you
/// want to dedicate more resources to processing jobs, you have a number of options listed below.
/// As you go down the list below, efficiency increases, but fault isolation decreases. I will not
/// give further detail here, but rather recommend that if these don't mean much to you, you should
/// use the last approach and let the library handle the concurrency for you.
///
///  - You can spin up more worker processes by launching your worker program more than once.
///  - You can create more than one `Consumer`.
///  - You can call [`ConsumerBuilder::workers`](struct.ConsumerBuilder.html#method.workers) to set
///    the number of worker threads you'd like the `Consumer` to use internally.
///
/// # Connecting to Faktory
///
/// To fetch jobs, the `Consumer` must first be connected to the Faktory server. Exactly how you do
/// that depends on your setup. In most cases, you'll want to use `Consumer::connect`, and provide
/// a connection URL. If you supply a URL, it must be of the form:
///
/// ```text
/// protocol://[:password@]hostname[:port]
/// ```
///
/// Faktory suggests using the `FAKTORY_PROVIDER` and `FAKTORY_URL` environment variables (see
/// their docs for more information) with `localhost:7419` as the fallback default. If you want
/// this behavior, pass `None` as the URL.
///
/// See the [`Producer` examples](struct.Producer.html#examples) for examples of how to connect to
/// different Faktory setups.
///
/// # Worker lifecycle
///
/// Okay, so you've built your worker and connected to the Faktory server. Now what?
///
/// If all this process is doing is handling jobs, reconnecting on failure, and exiting when told
/// to by the Faktory server, you should use
/// [`run_to_completion`](struct.Consumer.html#method.run_to_completion). If you want more
/// fine-grained control over the lifetime of your process, you should use
/// [`Consumer::run`](struct.Consumer.html#method.run). See the documentation for each of these
/// methods for details.
///
/// # Examples
///
/// Create a worker with all default options, register a single handler (for the `foobar` job
/// type), connect to the Faktory server, and start accepting jobs.
///
/// ```no_run
/// use faktory::ConsumerBuilder;
/// use std::io;
/// let mut c = ConsumerBuilder::default();
/// c.register("foobar", |job| -> io::Result<()> {
///     println!("{:?}", job);
///     Ok(())
/// });
/// let mut c = c.connect(None).unwrap();
/// if let Err(e) = c.run(&["default"]) {
///     println!("worker failed: {}", e);
/// }
/// ```
pub struct Consumer<S, E>
where
    S: Read + Write,
{
    // Protocol client used to talk to the Faktory server.
    c: Client<S>,
    // Per-worker slot holding the result (ACK jid on success, or the FAIL) that we
    // may still owe the server after a connection failure; replayed by `run`
    // before new jobs are fetched.
    last_job_results: Arc<Vec<AtomicOption<Result<String, Fail>>>>,
    // Per-worker slot holding the jid currently being processed, so in-flight jobs
    // can be FAILed if the server tells us to terminate mid-run.
    running_jobs: Arc<Vec<AtomicOption<String>>>,
    // Registered job handlers, keyed by job type (`Job::kind`).
    callbacks: Arc<FnvHashMap<String, BoxedJobRunner<E>>>,
    // Set once the server has told this worker to terminate; calling `run` again
    // after that panics.
    terminated: bool,
}

/// Convenience wrapper for building a Faktory worker.
///
/// See the [`Consumer`](struct.Consumer.html) documentation for details.
pub struct ConsumerBuilder<E> {
    // Client options (hostname, wid, labels, password, ...) handed to `Client::new`.
    opts: ClientOptions,
    // Number of worker threads `Consumer::run` will spin up; defaults to 1.
    workers: usize,
    // Job handlers registered so far, keyed by job type.
    callbacks: FnvHashMap<String, BoxedJobRunner<E>>,
}

impl<E> Default for ConsumerBuilder<E> {
    /// Construct a new worker with default worker options and the url fetched from environment
    /// variables.
    ///
    /// This will construct a worker where:
    ///
    ///  - `hostname` is this machine's hostname.
    ///  - `wid` is a randomly generated string.
    ///  - `pid` is the OS PID of this process.
    ///  - `labels` is `["rust"]`.
    ///
    fn default() -> Self {
        ConsumerBuilder {
            opts: ClientOptions::default(),
            workers: 1,
            callbacks: Default::default(),
        }
    }
}

impl<E> ConsumerBuilder<E> {
    /// Set the hostname to use for this worker.
    ///
    /// Defaults to the machine's hostname as reported by the operating system.
    pub fn hostname(&mut self, hn: String) -> &mut Self {
        self.opts.hostname = Some(hn);
        self
    }

    /// Set a unique identifier for this worker.
    ///
    /// Defaults to a randomly generated ASCII string.
    pub fn wid(&mut self, wid: String) -> &mut Self {
        self.opts.wid = Some(wid);
        self
    }

    /// Set the labels to use for this worker.
    ///
    /// Defaults to `["rust"]`.
    pub fn labels(&mut self, labels: Vec<String>) -> &mut Self {
        self.opts.labels = labels;
        self
    }

    /// Set the number of workers to use for `run` and `run_to_completion_*`.
    ///
    /// Defaults to 1.
    pub fn workers(&mut self, w: usize) -> &mut Self {
        self.workers = w;
        self
    }

    /// Register a handler function for the given job type (`kind`).
    ///
    /// Whenever a job whose type matches `kind` is fetched from the Faktory, the given handler
    /// function is called with that job as its argument.
    pub fn register<K, H>(&mut self, kind: K, handler: H) -> &mut Self
    where
        K: Into<String>,
        // Annoyingly, can't just use the JobRunner<E> type alias here.
        H: Fn(Job) -> Result<(), E> + Send + Sync + 'static,
    {
        self.callbacks.insert(kind.into(), Box::new(handler));
        self
    }

    /// Connect to a Faktory server.
    ///
    /// If `url` is not given, will use the standard Faktory environment variables. Specifically,
    /// `FAKTORY_PROVIDER` is read to get the name of the environment variable to get the address
    /// from (defaults to `FAKTORY_URL`), and then that environment variable is read to get the
    /// server address. If the latter environment variable is not defined, the connection will be
    /// made to
    ///
    /// ```text
    /// tcp://localhost:7419
    /// ```
    ///
    /// If `url` is given, but does not specify a port, it defaults to 7419.
    pub fn connect(self, url: Option<&str>) -> Result<Consumer<TcpStream, E>, Error> {
        let url = match url {
            Some(url) => proto::url_parse(url),
            None => proto::url_parse(&proto::get_env_url()),
        }?;
        let stream = TcpStream::connect(proto::host_from_url(&url))?;
        Self::connect_with(self, stream, url.password().map(|p| p.to_string()))
    }

    /// Connect to a Faktory server with a non-standard stream.
    pub fn connect_with<S: Read + Write>(
        mut self,
        stream: S,
        pwd: Option<String>,
    ) -> Result<Consumer<S, E>, Error> {
        self.opts.password = pwd;
        Ok(Consumer::new(
            Client::new(stream, self.opts)?,
            self.workers,
            self.callbacks,
        ))
    }
}

// Reason a fetched job could not be completed successfully.
enum Failed<E: StdError> {
    /// The registered handler for the job's type returned an error.
    Application(E),
    /// No handler was registered for the job's type; holds the unknown type name.
    BadJobType(String),
}

impl<E, S: Read + Write> Consumer<S, E> {
    /// Assemble a `Consumer` from an established client, allocating one empty
    /// job-tracking slot per worker thread.
    fn new(c: Client<S>, workers: usize, callbacks: FnvHashMap<String, BoxedJobRunner<E>>) -> Self {
        let running_jobs: Vec<_> = (0..workers).map(|_| AtomicOption::empty()).collect();
        let last_job_results: Vec<_> = (0..workers).map(|_| AtomicOption::empty()).collect();
        Consumer {
            c,
            last_job_results: Arc::new(last_job_results),
            running_jobs: Arc::new(running_jobs),
            callbacks: Arc::new(callbacks),
            terminated: false,
        }
    }
}

impl<E, S: Read + Write + Reconnect> Consumer<S, E> {
    /// Re-establish this worker's connection to the Faktory server, delegating
    /// to the client's stored reconnection logic.
    fn reconnect(&mut self) -> Result<(), Error> {
        self.c.reconnect()
    }
}

impl<S, E> Consumer<S, E>
where
    S: Read + Write,
    E: StdError,
{
    /// Dispatch `job` to the handler registered for its type.
    ///
    /// Returns `Failed::BadJobType` if no handler is registered for `job.kind`,
    /// or `Failed::Application` wrapping the handler's error if the handler fails.
    fn run_job(&mut self, job: Job) -> Result<(), Failed<E>> {
        match self.callbacks.get(&job.kind) {
            Some(callback) => callback(job).map_err(Failed::Application),
            None => {
                // cannot execute job, since no handler exists
                Err(Failed::BadJobType(job.kind))
            }
        }
    }

    /// Fetch and run a single job on the current thread, and then return.
    ///
    /// Returns `Ok(false)` if no job was available on any of `queues`, and
    /// `Ok(true)` once a job has been run and its outcome reported to the
    /// server. `worker` indexes into the per-worker bookkeeping slots.
    pub fn run_one<Q>(&mut self, worker: usize, queues: &[Q]) -> Result<bool, Error>
    where
        Q: AsRef<str>,
    {
        // get a job
        let job = match self.c.fetch(queues)? {
            Some(job) => job,
            None => return Ok(false),
        };

        // remember the job id
        let jid = job.jid.clone();

        // keep track of running job in case we're terminated during it
        self.running_jobs[worker].swap(Box::new(jid.clone()), atomic::Ordering::SeqCst);

        // process the job
        let r = self.run_job(job);

        // report back
        match r {
            Ok(_) => {
                // job done -- acknowledge
                // remember it in case we fail to notify the server (e.g., broken connection)
                // NOTE: the result is stored *before* sending so that `run` can
                // replay it after a reconnect without re-executing the job.
                self.last_job_results[worker]
                    .swap(Box::new(Ok(jid.clone())), atomic::Ordering::SeqCst);
                self.c.issue(&Ack::new(jid))?.await_ok()?;
            }
            Err(e) => {
                // job failed -- let server know
                // "unknown" is the errtype used by the go library too
                let fail = match e {
                    Failed::BadJobType(jt) => {
                        Fail::new(jid, "unknown", format!("No handler for {}", jt))
                    }
                    Failed::Application(e) => {
                        let mut f = Fail::new(jid, "unknown", format!("{}", e));
                        // walk the error's cause chain to give the server a backtrace
                        let mut root = e.cause();
                        let mut backtrace = Vec::new();
                        while let Some(r) = root.take() {
                            backtrace.push(format!("{}", r));
                            root = r.cause();
                        }
                        f.set_backtrace(backtrace);
                        f
                    }
                };

                // as in the Ok arm: store the FAIL before sending it, so it can
                // be replayed by `run` if the connection drops mid-report
                let fail2 = fail.clone();
                self.last_job_results[worker].swap(Box::new(Err(fail)), atomic::Ordering::SeqCst);
                self.c.issue(&fail2)?.await_ok()?;
            }
        }

        // we won't have to tell the server again
        self.last_job_results[worker].take(atomic::Ordering::SeqCst);
        self.running_jobs[worker].take(atomic::Ordering::SeqCst);
        Ok(true)
    }

    // Test helper: run up to `n` jobs sequentially as worker 0.
    #[cfg(test)]
    pub(crate) fn run_n<Q>(&mut self, n: usize, queues: &[Q]) -> Result<(), Error>
    where
        Q: AsRef<str>,
    {
        for _ in 0..n {
            self.run_one(0, queues)?;
        }
        Ok(())
    }
}

impl<S, E> Consumer<S, E>
where
    S: Read + Write + Reconnect + Send + 'static,
    E: StdError + 'static,
{
    /// Clone this consumer for use by a worker thread: a *new* connection to
    /// the server is established, while the callbacks and per-worker
    /// bookkeeping slots are shared with the original via `Arc`.
    fn for_worker(&mut self) -> Result<Self, Error> {
        Ok(Consumer {
            c: self.c.connect_again()?,
            callbacks: Arc::clone(&self.callbacks),
            running_jobs: Arc::clone(&self.running_jobs),
            last_job_results: Arc::clone(&self.last_job_results),
            terminated: self.terminated,
        })
    }

    /// Run this worker on the given `queues` until an I/O error occurs (`Err` is returned), or
    /// until the server tells the worker to disengage (`Ok` is returned).
    ///
    /// The value in an `Ok` indicates the number of workers that may still be processing jobs.
    ///
    /// Note that if the worker fails, [`reconnect()`](struct.Consumer.html#method.reconnect)
    /// should likely be called before calling `run()` again. If an error occurred while reporting
    /// a job success or failure, the result will be re-reported to the server without re-executing
    /// the job. If the worker was terminated (i.e., `run` returns with an `Ok` response), the
    /// worker should **not** try to resume by calling `run` again. This will cause a panic.
    pub fn run<Q>(&mut self, queues: &[Q]) -> Result<usize, Error>
    where
        Q: AsRef<str>,
    {
        assert!(!self.terminated, "do not re-run a terminated worker");
        // no worker threads from a previous `run` may still hold a clone of the state
        assert_eq!(Arc::strong_count(&self.last_job_results), 1);

        // retry delivering notification about our last job result.
        // we know there's no leftover thread at this point, so there's no race on the option.
        for last_job_result in self.last_job_results.iter() {
            if let Some(res) = last_job_result.take(atomic::Ordering::SeqCst) {
                let r = match *res {
                    Ok(ref jid) => self.c.issue(&Ack::new(&**jid)),
                    Err(ref fail) => self.c.issue(fail),
                };

                let r = match r {
                    Ok(r) => r,
                    Err(e) => {
                        // delivery failed again -- put the result back so a
                        // later `run` can retry it
                        last_job_result.swap(res, atomic::Ordering::SeqCst);
                        return Err(e);
                    }
                };

                if let Err(e) = r.await_ok() {
                    // it could be that the server did previously get our ACK/FAIL, and that it was
                    // the resulting OK that failed. in that case, we would get an error response
                    // when re-sending the job response. this should not count as critical. other
                    // errors, however, should!
                    if e.downcast_ref::<::std::io::Error>().is_some() {
                        last_job_result.swap(res, atomic::Ordering::SeqCst);
                        return Err(e);
                    }
                }
            }
        }

        // keep track of the current status of each worker
        let status: Vec<_> = (0..self.running_jobs.len())
            .map(|_| Arc::new(atomic::AtomicUsize::new(STATUS_RUNNING)))
            .collect();

        // start worker threads
        use std::thread;
        let workers = status
            .iter()
            .enumerate()
            .map(|(worker, status)| {
                let mut w = self.for_worker()?;
                let status = Arc::clone(status);
                let queues: Vec<_> = queues.into_iter().map(|s| s.as_ref().to_string()).collect();
                Ok(thread::spawn(move || {
                    // fetch-and-run until told to stop; any error terminates the thread
                    while status.load(atomic::Ordering::SeqCst) == STATUS_RUNNING {
                        if let Err(e) = w.run_one(worker, &queues[..]) {
                            status.store(STATUS_TERMINATING, atomic::Ordering::SeqCst);
                            return Err(e);
                        }
                    }
                    status.store(STATUS_TERMINATING, atomic::Ordering::SeqCst);
                    Ok(())
                }))
            })
            .collect::<Result<Vec<_>, Error>>()?;

        // listen for heartbeats
        let mut target = STATUS_RUNNING;
        let exit = {
            use std::time;
            let mut last = time::Instant::now();

            loop {
                use std::thread;

                thread::sleep(time::Duration::from_millis(100));

                // has a worker failed?
                if target == STATUS_RUNNING
                    && status
                        .iter()
                        .any(|s| s.load(atomic::Ordering::SeqCst) == STATUS_TERMINATING)
                {
                    // tell all workers to exit
                    // (though chances are they've all failed already)
                    for s in &status {
                        s.store(STATUS_TERMINATING, atomic::Ordering::SeqCst);
                    }
                    break Ok(false);
                }

                if last.elapsed().as_secs() < 5 {
                    // don't send a heartbeat yet
                    continue;
                }

                match self.c.heartbeat() {
                    Ok(hb) => {
                        match hb {
                            HeartbeatStatus::Ok => {}
                            HeartbeatStatus::Quiet => {
                                // tell the workers to eventually terminate
                                for s in &status {
                                    s.store(STATUS_QUIET, atomic::Ordering::SeqCst);
                                }
                                target = STATUS_QUIET;
                            }
                            HeartbeatStatus::Terminate => {
                                // tell the workers to terminate
                                // *and* fail the current job and immediately return
                                // NOTE(review): this stores STATUS_QUIET (not
                                // STATUS_TERMINATING); the loop breaks right away and
                                // running jobs are FAILed below -- confirm intentional.
                                for s in &status {
                                    s.store(STATUS_QUIET, atomic::Ordering::SeqCst);
                                }
                                break Ok(true);
                            }
                        }
                    }
                    Err(e) => {
                        // for this to fail, the workers have probably also failed
                        for s in &status {
                            s.store(STATUS_TERMINATING, atomic::Ordering::SeqCst);
                        }
                        break Err(e);
                    }
                }
                last = time::Instant::now();
            }
        };

        // there are a couple of cases here:
        //
        //  - we got TERMINATE, so we should just return, even if a worker is still running
        //  - we got TERMINATE and all workers have exited
        //  - we got an error from heartbeat()
        //
        self.terminated = exit.is_ok();
        if let Ok(true) = exit {
            // FAIL currently running jobs even though they're still running
            let mut running = 0;
            for running_job in self.running_jobs.iter() {
                if let Some(jid) = running_job.take(atomic::Ordering::SeqCst) {
                    let f = Fail::new(&**jid, "unknown", "terminated");

                    // if this fails, we don't want to exit with Err(),
                    // because we *were* still terminated!
                    // (`.is_ok()` deliberately discards the send result)
                    self.c.issue(&f).and_then(|r| r.await_ok()).is_ok();

                    running += 1;
                }
            }

            if running != 0 {
                return Ok(running);
            }
        }

        match exit {
            Ok(_) => {
                // we want to expose any worker errors
                workers
                    .into_iter()
                    .map(|w| w.join().unwrap())
                    .collect::<Result<Vec<_>, _>>()
                    .map(|_| 0)
            }
            Err(e) => {
                // we want to expose worker errors, or otherwise the heartbeat error
                workers
                    .into_iter()
                    .map(|w| w.join().unwrap())
                    .collect::<Result<Vec<_>, _>>()
                    .and_then(|_| Err(e))
            }
        }
    }

    /// Run this worker until the server tells us to exit or a connection cannot be re-established.
    ///
    /// This function never returns. When the worker decides to exit, the process is terminated.
    pub fn run_to_completion<Q>(mut self, queues: &[Q]) -> !
    where
        Q: AsRef<str>,
    {
        use std::process;
        // keep running; on I/O errors, try to reconnect and resume
        while self.run(queues).is_err() {
            if self.reconnect().is_err() {
                break;
            }
        }

        process::exit(0);
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    // https://github.com/rust-lang/rust/pull/42219
    //#[allow_fail]
    #[ignore]
    fn it_works() {
        use producer::Producer;
        use std::io;

        // enqueue a job for the worker to pick up
        let mut producer = Producer::connect(None).unwrap();
        let mut job = Job::new("foobar", vec!["z"]);
        job.queue = "worker_test_1".to_string();
        producer.enqueue(job).unwrap();

        // build a consumer with a handler for that job type
        let mut builder = ConsumerBuilder::default();
        builder.register("foobar", |job: Job| -> Result<(), io::Error> {
            assert_eq!(job.args, vec!["z"]);
            Ok(())
        });
        let mut consumer = builder.connect(None).unwrap();

        // process exactly one job and make sure nothing went wrong
        let outcome = consumer.run_n(1, &["worker_test_1"]);
        if outcome.is_err() {
            println!("{:?}", outcome);
        }
        assert!(outcome.is_ok());
    }
}