1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
// Copyright 2017 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

//! A fail point implementation for Rust.
//!
//! Fail points are code instrumentations that allow errors and other behavior
//! to be injected dynamically at runtime, primarily for testing purposes. Fail
//! points are flexible and can be configured to exhibit a variety of behavior,
//! including panics, early returns, and sleeping. They can be controlled both
//! programmatically and via the environment, and can be triggered
//! conditionally and probabilistically.
//!
//! This crate is inspired by FreeBSD's
//! [failpoints](https://freebsd.org/cgi/man.cgi?query=fail).
//!
//! ## Usage
//!
//! First, add this to your `Cargo.toml`:
//!
//! ```toml
//! [dependencies]
//! fail = "0.2"
//! ```
//!
//! Now you can import the `fail_point!` macro from the `fail` crate and use it
//! to inject dynamic failures.
//!
//! As an example, here's a simple program that uses a fail point to simulate an
//! I/O panic:
//!
//! ```rust
//! #[macro_use]
//! extern crate fail;
//!
//! fn do_fallible_work() {
//!     fail_point!("read-dir");
//!     let _dir: Vec<_> = std::fs::read_dir(".").unwrap().collect();
//!     // ... do some work on the directory ...
//! }
//!
//! fn main() {
//!     fail::setup();
//!     do_fallible_work();
//!     fail::teardown();
//!     println!("done");
//! }
//! ```
//!
//! Here, the program calls `unwrap` on the result of `read_dir`, a function
//! that returns a `Result`. In other words, this particular program expects
//! this call to `read_dir` to always succeed. And in practice it almost always
//! will, which makes the behavior of this program when `read_dir` fails
//! difficult to test. By instrumenting the program with a fail point we can
//! pretend that `read_dir` failed, causing the subsequent `unwrap` to panic,
//! and allowing us to observe the program's behavior under failure conditions.
//!
//! When the program is run normally it just prints "done":
//!
//! ```sh
//! $ cargo run
//!     Finished dev [unoptimized + debuginfo] target(s) in 0.01s
//!      Running `target/debug/failpointtest`
//! done
//! ```
//!
//! But now, by setting the `FAILPOINTS` variable we can see what happens if the
//! `read_dir` fails:
//!
//! ```sh
//! $ FAILPOINTS=read-dir=panic cargo run
//!     Finished dev [unoptimized + debuginfo] target(s) in 0.01s
//!      Running `target/debug/failpointtest`
//! thread 'main' panicked at 'failpoint read-dir panic', /home/ubuntu/.cargo/registry/src/github.com-1ecc6299db9ec823/fail-0.2.0/src/lib.rs:286:25
//! note: Run with `RUST_BACKTRACE=1` for a backtrace.
//! ```
//!
//! ## Usage in tests
//!
//! The previous example triggers a fail point by modifying the `FAILPOINTS`
//! environment variable. In practice, you'll often want to trigger fail points
//! programmatically, in unit tests. Unfortunately, unit testing with fail
//! points is complicated by concurrency concerns, so requires some careful
//! setup. First, let's see the intuitive &mdash; but wrong &mdash; way to test
//! with fail points.
//!
//! This next example is like the previous, except instead of controlling fail
//! points with an environment variable, it does so with the `fail::cfg`
//! function, and instead of having a `main` function, it has a test case:
//!
//! ```rust
//! #[macro_use]
//! extern crate fail;
//!
//! fn do_fallible_work() {
//!     fail_point!("read-dir");
//!     let _dir: Vec<_> = std::fs::read_dir(".").unwrap().collect();
//!     // ... do some work on the directory ...
//! }
//!
//! #[test]
//! #[should_panic]
//! fn test_fallible_work() {
//!     fail::setup();
//!     fail::cfg("read-dir", "panic").unwrap();
//!     do_fallible_work();
//!     fail::teardown();
//! }
//! # fn main() { }
//! ```
//!
//! So this is a test that sets up the fail point to panic, and the test is
//! expected to panic because it has the `#[should_panic]` attribute.
//!
//! And this works fine.
//!
//! But only in this simple case. It is not correct generally. This is because
//! fail points are global resources that can be accessed from any thread, and
//! `setup` and `teardown` are operations that have global effect, and Rust
//! tests are run in multiple threads, in parallel. As a result, _if more than
//! one test calls `setup`, `teardown`, or configures the same fail point then
//! their result is non-deterministic_.
//!
//! To account for this we need to serialize the execution of tests by holding
//! a global lock, and only running a single fail point test at a time.
//!
//! Here's the correct way to write this test, and the basic pattern for writing
//! tests with fail points:
//!
//! ```
//! #[macro_use]
//! extern crate lazy_static;
//! #[macro_use]
//! extern crate fail;
//!
//! use std::sync::{Mutex, MutexGuard};
//!
//! fn do_fallible_work() {
//!     fail_point!("read-dir");
//!     let _dir: Vec<_> = std::fs::read_dir(".").unwrap().collect();
//!     // ... do some work on the directory ...
//! }
//!
//! lazy_static! {
//!     static ref LOCK: Mutex<()> = Mutex::new(());
//! }
//!
//! fn setup<'a>() -> MutexGuard<'a, ()> {
//!     let guard = LOCK.lock().unwrap_or_else(|e| e.into_inner());
//!     fail::teardown();
//!     fail::setup();
//!     guard
//! }
//!
//! #[test]
//! #[should_panic]
//! fn test_fallible_work() {
//!     let _guard = setup();
//!     fail::cfg("read-dir", "panic").unwrap();
//!     do_fallible_work();
//! }
//! # fn main() { }
//! ```
//!
//! With this arrangement, any test that calls `setup` and holds the resulting
//! guard for the duration will not run in parallel with other tests. It depends
//! on the [`lazy_static`](https://crates.io/crates/lazy_static) crate to
//! initialize a global mutex.
//!
//! Note that this type of guard is not only necessary for test cases that
//! configure fail points, but also, if there are _any_ test cases that enable
//! fail points in the same crate, then the guard is also necessary for any
//! tests that execute the code containing those fail points, even if those
//! tests don't call `fail::cfg` themselves. In our example, consider what
//! happens if we have two test cases that test `do_fallible_work`, and one of
//! them configures the fail point, expecting the function to fail, while the
//! other does not configure the fail point, expecting it to succeed. Then
//! consider what might happen if those tests execute in parallel &mdash; the
//! result is not deterministic and there will be spurious test failures.
//!
//! Because of this it is a best practice to put all fail point unit tests into
//! their own binary. Here's an example of a snippet from `Cargo.toml` that
//! creates a fail-point-specific test binary:
//!
//! ```toml
//! [[test]]
//! name = "failpoints"
//! path = "tests/failpoints/mod.rs"
//! ```
//!
//!
//! ## Early return
//!
//! The previous examples illustrate injecting panics via fail points, but
//! panics aren't the only &mdash; or even the most common &mdash; error pattern
//! in Rust. The more common type of error is propagated by `Result` return
//! values, and fail points can inject those as well with "early returns". That
//! is, when configuring a fail point as "return" (as opposed to "panic"), the
//! fail point will immediately return from the function, optionally with a
//! configurable value.
//!
//! The setup for early return requires a slightly different invocation of the
//! `fail_point!` macro. To illustrate this, let's modify the `do_fallible_work`
//! function we used earlier to return a `Result`:
//!
//! ```rust
//! #[macro_use]
//! extern crate fail;
//!
//! use std::io;
//!
//! fn do_fallible_work() -> io::Result<()> {
//!     fail_point!("read-dir");
//!     let _dir: Vec<_> = std::fs::read_dir(".")?.collect();
//!     // ... do some work on the directory ...
//!     Ok(())
//! }
//!
//! fn main() -> io::Result<()> {
//!     fail::setup();
//!     do_fallible_work()?;
//!     fail::teardown();
//!     println!("done");
//!     Ok(())
//! }
//! ```
//!
//! So this example has more proper Rust error handling, with no unwraps
//! anywhere. Instead it uses `?` to propagate errors via the `Result` type
//! return values. This is more realistic Rust code.
//!
//! The "read-dir" fail point though is not yet configured to support early
//! return, so if we attempt to configure it to "return", we'll see an error
//! like
//!
//! ```sh
//! $ FAILPOINTS=read-dir=return cargo run
//!     Finished dev [unoptimized + debuginfo] target(s) in 0.13s
//!      Running `target/debug/failpointtest`
//! thread 'main' panicked at 'Return is not supported for the fail point "read-dir"', src/main.rs:7:5
//! note: Run with `RUST_BACKTRACE=1` for a backtrace.
//! ```
//!
//! This error tells us that the "read-dir" fail point is not defined correctly
//! to support early return, and gives us the line number of that fail point.
//! What we're missing in the fail point definition is code describing _how_ to
//! return an error value, and the way we do this is by passing `fail_point!` a
//! closure that returns the same type as the enclosing function.
//!
//! Here's a variation that does so:
//!
//! ```rust
//! # #[macro_use] extern crate fail;
//! # use std::io;
//! fn do_fallible_work() -> io::Result<()> {
//!     fail_point!("read-dir", |_| {
//!         Err(io::Error::new(io::ErrorKind::PermissionDenied, "error"))
//!     });
//!     let _dir: Vec<_> = std::fs::read_dir(".")?.collect();
//!     // ... do some work on the directory ...
//!     Ok(())
//! }
//! ```
//!
//! And now if the "read-dir" fail point is configured to "return" we get a
//! different result:
//!
//! ```sh
//! $ FAILPOINTS=read-dir=return cargo run
//!    Compiling failpointtest v0.1.0 (/home/brian/pingcap/failpointtest)
//!     Finished dev [unoptimized + debuginfo] target(s) in 2.38s
//!      Running `target/debug/failpointtest`
//! Error: Custom { kind: PermissionDenied, error: StringError("error") }
//! ```
//!
//! This time, `do_fallible_work` returned the error defined in our closure,
//! which propagated all the way up and out of main, then Rust's default error
//! handler printed the error. All as expected.
//!
//! There's one other thing to understand about this closure used for early
//! return, and that's the purpose of the argument. Notice that in the previous
//! example our closure accepted an argument, but only with the placeholder `_`
//! &mdash; it didn't do anything with it.
//!
//! The purpose of this argument is to customize the return value dynamically:
//! when configuring a fail point for return, you can also provide a string
//! representing _what_ should be returned, e.g. "return(true)" or
//! "return(false)". The closure receives that string inside an `Option<String>`
//! and is responsible for converting into the proper return type.
//!
//! So here's one final variation that accepts that string and incorporates it
//! into the return value:
//!
//! ```rust
//! # #[macro_use] extern crate fail;
//! # use std::io;
//! fn do_fallible_work() -> io::Result<()> {
//!     fail_point!("read-dir", |err| {
//!         let err = err.unwrap_or("error".to_string());
//!         Err(io::Error::new(io::ErrorKind::PermissionDenied, err))
//!     });
//!     let _dir: Vec<_> = std::fs::read_dir(".")?.collect();
//!     // ... do some work on the directory ...
//!     Ok(())
//! }
//! ```
//!
//! And running it with a custom value:
//!
//! ```sh
//! $ FAILPOINTS="read-dir=return(kablooey)" cargo run
//!     Finished dev [unoptimized + debuginfo] target(s) in 0.10s
//!      Running `target/debug/failpointtest`
//! Error: Custom { kind: PermissionDenied, error: StringError("kablooey") }
//! ```
//!
//! ## Advanced usage
//!
//! That's the basics of fail points: defining them with `fail_point!`,
//! configuring them with `FAILPOINTS` and `fail::cfg`, and configuring them to
//! panic and return early. But that's not all they can do. To learn more see
//! the documentation for [`cfg`](fn.cfg.html) and
//! [`fail_point!`](macro.fail_point.html).
//!
//!
//! ## Usage considerations
//!
//! For most effective fail point usage, keep in mind the following:
//!
//!  - Enable the `no_fail` feature in your release build. This will remove all
//!    the code for individual fail points, though not the code for calls to
//!    `setup` and `teardown`.
//!  - Carefully consider complex, concurrent, non-deterministic combinations of
//!    fail points. Put test cases exercising fail points into their own test
//!    crate and protect each test case with a mutex guard.
//!  - Use self-describing fail point names.
//!  - Fail points might have the same name, in which case they take the
//!    same actions. Be careful about duplicating fail point names, either within
//!    a single crate, or across multiple crates.

#![deny(missing_docs, missing_debug_implementations)]

#[macro_use]
extern crate lazy_static;
#[macro_use]
extern crate log;
extern crate rand;

use std::collections::HashMap;
use std::env::VarError;
use std::str::FromStr;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, Condvar, Mutex, RwLock, TryLockError};
use std::time::{Duration, Instant};
use std::{env, thread};

use rand::Closed01;

/// Supported tasks.
///
/// A task is what a fail point does once one of its actions is selected. Tasks
/// are parsed from the `task[(args)]` part of an action string by
/// `Action::from_str` and carried out by `FailPoint::eval`.
#[derive(Clone, Debug, PartialEq)]
enum Task {
    /// Do nothing.
    Off,
    /// Return the value.
    ///
    /// The optional string is the text given in parentheses in the
    /// configuration; `FailPoint::eval` hands it back to the caller.
    Return(Option<String>),
    /// Sleep for some milliseconds.
    Sleep(u64),
    /// Panic with the message.
    ///
    /// Without a message, a default one naming the fail point is used.
    Panic(Option<String>),
    /// Print the message.
    ///
    /// The message is emitted through the `log` crate's `info!` macro; without
    /// a message, a default one naming the fail point is used.
    Print(Option<String>),
    /// Sleep until other action is set.
    Pause,
    /// Yield the CPU.
    Yield,
    /// Busy waiting for some milliseconds.
    Delay(u64),
}

/// One configured action of a fail point: a task together with the
/// conditions (probability and remaining budget) under which it is selected.
#[derive(Debug)]
struct Action {
    /// What to do when this action is selected.
    task: Task,
    /// Probability of being selected, as a fraction; `from_str` stores the
    /// configured `p%` as `p / 100.0` and defaults to `1.0`.
    freq: f32,
    /// Remaining number of times this action may still be selected. `None`
    /// means no limit was configured.
    count: Option<AtomicUsize>,
}

// Equality is implemented by hand because `AtomicUsize` has no `PartialEq`:
// the counters are compared by the values they currently hold.
impl PartialEq for Action {
    fn eq(&self, hs: &Action) -> bool {
        if self.task != hs.task || self.freq != hs.freq {
            return false;
        }
        match (&self.count, &hs.count) {
            // Both sides are limited: equal only when the remaining budgets match.
            (&Some(ref lhs), &Some(ref rhs)) => {
                lhs.load(Ordering::Relaxed) == rhs.load(Ordering::Relaxed)
            }
            // Both sides are unlimited.
            (&None, &None) => true,
            // A limited action never equals an unlimited one.
            _ => false,
        }
    }
}

impl Action {
    /// Creates an action that runs `task` with probability `freq` (a fraction,
    /// where `1.0` means always) at most `max_cnt` times. `None` for `max_cnt`
    /// means the action has no trigger limit.
    fn new(task: Task, freq: f32, max_cnt: Option<usize>) -> Action {
        Action {
            task,
            freq,
            count: max_cnt.map(AtomicUsize::new),
        }
    }

    /// Decides whether this action fires now.
    ///
    /// Returns a clone of the task when the action is selected, consuming one
    /// unit of the remaining count if a limit is configured. Returns `None`
    /// when the count is exhausted or the random draw exceeds `freq`.
    fn get_task(&self) -> Option<Task> {
        // Cheap early exit: an exhausted action never needs a random draw.
        if let Some(ref cnt) = self.count {
            if cnt.load(Ordering::Acquire) == 0 {
                return None;
            }
        }
        if self.freq < 1f32 {
            let Closed01(f) = rand::random::<Closed01<f32>>();
            if f > self.freq {
                return None;
            }
        }
        if let Some(ref cnt) = self.count {
            // Claim one unit of the budget. Other threads may race for the
            // same counter, so retry with the freshly observed value until
            // the decrement succeeds or the budget turns out to be spent.
            let mut current = cnt.load(Ordering::Acquire);
            loop {
                if current == 0 {
                    return None;
                }
                match cnt.compare_exchange(
                    current,
                    current - 1,
                    Ordering::AcqRel,
                    Ordering::Acquire,
                ) {
                    Ok(_) => break,
                    Err(observed) => current = observed,
                }
            }
        }
        Some(self.task.clone())
    }
}

/// Splits `s` at the first occurrence of `pattern`.
///
/// Returns the text before the separator and, when the separator is present,
/// the text after it. The separator itself is not included in either part.
/// When `pattern` does not occur, the whole input is returned with `None`.
fn partition(s: &str, pattern: char) -> (&str, Option<&str>) {
    match s.find(pattern) {
        Some(at) => {
            let rest_start = at + pattern.len_utf8();
            (&s[..at], Some(&s[rest_start..]))
        }
        None => (s, None),
    }
}

impl FromStr for Action {
    type Err = String;

    /// Parse an action.
    ///
    /// `s` should be in the format `[p%][cnt*]task[(args)]`, `p%` is the frequency,
    /// `cnt` is the max times the action can be triggered.
    ///
    /// Leading and trailing whitespace of `s` is ignored.
    ///
    /// # Errors
    ///
    /// Returns a descriptive message when the parentheses are not closed at
    /// the end of the input, when the frequency, count or timeout is not a
    /// valid number, when `sleep`/`delay` is given without a timeout, or when
    /// the task name is not recognized.
    fn from_str(s: &str) -> Result<Action, String> {
        let mut remain = s.trim();
        let mut args = None;
        // in case there is '%' in args, we need to parse it first.
        let (first, second) = partition(remain, '(');
        if let Some(second) = second {
            remain = first;
            if !second.ends_with(')') {
                return Err("parentheses do not match".to_owned());
            }
            // Strip the closing parenthesis; everything before it is the argument.
            args = Some(&second[..second.len() - 1]);
        }

        let mut frequency = 1f32;
        let (first, second) = partition(remain, '%');
        if let Some(second) = second {
            remain = second;
            let percent = first
                .parse::<f32>()
                .map_err(|e| format!("failed to parse frequency: {}", e))?;
            // The configuration is written in percent; store it as a fraction.
            frequency = percent / 100.0;
        }

        let mut max_cnt = None;
        let (first, second) = partition(remain, '*');
        if let Some(second) = second {
            remain = second;
            let cnt = first
                .parse::<usize>()
                .map_err(|e| format!("failed to parse count: {}", e))?;
            max_cnt = Some(cnt);
        }

        // Shared by `sleep` and `delay`, which both take a millisecond argument.
        let parse_timeout = || -> Result<u64, String> {
            let timeout_str = args.ok_or_else(|| "sleep require timeout".to_owned())?;
            timeout_str
                .parse::<u64>()
                .map_err(|e| format!("failed to parse timeout: {}", e))
        };

        let task = match remain {
            "off" => Task::Off,
            "return" => Task::Return(args.map(str::to_owned)),
            "sleep" => Task::Sleep(parse_timeout()?),
            "panic" => Task::Panic(args.map(str::to_owned)),
            "print" => Task::Print(args.map(str::to_owned)),
            "pause" => Task::Pause,
            "yield" => Task::Yield,
            "delay" => Task::Delay(parse_timeout()?),
            _ => return Err(format!("unrecognized command {:?}", remain)),
        };

        Ok(Action::new(task, frequency, max_cnt))
    }
}

/// Runtime state of a single named fail point.
struct FailPoint {
    /// Pause flag. `eval` sets it to `true` before waiting on a `pause` task;
    /// `set_actions` resets it to `false` to release waiting threads.
    pause: Mutex<bool>,
    /// Condition variable paired with `pause`; notified by `set_actions`.
    pause_notifier: Condvar,
    /// The configured actions, tried in order by `eval`.
    actions: RwLock<Vec<Action>>,
    /// The string the current actions were configured from, as reported by
    /// `list`.
    actions_str: RwLock<String>,
}

#[cfg_attr(feature = "cargo-clippy", allow(clippy::mutex_atomic))]
impl FailPoint {
    /// Creates a fail point with no actions, an empty actions string and the
    /// pause flag cleared.
    fn new() -> FailPoint {
        FailPoint {
            pause: Mutex::new(false),
            pause_notifier: Condvar::new(),
            actions: RwLock::default(),
            actions_str: RwLock::default(),
        }
    }

    /// Replaces the configured actions and the string they were parsed from.
    ///
    /// Spins until the write lock on `actions` can be taken. Each time the
    /// lock is unavailable, the pause flag is cleared and every thread waiting
    /// on `pause_notifier` is notified, so that threads parked in `eval` on a
    /// `pause` task can leave and release their read lock.
    ///
    /// # Panics
    ///
    /// Panics if the `actions`, `actions_str` or `pause` lock is poisoned.
    fn set_actions(&self, actions_str: &str, actions: Vec<Action>) {
        loop {
            // TODO: maybe busy waiting here.
            match self.actions.try_write() {
                // Readers still hold the lock; fall through to wake any that are paused.
                Err(TryLockError::WouldBlock) => {}
                Ok(mut guard) => {
                    *guard = actions;
                    *self.actions_str.write().unwrap() = actions_str.to_string();
                    return;
                }
                Err(e) => panic!("unexpected poison: {:?}", e),
            }
            let mut guard = self.pause.lock().unwrap();
            *guard = false;
            self.pause_notifier.notify_all();
        }
    }

    /// Evaluates the fail point: picks the first action that fires and carries
    /// out its task.
    ///
    /// Returns `Some(value)` only for a `return` task, where `value` is the
    /// optional argument from the configuration. Every other outcome,
    /// including no action firing, yields `None`.
    ///
    /// # Panics
    ///
    /// Panics when the selected task is `panic`, or when a lock is poisoned.
    #[cfg_attr(feature = "cargo-clippy", allow(clippy::option_option))]
    fn eval(&self, name: &str) -> Option<Option<String>> {
        let task = {
            // The read guard stays alive for this whole block, including the
            // wait in the `Pause` arm below.
            let actions = self.actions.read().unwrap();
            match actions.iter().filter_map(|a| a.get_task()).next() {
                Some(Task::Pause) => {
                    let mut guard = self.pause.lock().unwrap();
                    *guard = true;
                    // Re-check the flag after every wake-up; only leave once
                    // it has been cleared (which `set_actions` does).
                    loop {
                        guard = self.pause_notifier.wait(guard).unwrap();
                        if !*guard {
                            break;
                        }
                    }
                    return None;
                }
                Some(t) => t,
                None => return None,
            }
        };

        // The read lock has been released; run the task without holding it.
        match task {
            Task::Off => {}
            Task::Return(s) => return Some(s),
            Task::Sleep(t) => thread::sleep(Duration::from_millis(t)),
            Task::Panic(msg) => match msg {
                Some(ref msg) => panic!("{}", msg),
                None => panic!("failpoint {} panic", name),
            },
            Task::Print(msg) => match msg {
                Some(ref msg) => info!("{}", msg),
                None => info!("failpoint {} executed.", name),
            },
            // `Pause` always returns from inside the block above.
            Task::Pause => unreachable!(),
            Task::Yield => thread::yield_now(),
            Task::Delay(t) => {
                // Busy-wait rather than sleep: spin until the time has elapsed.
                let timer = Instant::now();
                let timeout = Duration::from_millis(t);
                while timer.elapsed() < timeout {}
            }
        }
        None
    }
}

/// Table of all configured fail points, keyed by fail point name.
#[derive(Default)]
struct FailPointRegistry {
    // TODO: remove rwlock or store *mut FailPoint
    // Fail points are held in `Arc`s so that `eval` can clone one out and
    // drop the registry lock before evaluating it.
    registry: RwLock<HashMap<String, Arc<FailPoint>>>,
}

// The single registry shared by `setup`, `teardown`, `list`, `eval`, `cfg`
// and `remove`. It starts out empty.
lazy_static! {
    static ref REGISTRY: FailPointRegistry = FailPointRegistry::default();
}

/// Set up the fail point system.
///
/// Reads the `FAILPOINTS` environment variable and configures every fail point
/// listed in it. Fail points that are not listed keep whatever configuration
/// they already have, and nothing is configured when the variable is unset.
///
/// `FAILPOINTS` has the form `failpoint=actions;...`, where `failpoint` is the
/// name of a fail point. Whitespace around each entry is ignored and empty
/// entries are skipped. The syntax of `actions` is described on the
/// [`cfg`](fn.cfg.html) function; see also the
/// [`fail_point`](macro.fail_point.html) macro.
///
/// Names in `FAILPOINTS` are not checked against the fail points that exist in
/// the program; configuring a name that is never evaluated has no effect.
///
/// Call this before running a test that uses fail points, and pair it with a
/// later call to [`teardown`](fn.teardown.html).
///
/// # Panics
///
/// Panics if `FAILPOINTS` is set but cannot be read (for example because it is
/// not valid Unicode), if an entry has no `=`, or if the actions of an entry
/// are not formatted correctly.
pub fn setup() {
    let mut registry = REGISTRY.registry.write().unwrap();
    let failpoints = match env::var("FAILPOINTS") {
        Err(VarError::NotPresent) => return,
        Err(e) => panic!("invalid failpoints: {:?}", e),
        Ok(s) => s,
    };
    let entries = failpoints
        .trim()
        .split(';')
        .map(str::trim)
        .filter(|entry| !entry.is_empty());
    for entry in entries {
        let (name, order) = partition(entry, '=');
        let order = match order {
            Some(order) => order,
            None => panic!("invalid failpoint: {:?}", entry),
        };
        if let Err(e) = set(&mut registry, name.to_owned(), order) {
            panic!("unable to configure failpoint \"{}\": {}", name, e);
        }
    }
}

/// Tear down the fail point system.
///
/// Resets every registered fail point to an empty list of actions, which also
/// wakes any thread currently paused in one, and then empties the registry.
///
/// Call this after running a test that uses fail points. When no fail point
/// is registered, this does nothing.
pub fn teardown() {
    let mut registry = REGISTRY.registry.write().unwrap();
    // Clearing the actions is what releases paused fail points.
    registry
        .values()
        .for_each(|point| point.set_actions("", Vec::new()));
    registry.clear();
}

/// Get all registered fail points.
///
/// Returns one `(name, actions)` pair per registered fail point, where
/// `actions` is the string the fail point was last configured with. The order
/// of the pairs is unspecified.
pub fn list() -> Vec<(String, String)> {
    let registry = REGISTRY.registry.read().unwrap();
    let mut points = Vec::with_capacity(registry.len());
    for (name, fp) in registry.iter() {
        let name = name.to_string();
        let actions = fp.actions_str.read().unwrap().clone();
        points.push((name, actions));
    }
    points
}

/// Evaluates the fail point called `name`; this is what `fail_point!` expands
/// to.
///
/// Returns `None` when no such fail point is registered or when it does not
/// request an early return. Otherwise `f` is called with the configured
/// return argument and its result is returned in `Some`.
#[doc(hidden)]
pub fn eval<R, F: FnOnce(Option<String>) -> R>(name: &str, f: F) -> Option<R> {
    // The read guard is a temporary of this statement, so the registry lock
    // is released before the fail point itself is evaluated.
    let point = REGISTRY.registry.read().unwrap().get(name).cloned();
    match point {
        Some(point) => point.eval(name).map(f),
        None => None,
    }
}

/// Configure the actions for a fail point at runtime.
///
/// `actions` is a chain of one or more actions written as
/// `action[->action...]`. The actions are tried in order: a later action is
/// only checked when the ones before it were not triggered.
///
/// A single action has the form `[p%][cnt*]task[(arg)]`:
///
/// - `p%` is the expected probability, in percent, that the action triggers.
/// - `cnt*` is the maximum number of times the action can trigger.
/// - `task` selects the behavior and is one of:
///   - `off`: the fail point does nothing.
///   - `return(arg)`: return early when the fail point is triggered. `arg` is
///     passed as a string to `$e` (defined via the `fail_point!` macro).
///   - `sleep(milliseconds)`: sleep for the specified time.
///   - `panic(msg)`: panic with the message.
///   - `print(msg)`: log the message, using the `log` crate, at the `info`
///     level.
///   - `pause`: sleep until other actions are set on the fail point.
///   - `yield`: yield the CPU.
///   - `delay(milliseconds)`: busy-wait for the specified time.
///
/// For example, `20%3*print(still alive!)->panic` means the fail point has a
/// 20% chance to print the message "still alive!" and an 80% chance to panic,
/// and the message is printed at most 3 times.
///
/// The `FAILPOINTS` environment variable accepts this same syntax for its fail
/// point actions.
///
/// A call to `cfg` with a particular fail point name overwrites any existing
/// actions for that fail point, including those set via the `FAILPOINTS`
/// environment variable.
///
/// # Errors
///
/// Returns a message describing the problem when `actions` cannot be parsed.
pub fn cfg<S: Into<String>>(name: S, actions: &str) -> Result<(), String> {
    // The write guard is taken first and held for the duration of the call.
    set(
        &mut REGISTRY.registry.write().unwrap(),
        name.into(),
        actions,
    )
}

/// Remove a fail point.
///
/// The fail point is taken out of the registry and its actions are cleared,
/// which wakes any thread paused in it. Removing a name that is not
/// registered does nothing.
pub fn remove<S: AsRef<str>>(name: S) {
    let mut registry = REGISTRY.registry.write().unwrap();
    let removed = registry.remove(name.as_ref());
    if let Some(point) = removed {
        // Clearing the actions is what releases paused threads.
        point.set_actions("", Vec::new());
    }
}

fn set(
    registry: &mut HashMap<String, Arc<FailPoint>>,
    name: String,
    actions: &str,
) -> Result<(), String> {
    let actions_str = actions;
    // `actions` are in the format of `failpoint[->failpoint...]`.
    let actions = try!(actions.split("->").map(Action::from_str).collect());
    // Please note that we can't figure out whether there is a failpoint named `name`,
    // so we may insert a failpoint that doesn't exist at all.
    let p = registry
        .entry(name)
        .or_insert_with(|| Arc::new(FailPoint::new()));
    p.set_actions(actions_str, actions);
    Ok(())
}

/// Define a fail point.
///
/// The `fail_point!` macro has three forms, and they all take a name as the
/// first argument. The simplest form takes only a name and is suitable for
/// executing most fail point behavior, including panicking, but not for early
/// return or conditional execution based on a local flag.
///
/// The three forms of fail points look as follows.
///
/// 1. A basic fail point:
///
/// ```rust
/// # #[macro_use] extern crate fail;
/// fn function_return_unit() {
///     fail_point!("fail-point-1");
/// }
/// ```
///
/// This form of fail point can be configured to panic, print, sleep, pause, etc., but
/// not to return from the function early.
///
/// 2. A fail point that may return early:
///
/// ```rust
/// # #[macro_use] extern crate fail;
/// fn function_return_value() -> u64 {
///     fail_point!("fail-point-2", |r| r.map_or(2, |e| e.parse().unwrap()));
///     0
/// }
/// ```
///
/// This form of fail point can additionally be configured to return early from
/// the enclosing function. It accepts a closure, which itself accepts an
/// `Option<String>`, and is expected to transform that argument into the early
/// return value. The argument string is sourced from the fail point
/// configuration string. For example configuring this "fail-point-2" as
/// "return(100)" will execute the fail point closure, passing it a `Some` value
/// containing a `String` equal to "100"; the closure then parses it into the
/// return value.
///
/// 3. A fail point with conditional execution:
///
/// ```rust
/// # #[macro_use] extern crate fail;
/// fn function_conditional(enable: bool) {
///     fail_point!("fail-point-3", enable, |_| {});
/// }
/// ```
///
/// In this final form, the second argument is a local boolean expression that
/// must evaluate to `true` before the fail point is evaluated. The third
/// argument is again an early-return closure.
///
/// The three macro arguments (or "designators") are called `$name`, `$cond`,
/// and `$e`. `$name` must be `&str`, `$cond` must be a boolean expression,
/// and `$e` must be a function or closure that accepts an `Option<String>` and
/// returns the same type as the enclosing function.
///
/// # Panics
///
/// The basic (name-only) form hands `eval` a closure that panics with
/// `Return is not supported for the fail point "<name>"` when it is invoked,
/// which is what happens if such a fail point is configured to return.
///
/// For more examples see the [crate documentation](index.html). For more
/// information about controlling fail points see the [`cfg`](fn.cfg.html)
/// function.
#[macro_export]
#[cfg(not(feature = "no_fail"))]
macro_rules! fail_point {
    // Basic form: evaluate the fail point; an early return is not possible
    // here, so the return closure panics if it is ever called.
    ($name:expr) => {{
        $crate::eval($name, |_| {
            panic!("Return is not supported for the fail point \"{}\"", $name);
        });
    }};
    // Early-return form: when evaluation yields a value, return it from the
    // enclosing function.
    ($name:expr, $e:expr) => {{
        if let Some(res) = $crate::eval($name, $e) {
            return res;
        }
    }};
    // Conditional form: identical to the early-return form, but only evaluated
    // when `$cond` holds. The check is expanded inline through `$crate::eval`
    // instead of re-invoking `fail_point!` by its bare name, so the expansion
    // does not depend on the macro being in scope under that name at the call
    // site (e.g. when invoked by path as `fail::fail_point!(..)`).
    ($name:expr, $cond:expr, $e:expr) => {{
        if $cond {
            if let Some(res) = $crate::eval($name, $e) {
                return res;
            }
        }
    }};
}

/// No-op variant of `fail_point!`, compiled in when the `no_fail` feature is
/// enabled.
///
/// All three invocation forms are accepted and each one expands to an empty
/// block; none of the arguments appear in the expansion.
#[macro_export]
#[cfg(feature = "no_fail")]
macro_rules! fail_point {
    // Basic form: name only.
    ($name:expr) => {{}};
    // Early-return form: name and return closure.
    ($name:expr, $e:expr) => {{}};
    // Conditional form: name, condition and return closure.
    ($name:expr, $cond:expr, $e:expr) => {{}};
}

#[cfg(test)]
mod tests {
    use super::*;

    use std::sync::*;

    use log::*;

    /// An `Off` task yields nothing when the fail point is evaluated.
    #[test]
    fn test_off() {
        let fp = FailPoint::new();
        let actions = vec![Action::new(Task::Off, 1.0, None)];
        fp.set_actions("", actions);
        assert!(fp.eval("test_fail_point_off").is_none());
    }

    /// A `Return` task hands its optional payload back to the caller.
    #[test]
    fn test_return() {
        let fp = FailPoint::new();

        // Without a payload the inner value is `None`.
        fp.set_actions("", vec![Action::new(Task::Return(None), 1.0, None)]);
        assert_eq!(fp.eval("test_fail_point_return"), Some(None));

        // With a payload the same string comes back.
        let payload = Some(String::from("test"));
        let actions = vec![Action::new(Task::Return(payload.clone()), 1.0, None)];
        fp.set_actions("", actions);
        assert_eq!(fp.eval("test_fail_point_return"), Some(payload));
    }

    /// A `Sleep(1000)` task keeps the evaluating thread busy for over a second.
    #[test]
    fn test_sleep() {
        let fp = FailPoint::new();
        let started = Instant::now();
        fp.set_actions("", vec![Action::new(Task::Sleep(1000), 1.0, None)]);
        assert!(fp.eval("test_fail_point_sleep").is_none());
        let waited = started.elapsed();
        assert!(waited > Duration::from_millis(1000));
    }

    /// A `Panic` task panics on evaluation.
    #[test]
    #[should_panic]
    fn test_panic() {
        let fp = FailPoint::new();
        fp.set_actions("", vec![Action::new(Task::Panic(None), 1.0, None)]);
        fp.eval("test_fail_point_panic");
    }

    /// A `Print` task without a message logs the default text, which is
    /// captured here through a custom logger.
    #[test]
    fn test_print() {
        // Logger that stores every formatted record in a shared buffer.
        struct LogCollector(Arc<Mutex<Vec<String>>>);
        impl Log for LogCollector {
            fn enabled(&self, _: &LogMetadata) -> bool {
                true
            }
            fn log(&self, record: &LogRecord) {
                self.0.lock().unwrap().push(record.args().to_string());
            }
        }

        let captured: Arc<Mutex<Vec<String>>> = Arc::new(Mutex::new(Vec::new()));
        let sink = LogCollector(Arc::clone(&captured));
        log::set_logger(|max_level| {
            max_level.set(LogLevelFilter::Info);
            Box::new(sink)
        })
        .unwrap();

        let fp = FailPoint::new();
        fp.set_actions("", vec![Action::new(Task::Print(None), 1.0, None)]);
        assert!(fp.eval("test_fail_point_print").is_none());

        let last = captured.lock().unwrap().pop().unwrap();
        assert_eq!(last, "failpoint test_fail_point_print executed.");
    }

    /// A `Pause` task blocks the evaluating thread until the actions are
    /// replaced.
    #[test]
    fn test_pause() {
        let fp = Arc::new(FailPoint::new());
        fp.set_actions("", vec![Action::new(Task::Pause, 1.0, None)]);

        let worker_fp = Arc::clone(&fp);
        let (done_tx, done_rx) = mpsc::channel();
        thread::spawn(move || {
            assert_eq!(worker_fp.eval("test_fail_point_pause"), None);
            done_tx.send(()).unwrap();
        });

        // While paused, the worker must not report completion.
        let while_paused = done_rx.recv_timeout(Duration::from_secs(1));
        assert!(while_paused.is_err());

        // Switching the fail point off releases the worker.
        fp.set_actions("", vec![Action::new(Task::Off, 1.0, None)]);
        done_rx.recv_timeout(Duration::from_secs(1)).unwrap();
    }

    /// A `Yield` task yields nothing when the fail point is evaluated.
    #[test]
    fn test_yield() {
        let fp = FailPoint::new();
        let actions = vec![Action::new(Task::Yield, 1.0, None)];
        fp.set_actions("", actions);
        assert!(fp.eval("test_fail_point_yield").is_none());
    }

    /// A `Delay(1000)` task keeps the evaluating thread busy for over a second.
    #[test]
    fn test_delay() {
        let fp = FailPoint::new();
        let started = Instant::now();
        fp.set_actions("", vec![Action::new(Task::Delay(1000), 1.0, None)]);
        assert!(fp.eval("test_fail_point_delay").is_none());
        let waited = started.elapsed();
        assert!(waited > Duration::from_millis(1000));
    }

    /// With frequency 0.8 and a count of 100, the action fires exactly 100
    /// times over roughly 100 / 0.8 evaluations and never again afterwards.
    #[test]
    fn test_frequency_and_count() {
        let fp = FailPoint::new();
        fp.set_actions("", vec![Action::new(Task::Return(None), 0.8, Some(100))]);

        let mut hits = 0;
        let mut attempts = 0f64;
        while hits < 100 {
            attempts += 1.0;
            if fp.eval("test_fail_point_frequency").is_some() {
                hits += 1;
            }
        }

        let lower = 100.0 / 0.9;
        let upper = 100.0 / 0.7;
        assert!(lower < attempts && attempts < upper, "{}", attempts);

        // The count is used up, so further evaluations must not fire.
        for _ in 0..attempts as u64 {
            assert!(fp.eval("test_fail_point_frequency").is_none());
        }
    }

    /// Checks the action expression parser against accepted and rejected
    /// inputs.
    #[test]
    fn test_parse() {
        // (expression, expected parse result)
        let accepted = vec![
            ("return", Action::new(Task::Return(None), 1.0, None)),
            (
                "return(64)",
                Action::new(Task::Return(Some(String::from("64"))), 1.0, None),
            ),
            ("5*return", Action::new(Task::Return(None), 1.0, Some(5))),
            ("25%return", Action::new(Task::Return(None), 0.25, None)),
            (
                "125%2*return",
                Action::new(Task::Return(None), 1.25, Some(2)),
            ),
            (
                "return(2%5)",
                Action::new(Task::Return(Some(String::from("2%5"))), 1.0, None),
            ),
            ("125%2*off", Action::new(Task::Off, 1.25, Some(2))),
            (
                "125%2*sleep(100)",
                Action::new(Task::Sleep(100), 1.25, Some(2)),
            ),
            (" 125%2*off ", Action::new(Task::Off, 1.25, Some(2))),
            ("125%2*panic", Action::new(Task::Panic(None), 1.25, Some(2))),
            (
                "125%2*panic(msg)",
                Action::new(Task::Panic(Some(String::from("msg"))), 1.25, Some(2)),
            ),
            ("125%2*print", Action::new(Task::Print(None), 1.25, Some(2))),
            (
                "125%2*print(msg)",
                Action::new(Task::Print(Some(String::from("msg"))), 1.25, Some(2)),
            ),
            ("125%2*pause", Action::new(Task::Pause, 1.25, Some(2))),
            ("125%2*yield", Action::new(Task::Yield, 1.25, Some(2))),
            ("125%2*delay(2)", Action::new(Task::Delay(2), 1.25, Some(2))),
        ];
        for (input, expected) in accepted {
            let parsed = input.parse::<Action>().unwrap();
            assert_eq!(parsed, expected);
        }

        let rejected = vec![
            "delay",
            "sleep",
            "Return",
            "ab%return",
            "ab*return",
            "return(msg",
            "unknown",
        ];
        for input in rejected {
            let outcome = input.parse::<Action>();
            assert!(outcome.is_err());
        }
    }

    // This would normally be an integration test, but calling `teardown` may
    // also affect other cases such as `test_pause`, so it is kept here.
    #[test]
    fn test_setup_and_teardown() {
        let returning = || {
            fail_point!("setup_and_teardown1", |_| 1);
            0
        };
        let pausing = || {
            fail_point!("setup_and_teardown2", |_| 2);
            0
        };

        env::set_var(
            "FAILPOINTS",
            "setup_and_teardown1=return;setup_and_teardown2=pause;",
        );
        setup();
        assert_eq!(returning(), 1);

        let (result_tx, result_rx) = mpsc::channel();
        thread::spawn(move || {
            result_tx.send(pausing()).unwrap();
        });
        // The second fail point is paused, so nothing arrives yet.
        let while_paused = result_rx.recv_timeout(Duration::from_millis(500));
        assert!(while_paused.is_err());

        teardown();
        // After teardown the paused closure finishes normally and the first
        // fail point no longer returns early.
        let resumed = result_rx.recv_timeout(Duration::from_millis(500)).unwrap();
        assert_eq!(resumed, 0);
        assert_eq!(returning(), 0);
    }
}