Skip to main content

dev_flaky/
lib.rs

1//! # dev-flaky
2//!
3//! Flaky-test detection for Rust. Part of the `dev-*` verification
4//! suite.
5//!
6//! Runs your test suite N times and tracks each test's pass/fail
7//! history. Stable tests pass every iteration. Flaky tests fail
8//! sometimes for no apparent reason. Broken tests fail every time.
9//!
10//! After the run every test is classified and assigned a reliability
11//! percentage in `[0, 100]`, then emitted as a
12//! [`dev_report::Report`].
13//!
14//! ## Quick example
15//!
16//! ```no_run
17//! use dev_flaky::FlakyRun;
18//!
19//! let run = FlakyRun::new("my-crate", "0.1.0").iterations(20);
20//! let result = run.execute().unwrap();
21//! let report = result.into_report();
22//! ```
23//!
24//! ## Classification
25//!
26//! | Pass count | Fail count | Classification | Verdict |
27//! |------------|------------|----------------|---------|
28//! | `> 0`      | `0`        | Stable         | Pass    |
29//! | `> 0`      | `> 0`      | Flaky          | Warn    |
30//! | `0`        | `> 0`      | Broken         | Fail    |
31//!
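//! A `reliability_threshold(pct)` builder is also available. Note that
//! any test with at least one failure is already classified as flaky
//! or broken, so a 99%-passing test is flagged with or without it; the
//! threshold is only consulted for records with no recorded failures.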
35
36#![cfg_attr(docsrs, feature(doc_cfg))]
37#![warn(missing_docs)]
38#![warn(rust_2018_idioms)]
39
40use std::path::PathBuf;
41
42use dev_report::{CheckResult, Evidence, Report, Severity};
43use serde::{Deserialize, Serialize};
44
45mod producer;
46mod runner;
47
48pub use producer::FlakyProducer;
49
50// ---------------------------------------------------------------------------
51// Classification
52// ---------------------------------------------------------------------------
53
54/// How a test was classified after the repeated run.
55#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
56#[serde(rename_all = "lowercase")]
57pub enum Classification {
58    /// Every iteration passed.
59    Stable,
60    /// Mixed pass/fail history.
61    Flaky,
62    /// Every iteration failed (or never passed).
63    Broken,
64}
65
66impl Classification {
67    /// `dev-report::Severity` mapped from this classification, where
68    /// applicable. `Stable` has no severity.
69    pub fn severity(self) -> Option<Severity> {
70        match self {
71            Self::Stable => None,
72            Self::Flaky => Some(Severity::Warning),
73            Self::Broken => Some(Severity::Error),
74        }
75    }
76
77    /// Stable lowercase label used in `CheckResult` tags / names.
78    pub fn label(self) -> &'static str {
79        match self {
80            Self::Stable => "stable",
81            Self::Flaky => "flaky",
82            Self::Broken => "broken",
83        }
84    }
85}
86
87// ---------------------------------------------------------------------------
88// FlakyRun
89// ---------------------------------------------------------------------------
90
91/// Configuration for a flaky-test detection run.
92///
93/// # Example
94///
95/// ```no_run
96/// use dev_flaky::FlakyRun;
97///
98/// let run = FlakyRun::new("my-crate", "0.1.0")
99///     .iterations(20)
100///     .workspace()
101///     .allow("known_flaky::flaky_under_load")
102///     .reliability_threshold(99.0);
103///
104/// let _result = run.execute().unwrap();
105/// ```
106#[derive(Debug, Clone)]
107pub struct FlakyRun {
108    name: String,
109    version: String,
110    iterations: u32,
111    workdir: Option<PathBuf>,
112    workspace: bool,
113    features: Option<String>,
114    test_filter: Option<String>,
115    allow_list: Vec<String>,
116    reliability_threshold_pct: Option<f64>,
117}
118
119impl FlakyRun {
120    /// Begin a new flaky-test run. Defaults to 10 iterations.
121    ///
122    /// `name` and `version` are descriptive — they identify the
123    /// subject in the produced `Report`.
124    pub fn new(name: impl Into<String>, version: impl Into<String>) -> Self {
125        Self {
126            name: name.into(),
127            version: version.into(),
128            iterations: 10,
129            workdir: None,
130            workspace: false,
131            features: None,
132            test_filter: None,
133            allow_list: Vec::new(),
134            reliability_threshold_pct: None,
135        }
136    }
137
138    /// Set how many iterations to run. Clamped to a minimum of 2 —
139    /// below 2, the stable / flaky distinction is meaningless.
140    pub fn iterations(mut self, n: u32) -> Self {
141        self.iterations = n.max(2);
142        self
143    }
144
145    /// Configured iteration count.
146    pub fn iteration_count(&self) -> u32 {
147        self.iterations
148    }
149
150    /// Run `cargo test` from `dir` instead of the current directory.
151    pub fn in_dir(mut self, dir: impl Into<PathBuf>) -> Self {
152        self.workdir = Some(dir.into());
153        self
154    }
155
156    /// Pass `--workspace` to every `cargo test` invocation.
157    pub fn workspace(mut self) -> Self {
158        self.workspace = true;
159        self
160    }
161
162    /// Pass `--features <list>` to every `cargo test` invocation.
163    pub fn features(mut self, list: impl Into<String>) -> Self {
164        self.features = Some(list.into());
165        self
166    }
167
168    /// Restrict every iteration to tests whose name contains the given
169    /// substring (passed as the libtest positional filter argument).
170    pub fn test_filter(mut self, substring: impl Into<String>) -> Self {
171        self.test_filter = Some(substring.into());
172        self
173    }
174
175    /// Suppress a known-flaky test by name. Matches the full test path
176    /// (`module::path::test_name`) as emitted by libtest.
177    pub fn allow(mut self, name: impl Into<String>) -> Self {
178        self.allow_list.push(name.into());
179        self
180    }
181
182    /// Bulk version of [`allow`](Self::allow).
183    pub fn allow_all<I, S>(mut self, names: I) -> Self
184    where
185        I: IntoIterator<Item = S>,
186        S: Into<String>,
187    {
188        self.allow_list.extend(names.into_iter().map(Into::into));
189        self
190    }
191
192    /// Classify tests with reliability *below* `pct` as flaky even
193    /// when they have zero failures across the run. The threshold is
194    /// in `[0.0, 100.0]`.
195    ///
196    /// Without this setting, a test only becomes flaky if it has at
197    /// least one failure. With `reliability_threshold(99.0)`, tests
198    /// passing fewer than 99% of iterations are flagged even if all
199    /// iterations technically "passed" (this is a no-op as written;
200    /// it lets the classification stay strict in future revisions
201    /// that account for partial-pass criteria like sub-test runs).
202    pub fn reliability_threshold(mut self, pct: f64) -> Self {
203        self.reliability_threshold_pct = Some(pct.clamp(0.0, 100.0));
204        self
205    }
206
207    /// Subject name passed in via [`new`](Self::new).
208    pub fn subject(&self) -> &str {
209        &self.name
210    }
211
212    /// Subject version passed in via [`new`](Self::new).
213    pub fn subject_version(&self) -> &str {
214        &self.version
215    }
216
217    /// Execute the run.
218    ///
219    /// Invokes `cargo test --no-fail-fast` `N` times and accumulates
220    /// per-test pass / fail / ignored counters. Subprocess failures
221    /// (no `cargo` on PATH, etc.) surface as
222    /// [`FlakyError::SubprocessFailed`]. Per-iteration test failures
223    /// are the *point* of the run — they don't error out the
224    /// `FlakyRun`.
225    pub fn execute(&self) -> Result<FlakyResult, FlakyError> {
226        runner::run(self)
227    }
228
229    pub(crate) fn workdir_path(&self) -> Option<&std::path::Path> {
230        self.workdir.as_deref()
231    }
232
233    pub(crate) fn workspace_flag(&self) -> bool {
234        self.workspace
235    }
236
237    pub(crate) fn features_flag(&self) -> Option<&str> {
238        self.features.as_deref()
239    }
240
241    pub(crate) fn test_filter_str(&self) -> Option<&str> {
242        self.test_filter.as_deref()
243    }
244
245    pub(crate) fn allow_list_view(&self) -> &[String] {
246        &self.allow_list
247    }
248
249    pub(crate) fn reliability_threshold_value(&self) -> Option<f64> {
250        self.reliability_threshold_pct
251    }
252}
253
254// ---------------------------------------------------------------------------
255// TestReliability
256// ---------------------------------------------------------------------------
257
258/// Per-test reliability record.
259#[derive(Debug, Clone, Serialize, Deserialize)]
260pub struct TestReliability {
261    /// Full test path (e.g. `crate::module::test_name`).
262    pub name: String,
263    /// Number of iterations in which this test passed.
264    pub passes: u32,
265    /// Number of iterations in which this test failed.
266    pub failures: u32,
267}
268
269impl TestReliability {
270    /// Fraction of runs that passed, in the range `[0.0, 1.0]`.
271    pub fn reliability(&self) -> f64 {
272        let total = self.passes + self.failures;
273        if total == 0 {
274            return 0.0;
275        }
276        self.passes as f64 / total as f64
277    }
278
279    /// Reliability as a percentage in `[0.0, 100.0]`.
280    pub fn reliability_pct(&self) -> f64 {
281        self.reliability() * 100.0
282    }
283
284    /// `true` if every iteration passed.
285    pub fn is_stable(&self) -> bool {
286        self.failures == 0 && self.passes > 0
287    }
288
289    /// `true` if every iteration failed (or the test never passed).
290    pub fn is_broken(&self) -> bool {
291        self.passes == 0 && self.failures > 0
292    }
293
294    /// `true` if this test had mixed pass / fail iterations.
295    pub fn is_flaky(&self) -> bool {
296        self.passes > 0 && self.failures > 0
297    }
298
299    /// Classification per REPS § 4.
300    pub fn classification(&self, threshold_pct: Option<f64>) -> Classification {
301        if self.is_broken() {
302            return Classification::Broken;
303        }
304        if self.is_flaky() {
305            return Classification::Flaky;
306        }
307        // No failures recorded. Optionally downgrade Stable to Flaky
308        // when the reliability percentage is below the configured
309        // threshold (useful for downstream callers that classify
310        // partial-pass scenarios).
311        if let Some(t) = threshold_pct {
312            if self.reliability_pct() < t {
313                return Classification::Flaky;
314            }
315        }
316        Classification::Stable
317    }
318}
319
320// ---------------------------------------------------------------------------
321// FlakyResult
322// ---------------------------------------------------------------------------
323
324/// Result of a flaky-test detection run.
325#[derive(Debug, Clone, Serialize, Deserialize)]
326pub struct FlakyResult {
327    /// Subject name.
328    pub name: String,
329    /// Subject version.
330    pub version: String,
331    /// Iterations actually completed (may be less than configured if a
332    /// subprocess error stopped the run mid-way; in 0.9.0 we always
333    /// complete the full count).
334    pub iterations: u32,
335    /// Per-test reliability records (sorted by `name` for determinism).
336    pub tests: Vec<TestReliability>,
337    /// Reliability threshold that was active when the result was
338    /// produced. Carried so `into_report` can re-derive the
339    /// classification each finding would have had at execution time.
340    #[serde(default, skip_serializing_if = "Option::is_none")]
341    pub reliability_threshold_pct: Option<f64>,
342}
343
344impl FlakyResult {
345    /// Number of tests classified as stable.
346    pub fn stable_count(&self) -> usize {
347        self.tests
348            .iter()
349            .filter(|t| t.classification(self.reliability_threshold_pct) == Classification::Stable)
350            .count()
351    }
352
353    /// Number of tests classified as flaky.
354    pub fn flaky_count(&self) -> usize {
355        self.tests
356            .iter()
357            .filter(|t| t.classification(self.reliability_threshold_pct) == Classification::Flaky)
358            .count()
359    }
360
361    /// Number of tests classified as broken.
362    pub fn broken_count(&self) -> usize {
363        self.tests
364            .iter()
365            .filter(|t| t.classification(self.reliability_threshold_pct) == Classification::Broken)
366            .count()
367    }
368
369    /// Total tests observed.
370    pub fn total_count(&self) -> usize {
371        self.tests.len()
372    }
373
374    /// Convert this result into a [`Report`].
375    ///
376    /// No observed tests → one passing `flaky::scan` check (the
377    /// subprocess succeeded but found nothing to classify, e.g.
378    /// because the project has no tests). Otherwise one check per
379    /// test, named `flaky::<test>`, with reliability percentage
380    /// attached as `Evidence::Numeric("reliability_pct", pct)`.
381    ///
382    /// `Stable` → `CheckResult::pass`. `Flaky` →
383    /// `CheckResult::warn(Severity::Warning)`. `Broken` →
384    /// `CheckResult::fail(Severity::Error)`.
385    pub fn into_report(self) -> Report {
386        let threshold = self.reliability_threshold_pct;
387        let mut report = Report::new(&self.name, &self.version).with_producer("dev-flaky");
388        if self.tests.is_empty() {
389            report.push(
390                CheckResult::pass("flaky::scan")
391                    .with_tag("flaky")
392                    .with_detail(format!(
393                        "{} iterations completed; no tests observed",
394                        self.iterations
395                    )),
396            );
397        } else {
398            for t in &self.tests {
399                let classification = t.classification(threshold);
400                let reliability_pct = t.reliability_pct();
401                let detail = format!(
402                    "{}/{} passed ({:.1}%)",
403                    t.passes,
404                    t.passes + t.failures,
405                    reliability_pct
406                );
407                let name = format!("flaky::{}", t.name);
408                let mut check = match classification {
409                    Classification::Stable => CheckResult::pass(name),
410                    Classification::Flaky => CheckResult::warn(name, Severity::Warning),
411                    Classification::Broken => CheckResult::fail(name, Severity::Error),
412                };
413                check = check
414                    .with_detail(detail)
415                    .with_tag("flaky")
416                    .with_tag(classification.label())
417                    .with_evidence(Evidence::numeric("reliability_pct", reliability_pct))
418                    .with_evidence(Evidence::numeric_int("passes", t.passes as i64))
419                    .with_evidence(Evidence::numeric_int("failures", t.failures as i64));
420                report.push(check);
421            }
422        }
423        report.finish();
424        report
425    }
426}
427
428// ---------------------------------------------------------------------------
429// FlakyError
430// ---------------------------------------------------------------------------
431
/// Failure modes of a flaky-test run itself (as opposed to individual
/// tests failing, which is reported through the result).
#[derive(Debug)]
pub enum FlakyError {
    /// `cargo test` (or the toolchain itself) is not on PATH.
    ToolNotInstalled,
    /// `cargo test` returned a fatal error unrelated to test failures
    /// (e.g. compile error, IO failure spawning the subprocess). The
    /// payload is a human-readable description.
    SubprocessFailed(String),
    /// The `cargo test` output could not be parsed. The payload is a
    /// human-readable description.
    ParseError(String),
}

impl std::fmt::Display for FlakyError {
    /// Render a one-line, human-readable message for the error; variants
    /// with a payload append it after a fixed prefix.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            FlakyError::ToolNotInstalled => f.write_str("cargo is not on PATH"),
            FlakyError::SubprocessFailed(reason) => {
                f.write_fmt(format_args!("cargo test subprocess failed: {reason}"))
            }
            FlakyError::ParseError(reason) => {
                f.write_fmt(format_args!("could not parse cargo test output: {reason}"))
            }
        }
    }
}

// No underlying source error is carried, so the default trait methods suffice.
impl std::error::Error for FlakyError {}
455
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a reliability record with the given counters.
    fn t(name: &str, passes: u32, failures: u32) -> TestReliability {
        TestReliability {
            name: name.into(),
            passes,
            failures,
        }
    }

    /// Build a 10-iteration result for subject `x` / `0.1.0` over `tests`.
    fn result_of(tests: Vec<TestReliability>, threshold: Option<f64>) -> FlakyResult {
        FlakyResult {
            name: "x".into(),
            version: "0.1.0".into(),
            iterations: 10,
            tests,
            reliability_threshold_pct: threshold,
        }
    }

    #[test]
    fn iterations_clamped_to_min_two() {
        assert_eq!(FlakyRun::new("x", "0").iterations(0).iteration_count(), 2);
        assert_eq!(FlakyRun::new("x", "0").iterations(1).iteration_count(), 2);
        assert_eq!(FlakyRun::new("x", "0").iterations(50).iteration_count(), 50);
    }

    #[test]
    fn classification_picks_stable_flaky_broken() {
        assert_eq!(t("a", 10, 0).classification(None), Classification::Stable);
        assert_eq!(t("a", 7, 3).classification(None), Classification::Flaky);
        assert_eq!(t("a", 0, 10).classification(None), Classification::Broken);
    }

    #[test]
    fn classification_threshold_keeps_fully_passing_tests_stable() {
        // A test that passed every iteration has 100% reliability, which
        // is never strictly below a threshold in [0, 100].
        let always_passes = t("a", 100, 0);
        assert_eq!(
            always_passes.classification(Some(99.0)),
            Classification::Stable
        );
        // Same for a different pass count, including the upper bound
        // (100 < 100 is false).
        let also_passes = t("near", 99, 0);
        assert_eq!(
            also_passes.classification(Some(99.0)),
            Classification::Stable
        );
        assert_eq!(
            also_passes.classification(Some(100.0)),
            Classification::Stable
        );
        // A mixed history is flaky with or without a threshold.
        assert_eq!(
            t("mixed", 7, 3).classification(Some(50.0)),
            Classification::Flaky
        );
    }

    #[test]
    fn classification_threshold_does_not_apply_to_broken() {
        let broken = t("a", 0, 10);
        assert_eq!(broken.classification(Some(50.0)), Classification::Broken);
    }

    #[test]
    fn reliability_and_pct_match() {
        let record = t("x", 7, 3);
        assert!((record.reliability() - 0.7).abs() < 1e-9);
        assert!((record.reliability_pct() - 70.0).abs() < 1e-9);
    }

    #[test]
    fn empty_record_reliability_is_zero() {
        let record = t("x", 0, 0);
        assert_eq!(record.reliability(), 0.0);
        assert!(!record.is_stable());
        assert!(!record.is_broken());
        assert!(!record.is_flaky());
    }

    #[test]
    fn classification_severity_and_label() {
        assert_eq!(Classification::Stable.severity(), None);
        assert_eq!(Classification::Flaky.severity(), Some(Severity::Warning));
        assert_eq!(Classification::Broken.severity(), Some(Severity::Error));
        assert_eq!(Classification::Stable.label(), "stable");
        assert_eq!(Classification::Flaky.label(), "flaky");
        assert_eq!(Classification::Broken.label(), "broken");
    }

    #[test]
    fn result_count_helpers() {
        let r = result_of(
            vec![
                t("stable_a", 10, 0),
                t("stable_b", 10, 0),
                t("flaky_a", 7, 3),
                t("broken", 0, 10),
            ],
            None,
        );
        assert_eq!(r.stable_count(), 2);
        assert_eq!(r.flaky_count(), 1);
        assert_eq!(r.broken_count(), 1);
        assert_eq!(r.total_count(), 4);
    }

    #[test]
    fn into_report_no_tests_passes() {
        let report = result_of(Vec::new(), None).into_report();
        assert!(report.passed());
        assert_eq!(report.checks.len(), 1);
        assert_eq!(report.checks[0].name, "flaky::scan");
    }

    #[test]
    fn into_report_emits_one_check_per_test() {
        let r = result_of(
            vec![t("stable", 10, 0), t("flaky", 7, 3), t("broken", 0, 10)],
            None,
        );
        let report = r.into_report();
        assert_eq!(report.checks.len(), 3);
        assert!(report.failed()); // broken pushes overall to Fail
    }

    #[test]
    fn report_tags_carry_classification() {
        let report = result_of(vec![t("flaky", 7, 3)], None).into_report();
        let c = &report.checks[0];
        assert!(c.has_tag("flaky"));
        assert!(c.evidence.iter().any(|e| e.label == "reliability_pct"));
    }

    #[test]
    fn result_round_trips_through_json() {
        let r = result_of(vec![t("flaky", 7, 3)], Some(95.0));
        let s = serde_json::to_string(&r).unwrap();
        let back: FlakyResult = serde_json::from_str(&s).unwrap();
        assert_eq!(back.tests.len(), 1);
        assert_eq!(back.reliability_threshold_pct, Some(95.0));
    }

    #[test]
    fn builder_chain_compiles_and_returns_set_values() {
        let r = FlakyRun::new("x", "0.1.0")
            .iterations(50)
            .workspace()
            .features("foo")
            .test_filter("integration::")
            .allow("known_flaky")
            .allow_all(["a", "b"])
            .reliability_threshold(99.0);
        assert_eq!(r.iteration_count(), 50);
        assert_eq!(r.subject(), "x");
        assert_eq!(r.subject_version(), "0.1.0");
        assert!(r.workspace_flag());
        assert_eq!(r.features_flag(), Some("foo"));
        assert_eq!(r.test_filter_str(), Some("integration::"));
        assert_eq!(r.allow_list_view().len(), 3);
        assert_eq!(r.reliability_threshold_value(), Some(99.0));
    }
}