trusty-review 0.3.3

Fast local PR-review service for trusty-tools — orchestrates LLM-backed code review
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
//! Severity-anchored, deterministic grade derivation.
//!
//! Why: the calibration run against the duetto code-review board (30 PRs)
//! revealed two systemic problems:
//!   - BLOCK was never emitted (0% detection): the model soft-pedalled critical
//!     issues to APPROVE* instead of escalating to BLOCK.
//!   - REQUEST_CHANGES leaked to APPROVE* 64% of the time: High findings were
//!     under-graded.
//!
//! The fix has two deterministic rules applied in `derive_verdict`:
//!
//! 1. LOW-CONFIDENCE OVERRIDE (checked first): if there is at least one finding,
//!    ALL findings have confidence ≤ 0.65, AND none are `High`-effort, force
//!    APPROVE — overriding any model-proposed verdict other than UNKNOWN
//!    (APPROVE*, REQUEST_CHANGES, or BLOCK) downward.  Prevents APPROVE* over-fire
//!    on clean PRs with speculative low-confidence findings.
//!
//! 2. SEVERITY FLOOR: take the stricter of (model-proposed, severity-derived):
//!
//!   | Finding set                               | Minimum floor   |
//!   |-------------------------------------------|-----------------|
//!   | Any `High` effort (critical/high sev.)    | BLOCK           |
//!   | ≥2 `Medium` effort findings               | REQUEST_CHANGES |
//!   | Exactly 1 `Medium` effort finding         | APPROVE*        |
//!   | Only `Low` effort or no findings          | APPROVE         |
//!
//!   The model can never soften a Critical or High finding below the floor.
//!
//! `Verdict::Unknown` is always preserved (pass-through) — the model has
//! signalled the diff was unassessable and no rule applies.
//!
//! What: exposes `derive_verdict` which accepts a model-proposed `Verdict` and
//! a slice of `Finding` values (each carrying an `Effort` severity proxy), then
//! returns the final verdict.  The `Effort` enum is the existing in-model
//! severity proxy:
//!
//! - `Effort::High`   → Critical or High severity finding
//! - `Effort::Medium` → Medium severity finding
//! - `Effort::Low`    → Low severity finding
//!
//! Test: `grade_critical_high_effort_yields_block`,
//! `grade_two_medium_yields_request_changes`,
//! `grade_one_medium_yields_approve_star`,
//! `grade_only_low_yields_approve`,
//! `grade_unknown_is_preserved`,
//! `grade_floor_overrides_model_approve`,
//! `grade_model_block_kept_when_no_critical_finding`,
//! `grade_low_confidence_all_medium_yields_approve`,
//! `grade_high_confidence_medium_beats_low_confidence_check`.

use tracing::debug;

use crate::models::{Effort, Finding, Verdict};

// ─── Confidence threshold ─────────────────────────────────────────────────────

/// Inclusive upper bound on `confidence` for a finding to count as advisory-only.
///
/// Why: the model sometimes emits speculative Medium-severity findings with very
/// low confidence (e.g. 0.5).  If the batch is non-empty, ALL findings fall at or
/// below this threshold, and none are High-effort, `derive_verdict` returns
/// APPROVE so we don't over-fire APPROVE* on clean PRs.
/// What: any finding with `confidence > LOW_CONFIDENCE_THRESHOLD` is treated as
/// substantive; those at or below (the comparison is `<=`) are advisory.
/// Test: `grade_low_confidence_all_medium_yields_approve`,
/// `grade_confidence_at_threshold_collapses`.
const LOW_CONFIDENCE_THRESHOLD: f32 = 0.65;

// ─── Public API ───────────────────────────────────────────────────────────────

/// Derive the final review verdict from the model's proposal and its findings.
///
/// Why: the calibration run showed the model systematically under-fires
/// (BLOCK=0%, REQUEST_CHANGES=36%).  A deterministic, severity-derived floor
/// keeps Critical/High issues from being silently softened to APPROVE*.
///
/// What: three steps, evaluated in this order:
///
/// 1. UNKNOWN PASS-THROUGH: a model-proposed `Verdict::Unknown` is returned
///    as-is; neither the override nor the floor is consulted.
///
/// 2. LOW-CONFIDENCE OVERRIDE (ceiling): when `findings` is non-empty and every
///    finding is both non-`High` effort and at or below
///    `LOW_CONFIDENCE_THRESHOLD`, the result is `Verdict::Approve` regardless of
///    what the model proposed.  A single `High`-effort finding, or a single
///    finding above the threshold, disables the override.
///
/// 3. SEVERITY FLOOR (minimum): otherwise the result is the stricter of the
///    model's proposal and `severity_floor(findings)`, so the model may escalate
///    beyond the floor but never drop below it.
///
/// Test: see module-level test list.
pub fn derive_verdict(model_proposed: Verdict, findings: &[Finding]) -> Verdict {
    // Step 1: UNKNOWN is terminal — hand it straight back.
    if model_proposed == Verdict::Unknown {
        debug!("verdict=UNKNOWN from model — preserving (diff unassessable)");
        return Verdict::Unknown;
    }

    // Step 2: the batch is pure advisory noise only when it is non-empty and
    // every finding is simultaneously non-High and at/below the confidence
    // threshold.  The `<=` comparison is deliberate: a NaN confidence compares
    // false and therefore counts as substantive.
    let advisory_only = !findings.is_empty()
        && findings
            .iter()
            .all(|f| f.effort != Effort::High && f.confidence <= LOW_CONFIDENCE_THRESHOLD);

    if advisory_only {
        debug!(
            model_verdict = %model_proposed,
            "low-confidence override: all findings ≤0.65 confidence, no High-effort → APPROVE"
        );
        return Verdict::Approve;
    }

    // Step 3: max(model, floor) under the verdict severity ordering.  Both
    // inputs are cloned because they are logged after being consumed.
    let floor = severity_floor(findings);
    let final_verdict = stricter_of(model_proposed.clone(), floor.clone());

    debug!(
        model_verdict = %model_proposed,
        severity_floor = %floor,
        final_verdict = %final_verdict,
        "grade derivation: floor={floor}, model={model_proposed}, final={final_verdict}",
    );

    final_verdict
}

// ─── Floor computation ────────────────────────────────────────────────────────

/// Map the effort distribution of `findings` to the minimum allowed verdict.
///
/// Why: this is the deterministic half of grade derivation; `derive_verdict`
/// uses the result as a lower bound on the model's own verdict.  Confidence is
/// not inspected here — the low-confidence override lives in `derive_verdict`
/// and runs before this function is called.
/// What: applies the four-tier rule set, first match wins:
///
/// 1. Any `High`-effort finding → BLOCK (Critical/High severity)
/// 2. ≥2 `Medium`-effort findings → REQUEST_CHANGES
/// 3. Exactly 1 `Medium`-effort finding → APPROVE*
/// 4. Only `Low` / no findings (including an empty slice) → APPROVE
///
/// Test: `grade_two_medium_yields_request_changes`, `grade_one_medium_yields_approve_star`.
fn severity_floor(findings: &[Finding]) -> Verdict {
    // Tier 1 short-circuits: one High-effort finding is enough to BLOCK.
    if findings.iter().any(|f| f.effort == Effort::High) {
        return Verdict::Block;
    }

    // Tiers 2–4 depend only on how many Medium-effort findings there are.
    // An empty slice counts zero and lands on APPROVE.
    let mediums = findings
        .iter()
        .filter(|f| f.effort == Effort::Medium)
        .count();

    match mediums {
        0 => Verdict::Approve,
        1 => Verdict::ApproveWithReservations,
        _ => Verdict::RequestChanges,
    }
}

// ─── Verdict ordering ─────────────────────────────────────────────────────────

/// Pick whichever of two verdicts ranks higher in severity.
///
/// Why: the severity floor is a MINIMUM, so `derive_verdict` needs
/// `max(model, floor)` — the model may escalate past the floor but never
/// undercut it.
/// What: compares the two verdicts by `verdict_ord`
/// (APPROVE < APPROVE* < REQUEST_CHANGES < BLOCK) and returns `b` only when it
/// ranks strictly higher; on a tie `a` is returned.  UNKNOWN is dealt with by
/// the caller before this function is reached.
/// Test: `grade_floor_overrides_model_approve`,
/// `grade_model_block_kept_when_no_critical_finding`.
fn stricter_of(a: Verdict, b: Verdict) -> Verdict {
    match verdict_ord(&a).cmp(&verdict_ord(&b)) {
        // `b` wins only when it is strictly more severe.
        std::cmp::Ordering::Less => b,
        std::cmp::Ordering::Equal | std::cmp::Ordering::Greater => a,
    }
}

/// Numeric severity rank of a verdict (larger means more severe).
///
/// Why: gives `stricter_of` a plain integer to compare instead of a nested
/// match over verdict pairs.
/// What: APPROVE=0, APPROVE*=1, REQUEST_CHANGES=2, BLOCK=3.  UNKNOWN maps to 4
/// so the match stays exhaustive, but `derive_verdict` returns early on UNKNOWN
/// and `severity_floor` never produces it, so that arm is not expected to run.
/// Test: covered transitively by `stricter_of` tests.
fn verdict_ord(v: &Verdict) -> u8 {
    // Listed from most to least severe; the match is exhaustive on purpose so
    // adding a variant forces this ranking to be revisited.
    match v {
        Verdict::Unknown => 4, // Should not reach this branch in normal flow.
        Verdict::Block => 3,
        Verdict::RequestChanges => 2,
        Verdict::ApproveWithReservations => 1,
        Verdict::Approve => 0,
    }
}

// ─── Unit tests ───────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::Finding;

    /// Build a `Finding` whose only varying inputs are `effort` and `confidence`.
    ///
    /// The remaining `Finding::new` arguments are fixed placeholder strings;
    /// none of the grading rules under test read them.
    fn finding(effort: Effort, confidence: f32) -> Finding {
        Finding::new("src/lib.rs", "test", "desc", "", confidence, effort)
    }

    // ── Tier 1: Critical / High ──────────────────────────────────────────────

    /// A single High-effort finding lifts a model-proposed APPROVE* to BLOCK.
    ///
    /// Why: the calibration run showed 0% BLOCK detection; flooring High-effort
    /// (critical/high severity) findings to BLOCK is the primary fix.
    /// What: model proposes APPROVE*, one High-effort finding at 0.9 → BLOCK.
    #[test]
    fn grade_critical_high_effort_yields_block() {
        let batch = [finding(Effort::High, 0.9)];
        let got = derive_verdict(Verdict::ApproveWithReservations, &batch);
        assert_eq!(got, Verdict::Block, "High-effort finding must floor to BLOCK");
    }

    /// A High-effort finding also outranks a model-proposed REQUEST_CHANGES.
    ///
    /// Why: even when the model already escalates to REQUEST_CHANGES, a
    /// Critical finding must push the verdict further to BLOCK.
    #[test]
    fn grade_high_effort_beats_request_changes() {
        let batch = [finding(Effort::High, 0.85)];
        let got = derive_verdict(Verdict::RequestChanges, &batch);
        assert_eq!(got, Verdict::Block);
    }

    // ── Tier 2: ≥2 Medium ───────────────────────────────────────────────────

    /// Two Medium findings above the confidence threshold floor to REQUEST_CHANGES.
    ///
    /// Why: the calibration run showed REQUEST_CHANGES at only 36% — this tier
    /// closes the gap for PRs with multiple real concerns.
    #[test]
    fn grade_two_medium_yields_request_changes() {
        let batch = [finding(Effort::Medium, 0.8), finding(Effort::Medium, 0.75)];
        let got = derive_verdict(Verdict::ApproveWithReservations, &batch);
        assert_eq!(got, Verdict::RequestChanges);
    }

    /// Three Medium findings land on the same REQUEST_CHANGES floor as two.
    #[test]
    fn grade_three_medium_yields_request_changes() {
        let batch = [
            finding(Effort::Medium, 0.7),
            finding(Effort::Medium, 0.7),
            finding(Effort::Medium, 0.7),
        ];
        let got = derive_verdict(Verdict::Approve, &batch);
        assert_eq!(got, Verdict::RequestChanges);
    }

    // ── Tier 3: Exactly 1 Medium ─────────────────────────────────────────────

    /// A lone Medium finding above the threshold floors to APPROVE*.
    ///
    /// Why: a single advisory concern should not block the PR but warrants noting.
    #[test]
    fn grade_one_medium_yields_approve_star() {
        let batch = [finding(Effort::Medium, 0.75)];
        let got = derive_verdict(Verdict::Approve, &batch);
        assert_eq!(got, Verdict::ApproveWithReservations);
    }

    // ── Tier 4: Only Low or no findings ─────────────────────────────────────

    /// An empty finding list leaves a model-proposed APPROVE untouched.
    #[test]
    fn grade_no_findings_yields_approve() {
        let got = derive_verdict(Verdict::Approve, &[]);
        assert_eq!(got, Verdict::Approve);
    }

    /// Low-effort findings alone never raise the floor above APPROVE.
    #[test]
    fn grade_only_low_yields_approve() {
        let batch = [finding(Effort::Low, 0.9), finding(Effort::Low, 0.7)];
        let got = derive_verdict(Verdict::Approve, &batch);
        assert_eq!(got, Verdict::Approve);
    }

    // ── UNKNOWN preservation ─────────────────────────────────────────────────

    /// A model-proposed UNKNOWN survives even when findings are present.
    ///
    /// Why: UNKNOWN signals "model could not assess", not "clean PR"; it must
    /// not be collapsed to APPROVE.
    #[test]
    fn grade_unknown_is_preserved() {
        let batch = [finding(Effort::Low, 0.9)];
        let got = derive_verdict(Verdict::Unknown, &batch);
        assert_eq!(got, Verdict::Unknown, "UNKNOWN must be preserved");
    }

    /// A model-proposed UNKNOWN also survives an empty finding list.
    #[test]
    fn grade_unknown_preserved_with_no_findings() {
        let got = derive_verdict(Verdict::Unknown, &[]);
        assert_eq!(got, Verdict::Unknown);
    }

    // ── Floor takes the stricter ─────────────────────────────────────────────

    /// The severity floor wins over a model-proposed APPROVE when a finding is High.
    ///
    /// Why: this is the core "stricter floor" invariant — the model cannot
    /// soften a High finding by proposing APPROVE.
    #[test]
    fn grade_floor_overrides_model_approve() {
        let batch = [finding(Effort::High, 0.95)];
        let got = derive_verdict(Verdict::Approve, &batch);
        assert_eq!(
            got,
            Verdict::Block,
            "severity floor must override model-proposed APPROVE"
        );
    }

    /// A model-proposed BLOCK stands when the floor is lower (model knows more).
    ///
    /// Why: the floor is a minimum; the model can still escalate beyond it.
    /// What: one Medium finding at 0.9 gives an APPROVE* floor; BLOCK is kept.
    #[test]
    fn grade_model_block_kept_when_no_critical_finding() {
        let batch = [finding(Effort::Medium, 0.9)];
        let got = derive_verdict(Verdict::Block, &batch);
        assert_eq!(
            got,
            Verdict::Block,
            "model BLOCK must not be downgraded by floor"
        );
    }

    /// A model-proposed REQUEST_CHANGES stands when the floor is only APPROVE.
    ///
    /// Why: the model may have identified a logic bug that the effort heuristic
    /// grades as Low; the model's escalation should stand.
    #[test]
    fn grade_model_request_changes_preserved_over_lower_floor() {
        let batch = [finding(Effort::Low, 0.9)];
        let got = derive_verdict(Verdict::RequestChanges, &batch);
        assert_eq!(
            got,
            Verdict::RequestChanges,
            "model REQUEST_CHANGES must not be downgraded to APPROVE"
        );
    }

    // ── Low-confidence collapse ─────────────────────────────────────────────

    /// Medium findings that are all ≤ 0.65 confidence yield APPROVE, not APPROVE*.
    ///
    /// Why: Fix 4 — curb APPROVE* over-fire on clean PRs.  When every finding
    /// is low-confidence and none is High-effort, the batch is treated as noise
    /// and the result collapses to APPROVE.
    #[test]
    fn grade_low_confidence_all_medium_yields_approve() {
        let batch = [finding(Effort::Medium, 0.6), finding(Effort::Medium, 0.55)];
        let got = derive_verdict(Verdict::ApproveWithReservations, &batch);
        assert_eq!(
            got,
            Verdict::Approve,
            "all-low-confidence advisory batch must not fire APPROVE*"
        );
    }

    /// A Medium finding sitting exactly on the threshold is still advisory.
    ///
    /// Why: boundary condition — confidence = 0.65 satisfies "≤ threshold", so
    /// the collapse applies.
    #[test]
    fn grade_confidence_at_threshold_collapses() {
        let batch = [finding(Effort::Medium, 0.65)];
        let got = derive_verdict(Verdict::ApproveWithReservations, &batch);
        assert_eq!(
            got,
            Verdict::Approve,
            "confidence at threshold must collapse"
        );
    }

    /// A Medium finding just above the threshold produces APPROVE*.
    ///
    /// Why: above the threshold the finding is substantive, so the override
    /// does not fire and the tier-3 floor applies.
    #[test]
    fn grade_high_confidence_medium_beats_low_confidence_check() {
        let batch = [finding(Effort::Medium, 0.66)];
        let got = derive_verdict(Verdict::Approve, &batch);
        assert_eq!(
            got,
            Verdict::ApproveWithReservations,
            "confidence above threshold must yield APPROVE*"
        );
    }

    /// Mixed confidence: one Medium above and one below the threshold still
    /// floors to REQUEST_CHANGES, because not ALL findings are low-confidence.
    #[test]
    fn grade_mixed_confidence_two_medium_not_collapsed() {
        let batch = [finding(Effort::Medium, 0.8), finding(Effort::Medium, 0.5)];
        let got = derive_verdict(Verdict::Approve, &batch);
        assert_eq!(
            got,
            Verdict::RequestChanges,
            "mixed-confidence Medium findings must not collapse"
        );
    }

    // ── Compile-break BLOCK rule (severity-anchor path) ──────────────────────

    /// A compile-break finding (deleted symbol with remaining references) that
    /// arrives as High effort is carried through to BLOCK by the tier rules.
    ///
    /// Why: Fix 3 — compile-break detection.  The system prompt now instructs the
    /// model to assign Critical severity (→ Effort::High) to removed-symbol
    /// compile breaks.  This test confirms the tier rules finish the job.
    #[test]
    fn grade_compile_break_high_effort_flows_to_block() {
        // The model is assumed to answer APPROVE* here (it tends to under-fire);
        // the High-effort finding alone must be enough to floor the verdict.
        let batch = [finding(Effort::High, 0.95)];
        let got = derive_verdict(Verdict::ApproveWithReservations, &batch);
        assert_eq!(
            got,
            Verdict::Block,
            "compile-break (High effort) must escalate to BLOCK"
        );
    }
}