trusty-review 0.4.0

LLM-backed code review service — reviews GitHub PRs and unified diffs via AWS Bedrock or OpenRouter
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
//! Cross-period synthesis for the contributor-profile pipeline (#566).
//!
//! Why: per-period findings (from `batch_reviewer`) are independent snapshots;
//! synthesis merges them across all periods to identify recurrence, resolution,
//! and worsening trends — the core longitudinal signal of the profile feature.
//! What: `Synthesizer` performs two steps:
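//!   1. Deterministic dedup + trend-tag assignment: findings whose descriptions
//!      have Jaccard token-set similarity ≥ 0.7 are clustered together, and each
//!      cluster is tagged New/Recurring/Worsening/Resolved from the periods it
//!      covers.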
//!   2. One LLM "profiler" call that receives the deduped finding list,
//!      period quality scores, and frequency counts, and returns strengths,
//!      recurring_weaknesses, improvement_trajectory, and a narrative.
//!
//! Fail-safe: if the LLM call fails, the profile is still emitted with the
//! deterministic parts populated and an empty/placeholder narrative.
//!
//! Test: `tests` module below covers dedup, trend-tag assignment, quality_trend
//! derivation, trajectory from slope, and fail-safe narrative.

use std::sync::Arc;
use std::time::Instant;

use serde::Deserialize;
use tracing::{debug, warn};

use crate::llm::{LlmProvider, LlmResponse};
use crate::profile::types::{
    ContributorProfile, LongitudinalFinding, PeriodBatch, Trajectory, TrendTag,
};

pub use synthesizer_prompt::build_synthesizer_prompt;

#[path = "synthesizer_prompt.rs"]
mod synthesizer_prompt;

// ─── Prompt constants ─────────────────────────────────────────────────────────

/// Sampling temperature intended for the synthesizer LLM call.
///
/// NOTE(review): not referenced anywhere in this file — presumably read by the
/// prompt builder in `synthesizer_prompt.rs`; confirm before changing/removing.
const SYNTHESIZER_TEMPERATURE: f32 = 0.3;
/// Output-token cap intended for the synthesizer LLM call.
///
/// NOTE(review): not referenced anywhere in this file — presumably read by the
/// prompt builder in `synthesizer_prompt.rs`; confirm before changing/removing.
const SYNTHESIZER_MAX_TOKENS: u32 = 2048;

/// Jaccard similarity threshold at or above which two findings are considered
/// the same conceptual issue (the comparison in `assign_trend_tags` is `>=`,
/// so a similarity of exactly 0.7 counts as a match).
const JACCARD_THRESHOLD: f64 = 0.7;

// ─── Synthesizer ──────────────────────────────────────────────────────────────

/// Cross-period finding synthesiser.
///
/// Why: individual period findings need to be merged and annotated with trend
/// tags before a coherent longitudinal narrative can be written.
/// What: holds an `LlmProvider` for the final narrative pass.  `synthesize`
/// performs deterministic dedup + trend-tag assignment then calls the LLM
/// for the narrative.  If the LLM call fails, a fallback narrative is used.
/// Test: see `tests` module below.
pub struct Synthesizer {
    /// Provider on which `synthesize` issues its single `complete` call.
    llm: Arc<dyn LlmProvider>,
    /// Model slug passed to `build_synthesizer_prompt` when building that call.
    model: String,
}

impl Synthesizer {
    /// Build a `Synthesizer` around an injected provider and a model slug.
    ///
    /// Why: taking the provider as a trait object lets tests substitute a fake.
    /// What: keeps the provider handle and an owned copy of the model slug for
    /// the narrative call made by `synthesize`.
    /// Test: every synthesizer test constructs one through this function.
    pub fn new(llm: Arc<dyn LlmProvider>, model: impl Into<String>) -> Self {
        let model = model.into();
        Self { llm, model }
    }

    /// Turn per-period findings into a finished `ContributorProfile`.
    ///
    /// Why: callers of the profile pipeline want a single value that carries
    /// both the deterministic trend analysis and the LLM-written narrative.
    /// What:
    ///  - fills `profile.quality_trend` with `(period_label, quality_score)`
    ///    pairs taken from `periods`, in the order given;
    ///  - derives `improvement_trajectory` from that series;
    ///  - flattens `all_period_findings`, tags every finding via
    ///    `assign_trend_tags`, and stores the result in `profile.all_findings`;
    ///  - sends one request to the LLM. On success the token/cost/latency
    ///    telemetry is accumulated into `profile.token_cost` and the response
    ///    is applied to the profile. On failure a warning is logged and the
    ///    fallback narrative is written; no telemetry is recorded.
    ///
    /// The returned profile is always populated with the deterministic parts,
    /// whether or not the LLM call succeeded.
    ///
    /// Test: `tests::synthesizer_dedup_assigns_recurring`,
    /// `tests::synthesizer_quality_trend_populated`,
    /// `tests::synthesizer_fail_safe_narrative`.
    pub async fn synthesize(
        &self,
        mut profile: ContributorProfile,
        all_period_findings: Vec<Vec<LongitudinalFinding>>,
        periods: &[PeriodBatch],
    ) -> ContributorProfile {
        // Deterministic part: quality series and the trajectory derived from it.
        profile.quality_trend = periods
            .iter()
            .map(|batch| {
                let stats = &batch.stats;
                (stats.period_label.clone(), stats.quality_score)
            })
            .collect();
        profile.improvement_trajectory = derive_trajectory(&profile.quality_trend);

        // Deterministic part: one flat, trend-tagged finding list.
        profile.all_findings =
            assign_trend_tags(all_period_findings.into_iter().flatten().collect());

        // Narrative pass. The timer starts before the prompt is built, so the
        // recorded latency includes prompt construction.
        let started = Instant::now();
        let request = build_synthesizer_prompt(&profile, &self.model);
        let response = match self.llm.complete(request).await {
            Ok(response) => response,
            Err(e) => {
                warn!(
                    error = %e,
                    "synthesizer: LLM call failed — using deterministic fallback (fail-safe)"
                );
                apply_fallback_narrative(&mut profile);
                return profile;
            }
        };

        let latency = started.elapsed().as_millis() as u64;
        profile.token_cost.accumulate(
            response.input_tokens as u64,
            response.output_tokens as u64,
            response.cost_usd,
            latency,
        );
        debug!(
            input_tokens = response.input_tokens,
            output_tokens = response.output_tokens,
            "synthesizer: LLM call complete"
        );
        apply_llm_synthesis(&mut profile, &response);

        profile
    }
}

// ─── Jaccard dedup + trend-tag assignment ─────────────────────────────────────

/// Tag every finding in a flat, multi-period list with a `TrendTag`.
///
/// Why: findings arrive from the per-period reviewer without a trend tag; the
/// longitudinal signal only appears once similar findings from different
/// periods are grouped together.
/// What:
///  - The "latest period" is the last distinct `period_label` in order of
///    first appearance in `findings` (so the input is expected to be ordered
///    oldest period first).
///  - Findings are clustered greedily: each not-yet-clustered finding becomes a
///    seed, and every later unclustered finding whose description has Jaccard
///    similarity ≥ `JACCARD_THRESHOLD` *with the seed* joins its cluster.
///  - Per cluster:
///      - present in the latest period and in an earlier one, and the last
///        member's confidence exceeds the first member's by more than 0.1
///        → `Worsening` (confidence stands in for severity);
///      - present in the latest period and in an earlier one → `Recurring`;
///      - present only in the latest period → `New`;
///      - absent from the latest period → `Resolved`.
///
/// Returns the same findings, in the same order, with `trend_tag` set on every
/// item.  An empty input is returned unchanged.
///
/// Test: `tests::synthesizer_dedup_assigns_recurring`,
///   `tests::synthesizer_dedup_assigns_new`,
///   `tests::synthesizer_dedup_assigns_resolved`.
pub fn assign_trend_tags(findings: Vec<LongitudinalFinding>) -> Vec<LongitudinalFinding> {
    if findings.is_empty() {
        return findings;
    }

    // Distinct period labels in order of first appearance; the final one is
    // treated as the latest period.
    let mut seen_labels: Vec<&str> = Vec::new();
    for finding in &findings {
        let label = finding.period_label.as_str();
        if !seen_labels.contains(&label) {
            seen_labels.push(label);
        }
    }
    let latest: String = seen_labels
        .last()
        .map(|label| (*label).to_owned())
        .unwrap_or_default();

    // Greedy clustering against each cluster's seed description. Members are
    // stored as ascending indices into `findings`.
    let total = findings.len();
    let mut claimed = vec![false; total];
    let mut clusters: Vec<Vec<usize>> = Vec::new();
    for seed in 0..total {
        if claimed[seed] {
            continue;
        }
        claimed[seed] = true;
        let seed_text = &findings[seed].finding.description;
        let mut members = vec![seed];
        for candidate in (seed + 1)..total {
            if !claimed[candidate]
                && jaccard_similarity(seed_text, &findings[candidate].finding.description)
                    >= JACCARD_THRESHOLD
            {
                claimed[candidate] = true;
                members.push(candidate);
            }
        }
        clusters.push(members);
    }

    // Decide one tag per cluster from its period coverage, then stamp it on
    // every member.
    let mut tagged = findings;
    for members in &clusters {
        let in_latest = members
            .iter()
            .any(|&i| tagged[i].period_label.as_str() == latest.as_str());
        let in_earlier = members
            .iter()
            .any(|&i| tagged[i].period_label.as_str() != latest.as_str());

        // Confidence is used as a proxy for severity, since findings from
        // different providers may not carry an explicit severity field.
        let worsening = in_latest && in_earlier && members.len() >= 2 && {
            let first_conf = tagged[members[0]].finding.confidence;
            let last_conf = tagged[members[members.len() - 1]].finding.confidence;
            last_conf > first_conf + 0.1
        };

        let tag = match (worsening, in_latest, in_earlier) {
            (true, _, _) => TrendTag::Worsening,
            (false, true, true) => TrendTag::Recurring,
            (false, true, false) => TrendTag::New,
            (false, false, _) => TrendTag::Resolved,
        };

        for &i in members {
            tagged[i].trend_tag = Some(tag.clone());
        }
    }

    tagged
}

/// Compute Jaccard similarity between two description strings.
///
/// Why: a simple, dependency-free metric for finding-description similarity
/// that works well for short technical phrases.
/// What: tokenises each string into lowercase alphanumeric words (see
/// `tokenize`), deduplicates the words into sets, and returns
/// |intersection| / |union| of the two sets.  The result is symmetric in its
/// arguments and always lies in `[0.0, 1.0]`.
/// Returns 1.0 when neither input contains a token (two empty descriptions are
/// treated as identical) and 0.0 when exactly one of them contains none.
/// Test: `tests::jaccard_similarity_basic`.
pub fn jaccard_similarity(a: &str, b: &str) -> f64 {
    // Work on true sets: counting a repeated word once per occurrence would
    // make the ratio asymmetric and could push it above 1.0.
    let set_a: std::collections::HashSet<String> = tokenize(a).into_iter().collect();
    let set_b: std::collections::HashSet<String> = tokenize(b).into_iter().collect();

    if set_a.is_empty() || set_b.is_empty() {
        return if set_a.is_empty() && set_b.is_empty() {
            1.0
        } else {
            0.0
        };
    }

    let intersection = set_a.intersection(&set_b).count();
    // Inclusion–exclusion; cannot underflow because intersection ≤ min(|A|, |B|).
    let union = set_a.len() + set_b.len() - intersection;
    intersection as f64 / union as f64
}

/// Tokenise a string into lowercase alphanumeric tokens.
///
/// Splits on every character that is not alphanumeric, drops the empty pieces
/// produced by consecutive separators, and lowercases what remains.  Tokens
/// are returned in input order and may repeat.
fn tokenize(s: &str) -> Vec<String> {
    s.split(|c: char| !c.is_alphanumeric())
        .filter(|t| !t.is_empty())
        .map(|t| t.to_lowercase())
        .collect()
}

// ─── Trajectory derivation ────────────────────────────────────────────────────

/// Classify the quality-score series as improving, stable, or declining.
///
/// Why: the trajectory has to be available without any LLM involvement, so it
/// is computed from the numbers alone.
/// What: fits an ordinary least-squares line through the points
/// `(position in slice, score)` and looks at its slope: above 0.1 →
/// `Improving`, below −0.1 → `Declining`, anything else (including a NaN
/// slope) → `Stable`.  Series with fewer than two points are `Stable`.
/// The period labels in the tuples are ignored; only slice order matters.
/// Test: `tests::synthesizer_trajectory_from_slope`.
pub fn derive_trajectory(quality_trend: &[(String, f64)]) -> Trajectory {
    let count = quality_trend.len();
    if count < 2 {
        return Trajectory::Stable;
    }

    // (x, y) = (index as f64, score), re-iterable for each of the four sums.
    let points = || {
        quality_trend
            .iter()
            .enumerate()
            .map(|(idx, (_, score))| (idx as f64, *score))
    };

    let n = count as f64;
    let sum_x: f64 = points().map(|(x, _)| x).sum();
    let sum_y: f64 = points().map(|(_, y)| y).sum();
    let sum_xy: f64 = points().map(|(x, y)| x * y).sum();
    let sum_xx: f64 = points().map(|(x, _)| x * x).sum();

    // Defensive guard against a degenerate fit before dividing.
    let denom = n * sum_xx - sum_x * sum_x;
    if denom.abs() < f64::EPSILON {
        return Trajectory::Stable;
    }

    match (n * sum_xy - sum_x * sum_y) / denom {
        slope if slope > 0.1 => Trajectory::Improving,
        slope if slope < -0.1 => Trajectory::Declining,
        _ => Trajectory::Stable,
    }
}

/// Apply the LLM synthesis response to the profile.
///
/// Why: the LLM response provides human-readable strengths, weaknesses,
/// trajectory, and narrative that cannot be derived deterministically.
/// What: tries direct JSON parse first (structured output path where the body
/// IS the JSON object), then falls back to extracting the last ```` ```json ````
/// fenced block (legacy free-text path).  If neither yields a parseable
/// object, the fallback narrative is written and nothing else on the profile
/// is touched.  Otherwise:
///  - `strengths` and `recurring_weaknesses` are replaced with the LLM values
///    (missing keys deserialize to empty lists);
///  - `improvement_trajectory` is overridden only when the LLM value is one of
///    `improving` / `declining` / `stable` (case-insensitive, surrounding
///    whitespace ignored);
///  - `narrative` is taken from the LLM, or — if the LLM narrative is empty —
///    generated by `apply_fallback_narrative`.
///
/// The trajectory is resolved *before* the narrative so that a fallback
/// narrative always states the same trajectory the profile ends up with.
/// Test: covered transitively by synthesizer integration tests.
fn apply_llm_synthesis(profile: &mut ContributorProfile, resp: &LlmResponse) {
    // Every field is optional so that a partially-filled object still parses.
    #[derive(Deserialize)]
    struct SynthesisBlock {
        #[serde(default)]
        strengths: Vec<String>,
        #[serde(default)]
        recurring_weaknesses: Vec<String>,
        #[serde(default)]
        improvement_trajectory: String,
        #[serde(default)]
        narrative: String,
    }

    let body = resp.text.trim();

    // Strategy 1: direct JSON parse (structured output path).  A parse failure
    // here is not logged because strategy 2 may still succeed.
    let block_opt: Option<SynthesisBlock> = if body.starts_with('{') {
        serde_json::from_str(body).ok()
    } else {
        None
    };

    // Strategy 2: fence-based extraction (legacy free-text path).  The LAST
    // ```json fence is used, on the assumption that the final block is the
    // model's answer.
    let block_opt = block_opt.or_else(|| {
        let fence_start = body.rfind("```json")?;
        // 7 == "```json".len(); all ASCII, so this is a valid char boundary.
        let after = &body[fence_start + 7..];
        let fence_end = after.find("```")?;
        let json_text = after[..fence_end].trim();
        match serde_json::from_str::<SynthesisBlock>(json_text) {
            Ok(b) => Some(b),
            Err(e) => {
                warn!(error = %e, "synthesizer: JSON parse error (fence path)");
                None
            }
        }
    });

    let block = match block_opt {
        Some(b) => b,
        None => {
            warn!("synthesizer: no parseable JSON in LLM response — applying fallback narrative");
            apply_fallback_narrative(profile);
            return;
        }
    };

    profile.strengths = block.strengths;
    profile.recurring_weaknesses = block.recurring_weaknesses;

    // LLM trajectory overrides deterministic only if valid.  This must happen
    // before the narrative is decided: the fallback narrative embeds the
    // profile's trajectory, and would otherwise report the stale value.
    let llm_traj = match block.improvement_trajectory.trim().to_lowercase().as_str() {
        "improving" => Some(Trajectory::Improving),
        "declining" => Some(Trajectory::Declining),
        "stable" => Some(Trajectory::Stable),
        _ => None,
    };
    if let Some(t) = llm_traj {
        profile.improvement_trajectory = t;
    }

    if block.narrative.is_empty() {
        apply_fallback_narrative(profile);
    } else {
        profile.narrative = block.narrative;
    }
}

/// Write a template narrative built purely from deterministic profile fields.
///
/// Why: a profile with no narrative at all is far less useful; this guarantees
/// some text even when the LLM produced nothing usable.
/// What: overwrites `profile.narrative` with a fixed sentence template that
/// mentions the contributor name, the profiled date range, the current
/// `improvement_trajectory`, and the number of findings tagged exactly
/// `TrendTag::Recurring` (other tags, including `Worsening`, are not counted).
/// No other field is modified.
/// Test: `tests::synthesizer_fail_safe_narrative`.
fn apply_fallback_narrative(profile: &mut ContributorProfile) {
    let recurring_count = profile
        .all_findings
        .iter()
        .filter(|finding| matches!(finding.trend_tag, Some(TrendTag::Recurring)))
        .count();

    let trajectory_label = match profile.improvement_trajectory {
        Trajectory::Declining => "declining",
        Trajectory::Improving => "improving",
        Trajectory::Stable => "stable",
    };

    let narrative = format!(
        "Longitudinal profile for {} ({} to {}). \
         Quality trajectory: {}. \
         {} recurring issue(s) identified across periods. \
         (Narrative generation unavailable — LLM call failed or returned invalid output.)",
        profile.canonical_name,
        profile.profiled_since,
        profile.profiled_until,
        trajectory_label,
        recurring_count,
    );
    profile.narrative = narrative;
}

// ─── Unit tests ───────────────────────────────────────────────────────────────
// Tests live in synthesizer_tests.rs to keep this file under 500 lines.

#[cfg(test)]
#[path = "synthesizer_tests.rs"]
mod tests;