trusty-review 0.4.0

LLM-backed code review service — reviews GitHub PRs and unified diffs via AWS Bedrock or OpenRouter
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
//! Per-period LLM finding extraction for the contributor-profile pipeline (#565).
//!
//! Why: each period batch (statistics + sampled diffs) must be sent to an LLM
//! to extract code-quality findings so they can be synthesised into longitudinal
//! trends.  Doing this per-period keeps individual prompts focused and token
//! counts manageable.
//! What: `BatchReviewer` calls the LLM with a per-period "reviewer" prompt,
//! parses the JSON findings block (same strategy as the MVP parser), and
//! accumulates telemetry into `TokenCostSummary`.  Fail-safe: any LLM or parse
//! error records the error and returns an empty finding list — never panics.
//! Verification round is OUT OF SCOPE for v0.1 (TODO: add a verification pass
//! in a future PR to cross-check high-confidence findings against the diff).
//! Test: `tests` module below covers prompt content, JSON parsing, fail-safe
//! on empty/malformed LLM response, and telemetry accumulation — all using a
//! fake `LlmProvider` injected via `Arc<dyn LlmProvider>`.

use std::sync::Arc;
use std::time::Instant;

use serde::Deserialize;
use tracing::{debug, warn};

use crate::llm::{
    ChatMessage, LlmProvider, LlmRequest, LlmResponse, ResponseSchema, strip_provider_prefix,
};
use crate::models::{Effort, Finding};
use crate::profile::types::{LongitudinalFinding, PeriodBatch, TokenCostSummary};

// ─── Prompt constants ─────────────────────────────────────────────────────────

/// Temperature for the period-review LLM call — tighter than chat for
/// consistent structured output.
const PERIOD_REVIEWER_TEMPERATURE: f32 = 0.2;

/// Max tokens per period-review LLM call.
const PERIOD_REVIEWER_MAX_TOKENS: u32 = 2048;

// ─── BatchReviewer ────────────────────────────────────────────────────────────

/// Per-period LLM finding extractor.
///
/// Why: each period batch must be analysed independently so findings can later
/// be compared across periods (recurrence, worsening, resolution).
/// What: holds a reference to an `LlmProvider` and the model slug used for
/// period-review calls.  `review_period` sends one LLM request per period,
/// parses the JSON block, and returns `Vec<LongitudinalFinding>`.  The
/// `trend_tag` on each finding is intentionally left `None` here — the
/// synthesiser assigns tags after seeing all periods.
/// Test: see `tests` module below.
pub struct BatchReviewer {
    llm: Arc<dyn LlmProvider>,
    model: String,
}

impl BatchReviewer {
    /// Create a `BatchReviewer` from an injected provider and model slug.
    ///
    /// Why: dependency injection allows tests to supply a fake provider without
    /// spawning real network connections.
    /// What: stores the provider and model for reuse across period calls.
    /// Test: exercised by all `tests::*` tests.
    pub fn new(llm: Arc<dyn LlmProvider>, model: impl Into<String>) -> Self {
        Self {
            llm,
            model: model.into(),
        }
    }

    /// Extract findings for one `PeriodBatch` via LLM.
    ///
    /// Why: each period is reviewed independently so per-period findings can
    /// be compared for recurrence/trend in a later synthesis step.
    /// What: builds the review prompt for the period, calls the LLM, tries to
    /// parse the JSON findings block, and appends the parsed findings (with
    /// `period_label` set and `trend_tag = None`) to the output.  On any error
    /// the function logs and returns an empty `Vec` — never propagates the
    /// error so a single bad period does not abort the entire profile run.
    /// Accumulates telemetry into `cost_out` in-place.
    /// Test: `tests::batch_reviewer_parses_findings_from_json`,
    /// `tests::batch_reviewer_fail_safe_on_empty_response`,
    /// `tests::batch_reviewer_fail_safe_on_malformed_json`.
    pub async fn review_period(
        &self,
        batch: &PeriodBatch,
        cost_out: &mut TokenCostSummary,
    ) -> Vec<LongitudinalFinding> {
        let req = build_period_prompt(batch, &self.model);
        let start = Instant::now();

        let resp = match self.llm.complete(req).await {
            Ok(r) => r,
            Err(e) => {
                warn!(
                    period = %batch.stats.period_label,
                    error = %e,
                    "batch_reviewer: LLM call failed — returning empty findings (fail-safe)"
                );
                return Vec::new();
            }
        };

        let latency = start.elapsed().as_millis() as u64;
        cost_out.accumulate(
            resp.input_tokens as u64,
            resp.output_tokens as u64,
            resp.cost_usd,
            latency,
        );

        debug!(
            period = %batch.stats.period_label,
            input_tokens = resp.input_tokens,
            output_tokens = resp.output_tokens,
            latency_ms = latency,
            "batch_reviewer: LLM call complete"
        );

        parse_period_findings(&resp, &batch.stats.period_label)
    }
}

// ─── Prompt builder ───────────────────────────────────────────────────────────

/// Build the JSON Schema for the period findings output structure.
///
/// Why: forced structured output eliminates parse failures in the period
/// reviewer (batch_reviewer); with a schema, the model MUST emit valid JSON
/// conforming to the `PeriodFindingsBlock` shape.
/// What: returns a `ResponseSchema` with name `"period_findings"` and a
/// JSON Schema matching `PeriodFindingsBlock` / `PeriodFindingWire`.
/// Test: `tests::build_period_prompt_includes_schema`.
fn period_findings_schema() -> ResponseSchema {
    ResponseSchema {
        name: "period_findings".to_string(),
        schema: serde_json::json!({
            "type": "object",
            "properties": {
                "findings": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "kind": {"type": "string"},
                            "description": {"type": "string"},
                            "suggestion": {"type": "string"},
                            "confidence": {"type": "number", "minimum": 0.0, "maximum": 1.0},
                            "file": {"type": "string"},
                            "severity": {
                                "type": "string",
                                "enum": ["low", "medium", "high", "critical"]
                            }
                        },
                        "required": ["kind", "description"]
                    }
                }
            },
            "required": ["findings"]
        }),
    }
}

/// Build the LLM request for reviewing a single period batch.
///
/// Why: centralises prompt assembly so the wording can be iterated without
/// touching reviewer logic.
/// What: assembles a system prompt (reviewer role instructions + JSON output
/// schema) and a user message containing the period stats summary + sampled
/// diff snippets.  Includes `response_schema` so the provider forces structured
/// output — eliminating parse failures in the profile pipeline.
/// `model` may carry a `bedrock/` or `openrouter/` routing prefix; this
/// function strips it before setting `LlmRequest.model` so the bare id reaches
/// the provider API.
/// Test: `tests::batch_reviewer_prompt_contains_period_label`,
/// `tests::batch_period_prompt_strips_bedrock_prefix`,
/// `tests::build_period_prompt_includes_schema`.
pub fn build_period_prompt(batch: &PeriodBatch, model: &str) -> LlmRequest {
    let system = period_reviewer_system_prompt();
    let user = build_period_user_message(batch);
    LlmRequest {
        model: strip_provider_prefix(model).to_string(),
        system: system.to_string(),
        messages: vec![ChatMessage {
            role: "user".to_string(),
            content: user,
        }],
        temperature: PERIOD_REVIEWER_TEMPERATURE,
        max_tokens: PERIOD_REVIEWER_MAX_TOKENS,
        response_schema: Some(period_findings_schema()),
    }
}

/// System prompt for the period-batch reviewer role.
///
/// Why: the system prompt defines the review criteria and the structured
/// output shape the parser depends on.  With forced structured output active,
/// the model populates the response fields directly rather than emitting a
/// fenced JSON block.
/// What: instructs the LLM to act as a senior engineer reviewing a sample of
/// one engineer's commits and populate the `findings` array.
/// Test: asserted in `tests::batch_reviewer_system_prompt_contains_schema`.
pub(super) fn period_reviewer_system_prompt() -> &'static str {
    r#"You are a senior software engineer reviewing a sample of one engineer's commits
over a specific time window as part of a longitudinal quality analysis.

## Task
Identify code-quality findings present in the sampled diffs. Focus on:
- Correctness bugs, error-handling gaps, resource leaks
- Security weaknesses (injection, auth, secrets in code)
- Logic errors, off-by-one issues, data-loss risks
- Missing tests or test-quality issues
- Recurring anti-patterns visible across multiple commits in this window

## Output (REQUIRED)
Populate the structured response with a `findings` array.
Each finding must include:
- `kind`: short category label (e.g. error_handling, security, logic)
- `description`: concise description of the issue observed
- `suggestion`: concrete improvement suggestion
- `confidence`: float in [0.0, 1.0]
- `file`: most relevant file path; use "multiple" if the issue spans files
- `severity`: one of low, medium, high, critical

`findings` may be an empty array if the sample looks clean."#
}

/// Build the user-turn message for a period review.
///
/// Why: the user message carries the period statistics and sampled diff
/// snippets that the LLM needs to produce findings.
/// What: formats the period label/dates, commit stats summary, and up to
/// `MAX_DIFFS_IN_PROMPT` sampled diffs each in a fenced code block.
/// Test: `tests::batch_reviewer_prompt_contains_period_label`.
fn build_period_user_message(batch: &PeriodBatch) -> String {
    const MAX_DIFFS_IN_PROMPT: usize = 10;
    let s = &batch.stats;
    let mut msg = String::with_capacity(4096);

    msg.push_str(&format!(
        "## Period: {}\nFrom {} to {}\n\n",
        s.period_label, s.since, s.until
    ));

    msg.push_str("### Statistics\n");
    msg.push_str(&format!("- Commits: {}\n", s.commit_count));
    msg.push_str(&format!("- Quality score: {:.2}\n", s.quality_score));
    msg.push_str(&format!("- Ticketed %: {:.0}%\n", s.ticketed_pct * 100.0));

    if !s.categories.is_empty() {
        let mut cats: Vec<(&String, &u64)> = s.categories.iter().collect();
        cats.sort_by_key(|(k, _)| k.as_str());
        let cat_str: Vec<String> = cats.iter().map(|(k, v)| format!("{k}={v}")).collect();
        msg.push_str(&format!("- Categories: {}\n", cat_str.join(", ")));
    }

    if !s.repositories.is_empty() {
        msg.push_str(&format!("- Repositories: {}\n", s.repositories.join(", ")));
    }
    msg.push('\n');

    if batch.sampled_diffs.is_empty() {
        msg.push_str("### Sampled diffs\n*(no diffs available for this period)*\n\n");
    } else {
        msg.push_str("### Sampled diffs\n\n");
        for (i, diff) in batch
            .sampled_diffs
            .iter()
            .enumerate()
            .take(MAX_DIFFS_IN_PROMPT)
        {
            let cat = diff.category.as_deref().unwrap_or("unknown");
            let effort = diff.effort.as_deref().unwrap_or("?");
            msg.push_str(&format!(
                "#### Diff {}{} ({repo}) [category={cat}, effort={effort}]\n",
                i + 1,
                &diff.sha[..8.min(diff.sha.len())],
                repo = diff.repository,
            ));
            msg.push_str(&format!("Commit: {}\n\n", diff.message));
            msg.push_str("```diff\n");
            msg.push_str(&diff.diff_text);
            if !diff.diff_text.ends_with('\n') {
                msg.push('\n');
            }
            msg.push_str("```\n\n");
        }
    }

    msg.push_str(
        "Please review the diffs above and populate the structured `findings` \
         array as specified in the system prompt.\n",
    );

    msg
}

// ─── JSON block parser ────────────────────────────────────────────────────────

/// Wire type for the per-period findings JSON block.
#[derive(Debug, Deserialize)]
struct PeriodFindingsBlock {
    #[serde(default)]
    findings: Vec<PeriodFindingWire>,
}

/// Wire type for a single finding in the JSON block.
#[derive(Debug, Deserialize)]
struct PeriodFindingWire {
    #[serde(default)]
    kind: String,
    #[serde(default)]
    description: String,
    #[serde(default)]
    suggestion: String,
    #[serde(default)]
    confidence: f32,
    #[serde(default)]
    file: String,
    #[serde(default)]
    severity: String,
}

/// Parse findings from an LLM response for one period.
///
/// Why: the MVP parser (`pipeline::parser`) targets `ReviewResult` verdict +
/// findings; the period parser targets just findings with slightly different
/// field names.  A separate function keeps the schemas independent.
/// What: tries direct JSON parse first (structured output path where the body
/// IS the JSON object), then falls back to the legacy fence-based extraction.
/// Converts wire findings to `LongitudinalFinding` (with `trend_tag = None`
/// and the given `period_label`).  Fail-safe: any parse error logs and returns
/// empty — never panics.
/// Test: `tests::batch_reviewer_parses_findings_from_json`,
/// `tests::batch_reviewer_parses_direct_json`.
fn parse_period_findings(resp: &LlmResponse, period_label: &str) -> Vec<LongitudinalFinding> {
    let body = resp.text.trim();
    if body.is_empty() {
        warn!(period = %period_label, "batch_reviewer: empty LLM response — returning empty findings");
        return Vec::new();
    }

    // Strategy 1: direct JSON (structured output path).
    if body.starts_with('{')
        && let Ok(block) = serde_json::from_str::<PeriodFindingsBlock>(body)
    {
        debug!(
            period = %period_label,
            findings = block.findings.len(),
            "batch_reviewer: parsed via direct JSON (structured output)"
        );
        return convert_period_block(block, period_label);
    }

    // Strategy 2: legacy fence-based extraction.
    let Some(fence_start) = body.rfind("```json") else {
        warn!(
            period = %period_label,
            "batch_reviewer: no JSON block found in LLM response — returning empty findings"
        );
        return Vec::new();
    };

    let after = &body[fence_start + 7..];
    let Some(fence_end) = after.find("```") else {
        warn!(period = %period_label, "batch_reviewer: unclosed JSON block — returning empty findings");
        return Vec::new();
    };

    let json_text = after[..fence_end].trim();
    let block: PeriodFindingsBlock = match serde_json::from_str(json_text) {
        Ok(b) => b,
        Err(e) => {
            warn!(
                period = %period_label,
                error = %e,
                "batch_reviewer: JSON parse error — returning empty findings"
            );
            return Vec::new();
        }
    };

    convert_period_block(block, period_label)
}

/// Convert a `PeriodFindingsBlock` into `LongitudinalFinding` values.
///
/// Why: extracted to eliminate duplication between the two parse strategies.
/// What: maps each wire finding to `LongitudinalFinding` with `trend_tag = None`.
/// Test: covered by all parse-path tests in the test module.
fn convert_period_block(
    block: PeriodFindingsBlock,
    period_label: &str,
) -> Vec<LongitudinalFinding> {
    block
        .findings
        .into_iter()
        .map(|f| {
            let effort = severity_to_effort(&f.severity);
            let file = if f.file.is_empty() {
                "unknown".to_string()
            } else {
                f.file
            };
            let kind = if f.kind.is_empty() {
                "general".to_string()
            } else {
                f.kind
            };
            LongitudinalFinding {
                period_label: period_label.to_string(),
                finding: Finding::new(
                    file,
                    kind,
                    f.description,
                    f.suggestion,
                    f.confidence,
                    effort,
                ),
                trend_tag: None,
            }
        })
        .collect()
}

/// Map a severity string to a `Finding` effort level.
fn severity_to_effort(severity: &str) -> Effort {
    match severity.to_lowercase().as_str() {
        "high" | "critical" => Effort::High,
        "medium" => Effort::Medium,
        _ => Effort::Low,
    }
}

// ─── Unit tests ───────────────────────────────────────────────────────────────
// Tests live in batch_reviewer_tests.rs to keep this file under 500 lines.

#[cfg(test)]
#[path = "batch_reviewer_tests.rs"]
mod tests;