trusty-review 0.2.0

Fast local PR-review service for trusty-tools — orchestrates LLM-backed code review
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
//! Pluggable context-source layer (Phase 6, #550).
//!
//! Why: a review is sharper when the LLM also sees *external* context — the
//! JIRA ticket the PR implements, the Confluence design doc it follows, the
//! GitHub issue it closes.  Rather than hard-wiring each integration into the
//! prompt builder, this module defines a single `ContextSource` trait so every
//! enrichment source (JIRA / Confluence / GitHub Issues today; an indexed APEX
//! knowledgebase in PR-B) plugs into the same orchestrator and renders into the
//! same `## Related <source>` section of the reviewer user message.
//!
//! What: this `mod.rs` is a thin facade.  It defines the shared value types
//! (`ReviewSubject`, `ContextSnippet`, `ContextSection`, `RetrievalMode`,
//! `ContextSourceError`) and the `ContextSource` trait, then re-exports the
//! concrete sources and the orchestrator from sibling modules.
//!
//! ## Fail-open contract (CRITICAL — read before adding a source)
//!
//! These context sources are **supplementary / best-effort enrichment**.  A
//! source that errors, times out, or has no credentials MUST log to stderr and
//! return zero snippets — it MUST NOT block, skip, or fail the review.  This is
//! deliberately DIFFERENT from the trusty-search / trusty-analyze required gate
//! (#590): those two are the *core value* of trusty-review and their absence
//! skips the review; these external sources are nice-to-have and degrade
//! silently to "no extra context".  Every `ContextSource::gather`
//! implementation therefore returns `Result` only so the orchestrator can log
//! the failure — the orchestrator NEVER propagates a source error upward.
//!
//! Test: trait object-safety and the value types are unit-tested in this module;
//! each source carries its own parse/query tests; the orchestrator's fail-open
//! and section-assembly behaviour is tested in `orchestrator`.

pub mod atlassian;
pub mod config;
pub mod confluence;
pub mod confluence_parse;
pub mod github_issues;
pub mod jira;
pub mod jira_parse;
pub mod orchestrator;

pub use config::{ContextSourcesConfig, ContextSourcesFileConfig, SourceConfig, SourceFileConfig};
pub use confluence::ConfluenceSource;
pub use github_issues::GithubIssuesSource;
pub use jira::JiraSource;
pub use orchestrator::{gather_external_context, render_sections};

use async_trait::async_trait;

// ─── Retrieval mode ─────────────────────────────────────────────────────────

/// How a context source obtains its snippets.
///
/// Why: #550 makes retrieval mode a per-source config knob.  A `Live` source
/// queries its own API at review time; a `Semantic` source queries the
/// trusty-search daemon against a pre-built index (APEX / any knowledgebase).
/// PR-A implements only the `Live` backend for the three external sources; a
/// source configured `Semantic` here returns a clear "not yet implemented"
/// error (logged, fail-open) until PR-B lands the indexed backend.
/// What: a two-variant enum, deserialised lowercase (`"live"` / `"semantic"`).
/// Test: `retrieval_mode_serde_roundtrip`, and the orchestrator's
/// `semantic_mode_not_yet_implemented` path.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Deserialize, serde::Serialize, Default)]
#[serde(rename_all = "lowercase")]
pub enum RetrievalMode {
    /// Query the source's own API directly at review time (no pre-indexing).
    #[default]
    Live,
    /// Retrieve from a trusty-search index by vector/hybrid search (PR-B).
    Semantic,
}

impl std::fmt::Display for RetrievalMode {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            RetrievalMode::Live => write!(f, "live"),
            RetrievalMode::Semantic => write!(f, "semantic"),
        }
    }
}

// ─── Review subject ─────────────────────────────────────────────────────────

/// The review subject handed to every context source.
///
/// Why: each source searches its backend by the same signal set — the PR title
/// keywords and the identifiers extracted from the diff — so bundling them in
/// one borrow-friendly struct keeps the `ContextSource::gather` signature stable
/// as new sources are added.
/// What: holds the owner/repo (needed by the GitHub-Issues source to scope its
/// search), the PR title, the changed file paths, and the extracted identifiers.
/// All fields are owned so the subject can be cheaply shared across the
/// concurrent gather tasks.
/// Test: constructed by `orchestrator` tests and each source's tests.
#[derive(Debug, Clone, Default)]
pub struct ReviewSubject {
    /// Repository owner / org (GitHub-Issues source scopes its query to this).
    pub owner: String,
    /// Repository name.
    pub repo: String,
    /// PR title (empty in local-diff mode — sources should skip on empty query).
    pub title: String,
    /// PR description / body (empty in local-diff mode).
    ///
    /// Why: the incumbent (`pr_review_service.py:4068`, `:3800`) regex-scans the
    /// PR title **and** description for JIRA ticket keys and folds the first 500
    /// chars of the body into the semantic query — the PR body is where authors
    /// name the ticket they implement and describe intent in prose that matches
    /// docs far better than bare code identifiers do.  Carrying it on the subject
    /// lets the JIRA ticket-ID path and `keyword_query` reach parity.
    pub body: String,
    /// Changed file paths from the diff.
    pub changed_files: Vec<String>,
    /// Identifiers extracted from the diff (function/type/symbol names).
    pub identifiers: Vec<String>,
}

impl ReviewSubject {
    /// Build the free-text keyword query a live source should search by.
    ///
    /// Why: every live source searches by the same signal — PR-title words, the
    /// PR description prose, plus a bounded set of diff identifiers — so the query
    /// construction lives here once instead of being duplicated (and drifting)
    /// across three sources.  The PR body is included to match the incumbent,
    /// which builds its query from `title + "\n" + description[:500]`
    /// (`pr_review_service.py:3797-3800`); the body is the strongest signal for
    /// Confluence + GitHub-Issues matches, where prose beats code identifiers.
    /// What: joins the PR title, the first `BODY_QUERY_CHARS` chars of the body,
    /// and up to `max_identifiers` identifiers into a single space-separated
    /// string, de-duplicating exact-token repeats and dropping empties.  Returns
    /// an empty string when there is no usable signal (caller skips).
    /// Test: `keyword_query_combines_title_and_identifiers`,
    /// `keyword_query_includes_body`, `keyword_query_empty_when_no_signal`.
    pub fn keyword_query(&self, max_identifiers: usize) -> String {
        let mut parts: Vec<&str> = Vec::new();
        let title = self.title.trim();
        if !title.is_empty() {
            parts.push(title);
        }
        // Fold the PR description in after the title (bounded), matching the
        // incumbent's `title + "\n" + body[:500]` query construction.
        let body = truncate_on_char_boundary(self.body.trim(), BODY_QUERY_CHARS).trim();
        if !body.is_empty() && !parts.contains(&body) {
            parts.push(body);
        }
        for id in self.identifiers.iter().take(max_identifiers) {
            let id = id.trim();
            if !id.is_empty() && !parts.contains(&id) {
                parts.push(id);
            }
        }
        parts.join(" ")
    }
}

/// Max chars of the PR body folded into `keyword_query` (incumbent parity).
///
/// Why: the incumbent caps the body it adds to the query at 500 chars
/// (`pr_review_service.py:3800`) so a long PR description does not swamp the
/// query; we match that bound.
/// What: a `usize` constant used by `ReviewSubject::keyword_query`.
/// Test: exercised by `keyword_query_includes_body`.
const BODY_QUERY_CHARS: usize = 500;

/// Max chars of a snippet body embedded under a `## Related …` bullet.
///
/// Why: each source embeds a description/excerpt so the model sees the ticket /
/// page / issue prose, not just a one-line label (`pr_review_service.py:4249`
/// JIRA at 800, `:4848` aux at 400); we standardise on 500 chars across the
/// three live sources — long enough to convey intent, short enough to keep the
/// prompt bounded.
/// What: a `usize` constant used by every source's body extraction.
/// Test: exercised by each source's body-population test.
pub const SNIPPET_BODY_CHARS: usize = 500;

/// Truncate `s` to at most `max` chars, respecting UTF-8 char boundaries.
///
/// Why: every source truncates a body excerpt to a bounded length before
/// embedding it; slicing a `String` by byte index can panic mid-codepoint, so
/// the truncation must walk char boundaries.  Centralising it keeps all three
/// sources consistent and panic-free (no `unwrap`/byte-slice in library code).
/// What: returns the input unchanged when it is already within `max` chars;
/// otherwise returns the first `max` chars (by `char` count) as a `&str` slice.
/// Test: `truncate_on_char_boundary_*` in this module.
pub fn truncate_on_char_boundary(s: &str, max: usize) -> &str {
    match s.char_indices().nth(max) {
        Some((idx, _)) => &s[..idx],
        None => s,
    }
}

// ─── Snippets + sections ────────────────────────────────────────────────────

/// One retrieved context item from a source.
///
/// Why: sources differ in their native shape (a JIRA ticket, a Confluence page,
/// a GitHub issue), but the prompt only needs a uniform title/subtitle/body/link
/// quad to render a bullet.  Normalising to this struct keeps the section
/// renderer source-agnostic so PR-B's APEX source slots in unchanged.
/// What: a short `title` (such as `PROJ-123 — Add auth`), an optional `subtitle`
/// (status, space, or issue state), an optional `body` snippet, and a `link`.
/// Test: `render_sections_emits_headings_and_bullets` in `orchestrator`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ContextSnippet {
    /// Primary label for the bullet (ticket key + summary, doc title, etc.).
    pub title: String,
    /// Optional secondary label (status, space, issue state).
    pub subtitle: Option<String>,
    /// Optional body excerpt to embed under the bullet.
    pub body: Option<String>,
    /// Optional canonical link to the item.
    pub link: Option<String>,
}

/// A rendered `## Related <source>` section produced by one source.
///
/// Why: the orchestrator collects one section per source and the prompt builder
/// concatenates them in a deterministic order; carrying the heading alongside
/// the snippets keeps assembly trivial and order-stable.
/// What: `heading` is the markdown H2 text (e.g. `Related JIRA tickets`);
/// `snippets` are the items to render as bullets.  A section with zero snippets
/// is dropped by the orchestrator (nothing to show).
/// Test: `orchestrator` section-ordering and empty-drop tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ContextSection {
    /// Markdown H2 heading text (without the leading `## `).
    pub heading: String,
    /// Items to render as bullets under the heading.
    pub snippets: Vec<ContextSnippet>,
}

// ─── Error type ─────────────────────────────────────────────────────────────

/// Errors a context source may surface to the orchestrator.
///
/// Why: the orchestrator needs to *log* a meaningful reason when a source fails,
/// even though it never propagates the error (fail-open).  Typed variants let
/// the log line distinguish "missing creds" (skip, expected) from "API error"
/// (transient) from "semantic mode not implemented yet" (config/PR-B).
/// What: a `thiserror` enum (library convention) covering the failure classes
/// every source shares.
/// Test: `context_source_error_display` in this module.
///
/// Note: every variant carries a `src` (the source name) field rather than
/// `source` — the latter name is special-cased by `thiserror` as a `#[source]`
/// error chain and would not satisfy `&'static str: Error`.
#[derive(Debug, thiserror::Error)]
pub enum ContextSourceError {
    /// Required credentials / base URL are absent — the source is simply skipped.
    #[error("{src} skipped: {reason}")]
    NotConfigured {
        /// Source name (e.g. `"jira"`).
        src: &'static str,
        /// Human-readable reason (which env var is missing).
        reason: String,
    },

    /// HTTP transport failure (DNS, connect, TLS, timeout).
    #[error("{src} transport error: {err}")]
    Transport {
        /// Source name.
        src: &'static str,
        /// Underlying transport error text.
        err: TransportErr,
    },

    /// The backend returned a non-2xx status.
    #[error("{src} API returned {status}: {body}")]
    Api {
        /// Source name.
        src: &'static str,
        /// HTTP status code.
        status: u16,
        /// Response body (may be truncated).
        body: String,
    },

    /// The response could not be parsed.
    #[error("{src} response parse error: {detail}")]
    Parse {
        /// Source name.
        src: &'static str,
        /// Parse error text.
        detail: String,
    },

    /// The source is configured `mode = semantic`, which PR-A does not implement.
    #[error("{src}: semantic mode not yet implemented (see PR-B / APEX indexed knowledgebase)")]
    SemanticNotImplemented {
        /// Source name.
        src: &'static str,
    },
}

/// Newtype wrapper carrying a transport error message.
///
/// Why: reqwest errors are not `Clone` and we want a simple owned string for
/// testability; a newtype keeps `Transport` self-describing.
/// What: wraps a message string with a `Display` impl.
/// Test: covered transitively by `context_source_error_display`.
#[derive(Debug, thiserror::Error)]
#[error("{0}")]
pub struct TransportErr(pub String);

// ─── The trait ──────────────────────────────────────────────────────────────

/// A pluggable external context source.
///
/// Why: this is the seam that lets the orchestrator treat JIRA, Confluence,
/// GitHub Issues, and (PR-B) an indexed APEX knowledgebase identically.  Each
/// source declares its name + enabled flag + retrieval mode and knows how to
/// turn a `ReviewSubject` into a `ContextSection`.
/// What: `name` is a stable identifier used in logs and the config table;
/// `is_enabled` lets the orchestrator skip a disabled source cheaply (without a
/// network round-trip); `mode` is the configured retrieval mode; `gather`
/// performs the actual retrieval.
///
/// Fail-open contract: `gather` returns `Result` ONLY so the orchestrator can
/// log a reason; the orchestrator treats `Err(_)` exactly like an empty section
/// and continues.  Implementations should therefore NOT panic and should map
/// every failure to a `ContextSourceError`.
/// Test: `context_source_object_safe` (object-safety) in this module; per-source
/// behaviour in each source's own tests.
#[async_trait]
pub trait ContextSource: Send + Sync {
    /// Stable source identifier (e.g. `"jira"`), used in logs + config keys.
    fn name(&self) -> &'static str;

    /// Whether this source is enabled (config + credentials present).
    fn is_enabled(&self) -> bool;

    /// The configured retrieval mode for this source.
    fn mode(&self) -> RetrievalMode;

    /// Retrieve context for the review subject.
    ///
    /// Returns a `ContextSection` (possibly empty) on success.  On any failure
    /// returns a `ContextSourceError` for the orchestrator to log; the
    /// orchestrator NEVER propagates this error (fail-open).
    async fn gather(&self, subject: &ReviewSubject) -> Result<ContextSection, ContextSourceError>;
}

// ─── Unit tests ─────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn retrieval_mode_serde_roundtrip() {
        let live: RetrievalMode = serde_json::from_str("\"live\"").unwrap();
        assert_eq!(live, RetrievalMode::Live);
        let sem: RetrievalMode = serde_json::from_str("\"semantic\"").unwrap();
        assert_eq!(sem, RetrievalMode::Semantic);
        assert_eq!(RetrievalMode::default(), RetrievalMode::Live);
        assert_eq!(
            serde_json::to_string(&RetrievalMode::Semantic).unwrap(),
            "\"semantic\""
        );
    }

    #[test]
    fn keyword_query_combines_title_and_identifiers() {
        let subj = ReviewSubject {
            title: "Add auth flow".to_string(),
            identifiers: vec!["authenticate".to_string(), "TokenStore".to_string()],
            ..Default::default()
        };
        let q = subj.keyword_query(8);
        assert_eq!(q, "Add auth flow authenticate TokenStore");
    }

    #[test]
    fn keyword_query_dedupes_and_caps_identifiers() {
        let subj = ReviewSubject {
            title: "fix".to_string(),
            identifiers: vec![
                "fix".to_string(), // duplicate of title token — dropped
                "a".to_string(),
                "b".to_string(),
                "c".to_string(),
            ],
            ..Default::default()
        };
        // The cap of 2 applies to the identifier iterator BEFORE dedup, so we
        // consider ["fix", "a"]; "fix" duplicates the title token and is dropped,
        // leaving just "a". ("b"/"c" are past the cap.)
        let q = subj.keyword_query(2);
        assert_eq!(q, "fix a");
    }

    #[test]
    fn keyword_query_includes_body() {
        // The PR body is folded in after the title and before identifiers,
        // matching the incumbent's `title + "\n" + body[:500]` construction.
        let subj = ReviewSubject {
            title: "Add auth flow".to_string(),
            body: "Implements PROJ-123: refresh tokens before expiry.".to_string(),
            identifiers: vec!["TokenStore".to_string()],
            ..Default::default()
        };
        let q = subj.keyword_query(8);
        assert_eq!(
            q,
            "Add auth flow Implements PROJ-123: refresh tokens before expiry. TokenStore"
        );
    }

    #[test]
    fn keyword_query_truncates_long_body() {
        // A body longer than BODY_QUERY_CHARS is bounded; only the first 500
        // chars participate in the query so a long description cannot swamp it.
        let long = "x".repeat(BODY_QUERY_CHARS + 200);
        let subj = ReviewSubject {
            title: "t".to_string(),
            body: long,
            ..Default::default()
        };
        let q = subj.keyword_query(0);
        // "t" + " " + 500 x's
        assert_eq!(q.len(), 1 + 1 + BODY_QUERY_CHARS);
    }

    #[test]
    fn truncate_on_char_boundary_short_input_unchanged() {
        assert_eq!(truncate_on_char_boundary("hello", 10), "hello");
        assert_eq!(truncate_on_char_boundary("hello", 5), "hello");
    }

    #[test]
    fn truncate_on_char_boundary_long_input_clipped() {
        assert_eq!(truncate_on_char_boundary("hello world", 5), "hello");
    }

    #[test]
    fn truncate_on_char_boundary_respects_multibyte() {
        // A multibyte string must not panic and must clip on a char boundary.
        let s = "héllo wörld"; // 'é' and 'ö' are 2 bytes each
        let out = truncate_on_char_boundary(s, 5);
        assert_eq!(out, "héllo");
        // No panic on a cut that would land mid-codepoint if done by byte.
        let out2 = truncate_on_char_boundary(s, 2);
        assert_eq!(out2, "");
    }

    #[test]
    fn keyword_query_empty_when_no_signal() {
        let subj = ReviewSubject {
            title: "   ".to_string(),
            identifiers: vec![String::new(), "  ".to_string()],
            ..Default::default()
        };
        assert_eq!(subj.keyword_query(8), "");
    }

    #[test]
    fn context_source_error_display() {
        let e = ContextSourceError::NotConfigured {
            src: "jira",
            reason: "ATLASSIAN_API_TOKEN unset".to_string(),
        };
        assert!(e.to_string().contains("jira skipped"));
        assert!(e.to_string().contains("ATLASSIAN_API_TOKEN"));

        let e = ContextSourceError::SemanticNotImplemented { src: "jira" };
        assert!(e.to_string().contains("semantic mode not yet implemented"));
        assert!(e.to_string().contains("PR-B"));

        let e = ContextSourceError::Api {
            src: "confluence",
            status: 503,
            body: "overloaded".to_string(),
        };
        let s = e.to_string();
        assert!(s.contains("confluence"));
        assert!(s.contains("503"));
    }

    #[test]
    fn context_source_object_safe() {
        // Proves `ContextSource` is object-safe (usable as `dyn`).
        fn _accepts(_s: &dyn ContextSource) {}
    }
}