Skip to main content

difflore_core/cloud/
session_mined.rs

1//! Session-mined candidate rules.
2//!
3//! The local worker turns session transcripts into candidate rules:
4//!
5//! 1. A SessionEnd / Stop / N-turn watermark fires the local worker
6//!    (see `difflore-cli/src/session_mine/`).
7//! 2. The worker pulls the last few user-prompt / assistant-text pairs
8//!    from the platform transcript, strips tool calls + thinking blocks,
9//!    and hands them to a small LLM gate (Haiku-class).
10//! 3. The gate verdict is either KEEP (a brand-new reusable rule) or
11//!    MERGE:<id> (an extension of an existing rule). In either case the
12//!    worker enqueues a [`SessionMinedCandidate`] on `cloud_outbox` with
13//!    `kind = "session_mined_candidate"`.
14//! 4. The cloud clusters those rows into draft `candidate_rules` with
15//!    `origin = 'session_mined'` and `requires_human_approval = true`.
16
17use sha2::{Digest, Sha256};
18
/// Cap on `title` (chars, not bytes). Matches the existing
/// `Observation::title` convention so cloud-side renderers can share
/// truncation logic. `try_new` truncates longer titles down to this
/// cap; `validate` rejects them.
pub const TITLE_MAX_CHARS: usize = 120;

/// Cap on `body` (chars, not bytes). 2000 chars (about 2 KB of ASCII)
/// is enough for a 3-5 sentence rule body with a snippet; anything
/// longer is almost certainly the raw transcript text the gate failed
/// to compress. `try_new` truncates longer bodies down to this cap;
/// `validate` rejects them.
pub const BODY_MAX_CHARS: usize = 2000;

/// Maximum number of file glob patterns we will accept from the gate.
/// 1-3 is the sweet spot: a single broad glob is too noisy, more than
/// three usually means the gate failed to localise the rule.
/// `try_new` keeps only the first `MAX_FILE_PATTERNS` non-blank
/// patterns; `validate` rejects longer lists.
pub const MAX_FILE_PATTERNS: usize = 3;

/// Stable origin tag for the candidate-rule pipeline. `try_new`
/// writes it into the `origin` field and `validate` rejects any other
/// value.
pub const ORIGIN: &str = "session_mined";
36
/// Wire format for one session-mined candidate. Serialised verbatim
/// into `cloud_outbox.payload_json` under `kind =
/// "session_mined_candidate"`; the field names below are the JSON
/// keys, so renaming a field changes the wire contract.
///
/// Every field except `ts_ms` carries a hard local invariant that
/// [`SessionMinedCandidate::try_new`] establishes. `validate`
/// re-checks those invariants, with the exception of `content_hash`,
/// which it does not recompute.
/// `source_repo` is the load-bearing one: it MUST be derived from the
/// current git remote / cwd and never empty, otherwise the candidate
/// has no Project Scope and the cloud has no way to attribute it.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct SessionMinedCandidate {
    /// Platform session id from the hook stdin payload (Claude Code
    /// `session_id`, Cursor session uuid, …). Empty string is *not*
    /// accepted — cloud-side dedup keys on this.
    pub session_id: String,
    /// Unix-ms at the moment the gate produced its verdict. Passed
    /// through unchecked by both `try_new` and `validate`.
    pub ts_ms: i64,
    /// `owner/repo` for the GitHub remote, or the cwd basename as a
    /// fallback for non-GitHub repos. Never empty; rejected if the
    /// builder can't determine a repo identity for the current
    /// workspace.
    pub source_repo: String,
    /// Single-line title, ≤ [`TITLE_MAX_CHARS`] chars. The gate is
    /// instructed to write this as a behavioural rule (imperative or
    /// declarative); rendering on the cloud side adds prefixes like
    /// "Remember:" so we don't double them.
    ///
    /// NOTE(review): neither `try_new` nor `validate` rejects embedded
    /// newlines, so "single-line" relies on the gate's output.
    pub title: String,
    /// Multi-sentence rule body, ≤ [`BODY_MAX_CHARS`] chars. Usually
    /// 2-5 sentences plus an optional code snippet.
    pub body: String,
    /// 1-3 file globs, never empty. Cloud-side cascade ordering keys
    /// on these (rules whose patterns match the target file surface
    /// first), so a candidate with zero patterns can never be served.
    pub file_patterns: Vec<String>,
    /// Provider:model identifier for the gate call, e.g.
    /// `"claude:haiku"`. Used for audits + future per-model recall.
    pub gate_model: String,
    /// `"KEEP"` for a brand-new rule, or `"MERGE:<id>"` where `<id>` is
    /// the cloud rule id the gate decided to extend. Cloud applies
    /// the MERGE shape against the named rule's body; KEEP becomes a
    /// fresh `candidate_rules` row.
    pub gate_verdict: String,
    /// 16-char hex sha256 of `source_repo|title|body`, computed by
    /// `try_new` from the trimmed / truncated values. Stable across
    /// retries so the cloud can dedup duplicate outbox uploads of the
    /// same candidate.
    pub content_hash: String,
    /// Origin discriminator on the wire, always [`ORIGIN`].
    pub origin: String,
    /// Draft gate; session-mined candidates require human approval.
    /// `try_new` always sets this to `true`.
    pub requires_human_approval: bool,
}
88
/// Errors a [`SessionMinedCandidate`] can fail validation with.
/// Returned by the constructor / `validate` so the worker can swallow
/// invalid candidates without retrying through the outbox.
///
/// `try_new` normalises its inputs before checking them (it truncates
/// over-long titles / bodies and caps the pattern list), so the
/// "too long" / "too many" halves of the variants below are only
/// reported by `validate`. Likewise `NotDraft` and `WrongOrigin` are
/// only reported by `validate`, because `try_new` pins both fields.
#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
pub enum CandidateError {
    /// `source_repo` is empty or whitespace-only. The Project Scope
    /// Invariant says: drop the candidate, don't enqueue a scopeless row.
    #[error("session-mined candidate is missing source_repo — drop")]
    MissingSourceRepo,
    /// `session_id` is empty or whitespace-only. Without a session id
    /// the cloud cannot dedup or attribute the candidate.
    #[error("session-mined candidate is missing session_id — drop")]
    MissingSessionId,
    /// `title` is empty, or (from `validate` only) longer than
    /// [`TITLE_MAX_CHARS`].
    #[error("session-mined candidate title invalid (empty or > {TITLE_MAX_CHARS} chars)")]
    InvalidTitle,
    /// `body` is empty, or (from `validate` only) longer than
    /// [`BODY_MAX_CHARS`].
    #[error("session-mined candidate body invalid (empty or > {BODY_MAX_CHARS} chars)")]
    InvalidBody,
    /// `file_patterns` is empty, or (from `validate` only) holds more
    /// than [`MAX_FILE_PATTERNS`] entries.
    #[error("session-mined candidate must carry 1-{MAX_FILE_PATTERNS} file patterns")]
    InvalidFilePatterns,
    /// `gate_model` is empty or whitespace-only.
    #[error("session-mined candidate is missing gate_model")]
    MissingGateModel,
    /// `gate_verdict` is neither exactly `"KEEP"` nor `"MERGE:<id>"`
    /// with a non-blank `<id>`. This includes the empty string.
    #[error("session-mined candidate gate_verdict must be 'KEEP' or 'MERGE:<id>'")]
    InvalidGateVerdict,
    /// `requires_human_approval` was set to `false`.
    #[error("session-mined candidates must keep requires_human_approval = true")]
    NotDraft,
    /// `origin` is anything other than [`ORIGIN`].
    #[error("session-mined candidate has wrong origin (expected {ORIGIN})")]
    WrongOrigin,
}
124
125impl SessionMinedCandidate {
126    /// Build a new candidate from gate output. Truncates title/body to
127    /// the documented caps, derives the content hash, and pins the
128    /// origin and draft flag. Callers must drop invalid candidates.
129    pub fn try_new(args: SessionMinedCandidateArgs) -> Result<Self, CandidateError> {
130        let SessionMinedCandidateArgs {
131            session_id,
132            ts_ms,
133            source_repo,
134            title,
135            body,
136            file_patterns,
137            gate_model,
138            gate_verdict,
139        } = args;
140
141        let session_id = session_id.trim().to_owned();
142        if session_id.is_empty() {
143            return Err(CandidateError::MissingSessionId);
144        }
145        let source_repo = source_repo.trim().to_owned();
146        if source_repo.is_empty() {
147            return Err(CandidateError::MissingSourceRepo);
148        }
149        let title = truncate_chars(title.trim(), TITLE_MAX_CHARS);
150        if title.is_empty() {
151            return Err(CandidateError::InvalidTitle);
152        }
153        let body = truncate_chars(body.trim(), BODY_MAX_CHARS);
154        if body.is_empty() {
155            return Err(CandidateError::InvalidBody);
156        }
157        let file_patterns: Vec<String> = file_patterns
158            .into_iter()
159            .map(|p| p.trim().to_owned())
160            .filter(|p| !p.is_empty())
161            .take(MAX_FILE_PATTERNS)
162            .collect();
163        if file_patterns.is_empty() {
164            return Err(CandidateError::InvalidFilePatterns);
165        }
166        let gate_model = gate_model.trim().to_owned();
167        if gate_model.is_empty() {
168            return Err(CandidateError::MissingGateModel);
169        }
170        let gate_verdict = gate_verdict.trim().to_owned();
171        if !is_valid_verdict(&gate_verdict) {
172            return Err(CandidateError::InvalidGateVerdict);
173        }
174
175        let content_hash = compute_content_hash(&source_repo, &title, &body);
176
177        Ok(Self {
178            session_id,
179            ts_ms,
180            source_repo,
181            title,
182            body,
183            file_patterns,
184            gate_model,
185            gate_verdict,
186            content_hash,
187            origin: ORIGIN.to_owned(),
188            requires_human_approval: true,
189        })
190    }
191
192    /// Re-validate an existing candidate. Used by the outbox
193    /// dispatcher before posting so a corrupted row (e.g. tampered
194    /// payload, wrong-origin row migrated in) never reaches the
195    /// cloud endpoint.
196    pub fn validate(&self) -> Result<(), CandidateError> {
197        if self.session_id.trim().is_empty() {
198            return Err(CandidateError::MissingSessionId);
199        }
200        if self.source_repo.trim().is_empty() {
201            return Err(CandidateError::MissingSourceRepo);
202        }
203        if self.title.is_empty() || self.title.chars().count() > TITLE_MAX_CHARS {
204            return Err(CandidateError::InvalidTitle);
205        }
206        if self.body.is_empty() || self.body.chars().count() > BODY_MAX_CHARS {
207            return Err(CandidateError::InvalidBody);
208        }
209        if self.file_patterns.is_empty() || self.file_patterns.len() > MAX_FILE_PATTERNS {
210            return Err(CandidateError::InvalidFilePatterns);
211        }
212        if self.gate_model.trim().is_empty() {
213            return Err(CandidateError::MissingGateModel);
214        }
215        if !is_valid_verdict(&self.gate_verdict) {
216            return Err(CandidateError::InvalidGateVerdict);
217        }
218        if self.origin != ORIGIN {
219            return Err(CandidateError::WrongOrigin);
220        }
221        if !self.requires_human_approval {
222            return Err(CandidateError::NotDraft);
223        }
224        Ok(())
225    }
226}
227
/// Input bundle accepted by [`SessionMinedCandidate::try_new`]. Kept
/// as a struct with named fields (rather than a long positional
/// argument list) so the six `String` inputs cannot be silently
/// swapped at a call site.
///
/// Values are raw gate / hook output: `try_new` trims, truncates and
/// checks them, so callers do not need to pre-normalise anything.
///
/// NOTE(review): the struct does not derive `Default`, so adding a
/// field still requires touching every struct-literal call site.
#[derive(Debug, Clone)]
pub struct SessionMinedCandidateArgs {
    pub session_id: String,
    pub ts_ms: i64,
    pub source_repo: String,
    pub title: String,
    pub body: String,
    pub file_patterns: Vec<String>,
    pub gate_model: String,
    pub gate_verdict: String,
}
242
/// Returns `true` when `verdict` is exactly `"KEEP"`, or `"MERGE:"`
/// followed by an id that is not blank after trimming. The match is
/// case-sensitive and the verdict itself is not trimmed here.
fn is_valid_verdict(verdict: &str) -> bool {
    match verdict.strip_prefix("MERGE:") {
        // A MERGE verdict is only usable if it names a rule to extend.
        Some(rule_id) => !rule_id.trim().is_empty(),
        None => verdict == "KEEP",
    }
}
252
253/// `sha256(source_repo|title|body)[:16]` as lowercase hex. Mirrors
254/// the 16-char convention used by `Observation::content_hash` and
255/// `remember_rule` so cloud-side dedup logic doesn't need a separate
256/// hash family.
257pub fn compute_content_hash(source_repo: &str, title: &str, body: &str) -> String {
258    let mut hasher = Sha256::new();
259    hasher.update(source_repo.as_bytes());
260    hasher.update(b"|");
261    hasher.update(title.as_bytes());
262    hasher.update(b"|");
263    hasher.update(body.as_bytes());
264    let digest = hasher.finalize();
265    let mut hex = String::with_capacity(16);
266    for byte in digest.iter().take(8) {
267        hex.push_str(&format!("{byte:02x}"));
268    }
269    hex
270}
271
/// Returns `s` limited to at most `max_chars` Unicode scalar values.
///
/// Strings that already fit are returned unchanged. Longer strings
/// are cut to `max_chars - 1` chars and finished with a single `…`,
/// so the result is exactly `max_chars` chars long. With
/// `max_chars == 0` there is no room for the ellipsis either, so the
/// result is the empty string.
fn truncate_chars(s: &str, max_chars: usize) -> String {
    // Only the existence of a char at index `max_chars` matters, so
    // stop scanning there instead of counting the whole string.
    if s.chars().nth(max_chars).is_none() {
        return s.to_owned();
    }
    if max_chars == 0 {
        return String::new();
    }
    let mut out: String = s.chars().take(max_chars - 1).collect();
    out.push('…');
    out
}
280
#[cfg(test)]
mod tests {
    use super::*;

    /// Baseline arguments that `try_new` accepts; each test mutates
    /// only the field it is exercising.
    fn args() -> SessionMinedCandidateArgs {
        SessionMinedCandidateArgs {
            session_id: "sess_test".to_owned(),
            ts_ms: 1_714_000_000_000,
            source_repo: "owner/repo".to_owned(),
            title: "Prefer typed deserialization over Value::as_str".to_owned(),
            body: "When parsing oRPC payloads, deserialize into a concrete struct \
                   instead of walking serde_json::Value with as_str()."
                .to_owned(),
            file_patterns: vec!["src/**/*.rs".to_owned()],
            gate_model: "claude:haiku".to_owned(),
            gate_verdict: "KEEP".to_owned(),
        }
    }

    #[test]
    fn try_new_sets_origin_and_draft_flag_unconditionally() {
        let cand = SessionMinedCandidate::try_new(args()).expect("valid");
        assert_eq!(cand.origin, ORIGIN);
        assert!(
            cand.requires_human_approval,
            "session-mined candidates must default to draft"
        );
    }

    #[test]
    fn try_new_rejects_missing_source_repo() {
        // Scopeless candidates are dropped, never enqueued.
        for blank in ["", "  \t"] {
            let mut a = args();
            a.source_repo = blank.to_owned();
            let err = SessionMinedCandidate::try_new(a).unwrap_err();
            assert_eq!(err, CandidateError::MissingSourceRepo, "source_repo='{blank}'");
        }
    }

    #[test]
    fn try_new_rejects_missing_session_id() {
        for blank in ["", "   "] {
            let mut a = args();
            a.session_id = blank.to_owned();
            let err = SessionMinedCandidate::try_new(a).unwrap_err();
            assert_eq!(err, CandidateError::MissingSessionId, "session_id='{blank}'");
        }
    }

    #[test]
    fn try_new_rejects_blank_title_and_body() {
        let mut a = args();
        a.title = "   ".to_owned();
        let err = SessionMinedCandidate::try_new(a).unwrap_err();
        assert_eq!(err, CandidateError::InvalidTitle);

        let mut a = args();
        a.body = String::new();
        let err = SessionMinedCandidate::try_new(a).unwrap_err();
        assert_eq!(err, CandidateError::InvalidBody);
    }

    #[test]
    fn try_new_rejects_missing_gate_model() {
        let mut a = args();
        a.gate_model = "  ".to_owned();
        let err = SessionMinedCandidate::try_new(a).unwrap_err();
        assert_eq!(err, CandidateError::MissingGateModel);
    }

    #[test]
    fn try_new_rejects_empty_file_patterns() {
        let mut a = args();
        a.file_patterns = vec![];
        let err = SessionMinedCandidate::try_new(a).unwrap_err();
        assert_eq!(err, CandidateError::InvalidFilePatterns);

        // Blank entries are dropped, so a list of only blanks is empty.
        let mut a = args();
        a.file_patterns = vec!["   ".to_owned(), String::new()];
        let err = SessionMinedCandidate::try_new(a).unwrap_err();
        assert_eq!(err, CandidateError::InvalidFilePatterns);
    }

    #[test]
    fn try_new_caps_file_patterns_at_three() {
        let mut a = args();
        a.file_patterns = vec![
            "a.rs".to_owned(),
            "b.rs".to_owned(),
            "c.rs".to_owned(),
            "d.rs".to_owned(),
        ];
        let cand = SessionMinedCandidate::try_new(a).expect("valid");
        assert_eq!(cand.file_patterns.len(), MAX_FILE_PATTERNS);
        // The cap keeps the leading patterns, in their original order.
        assert_eq!(cand.file_patterns, ["a.rs", "b.rs", "c.rs"]);
    }

    #[test]
    fn try_new_validates_verdict_shape() {
        for bad in ["", "MERGE:", "MERGE:   ", "merge:abc", "REJECT", "merge"] {
            let mut a = args();
            a.gate_verdict = bad.to_owned();
            let err = SessionMinedCandidate::try_new(a).unwrap_err();
            assert_eq!(err, CandidateError::InvalidGateVerdict, "verdict='{bad}'");
        }
        for ok in ["KEEP", "MERGE:rule-123"] {
            let mut a = args();
            a.gate_verdict = ok.to_owned();
            SessionMinedCandidate::try_new(a).expect("verdict must be accepted");
        }

        // Padding around the verdict is stripped before it is stored.
        let mut a = args();
        a.gate_verdict = "  KEEP\n".to_owned();
        let cand = SessionMinedCandidate::try_new(a).expect("padded verdict is trimmed");
        assert_eq!(cand.gate_verdict, "KEEP");
    }

    #[test]
    fn try_new_truncates_oversize_title_and_body() {
        let mut a = args();
        a.title = "x".repeat(TITLE_MAX_CHARS + 50);
        a.body = "y".repeat(BODY_MAX_CHARS + 100);
        let cand = SessionMinedCandidate::try_new(a).expect("valid");
        // Exact-length checks: an upper bound alone would also pass
        // for an implementation that truncated far too aggressively.
        assert_eq!(cand.title.chars().count(), TITLE_MAX_CHARS);
        assert_eq!(cand.body.chars().count(), BODY_MAX_CHARS);
        assert!(cand.title.ends_with('…'));
        assert!(cand.body.ends_with('…'));
        cand.validate().expect("truncated candidate must re-validate");
    }

    #[test]
    fn try_new_counts_chars_not_bytes_and_keeps_exact_fit() {
        // A title exactly at the cap is left untouched.
        let exact = "x".repeat(TITLE_MAX_CHARS);
        let mut a = args();
        a.title = exact.clone();
        let cand = SessionMinedCandidate::try_new(a).expect("valid");
        assert_eq!(cand.title, exact);

        // Multi-byte chars are budgeted per char, not per byte.
        let mut a = args();
        a.title = "é".repeat(TITLE_MAX_CHARS + 1);
        let cand = SessionMinedCandidate::try_new(a).expect("valid");
        assert_eq!(cand.title.chars().count(), TITLE_MAX_CHARS);
    }

    #[test]
    fn content_hash_is_stable_and_input_sensitive() {
        let a = SessionMinedCandidate::try_new(args()).unwrap();
        let b = SessionMinedCandidate::try_new(args()).unwrap();
        assert_eq!(a.content_hash, b.content_hash);
        assert_eq!(a.content_hash.len(), 16);
        assert!(a
            .content_hash
            .chars()
            .all(|c| matches!(c, '0'..='9' | 'a'..='f')));
        assert_eq!(
            a.content_hash,
            compute_content_hash(&a.source_repo, &a.title, &a.body)
        );

        let mut other = args();
        other.title = "Different rule".to_owned();
        let c = SessionMinedCandidate::try_new(other).unwrap();
        assert_ne!(a.content_hash, c.content_hash);

        let mut other = args();
        other.source_repo = "owner/other-repo".to_owned();
        let d = SessionMinedCandidate::try_new(other).unwrap();
        assert_ne!(a.content_hash, d.content_hash);

        let mut other = args();
        other.body = "A different body.".to_owned();
        let e = SessionMinedCandidate::try_new(other).unwrap();
        assert_ne!(a.content_hash, e.content_hash);
    }

    #[test]
    fn validate_rejects_tampered_origin_or_cleared_draft_flag() {
        let cand = SessionMinedCandidate::try_new(args()).unwrap();
        cand.validate().unwrap();

        let mut tampered = cand.clone();
        tampered.origin = "remember_rule".to_owned();
        assert_eq!(
            tampered.validate().unwrap_err(),
            CandidateError::WrongOrigin
        );

        let mut leaked = cand;
        leaked.requires_human_approval = false;
        assert_eq!(leaked.validate().unwrap_err(), CandidateError::NotDraft);
    }

    #[test]
    fn validate_rejects_oversize_fields() {
        // `validate` never repairs: anything past the caps is an error.
        let cand = SessionMinedCandidate::try_new(args()).unwrap();

        let mut long_title = cand.clone();
        long_title.title = "x".repeat(TITLE_MAX_CHARS + 1);
        assert_eq!(
            long_title.validate().unwrap_err(),
            CandidateError::InvalidTitle
        );

        let mut long_body = cand.clone();
        long_body.body = "y".repeat(BODY_MAX_CHARS + 1);
        assert_eq!(
            long_body.validate().unwrap_err(),
            CandidateError::InvalidBody
        );

        let mut crowded = cand;
        crowded.file_patterns = vec!["a.rs".to_owned(); MAX_FILE_PATTERNS + 1];
        assert_eq!(
            crowded.validate().unwrap_err(),
            CandidateError::InvalidFilePatterns
        );
    }

    #[test]
    fn wire_shape_serializes_with_snake_case_keys() {
        // Lock the snake_case wire contract used by the cloud endpoint.
        let cand = SessionMinedCandidate::try_new(args()).unwrap();
        let value = serde_json::to_value(&cand).expect("serialize");
        for required in [
            "session_id",
            "ts_ms",
            "source_repo",
            "title",
            "body",
            "file_patterns",
            "gate_model",
            "gate_verdict",
            "content_hash",
            "origin",
            "requires_human_approval",
        ] {
            assert!(value.get(required).is_some(), "missing field: {required}");
        }
        assert_eq!(value["requires_human_approval"], true);
        assert_eq!(value["origin"], ORIGIN);

        // The outbox reads rows back, so the payload must round-trip.
        let round_trip: SessionMinedCandidate =
            serde_json::from_value(value).expect("deserialize");
        assert_eq!(round_trip, cand);
    }
}