Skip to main content

mati_core/analysis/
onboarding.rs

1//! Onboarding import (idea 2.2) — propose gotcha *candidates* by mining
2//! artifacts that already exist in a repo: CODEOWNERS ownership rules and
3//! load-bearing / security marker comments.
4//!
5//! Each candidate is a `confirmed: false` [`GotchaRecord`] stub
6//! (`RecordSource::Import`) that surfaces in `mati review` for a developer to
7//! approve — turning the blank-slate "confirm your gotchas" step into "here are
8//! N candidates we found." This module is **pure**: parsing and record
9//! construction take string content and emit [`Record`]s; file discovery and
10//! store I/O live in the `mati suggest` CLI command.
11
12use uuid::Uuid;
13
14use crate::store::record::{
15    Category, ConfidenceScore, GotchaRecord, Priority, QualityScore, QualityTier, Record,
16    RecordSource,
17};
18
/// Comment markers treated as strong, unambiguous "load-bearing" or security
/// signals. The table is kept intentionally small — noisy tags such as
/// `TODO`, `FIXME` and `HACK` are excluded so candidates stay high quality.
///
/// Entries are upper-case because scanning compares against an upper-cased
/// copy of each line. Order matters: the first entry found on a line is the
/// one reported.
const MARKERS: &[&str] = &[
    // "Hands off" edit guards.
    "DO NOT REMOVE",
    "DO NOT EDIT",
    "DO NOT MODIFY",
    "DO NOT DELETE",
    // Security annotations.
    "SECURITY:",
    "SECURITY-CRITICAL",
];
29
/// Longest line (in bytes) that is still scanned for markers. Anything longer
/// is assumed to be minified or generated and is skipped to limit false
/// positives.
const MAX_LINE_LEN: usize = 400;

/// Upper bound on the number of marker candidates produced, so that a large
/// repository cannot flood `mati review`.
pub const MAX_MARKER_CANDIDATES: usize = 200;
35
36// ── CODEOWNERS ────────────────────────────────────────────────────────────────
37
/// One ownership entry read from a CODEOWNERS file.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct OwnerRule {
    /// Path pattern the entry applies to, stored verbatim.
    pub pattern: String,
    /// Owners listed after the pattern, in file order.
    pub owners: Vec<String>,
}
44
45/// Parse CODEOWNERS content into `(pattern, owners)` rules. Ignores comments
46/// (`#`) and blank lines; a valid line is `<pattern> <owner...>` with ≥1 owner.
47pub fn parse_codeowners(content: &str) -> Vec<OwnerRule> {
48    let mut rules = Vec::new();
49    for raw in content.lines() {
50        let line = raw.split('#').next().unwrap_or("").trim();
51        if line.is_empty() {
52            continue;
53        }
54        let mut parts = line.split_whitespace();
55        let Some(pattern) = parts.next() else {
56            continue;
57        };
58        let owners: Vec<String> = parts.map(str::to_string).collect();
59        if owners.is_empty() {
60            continue;
61        }
62        rules.push(OwnerRule {
63            pattern: pattern.to_string(),
64            owners,
65        });
66    }
67    rules
68}
69
70// ── Marker comments ───────────────────────────────────────────────────────────
71
/// A single occurrence of a load-bearing / security marker in a source file.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct MarkerHit {
    /// Path of the file the marker was found in, as given to the scanner.
    pub path: String,
    /// 1-based line number of the matching line.
    pub line: usize,
    /// The marker that matched, in its canonical upper-case spelling.
    pub marker: String,
    /// The matching line with surrounding whitespace trimmed.
    pub text: String,
}
80
81/// Scan one file's content for load-bearing / security markers (case-insensitive).
82pub fn scan_markers(path: &str, content: &str) -> Vec<MarkerHit> {
83    let mut hits = Vec::new();
84    for (i, raw) in content.lines().enumerate() {
85        if raw.len() > MAX_LINE_LEN {
86            continue;
87        }
88        let upper = raw.to_uppercase();
89        if let Some(marker) = MARKERS.iter().find(|m| upper.contains(**m)) {
90            hits.push(MarkerHit {
91                path: path.to_string(),
92                line: i + 1,
93                marker: (*marker).to_string(),
94                text: raw.trim().to_string(),
95            });
96        }
97    }
98    hits
99}
100
101// ── Candidate record construction ─────────────────────────────────────────────
102
103/// Build one `confirmed: false` gotcha candidate Record. Mirrors the Layer-0
104/// stub pattern used by `init`'s git-signal candidates.
105#[allow(clippy::too_many_arguments)]
106fn candidate_record(
107    key: String,
108    rule: String,
109    reason: String,
110    severity: Priority,
111    affected_files: Vec<String>,
112    tags: Vec<String>,
113    device_id: Uuid,
114    logical_clock: u64,
115    now: u64,
116) -> Record {
117    let gotcha = GotchaRecord {
118        rule: rule.clone(),
119        reason,
120        severity: severity.clone(),
121        affected_files,
122        ref_url: None,
123        discovered_session: now,
124        confirmed: false,
125    };
126    let mut rec = Record::layer0_file_stub(&key, device_id, logical_clock, now);
127    rec.category = Category::Gotcha;
128    rec.source = RecordSource::Import;
129    rec.priority = severity;
130    rec.value = rule;
131    rec.quality = QualityScore {
132        value: 0.50,
133        tier: QualityTier::Acceptable,
134        signals: vec![],
135        computed_at: now,
136    };
137    // `for_new_record(Import)` sits below the 0.80 "confirmed" floor, so the
138    // stub stays a candidate until a developer confirms it.
139    rec.confidence = ConfidenceScore::for_new_record(&RecordSource::Import);
140    rec.tags = tags;
141    rec.payload = serde_json::to_value(&gotcha).ok();
142    rec
143}
144
145/// Candidate records from CODEOWNERS rules (ownership coordination gotchas).
146pub fn codeowners_candidates(
147    rules: &[OwnerRule],
148    device_id: Uuid,
149    clock_start: u64,
150    now: u64,
151) -> Vec<Record> {
152    rules
153        .iter()
154        .enumerate()
155        .map(|(i, r)| {
156            let owners = r.owners.join(", ");
157            let rule = format!(
158                "`{}` is owned by {} (CODEOWNERS) — coordinate changes with them.",
159                r.pattern, owners
160            );
161            let reason = format!("Listed in CODEOWNERS: {} → {}.", r.pattern, owners);
162            let key = format!("gotcha:codeowners:{}", r.pattern);
163            candidate_record(
164                key,
165                rule,
166                reason,
167                Priority::Normal,
168                vec![r.pattern.clone()],
169                vec!["codeowners".into(), "auto-generated".into()],
170                device_id,
171                clock_start + i as u64,
172                now,
173            )
174        })
175        .collect()
176}
177
178/// Candidate records from marker hits (capped at [`MAX_MARKER_CANDIDATES`]).
179pub fn marker_candidates(
180    hits: &[MarkerHit],
181    device_id: Uuid,
182    clock_start: u64,
183    now: u64,
184) -> Vec<Record> {
185    hits.iter()
186        .take(MAX_MARKER_CANDIDATES)
187        .enumerate()
188        .map(|(i, h)| {
189            let rule = format!(
190                "`{}` carries a `{}` marker at line {} — preserve it through edits.",
191                h.path, h.marker, h.line
192            );
193            let reason = format!("Developer marker in source: {}", h.text);
194            let key = format!("gotcha:marker:{}:{}", h.path, h.line);
195            // Load-bearing / security markers are high severity by definition.
196            candidate_record(
197                key,
198                rule,
199                reason,
200                Priority::High,
201                vec![h.path.clone()],
202                vec!["code-marker".into(), "auto-generated".into()],
203                device_id,
204                clock_start + i as u64,
205                now,
206            )
207        })
208        .collect()
209}
210
211/// Build all onboarding candidates from already-read artifact content. Pure:
212/// `codeowners` is the CODEOWNERS file content (if found) and `files` is a list
213/// of `(repo-relative path, content)` pairs to scan for markers.
214pub fn build_candidates(
215    codeowners: Option<&str>,
216    files: &[(String, String)],
217    device_id: Uuid,
218    clock_start: u64,
219    now: u64,
220) -> Vec<Record> {
221    let mut out = Vec::new();
222    let mut clock = clock_start;
223
224    if let Some(content) = codeowners {
225        let rules = parse_codeowners(content);
226        let recs = codeowners_candidates(&rules, device_id, clock, now);
227        clock += recs.len() as u64;
228        out.extend(recs);
229    }
230
231    let mut hits = Vec::new();
232    for (path, content) in files {
233        hits.extend(scan_markers(path, content));
234    }
235    out.extend(marker_candidates(&hits, device_id, clock, now));
236
237    out
238}
239
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixed device id shared by every test.
    fn dev() -> Uuid {
        Uuid::nil()
    }

    /// True when `rec` is an imported gotcha whose payload decodes to a
    /// [`GotchaRecord`] that is still unconfirmed.
    fn is_unconfirmed_gotcha(rec: &Record) -> bool {
        if rec.category != Category::Gotcha || rec.source != RecordSource::Import {
            return false;
        }
        let Some(payload) = rec.payload.as_ref() else {
            return false;
        };
        match serde_json::from_value::<GotchaRecord>(payload.clone()) {
            Ok(gotcha) => !gotcha.confirmed,
            Err(_) => false,
        }
    }

    #[test]
    fn parse_codeowners_ignores_comments_and_blank_and_ownerless() {
        let content = concat!(
            "# comment\n",
            "\n",
            "src/payments/** @pay-team @alice\n",
            "docs/   # trailing comment\n",
            "*.rs @rustfolk\n",
        );

        let rules = parse_codeowners(content);
        assert_eq!(rules.len(), 2, "ownerless `docs/` line is skipped");

        let (payments, rust) = (&rules[0], &rules[1]);
        assert_eq!(payments.pattern, "src/payments/**");
        assert_eq!(payments.owners, vec!["@pay-team", "@alice"]);
        assert_eq!(rust.pattern, "*.rs");
    }

    #[test]
    fn scan_markers_is_case_insensitive_and_skips_long_lines() {
        let content = concat!(
            "let x = 1;\n",
            "// do not remove: load-bearing init order\n",
            "// SECURITY: validate before deref\n",
            "let normal = 2;\n",
        );

        let hits = scan_markers("src/lib.rs", content);
        assert_eq!(hits.len(), 2);

        let (first, second) = (&hits[0], &hits[1]);
        assert_eq!(first.marker, "DO NOT REMOVE");
        assert_eq!(first.line, 2);
        assert_eq!(second.marker, "SECURITY:");

        // Over-long (minified) lines are skipped.
        let long = format!("// DO NOT REMOVE {}", "x".repeat(MAX_LINE_LEN));
        assert!(scan_markers("min.js", &long).is_empty());
    }

    #[test]
    fn codeowners_candidates_are_unconfirmed_gotchas_keyed_by_pattern() {
        let rules = parse_codeowners("src/payments/** @pay-team\n");
        let recs = codeowners_candidates(&rules, dev(), 0, 100);
        assert_eq!(recs.len(), 1);

        let only = &recs[0];
        assert!(is_unconfirmed_gotcha(only));
        assert_eq!(only.key, "gotcha:codeowners:src/payments/**");

        let payload = only.payload.clone().unwrap();
        let gotcha: GotchaRecord = serde_json::from_value(payload).unwrap();
        assert_eq!(gotcha.affected_files, vec!["src/payments/**"]);
        assert!(!gotcha.confirmed);
    }

    #[test]
    fn marker_candidates_cap_and_key_format() {
        // Build more hits than the cap.
        let total = MAX_MARKER_CANDIDATES + 50;
        let mut hits = Vec::with_capacity(total);
        for i in 0..total {
            hits.push(MarkerHit {
                path: format!("src/f{i}.rs"),
                line: i + 1,
                marker: "DO NOT REMOVE".into(),
                text: "// DO NOT REMOVE".into(),
            });
        }

        let recs = marker_candidates(&hits, dev(), 0, 100);
        assert_eq!(recs.len(), MAX_MARKER_CANDIDATES, "capped");

        let first = &recs[0];
        assert_eq!(first.key, "gotcha:marker:src/f0.rs:1");
        assert_eq!(first.priority, Priority::High);
        assert!(is_unconfirmed_gotcha(first));
    }

    #[test]
    fn build_candidates_combines_both_sources() {
        let files = vec![(
            String::from("src/auth.rs"),
            String::from("// SECURITY: constant-time compare\n"),
        )];

        let recs = build_candidates(Some("src/** @team\n"), &files, dev(), 0, 100);
        assert_eq!(recs.len(), 2);
        assert!(recs.iter().all(is_unconfirmed_gotcha));

        let has_prefix = |prefix: &str| recs.iter().any(|r| r.key.starts_with(prefix));
        assert!(has_prefix("gotcha:codeowners:"));
        assert!(has_prefix("gotcha:marker:"));

        // Logical clocks are distinct (no collisions across sources).
        let distinct: std::collections::HashSet<u64> =
            recs.iter().map(|r| r.version.logical_clock).collect();
        assert_eq!(distinct.len(), recs.len());
    }
}