Skip to main content

vigil_redaction/
merge.rs

1//! ADR 0013 — T0 模型 × 硬指纹 findings 合并决策层(纯函数)。
2//!
3//! 语义(详见 `docs/adr/0013-hardfp-model-merge.md`):
4//!
5//! - **D1** Hard 优先(fast-path):调用方先跑 Hard 再跑 Model,两者结果送本 merge
6//! - **D3** Span 重叠 → Hard 赢(丢 Model)
7//! - **D4** 不重复加权:同 span 冲突时 Model 的 `risk_delta` 随 finding 一起被 drop
8//! - **D5** 非重叠 → 两者都保留(互补覆盖)
9//! - **不变量**:输出按 `span.start` 升序;纯函数不改动输入
10//!
11//! 本模块**只负责 merge 决策**;Hard detect / Model 推理 / risk_delta 累加策略由 caller
12//! 决定(ISS-005 scaffold + ISS-010 firewall preflight 消费者)。
13//!
14//! 类型是 minimal + 保守:`kind` 用 `&'static str` 字面量,避免提前锁死 FindingKind 枚举
15//! 形态(ISS-005 真正扩 API 时可平滑升级;现有字符串字面量规则集见 `crates/vigil-redaction/src/lib.rs`
16//! HARD_RULES)。
17
18#![allow(missing_docs)] // 本模块是 ISS-005 scaffold 前置;完整 rustdoc 由 ISS-005 补
19
/// Classifies which detection layer produced a finding.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum FindingSource {
    /// Regex / structured hard-fingerprint rules (the v0.3 hard rule set). Described by the
    /// original author as high precision and sub-millisecond.
    ///
    /// NOTE(review): the original comment counts 14 `HARD_RULES` entries, while the tests in
    /// this file describe `HARD_RULES` as 12 secret-class rules plus `email` /
    /// `internal_ipv4` coming from `ALL_RULES` — confirm which wording is current.
    Hard,
    /// Output of the OpenAI Privacy Filter model (8 label classes). Described by the original
    /// author as high recall, at roughly 400-630 ms on CPU.
    Model,
}
28
29/// 统一 finding 结构;Hard 和 Model 使用同一类型,merge 后 caller 按 `source` 区分
30/// 需要时的差异化处理(如审计展现 / risk 加权)。
31#[derive(Debug, Clone, PartialEq)]
32pub struct Finding {
33    /// label 字面量(Hard 侧:见 HARD_RULES `name`;Model 侧:`private_*` / `secret` / `account_number`)
34    pub kind: &'static str,
35    /// 来源层
36    pub source: FindingSource,
37    /// byte 区间 `[start, end)`(UTF-8 offset,与 tokenizer offsets 对齐)
38    pub span: (usize, usize),
39    /// 置信度 [0.0, 1.0];Hard 总为 1.0(正则命中即确定);Model 为 softmax
40    pub confidence: f32,
41    /// 风险加权基础值(ADR 0012 §1.3 风险分级);merge 后 caller 累加
42    pub risk_delta: u32,
43}
44
45impl Finding {
46    /// 硬指纹 finding 构造辅助(confidence 固定 1.0)
47    pub fn hard(kind: &'static str, span: (usize, usize), risk_delta: u32) -> Self {
48        Self {
49            kind,
50            source: FindingSource::Hard,
51            span,
52            confidence: 1.0,
53            risk_delta,
54        }
55    }
56
57    /// Model finding 构造辅助
58    pub fn model(
59        kind: &'static str,
60        span: (usize, usize),
61        confidence: f32,
62        risk_delta: u32,
63    ) -> Self {
64        Self {
65            kind,
66            source: FindingSource::Model,
67            span,
68            confidence,
69            risk_delta,
70        }
71    }
72}
73
/// Strict overlap test for two half-open spans `[start, end)`.
///
/// Returns `true` when each span starts strictly before the other one ends. Spans that
/// merely touch (`a_end == b_start`) are **not** overlapping, so adjacent findings can both
/// be kept.
///
/// NOTE(review): only the endpoint comparisons are evaluated; the spans are not checked for
/// being non-empty or well-ordered. An empty span lying strictly inside the other span is
/// therefore reported as overlapping.
#[inline]
fn spans_overlap(a: (usize, usize), b: (usize, usize)) -> bool {
    let (a_start, a_end) = a;
    let (b_start, b_end) = b;
    // Disjoint means one span begins at or after the other's end; overlap is the negation.
    let disjoint = a_start >= b_end || b_start >= a_end;
    !disjoint
}
80
81/// ADR 0013 核心 merge 函数。
82///
83/// **契约**:
84/// - Hard findings 全保留(D1)
85/// - Model findings 与**任何** Hard finding 重叠即丢弃(D3 + D4)
86/// - 非重叠 Model findings 保留(D5)
87/// - 结果按 `span.0` 升序;同 start 保持"Hard 先于 Model"(稳定排序)
88/// - 纯函数,不改动输入
89///
90/// caller(ISS-005 scan_text / ISS-010 preflight)可按 `source` 做差异化审计展现
91/// 或按 `risk_delta` 累加得总 risk score。
92pub fn merge_findings(hard: &[Finding], model: &[Finding]) -> Vec<Finding> {
93    let mut out: Vec<Finding> = Vec::with_capacity(hard.len() + model.len());
94    // Hard 全收(D1)
95    out.extend(hard.iter().cloned());
96    // Model 逐条检查 overlap(D3)
97    for m in model {
98        let overlapped = hard.iter().any(|h| spans_overlap(h.span, m.span));
99        if !overlapped {
100            out.push(m.clone());
101        }
102    }
103    // 稳定按 span.start 升序(D5 表现要求);sort_by 稳定,同 start 保 Hard 在前
104    out.sort_by_key(|f| f.span.0);
105    out
106}
107
108#[cfg(test)]
109mod tests {
110    use super::*;
111
112    // 辅助 —— 简化构造
113    fn h(kind: &'static str, start: usize, end: usize, risk: u32) -> Finding {
114        Finding::hard(kind, (start, end), risk)
115    }
116    fn m(kind: &'static str, start: usize, end: usize, conf: f32, risk: u32) -> Finding {
117        Finding::model(kind, (start, end), conf, risk)
118    }
119
120    // ──────────────────────────── 1. 空输入 ────────────────────────────
121    #[test]
122    fn merge_empty_both() {
123        assert_eq!(merge_findings(&[], &[]), vec![]);
124    }
125
126    // ──────────────────────────── 2. 仅 Hard ────────────────────────────
127    #[test]
128    fn merge_hard_only() {
129        let hard = vec![h("email", 10, 30, 10), h("aws_access_key_id", 50, 70, 25)];
130        let merged = merge_findings(&hard, &[]);
131        assert_eq!(merged, hard, "Hard findings 应按 span.start 升序保留");
132    }
133
134    // ──────────────────────────── 3. 仅 Model ────────────────────────────
135    #[test]
136    fn merge_model_only() {
137        let model = vec![
138            m("private_person", 0, 13, 0.99, 5),
139            m("private_date", 20, 30, 0.98, 5),
140        ];
141        let merged = merge_findings(&[], &model);
142        assert_eq!(merged, model);
143    }
144
145    // ──────────────────────────── 4. 非重叠:两侧共存 ────────────────────────────
146    #[test]
147    fn merge_non_overlapping_both_kept() {
148        // Hard 命中 email [73..109];Model 命中 person [0..13] / date [26..36]
149        let hard = vec![h("email", 73, 109, 10)];
150        let model = vec![
151            m("private_person", 0, 13, 0.99, 5),
152            m("private_date", 26, 36, 0.98, 5),
153        ];
154        let merged = merge_findings(&hard, &model);
155        assert_eq!(merged.len(), 3, "3 条不重叠 finding 应全保留");
156        // 按 span.start 升序:person(0)→ date(26)→ email(73)
157        assert_eq!(merged[0].kind, "private_person");
158        assert_eq!(merged[1].kind, "private_date");
159        assert_eq!(merged[2].kind, "email");
160    }
161
162    // ──────────────────────────── 5. 完全重叠:Hard 赢(D3)────────────────────────────
163    #[test]
164    fn merge_fully_overlapping_hard_wins() {
165        // Hard `email` vs Model `private_email` 同 span
166        let hard = vec![h("email", 73, 109, 10)];
167        let model = vec![m("private_email", 73, 109, 1.0, 10)];
168        let merged = merge_findings(&hard, &model);
169        assert_eq!(merged.len(), 1, "重叠应只留 Hard");
170        assert_eq!(merged[0].kind, "email");
171        assert_eq!(merged[0].source, FindingSource::Hard);
172    }
173
174    // ──────────────────────────── 6. 部分重叠:Model drop ────────────────────────────
175    #[test]
176    fn merge_partially_overlapping_hard_wins() {
177        // Hard [73..109];Model [70..85] 部分重叠前缀
178        let hard = vec![h("email", 73, 109, 10)];
179        let model = vec![m("private_email", 70, 85, 0.9, 10)];
180        let merged = merge_findings(&hard, &model);
181        assert_eq!(merged.len(), 1);
182        assert_eq!(merged[0].source, FindingSource::Hard);
183
184        // 反向:Model [100..120] 部分重叠后缀
185        let model2 = vec![m("private_email", 100, 120, 0.9, 10)];
186        let merged2 = merge_findings(&hard, &model2);
187        assert_eq!(merged2.len(), 1);
188        assert_eq!(merged2[0].source, FindingSource::Hard);
189
190        // 反向:Model [70..120] 完全包含 Hard
191        let model3 = vec![m("private_email", 70, 120, 0.9, 10)];
192        let merged3 = merge_findings(&hard, &model3);
193        assert_eq!(merged3.len(), 1);
194        assert_eq!(merged3[0].source, FindingSource::Hard);
195    }
196
197    // ──────────────────────────── 7. 相邻不重叠:两侧保留 ────────────────────────────
198    #[test]
199    fn merge_adjacent_not_overlap() {
200        // Hard [10..20];Model [20..30](端点相邻)
201        let hard = vec![h("email", 10, 20, 10)];
202        let model = vec![m("private_person", 20, 30, 0.9, 5)];
203        let merged = merge_findings(&hard, &model);
204        assert_eq!(
205            merged.len(),
206            2,
207            "相邻 span 两者都保留(spans_overlap 严格 strict-less)"
208        );
209        assert_eq!(merged[0].kind, "email");
210        assert_eq!(merged[1].kind, "private_person");
211    }
212
213    // ──────────────────────────── 8. risk_delta 不双倍(D4)────────────────────────────
214    #[test]
215    fn merge_no_double_weighting_on_overlap() {
216        // 同 span 重叠:只计 Hard.risk,不加 Model.risk
217        let hard = vec![h("email", 73, 109, 10)];
218        let model = vec![m("private_email", 73, 109, 1.0, 10)];
219        let merged = merge_findings(&hard, &model);
220        let total: u32 = merged.iter().map(|f| f.risk_delta).sum();
221        assert_eq!(
222            total, 10,
223            "重叠时 risk 只计 Hard 一次,不应 Hard+Model 双加为 20"
224        );
225
226        // 对照:非重叠时两者都计
227        let model2 = vec![m("private_email", 200, 220, 1.0, 10)];
228        let merged2 = merge_findings(&hard, &model2);
229        let total2: u32 = merged2.iter().map(|f| f.risk_delta).sum();
230        assert_eq!(total2, 20, "非重叠时 Hard + Model 正常累加");
231    }
232
233    // ──────────────────────────── 9. 综合场景(ISS-022 medium 实际样本)────────────────────────────
234    #[test]
235    fn merge_iss_022_medium_sample_scenario() {
236        // 模拟 ISS-022 medium 样本(文档 §1.3)的 merge 结果:
237        //   Hard:  email [73..109]
238        //   Model: private_person [0..13], private_date [26..36],
239        //          private_person [45..70],
240        //          private_email [73..109]  ← 与 Hard 冲突,应丢
241        //          private_phone [117..135],
242        //          private_address [157..201]
243        let hard = vec![h("email", 73, 109, 10)];
244        let model = vec![
245            m("private_person", 0, 13, 0.99, 5),
246            m("private_date", 26, 36, 0.98, 5),
247            m("private_person", 45, 70, 0.97, 5),
248            m("private_email", 73, 109, 1.0, 10),
249            m("private_phone", 117, 135, 1.0, 10),
250            m("private_address", 157, 201, 0.99, 5),
251        ];
252        let merged = merge_findings(&hard, &model);
253        assert_eq!(
254            merged.len(),
255            6,
256            "合并后 6 条(Hard 1 + Model 5,private_email drop)"
257        );
258        // 校验 private_email 被丢
259        assert!(!merged.iter().any(|f| f.kind == "private_email"));
260        // 校验 email(Hard)保留
261        assert!(merged
262            .iter()
263            .any(|f| f.kind == "email" && f.source == FindingSource::Hard));
264        // 校验排序
265        let starts: Vec<usize> = merged.iter().map(|f| f.span.0).collect();
266        assert_eq!(starts, vec![0, 26, 45, 73, 117, 157]);
267
268        // risk_delta 合计(按 ADR 0012 §1.3 分级)
269        let total: u32 = merged.iter().map(|f| f.risk_delta).sum();
270        // 5(person) + 5(date) + 5(person) + 10(email,Hard 赢) + 10(phone) + 5(address) = 40
271        assert_eq!(total, 40);
272    }
273
274    // ──────────────────────────── 10. 纯函数纪律:不改动输入 ────────────────────────────
275    #[test]
276    fn merge_does_not_mutate_inputs() {
277        let hard = vec![h("email", 10, 20, 10)];
278        let model = vec![m("private_email", 10, 20, 1.0, 10)];
279        let hard_before = hard.clone();
280        let model_before = model.clone();
281        let _ = merge_findings(&hard, &model);
282        assert_eq!(hard, hard_before);
283        assert_eq!(model, model_before);
284    }
285
286    // ───────── ISS-021:Hard kind × PrivacyLabel × merge 决策 全 kind 矩阵 golden ─────────
287    //
288    // ADR 0013 Revised(D-final-1 / D-final-2)要求把"D3 一刀切"细化为
289    // "每条 Hard rule 的具体 merge 行为 + PrivacyLabel 映射"都锁死。
290    //
291    // 14 个 Hard kind 字面量(与 `vigil-redaction::lib.rs::ALL_RULES.name` 对齐;
292    // 12 secret-类 + email + internal_ipv4)+ 期望 PrivacyLabel:
293    const HARD_KIND_TO_LABEL: &[(&str, crate::PrivacyLabel)] = &[
294        ("aws_access_key_id", crate::PrivacyLabel::Secret),
295        ("github_token", crate::PrivacyLabel::Secret),
296        ("anthropic_api_key", crate::PrivacyLabel::Secret),
297        ("openai_api_key", crate::PrivacyLabel::Secret),
298        ("jwt", crate::PrivacyLabel::Secret),
299        ("pem_private_key", crate::PrivacyLabel::Secret),
300        ("env_assignment", crate::PrivacyLabel::Secret),
301        ("slack_webhook", crate::PrivacyLabel::Secret),
302        ("stripe_secret_key", crate::PrivacyLabel::Secret),
303        ("google_api_key", crate::PrivacyLabel::Secret),
304        ("gitlab_pat", crate::PrivacyLabel::Secret),
305        ("database_url", crate::PrivacyLabel::Secret),
306        ("email", crate::PrivacyLabel::Email),
307        ("internal_ipv4", crate::PrivacyLabel::Url),
308    ];
309
310    /// 为每个 Hard kind 选一个**与其 PrivacyLabel 一致**的 Model 端字面量。
311    /// 选取规则:
312    /// - Hard 落 `Email`  → Model `private_email`(Stage 2 模型典型输出)
313    /// - Hard 落 `Url`    → Model `private_url`
314    /// - Hard 落 `Secret` → Model `secret`(裸 label,Privacy Filter 33-class 之一)
315    ///
316    /// 这样 merge 重叠时,业务上"两边讲的是同一件事",Hard 赢的语义清晰。
317    fn paired_model_kind(hard_kind: &str) -> &'static str {
318        match hard_kind {
319            "email" => "private_email",
320            "internal_ipv4" => "private_url",
321            // 其余 12 secret-类:Model 用裸 `secret`(8 类标签之一)
322            _ => "secret",
323        }
324    }
325
326    /// D-final-2 封闭映射:每个 Hard kind 字面量必须能映射到某个 PrivacyLabel,
327    /// 且映射结果与本 ISS 锁定的 golden 表一致。
328    #[test]
329    fn iss_021_hard_kind_to_privacy_label_golden() {
330        use crate::PrivacyLabel;
331        for (kind, expected) in HARD_KIND_TO_LABEL {
332            assert_eq!(
333                PrivacyLabel::from_kind(kind),
334                Some(*expected),
335                "Hard kind {kind:?} 应映射到 {expected:?}\
336                 (ADR 0013 Revised D-final-2 封闭映射;改字面量需同步 \
337                 vigil-redaction::label.rs::from_kind + 本 golden 表)"
338            );
339        }
340    }
341
342    /// D-final-1 矩阵化:每个 Hard kind 在同 span 重叠时必赢、Model finding 必丢。
343    #[test]
344    fn iss_021_merge_overlap_hard_wins_for_each_kind() {
345        for (kind, _) in HARD_KIND_TO_LABEL {
346            let hard = vec![Finding::hard(kind, (10, 30), 25)];
347            let model = vec![Finding::model(paired_model_kind(kind), (10, 30), 1.0, 25)];
348            let merged = merge_findings(&hard, &model);
349            assert_eq!(
350                merged.len(),
351                1,
352                "Hard kind {kind:?} 同 span 重叠 merge 必去重为 1 条"
353            );
354            assert_eq!(
355                merged[0].source,
356                FindingSource::Hard,
357                "Hard kind {kind:?} 同 span 重叠应 Hard 赢(ADR 0013 D-final-1)"
358            );
359            assert_eq!(merged[0].kind, *kind);
360            // D4 不双倍:risk 只取 Hard 一次
361            assert_eq!(
362                merged[0].risk_delta, 25,
363                "Hard kind {kind:?} 重叠时 risk 只计 Hard 一次,不应 Hard+Model 双加"
364            );
365        }
366    }
367
368    /// D5 矩阵化:每个 Hard kind 与非重叠 Model finding 共存时,两者都保留。
369    #[test]
370    fn iss_021_merge_no_overlap_both_kept_for_each_kind() {
371        for (kind, _) in HARD_KIND_TO_LABEL {
372            let hard = vec![Finding::hard(kind, (10, 30), 25)];
373            let model = vec![Finding::model(paired_model_kind(kind), (50, 70), 1.0, 25)];
374            let merged = merge_findings(&hard, &model);
375            assert_eq!(
376                merged.len(),
377                2,
378                "Hard kind {kind:?} 非重叠 merge 两者都保留(ADR 0013 D5)"
379            );
380            // 升序 by span.start:Hard 在前(10),Model 在后(50)
381            assert_eq!(merged[0].source, FindingSource::Hard);
382            assert_eq!(merged[1].source, FindingSource::Model);
383        }
384    }
385
386    /// 集合守门(R1 NICE 强化):HARD_KIND_TO_LABEL 必须**精确等于**
387    /// `crate::HARD_RULES.name` 集合 + ALL_RULES 独有的 email/internal_ipv4。
388    ///
389    /// 比单纯 `len == 14` 守门**更强**:Codex R1 NICE 指出 len 守门不能抓"加新
390    /// HARD_RULES 但忘了同步 HARD_KIND_TO_LABEL"或"删了某个 HARD_RULES.name 但
391    /// 这里残留"两类漂移。本测试做集合双向 diff,任一侧漂移即指出具体差异。
392    ///
393    /// **覆盖关系**:
394    /// - `HARD_RULES`(crate::pub(crate) 静态)= 12 secret-类 hard rule
395    /// - `ALL_RULES` 独有 = `email` + `internal_ipv4`(redact 路径用,**故意不进**
396    ///   `HARD_RULES` 因为可能误报正常业务文本;但 PrivacyLabel::from_kind 必须
397    ///   认它们,否则 Model 侧产 private_email/private_url 后映射会落空)
398    /// - 总和 = 14,与 vigil-browser FindingKind 12 (LOCAL_ONLY 除外) 的关系由
399    ///   `vigil-browser/tests/rule_sync.rs::iss_021_*` 守门(详见 ADR 0013 Revised
400    ///   跨 crate 不变量表)
401    #[test]
402    fn iss_021_hard_kind_set_size_matches_redaction_rules() {
403        use std::collections::BTreeSet;
404
405        // 本表的 kinds
406        let golden_kinds: BTreeSet<&str> = HARD_KIND_TO_LABEL.iter().map(|(k, _)| *k).collect();
407
408        // 真实 HARD_RULES.name 集合(12 secret-类)+ ALL_RULES 独有的 2 项
409        let mut expected_kinds: BTreeSet<&'static str> =
410            crate::HARD_RULES.iter().map(|r| r.name).collect();
411        expected_kinds.insert("email");
412        expected_kinds.insert("internal_ipv4");
413
414        // 集合双向 diff,任一侧漂移即 fail 并指出具体差异
415        assert_eq!(
416            golden_kinds, expected_kinds,
417            "HARD_KIND_TO_LABEL 与 (HARD_RULES + email/internal_ipv4) 集合漂移;\
418             检查 vigil-redaction lib.rs HARD_RULES 是否新增 / 删除了 hard rule,\
419             以及 ALL_RULES 是否还独有 email/internal_ipv4(若改动需同步本表 + \
420             ADR 0013 Revised 版本史)"
421        );
422
423        // 兜底:精确数量 14(R1 原守门保留,语义冗余但便于回归 triage)
424        assert_eq!(golden_kinds.len(), 14);
425    }
426}