use std::collections::BTreeMap;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use sha2::{Digest, Sha256};
use vigil_audit::{Ledger, NewRedactionFinding, NewRedactionScan};
use vigil_policy::PiiFindingSummary;
use vigil_redaction::{PrivacyLabel, RedactionResult, ScanError};
/// Engine status attached to a scan, as returned by
/// `PiiScanner::scan_with_status` and aggregated by `run_preflight`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum EngineStatusReport {
    /// Converted from `vigil_redaction::EngineStatus::Ok`.
    Ok,
    /// Converted from `vigil_redaction::EngineStatus::DegradedTimeout`.
    DegradedTimeout,
    /// Converted from `vigil_redaction::EngineStatus::DegradedError`.
    DegradedError,
    /// No engine status is available. This is what the default
    /// `PiiScanner::scan_with_status` implementation reports; it has no
    /// `vigil_redaction::EngineStatus` counterpart.
    Unsupported,
}

impl EngineStatusReport {
    /// Returns the fixed, lowercase snake_case identifier of this status.
    pub fn stable_code(&self) -> &'static str {
        match *self {
            Self::Unsupported => "unsupported",
            Self::DegradedError => "degraded_error",
            Self::DegradedTimeout => "degraded_timeout",
            Self::Ok => "ok",
        }
    }
}
impl From<vigil_redaction::EngineStatus> for EngineStatusReport {
fn from(s: vigil_redaction::EngineStatus) -> Self {
match s {
vigil_redaction::EngineStatus::Ok => Self::Ok,
vigil_redaction::EngineStatus::DegradedTimeout => Self::DegradedTimeout,
vigil_redaction::EngineStatus::DegradedError => Self::DegradedError,
}
}
}
/// A text scanner that produces a [`RedactionResult`] and can be shared across
/// threads (`Send + Sync + 'static`).
pub trait PiiScanner: Send + Sync + 'static {
    /// Scans `text` and returns the redaction result, or the scanner's error.
    fn scan(&self, text: &str) -> Result<RedactionResult, ScanError>;

    /// Scans `text` and additionally reports the engine status.
    ///
    /// The default implementation forwards to [`PiiScanner::scan`] and pairs a
    /// successful result with [`EngineStatusReport::Unsupported`], because a
    /// plain `scan` carries no status information. Errors are passed through
    /// unchanged.
    fn scan_with_status(
        &self,
        text: &str,
    ) -> Result<(RedactionResult, EngineStatusReport), ScanError> {
        let result = self.scan(text)?;
        Ok((result, EngineStatusReport::Unsupported))
    }
}
/// Stateless scanner that forwards to `vigil_redaction::scan_text`.
///
/// It does not override `scan_with_status`, so the trait's default
/// (`EngineStatusReport::Unsupported`) applies.
#[derive(Debug, Default, Clone, Copy)]
pub(crate) struct DefaultScanner;
impl PiiScanner for DefaultScanner {
// Delegates directly; the result and any `ScanError` are returned unchanged.
fn scan(&self, text: &str) -> Result<RedactionResult, ScanError> {
vigil_redaction::scan_text(text)
}
}
/// Collects every string inside `args` whose byte length is at least
/// `threshold`.
///
/// The JSON value is traversed recursively through arrays and objects; the
/// returned strings are owned copies in traversal order. Non-string leaves are
/// ignored.
pub(crate) fn extract_long_text_fields(args: &serde_json::Value, threshold: usize) -> Vec<String> {
    let mut collected = Vec::new();
    walk(args, threshold, &mut collected);
    collected
}
/// Recursive helper: appends to `out` a copy of every string reachable from
/// `v` whose byte length (`str::len`) is at least `threshold`.
///
/// Arrays are visited element by element and objects value by value (keys are
/// not inspected). All other JSON values are skipped.
fn walk(v: &serde_json::Value, threshold: usize, out: &mut Vec<String>) {
    match v {
        serde_json::Value::String(text) => {
            if text.len() >= threshold {
                out.push(text.to_owned());
            }
        }
        serde_json::Value::Array(items) => {
            for item in items.iter() {
                walk(item, threshold, out);
            }
        }
        serde_json::Value::Object(fields) => {
            for child in fields.values() {
                walk(child, threshold, out);
            }
        }
        _ => {}
    }
}
/// Returns the `total_risk_delta` already computed in `result.risk_signals`;
/// no additional calculation is performed here.
pub(crate) fn compute_pii_risk_delta(result: &RedactionResult) -> u32 {
result.risk_signals.total_risk_delta
}
/// Records one scan and its findings in the audit ledger.
///
/// A scan row is inserted first (source `"tool_arg"`, the text's byte length
/// and a truncated SHA-256 fingerprint of the text). Then one finding row is
/// inserted per entry of `result.findings`, carrying the label, the start
/// offset and a fingerprint of the matched slice. Only fingerprints of the
/// text are passed to the ledger, never the text itself.
///
/// Findings are skipped silently when their kind has no `PrivacyLabel`, or
/// when their span is not a valid slice of `text` (reversed, out of bounds, or
/// not on UTF-8 character boundaries).
///
/// # Errors
///
/// Returns the first ledger error encountered; rows inserted before the
/// failure are not rolled back by this function.
pub(crate) fn persist_scan_to_ledger(
    ledger: &Ledger,
    session_id: &str,
    text: &str,
    result: &RedactionResult,
) -> vigil_audit::Result<()> {
    let text_fp = sha256_prefix16_hex(text.as_bytes());
    let scan_row = NewRedactionScan {
        session_id,
        source: "tool_arg",
        text_length: text.len(),
        fingerprint: &text_fp,
    };
    let scan_id = ledger.insert_redaction_scan(scan_row)?;

    for finding in &result.findings {
        let label = match PrivacyLabel::from_kind(finding.kind) {
            Some(label) => label,
            None => continue,
        };
        let (start, end) = finding.span;
        // `str::get` yields `None` for a reversed range, an out-of-bounds end,
        // or an index that is not on a char boundary.
        let Some(matched) = text.get(start..end) else {
            continue;
        };
        let matched_fp = sha256_prefix16_hex(matched.as_bytes());
        let finding_row = NewRedactionFinding {
            scan_id: &scan_id,
            label: label.as_str(),
            offset: start,
            fingerprint: &matched_fp,
            action_taken: "redacted",
        };
        ledger.insert_redaction_finding(finding_row)?;
    }

    Ok(())
}
/// Returns the first 16 bytes of the SHA-256 digest of `bytes`, encoded as 32
/// lowercase hexadecimal characters.
fn sha256_prefix16_hex(bytes: &[u8]) -> String {
    use std::fmt::Write as _;

    let digest = Sha256::digest(bytes);
    let mut hex = String::with_capacity(32);
    for byte in digest.iter().take(16) {
        // Format straight into the pre-sized buffer instead of allocating a
        // temporary `String` per byte. Writing to a `String` cannot fail, so
        // the `fmt::Result` is deliberately discarded.
        let _ = write!(hex, "{byte:02x}");
    }
    hex
}
/// Aggregated result of `run_preflight` over all scanned text fields.
pub(crate) struct PreflightOutcome {
    /// Per-label finding counts, summed across every scanned text.
    pub(crate) pii_summary: Vec<PiiFindingSummary>,
    /// Sum of the per-scan risk deltas.
    pub(crate) risk_delta: u32,
    /// Status aggregated across the scans via `elevate_status`.
    pub(crate) engine_status: EngineStatusReport,
}

impl PreflightOutcome {
    /// Renders the summary as `label=count` pairs joined by commas, in the
    /// order stored in `pii_summary`. An empty summary yields an empty string.
    pub(crate) fn counts_csv(&self) -> String {
        let mut csv = String::new();
        for (idx, entry) in self.pii_summary.iter().enumerate() {
            if idx > 0 {
                csv.push(',');
            }
            csv.push_str(&entry.label);
            csv.push('=');
            csv.push_str(&entry.count.to_string());
        }
        csv
    }
}
/// Failure of `run_preflight`.
///
/// Implements `Debug`, `Display` and `std::error::Error` so it can be used
/// with `expect`, `assert_eq!`, `?` into boxed errors, and logging.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum PreflightError {
    /// A scan returned an error other than "empty input". `reason` is a short
    /// machine-readable code chosen by the caller that built the error.
    ScanFailed { reason: String },
}

impl std::fmt::Display for PreflightError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::ScanFailed { reason } => write!(f, "preflight scan failed: {reason}"),
        }
    }
}

impl std::error::Error for PreflightError {}
/// Atomic counter of failed audit-ledger writes; `run_preflight` increments it
/// by one each time `persist_scan_to_ledger` returns an error.
pub(crate) type AuditPersistCounter = AtomicU64;
/// Scans every long string field of `args` and aggregates the findings.
///
/// Strings of at least `threshold` bytes are extracted from `args` and scanned
/// one by one with `scanner`. For each successful scan the per-label counts
/// and the risk delta are accumulated (both saturating at `u32::MAX`), the
/// engine status is merged with `elevate_status`, and the scan is persisted to
/// `ledger`. Persistence is best-effort: a ledger error does not abort the
/// preflight, it only increments `audit_failures`.
///
/// The aggregated status starts at `EngineStatusReport::Unsupported`, so that
/// is what is reported when nothing was scanned or when the scanner only
/// provides the default `scan_with_status`.
///
/// # Errors
///
/// * `ScanError::EmptyInput` for a field is not an error; the field is skipped.
/// * `ScanError::InferenceFailed` aborts with reason `"t0_inference_failed"`.
/// * Any other scan error aborts with reason `"t0_scan_failed"`.
///
/// Scans persisted before an aborting error remain in the ledger.
pub(crate) fn run_preflight(
    scanner: &dyn PiiScanner,
    ledger: &Ledger,
    audit_failures: &AuditPersistCounter,
    session_id: &str,
    args: &serde_json::Value,
    threshold: usize,
) -> Result<PreflightOutcome, PreflightError> {
    let long_texts = extract_long_text_fields(args, threshold);
    let mut by_label: BTreeMap<&'static str, u32> = BTreeMap::new();
    let mut total_risk_delta: u32 = 0;
    let mut worst_status = EngineStatusReport::Unsupported;

    for text in &long_texts {
        match scanner.scan_with_status(text) {
            Ok((result, status)) => {
                worst_status = elevate_status(worst_status, status);
                for (label, cnt) in &result.risk_signals.counts_by_label {
                    // Saturate like the risk delta below: a plain `+=` would
                    // panic in debug builds and wrap in release on overflow.
                    let slot = by_label.entry(label.as_str()).or_insert(0);
                    *slot = slot.saturating_add(*cnt);
                }
                total_risk_delta = total_risk_delta.saturating_add(compute_pii_risk_delta(&result));
                // Auditing is best-effort: count the failure, keep scanning.
                if persist_scan_to_ledger(ledger, session_id, text, &result).is_err() {
                    audit_failures.fetch_add(1, Ordering::Relaxed);
                }
            }
            Err(ScanError::EmptyInput) => {
                continue;
            }
            Err(ScanError::InferenceFailed { .. }) => {
                return Err(PreflightError::ScanFailed {
                    reason: "t0_inference_failed".to_string(),
                });
            }
            // Catch-all so that any other `ScanError` variant also fails the
            // preflight instead of being ignored; the `allow` covers the case
            // where the arms above are already exhaustive.
            #[allow(unreachable_patterns)]
            Err(_other) => {
                return Err(PreflightError::ScanFailed {
                    reason: "t0_scan_failed".to_string(),
                });
            }
        }
    }

    // `BTreeMap` iteration gives the summary a stable, label-sorted order.
    let pii_summary: Vec<PiiFindingSummary> = by_label
        .into_iter()
        .map(|(label, count)| PiiFindingSummary {
            label: label.to_string(),
            count,
        })
        .collect();

    Ok(PreflightOutcome {
        pii_summary,
        risk_delta: total_risk_delta,
        engine_status: worst_status,
    })
}
/// Returns whichever of `a` and `b` is more severe.
///
/// Severity, from lowest to highest: `Unsupported`, `Ok`, `DegradedTimeout`,
/// `DegradedError`. On a tie `a` is returned.
fn elevate_status(a: EngineStatusReport, b: EngineStatusReport) -> EngineStatusReport {
    let severity = |status: EngineStatusReport| -> u8 {
        match status {
            EngineStatusReport::Unsupported => 0,
            EngineStatusReport::Ok => 1,
            EngineStatusReport::DegradedTimeout => 2,
            EngineStatusReport::DegradedError => 3,
        }
    };

    if severity(b) > severity(a) {
        b
    } else {
        a
    }
}
/// Returns a shareable trait object wrapping a fresh `DefaultScanner`.
pub(crate) fn default_scanner_arc() -> Arc<dyn PiiScanner> {
Arc::new(DefaultScanner)
}
// `PiiScanner` adapters over the engine types of `vigil_redaction`.
// Only compiled when the `ort` feature is enabled.
#[cfg(feature = "ort")]
mod ort_scanner {
use std::sync::Arc;
use std::time::Duration;
use vigil_redaction::{
scan_text_with_engine, scan_text_with_engine_budgeted, OrtEngine, RedactionEngine,
RedactionResult, ScanError,
};
use super::{EngineStatusReport, PiiScanner};
/// Scanner backed by a single shared `OrtEngine`.
///
/// Only `scan` is implemented, so `scan_with_status` falls back to the trait
/// default and reports `EngineStatusReport::Unsupported`.
pub(crate) struct OrtPiiScanner {
engine: Arc<OrtEngine>,
}
impl OrtPiiScanner {
/// Wraps an already-constructed engine.
pub(crate) fn new(engine: Arc<OrtEngine>) -> Self {
Self { engine }
}
}
impl PiiScanner for OrtPiiScanner {
// Forwards to `scan_text_with_engine` with a borrow of the shared engine.
fn scan(&self, text: &str) -> Result<RedactionResult, ScanError> {
scan_text_with_engine(text, &*self.engine)
}
}
/// Scanner backed by a single `OrtEngine` that passes a `Duration` budget to
/// `scan_text_with_engine_budgeted` and surfaces the status that call returns.
pub(crate) struct BudgetedOrtPiiScanner {
engine: Arc<OrtEngine>,
budget: Duration,
}
impl BudgetedOrtPiiScanner {
/// Wraps an engine together with the budget handed to every scan.
pub(crate) fn new(engine: Arc<OrtEngine>, budget: Duration) -> Self {
Self { engine, budget }
}
}
impl PiiScanner for BudgetedOrtPiiScanner {
// Same scan as `scan_with_status`, with the status discarded.
fn scan(&self, text: &str) -> Result<RedactionResult, ScanError> {
self.scan_with_status(text).map(|(r, _status)| r)
}
// Runs the budgeted scan and converts the returned engine status into an
// `EngineStatusReport` through the `From` impl in the parent module.
fn scan_with_status(
&self,
text: &str,
) -> Result<(RedactionResult, EngineStatusReport), ScanError> {
// The budgeted entry point takes the engine by value as a trait object,
// so hand it a cloned `Arc`.
let engine: Arc<dyn RedactionEngine> = Arc::clone(&self.engine) as _;
scan_text_with_engine_budgeted(text, engine, self.budget)
.map(|outcome| (outcome.result, outcome.status.into()))
}
}
/// Scanner backed by a `vigil_redaction::EnsembleEngine`.
///
/// Only `scan` is implemented, so `scan_with_status` falls back to the trait
/// default and reports `EngineStatusReport::Unsupported`.
pub(crate) struct EnsembleOrtPiiScanner {
ensemble: vigil_redaction::EnsembleEngine,
}
impl EnsembleOrtPiiScanner {
/// Builds the ensemble from a list of engines via `EnsembleEngine::new`.
/// Marked `allow(dead_code)` by the original author; the constructor visible
/// in this file that is actually called is `from_ensemble`.
#[allow(dead_code)] pub(crate) fn new(engines: Vec<Arc<dyn RedactionEngine>>) -> Self {
Self {
ensemble: vigil_redaction::EnsembleEngine::new(engines),
}
}
/// Wraps an ensemble that the caller has already configured.
pub(crate) fn from_ensemble(ensemble: vigil_redaction::EnsembleEngine) -> Self {
Self { ensemble }
}
}
impl PiiScanner for EnsembleOrtPiiScanner {
// Forwards to `scan_text_with_engine` with the ensemble as the engine.
fn scan(&self, text: &str) -> Result<RedactionResult, ScanError> {
scan_text_with_engine(text, &self.ensemble)
}
}
}
/// Builds a scanner around an engine created by `OrtEngine::from_env()`.
///
/// # Errors
///
/// Returns whatever `EngineError` `OrtEngine::from_env()` reports.
#[cfg(feature = "ort")]
pub fn ort_scanner_arc_from_env(
) -> Result<Arc<dyn PiiScanner>, vigil_redaction::engine::EngineError> {
let engine = Arc::new(vigil_redaction::OrtEngine::from_env()?);
Ok(Arc::new(ort_scanner::OrtPiiScanner::new(engine)))
}
/// Builds the three-engine ensemble scanner from environment variables, using
/// the default XLM-R descriptor (no explicit profile mode).
///
/// # Errors
///
/// See `build_ort_ensemble_scanner_arc_from_env`, to which this delegates.
#[cfg(feature = "ort")]
pub fn ort_ensemble_scanner_arc_from_env(
) -> Result<Arc<dyn PiiScanner>, vigil_redaction::engine::EngineError> {
build_ort_ensemble_scanner_arc_from_env(None)
}
/// Builds the three-engine ensemble scanner from environment variables, with
/// the XLM-R descriptor created by `XlmrPiiDescriptor::with_mode(xlmr_mode)`.
///
/// # Errors
///
/// See `build_ort_ensemble_scanner_arc_from_env`, to which this delegates.
#[cfg(feature = "ort")]
pub fn ort_ensemble_scanner_arc_from_env_with_xlmr_mode(
xlmr_mode: vigil_redaction::model_descriptor::XlmrProfileMode,
) -> Result<Arc<dyn PiiScanner>, vigil_redaction::engine::EngineError> {
build_ort_ensemble_scanner_arc_from_env(Some(xlmr_mode))
}
/// Shared implementation behind the public ensemble constructors.
///
/// Reads three model directories from the environment, in this order:
/// `VIGIL_ENSEMBLE_OPENAI_DIR`, `VIGIL_ENSEMBLE_XLMR_DIR`,
/// `VIGIL_ENSEMBLE_YONIGO_DIR`. It then loads one `OrtEngine` per directory
/// with the matching descriptor and combines them into an `EnsembleEngine`.
/// When `xlmr_mode` is `Some`, the XLM-R descriptor is built with that mode;
/// otherwise its `default()` is used.
///
/// If `VIGIL_ENSEMBLE_DUAL_CONFIRM` is set, it is parsed as a comma-separated
/// list of label kinds (trimmed and lowercased). Entries that
/// `PrivacyLabel::from_kind` does not recognise are dropped silently, and
/// dual confirmation is enabled only when at least one label remains.
///
/// # Errors
///
/// * `EngineError::ModelNotFound` with `dir` set to `"<VAR unset>"` when one of
///   the three directory variables cannot be read (first missing one wins).
/// * Any error returned by `OrtEngine::from_dir_with_descriptor`.
#[cfg(feature = "ort")]
fn build_ort_ensemble_scanner_arc_from_env(
    xlmr_mode: Option<vigil_redaction::model_descriptor::XlmrProfileMode>,
) -> Result<Arc<dyn PiiScanner>, vigil_redaction::engine::EngineError> {
    use std::path::PathBuf;
    use vigil_redaction::engine::EngineError;
    use vigil_redaction::model_descriptor::{
        OpenAIPrivacyFilterDescriptor, XlmrPiiDescriptor, YonigoPiiDescriptor,
    };

    // Resolves a required directory variable, or reports which one is missing.
    let model_dir = |var: &str| -> Result<PathBuf, EngineError> {
        match std::env::var(var) {
            Ok(value) => Ok(PathBuf::from(value)),
            Err(_) => Err(EngineError::ModelNotFound {
                dir: format!("<{var} unset>"),
            }),
        }
    };

    // All three variables are validated before any model is loaded.
    let openai_dir = model_dir("VIGIL_ENSEMBLE_OPENAI_DIR")?;
    let xlmr_dir = model_dir("VIGIL_ENSEMBLE_XLMR_DIR")?;
    let yonigo_dir = model_dir("VIGIL_ENSEMBLE_YONIGO_DIR")?;

    let xlmr_descriptor = if let Some(mode) = xlmr_mode {
        XlmrPiiDescriptor::with_mode(mode)
    } else {
        XlmrPiiDescriptor::default()
    };

    let openai = Arc::new(vigil_redaction::OrtEngine::from_dir_with_descriptor(
        &openai_dir,
        Box::new(OpenAIPrivacyFilterDescriptor),
    )?);
    let xlmr = Arc::new(vigil_redaction::OrtEngine::from_dir_with_descriptor(
        &xlmr_dir,
        Box::new(xlmr_descriptor),
    )?);
    let yonigo = Arc::new(vigil_redaction::OrtEngine::from_dir_with_descriptor(
        &yonigo_dir,
        Box::new(YonigoPiiDescriptor),
    )?);

    let engines: Vec<Arc<dyn vigil_redaction::RedactionEngine>> = vec![openai, xlmr, yonigo];
    let mut ensemble = vigil_redaction::EnsembleEngine::new(engines);

    if let Ok(raw) = std::env::var("VIGIL_ENSEMBLE_DUAL_CONFIRM") {
        let mut labels: Vec<vigil_redaction::PrivacyLabel> = Vec::new();
        for token in raw.split(',') {
            let normalized = token.trim().to_lowercase();
            if let Some(label) = vigil_redaction::PrivacyLabel::from_kind(&normalized) {
                labels.push(label);
            }
        }
        if !labels.is_empty() {
            ensemble = ensemble.with_dual_confirm(labels);
        }
    }

    Ok(Arc::new(ort_scanner::EnsembleOrtPiiScanner::from_ensemble(
        ensemble,
    )))
}
/// Builds a budgeted scanner around an engine created by
/// `OrtEngine::from_env()`; `budget` is passed to every scan and the scanner
/// reports the engine status returned by the budgeted scan.
///
/// # Errors
///
/// Returns whatever `EngineError` `OrtEngine::from_env()` reports.
#[cfg(feature = "ort")]
pub fn ort_scanner_arc_from_env_with_budget(
budget: std::time::Duration,
) -> Result<Arc<dyn PiiScanner>, vigil_redaction::engine::EngineError> {
let engine = Arc::new(vigil_redaction::OrtEngine::from_env()?);
Ok(Arc::new(ort_scanner::BudgetedOrtPiiScanner::new(
engine, budget,
)))
}
// Unit tests for field extraction, fingerprinting, status aggregation and the
// feature-gated scanner constructors.
#[cfg(test)]
#[allow(clippy::panic)] mod tests {
use super::*;
// Strings shorter than the threshold are dropped, longer ones are kept.
#[test]
fn extract_long_text_fields_threshold_filters_short() {
let args = serde_json::json!({
"short": "hi",
"long": "x".repeat(150),
});
let out = extract_long_text_fields(&args, 100);
assert_eq!(out.len(), 1);
assert_eq!(out[0].len(), 150);
}
// Extraction descends into nested objects and arrays.
#[test]
fn extract_long_text_fields_recursive_arrays_and_objects() {
let args = serde_json::json!({
"outer": {
"nested": ["a", "b".repeat(50)],
"deep": { "leaf": "c".repeat(80) }
}
});
let out = extract_long_text_fields(&args, 30);
assert_eq!(out.len(), 2);
}
// `null` and an empty object both produce no fields.
#[test]
fn extract_long_text_fields_null_args_returns_empty() {
let out = extract_long_text_fields(&serde_json::Value::Null, 100);
assert!(out.is_empty());
let out2 = extract_long_text_fields(&serde_json::json!({}), 100);
assert!(out2.is_empty());
}
// Non-string leaves (numbers, booleans) are ignored.
#[test]
fn extract_long_text_fields_skips_numbers_and_booleans() {
let args = serde_json::json!({
"n": 12345,
"b": true,
"s": "x".repeat(120),
});
let out = extract_long_text_fields(&args, 100);
assert_eq!(out.len(), 1);
}
// The fingerprint is exactly 32 lowercase hex characters (16 digest bytes).
#[test]
fn sha256_prefix16_hex_is_32_lowercase_chars() {
let fp = sha256_prefix16_hex(b"hello world");
assert_eq!(fp.len(), 32);
assert!(fp
.chars()
.all(|c| c.is_ascii_hexdigit() && !c.is_uppercase()));
}
// Scans a text containing a `ghp_`-prefixed token with the default scanner
// rules and expects the reported risk delta to be at least 25 (the assertion
// message attributes that floor to the "Secret" class).
#[test]
fn compute_pii_risk_delta_transparent_to_signals() {
let text = "junk ghp_abcdefghijklmnopqrstuvwxyzABCDEFGHIJ more";
let r = vigil_redaction::scan_text(text).expect("non-empty");
let delta = compute_pii_risk_delta(&r);
assert!(
delta >= 25,
"Secret 类应至少 25,实际 {delta}(signals={:?})",
r.risk_signals
);
}
// Test double whose `scan` always fails with `InferenceFailed`.
struct LocalFailingScanner;
impl PiiScanner for LocalFailingScanner {
fn scan(&self, _: &str) -> Result<RedactionResult, ScanError> {
Err(ScanError::InferenceFailed {
reason: "unit-test".into(),
})
}
}
// The failing double returns its error, including the reason, unchanged.
#[test]
fn failing_scanner_propagates_inference_failed() {
let s = LocalFailingScanner;
match s.scan("some text") {
Err(ScanError::InferenceFailed { reason }) => {
assert_eq!(reason, "unit-test");
}
other => panic!("local failing scanner should return InferenceFailed, got {other:?}"),
}
}
// `DefaultScanner` forwards to `scan_text`: empty input yields `EmptyInput`.
#[test]
fn default_scanner_forwards_to_scan_text() {
let s = DefaultScanner;
match s.scan("") {
Err(ScanError::EmptyInput) => {}
other => panic!("DefaultScanner('') should be EmptyInput, got {other:?}"),
}
}
// `elevate_status` is idempotent and always returns the more severe status,
// regardless of argument order.
#[test]
fn elevate_status_total_order_safety() {
use EngineStatusReport::*;
for s in [DegradedError, DegradedTimeout, Ok, Unsupported] {
assert_eq!(elevate_status(s, s), s);
}
assert_eq!(elevate_status(Unsupported, Ok), Ok);
assert_eq!(elevate_status(Ok, DegradedTimeout), DegradedTimeout);
assert_eq!(
elevate_status(DegradedTimeout, DegradedError),
DegradedError
);
assert_eq!(elevate_status(Unsupported, DegradedError), DegradedError);
assert_eq!(elevate_status(Ok, Unsupported), Ok);
assert_eq!(elevate_status(DegradedError, Ok), DegradedError);
}
// A scanner that relies on the default `scan_with_status` must aggregate to
// `Unsupported`; it must not be silently upgraded to `Ok`.
#[test]
fn run_preflight_with_default_status_scanner_yields_unsupported() {
// Succeeds with an empty result and does not override `scan_with_status`.
struct LocalDefaultStatusScanner;
impl PiiScanner for LocalDefaultStatusScanner {
fn scan(&self, _text: &str) -> Result<RedactionResult, ScanError> {
Ok(RedactionResult {
findings: Vec::new(),
redacted_text: String::new(),
risk_signals: vigil_redaction::RiskSignals::default(),
})
}
}
let ledger = vigil_audit::Ledger::open_in_memory().expect("open_in_memory");
let sid = ledger
.start_session("a2-r1-test", Some("default_status_unsupported"))
.expect("start_session");
let counter = AuditPersistCounter::new(0);
let args = serde_json::json!({ "long": "x".repeat(200) });
let outcome = run_preflight(
&LocalDefaultStatusScanner,
&ledger,
&counter,
&sid,
&args,
100,
)
// `PreflightError` has no `Debug` impl in this file, hence no `expect`.
.unwrap_or_else(|_| panic!("run_preflight should succeed with non-failing scanner"));
assert_eq!(
outcome.engine_status,
EngineStatusReport::Unsupported,
"default scan_with_status path 必须聚合为 Unsupported,**不**得被 fake-safe 升级为 Ok;\
这是 Codex § 2 改进版 A 的关键不变量(R1 MUST-FIX 锁定项)"
);
}
// A scanner that explicitly reports `Ok` replaces the initial `Unsupported`.
#[test]
fn run_preflight_with_real_ok_status_overrides_unsupported() {
// Succeeds with an empty result and reports a real `Ok` status.
struct LocalOkStatusScanner;
impl PiiScanner for LocalOkStatusScanner {
fn scan(&self, _text: &str) -> Result<RedactionResult, ScanError> {
Ok(RedactionResult {
findings: Vec::new(),
redacted_text: String::new(),
risk_signals: vigil_redaction::RiskSignals::default(),
})
}
fn scan_with_status(
&self,
text: &str,
) -> Result<(RedactionResult, EngineStatusReport), ScanError> {
self.scan(text).map(|r| (r, EngineStatusReport::Ok))
}
}
let ledger = vigil_audit::Ledger::open_in_memory().unwrap();
let sid = ledger
.start_session("a2-r1-test", Some("ok_overrides"))
.unwrap();
let counter = AuditPersistCounter::new(0);
let args = serde_json::json!({ "long": "x".repeat(200) });
let outcome = run_preflight(&LocalOkStatusScanner, &ledger, &counter, &sid, &args, 100)
.unwrap_or_else(|_| panic!("run_preflight should succeed with ok scanner"));
assert_eq!(
outcome.engine_status,
EngineStatusReport::Ok,
"真 Ok status 必须覆盖初始 Unsupported(`Ok > Unsupported` 严重度)"
);
}
// With none of the ensemble directory variables set, construction fails with
// `ModelNotFound` naming the missing variable. The test skips itself when the
// environment already defines any of them.
#[cfg(feature = "ort")]
#[test]
fn ort_ensemble_scanner_arc_from_env_missing_envs_returns_modelnotfound() {
let any_set = [
"VIGIL_ENSEMBLE_OPENAI_DIR",
"VIGIL_ENSEMBLE_XLMR_DIR",
"VIGIL_ENSEMBLE_YONIGO_DIR",
]
.iter()
.any(|k| std::env::var(k).is_ok());
if any_set {
eprintln!("skip: VIGIL_ENSEMBLE_*_DIR already set");
return;
}
let r = ort_ensemble_scanner_arc_from_env();
match r {
Err(vigil_redaction::engine::EngineError::ModelNotFound { dir }) => {
assert!(
dir.contains("VIGIL_ENSEMBLE_") && dir.contains("unset"),
"ModelNotFound.dir 应含 VIGIL_ENSEMBLE_ env 名,实际: {}",
dir
);
}
// The scanner itself is not `Debug`, so map `Ok` to a printable marker.
other => panic!(
"env unset 应返 ModelNotFound,实际: {:?}",
other.map(|_| "Ok(scanner)")
),
}
}
// With `VIGIL_PRIVACY_FILTER_MODEL_DIR` unset, the budgeted constructor fails
// with `ModelNotFound`. The test skips itself when the variable is defined.
#[cfg(feature = "ort")]
#[test]
fn ort_scanner_arc_from_env_with_budget_env_miss_returns_modelnotfound() {
if std::env::var("VIGIL_PRIVACY_FILTER_MODEL_DIR").is_ok() {
eprintln!("skip: VIGIL_PRIVACY_FILTER_MODEL_DIR already set");
return;
}
let r = ort_scanner_arc_from_env_with_budget(std::time::Duration::from_secs(1));
match r {
Err(vigil_redaction::engine::EngineError::ModelNotFound { .. }) => {}
other => panic!(
"env unset 应返 ModelNotFound,实际: {:?}",
other.map(|_| "Ok(scanner)")
),
}
}
}