// rust-sanitize 0.10.0 — deterministic one-way data sanitization engine.
// (Source listing; page header and line-number gutter residue removed.)

//! LLM prompt formatting — template resolution and prompt assembly.
//!
//! Provides the built-in prompt templates and the helpers needed to build a
//! structured LLM prompt from sanitized content and an optional sanitization
//! report.
//!
//! # Built-in templates
//!
//! | Name | Use case |
//! |------|----------|
//! | `"troubleshoot"` | Incident triage — root cause, event sequence, remediation |
//! | `"review-config"` | Config review — misconfigurations, security concerns, best practices |
//! | `"review-security"` | Security posture — auth, exposure, TLS, CVEs, hardcoded secrets |
//!
//! A filesystem path can be supplied instead of a name; the file's raw content
//! is used as-is (no substitution is applied to custom templates).
//!
//! # Prompt modes
//!
//! **Inline** ([`format_llm_prompt`]) — sanitized bytes are embedded directly in
//! `<content>` blocks. Use when piping output to an LLM without writing files.
//!
//! **Reference** ([`format_llm_prompt_reference`]) — sanitized files are written
//! to disk and the prompt lists their absolute paths. Use with `--output` so an
//! agentic LLM can read the files via its own tools.
//!
//! # Example
//!
//! ```rust
//! use sanitize_engine::llm::{format_llm_prompt, LlmEntry};
//!
//! let entries: Vec<LlmEntry> = vec![
//!     ("app.log".to_string(), b"INFO start\nERROR disk full\n".to_vec()),
//! ];
//! let prompt = format_llm_prompt("troubleshoot", &entries, None).unwrap();
//! assert!(prompt.contains("Root cause"));
//! assert!(prompt.contains("<content name=\"app.log\">"));
//! ```

use crate::report::SanitizeReport;
use std::fmt::Write as FmtWrite;
use std::fs;
use std::path::PathBuf;

/// A single content entry for the inline LLM prompt: `(label, sanitized_bytes)`.
///
/// Consumed by [`format_llm_prompt`], which prints the label in the
/// `## Files Analyzed` manifest and in the `name` attribute of the `<content>`
/// block, and decodes the bytes with lossy UTF-8 conversion before embedding
/// them.
pub type LlmEntry = (String, Vec<u8>);

/// A reference entry for the LLM prompt: `(input_label, sanitized_output_path)`.
///
/// Used by [`format_llm_prompt_reference`] when sanitized files are written to
/// disk and the prompt should reference them by absolute path instead of
/// inlining their content.
///
/// The path is printed exactly as supplied (via `Path::display`); this module
/// does not canonicalize it, so callers that need absolute paths in the prompt
/// must pass absolute paths.
pub type LlmPathEntry = (String, PathBuf);

/// Preamble explaining the sanitization model to the LLM so it does not
/// attempt to recover original values.
///
/// [`resolve_llm_template`] substitutes this text for the `{preamble}`
/// placeholder of each built-in template. Custom templates loaded from a file
/// are returned unchanged and therefore do not receive it.
pub const PROMPT_PREAMBLE: &str = "\
Content sanitized:
- Structured fields (passwords, tokens, API keys) → __SANITIZED-<hash>__
- Typed values (emails, IPs, hostnames, UUIDs) → same-format/length substitute

Same original = same replacement per run. Repeated values safe to correlate.

No inferring originals. Critical value missing? Ask specific clarifying question. Use sanitized forms only.
";

/// Built-in template for incident troubleshooting.
///
/// Selected by the name `"troubleshoot"` in [`resolve_llm_template`], which
/// replaces the `{preamble}` placeholder with [`PROMPT_PREAMBLE`]. The text
/// ends with a blank line so that sections appended after it start on their
/// own line.
pub const TEMPLATE_TROUBLESHOOT: &str = "\
SRE troubleshooting incident. Logs/output sanitized.

{preamble}
Provide:
1. Root cause — specific triggering failure
2. Event sequence — timeline to failure
3. Remediation — concrete fix + prevent recurrence

Data insufficient? State what info needed and why. No speculating on sanitized values.

";

/// Built-in template for configuration review.
///
/// Selected by the name `"review-config"` in [`resolve_llm_template`], which
/// replaces the `{preamble}` placeholder with [`PROMPT_PREAMBLE`]. The text
/// ends with a blank line so that sections appended after it start on their
/// own line.
pub const TEMPLATE_REVIEW_CONFIG: &str = "\
Systems engineer reviewing sanitized config.

{preamble}
Identify:
1. Misconfigurations — invalid/inconsistent settings causing failures
2. Security concerns — exposed services, permissive rules, weak/default settings
3. Best practice violations — deprecated options, missing fields, non-standard patterns
4. Credential placement — flag secret locations; presence/placement = hardcoding risk

Cannot assess redacted credential strength. Risk depends on actual value? Flag + ask.

";

/// Built-in template for security posture review.
///
/// Selected by the name `"review-security"` in [`resolve_llm_template`], which
/// replaces the `{preamble}` placeholder with [`PROMPT_PREAMBLE`]. The text
/// ends with a blank line so that sections appended after it start on their
/// own line.
pub const TEMPLATE_REVIEW_SECURITY: &str = "\
Security engineer: posture review of sanitized config/logs.

{preamble}
Assess and report:
1. Authentication/authz — weak configs, missing enforcement, privilege issues
2. Network exposure — ports/services/interfaces needing restriction
3. Encryption/TLS — weak ciphers, outdated protocols, insecure defaults
4. Hardcoded secrets — flag credential locations; presence/placement = finding
5. Known CVEs — tie visible version strings to known weaknesses
6. Cannot assess — list findings needing original values (e.g. password strength, token format)

Cite field/file/line per finding. No guessing sanitized values. Need actual value? Ask specifically.

";

/// Turn a template name or filesystem path into the instruction text that
/// opens an LLM prompt.
///
/// The names `"troubleshoot"`, `"review-config"` and `"review-security"` select
/// the corresponding built-in template, with its `{preamble}` placeholder
/// replaced by [`PROMPT_PREAMBLE`]. Any other value is treated as a path and
/// the file's content is returned verbatim, without placeholder substitution.
///
/// Because built-in names are matched first, a file whose path is exactly one
/// of those names cannot be selected through this function.
///
/// # Errors
///
/// Returns an error string naming the path and the underlying I/O error when
/// a custom template file cannot be read.
pub fn resolve_llm_template(template_name: &str) -> Result<String, String> {
    let builtin = match template_name {
        "troubleshoot" => Some(TEMPLATE_TROUBLESHOOT),
        "review-config" => Some(TEMPLATE_REVIEW_CONFIG),
        "review-security" => Some(TEMPLATE_REVIEW_SECURITY),
        _ => None,
    };

    if let Some(body) = builtin {
        return Ok(body.replace("{preamble}", PROMPT_PREAMBLE));
    }

    // Not a built-in name: interpret the argument as a path to a custom template.
    fs::read_to_string(template_name)
        .map_err(|err| format!("failed to read LLM template '{template_name}': {err}"))
}

/// Assemble an inline-mode LLM prompt from a template, the sanitized content
/// entries, and an optional sanitization report.
///
/// Sections are emitted in this order:
/// 1. Template instructions (built-ins carry the preamble)
/// 2. `## Sanitization Summary` — file count and total replacements (only when
///    `report` is `Some`)
/// 3. `## Files Analyzed` — one bullet per entry label (only when `entries` is
///    non-empty)
/// 4. One `<content name="…">…</content>` block per entry, in slice order; the
///    bytes are decoded with lossy UTF-8 conversion
/// 5. `<notable_events>…</notable_events>` — keyword-matched log lines with
///    surrounding context (only when the report contains log context with hits)
///
/// Labels and content are inserted verbatim; no escaping is applied.
///
/// # Errors
///
/// Returns an error string if the template cannot be resolved.
pub fn format_llm_prompt(
    template_name: &str,
    entries: &[LlmEntry],
    report: Option<&SanitizeReport>,
) -> Result<String, String> {
    let mut prompt = resolve_llm_template(template_name)?;

    if let Some(summary) = report {
        let file_count = summary.files.len();
        let replacement_total: u64 = summary.files.iter().map(|file| file.replacements).sum();
        prompt.push_str("## Sanitization Summary\n");
        writeln!(prompt, "- Files processed: {file_count}").unwrap();
        // The embedded `\n` plus the one from `writeln!` leaves a blank line
        // after the summary.
        writeln!(prompt, "- Total replacements: {replacement_total}\n").unwrap();
    }

    if !entries.is_empty() {
        prompt.push_str("## Files Analyzed\n");
        for label in entries.iter().map(|(label, _)| label) {
            writeln!(prompt, "- {label}").unwrap();
        }
        prompt.push('\n');
    }

    for (label, bytes) in entries {
        let text = String::from_utf8_lossy(bytes);
        writeln!(prompt, "<content name=\"{label}\">").unwrap();
        writeln!(prompt, "{text}").unwrap();
        prompt.push_str("</content>\n\n");
    }

    if let Some(summary) = report {
        append_notable_events(&mut prompt, summary);
    }

    Ok(prompt)
}

/// Build a reference-mode LLM prompt: sanitized files are written to disk and
/// the prompt lists their paths for an agentic LLM to read directly.
///
/// Use this instead of [`format_llm_prompt`] when `--output` is specified so
/// that large file sets are not inlined into the prompt.
///
/// Sections are emitted in this order:
/// 1. Template instructions (built-ins carry the preamble)
/// 2. `## Sanitization Summary` — file count and total replacements (only when
///    `report` is `Some`)
/// 3. `## Sanitized Files` — one `- <input label> → <output path>` bullet per
///    entry (only when `entries` is non-empty)
/// 4. `<notable_events>…</notable_events>` (only when the report contains log
///    context with hits)
///
/// Output paths are printed as supplied; pass absolute paths if the reader
/// needs them to be absolute.
///
/// # Errors
///
/// Returns an error string if the template cannot be resolved.
pub fn format_llm_prompt_reference(
    template_name: &str,
    entries: &[LlmPathEntry],
    report: Option<&SanitizeReport>,
) -> Result<String, String> {
    let mut out = resolve_llm_template(template_name)?;

    if let Some(r) = report {
        let total_replacements: u64 = r.files.iter().map(|f| f.replacements).sum();
        write!(
            out,
            "## Sanitization Summary\n\
             - Files processed: {}\n\
             - Total replacements: {total_replacements}\n\n",
            r.files.len()
        )
        .unwrap();
    }

    if !entries.is_empty() {
        out.push_str("## Sanitized Files\n");
        out.push_str("Read each path below to review the sanitized content:\n\n");
        for (label, out_path) in entries {
            // Keep the input label and the output path visibly separate.
            // Without a delimiter the two run together into a single bogus
            // path and the reader cannot tell which file to open.
            writeln!(out, "- {label} → {}", out_path.display()).unwrap();
        }
        out.push('\n');
    }

    if let Some(r) = report {
        append_notable_events(&mut out, r);
    }

    Ok(out)
}

/// Write a `<notable_events>` section to `out` for every file in `report`
/// whose log context recorded at least one keyword match.
///
/// Nothing is written when no file qualifies. For each qualifying file the
/// section holds a `# <path>` heading followed by its matches: the `before`
/// lines indented by two spaces, the matched line as `>>> [keyword] line`, the
/// `after` lines indented by two spaces, then a blank line. If any listed
/// file's context is flagged as truncated, a single hint line is added before
/// the closing tag.
fn append_notable_events(out: &mut String, report: &SanitizeReport) {
    let mut hits = report
        .files
        .iter()
        .filter_map(|file| match file.log_context.as_ref() {
            Some(ctx) if ctx.match_count > 0 => Some((&file.path, ctx)),
            _ => None,
        })
        .peekable();

    // No file with keyword hits: leave the prompt untouched.
    if hits.peek().is_none() {
        return;
    }

    out.push_str("<notable_events>\n");

    let mut saw_truncation = false;
    for (path, ctx) in hits {
        writeln!(out, "# {path}").unwrap();
        for hit in &ctx.matches {
            for leading in &hit.before {
                writeln!(out, "  {leading}").unwrap();
            }
            writeln!(out, ">>> [{}] {}", hit.keyword, hit.line).unwrap();
            for trailing in &hit.after {
                writeln!(out, "  {trailing}").unwrap();
            }
            out.push('\n');
        }
        saw_truncation |= ctx.truncated;
    }

    if saw_truncation {
        out.push_str(
            "(notable events truncated — use --context-lines or --report for full context)\n",
        );
    }

    out.push_str("</notable_events>\n");
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use crate::log_context::{extract_context, LogContextConfig};
    use crate::report::{FileReport, ReportBuilder, ReportMetadata};
    use crate::scanner::ScanStats;
    use std::fs;
    use tempfile::tempdir;

    /// Fixed placeholder metadata shared by every report built in these tests.
    fn test_metadata() -> ReportMetadata {
        ReportMetadata {
            version: "0.0.0".into(),
            timestamp: "test".into(),
            deterministic: false,
            dry_run: false,
            strict: false,
            chunk_size: 1024,
            threads: None,
            secrets_file: None,
        }
    }

    /// Build a report with a single `test.log` record carrying `replacements`
    /// matches, all of them applied, and no log context.
    fn make_test_report(replacements: u64) -> SanitizeReport {
        let stats = ScanStats {
            matches_found: replacements,
            replacements_applied: replacements,
            ..Default::default()
        };
        let builder = ReportBuilder::new(test_metadata());
        builder.record_file(FileReport::from_scan_stats("test.log", &stats, "scanner"));
        builder.finish()
    }

    /// Resolve the built-in template `name` and check that every expected
    /// fragment is present, failing with the paired explanation otherwise.
    fn assert_template_mentions(name: &str, expectations: &[(&str, &str)]) {
        let t = resolve_llm_template(name).unwrap();
        for &(needle, why) in expectations {
            assert!(t.contains(needle), "{why}");
        }
    }

    #[test]
    fn troubleshoot_embeds_preamble_and_instructions() {
        assert_template_mentions(
            "troubleshoot",
            &[
                ("sanitized", "preamble should be embedded"),
                ("Root cause", "should request root cause analysis"),
                ("Remediation", "should request remediation steps"),
                (
                    "clarifying question",
                    "should instruct LLM to ask rather than guess",
                ),
            ],
        );
    }

    #[test]
    fn review_config_embeds_preamble_and_instructions() {
        assert_template_mentions(
            "review-config",
            &[
                ("sanitized", "preamble should be embedded"),
                ("Misconfigurations", "should request misconfiguration review"),
                ("Security concerns", "should request security review"),
                (
                    "clarifying question",
                    "should instruct LLM to ask rather than guess",
                ),
            ],
        );
    }

    #[test]
    fn review_security_embeds_preamble_and_instructions() {
        assert_template_mentions(
            "review-security",
            &[
                ("sanitized", "preamble should be embedded"),
                ("Authentication", "should cover auth review"),
                ("Encryption", "should cover TLS/crypto review"),
                ("Hardcoded", "should flag hardcoded credential placement"),
                (
                    "clarifying question",
                    "should instruct LLM to ask rather than guess",
                ),
            ],
        );
    }

    #[test]
    fn nonexistent_path_returns_error() {
        let outcome = resolve_llm_template("/nonexistent/template.txt");
        let err = outcome.unwrap_err();
        assert!(err.contains("failed to read"), "got: {err}");
    }

    #[test]
    fn custom_file_returns_raw_content() {
        let scratch = tempdir().unwrap();
        let template_path = scratch.path().join("custom.txt");
        fs::write(&template_path, "MY CUSTOM INSTRUCTIONS\n").unwrap();

        let resolved = resolve_llm_template(template_path.to_str().unwrap()).unwrap();
        assert_eq!(resolved, "MY CUSTOM INSTRUCTIONS\n");
    }

    #[test]
    fn prompt_includes_content_block() {
        let entries = vec![("app.log".to_string(), b"sanitized line\n".to_vec())];
        let prompt = format_llm_prompt("troubleshoot", &entries, None).unwrap();
        for needle in ["<content name=\"app.log\">", "sanitized line", "</content>"] {
            assert!(prompt.contains(needle), "got:\n{prompt}");
        }
    }

    #[test]
    fn prompt_includes_sanitization_summary() {
        let report = make_test_report(7);
        let prompt = format_llm_prompt("troubleshoot", &[], Some(&report)).unwrap();
        for needle in [
            "## Sanitization Summary",
            "Files processed: 1",
            "Total replacements: 7",
        ] {
            assert!(prompt.contains(needle), "got:\n{prompt}");
        }
    }

    #[test]
    fn prompt_includes_notable_events_when_present() {
        // Report with one file whose log context contains a keyword hit.
        let builder = ReportBuilder::new(test_metadata());
        builder.record_file(FileReport::from_scan_stats(
            "app.log",
            &ScanStats::default(),
            "scanner",
        ));
        let ctx = extract_context(
            "INFO start\nERROR disk full\nINFO done",
            &LogContextConfig::new().with_context_lines(1),
        );
        builder.set_file_log_context("app.log", ctx);
        let report = builder.finish();

        let prompt = format_llm_prompt("troubleshoot", &[], Some(&report)).unwrap();
        for needle in [
            "<notable_events>",
            "# app.log",
            ">>> [error]",
            "ERROR disk full",
            "</notable_events>",
        ] {
            assert!(prompt.contains(needle), "got:\n{prompt}");
        }
    }

    #[test]
    fn prompt_omits_notable_events_when_no_matches() {
        let report = make_test_report(0);
        let prompt = format_llm_prompt("troubleshoot", &[], Some(&report)).unwrap();
        assert!(
            !prompt.contains("<notable_events>"),
            "should omit section when no keyword matches"
        );
    }

    #[test]
    fn prompt_multiple_content_blocks_in_order() {
        let entries = vec![
            ("first.log".to_string(), b"first content".to_vec()),
            ("second.log".to_string(), b"second content".to_vec()),
        ];
        let prompt = format_llm_prompt("troubleshoot", &entries, None).unwrap();

        let first_pos = prompt.find("first.log").unwrap();
        let second_pos = prompt.find("second.log").unwrap();
        assert!(
            first_pos < second_pos,
            "entries should appear in insertion order"
        );
    }

    #[test]
    fn prompt_includes_files_analyzed_manifest() {
        let entries = vec![
            ("/abs/app.log".to_string(), b"sanitized line\n".to_vec()),
            (
                "/abs/config.yaml".to_string(),
                b"key: __SANITIZED__\n".to_vec(),
            ),
        ];
        let prompt = format_llm_prompt("troubleshoot", &entries, None).unwrap();
        for needle in ["## Files Analyzed", "- /abs/app.log", "- /abs/config.yaml"] {
            assert!(prompt.contains(needle), "got:\n{prompt}");
        }

        let manifest_pos = prompt.find("## Files Analyzed").unwrap();
        let content_pos = prompt.find("<content name=").unwrap();
        assert!(
            manifest_pos < content_pos,
            "manifest should precede content blocks"
        );
    }

    #[test]
    fn prompt_omits_files_analyzed_when_no_entries() {
        let prompt = format_llm_prompt("troubleshoot", &[], None).unwrap();
        assert!(
            !prompt.contains("## Files Analyzed"),
            "should omit manifest when no entries"
        );
    }

    #[test]
    fn reference_prompt_lists_output_paths() {
        let dir = tempdir().unwrap();
        let out1 = dir.path().join("app.log.sanitized");
        let out2 = dir.path().join("config.yaml.sanitized");
        let entries: Vec<LlmPathEntry> = vec![
            ("/abs/input/app.log".to_string(), out1.clone()),
            ("/abs/input/config.yaml".to_string(), out2),
        ];

        let prompt = format_llm_prompt_reference("troubleshoot", &entries, None).unwrap();
        assert!(prompt.contains("## Sanitized Files"), "got:\n{prompt}");
        assert!(
            prompt.contains("/abs/input/app.log"),
            "should include input label; got:\n{prompt}"
        );
        assert!(
            prompt.contains(&out1.display().to_string()),
            "should include output path; got:\n{prompt}"
        );
        assert!(
            !prompt.contains("<content"),
            "reference mode must not inline content"
        );
    }

    #[test]
    fn reference_prompt_includes_sanitization_summary() {
        let report = make_test_report(12);
        let prompt = format_llm_prompt_reference("troubleshoot", &[], Some(&report)).unwrap();
        for needle in ["## Sanitization Summary", "Total replacements: 12"] {
            assert!(prompt.contains(needle), "got:\n{prompt}");
        }
    }
}