sanitize_engine/
llm.rs

1//! LLM prompt formatting — template resolution and prompt assembly.
2//!
3//! Provides the built-in prompt templates and the helpers needed to build a
4//! structured LLM prompt from sanitized content and an optional sanitization
5//! report.
6//!
7//! # Built-in templates
8//!
9//! | Name | Use case |
10//! |------|----------|
11//! | `"troubleshoot"` | Incident triage — root cause, event sequence, remediation |
12//! | `"review-config"` | Config review — misconfigurations, security concerns, best practices |
13//! | `"review-security"` | Security posture — auth, exposure, TLS, CVEs, hardcoded secrets |
14//!
15//! A filesystem path can be supplied instead of a name; the file's raw content
16//! is used as-is (no substitution is applied to custom templates).
17//!
18//! # Prompt modes
19//!
20//! **Inline** ([`format_llm_prompt`]) — sanitized bytes are embedded directly in
21//! `<content>` blocks. Use when piping output to an LLM without writing files.
22//!
23//! **Reference** ([`format_llm_prompt_reference`]) — sanitized files are written
24//! to disk and the prompt lists their absolute paths. Use with `--output` so an
25//! agentic LLM can read the files via its own tools.
26//!
27//! # Example
28//!
29//! ```rust
30//! use sanitize_engine::llm::{format_llm_prompt, LlmEntry};
31//!
32//! let entries: Vec<LlmEntry> = vec![
33//!     ("app.log".to_string(), b"INFO start\nERROR disk full\n".to_vec()),
34//! ];
35//! let prompt = format_llm_prompt("troubleshoot", &entries, None).unwrap();
36//! assert!(prompt.contains("Root cause"));
37//! assert!(prompt.contains("<content name=\"app.log\">"));
38//! ```
39
40use crate::report::SanitizeReport;
41use std::fmt::Write as FmtWrite;
42use std::fs;
43use std::path::PathBuf;
44
/// One inline content entry for an LLM prompt: `(label, sanitized_bytes)`.
///
/// The label is printed in the file manifest and as the `name` attribute of
/// the entry's `<content>` block by [`format_llm_prompt`]; the bytes are
/// decoded as lossy UTF-8 when embedded.
pub type LlmEntry = (String, Vec<u8>);
47
/// One reference entry for an LLM prompt: `(input_label, sanitized_output_path)`.
///
/// Used by [`format_llm_prompt_reference`] when sanitized files are written to
/// disk and the prompt should point at them instead of inlining their content.
/// The path is printed exactly as supplied, so callers that want the prompt to
/// contain absolute paths must pass absolute paths.
pub type LlmPathEntry = (String, PathBuf);
54
/// Preamble substituted for the `{preamble}` placeholder of every built-in
/// template by [`resolve_llm_template`].
///
/// It explains the sanitization model to the LLM (what was replaced and how)
/// and tells it to ask a clarifying question rather than try to recover or
/// guess original values. The text ends with a single trailing newline.
pub const PROMPT_PREAMBLE: &str = "\
Content sanitized:
- Structured fields (passwords, tokens, API keys) → __SANITIZED-<hash>__
- Typed values (emails, IPs, hostnames, UUIDs) → same-format/length substitute

Same original = same replacement per run. Repeated values safe to correlate.

No inferring originals. Critical value missing? Ask specific clarifying question. Use sanitized forms only.
";
66
/// Built-in template for incident troubleshooting (`"troubleshoot"`).
///
/// Asks for root cause, event sequence, and remediation. Contains a single
/// `{preamble}` placeholder that [`resolve_llm_template`] replaces with
/// [`PROMPT_PREAMBLE`]; the raw constant is not meant to be sent as-is.
pub const TEMPLATE_TROUBLESHOOT: &str = "\
SRE troubleshooting incident. Logs/output sanitized.

{preamble}
Provide:
1. Root cause — specific triggering failure
2. Event sequence — timeline to failure
3. Remediation — concrete fix + prevent recurrence

Data insufficient? State what info needed and why. No speculating on sanitized values.

";
80
/// Built-in template for configuration review (`"review-config"`).
///
/// Asks for misconfigurations, security concerns, best-practice violations,
/// and credential placement. Contains a single `{preamble}` placeholder that
/// [`resolve_llm_template`] replaces with [`PROMPT_PREAMBLE`].
pub const TEMPLATE_REVIEW_CONFIG: &str = "\
Systems engineer reviewing sanitized config.

{preamble}
Identify:
1. Misconfigurations — invalid/inconsistent settings causing failures
2. Security concerns — exposed services, permissive rules, weak/default settings
3. Best practice violations — deprecated options, missing fields, non-standard patterns
4. Credential placement — flag secret locations; presence/placement = hardcoding risk

Cannot assess redacted credential strength. Risk depends on actual value? Flag + ask.

";
95
/// Built-in template for security posture review (`"review-security"`).
///
/// Covers authentication/authorization, network exposure, encryption/TLS,
/// hardcoded secrets, known CVEs, and an explicit "cannot assess" list.
/// Contains a single `{preamble}` placeholder that [`resolve_llm_template`]
/// replaces with [`PROMPT_PREAMBLE`].
pub const TEMPLATE_REVIEW_SECURITY: &str = "\
Security engineer: posture review of sanitized config/logs.

{preamble}
Assess and report:
1. Authentication/authz — weak configs, missing enforcement, privilege issues
2. Network exposure — ports/services/interfaces needing restriction
3. Encryption/TLS — weak ciphers, outdated protocols, insecure defaults
4. Hardcoded secrets — flag credential locations; presence/placement = finding
5. Known CVEs — tie visible version strings to known weaknesses
6. Cannot assess — list findings needing original values (e.g. password strength, token format)

Cite field/file/line per finding. No guessing sanitized values. Need actual value? Ask specifically.

";
112
113/// Resolve a template name or path to its instruction text.
114///
115/// Accepts `"troubleshoot"`, `"review-config"`, `"review-security"` (built-in
116/// templates with the preamble embedded), or an arbitrary filesystem path whose
117/// raw content is returned unchanged.
118///
119/// # Errors
120///
121/// Returns an error string if a custom path cannot be read from disk.
122pub fn resolve_llm_template(template_name: &str) -> Result<String, String> {
123    match template_name {
124        "troubleshoot" => Ok(TEMPLATE_TROUBLESHOOT.replace("{preamble}", PROMPT_PREAMBLE)),
125        "review-config" => Ok(TEMPLATE_REVIEW_CONFIG.replace("{preamble}", PROMPT_PREAMBLE)),
126        "review-security" => Ok(TEMPLATE_REVIEW_SECURITY.replace("{preamble}", PROMPT_PREAMBLE)),
127        path => fs::read_to_string(path)
128            .map_err(|e| format!("failed to read LLM template '{}': {e}", path)),
129    }
130}
131
132/// Build a complete LLM prompt from a template, content entries, and an
133/// optional sanitization report.
134///
135/// The prompt structure is:
136/// 1. Template instructions (with preamble embedded for built-ins)
137/// 2. `## Sanitization Summary` — file count and total replacements (when `report` is `Some`)
138/// 3. One `<content name="…">…</content>` block per entry
139/// 4. `<notable_events>…</notable_events>` — keyword-matched log lines with
140///    surrounding context (only when the report contains log context with hits)
141///
142/// # Errors
143///
144/// Returns an error string if the template cannot be resolved.
145pub fn format_llm_prompt(
146    template_name: &str,
147    entries: &[LlmEntry],
148    report: Option<&SanitizeReport>,
149) -> Result<String, String> {
150    let mut out = resolve_llm_template(template_name)?;
151
152    if let Some(r) = report {
153        let total_replacements: u64 = r.files.iter().map(|f| f.replacements).sum();
154        write!(
155            out,
156            "## Sanitization Summary\n\
157             - Files processed: {}\n\
158             - Total replacements: {total_replacements}\n\n",
159            r.files.len()
160        )
161        .unwrap();
162    }
163
164    if !entries.is_empty() {
165        out.push_str("## Files Analyzed\n");
166        for (label, _) in entries {
167            writeln!(out, "- {label}").unwrap();
168        }
169        out.push('\n');
170    }
171
172    for (label, bytes) in entries {
173        let content = String::from_utf8_lossy(bytes);
174        write!(
175            out,
176            "<content name=\"{}\">\n{}\n</content>\n\n",
177            label, content
178        )
179        .unwrap();
180    }
181
182    if let Some(r) = report {
183        append_notable_events(&mut out, r);
184    }
185
186    Ok(out)
187}
188
189/// Build a reference-mode LLM prompt: sanitized files are written to disk and
190/// the prompt lists their absolute paths for an agentic LLM to read directly.
191///
192/// Use this instead of [`format_llm_prompt`] when `--output` is specified so
193/// that large file sets are not inlined into the prompt.
194///
195/// # Errors
196///
197/// Returns an error string if the template cannot be resolved.
198pub fn format_llm_prompt_reference(
199    template_name: &str,
200    entries: &[LlmPathEntry],
201    report: Option<&SanitizeReport>,
202) -> Result<String, String> {
203    let mut out = resolve_llm_template(template_name)?;
204
205    if let Some(r) = report {
206        let total_replacements: u64 = r.files.iter().map(|f| f.replacements).sum();
207        write!(
208            out,
209            "## Sanitization Summary\n\
210             - Files processed: {}\n\
211             - Total replacements: {total_replacements}\n\n",
212            r.files.len()
213        )
214        .unwrap();
215    }
216
217    if !entries.is_empty() {
218        out.push_str("## Sanitized Files\n");
219        out.push_str("Read each path below to review the sanitized content:\n\n");
220        for (label, out_path) in entries {
221            writeln!(out, "- {} → {}", label, out_path.display()).unwrap();
222        }
223        out.push('\n');
224    }
225
226    if let Some(r) = report {
227        append_notable_events(&mut out, r);
228    }
229
230    Ok(out)
231}
232
233/// Append the `<notable_events>` block to `out` when the report contains
234/// keyword-matched log lines.
235fn append_notable_events(out: &mut String, report: &SanitizeReport) {
236    let notable: Vec<_> = report
237        .files
238        .iter()
239        .filter_map(|f| f.log_context.as_ref().map(|ctx| (&f.path, ctx)))
240        .filter(|(_, ctx)| ctx.match_count > 0)
241        .collect();
242
243    if notable.is_empty() {
244        return;
245    }
246
247    out.push_str("<notable_events>\n");
248    let mut any_truncated = false;
249    for (path, ctx) in &notable {
250        writeln!(out, "# {path}").unwrap();
251        for m in &ctx.matches {
252            for line in &m.before {
253                writeln!(out, "  {line}").unwrap();
254            }
255            writeln!(out, ">>> [{}] {}", m.keyword, m.line).unwrap();
256            for line in &m.after {
257                writeln!(out, "  {line}").unwrap();
258            }
259            out.push('\n');
260        }
261        if ctx.truncated {
262            any_truncated = true;
263        }
264    }
265    if any_truncated {
266        out.push_str(
267            "(notable events truncated — use --context-lines or --report for full context)\n",
268        );
269    }
270    out.push_str("</notable_events>\n");
271}
272
273// ---------------------------------------------------------------------------
274// Tests
275// ---------------------------------------------------------------------------
276
#[cfg(test)]
mod tests {
    use super::*;
    use crate::log_context::{extract_context, LogContextConfig};
    use crate::report::{FileReport, ReportBuilder, ReportMetadata};
    use crate::scanner::ScanStats;
    use std::fs;
    use tempfile::tempdir;

    /// Report metadata shared by every report built in these tests.
    fn test_metadata() -> ReportMetadata {
        ReportMetadata {
            version: "0.0.0".into(),
            timestamp: "test".into(),
            deterministic: false,
            dry_run: false,
            strict: false,
            chunk_size: 1024,
            threads: None,
            secrets_file: None,
        }
    }

    /// Build a one-file report for `test.log` with `replacements` applied and
    /// no log context attached.
    fn make_test_report(replacements: u64) -> SanitizeReport {
        let builder = ReportBuilder::new(test_metadata());
        builder.record_file(FileReport::from_scan_stats(
            "test.log",
            &ScanStats {
                matches_found: replacements,
                replacements_applied: replacements,
                ..Default::default()
            },
            "scanner",
        ));
        builder.finish()
    }

    /// Assert that a resolved built-in template contains the preamble
    /// verbatim and that no unsubstituted placeholder remains.
    ///
    /// Checking for the word "sanitized" alone is not enough: every built-in
    /// template mentions it outside the preamble as well.
    fn assert_preamble_embedded(template: &str) {
        assert!(
            template.contains(PROMPT_PREAMBLE),
            "preamble should be embedded verbatim; got:\n{template}"
        );
        assert!(
            !template.contains("{preamble}"),
            "placeholder should be substituted; got:\n{template}"
        );
    }

    #[test]
    fn troubleshoot_embeds_preamble_and_instructions() {
        let t = resolve_llm_template("troubleshoot").unwrap();
        assert_preamble_embedded(&t);
        assert!(
            t.contains("Root cause"),
            "should request root cause analysis"
        );
        assert!(
            t.contains("Remediation"),
            "should request remediation steps"
        );
        assert!(
            t.contains("clarifying question"),
            "should instruct LLM to ask rather than guess"
        );
    }

    #[test]
    fn review_config_embeds_preamble_and_instructions() {
        let t = resolve_llm_template("review-config").unwrap();
        assert_preamble_embedded(&t);
        assert!(
            t.contains("Misconfigurations"),
            "should request misconfiguration review"
        );
        assert!(
            t.contains("Security concerns"),
            "should request security review"
        );
        assert!(
            t.contains("clarifying question"),
            "should instruct LLM to ask rather than guess"
        );
    }

    #[test]
    fn review_security_embeds_preamble_and_instructions() {
        let t = resolve_llm_template("review-security").unwrap();
        assert_preamble_embedded(&t);
        assert!(t.contains("Authentication"), "should cover auth review");
        assert!(t.contains("Encryption"), "should cover TLS/crypto review");
        assert!(
            t.contains("Hardcoded"),
            "should flag hardcoded credential placement"
        );
        assert!(
            t.contains("clarifying question"),
            "should instruct LLM to ask rather than guess"
        );
    }

    #[test]
    fn nonexistent_path_returns_error() {
        let err = resolve_llm_template("/nonexistent/template.txt").unwrap_err();
        assert!(err.contains("failed to read"), "got: {err}");
    }

    #[test]
    fn custom_file_returns_raw_content() {
        let dir = tempdir().unwrap();
        let p = dir.path().join("custom.txt");
        fs::write(&p, "MY CUSTOM INSTRUCTIONS\n").unwrap();
        let t = resolve_llm_template(p.to_str().unwrap()).unwrap();
        assert_eq!(t, "MY CUSTOM INSTRUCTIONS\n");
    }

    #[test]
    fn prompt_includes_content_block() {
        let entries = vec![("app.log".to_string(), b"sanitized line\n".to_vec())];
        let prompt = format_llm_prompt("troubleshoot", &entries, None).unwrap();
        assert!(
            prompt.contains("<content name=\"app.log\">"),
            "got:\n{prompt}"
        );
        assert!(prompt.contains("sanitized line"), "got:\n{prompt}");
        assert!(prompt.contains("</content>"), "got:\n{prompt}");
    }

    #[test]
    fn prompt_includes_sanitization_summary() {
        let report = make_test_report(7);
        let entries: Vec<LlmEntry> = vec![];
        let prompt = format_llm_prompt("troubleshoot", &entries, Some(&report)).unwrap();
        assert!(prompt.contains("## Sanitization Summary"), "got:\n{prompt}");
        assert!(prompt.contains("Files processed: 1"), "got:\n{prompt}");
        assert!(prompt.contains("Total replacements: 7"), "got:\n{prompt}");
    }

    #[test]
    fn prompt_includes_notable_events_when_present() {
        let builder = ReportBuilder::new(test_metadata());
        builder.record_file(FileReport::from_scan_stats(
            "app.log",
            &ScanStats::default(),
            "scanner",
        ));
        let ctx = extract_context(
            "INFO start\nERROR disk full\nINFO done",
            &LogContextConfig::new().with_context_lines(1),
        );
        builder.set_file_log_context("app.log", ctx);
        let report = builder.finish();

        let entries: Vec<LlmEntry> = vec![];
        let prompt = format_llm_prompt("troubleshoot", &entries, Some(&report)).unwrap();
        assert!(prompt.contains("<notable_events>"), "got:\n{prompt}");
        assert!(prompt.contains("# app.log"), "got:\n{prompt}");
        assert!(prompt.contains(">>> [error]"), "got:\n{prompt}");
        assert!(prompt.contains("ERROR disk full"), "got:\n{prompt}");
        assert!(prompt.contains("</notable_events>"), "got:\n{prompt}");
    }

    #[test]
    fn prompt_omits_notable_events_when_no_matches() {
        let report = make_test_report(0);
        let entries: Vec<LlmEntry> = vec![];
        let prompt = format_llm_prompt("troubleshoot", &entries, Some(&report)).unwrap();
        assert!(
            !prompt.contains("<notable_events>"),
            "should omit section when no keyword matches"
        );
    }

    #[test]
    fn prompt_multiple_content_blocks_in_order() {
        let entries = vec![
            ("first.log".to_string(), b"first content".to_vec()),
            ("second.log".to_string(), b"second content".to_vec()),
        ];
        let prompt = format_llm_prompt("troubleshoot", &entries, None).unwrap();
        // Match the content tags themselves: a bare label would hit the
        // manifest lines first and never check the content blocks.
        let first_pos = prompt.find("<content name=\"first.log\">").unwrap();
        let second_pos = prompt.find("<content name=\"second.log\">").unwrap();
        assert!(
            first_pos < second_pos,
            "entries should appear in insertion order"
        );
    }

    #[test]
    fn prompt_includes_files_analyzed_manifest() {
        let entries = vec![
            ("/abs/app.log".to_string(), b"sanitized line\n".to_vec()),
            (
                "/abs/config.yaml".to_string(),
                b"key: __SANITIZED__\n".to_vec(),
            ),
        ];
        let prompt = format_llm_prompt("troubleshoot", &entries, None).unwrap();
        assert!(prompt.contains("## Files Analyzed"), "got:\n{prompt}");
        assert!(prompt.contains("- /abs/app.log"), "got:\n{prompt}");
        assert!(prompt.contains("- /abs/config.yaml"), "got:\n{prompt}");
        let manifest_pos = prompt.find("## Files Analyzed").unwrap();
        let content_pos = prompt.find("<content name=").unwrap();
        assert!(
            manifest_pos < content_pos,
            "manifest should precede content blocks"
        );
    }

    #[test]
    fn prompt_omits_files_analyzed_when_no_entries() {
        let entries: Vec<LlmEntry> = vec![];
        let prompt = format_llm_prompt("troubleshoot", &entries, None).unwrap();
        assert!(
            !prompt.contains("## Files Analyzed"),
            "should omit manifest when no entries"
        );
    }

    #[test]
    fn reference_prompt_lists_output_paths() {
        let dir = tempdir().unwrap();
        let out1 = dir.path().join("app.log.sanitized");
        let out2 = dir.path().join("config.yaml.sanitized");
        let entries: Vec<LlmPathEntry> = vec![
            ("/abs/input/app.log".to_string(), out1.clone()),
            ("/abs/input/config.yaml".to_string(), out2.clone()),
        ];
        let prompt = format_llm_prompt_reference("troubleshoot", &entries, None).unwrap();
        assert!(prompt.contains("## Sanitized Files"), "got:\n{prompt}");

        // Every entry must appear as a single `- label → path` line.
        let line1 = format!("- /abs/input/app.log → {}", out1.display());
        let line2 = format!("- /abs/input/config.yaml → {}", out2.display());
        assert!(
            prompt.contains(&line1),
            "should pair first label with its output path; got:\n{prompt}"
        );
        assert!(
            prompt.contains(&line2),
            "should pair second label with its output path; got:\n{prompt}"
        );
        assert!(
            !prompt.contains("<content"),
            "reference mode must not inline content"
        );
    }

    #[test]
    fn reference_prompt_includes_sanitization_summary() {
        let report = make_test_report(12);
        let entries: Vec<LlmPathEntry> = vec![];
        let prompt = format_llm_prompt_reference("troubleshoot", &entries, Some(&report)).unwrap();
        assert!(prompt.contains("## Sanitization Summary"), "got:\n{prompt}");
        assert!(prompt.contains("Total replacements: 12"), "got:\n{prompt}");
    }
}
sanitize_engine/llm.rs

sanitize_engine/
llm.rs