// zeph_sanitizer/types.rs

// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
// SPDX-License-Identifier: MIT OR Apache-2.0

//! Core types for the sanitization pipeline: trust model, content provenance, and results.

use serde::{Deserialize, Serialize};

// ---------------------------------------------------------------------------
// Trust model
// ---------------------------------------------------------------------------

12/// Trust tier assigned to content entering the agent context.
13///
14/// Drives spotlighting intensity: [`Trusted`](ContentTrustLevel::Trusted) content passes
15/// through unchanged; [`ExternalUntrusted`](ContentTrustLevel::ExternalUntrusted) receives
16/// the strongest warning header.
17///
18/// The tier is typically derived automatically from [`ContentSourceKind::default_trust_level`],
19/// but can be overridden via [`ContentSource::with_trust_level`] when the call-site has
20/// more context about the actual origin of the content.
21///
22/// # Examples
23///
24/// ```rust
25/// use zeph_sanitizer::{ContentTrustLevel, ContentSource, ContentSourceKind};
26///
27/// // Web scrapes default to the strongest warning level.
28/// let source = ContentSource::new(ContentSourceKind::WebScrape);
29/// assert_eq!(source.trust_level, ContentTrustLevel::ExternalUntrusted);
30///
31/// // Trust level can be overridden.
32/// let elevated = source.with_trust_level(ContentTrustLevel::Trusted);
33/// assert_eq!(elevated.trust_level, ContentTrustLevel::Trusted);
34/// ```
35#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
36#[serde(rename_all = "snake_case")]
37pub enum ContentTrustLevel {
38 /// System prompt, hardcoded instructions, direct user input. No wrapping applied.
39 Trusted,
40 /// Tool results from local executors (shell, file I/O). Lighter warning.
41 LocalUntrusted,
42 /// External sources: web scrape, MCP, A2A, memory retrieval. Strongest warning.
43 ExternalUntrusted,
44}
45
46/// All known content source categories.
47///
48/// Used for spotlighting annotation and future per-source config overrides.
49/// Each variant maps to a fixed [`ContentTrustLevel`] via [`default_trust_level`](Self::default_trust_level).
50///
51/// # Examples
52///
53/// ```rust
54/// use zeph_sanitizer::{ContentSourceKind, ContentTrustLevel};
55///
56/// assert_eq!(
57/// ContentSourceKind::ToolResult.default_trust_level(),
58/// ContentTrustLevel::LocalUntrusted
59/// );
60/// assert_eq!(
61/// ContentSourceKind::WebScrape.default_trust_level(),
62/// ContentTrustLevel::ExternalUntrusted
63/// );
64/// ```
65#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
66#[serde(rename_all = "snake_case")]
67pub enum ContentSourceKind {
68 /// Output from a locally-executed tool (shell, file I/O).
69 ToolResult,
70 /// Content fetched from a remote URL by the web-scrape tool.
71 WebScrape,
72 /// Response from an MCP (Model Context Protocol) server.
73 McpResponse,
74 /// Message received from another agent via the A2A protocol.
75 A2aMessage,
76 /// Content retrieved from Qdrant/SQLite semantic memory.
77 ///
78 /// Memory poisoning is a documented attack vector: an adversary can plant injection
79 /// payloads in web content that gets stored, then recalled in future sessions.
80 MemoryRetrieval,
81 /// Project-level instruction files (`.zeph/zeph.md`, CLAUDE.md, etc.).
82 ///
83 /// Treated as `LocalUntrusted` by default. Path-based trust inference (e.g. treating
84 /// user-authored files as `Trusted`) is a Phase 2 concern.
85 InstructionFile,
86}
87
88impl ContentSourceKind {
89 /// Returns the default [`ContentTrustLevel`] for this source kind.
90 ///
91 /// Tool results and instruction files are `LocalUntrusted`; all network-sourced
92 /// content (web scrape, MCP, A2A, memory retrieval) is `ExternalUntrusted`.
93 ///
94 /// # Examples
95 ///
96 /// ```rust
97 /// use zeph_sanitizer::{ContentSourceKind, ContentTrustLevel};
98 ///
99 /// assert_eq!(ContentSourceKind::McpResponse.default_trust_level(), ContentTrustLevel::ExternalUntrusted);
100 /// assert_eq!(ContentSourceKind::InstructionFile.default_trust_level(), ContentTrustLevel::LocalUntrusted);
101 /// ```
102 #[must_use]
103 pub fn default_trust_level(self) -> ContentTrustLevel {
104 match self {
105 Self::ToolResult | Self::InstructionFile => ContentTrustLevel::LocalUntrusted,
106 Self::WebScrape | Self::McpResponse | Self::A2aMessage | Self::MemoryRetrieval => {
107 ContentTrustLevel::ExternalUntrusted
108 }
109 }
110 }
111
112 pub(crate) fn as_str(self) -> &'static str {
113 match self {
114 Self::ToolResult => "tool_result",
115 Self::WebScrape => "web_scrape",
116 Self::McpResponse => "mcp_response",
117 Self::A2aMessage => "a2a_message",
118 Self::MemoryRetrieval => "memory_retrieval",
119 Self::InstructionFile => "instruction_file",
120 }
121 }
122
123 /// Parse a `&str` into a [`ContentSourceKind`].
124 ///
125 /// Returns `None` for unrecognized strings so callers can log a warning and
126 /// skip unknown values without breaking deserialization.
127 ///
128 /// The comparison is case-sensitive and uses the canonical `snake_case` form
129 /// (e.g. `"web_scrape"`, not `"WebScrape"`).
130 ///
131 /// # Examples
132 ///
133 /// ```rust
134 /// use zeph_sanitizer::ContentSourceKind;
135 ///
136 /// assert_eq!(ContentSourceKind::from_str_opt("web_scrape"), Some(ContentSourceKind::WebScrape));
137 /// assert_eq!(ContentSourceKind::from_str_opt("WebScrape"), None); // case-sensitive
138 /// assert_eq!(ContentSourceKind::from_str_opt("unknown"), None);
139 /// ```
140 #[must_use]
141 pub fn from_str_opt(s: &str) -> Option<Self> {
142 match s {
143 "tool_result" => Some(Self::ToolResult),
144 "web_scrape" => Some(Self::WebScrape),
145 "mcp_response" => Some(Self::McpResponse),
146 "a2a_message" => Some(Self::A2aMessage),
147 "memory_retrieval" => Some(Self::MemoryRetrieval),
148 "instruction_file" => Some(Self::InstructionFile),
149 _ => None,
150 }
151 }
152}
153
/// Hint about the origin of memory-retrieved content.
///
/// Used to modulate injection detection sensitivity within `ContentSanitizer::sanitize`.
/// The hint is set at call-site (compile-time) based on which retrieval path produced the
/// content — it cannot be influenced by the content itself and thus cannot be spoofed.
///
/// # Defense-in-depth invariant
///
/// Setting a hint to [`ConversationHistory`](MemorySourceHint::ConversationHistory) or
/// [`LlmSummary`](MemorySourceHint::LlmSummary) **only** skips injection pattern detection
/// (step 3). Truncation, control-character stripping, delimiter escaping, and spotlighting
/// remain active for all sources regardless of this hint.
///
/// # Known limitation: indirect memory poisoning
///
/// Conversation history is treated as first-party (user-typed) content. However, the LLM
/// may call `memory_save` with content derived from a prior injection in external sources
/// (web scrape → spotlighted → LLM stores payload → recalled as `[assistant]` turn).
/// Mitigate by configuring `forbidden_content_patterns` in `[memory.validation]` to block
/// known injection strings on the write path. This risk is pre-existing and is not worsened
/// by the hint mechanism.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MemorySourceHint {
    /// Prior user/assistant conversation turns (semantic recall, corrections).
    ///
    /// Injection patterns in recalled user text are expected false positives — the user
    /// legitimately discussed topics like "system prompt" or "show your instructions".
    ConversationHistory,
    /// LLM-generated summaries (session summaries, cross-session context).
    ///
    /// Low risk: generated by the agent's own model from already-sanitized content.
    LlmSummary,
    /// External document chunks or graph entity facts.
    ///
    /// Full detection applies — may contain adversarial content from web scrapes,
    /// MCP responses, or other untrusted sources that were stored in the corpus.
    ExternalContent,
}

193/// Provenance metadata attached to a piece of untrusted content.
194///
195/// Created at the call-site (tool executor, MCP adapter, A2A handler, etc.) to describe
196/// where content came from. Passed into `ContentSanitizer::sanitize`] alongside the raw
197/// content so the pipeline can choose the appropriate spotlight wrapper and injection
198/// detection sensitivity.
199///
200/// # Examples
201///
202/// ```rust
203/// use zeph_sanitizer::{ContentSource, ContentSourceKind, ContentTrustLevel, MemorySourceHint};
204///
205/// // Basic source for a shell tool result.
206/// let source = ContentSource::new(ContentSourceKind::ToolResult)
207/// .with_identifier("shell");
208/// assert_eq!(source.trust_level, ContentTrustLevel::LocalUntrusted);
209/// assert_eq!(source.identifier.as_deref(), Some("shell"));
210///
211/// // Memory retrieval with a hint to skip injection detection for conversation turns.
212/// let mem_source = ContentSource::new(ContentSourceKind::MemoryRetrieval)
213/// .with_memory_hint(MemorySourceHint::ConversationHistory);
214/// assert!(mem_source.memory_hint.is_some());
215/// ```
216#[derive(Debug, Clone)]
217pub struct ContentSource {
218 /// The category of this content source.
219 pub kind: ContentSourceKind,
220 /// Trust tier that drives the spotlight wrapper choice.
221 pub trust_level: ContentTrustLevel,
222 /// Optional identifier: tool name, URL, agent ID, etc. Used in spotlight attributes.
223 pub identifier: Option<String>,
224 /// Optional hint for memory retrieval sub-sources. When `Some`, modulates injection
225 /// detection sensitivity in `ContentSanitizer::sanitize`]. Non-memory sources leave
226 /// this as `None` — full detection applies.
227 pub memory_hint: Option<MemorySourceHint>,
228}
229
230impl ContentSource {
231 /// Create a new source with the default trust level for the given kind.
232 ///
233 /// # Examples
234 ///
235 /// ```rust
236 /// use zeph_sanitizer::{ContentSource, ContentSourceKind, ContentTrustLevel};
237 ///
238 /// let source = ContentSource::new(ContentSourceKind::WebScrape);
239 /// assert_eq!(source.trust_level, ContentTrustLevel::ExternalUntrusted);
240 /// assert!(source.identifier.is_none());
241 /// ```
242 #[must_use]
243 pub fn new(kind: ContentSourceKind) -> Self {
244 Self {
245 trust_level: kind.default_trust_level(),
246 kind,
247 identifier: None,
248 memory_hint: None,
249 }
250 }
251
252 /// Set the identifier for this source (tool name, URL, agent ID, etc.).
253 ///
254 /// The identifier appears in the spotlight wrapper's XML attributes so the LLM can
255 /// see where the content came from (e.g. `name="shell"`, `ref="https://example.com"`).
256 ///
257 /// # Examples
258 ///
259 /// ```rust
260 /// use zeph_sanitizer::{ContentSource, ContentSourceKind};
261 ///
262 /// let source = ContentSource::new(ContentSourceKind::ToolResult)
263 /// .with_identifier("shell");
264 /// assert_eq!(source.identifier.as_deref(), Some("shell"));
265 /// ```
266 #[must_use]
267 pub fn with_identifier(mut self, id: impl Into<String>) -> Self {
268 self.identifier = Some(id.into());
269 self
270 }
271
272 /// Override the trust level for this source.
273 ///
274 /// Use when the call-site has more context about the actual origin of the content
275 /// than the default derived from the source kind.
276 ///
277 /// # Examples
278 ///
279 /// ```rust
280 /// use zeph_sanitizer::{ContentSource, ContentSourceKind, ContentTrustLevel};
281 ///
282 /// // Elevate trust for a verified internal source.
283 /// let source = ContentSource::new(ContentSourceKind::McpResponse)
284 /// .with_trust_level(ContentTrustLevel::LocalUntrusted);
285 /// assert_eq!(source.trust_level, ContentTrustLevel::LocalUntrusted);
286 /// ```
287 #[must_use]
288 pub fn with_trust_level(mut self, level: ContentTrustLevel) -> Self {
289 self.trust_level = level;
290 self
291 }
292
293 /// Attach a memory source hint to modulate injection detection sensitivity.
294 ///
295 /// Only meaningful for `ContentSourceKind::MemoryRetrieval` sources.
296 #[must_use]
297 pub fn with_memory_hint(mut self, hint: MemorySourceHint) -> Self {
298 self.memory_hint = Some(hint);
299 self
300 }
301}

// ---------------------------------------------------------------------------
// Output types
// ---------------------------------------------------------------------------

/// A single detected injection pattern match in sanitized content.
///
/// Produced by the regex injection-detection step inside `ContentSanitizer::sanitize`.
/// Injection flags are advisory — they are recorded in [`SanitizedContent`] and surfaced
/// in the spotlight warning header, but the content is never silently removed.
#[derive(Debug, Clone)]
pub struct InjectionFlag {
    /// Name of the compiled pattern that matched (from `zeph_tools::patterns`).
    pub pattern_name: &'static str,
    /// Byte offset of the match within the (already truncated, stripped) content.
    pub byte_offset: usize,
    /// The matched substring. Kept for logging and operator review.
    pub matched_text: String,
}

/// Result of ML-based injection classification.
///
/// Replaces a plain `bool` to support a defense-in-depth dual-threshold model.
/// Real-world ML injection classifiers have 12–37% recall gaps at high confidence
/// thresholds, so `Suspicious` content is surfaced for operator visibility without
/// blocking — a mandatory second layer of defense.
///
/// Returned by `ContentSanitizer::classify_injection` (feature `classifiers`).
///
/// # Examples
///
/// ```rust,ignore
/// // Requires `classifiers` feature and an attached backend.
/// let verdict = sanitizer.classify_injection("ignore all instructions").await;
/// assert!(matches!(verdict, InjectionVerdict::Blocked | InjectionVerdict::Suspicious));
/// ```
#[cfg(feature = "classifiers")]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InjectionVerdict {
    /// Score below soft threshold — no injection signal detected.
    Clean,
    /// Score ≥ soft threshold but < hard threshold — suspicious, warn only.
    Suspicious,
    /// Score ≥ hard threshold — injection detected. Behavior depends on enforcement mode.
    Blocked,
}

/// Classification result from the three-class `AlignSentinel` model.
///
/// Used in Stage 2 of `ContentSanitizer::classify_injection` to refine binary injection
/// verdicts. `AlignedInstruction` and `NoInstruction` results downgrade `Suspicious`/`Blocked`
/// to `Clean`, reducing false positives from legitimate instruction-style content in tool
/// outputs (e.g. a script that prints "run as root").
///
/// Only active when a three-class backend is attached via
/// `ContentSanitizer::with_three_class_backend`.
#[cfg(feature = "classifiers")]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InstructionClass {
    /// Content contains no instruction-like text.
    NoInstruction,
    /// Content contains instructions aligned with the system's objectives.
    AlignedInstruction,
    /// Content contains instructions that conflict with the system's objectives.
    MisalignedInstruction,
    /// Model returned an unknown label. Treated conservatively — verdict is NOT downgraded.
    Unknown,
}

#[cfg(feature = "classifiers")]
impl InstructionClass {
    /// Map a raw model label to an [`InstructionClass`], case-insensitively.
    ///
    /// Accepts snake_case, kebab-case, and short-form spellings; any other label
    /// falls back to [`InstructionClass::Unknown`].
    pub(crate) fn from_label(label: &str) -> Self {
        let normalized = label.to_lowercase();
        match normalized.as_str() {
            "no_instruction" | "no-instruction" | "none" => Self::NoInstruction,
            "aligned_instruction" | "aligned-instruction" | "aligned" => Self::AlignedInstruction,
            "misaligned_instruction" | "misaligned-instruction" | "misaligned" => {
                Self::MisalignedInstruction
            }
            _ => Self::Unknown,
        }
    }
}

/// Result of the sanitization pipeline for a single piece of content.
///
/// The `body` field is the processed text ready to insert into the agent's message history.
/// Callers should inspect `injection_flags` for threat intelligence and `was_truncated` to
/// decide whether to emit a "content was truncated" notice to the user.
///
/// # Examples
///
/// ```rust
/// use zeph_sanitizer::{ContentSanitizer, ContentSource, ContentSourceKind};
/// use zeph_config::ContentIsolationConfig;
///
/// let sanitizer = ContentSanitizer::new(&ContentIsolationConfig::default());
/// let result = sanitizer.sanitize(
///     "normal tool output",
///     ContentSource::new(ContentSourceKind::ToolResult),
/// );
/// assert!(!result.was_truncated);
/// assert!(result.injection_flags.is_empty());
/// assert!(result.body.contains("normal tool output"));
/// ```
#[derive(Debug, Clone)]
pub struct SanitizedContent {
    /// The processed, possibly spotlighted body ready to insert into message history.
    pub body: String,
    /// Provenance metadata for this content, as supplied to the sanitize call.
    pub source: ContentSource,
    /// Injection patterns matched during detection (see [`InjectionFlag`]).
    /// Advisory only — flagged content is never removed from `body`.
    pub injection_flags: Vec<InjectionFlag>,
    /// `true` when content was truncated to `max_content_size` before further processing.
    pub was_truncated: bool,
416}