// zeph_sanitizer/types.rs
1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4use serde::{Deserialize, Serialize};
5
6// ---------------------------------------------------------------------------
7// Trust model
8// ---------------------------------------------------------------------------
9
/// Trust tier assigned to content entering the agent context.
///
/// Drives spotlighting intensity: [`Trusted`](ContentTrustLevel::Trusted) content passes
/// through unchanged; [`ExternalUntrusted`](ContentTrustLevel::ExternalUntrusted) receives
/// the strongest warning header.
///
/// Serialized in snake_case (`trusted`, `local_untrusted`, `external_untrusted`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ContentTrustLevel {
    /// System prompt, hardcoded instructions, direct user input. No wrapping applied.
    Trusted,
    /// Tool results from local executors (shell, file I/O). Lighter warning.
    LocalUntrusted,
    /// External sources: web scrape, MCP, A2A, memory retrieval. Strongest warning.
    ExternalUntrusted,
}
25
/// All known content source categories.
///
/// Used for spotlighting annotation and future per-source config overrides.
/// Serialized in snake_case, mirroring the names returned by the crate-internal
/// string mapping (`tool_result`, `web_scrape`, ...).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ContentSourceKind {
    /// Result returned by a locally executed tool (defaults to `LocalUntrusted`).
    ToolResult,
    /// Content fetched from the web (defaults to `ExternalUntrusted`).
    WebScrape,
    /// Response from an MCP server (defaults to `ExternalUntrusted`).
    McpResponse,
    /// Message received from another agent over A2A (defaults to `ExternalUntrusted`).
    A2aMessage,
    /// Content retrieved from Qdrant/SQLite semantic memory.
    ///
    /// Memory poisoning is a documented attack vector: an adversary can plant injection
    /// payloads in web content that gets stored, then recalled in future sessions.
    MemoryRetrieval,
    /// Project-level instruction files (`.zeph/zeph.md`, CLAUDE.md, etc.).
    ///
    /// Treated as `LocalUntrusted` by default. Path-based trust inference (e.g. treating
    /// user-authored files as `Trusted`) is a Phase 2 concern.
    InstructionFile,
}
47
48impl ContentSourceKind {
49 /// Returns the default trust level for this source kind.
50 #[must_use]
51 pub fn default_trust_level(self) -> ContentTrustLevel {
52 match self {
53 Self::ToolResult | Self::InstructionFile => ContentTrustLevel::LocalUntrusted,
54 Self::WebScrape | Self::McpResponse | Self::A2aMessage | Self::MemoryRetrieval => {
55 ContentTrustLevel::ExternalUntrusted
56 }
57 }
58 }
59
60 pub(crate) fn as_str(self) -> &'static str {
61 match self {
62 Self::ToolResult => "tool_result",
63 Self::WebScrape => "web_scrape",
64 Self::McpResponse => "mcp_response",
65 Self::A2aMessage => "a2a_message",
66 Self::MemoryRetrieval => "memory_retrieval",
67 Self::InstructionFile => "instruction_file",
68 }
69 }
70
71 /// Parse a string into a `ContentSourceKind`.
72 ///
73 /// Returns `None` for unrecognized strings (instead of an error) so callers
74 /// can log a warning and skip unknown values without breaking deserialization.
75 #[must_use]
76 pub fn from_str_opt(s: &str) -> Option<Self> {
77 match s {
78 "tool_result" => Some(Self::ToolResult),
79 "web_scrape" => Some(Self::WebScrape),
80 "mcp_response" => Some(Self::McpResponse),
81 "a2a_message" => Some(Self::A2aMessage),
82 "memory_retrieval" => Some(Self::MemoryRetrieval),
83 "instruction_file" => Some(Self::InstructionFile),
84 _ => None,
85 }
86 }
87}
88
/// Hint about the origin of memory-retrieved content.
///
/// Used to modulate injection detection sensitivity within [`ContentSanitizer::sanitize`].
/// The hint is set at call-site (compile-time) based on which retrieval path produced the
/// content — it cannot be influenced by the content itself and thus cannot be spoofed.
/// For the same reason it deliberately derives no serde traits: it is never parsed from
/// config or from the content stream.
///
/// # Defense-in-depth invariant
///
/// Setting a hint to [`ConversationHistory`](MemorySourceHint::ConversationHistory) or
/// [`LlmSummary`](MemorySourceHint::LlmSummary) **only** skips injection pattern detection
/// (step 3). Truncation, control-character stripping, delimiter escaping, and spotlighting
/// remain active for all sources regardless of this hint.
///
/// # Known limitation: indirect memory poisoning
///
/// Conversation history is treated as first-party (user-typed) content. However, the LLM
/// may call `memory_save` with content derived from a prior injection in external sources
/// (web scrape → spotlighted → LLM stores payload → recalled as `[assistant]` turn).
/// Mitigate by configuring `forbidden_content_patterns` in `[memory.validation]` to block
/// known injection strings on the write path. This risk is pre-existing and is not worsened
/// by the hint mechanism.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MemorySourceHint {
    /// Prior user/assistant conversation turns (semantic recall, corrections).
    ///
    /// Injection patterns in recalled user text are expected false positives — the user
    /// legitimately discussed topics like "system prompt" or "show your instructions".
    ConversationHistory,
    /// LLM-generated summaries (session summaries, cross-session context).
    ///
    /// Low risk: generated by the agent's own model from already-sanitized content.
    LlmSummary,
    /// External document chunks or graph entity facts.
    ///
    /// Full detection applies — may contain adversarial content from web scrapes,
    /// MCP responses, or other untrusted sources that were stored in the corpus.
    ExternalContent,
}
127
/// Provenance metadata attached to a piece of untrusted content.
#[derive(Debug, Clone)]
pub struct ContentSource {
    /// Category of the originating channel (tool, web, MCP, A2A, memory, instruction file).
    pub kind: ContentSourceKind,
    /// Effective trust tier. [`ContentSource::new`] initializes it from
    /// `kind.default_trust_level()`; callers may override it afterwards.
    pub trust_level: ContentTrustLevel,
    /// Optional identifier: tool name, URL, agent ID, etc.
    pub identifier: Option<String>,
    /// Optional hint for memory retrieval sub-sources. When `Some`, modulates injection
    /// detection sensitivity in [`ContentSanitizer::sanitize`]. Non-memory sources leave
    /// this as `None` — full detection applies.
    pub memory_hint: Option<MemorySourceHint>,
}
140
141impl ContentSource {
142 #[must_use]
143 pub fn new(kind: ContentSourceKind) -> Self {
144 Self {
145 trust_level: kind.default_trust_level(),
146 kind,
147 identifier: None,
148 memory_hint: None,
149 }
150 }
151
152 #[must_use]
153 pub fn with_identifier(mut self, id: impl Into<String>) -> Self {
154 self.identifier = Some(id.into());
155 self
156 }
157
158 #[must_use]
159 pub fn with_trust_level(mut self, level: ContentTrustLevel) -> Self {
160 self.trust_level = level;
161 self
162 }
163
164 /// Attach a memory source hint to modulate injection detection sensitivity.
165 ///
166 /// Only meaningful for `ContentSourceKind::MemoryRetrieval` sources.
167 #[must_use]
168 pub fn with_memory_hint(mut self, hint: MemorySourceHint) -> Self {
169 self.memory_hint = Some(hint);
170 self
171 }
172}
173
174// ---------------------------------------------------------------------------
175// Output types
176// ---------------------------------------------------------------------------
177
/// A single detected injection pattern match.
#[derive(Debug, Clone)]
pub struct InjectionFlag {
    /// Name of the static detection pattern that fired.
    pub pattern_name: &'static str,
    /// Byte offset of the match within the (already truncated, stripped) content.
    pub byte_offset: usize,
    /// The exact substring that matched the pattern.
    pub matched_text: String,
}
186
/// Result of ML-based injection classification.
///
/// Replaces the previous `bool` return type of `classify_injection` to support
/// a defense-in-depth dual-threshold model. Real-world ML injection classifiers
/// have 12–37% recall gaps at high confidence thresholds, so `Suspicious` content
/// is surfaced for operator visibility without blocking — a mandatory second layer.
#[cfg(feature = "classifiers")]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InjectionVerdict {
    /// Score below soft threshold — no injection signal detected.
    Clean,
    /// Score ≥ soft threshold but < hard threshold — suspicious, warn only.
    Suspicious,
    /// Score ≥ hard threshold — injection detected, block.
    Blocked,
}
203
/// Classification result from the three-class `AlignSentinel` model.
///
/// Used to refine binary injection verdicts: `AlignedInstruction` and `NoInstruction`
/// results downgrade `Suspicious`/`Blocked` to `Clean`, reducing false positives from
/// legitimate instruction-style content in tool outputs.
#[cfg(feature = "classifiers")]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InstructionClass {
    /// No instruction-like content detected — verdict is downgraded to `Clean`.
    NoInstruction,
    /// Instruction-style but legitimate content — verdict is downgraded to `Clean`.
    AlignedInstruction,
    /// Instruction content flagged as misaligned — verdict is NOT downgraded.
    MisalignedInstruction,
    /// Model returned an unknown label. Treated conservatively — verdict is NOT downgraded.
    Unknown,
}
218
#[cfg(feature = "classifiers")]
impl InstructionClass {
    /// Map a raw classifier label to a class, tolerating case and `-`/`_` separator
    /// variants. Unrecognized labels become [`Unknown`](Self::Unknown) so the caller
    /// keeps the conservative (non-downgraded) verdict.
    pub(crate) fn from_label(label: &str) -> Self {
        // Normalize once — lowercase and unify `-` to `_` — so each alias needs one arm.
        let normalized = label.to_lowercase().replace('-', "_");
        match normalized.as_str() {
            "no_instruction" | "none" => Self::NoInstruction,
            "aligned_instruction" | "aligned" => Self::AlignedInstruction,
            "misaligned_instruction" | "misaligned" => Self::MisalignedInstruction,
            _ => Self::Unknown,
        }
    }
}
232
/// Result of the sanitization pipeline for a single piece of content.
#[derive(Debug, Clone)]
pub struct SanitizedContent {
    /// The processed, possibly spotlighted body to insert into message history.
    pub body: String,
    /// Provenance of the input content, carried through unchanged for downstream audit.
    pub source: ContentSource,
    /// Pattern matches found during injection detection; empty when nothing fired
    /// (or when detection was skipped via a memory hint).
    pub injection_flags: Vec<InjectionFlag>,
    /// `true` when content was truncated to `max_content_size`.
    pub was_truncated: bool,
}
242}