// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
// SPDX-License-Identifier: MIT OR Apache-2.0

//! zeph_sanitizer/types.rs — trust model, content provenance, and sanitizer output types.

use serde::{Deserialize, Serialize};

// ---------------------------------------------------------------------------
// Trust model
// ---------------------------------------------------------------------------
10/// Trust tier assigned to content entering the agent context.
11///
12/// Drives spotlighting intensity: [`Trusted`](ContentTrustLevel::Trusted) content passes
13/// through unchanged; [`ExternalUntrusted`](ContentTrustLevel::ExternalUntrusted) receives
14/// the strongest warning header.
15#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
16#[serde(rename_all = "snake_case")]
17pub enum ContentTrustLevel {
18    /// System prompt, hardcoded instructions, direct user input. No wrapping applied.
19    Trusted,
20    /// Tool results from local executors (shell, file I/O). Lighter warning.
21    LocalUntrusted,
22    /// External sources: web scrape, MCP, A2A, memory retrieval. Strongest warning.
23    ExternalUntrusted,
24}
25
26/// All known content source categories.
27///
28/// Used for spotlighting annotation and future per-source config overrides.
29#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
30#[serde(rename_all = "snake_case")]
31pub enum ContentSourceKind {
32    ToolResult,
33    WebScrape,
34    McpResponse,
35    A2aMessage,
36    /// Content retrieved from Qdrant/SQLite semantic memory.
37    ///
38    /// Memory poisoning is a documented attack vector: an adversary can plant injection
39    /// payloads in web content that gets stored, then recalled in future sessions.
40    MemoryRetrieval,
41    /// Project-level instruction files (`.zeph/zeph.md`, CLAUDE.md, etc.).
42    ///
43    /// Treated as `LocalUntrusted` by default. Path-based trust inference (e.g. treating
44    /// user-authored files as `Trusted`) is a Phase 2 concern.
45    InstructionFile,
46}
47
48impl ContentSourceKind {
49    /// Returns the default trust level for this source kind.
50    #[must_use]
51    pub fn default_trust_level(self) -> ContentTrustLevel {
52        match self {
53            Self::ToolResult | Self::InstructionFile => ContentTrustLevel::LocalUntrusted,
54            Self::WebScrape | Self::McpResponse | Self::A2aMessage | Self::MemoryRetrieval => {
55                ContentTrustLevel::ExternalUntrusted
56            }
57        }
58    }
59
60    pub(crate) fn as_str(self) -> &'static str {
61        match self {
62            Self::ToolResult => "tool_result",
63            Self::WebScrape => "web_scrape",
64            Self::McpResponse => "mcp_response",
65            Self::A2aMessage => "a2a_message",
66            Self::MemoryRetrieval => "memory_retrieval",
67            Self::InstructionFile => "instruction_file",
68        }
69    }
70
71    /// Parse a string into a `ContentSourceKind`.
72    ///
73    /// Returns `None` for unrecognized strings (instead of an error) so callers
74    /// can log a warning and skip unknown values without breaking deserialization.
75    #[must_use]
76    pub fn from_str_opt(s: &str) -> Option<Self> {
77        match s {
78            "tool_result" => Some(Self::ToolResult),
79            "web_scrape" => Some(Self::WebScrape),
80            "mcp_response" => Some(Self::McpResponse),
81            "a2a_message" => Some(Self::A2aMessage),
82            "memory_retrieval" => Some(Self::MemoryRetrieval),
83            "instruction_file" => Some(Self::InstructionFile),
84            _ => None,
85        }
86    }
87}
88
/// Hint about the origin of memory-retrieved content.
///
/// Used to modulate injection detection sensitivity within [`ContentSanitizer::sanitize`].
/// The hint is set at call-site (compile-time) based on which retrieval path produced the
/// content — it cannot be influenced by the content itself and thus cannot be spoofed.
///
/// # Defense-in-depth invariant
///
/// Setting a hint to [`ConversationHistory`](MemorySourceHint::ConversationHistory) or
/// [`LlmSummary`](MemorySourceHint::LlmSummary) **only** skips injection pattern detection
/// (step 3). Truncation, control-character stripping, delimiter escaping, and spotlighting
/// remain active for all sources regardless of this hint.
///
/// # Known limitation: indirect memory poisoning
///
/// Conversation history is treated as first-party (user-typed) content. However, the LLM
/// may call `memory_save` with content derived from a prior injection in external sources
/// (web scrape → spotlighted → LLM stores payload → recalled as `[assistant]` turn).
/// Mitigate by configuring `forbidden_content_patterns` in `[memory.validation]` to block
/// known injection strings on the write path. This risk is pre-existing and is not worsened
/// by the hint mechanism.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MemorySourceHint {
    /// Prior user/assistant conversation turns (semantic recall, corrections).
    ///
    /// Injection patterns in recalled user text are expected false positives — the user
    /// legitimately discussed topics like "system prompt" or "show your instructions".
    ConversationHistory,
    /// LLM-generated summaries (session summaries, cross-session context).
    ///
    /// Low risk: generated by the agent's own model from already-sanitized content.
    LlmSummary,
    /// External document chunks or graph entity facts.
    ///
    /// Full detection applies — may contain adversarial content from web scrapes,
    /// MCP responses, or other untrusted sources that were stored in the corpus.
    ExternalContent,
}

128/// Provenance metadata attached to a piece of untrusted content.
129#[derive(Debug, Clone)]
130pub struct ContentSource {
131    pub kind: ContentSourceKind,
132    pub trust_level: ContentTrustLevel,
133    /// Optional identifier: tool name, URL, agent ID, etc.
134    pub identifier: Option<String>,
135    /// Optional hint for memory retrieval sub-sources. When `Some`, modulates injection
136    /// detection sensitivity in [`ContentSanitizer::sanitize`]. Non-memory sources leave
137    /// this as `None` — full detection applies.
138    pub memory_hint: Option<MemorySourceHint>,
139}
140
141impl ContentSource {
142    #[must_use]
143    pub fn new(kind: ContentSourceKind) -> Self {
144        Self {
145            trust_level: kind.default_trust_level(),
146            kind,
147            identifier: None,
148            memory_hint: None,
149        }
150    }
151
152    #[must_use]
153    pub fn with_identifier(mut self, id: impl Into<String>) -> Self {
154        self.identifier = Some(id.into());
155        self
156    }
157
158    #[must_use]
159    pub fn with_trust_level(mut self, level: ContentTrustLevel) -> Self {
160        self.trust_level = level;
161        self
162    }
163
164    /// Attach a memory source hint to modulate injection detection sensitivity.
165    ///
166    /// Only meaningful for `ContentSourceKind::MemoryRetrieval` sources.
167    #[must_use]
168    pub fn with_memory_hint(mut self, hint: MemorySourceHint) -> Self {
169        self.memory_hint = Some(hint);
170        self
171    }
172}
173
// ---------------------------------------------------------------------------
// Output types
// ---------------------------------------------------------------------------

/// A single detected injection pattern match.
#[derive(Debug, Clone)]
pub struct InjectionFlag {
    /// Static name of the pattern that matched (from the pattern registry).
    pub pattern_name: &'static str,
    /// Byte offset of the match within the (already truncated, stripped) content.
    pub byte_offset: usize,
    /// The exact text that triggered the match, for operator diagnostics.
    pub matched_text: String,
}

/// Result of ML-based injection classification.
///
/// Replaces the previous `bool` return type of `classify_injection` to support
/// a defense-in-depth dual-threshold model. Real-world ML injection classifiers
/// have 12–37% recall gaps at high confidence thresholds, so `Suspicious` content
/// is surfaced for operator visibility without blocking — a mandatory second layer.
#[cfg(feature = "classifiers")]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InjectionVerdict {
    /// Score below soft threshold — no injection signal detected.
    Clean,
    /// Score ≥ soft threshold but < hard threshold — suspicious, warn only.
    Suspicious,
    /// Score ≥ hard threshold — injection detected, block.
    Blocked,
}

/// Classification result from the three-class `AlignSentinel` model.
///
/// Used to refine binary injection verdicts: `AlignedInstruction` and `NoInstruction`
/// results downgrade `Suspicious`/`Blocked` to `Clean`, reducing false positives from
/// legitimate instruction-style content in tool outputs.
#[cfg(feature = "classifiers")]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InstructionClass {
    NoInstruction,
    AlignedInstruction,
    MisalignedInstruction,
    /// Model returned an unknown label. Treated conservatively — verdict is NOT downgraded.
    Unknown,
}

#[cfg(feature = "classifiers")]
impl InstructionClass {
    /// Map a raw classifier label to an [`InstructionClass`].
    ///
    /// Matching is case-insensitive and accepts snake_case, kebab-case, and short
    /// aliases. Unrecognized labels map to [`InstructionClass::Unknown`] so callers
    /// treat them conservatively (the verdict is not downgraded).
    pub(crate) fn from_label(label: &str) -> Self {
        // Unicode-aware lowercasing is deliberate: model labels should be ASCII,
        // but this keeps behavior well-defined for any input.
        match label.to_lowercase().as_str() {
            "no_instruction" | "no-instruction" | "none" => Self::NoInstruction,
            "aligned_instruction" | "aligned-instruction" | "aligned" => Self::AlignedInstruction,
            "misaligned_instruction" | "misaligned-instruction" | "misaligned" => {
                Self::MisalignedInstruction
            }
            _ => Self::Unknown,
        }
    }
}

/// Result of the sanitization pipeline for a single piece of content.
#[derive(Debug, Clone)]
pub struct SanitizedContent {
    /// The processed, possibly spotlighted body to insert into message history.
    pub body: String,
    pub source: ContentSource,
    pub injection_flags: Vec<InjectionFlag>,
    /// `true` when content was truncated to `max_content_size`.
    pub was_truncated: bool,