chio_guards/text_utils.rs
1//! Text canonicalization utilities shared across content-safety guards.
2//!
3//! These functions normalize free-form text before running regex-based signal
4//! detection. Canonicalization is deliberately conservative: the output is a
5//! lowercase ASCII-biased form that preserves the general shape of the input
6//! but strips common obfuscation techniques (zero-width splicing, homoglyph
7//! substitution, punctuation runs, case flipping).
8//!
9//! This module is shared infrastructure for the
10//! [`crate::prompt_injection::PromptInjectionGuard`] and the forthcoming
11//! jailbreak guard. It has no external dependencies beyond the standard
12//! library and is safe to use in fail-closed guard paths.
13
14/// The canonical-form representation of an input string.
15///
16/// The returned `String` has:
17///
18/// - all ASCII letters lowercased;
19/// - common Unicode homoglyphs of Latin letters folded to their ASCII
20/// counterparts (e.g. Cyrillic `а` -> `a`, full-width digits -> ASCII);
21/// - zero-width and Unicode formatting characters removed;
22/// - runs of two or more separator-class punctuation characters collapsed
23/// to a single space.
24///
25/// This is NOT a security-grade Unicode normaliser. It is a best-effort
26/// heuristic that defeats the most common copy-paste prompt injection
27/// tricks seen in the wild. Callers still need to bound the input length
28/// (`max_scan_bytes`) and fail-closed on internal errors.
29pub fn canonicalize(input: &str) -> String {
30 // First pass: strip zero-width / format characters, fold homoglyphs,
31 // lowercase ASCII letters in one sweep. We also collect a secondary
32 // pass indicator: whether the previous emitted character was a
33 // punctuation run that should be collapsed.
34 let mut out = String::with_capacity(input.len());
35 for ch in input.chars() {
36 if is_zero_width(ch) {
37 continue;
38 }
39 let mapped = fold_homoglyph(ch);
40 // Lowercase only for ASCII letters; leave folded ASCII as-is.
41 if mapped.is_ascii_uppercase() {
42 out.push(mapped.to_ascii_lowercase());
43 } else {
44 out.push(mapped);
45 }
46 }
47
48 // Second pass: collapse whitespace runs and separator-punctuation runs
49 // to a single space, and trim the result.
50 collapse_runs(&out)
51}
52
/// Report whether `ch` is one of the invisible / formatting code points this
/// module treats as obfuscation filler.
///
/// The set is deliberately small rather than exhaustive: zero-width spaces
/// and joiners, directional marks and embedding/override controls, the word
/// joiner and invisible math operators, the BOM, and a few script-specific
/// marks. Code points not listed here (for example ordinary spaces) return
/// `false`.
pub fn is_zero_width(ch: char) -> bool {
    // Matching on the scalar value lets adjacent code points share a range.
    matches!(
        u32::from(ch),
        0x034F                  // COMBINING GRAPHEME JOINER
            | 0x061C            // ARABIC LETTER MARK
            | 0x180E            // MONGOLIAN VOWEL SEPARATOR
            | 0x200B..=0x200F   // ZWSP, ZWNJ, ZWJ, LRM, RLM
            | 0x202A..=0x202E   // LRE, RLE, PDF, LRO, RLO
            | 0x2060..=0x2064   // WORD JOINER + invisible function/times/separator/plus
            | 0xFEFF            // BOM / ZERO WIDTH NO-BREAK SPACE
    )
}
76
/// Map a known look-alike character to the ASCII character it imitates.
///
/// Characters without an entry are returned unchanged. Case is preserved
/// (a capital look-alike folds to a capital ASCII letter).
///
/// The table is intentionally small and covers three groups:
///
/// - Cyrillic letters that resemble Latin letters;
/// - Greek letters that resemble Latin letters;
/// - the full-width ASCII block `U+FF01..=U+FF5E` (punctuation, digits and
///   letters).
///
/// Adding entries later is purely additive.
fn fold_homoglyph(ch: char) -> char {
    // Arms are grouped by the ASCII target; within each arm the Cyrillic code
    // point (U+04xx) comes first, followed by the Greek one (U+03xx) if any.
    // Escapes are used so visually identical glyphs cannot be confused.
    match ch {
        '\u{0410}' | '\u{0391}' => 'A',
        '\u{0430}' | '\u{03B1}' => 'a',
        '\u{0412}' | '\u{0392}' => 'B',
        '\u{0421}' => 'C',
        '\u{0441}' => 'c',
        '\u{0415}' | '\u{0395}' => 'E',
        '\u{0435}' | '\u{03B5}' => 'e',
        '\u{041D}' | '\u{0397}' => 'H',
        '\u{0406}' | '\u{0399}' => 'I',
        '\u{0456}' | '\u{03B9}' => 'i',
        '\u{041A}' | '\u{039A}' => 'K',
        '\u{041C}' | '\u{039C}' => 'M',
        '\u{039D}' => 'N',
        '\u{041E}' | '\u{039F}' => 'O',
        '\u{043E}' | '\u{03BF}' => 'o',
        '\u{0420}' | '\u{03A1}' => 'P',
        '\u{0440}' => 'p',
        '\u{0422}' | '\u{03A4}' => 'T',
        '\u{0425}' | '\u{03A7}' => 'X',
        '\u{0445}' => 'x',
        '\u{0423}' | '\u{03A5}' => 'Y',
        '\u{0443}' => 'y',
        // Full-width forms sit at a fixed offset of 0xFEE0 above their ASCII
        // counterparts, so U+FF01..=U+FF5E lands on U+0021..=U+007E. That
        // covers full-width digits as well. The fallback to `ch` is purely
        // defensive: every value in the target range is a valid `char`.
        '\u{FF01}'..='\u{FF5E}' => char::from_u32(u32::from(ch) - 0xFEE0).unwrap_or(ch),
        unmapped => unmapped,
    }
}
138
139/// Collapse runs of whitespace and separator punctuation into a single space,
140/// then trim leading/trailing whitespace. This prevents attackers from
141/// evading regex matchers by splicing extra punctuation into key phrases.
142fn collapse_runs(input: &str) -> String {
143 let mut out = String::with_capacity(input.len());
144 let mut prev_was_break = false;
145 for ch in input.chars() {
146 let is_break = ch.is_whitespace() || is_separator_punct(ch);
147 if is_break {
148 if !prev_was_break && !out.is_empty() {
149 out.push(' ');
150 }
151 prev_was_break = true;
152 } else {
153 out.push(ch);
154 prev_was_break = false;
155 }
156 }
157 let trimmed = out.trim_end().to_string();
158 trimmed
159}
160
/// Report whether `ch` belongs to the ASCII separator-punctuation set that
/// `collapse_runs` treats as a break, so that inputs such as
/// `ignore---all---previous` normalise cleanly.
///
/// `collapse_runs` replaces every occurrence of these characters with a
/// space, including a lone one; consecutive occurrences share a single space.
///
/// Note: `:` and `/` are intentionally excluded so URL-shaped substrings
/// (`https://`) survive canonicalization and remain matchable by the
/// exfiltration-framing signal.
fn is_separator_punct(ch: char) -> bool {
    // The full set, spelled out once: - _ ~ = * + . , ; | and backslash.
    const SEPARATORS: &str = "-_~=*+.,;|\\";
    SEPARATORS.contains(ch)
}
175
/// Cut `input` down to at most `max_bytes` bytes without splitting a
/// multi-byte UTF-8 sequence.
///
/// Returns the (possibly shortened) prefix together with a flag that is
/// `true` only when bytes were actually dropped. When `max_bytes` falls in
/// the middle of a character, the cut moves back to the start of that
/// character, so the result may be shorter than `max_bytes` (down to the
/// empty string). Guards use this to bound scan cost.
pub fn truncate_at_char_boundary(input: &str, max_bytes: usize) -> (&str, bool) {
    if max_bytes >= input.len() {
        return (input, false);
    }
    // Search downwards from the byte limit for the closest boundary. Offset 0
    // is always a boundary, so the search cannot come up empty; the fallback
    // only exists to avoid a panic path.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&offset| input.is_char_boundary(offset))
        .unwrap_or(0);
    (&input[..cut], true)
}
191
/// Fraction of the non-whitespace characters of `s` that are not
/// alphanumeric.
///
/// Whitespace is excluded from both the numerator and the denominator.
/// Everything else that fails `char::is_alphanumeric` counts as a symbol.
/// The statistical jailbreak layer uses this to flag inputs whose visible
/// content is dominated by symbols (a common adversarial-suffix shape).
///
/// Returns `0.0` for empty or all-whitespace input; otherwise a value in
/// `[0.0, 1.0]`.
pub fn punctuation_ratio(s: &str) -> f32 {
    // One pass, carrying (symbol count, visible count).
    let (symbols, visible) = s
        .chars()
        .filter(|c| !c.is_whitespace())
        .fold((0usize, 0usize), |(sym, vis), c| {
            (sym + usize::from(!c.is_alphanumeric()), vis + 1)
        });
    match visible {
        0 => 0.0,
        _ => symbols as f32 / visible as f32,
    }
}
215
/// Report whether `s` contains at least `min_run` consecutive characters
/// that are neither alphanumeric nor whitespace.
///
/// Adversarial suffixes in the wild typically appear as long unbroken
/// punctuation / symbol sequences. Any alphanumeric or whitespace character
/// ends the current run.
///
/// A `min_run` of `0` is trivially satisfied and returns `true` even for
/// empty input.
pub fn long_run_of_symbols(s: &str, min_run: usize) -> bool {
    if min_run == 0 {
        return true;
    }
    // Length of the symbol run ending at the current character.
    let mut streak = 0usize;
    s.chars().any(|c| {
        streak = if c.is_alphanumeric() || c.is_whitespace() {
            0
        } else {
            streak + 1
        };
        streak >= min_run
    })
}
236
/// Shannon entropy, in bits per character, of the ASCII non-whitespace bytes
/// of `s`.
///
/// Bytes `>= 128` (i.e. every byte of a non-ASCII character) and ASCII
/// whitespace are skipped. If nothing remains the result is `0.0`.
///
/// This is a cheap proxy for character diversity: payloads dominated by a
/// handful of symbols score low, while uniform-random adversarial suffixes
/// score high.
pub fn shannon_entropy_ascii_nonws(s: &str) -> f32 {
    // Frequency table indexed by byte value; only ASCII ever reaches it.
    let mut histogram = [0u32; 128];
    let mut sample_size = 0u32;
    for byte in s
        .bytes()
        .filter(|b| b.is_ascii() && !b.is_ascii_whitespace())
    {
        let slot = &mut histogram[usize::from(byte)];
        *slot = slot.saturating_add(1);
        sample_size = sample_size.saturating_add(1);
    }
    if sample_size == 0 {
        return 0.0;
    }

    // H = -sum(p * log2(p)) over the byte values that actually occurred,
    // accumulated in f64 and narrowed once at the end.
    let sample_size = f64::from(sample_size);
    let bits = histogram
        .iter()
        .filter(|&&count| count != 0)
        .fold(0.0f64, |acc, &count| {
            let p = f64::from(count) / sample_size;
            acc - p * p.log2()
        });
    bits as f32
}
267
268/// Number of zero-width / Unicode formatting codepoints in `s` (using the
269/// [`is_zero_width`] predicate). Useful for a statistical "obfuscation"
270/// signal that fires even when canonicalization has already stripped the
271/// characters: callers count on the original pre-canonicalization string.
272pub fn zero_width_count(s: &str) -> usize {
273 s.chars().filter(|c| is_zero_width(*c)).count()
274}
275
/// Ratio of distinct character shingles (sliding `n`-grams) to the total
/// number of shingles in `s`.
///
/// Lower values indicate heavy repetition (a hallmark of token-spam /
/// adversarial-suffix attacks). Shingles are taken over `char`s, not bytes.
/// This function does not canonicalize `s` itself; pass already-canonicalized
/// text if that is what should be measured.
///
/// `n` is clamped to `[1, 16]`; callers typically pick `n = 3` for character
/// trigrams, which balance sensitivity against random noise.
///
/// Returns `1.0` when `s` has fewer than `n` characters (including the empty
/// string), since there is nothing to compare. Otherwise the result lies in
/// `(0.0, 1.0]`.
pub fn shingle_uniqueness(s: &str, n: usize) -> f32 {
    use std::collections::HashSet;

    let n = n.clamp(1, 16);
    let chars: Vec<char> = s.chars().collect();
    if chars.len() < n {
        return 1.0;
    }
    // `n >= 1` and `chars.len() >= n`, so there is always at least one window
    // and the division below cannot be by zero.
    let total = chars.len() - n + 1;
    // Hash the borrowed windows directly rather than building a `String` per
    // shingle; slice equality/hash is element-wise, so the distinct count is
    // the same.
    let distinct: HashSet<&[char]> = chars.windows(n).collect();
    distinct.len() as f32 / total as f32
}
301
#[cfg(test)]
mod tests {
    //! Unit tests for the canonicalization and statistics helpers.
    //!
    //! Non-ASCII inputs are written with `\u{…}` escapes so the exact code
    //! points under test survive editors and tools that normalize Unicode.

    use super::*;

    #[test]
    fn canonicalize_lowercases_ascii() {
        assert_eq!(canonicalize("IGNORE ALL"), "ignore all");
    }

    #[test]
    fn canonicalize_strips_zero_width() {
        let sneaky = "ig\u{200B}no\u{200C}re all";
        assert_eq!(canonicalize(sneaky), "ignore all");
    }

    #[test]
    fn canonicalize_folds_homoglyphs() {
        // Cyrillic U+0440 (er) -> ASCII "p"; lowercase and fold together.
        let disguised = "igno\u{0440}e";
        assert_eq!(canonicalize(disguised), "ignope");
        // Full-width "IGNORE" (U+FF29 U+FF27 U+FF2E U+FF2F U+FF32 U+FF25)
        // folds via the 0xFEE0 offset and is then lowercased.
        let full_width = "\u{FF29}\u{FF27}\u{FF2E}\u{FF2F}\u{FF32}\u{FF25}";
        assert_eq!(canonicalize(full_width), "ignore");
    }

    #[test]
    fn canonicalize_collapses_separators() {
        assert_eq!(
            canonicalize("ignore---all___previous"),
            "ignore all previous"
        );
    }

    #[test]
    fn truncate_respects_utf8_boundary() {
        // U+00E9 (e with acute) is two bytes in UTF-8, so a limit of 2 falls
        // in the middle of it.
        let input = "h\u{00E9}llo";
        let (out, truncated) = truncate_at_char_boundary(input, 2);
        assert!(truncated);
        assert_eq!(out, "h");
    }

    #[test]
    fn truncate_short_input_unchanged() {
        let (out, truncated) = truncate_at_char_boundary("hi", 100);
        assert!(!truncated);
        assert_eq!(out, "hi");
    }

    #[test]
    fn punctuation_ratio_basic() {
        assert_eq!(punctuation_ratio(""), 0.0);
        assert_eq!(punctuation_ratio("  \n\t"), 0.0);
        // All alphanum -> 0.0.
        assert_eq!(punctuation_ratio("abc123"), 0.0);
        // All punctuation -> 1.0.
        assert_eq!(punctuation_ratio("!!!@@@"), 1.0);
        // Half and half (non-whitespace): 3/6 = 0.5.
        assert!((punctuation_ratio("ab;c;!") - 0.5).abs() < 1e-6);
    }

    #[test]
    fn long_run_of_symbols_detects_runs() {
        assert!(!long_run_of_symbols("hello world", 12));
        assert!(long_run_of_symbols("hello !!!!!!!!!!!! world", 12));
        assert!(!long_run_of_symbols("hello !!! world", 12));
        // min_run 0 is trivially true even for empty input.
        assert!(long_run_of_symbols("", 0));
    }

    #[test]
    fn shannon_entropy_ascii_nonws_bounds() {
        // All-one-character -> 0 entropy.
        assert!(shannon_entropy_ascii_nonws("aaaaaa") < 1e-6);
        // Two equiprobable characters -> 1 bit.
        let e = shannon_entropy_ascii_nonws("abababab");
        assert!((e - 1.0).abs() < 0.1);
        // Empty input -> 0.
        assert_eq!(shannon_entropy_ascii_nonws(""), 0.0);
    }

    #[test]
    fn zero_width_count_matches_inserts() {
        let s = "a\u{200B}b\u{200C}c\u{FEFF}d";
        assert_eq!(zero_width_count(s), 3);
        assert_eq!(zero_width_count("plain"), 0);
    }

    #[test]
    fn shingle_uniqueness_detects_repetition() {
        // Unique input: every trigram distinct.
        let u = shingle_uniqueness("abcdefg", 3);
        assert!((u - 1.0).abs() < 1e-6);
        // Repeated trigrams: "aaa" repeats.
        let r = shingle_uniqueness("aaaaaaaaa", 3);
        assert!(r < 0.2, "expected low uniqueness, got {r}");
        // Too-short input returns 1.0.
        assert_eq!(shingle_uniqueness("ab", 3), 1.0);
    }
}