Skip to main content

chio_guards/
text_utils.rs

//! Text canonicalization utilities shared across content-safety guards.
//!
//! These functions normalize free-form text before running regex-based signal
//! detection.  Canonicalization is deliberately conservative: the output is a
//! lowercase ASCII-biased form that preserves the general shape of the input
//! but strips common obfuscation techniques (zero-width splicing, homoglyph
//! substitution, punctuation runs, case flipping).
//!
//! This module is shared infrastructure for the
//! [`crate::prompt_injection::PromptInjectionGuard`] and the forthcoming
//! jailbreak guard.  It has no external dependencies beyond the standard
//! library and is safe to use in fail-closed guard paths.
13
14/// The canonical-form representation of an input string.
15///
16/// The returned `String` has:
17///
18/// - all ASCII letters lowercased;
19/// - common Unicode homoglyphs of Latin letters folded to their ASCII
20///   counterparts (e.g. Cyrillic `а` -> `a`, full-width digits -> ASCII);
21/// - zero-width and Unicode formatting characters removed;
22/// - runs of two or more separator-class punctuation characters collapsed
23///   to a single space.
24///
25/// This is NOT a security-grade Unicode normaliser.  It is a best-effort
26/// heuristic that defeats the most common copy-paste prompt injection
27/// tricks seen in the wild.  Callers still need to bound the input length
28/// (`max_scan_bytes`) and fail-closed on internal errors.
29pub fn canonicalize(input: &str) -> String {
30    // First pass: strip zero-width / format characters, fold homoglyphs,
31    // lowercase ASCII letters in one sweep.  We also collect a secondary
32    // pass indicator: whether the previous emitted character was a
33    // punctuation run that should be collapsed.
34    let mut out = String::with_capacity(input.len());
35    for ch in input.chars() {
36        if is_zero_width(ch) {
37            continue;
38        }
39        let mapped = fold_homoglyph(ch);
40        // Lowercase only for ASCII letters; leave folded ASCII as-is.
41        if mapped.is_ascii_uppercase() {
42            out.push(mapped.to_ascii_lowercase());
43        } else {
44            out.push(mapped);
45        }
46    }
47
48    // Second pass: collapse whitespace runs and separator-punctuation runs
49    // to a single space, and trim the result.
50    collapse_runs(&out)
51}
52
/// Report whether `ch` is one of the invisible / formatting code points this
/// module strips before scanning.
///
/// The recognised set is exactly:
///
/// - U+034F, U+061C, U+180E (grapheme joiner, Arabic letter mark, Mongolian
///   vowel separator);
/// - U+200B..=U+200F (zero-width space / non-joiner / joiner, LRM, RLM);
/// - U+202A..=U+202E (bidirectional embedding and override controls);
/// - U+2060..=U+2064 (word joiner and the invisible operators);
/// - U+FEFF (byte-order mark / zero-width no-break space).
///
/// The list is deliberately not exhaustive; it targets characters commonly
/// used to obfuscate prompt content.
pub fn is_zero_width(ch: char) -> bool {
    matches!(
        ch,
        '\u{034F}' // COMBINING GRAPHEME JOINER
            | '\u{061C}' // ARABIC LETTER MARK
            | '\u{180E}' // MONGOLIAN VOWEL SEPARATOR
            | '\u{200B}'..='\u{200F}' // ZWSP, ZWNJ, ZWJ, LRM, RLM
            | '\u{202A}'..='\u{202E}' // LRE, RLE, PDF, LRO, RLO
            | '\u{2060}'..='\u{2064}' // WORD JOINER + invisible operators
            | '\u{FEFF}' // BOM / ZERO WIDTH NO-BREAK SPACE
    )
}
76
/// Map `ch` to the ASCII character it visually imitates, or return `ch`
/// unchanged when it is not in the table.
///
/// Three families are covered:
///
/// - full-width forms U+FF01..=U+FF5E, which mirror ASCII U+0021..=U+007E at
///   a constant offset of 0xFEE0 (this includes full-width digits, letters
///   and punctuation);
/// - a small set of Cyrillic letters that look like Latin letters;
/// - a small set of Greek letters that look like Latin letters.
///
/// Case is preserved by the fold (capital look-alikes map to capital ASCII);
/// lowercasing is left to the caller.  Code points are written as escapes so
/// that the look-alike glyphs cannot be confused with real ASCII when reading
/// the source.  Extending the table is purely additive.
fn fold_homoglyph(ch: char) -> char {
    // Full-width block: subtracting the offset always lands in printable
    // ASCII, so `from_u32` cannot fail here; the fallback keeps this total.
    if ('\u{FF01}'..='\u{FF5E}').contains(&ch) {
        return char::from_u32(u32::from(ch) - 0xFEE0).unwrap_or(ch);
    }

    // Each arm lists the Cyrillic code point first (when present), then the
    // Greek one (when present).
    match ch {
        '\u{0410}' | '\u{0391}' => 'A', // CYRILLIC A, GREEK ALPHA
        '\u{0430}' | '\u{03B1}' => 'a', // cyrillic a, greek alpha
        '\u{0412}' | '\u{0392}' => 'B', // CYRILLIC VE, GREEK BETA
        '\u{0421}' => 'C',              // CYRILLIC ES
        '\u{0441}' => 'c',              // cyrillic es
        '\u{0415}' | '\u{0395}' => 'E', // CYRILLIC IE, GREEK EPSILON
        '\u{0435}' | '\u{03B5}' => 'e', // cyrillic ie, greek epsilon
        '\u{041D}' | '\u{0397}' => 'H', // CYRILLIC EN, GREEK ETA
        '\u{0406}' | '\u{0399}' => 'I', // CYRILLIC BYELORUSSIAN-UKRAINIAN I, GREEK IOTA
        '\u{0456}' | '\u{03B9}' => 'i', // cyrillic byelorussian-ukrainian i, greek iota
        '\u{041A}' | '\u{039A}' => 'K', // CYRILLIC KA, GREEK KAPPA
        '\u{041C}' | '\u{039C}' => 'M', // CYRILLIC EM, GREEK MU
        '\u{039D}' => 'N',              // GREEK NU
        '\u{041E}' | '\u{039F}' => 'O', // CYRILLIC O, GREEK OMICRON
        '\u{043E}' | '\u{03BF}' => 'o', // cyrillic o, greek omicron
        '\u{0420}' | '\u{03A1}' => 'P', // CYRILLIC ER, GREEK RHO
        '\u{0440}' => 'p',              // cyrillic er
        '\u{0422}' | '\u{03A4}' => 'T', // CYRILLIC TE, GREEK TAU
        '\u{0425}' | '\u{03A7}' => 'X', // CYRILLIC HA, GREEK CHI
        '\u{0445}' => 'x',              // cyrillic ha
        '\u{0423}' | '\u{03A5}' => 'Y', // CYRILLIC U, GREEK UPSILON
        '\u{0443}' => 'y',              // cyrillic u
        other => other,
    }
}
138
139/// Collapse runs of whitespace and separator punctuation into a single space,
140/// then trim leading/trailing whitespace.  This prevents attackers from
141/// evading regex matchers by splicing extra punctuation into key phrases.
142fn collapse_runs(input: &str) -> String {
143    let mut out = String::with_capacity(input.len());
144    let mut prev_was_break = false;
145    for ch in input.chars() {
146        let is_break = ch.is_whitespace() || is_separator_punct(ch);
147        if is_break {
148            if !prev_was_break && !out.is_empty() {
149                out.push(' ');
150            }
151            prev_was_break = true;
152        } else {
153            out.push(ch);
154            prev_was_break = false;
155        }
156    }
157    let trimmed = out.trim_end().to_string();
158    trimmed
159}
160
/// Report whether `ch` is one of the ASCII punctuation characters treated as
/// a word separator: `-` `_` `~` `=` `*` `+` `.` `,` `;` `|` `\`.
///
/// `collapse_runs` treats these exactly like whitespace, so
/// "ignore---all---previous" normalises to "ignore all previous".  Note that
/// a single separator is replaced by a space as well, not only runs of two
/// or more.
///
/// `:` and `/` are intentionally absent so that the `://` of a URL scheme
/// (`https://`) survives canonicalization and remains matchable by the
/// exfiltration-framing signal.
fn is_separator_punct(ch: char) -> bool {
    const SEPARATORS: &[char] = &['-', '_', '~', '=', '*', '+', '.', ',', ';', '|', '\\'];
    SEPARATORS.contains(&ch)
}
175
/// Cut `input` down to at most `max_bytes` bytes without splitting a
/// multi-byte UTF-8 sequence.
///
/// Returns `(slice, truncated)`:
///
/// - when `input` already fits, the whole string and `false`;
/// - otherwise the longest prefix that ends on a character boundary at or
///   before `max_bytes`, and `true`.  The prefix can be empty if the first
///   character alone is wider than `max_bytes`.
///
/// Guards use this to bound scan cost.
pub fn truncate_at_char_boundary(input: &str, max_bytes: usize) -> (&str, bool) {
    if max_bytes >= input.len() {
        return (input, false);
    }
    // Search downwards from the byte budget for the nearest boundary.  Index 0
    // is always a boundary, so the search cannot come up empty; the fallback
    // only exists to keep the expression total.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&idx| input.is_char_boundary(idx))
        .unwrap_or(0);
    (&input[..cut], true)
}
191
/// Fraction of the non-whitespace characters of `s` that are not
/// alphanumeric (i.e. punctuation or symbols).
///
/// Whitespace is excluded from both numerator and denominator.  Returns
/// `0.0` when `s` is empty or contains only whitespace.  The statistical
/// jailbreak layer uses this to flag inputs whose visible content is
/// dominated by symbols (a common adversarial-suffix shape).
pub fn punctuation_ratio(s: &str) -> f32 {
    // Single pass accumulating (symbol count, visible-character count).
    let (symbols, visible) = s
        .chars()
        .filter(|c| !c.is_whitespace())
        .fold((0usize, 0usize), |(symbols, visible), c| {
            (symbols + usize::from(!c.is_alphanumeric()), visible + 1)
        });

    match visible {
        0 => 0.0,
        _ => symbols as f32 / visible as f32,
    }
}
215
/// Report whether `s` contains at least `min_run` consecutive characters
/// that are neither alphanumeric nor whitespace.
///
/// A `min_run` of `0` is trivially satisfied, even by the empty string.
/// Adversarial suffixes in the wild typically appear as long unbroken
/// punctuation / symbol sequences.
pub fn long_run_of_symbols(s: &str, min_run: usize) -> bool {
    if min_run == 0 {
        return true;
    }
    // Splitting on alphanumerics and whitespace leaves exactly the maximal
    // symbol runs.  `nth(min_run - 1)` checks "has at least `min_run` chars"
    // and stops walking a run as soon as the threshold is reached.
    s.split(|c: char| c.is_alphanumeric() || c.is_whitespace())
        .any(|run| run.chars().nth(min_run - 1).is_some())
}
236
/// Shannon entropy, in bits per character, of the ASCII non-whitespace bytes
/// of `s`.
///
/// Bytes >= 128 (i.e. every byte of a non-ASCII character) and ASCII
/// whitespace are skipped.  Returns `0.0` when nothing remains after that
/// filter.  This is a cheap proxy for character diversity: payloads
/// dominated by a handful of symbols score low, uniform-random adversarial
/// suffixes score high.
///
/// Counts use saturating arithmetic, so extremely long inputs cannot
/// overflow; the sum is accumulated in `f64` and narrowed to `f32` at the
/// end.
pub fn shannon_entropy_ascii_nonws(s: &str) -> f32 {
    // One histogram slot per ASCII value.
    let mut histogram = [0u32; 128];
    let mut samples = 0u32;
    for byte in s
        .bytes()
        .filter(|b| b.is_ascii() && !b.is_ascii_whitespace())
    {
        let slot = &mut histogram[usize::from(byte)];
        *slot = slot.saturating_add(1);
        samples = samples.saturating_add(1);
    }
    if samples == 0 {
        return 0.0;
    }

    // H = -sum(p * log2 p) over the symbols that actually occur.
    let denom = f64::from(samples);
    let bits = histogram
        .iter()
        .filter(|&&count| count != 0)
        .fold(0.0f64, |acc, &count| {
            let p = f64::from(count) / denom;
            acc - p * p.log2()
        });
    bits as f32
}
267
268/// Number of zero-width / Unicode formatting codepoints in `s` (using the
269/// [`is_zero_width`] predicate).  Useful for a statistical "obfuscation"
270/// signal that fires even when canonicalization has already stripped the
271/// characters: callers count on the original pre-canonicalization string.
272pub fn zero_width_count(s: &str) -> usize {
273    s.chars().filter(|c| is_zero_width(*c)).count()
274}
275
/// Ratio of distinct character shingles (sliding `n`-grams) to total
/// shingles in `s`.
///
/// Lower values indicate heavy repetition (a hallmark of token-spam /
/// adversarial-suffix attacks); `1.0` means every shingle is unique.
/// Returns `1.0` when `s` has fewer than `n` characters (which includes the
/// empty string), since there is nothing to compare.
///
/// `n` is clamped to `[1, 16]`; callers typically pick `n = 3` for
/// character trigrams, which balance sensitivity against random noise.
///
/// The function works on `s` exactly as given -- it performs no
/// canonicalization itself, so pass already-canonicalized text if that is
/// what should be measured.
pub fn shingle_uniqueness(s: &str, n: usize) -> f32 {
    let n = n.clamp(1, 16);
    let chars: Vec<char> = s.chars().collect();
    if chars.len() < n {
        return 1.0;
    }
    // `chars.len() >= n >= 1` here, so there is always at least one window
    // and the division below cannot be by zero.
    let total = chars.len() - n + 1;
    // Borrowed slices are hashable, so no per-shingle `String` is needed.
    let distinct: std::collections::HashSet<&[char]> = chars.windows(n).collect();
    distinct.len() as f32 / total as f32
}
301
#[cfg(test)]
mod tests {
    //! Unit tests for the canonicalization helpers and the statistical
    //! signals built on top of them.

    use super::*;

    #[test]
    fn canonicalize_lowercases_ascii() {
        assert_eq!(canonicalize("IGNORE ALL"), "ignore all");
    }

    #[test]
    fn canonicalize_strips_zero_width() {
        let sneaky = "ig\u{200B}no\u{200C}re all";
        assert_eq!(canonicalize(sneaky), "ignore all");
    }

    #[test]
    fn canonicalize_folds_homoglyphs() {
        // Cyrillic U+0440 (er) -> ASCII "p"; lowercase and fold together.
        let disguised = "igno\u{0440}e";
        assert_eq!(canonicalize(disguised), "ignope");
        // Full-width "IGNORE" (U+FF29 U+FF27 U+FF2E U+FF2F U+FF32 U+FF25)
        // folds via the 0xFEE0 offset.  Written as escapes so the input
        // cannot be silently normalised to plain ASCII by an editor or
        // copy-paste, which would leave the fold untested.
        let full_width = "\u{FF29}\u{FF27}\u{FF2E}\u{FF2F}\u{FF32}\u{FF25}";
        assert_eq!(canonicalize(full_width), "ignore");
    }

    #[test]
    fn canonicalize_collapses_separators() {
        assert_eq!(
            canonicalize("ignore---all___previous"),
            "ignore all previous"
        );
    }

    #[test]
    fn truncate_respects_utf8_boundary() {
        // U+00E9 is two bytes, so a 2-byte budget would land mid-character.
        let input = "h\u{00E9}llo";
        let (out, truncated) = truncate_at_char_boundary(input, 2);
        assert!(truncated);
        assert_eq!(out, "h");
    }

    #[test]
    fn truncate_short_input_unchanged() {
        let (out, truncated) = truncate_at_char_boundary("hi", 100);
        assert!(!truncated);
        assert_eq!(out, "hi");
    }

    #[test]
    fn punctuation_ratio_basic() {
        assert_eq!(punctuation_ratio(""), 0.0);
        assert_eq!(punctuation_ratio("   \n\t"), 0.0);
        // All alphanum -> 0.0.
        assert_eq!(punctuation_ratio("abc123"), 0.0);
        // All punctuation -> 1.0.
        assert_eq!(punctuation_ratio("!!!@@@"), 1.0);
        // Half and half (non-whitespace): 3/6 = 0.5.
        assert!((punctuation_ratio("ab;c;!") - 0.5).abs() < 1e-6);
    }

    #[test]
    fn long_run_of_symbols_detects_runs() {
        assert!(!long_run_of_symbols("hello world", 12));
        assert!(long_run_of_symbols("hello !!!!!!!!!!!! world", 12));
        assert!(!long_run_of_symbols("hello !!! world", 12));
        // min_run 0 is trivially true even for empty input.
        assert!(long_run_of_symbols("", 0));
    }

    #[test]
    fn shannon_entropy_ascii_nonws_bounds() {
        // All-one-character -> 0 entropy.
        assert!(shannon_entropy_ascii_nonws("aaaaaa") < 1e-6);
        // Two equiprobable characters -> 1 bit.
        let e = shannon_entropy_ascii_nonws("abababab");
        assert!((e - 1.0).abs() < 0.1);
        // Empty input -> 0.
        assert_eq!(shannon_entropy_ascii_nonws(""), 0.0);
    }

    #[test]
    fn zero_width_count_matches_inserts() {
        let s = "a\u{200B}b\u{200C}c\u{FEFF}d";
        assert_eq!(zero_width_count(s), 3);
        assert_eq!(zero_width_count("plain"), 0);
    }

    #[test]
    fn shingle_uniqueness_detects_repetition() {
        // Unique input: every trigram distinct.
        let u = shingle_uniqueness("abcdefg", 3);
        assert!((u - 1.0).abs() < 1e-6);
        // Repeated trigrams: "aaa" repeats.
        let r = shingle_uniqueness("aaaaaaaaa", 3);
        assert!(r < 0.2, "expected low uniqueness, got {r}");
        // Too-short input returns 1.0.
        assert_eq!(shingle_uniqueness("ab", 3), 1.0);
    }
}