Skip to main content

atomcode_tuix/
sanitize.rs

// crates/atomcode-tuix/src/sanitize.rs
2
3/// Strip ANSI escape sequences and C0/C1 control codes except tab/newline/CR.
4///
5/// Defends against:
6/// - CSI sequences (\x1b[...) — can clear screen, move cursor, query position
7/// - OSC sequences (\x1b]...\x07 or \x1b]...\x1b\\) — can set terminal title,
8///   manipulate clipboard, or write to hyperlink targets
9/// - C0 controls (0x00..0x1F) except \t \n \r — can ring bell, backspace, etc.
10/// - C1 controls (0x80..0x9F in UTF-8 as U+0080..U+009F) — alternate CSI forms
11/// - Bare ESC (\x1b not followed by a recognised intro)
12pub fn scrub_controls(input: &str) -> String {
13    scrub_inner(input, false)
14}
15
16/// Same as [`scrub_controls`] but preserves CSI sequences whose final
17/// byte is `m` — i.e. **SGR (Select Graphic Rendition)**: colour,
18/// bold, italic, underline, strikethrough, faint, reverse-video, etc.
19///
20/// SGR is purely cosmetic — it changes how subsequent text is drawn
21/// but never moves the cursor, queries terminal state, touches the
22/// clipboard, sets the window title, or otherwise reaches outside
23/// the display rectangle. Allowing it through is what `less`, `git`,
24/// `bat`, and every other "safe ANSI" tool does, and it lets trusted
25/// internal output (e.g. the `/codingplan` SetupReport's locked-model
26/// rows that render in the terminal's theme red) survive sanitisation
27/// without each caller having to roll its own emission path.
28///
29/// Use this on **trusted** output — strings the app itself builds
30/// (slash-command return text, status lines, setup reports). Do NOT
31/// use it on text that came from a remote LLM or any other untrusted
32/// channel: SGR can still be used to hide content (faint, black-on-
33/// black) or impersonate UI chrome (✓ in green next to a lie), so
34/// LLM streams continue to go through the strict [`scrub_controls`].
35pub fn scrub_controls_keep_sgr(input: &str) -> String {
36    scrub_inner(input, true)
37}
38
/// Shared implementation behind [`scrub_controls`] and
/// [`scrub_controls_keep_sgr`].
///
/// Walks `input` one `char` at a time and copies it to the result, except:
///
/// - `\t`, `\n`, `\r` are always kept.
/// - `ESC [` (CSI) is consumed through its final byte (`0x40..=0x7E`), or to
///   the end of input when no final byte follows. It is re-emitted only when
///   `keep_sgr` is true **and** it is a plain SGR: final byte `m` with
///   parameters made solely of ASCII digits, `;` and `:`.
/// - `ESC ]` (OSC) is consumed through the terminating BEL or `ESC \`.
/// - Any other ESC is dropped by itself; the character after it is processed
///   as ordinary input.
/// - Every other C0 control (`0x00..=0x1F`) is dropped.
/// - `U+009B` (C1 CSI) is consumed like `ESC [` but never re-emitted.
/// - Every other C1 control (`U+0080..=U+009F`) is dropped by itself.
///
/// Returns the scrubbed text as a new `String`.
fn scrub_inner(input: &str, keep_sgr: bool) -> String {
    let mut out = String::with_capacity(input.len());
    let mut chars = input.chars().peekable();

    while let Some(c) = chars.next() {
        match c {
            '\t' | '\n' | '\r' => out.push(c),
            '\x00'..='\x1F' => {
                // Only ESC needs look-ahead; every other C0 control is
                // dropped simply by not being pushed.
                if c == '\x1B' {
                    match chars.peek().copied() {
                        Some('[') => {
                            chars.next(); // consume `[`
                            if let Some(params) = consume_csi(&mut chars, keep_sgr) {
                                // Rebuild from the validated parts instead of
                                // echoing raw input, so nothing but
                                // `ESC [ <digits ; :> m` can be written.
                                out.push_str("\x1b[");
                                out.push_str(&params);
                                out.push('m');
                            }
                        }
                        Some(']') => {
                            chars.next(); // consume `]`
                            consume_osc(&mut chars);
                        }
                        // ESC followed by anything else, or ESC at end of
                        // input: drop just the ESC. The next character (if
                        // any) goes through the main loop like any other.
                        _ => {}
                    }
                }
            }
            '\u{0080}'..='\u{009F}' => {
                // U+009B is the single-character CSI introducer. Swallow the
                // whole sequence so its payload does not show up as literal
                // text; it is never preserved, even on the SGR path. Other
                // C1 controls are dropped on their own.
                if c == '\u{009B}' {
                    let _ = consume_csi(&mut chars, false);
                }
            }
            _ => out.push(c),
        }
    }
    out
}

/// Consumes the rest of a CSI sequence whose introducer was already read.
///
/// Reads up to and including the first final byte (`0x40..=0x7E`), or to the
/// end of input if there is none. Returns `Some(parameters)` only when
/// `want_sgr` is true, the final byte is `m`, and every character before it
/// is an ASCII digit, `;` or `:`; otherwise returns `None`.
fn consume_csi(chars: &mut impl Iterator<Item = char>, want_sgr: bool) -> Option<String> {
    let mut params = String::new();
    // Stays true only while everything seen so far could be part of an SGR
    // parameter list. A final `m` alone is not enough: xterm gives
    // `CSI > ... m` and `CSI ? ... m` non-SGR meanings (key-modifier set and
    // query), and a C0 control or nested ESC inside the sequence would be
    // acted on by the terminal if echoed back.
    let mut plain = want_sgr;

    for p in chars.by_ref() {
        if ('\x40'..='\x7E').contains(&p) {
            return if plain && p == 'm' { Some(params) } else { None };
        }
        if plain {
            if p.is_ascii_digit() || p == ';' || p == ':' {
                params.push(p);
            } else {
                plain = false;
            }
        }
    }

    // Ran out of input before a final byte: incomplete sequence, drop it.
    None
}

/// Consumes the rest of an OSC sequence whose introducer was already read.
///
/// Stops after BEL, or after ESC (also taking a directly following `\`, which
/// together form the ST terminator), or at end of input.
fn consume_osc<I: Iterator<Item = char>>(chars: &mut std::iter::Peekable<I>) {
    while let Some(p) = chars.next() {
        match p {
            '\x07' => break,
            '\x1B' => {
                if chars.peek() == Some(&'\\') {
                    chars.next();
                }
                break;
            }
            _ => {}
        }
    }
}
118
#[cfg(test)]
mod tests {
    use super::*;

    /// Text with no control characters comes back untouched.
    #[test]
    fn plain_ascii_passes_through() {
        let text = "hello world";
        assert_eq!(scrub_controls(text), text);
    }

    /// The three whitelisted C0 characters survive scrubbing.
    #[test]
    fn newline_tab_cr_preserved() {
        let text = "a\nb\tc\rd";
        assert_eq!(scrub_controls(text), text);
    }

    /// `\x1b[2J` clears the screen and `\x1b[H` homes the cursor; both go.
    #[test]
    fn csi_escape_stripped() {
        let scrubbed = scrub_controls("\x1b[2J\x1b[Hhello");
        assert_eq!(scrubbed, "hello");
    }

    /// `\x1b]0;title\x07` sets the terminal title; the whole OSC goes.
    #[test]
    fn osc_escape_stripped() {
        let scrubbed = scrub_controls("\x1b]0;pwned\x07safe");
        assert_eq!(scrubbed, "safe");
    }

    /// `\x1b[6n` asks for the cursor position, which the terminal answers
    /// on stdin — it must never get through.
    #[test]
    fn cursor_position_query_stripped() {
        let scrubbed = scrub_controls("a\x1b[6nb");
        assert_eq!(scrubbed, "ab");
    }

    /// NUL, SOH, BEL and BS are all dropped.
    #[test]
    fn c0_controls_except_tnlcr_removed() {
        let scrubbed = scrub_controls("a\x00b\x01c\x07d\x08e");
        assert_eq!(scrubbed, "abcde");
    }

    /// U+009B is the C1 form of CSI: introducer and payload are both removed.
    #[test]
    fn c1_controls_removed() {
        let scrubbed = scrub_controls("a\u{009b}2Jb");
        assert_eq!(scrubbed, "ab");
    }

    /// Multi-byte UTF-8 text is not mistaken for control bytes.
    #[test]
    fn utf8_text_preserved() {
        let text = "你好\nworld";
        assert_eq!(scrub_controls(text), text);
    }

    /// An ESC that starts no recognised sequence is dropped on its own.
    #[test]
    fn bare_esc_removed() {
        let scrubbed = scrub_controls("a\x1bb");
        assert_eq!(scrubbed, "ab");
    }

    /// SGR 31 (red foreground) and SGR 39 (default foreground) survive on
    /// the SGR-allowing path.
    #[test]
    fn keep_sgr_lets_color_through() {
        let styled = "\x1b[31mred\x1b[39m tail";
        assert_eq!(scrub_controls_keep_sgr(styled), styled);
    }

    /// Non-SGR CSI sequences (`\x1b[2J`, `\x1b[H`, `\x1b[6n`) are rejected
    /// on the SGR-allowing path too.
    #[test]
    fn keep_sgr_still_strips_cursor_csi() {
        let scrubbed = scrub_controls_keep_sgr("\x1b[2J\x1b[Hhi\x1b[6n");
        assert_eq!(scrubbed, "hi");
    }

    /// OSC payloads (title setting, clipboard injection) stay rejected on
    /// the SGR-allowing path.
    #[test]
    fn keep_sgr_still_strips_osc() {
        let scrubbed = scrub_controls_keep_sgr("\x1b]0;pwned\x07safe");
        assert_eq!(scrubbed, "safe");
    }

    /// One SGR sequence may carry several parameters — `\x1b[1;31m` is
    /// bold + red. The separator and the whole parameter chain must pass
    /// through intact.
    #[test]
    fn keep_sgr_preserves_multi_param_sgr() {
        let styled = "\x1b[1;31mbold-red\x1b[0m";
        assert_eq!(scrub_controls_keep_sgr(styled), styled);
    }
}