atomcode_tuix/sanitize.rs
1// crates/atomcode-tuix/src/sanitize.rs
2
3/// Strip ANSI escape sequences and C0/C1 control codes except tab/newline/CR.
4///
5/// Defends against:
6/// - CSI sequences (\x1b[...) — can clear screen, move cursor, query position
7/// - OSC sequences (\x1b]...\x07 or \x1b]...\x1b\\) — can set terminal title,
8/// manipulate clipboard, or write to hyperlink targets
9/// - C0 controls (0x00..0x1F) except \t \n \r — can ring bell, backspace, etc.
10/// - C1 controls (0x80..0x9F in UTF-8 as U+0080..U+009F) — alternate CSI forms
11/// - Bare ESC (\x1b not followed by a recognised intro)
12pub fn scrub_controls(input: &str) -> String {
13 scrub_inner(input, false)
14}
15
16/// Same as [`scrub_controls`] but preserves CSI sequences whose final
17/// byte is `m` — i.e. **SGR (Select Graphic Rendition)**: colour,
18/// bold, italic, underline, strikethrough, faint, reverse-video, etc.
19///
20/// SGR is purely cosmetic — it changes how subsequent text is drawn
21/// but never moves the cursor, queries terminal state, touches the
22/// clipboard, sets the window title, or otherwise reaches outside
23/// the display rectangle. Allowing it through is what `less`, `git`,
24/// `bat`, and every other "safe ANSI" tool does, and it lets trusted
25/// internal output (e.g. the `/codingplan` SetupReport's locked-model
26/// rows that render in the terminal's theme red) survive sanitisation
27/// without each caller having to roll its own emission path.
28///
29/// Use this on **trusted** output — strings the app itself builds
30/// (slash-command return text, status lines, setup reports). Do NOT
31/// use it on text that came from a remote LLM or any other untrusted
32/// channel: SGR can still be used to hide content (faint, black-on-
33/// black) or impersonate UI chrome (✓ in green next to a lie), so
34/// LLM streams continue to go through the strict [`scrub_controls`].
35pub fn scrub_controls_keep_sgr(input: &str) -> String {
36 scrub_inner(input, true)
37}
38
/// Returns `true` when `params` (the bytes between `ESC [` and the final
/// byte of a CSI sequence) contains nothing but SGR parameter characters:
/// ASCII digits, `;` separators and `:` sub-parameter separators.
///
/// An empty parameter string is accepted (`ESC [ m` is "reset attributes").
fn is_sgr_params(params: &str) -> bool {
    params
        .chars()
        .all(|b| b.is_ascii_digit() || matches!(b, ';' | ':'))
}

/// Shared implementation behind the public scrubbers.
///
/// Walks `input` one `char` at a time and copies it to the result, except:
///
/// - `\t`, `\n`, `\r` are always kept.
/// - `ESC [` starts a CSI sequence: everything up to and including the first
///   char in `0x40..=0x7E` (the final byte) is consumed. The sequence is
///   dropped unless `keep_sgr` is set, the final byte is `m`, **and** the
///   parameter bytes are purely digits / `;` / `:`.
/// - `ESC ]` starts an OSC sequence: everything up to and including the
///   terminator (`BEL`, or `ESC` optionally followed by `\`) is dropped.
/// - Any other `ESC` is dropped on its own; the following char is processed
///   normally.
/// - All remaining C0 controls (`0x00..=0x1F`) are dropped.
/// - C1 controls (`U+0080..=U+009F`) are dropped; for `U+009B` (single-char
///   CSI introducer) the payload up to its final byte is dropped too.
///
/// `keep_sgr` selects whether well-formed SGR sequences are re-emitted.
/// Returns the sanitised text as a new `String`.
fn scrub_inner(input: &str, keep_sgr: bool) -> String {
    let mut out = String::with_capacity(input.len());
    let mut chars = input.chars().peekable();

    while let Some(c) = chars.next() {
        match c {
            '\t' | '\n' | '\r' => out.push(c),
            // ESC — consume a CSI or OSC sequence, otherwise drop the lone byte.
            '\x1B' => match chars.peek().copied() {
                Some('[') => {
                    chars.next(); // consume '['
                    // CSI: parameter/intermediate bytes, then a final byte
                    // in 0x40..=0x7E. Only buffer the parameters when they
                    // might be re-emitted.
                    let mut params = String::new();
                    let mut final_byte = None;
                    for p in chars.by_ref() {
                        if ('\x40'..='\x7E').contains(&p) {
                            final_byte = Some(p);
                            break;
                        }
                        if keep_sgr {
                            params.push(p);
                        }
                    }
                    // A final byte of `m` alone does not make a sequence
                    // SGR: `CSI > … m` / `CSI ? … m` are xterm key-modifier
                    // set/query, and raw controls (a nested ESC, BEL, …)
                    // could otherwise ride along inside the "parameters".
                    // Re-emit only when the parameters are plain SGR; every
                    // other CSI stays dropped on both paths.
                    if keep_sgr && final_byte == Some('m') && is_sgr_params(&params) {
                        out.push_str("\x1b[");
                        out.push_str(&params);
                        out.push('m');
                    }
                }
                Some(']') => {
                    chars.next(); // consume ']'
                    // OSC: ends on BEL (\x07) or ST (ESC \).
                    while let Some(p) = chars.next() {
                        match p {
                            '\x07' => break,
                            '\x1B' => {
                                if chars.peek() == Some(&'\\') {
                                    chars.next();
                                }
                                break;
                            }
                            _ => {}
                        }
                    }
                }
                // Bare ESC (mid-string or at EOF): drop only the ESC and let
                // whatever follows go through the normal rules.
                _ => {}
            },
            // Every other C0 control: drop.
            '\x00'..='\x1F' => {}
            '\u{0080}'..='\u{009F}' => {
                // C1 controls — some terminals treat these as 8-bit forms of
                // ESC-prefixed introducers. For U+009B (CSI) swallow the
                // payload through its final byte so it cannot show up as
                // literal text; other C1 controls are dropped as-is.
                if c == '\u{009B}' {
                    for p in chars.by_ref() {
                        if ('\x40'..='\x7E').contains(&p) {
                            break;
                        }
                    }
                }
            }
            _ => out.push(c),
        }
    }
    out
}
118
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn plain_ascii_passes_through() {
        let text = "hello world";
        assert_eq!(scrub_controls(text), text);
    }

    #[test]
    fn newline_tab_cr_preserved() {
        let text = "a\nb\tc\rd";
        assert_eq!(scrub_controls(text), text);
    }

    #[test]
    fn csi_escape_stripped() {
        // \x1b[2J clears the screen, \x1b[H homes the cursor.
        let cleaned = scrub_controls("\x1b[2J\x1b[Hhello");
        assert_eq!(cleaned, "hello");
    }

    #[test]
    fn osc_escape_stripped() {
        // \x1b]0;title\x07 sets the terminal title.
        let cleaned = scrub_controls("\x1b]0;pwned\x07safe");
        assert_eq!(cleaned, "safe");
    }

    #[test]
    fn cursor_position_query_stripped() {
        // \x1b[6n asks for the cursor position (the reply leaks via stdin!).
        let cleaned = scrub_controls("a\x1b[6nb");
        assert_eq!(cleaned, "ab");
    }

    #[test]
    fn c0_controls_except_tnlcr_removed() {
        let cleaned = scrub_controls("a\x00b\x01c\x07d\x08e");
        assert_eq!(cleaned, "abcde");
    }

    #[test]
    fn c1_controls_removed() {
        // U+009B is the alternate CSI form — both the introducer and its
        // payload have to disappear.
        let cleaned = scrub_controls("a\u{009b}2Jb");
        assert_eq!(cleaned, "ab");
    }

    #[test]
    fn utf8_text_preserved() {
        let text = "你好\nworld";
        assert_eq!(scrub_controls(text), text);
    }

    #[test]
    fn bare_esc_removed() {
        let cleaned = scrub_controls("a\x1bb");
        assert_eq!(cleaned, "ab");
    }

    #[test]
    fn keep_sgr_lets_color_through() {
        // SGR 31 (red foreground) and SGR 39 (default foreground) survive.
        let styled = "\x1b[31mred\x1b[39m tail";
        assert_eq!(scrub_controls_keep_sgr(styled), styled);
    }

    #[test]
    fn keep_sgr_still_strips_cursor_csi() {
        // Clear-screen, cursor-home and the position query — like every
        // other non-SGR CSI — are rejected on the SGR-allowing path too.
        let cleaned = scrub_controls_keep_sgr("\x1b[2J\x1b[Hhi\x1b[6n");
        assert_eq!(cleaned, "hi");
    }

    #[test]
    fn keep_sgr_still_strips_osc() {
        // OSC payloads (clipboard injection, set title) remain rejected.
        let cleaned = scrub_controls_keep_sgr("\x1b]0;pwned\x07safe");
        assert_eq!(cleaned, "safe");
    }

    #[test]
    fn keep_sgr_preserves_multi_param_sgr() {
        // One SGR sequence may carry several parameters, e.g. `\x1b[1;31m`
        // is bold + red. The separator and the whole parameter chain must
        // come through untouched.
        let styled = "\x1b[1;31mbold-red\x1b[0m";
        assert_eq!(scrub_controls_keep_sgr(styled), styled);
    }
}