Skip to main content

contextual_encoder/
rust.rs

//! rust literal encoders.
//!
//! encodes untrusted strings for safe embedding in rust source literals.
//!
//! - [`for_rust_string`] — safe for rust string literals (`"..."`)
//! - [`for_rust_char`] — safe for rust char literals (`'...'`)
//! - [`for_rust_byte_string`] — safe for rust byte string literals (`b"..."`)
//!
//! # encoding rules
//!
//! all three encoders use rust's native escape syntax:
//!
//! - named escapes: `\0`, `\t`, `\n`, `\r`, `\\`
//! - C0 controls and DEL without named escapes → `\xHH` (two lowercase hex digits)
//! - unicode non-characters → space (string/char) or `\xHH` per byte (byte string)
//!
//! the encoders differ in which quote is escaped and how non-ASCII is handled:
//!
//! | encoder | quote escape | non-ASCII |
//! |---------|-------------|-----------|
//! | `for_rust_string` | `"` → `\"` | passes through |
//! | `for_rust_char` | `'` → `\'` | passes through |
//! | `for_rust_byte_string` | `"` → `\"` | each UTF-8 byte → `\xHH` |
24
25use std::fmt;
26
27use crate::engine::{
28    encode_loop, is_unicode_noncharacter, needs_byte_string_encoding, write_byte_string_encoded,
29    write_rust_named_escape,
30};
31
32// ---------------------------------------------------------------------------
33// for_rust_string — safe for Rust string literals ("...")
34// ---------------------------------------------------------------------------
35
36/// encodes `input` for safe embedding in a rust string literal (`"..."`).
37///
38/// escapes backslashes, double quotes, and control characters using rust's
39/// escape syntax. non-ASCII unicode passes through unchanged (valid in rust
40/// string literals). unicode non-characters are replaced with space.
41///
42/// # examples
43///
44/// ```
45/// use contextual_encoder::for_rust_string;
46///
47/// assert_eq!(for_rust_string(r#"say "hi""#), r#"say \"hi\""#);
48/// assert_eq!(for_rust_string("line\nbreak"), r"line\nbreak");
49/// assert_eq!(for_rust_string("café"), "café");
50/// ```
51pub fn for_rust_string(input: &str) -> String {
52    let mut out = String::with_capacity(input.len());
53    write_rust_string(&mut out, input).expect("writing to string cannot fail");
54    out
55}
56
57/// writes the rust-string-encoded form of `input` to `out`.
58///
59/// see [`for_rust_string`] for encoding rules.
60pub fn write_rust_string<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
61    encode_loop(out, input, needs_rust_string_encoding, |out, c, _next| {
62        write_rust_text_encoded(out, c, '"')
63    })
64}
65
66fn needs_rust_string_encoding(c: char) -> bool {
67    matches!(c, '\x00'..='\x1F' | '\x7F' | '"' | '\\') || is_unicode_noncharacter(c as u32)
68}
69
70// ---------------------------------------------------------------------------
71// for_rust_char — safe for Rust char literals ('...')
72// ---------------------------------------------------------------------------
73
74/// encodes `input` for safe embedding in a rust char literal (`'...'`).
75///
76/// escapes backslashes, single quotes, and control characters using rust's
77/// escape syntax. non-ASCII unicode passes through unchanged. unicode
78/// non-characters are replaced with space.
79///
80/// # examples
81///
82/// ```
83/// use contextual_encoder::for_rust_char;
84///
85/// assert_eq!(for_rust_char("it's"), r"it\'s");
86/// assert_eq!(for_rust_char(r#"a"b"#), r#"a"b"#);
87/// assert_eq!(for_rust_char("tab\there"), r"tab\there");
88/// ```
89pub fn for_rust_char(input: &str) -> String {
90    let mut out = String::with_capacity(input.len());
91    write_rust_char(&mut out, input).expect("writing to string cannot fail");
92    out
93}
94
95/// writes the rust-char-encoded form of `input` to `out`.
96///
97/// see [`for_rust_char`] for encoding rules.
98pub fn write_rust_char<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
99    encode_loop(out, input, needs_rust_char_encoding, |out, c, _next| {
100        write_rust_text_encoded(out, c, '\'')
101    })
102}
103
104fn needs_rust_char_encoding(c: char) -> bool {
105    matches!(c, '\x00'..='\x1F' | '\x7F' | '\'' | '\\') || is_unicode_noncharacter(c as u32)
106}
107
108// ---------------------------------------------------------------------------
109// shared helper for string and char encoders
110// ---------------------------------------------------------------------------
111
112/// writes the encoded form of a character for rust string or char context.
113/// `quote` is the delimiter being escaped (`"` or `'`).
114fn write_rust_text_encoded<W: fmt::Write>(out: &mut W, c: char, quote: char) -> fmt::Result {
115    match c {
116        '\0' => out.write_str("\\0"),
117        '\t' => out.write_str("\\t"),
118        '\n' => out.write_str("\\n"),
119        '\r' => out.write_str("\\r"),
120        '\\' => out.write_str("\\\\"),
121        '"' if quote == '"' => out.write_str("\\\""),
122        '\'' if quote == '\'' => out.write_str("\\'"),
123        c if is_unicode_noncharacter(c as u32) => out.write_char(' '),
124        // other C0 controls and DEL
125        c => write!(out, "\\x{:02x}", c as u32),
126    }
127}
128
129// ---------------------------------------------------------------------------
130// for_rust_byte_string — safe for Rust byte string literals (b"...")
131// ---------------------------------------------------------------------------
132
133/// encodes `input` for safe embedding in a rust byte string literal (`b"..."`).
134///
135/// escapes backslashes, double quotes, and control characters. non-ASCII
136/// characters are encoded as their individual UTF-8 bytes using `\xHH`
137/// notation, since byte string literals only accept ASCII directly.
138///
139/// # examples
140///
141/// ```
142/// use contextual_encoder::for_rust_byte_string;
143///
144/// assert_eq!(for_rust_byte_string("hello"), "hello");
145/// assert_eq!(for_rust_byte_string(r#"say "hi""#), r#"say \"hi\""#);
146/// assert_eq!(for_rust_byte_string("café"), r"caf\xc3\xa9");
147/// assert_eq!(for_rust_byte_string("null\x00byte"), r"null\0byte");
148/// ```
149pub fn for_rust_byte_string(input: &str) -> String {
150    let mut out = String::with_capacity(input.len());
151    write_rust_byte_string(&mut out, input).expect("writing to string cannot fail");
152    out
153}
154
155/// writes the rust-byte-string-encoded form of `input` to `out`.
156///
157/// see [`for_rust_byte_string`] for encoding rules.
158pub fn write_rust_byte_string<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
159    encode_loop(out, input, needs_byte_string_encoding, |out, c, _next| {
160        write_byte_string_encoded(out, c, write_rust_named_escape)
161    })
162}
163
#[cfg(test)]
mod tests {
    use super::*;

    /// runs `encode` over every `(input, expected)` pair and checks the output.
    fn check(encode: fn(&str) -> String, cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(encode(input), expected, "input: {:?}", input);
        }
    }

    // -- for_rust_string --

    #[test]
    fn string_passthrough() {
        check(
            for_rust_string,
            &[
                ("hello world", "hello world"),
                ("", ""),
                ("café 日本語", "café 日本語"),
                ("😀", "😀"),
            ],
        );
    }

    #[test]
    fn string_escapes_double_quote() {
        check(for_rust_string, &[(r#"a"b"#, r#"a\"b"#)]);
    }

    #[test]
    fn string_passes_single_quote() {
        check(for_rust_string, &[("a'b", "a'b")]);
    }

    #[test]
    fn string_escapes_backslash() {
        check(for_rust_string, &[(r"a\b", r"a\\b")]);
    }

    #[test]
    fn string_named_escapes() {
        check(
            for_rust_string,
            &[("\0", r"\0"), ("\t", r"\t"), ("\n", r"\n"), ("\r", r"\r")],
        );
    }

    #[test]
    fn string_hex_escapes_for_controls() {
        check(
            for_rust_string,
            &[
                ("\x01", r"\x01"),
                ("\x08", r"\x08"),
                ("\x0B", r"\x0b"),
                ("\x0C", r"\x0c"),
                ("\x1F", r"\x1f"),
                ("\x7F", r"\x7f"),
            ],
        );
    }

    #[test]
    fn string_nonchars_replaced() {
        check(for_rust_string, &[("\u{FDD0}", " "), ("\u{FFFE}", " ")]);
    }

    #[test]
    fn string_writer_matches() {
        let input = "test\0\"\\\n café";
        let mut written = String::new();
        write_rust_string(&mut written, input).unwrap();
        assert_eq!(written, for_rust_string(input));
    }

    // -- for_rust_char --

    #[test]
    fn char_passthrough() {
        check(
            for_rust_char,
            &[("hello world", "hello world"), ("", ""), ("café", "café")],
        );
    }

    #[test]
    fn char_escapes_single_quote() {
        check(for_rust_char, &[("a'b", r"a\'b")]);
    }

    #[test]
    fn char_passes_double_quote() {
        check(for_rust_char, &[(r#"a"b"#, r#"a"b"#)]);
    }

    #[test]
    fn char_escapes_backslash() {
        check(for_rust_char, &[(r"a\b", r"a\\b")]);
    }

    #[test]
    fn char_named_escapes() {
        check(
            for_rust_char,
            &[("\0", r"\0"), ("\t", r"\t"), ("\n", r"\n"), ("\r", r"\r")],
        );
    }

    #[test]
    fn char_hex_escapes_for_controls() {
        check(for_rust_char, &[("\x01", r"\x01"), ("\x7F", r"\x7f")]);
    }

    #[test]
    fn char_nonchars_replaced() {
        check(for_rust_char, &[("\u{FDD0}", " ")]);
    }

    #[test]
    fn char_writer_matches() {
        let input = "test\0'\\\n café";
        let mut written = String::new();
        write_rust_char(&mut written, input).unwrap();
        assert_eq!(written, for_rust_char(input));
    }

    // -- for_rust_byte_string --

    #[test]
    fn byte_string_passthrough() {
        check(
            for_rust_byte_string,
            &[("hello world", "hello world"), ("", "")],
        );
    }

    #[test]
    fn byte_string_escapes_double_quote() {
        check(for_rust_byte_string, &[(r#"a"b"#, r#"a\"b"#)]);
    }

    #[test]
    fn byte_string_escapes_backslash() {
        check(for_rust_byte_string, &[(r"a\b", r"a\\b")]);
    }

    #[test]
    fn byte_string_named_escapes() {
        check(
            for_rust_byte_string,
            &[("\0", r"\0"), ("\t", r"\t"), ("\n", r"\n"), ("\r", r"\r")],
        );
    }

    #[test]
    fn byte_string_hex_for_controls() {
        check(for_rust_byte_string, &[("\x01", r"\x01"), ("\x7F", r"\x7f")]);
    }

    #[test]
    fn byte_string_non_ascii_as_utf8_bytes() {
        check(
            for_rust_byte_string,
            &[
                // é = U+00E9 → UTF-8: C3 A9
                ("é", r"\xc3\xa9"),
                // café → only the é is encoded
                ("café", r"caf\xc3\xa9"),
                // 日 = U+65E5 → UTF-8: E6 97 A5
                ("日", r"\xe6\x97\xa5"),
                // 😀 = U+1F600 → UTF-8: F0 9F 98 80
                ("😀", r"\xf0\x9f\x98\x80"),
            ],
        );
    }

    #[test]
    fn byte_string_nonchars_as_bytes() {
        // U+FDD0 → UTF-8: EF B7 90
        check(for_rust_byte_string, &[("\u{FDD0}", r"\xef\xb7\x90")]);
    }

    #[test]
    fn byte_string_single_quote_passes() {
        check(for_rust_byte_string, &[("a'b", "a'b")]);
    }

    #[test]
    fn byte_string_writer_matches() {
        let input = "test\0\"\\café";
        let mut written = String::new();
        write_rust_byte_string(&mut written, input).unwrap();
        assert_eq!(written, for_rust_byte_string(input));
    }
}
338}