Skip to main content

contextual_encoder/
rust.rs

1//! rust literal encoders.
2//!
3//! encodes untrusted strings for safe embedding in rust source literals.
4//!
5//! - [`for_rust_string`] — safe for rust string literals (`"..."`)
6//! - [`for_rust_char`] — safe for rust char literals (`'...'`)
7//! - [`for_rust_byte_string`] — safe for rust byte string literals (`b"..."`)
8//!
9//! # encoding rules
10//!
11//! all three encoders use rust's native escape syntax:
12//!
13//! - named escapes: `\0`, `\t`, `\n`, `\r`, `\\`
14//! - C0 controls and DEL without named escapes → `\xHH`
15//! - unicode non-characters → space (string/char) or `\xHH` per byte (byte string)
16//!
17//! the encoders differ in which quote is escaped and how non-ASCII is handled:
18//!
19//! | encoder | quote escape | non-ASCII |
20//! |---------|-------------|-----------|
21//! | `for_rust_string` | `"` → `\"` | passes through |
22//! | `for_rust_char` | `'` → `\'` | passes through |
23//! | `for_rust_byte_string` | `"` → `\"` | each UTF-8 byte → `\xHH` |
24
25use std::fmt;
26
27use crate::engine::{encode_loop, is_unicode_noncharacter};
28
29// ---------------------------------------------------------------------------
30// for_rust_string — safe for Rust string literals ("...")
31// ---------------------------------------------------------------------------
32
33/// encodes `input` for safe embedding in a rust string literal (`"..."`).
34///
35/// escapes backslashes, double quotes, and control characters using rust's
36/// escape syntax. non-ASCII unicode passes through unchanged (valid in rust
37/// string literals). unicode non-characters are replaced with space.
38///
39/// # examples
40///
41/// ```
42/// use contextual_encoder::for_rust_string;
43///
44/// assert_eq!(for_rust_string(r#"say "hi""#), r#"say \"hi\""#);
45/// assert_eq!(for_rust_string("line\nbreak"), r"line\nbreak");
46/// assert_eq!(for_rust_string("café"), "café");
47/// ```
48pub fn for_rust_string(input: &str) -> String {
49    let mut out = String::with_capacity(input.len());
50    write_rust_string(&mut out, input).expect("writing to string cannot fail");
51    out
52}
53
54/// writes the rust-string-encoded form of `input` to `out`.
55///
56/// see [`for_rust_string`] for encoding rules.
57pub fn write_rust_string<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
58    encode_loop(out, input, needs_rust_string_encoding, |out, c, _next| {
59        write_rust_text_encoded(out, c, '"')
60    })
61}
62
63fn needs_rust_string_encoding(c: char) -> bool {
64    matches!(c, '\x00'..='\x1F' | '\x7F' | '"' | '\\') || is_unicode_noncharacter(c as u32)
65}
66
67// ---------------------------------------------------------------------------
68// for_rust_char — safe for Rust char literals ('...')
69// ---------------------------------------------------------------------------
70
71/// encodes `input` for safe embedding in a rust char literal (`'...'`).
72///
73/// escapes backslashes, single quotes, and control characters using rust's
74/// escape syntax. non-ASCII unicode passes through unchanged. unicode
75/// non-characters are replaced with space.
76///
77/// # examples
78///
79/// ```
80/// use contextual_encoder::for_rust_char;
81///
82/// assert_eq!(for_rust_char("it's"), r"it\'s");
83/// assert_eq!(for_rust_char(r#"a"b"#), r#"a"b"#);
84/// assert_eq!(for_rust_char("tab\there"), r"tab\there");
85/// ```
86pub fn for_rust_char(input: &str) -> String {
87    let mut out = String::with_capacity(input.len());
88    write_rust_char(&mut out, input).expect("writing to string cannot fail");
89    out
90}
91
92/// writes the rust-char-encoded form of `input` to `out`.
93///
94/// see [`for_rust_char`] for encoding rules.
95pub fn write_rust_char<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
96    encode_loop(out, input, needs_rust_char_encoding, |out, c, _next| {
97        write_rust_text_encoded(out, c, '\'')
98    })
99}
100
101fn needs_rust_char_encoding(c: char) -> bool {
102    matches!(c, '\x00'..='\x1F' | '\x7F' | '\'' | '\\') || is_unicode_noncharacter(c as u32)
103}
104
105// ---------------------------------------------------------------------------
106// shared helper for string and char encoders
107// ---------------------------------------------------------------------------
108
109/// writes the encoded form of a character for rust string or char context.
110/// `quote` is the delimiter being escaped (`"` or `'`).
111fn write_rust_text_encoded<W: fmt::Write>(out: &mut W, c: char, quote: char) -> fmt::Result {
112    match c {
113        '\0' => out.write_str("\\0"),
114        '\t' => out.write_str("\\t"),
115        '\n' => out.write_str("\\n"),
116        '\r' => out.write_str("\\r"),
117        '\\' => out.write_str("\\\\"),
118        '"' if quote == '"' => out.write_str("\\\""),
119        '\'' if quote == '\'' => out.write_str("\\'"),
120        c if is_unicode_noncharacter(c as u32) => out.write_char(' '),
121        // other C0 controls and DEL
122        c => write!(out, "\\x{:02x}", c as u32),
123    }
124}
125
126// ---------------------------------------------------------------------------
127// for_rust_byte_string — safe for Rust byte string literals (b"...")
128// ---------------------------------------------------------------------------
129
130/// encodes `input` for safe embedding in a rust byte string literal (`b"..."`).
131///
132/// escapes backslashes, double quotes, and control characters. non-ASCII
133/// characters are encoded as their individual UTF-8 bytes using `\xHH`
134/// notation, since byte string literals only accept ASCII directly.
135///
136/// # examples
137///
138/// ```
139/// use contextual_encoder::for_rust_byte_string;
140///
141/// assert_eq!(for_rust_byte_string("hello"), "hello");
142/// assert_eq!(for_rust_byte_string(r#"say "hi""#), r#"say \"hi\""#);
143/// assert_eq!(for_rust_byte_string("café"), r"caf\xc3\xa9");
144/// assert_eq!(for_rust_byte_string("null\x00byte"), r"null\0byte");
145/// ```
146pub fn for_rust_byte_string(input: &str) -> String {
147    let mut out = String::with_capacity(input.len());
148    write_rust_byte_string(&mut out, input).expect("writing to string cannot fail");
149    out
150}
151
152/// writes the rust-byte-string-encoded form of `input` to `out`.
153///
154/// see [`for_rust_byte_string`] for encoding rules.
155pub fn write_rust_byte_string<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
156    encode_loop(
157        out,
158        input,
159        needs_rust_byte_string_encoding,
160        write_rust_byte_string_encoded,
161    )
162}
163
/// selects the characters the byte-string encoder rewrites: the double
/// quote, the backslash, C0 controls, DEL, and everything outside ASCII.
fn needs_rust_byte_string_encoding(c: char) -> bool {
    match c {
        '"' | '\\' => true,
        // `is_ascii_control` is exactly U+0000..=U+001F plus U+007F.
        _ => !c.is_ascii() || c.is_ascii_control(),
    }
}
167
/// writes the byte-string escape for a single character `c` to `out`.
///
/// non-ASCII characters are expanded to one `\xHH` per UTF-8 byte; `\0`,
/// `\t`, `\n`, `\r`, `"` and `\` use their named escapes; any other ASCII
/// character is written as a single `\xHH`. `_next` is accepted to match the
/// callback shape used with `encode_loop` and is not consulted.
fn write_rust_byte_string_encoded<W: fmt::Write>(
    out: &mut W,
    c: char,
    _next: Option<char>,
) -> fmt::Result {
    if !c.is_ascii() {
        // a char occupies at most four bytes in UTF-8
        let mut utf8 = [0u8; 4];
        let encoded: &str = c.encode_utf8(&mut utf8);
        return encoded
            .bytes()
            .try_for_each(|byte| write!(out, "\\x{byte:02x}"));
    }

    let named = match c {
        '\0' => "\\0",
        '\t' => "\\t",
        '\n' => "\\n",
        '\r' => "\\r",
        '"' => "\\\"",
        '\\' => "\\\\",
        // other C0 controls and DEL
        _ => return write!(out, "\\x{:02x}", u32::from(c)),
    };
    out.write_str(named)
}
193
#[cfg(test)]
mod tests {
    use super::*;

    // -- for_rust_string --

    /// text with nothing to escape, including non-ASCII, is returned as-is.
    #[test]
    fn string_passthrough() {
        for plain in ["hello world", "", "café 日本語", "😀"] {
            assert_eq!(for_rust_string(plain), plain);
        }
    }

    #[test]
    fn string_escapes_double_quote() {
        let encoded = for_rust_string(r#"a"b"#);
        assert_eq!(encoded, r#"a\"b"#);
    }

    #[test]
    fn string_passes_single_quote() {
        let encoded = for_rust_string("a'b");
        assert_eq!(encoded, "a'b");
    }

    #[test]
    fn string_escapes_backslash() {
        let encoded = for_rust_string(r"a\b");
        assert_eq!(encoded, r"a\\b");
    }

    #[test]
    fn string_named_escapes() {
        let cases = [("\0", r"\0"), ("\t", r"\t"), ("\n", r"\n"), ("\r", r"\r")];
        for (raw, escaped) in cases {
            assert_eq!(for_rust_string(raw), escaped);
        }
    }

    #[test]
    fn string_hex_escapes_for_controls() {
        let cases = [
            ("\x01", r"\x01"),
            ("\x08", r"\x08"),
            ("\x0B", r"\x0b"),
            ("\x0C", r"\x0c"),
            ("\x1F", r"\x1f"),
            ("\x7F", r"\x7f"),
        ];
        for (raw, escaped) in cases {
            assert_eq!(for_rust_string(raw), escaped);
        }
    }

    #[test]
    fn string_nonchars_replaced() {
        for nonchar in ["\u{FDD0}", "\u{FFFE}"] {
            assert_eq!(for_rust_string(nonchar), " ");
        }
    }

    /// the writer and the string-returning form must agree.
    #[test]
    fn string_writer_matches() {
        let input = "test\0\"\\\n café";
        let mut written = String::new();
        write_rust_string(&mut written, input).unwrap();
        assert_eq!(for_rust_string(input), written);
    }

    // -- for_rust_char --

    #[test]
    fn char_passthrough() {
        for plain in ["hello world", "", "café"] {
            assert_eq!(for_rust_char(plain), plain);
        }
    }

    #[test]
    fn char_escapes_single_quote() {
        let encoded = for_rust_char("a'b");
        assert_eq!(encoded, r"a\'b");
    }

    #[test]
    fn char_passes_double_quote() {
        let encoded = for_rust_char(r#"a"b"#);
        assert_eq!(encoded, r#"a"b"#);
    }

    #[test]
    fn char_escapes_backslash() {
        let encoded = for_rust_char(r"a\b");
        assert_eq!(encoded, r"a\\b");
    }

    #[test]
    fn char_named_escapes() {
        let cases = [("\0", r"\0"), ("\t", r"\t"), ("\n", r"\n"), ("\r", r"\r")];
        for (raw, escaped) in cases {
            assert_eq!(for_rust_char(raw), escaped);
        }
    }

    #[test]
    fn char_hex_escapes_for_controls() {
        for (raw, escaped) in [("\x01", r"\x01"), ("\x7F", r"\x7f")] {
            assert_eq!(for_rust_char(raw), escaped);
        }
    }

    #[test]
    fn char_nonchars_replaced() {
        let encoded = for_rust_char("\u{FDD0}");
        assert_eq!(encoded, " ");
    }

    /// the writer and the string-returning form must agree.
    #[test]
    fn char_writer_matches() {
        let input = "test\0'\\\n café";
        let mut written = String::new();
        write_rust_char(&mut written, input).unwrap();
        assert_eq!(for_rust_char(input), written);
    }

    // -- for_rust_byte_string --

    #[test]
    fn byte_string_passthrough() {
        for plain in ["hello world", ""] {
            assert_eq!(for_rust_byte_string(plain), plain);
        }
    }

    #[test]
    fn byte_string_escapes_double_quote() {
        let encoded = for_rust_byte_string(r#"a"b"#);
        assert_eq!(encoded, r#"a\"b"#);
    }

    #[test]
    fn byte_string_escapes_backslash() {
        let encoded = for_rust_byte_string(r"a\b");
        assert_eq!(encoded, r"a\\b");
    }

    #[test]
    fn byte_string_named_escapes() {
        let cases = [("\0", r"\0"), ("\t", r"\t"), ("\n", r"\n"), ("\r", r"\r")];
        for (raw, escaped) in cases {
            assert_eq!(for_rust_byte_string(raw), escaped);
        }
    }

    #[test]
    fn byte_string_hex_for_controls() {
        for (raw, escaped) in [("\x01", r"\x01"), ("\x7F", r"\x7f")] {
            assert_eq!(for_rust_byte_string(raw), escaped);
        }
    }

    #[test]
    fn byte_string_non_ascii_as_utf8_bytes() {
        let cases = [
            // é = U+00E9 → UTF-8: C3 A9
            ("é", r"\xc3\xa9"),
            // café → only the é is encoded
            ("café", r"caf\xc3\xa9"),
            // 日 = U+65E5 → UTF-8: E6 97 A5
            ("日", r"\xe6\x97\xa5"),
            // 😀 = U+1F600 → UTF-8: F0 9F 98 80
            ("😀", r"\xf0\x9f\x98\x80"),
        ];
        for (raw, escaped) in cases {
            assert_eq!(for_rust_byte_string(raw), escaped);
        }
    }

    #[test]
    fn byte_string_nonchars_as_bytes() {
        // U+FDD0 → UTF-8: EF B7 90
        let encoded = for_rust_byte_string("\u{FDD0}");
        assert_eq!(encoded, r"\xef\xb7\x90");
    }

    #[test]
    fn byte_string_single_quote_passes() {
        let encoded = for_rust_byte_string("a'b");
        assert_eq!(encoded, "a'b");
    }

    /// the writer and the string-returning form must agree.
    #[test]
    fn byte_string_writer_matches() {
        let input = "test\0\"\\café";
        let mut written = String::new();
        write_rust_byte_string(&mut written, input).unwrap();
        assert_eq!(for_rust_byte_string(input), written);
    }
}