Skip to main content

contextual_encoder/
python.rs

1//! python literal encoders.
2//!
3//! encodes untrusted strings for safe embedding in python source literals.
4//!
5//! - [`for_python_string`] — safe for python string literals (`"..."` or `'...'`)
6//! - [`for_python_bytes`] — safe for python bytes literals (`b"..."` or `b'...'`)
7//! - [`for_python_raw_string`] — safe for python raw string literals
8//!   (`r"..."` or `r'...'`)
9//!
10//! # encoding rules
11//!
12//! ## string and bytes
13//!
14//! both encoders use python's native escape syntax:
15//!
16//! - named escapes: `\a`, `\b`, `\t`, `\n`, `\v`, `\f`, `\r`, `\\`, `\"`, `\'`
17//! - other C0 controls and DEL → `\xHH`
18//! - unicode non-characters → space (string) or `\xHH` per byte (bytes)
19//!
20//! both quote characters are escaped, making the output safe regardless of
21//! which delimiter (`"` or `'`) is used.
22//!
23//! the encoders differ in how non-ASCII is handled:
24//!
25//! | encoder | non-ASCII |
26//! |---------|-----------|
27//! | `for_python_string` | passes through |
28//! | `for_python_bytes` | each UTF-8 byte → `\xHH` |
29//!
30//! ## raw string
31//!
32//! raw strings do not process escape sequences, so the encoder replaces
33//! dangerous characters with space:
34//!
35//! - quotes (`"` and `'`) → space
36//! - C0 controls and DEL → space
37//! - unicode non-characters → space
38//! - trailing odd backslash → replaced with space (raw strings cannot
39//!   end with an odd number of backslashes)
40
41use std::fmt;
42
43use crate::engine::{
44    encode_loop, is_unicode_noncharacter, write_c0_named_escape, write_utf8_hex_bytes,
45};
46
47// ---------------------------------------------------------------------------
48// for_python_string — safe for Python string literals ("..." or '...')
49// ---------------------------------------------------------------------------
50
51/// encodes `input` for safe embedding in a python string literal
52/// (`"..."` or `'...'`).
53///
54/// escapes backslashes, both quote characters, and control characters using
55/// python's escape syntax. non-ASCII unicode passes through unchanged
56/// (python 3 source files are UTF-8 by default). unicode non-characters
57/// are replaced with space.
58///
59/// # examples
60///
61/// ```
62/// use contextual_encoder::for_python_string;
63///
64/// assert_eq!(for_python_string(r#"say "hi""#), r#"say \"hi\""#);
65/// assert_eq!(for_python_string("it's"), r"it\'s");
66/// assert_eq!(for_python_string("line\nbreak"), r"line\nbreak");
67/// assert_eq!(for_python_string("café"), "café");
68/// ```
69pub fn for_python_string(input: &str) -> String {
70    let mut out = String::with_capacity(input.len());
71    write_python_string(&mut out, input).expect("writing to string cannot fail");
72    out
73}
74
75/// writes the python-string-encoded form of `input` to `out`.
76///
77/// see [`for_python_string`] for encoding rules.
78pub fn write_python_string<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
79    encode_loop(out, input, needs_python_string_encoding, |out, c, _next| {
80        write_python_text_encoded(out, c)
81    })
82}
83
84fn needs_python_string_encoding(c: char) -> bool {
85    matches!(c, '\x00'..='\x1F' | '\x7F' | '"' | '\'' | '\\') || is_unicode_noncharacter(c as u32)
86}
87
88// ---------------------------------------------------------------------------
89// for_python_bytes — safe for Python bytes literals (b"..." or b'...')
90// ---------------------------------------------------------------------------
91
92/// encodes `input` for safe embedding in a python bytes literal
93/// (`b"..."` or `b'...'`).
94///
95/// escapes backslashes, both quote characters, and control characters.
96/// non-ASCII characters are encoded as their individual UTF-8 bytes
97/// using `\xHH` notation, since bytes literals only accept ASCII directly.
98///
99/// # examples
100///
101/// ```
102/// use contextual_encoder::for_python_bytes;
103///
104/// assert_eq!(for_python_bytes("hello"), "hello");
105/// assert_eq!(for_python_bytes(r#"say "hi""#), r#"say \"hi\""#);
106/// assert_eq!(for_python_bytes("café"), r"caf\xc3\xa9");
107/// assert_eq!(for_python_bytes("null\x00byte"), r"null\x00byte");
108/// ```
109pub fn for_python_bytes(input: &str) -> String {
110    let mut out = String::with_capacity(input.len());
111    write_python_bytes(&mut out, input).expect("writing to string cannot fail");
112    out
113}
114
115/// writes the python-bytes-encoded form of `input` to `out`.
116///
117/// see [`for_python_bytes`] for encoding rules.
118pub fn write_python_bytes<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
119    encode_loop(
120        out,
121        input,
122        needs_python_bytes_encoding,
123        write_python_bytes_encoded,
124    )
125}
126
/// reports whether `c` must be rewritten inside a python bytes literal:
/// anything outside ASCII, plus ASCII controls (C0 and DEL), both quotes,
/// and backslash.
fn needs_python_bytes_encoding(c: char) -> bool {
    if !c.is_ascii() {
        return true;
    }
    // `is_ascii_control` covers exactly U+0000..=U+001F and U+007F.
    c.is_ascii_control() || matches!(c, '"' | '\'' | '\\')
}
130
131fn write_python_bytes_encoded<W: fmt::Write>(
132    out: &mut W,
133    c: char,
134    _next: Option<char>,
135) -> fmt::Result {
136    if let Some(r) = write_c0_named_escape(out, c) {
137        return r;
138    }
139    match c {
140        '"' => out.write_str("\\\""),
141        '\'' => out.write_str("\\'"),
142        // non-ASCII → encode each UTF-8 byte
143        c if !c.is_ascii() => write_utf8_hex_bytes(out, c),
144        // other C0 controls and DEL
145        c => write!(out, "\\x{:02x}", c as u32),
146    }
147}
148
149// ---------------------------------------------------------------------------
150// shared helper for string encoder
151// ---------------------------------------------------------------------------
152
153/// writes the encoded form of a character for python string context.
154fn write_python_text_encoded<W: fmt::Write>(out: &mut W, c: char) -> fmt::Result {
155    if let Some(r) = write_c0_named_escape(out, c) {
156        return r;
157    }
158    match c {
159        '"' => out.write_str("\\\""),
160        '\'' => out.write_str("\\'"),
161        c if is_unicode_noncharacter(c as u32) => out.write_char(' '),
162        // other C0 controls and DEL
163        c => write!(out, "\\x{:02x}", c as u32),
164    }
165}
166
167// ---------------------------------------------------------------------------
168// for_python_raw_string — safe for Python raw string literals (r"..." or r'...')
169// ---------------------------------------------------------------------------
170
171/// encodes `input` for safe embedding in a python raw string literal
172/// (`r"..."` or `r'...'`).
173///
174/// raw strings do not process escape sequences, so dangerous characters
175/// are replaced with space. both quote characters are replaced (making
176/// the output safe regardless of which delimiter is used). if the input
177/// would end with an odd number of backslashes, the last is replaced
178/// with space (raw strings cannot end with an odd backslash count).
179///
180/// # examples
181///
182/// ```
183/// use contextual_encoder::for_python_raw_string;
184///
185/// assert_eq!(for_python_raw_string("hello"), "hello");
186/// assert_eq!(for_python_raw_string(r#"a"b"#), "a b");
187/// assert_eq!(for_python_raw_string(r"path\to\file"), r"path\to\file");
188/// assert_eq!(for_python_raw_string(r"trailing\"), "trailing ");
189/// ```
190pub fn for_python_raw_string(input: &str) -> String {
191    let mut out = String::with_capacity(input.len());
192    write_python_raw_string(&mut out, input).expect("writing to string cannot fail");
193    out
194}
195
196/// writes the python-raw-string-encoded form of `input` to `out`.
197///
198/// see [`for_python_raw_string`] for encoding rules.
199pub fn write_python_raw_string<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
200    let trailing_bs = input.bytes().rev().take_while(|&b| b == b'\\').count();
201    let cutoff = if trailing_bs % 2 == 1 {
202        input.len() - 1
203    } else {
204        input.len()
205    };
206
207    for (i, c) in input.char_indices() {
208        if i >= cutoff {
209            // trailing odd backslash — replace with space
210            out.write_char(' ')?;
211        } else if needs_python_raw_string_encoding(c) {
212            out.write_char(' ')?;
213        } else {
214            out.write_char(c)?;
215        }
216    }
217    Ok(())
218}
219
220fn needs_python_raw_string_encoding(c: char) -> bool {
221    matches!(c, '\x00'..='\x1F' | '\x7F' | '"' | '\'') || is_unicode_noncharacter(c as u32)
222}
223
#[cfg(test)]
mod tests {
    use super::*;

    /// the seven control characters that have a named python escape,
    /// paired with the escape text both string and bytes encoders emit.
    const NAMED_ESCAPES: &[(&str, &str)] = &[
        ("\x07", "\\a"),
        ("\x08", "\\b"),
        ("\t", "\\t"),
        ("\n", "\\n"),
        ("\x0B", "\\v"),
        ("\x0C", "\\f"),
        ("\r", "\\r"),
    ];

    /// runs `encode` over every `(input, expected)` pair and compares.
    fn check(encode: fn(&str) -> String, cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(encode(input), expected);
        }
    }

    /// asserts that `encode` returns every input unchanged.
    fn check_unchanged(encode: fn(&str) -> String, inputs: &[&str]) {
        for &input in inputs {
            assert_eq!(encode(input), input);
        }
    }

    // -- for_python_string --

    #[test]
    fn string_passthrough() {
        check_unchanged(
            for_python_string,
            &[
                "hello world",
                "",
                "cafe\u{0301} \u{65E5}\u{672C}\u{8A9E}",
                "\u{1F600}",
            ],
        );
    }

    #[test]
    fn string_escapes_double_quote() {
        check(for_python_string, &[(r#"a"b"#, r#"a\"b"#)]);
    }

    #[test]
    fn string_escapes_single_quote() {
        check(for_python_string, &[("a'b", r"a\'b")]);
    }

    #[test]
    fn string_escapes_backslash() {
        check(for_python_string, &[(r"a\b", r"a\\b")]);
    }

    #[test]
    fn string_named_escapes() {
        check(for_python_string, NAMED_ESCAPES);
    }

    #[test]
    fn string_hex_escapes_for_controls() {
        check(
            for_python_string,
            &[
                ("\x00", "\\x00"),
                ("\x01", "\\x01"),
                ("\x06", "\\x06"),
                ("\x0E", "\\x0e"),
                ("\x1F", "\\x1f"),
                ("\x7F", "\\x7f"),
            ],
        );
    }

    #[test]
    fn string_nonchars_replaced() {
        check(for_python_string, &[("\u{FDD0}", " "), ("\u{FFFE}", " ")]);
    }

    #[test]
    fn string_writer_matches() {
        let input = "test\x00\"'\\\n cafe\u{0301}";
        let mut written = String::new();
        write_python_string(&mut written, input).unwrap();
        assert_eq!(for_python_string(input), written);
    }

    // -- for_python_bytes --

    #[test]
    fn bytes_passthrough() {
        check_unchanged(for_python_bytes, &["hello world", ""]);
    }

    #[test]
    fn bytes_escapes_double_quote() {
        check(for_python_bytes, &[(r#"a"b"#, r#"a\"b"#)]);
    }

    #[test]
    fn bytes_escapes_single_quote() {
        check(for_python_bytes, &[("a'b", r"a\'b")]);
    }

    #[test]
    fn bytes_escapes_backslash() {
        check(for_python_bytes, &[(r"a\b", r"a\\b")]);
    }

    #[test]
    fn bytes_named_escapes() {
        check(for_python_bytes, NAMED_ESCAPES);
    }

    #[test]
    fn bytes_hex_for_controls() {
        check(
            for_python_bytes,
            &[("\x00", "\\x00"), ("\x01", "\\x01"), ("\x7F", "\\x7f")],
        );
    }

    #[test]
    fn bytes_non_ascii_as_utf8_bytes() {
        check(
            for_python_bytes,
            &[
                // combining accent U+0301 → UTF-8: CC 81
                ("\u{0301}", r"\xcc\x81"),
                // cafe + combining accent
                ("cafe\u{0301}", r"cafe\xcc\x81"),
                // 日 = U+65E5 → UTF-8: E6 97 A5
                ("\u{65E5}", r"\xe6\x97\xa5"),
                // 😀 = U+1F600 → UTF-8: F0 9F 98 80
                ("\u{1F600}", r"\xf0\x9f\x98\x80"),
            ],
        );
    }

    #[test]
    fn bytes_nonchars_as_bytes() {
        // U+FDD0 → UTF-8: EF B7 90
        check(for_python_bytes, &[("\u{FDD0}", r"\xef\xb7\x90")]);
    }

    #[test]
    fn bytes_writer_matches() {
        let input = "test\x00\"'\\cafe\u{0301}";
        let mut written = String::new();
        write_python_bytes(&mut written, input).unwrap();
        assert_eq!(for_python_bytes(input), written);
    }

    // -- for_python_raw_string --

    #[test]
    fn raw_passthrough() {
        check_unchanged(for_python_raw_string, &["hello world", ""]);
    }

    #[test]
    fn raw_quotes_replaced() {
        check(
            for_python_raw_string,
            &[(r#"a"b"#, "a b"), ("a'b", "a b"), (r#"a"b'c"#, "a b c")],
        );
    }

    #[test]
    fn raw_controls_replaced() {
        check(
            for_python_raw_string,
            &[
                ("\x00", " "),
                ("\x01", " "),
                ("\t", " "),
                ("\n", " "),
                ("\x7F", " "),
            ],
        );
    }

    #[test]
    fn raw_backslash_in_middle() {
        check_unchanged(for_python_raw_string, &[r"a\b", r"path\to\file"]);
    }

    #[test]
    fn raw_trailing_even_backslashes() {
        check_unchanged(for_python_raw_string, &[r"ab\\", r"ab\\\\"]);
    }

    #[test]
    fn raw_trailing_odd_backslash_replaced() {
        check(
            for_python_raw_string,
            &[
                (r"trailing\", "trailing "),
                (r"ab\\\", "ab\\\\ "),
                (r"\", " "),
            ],
        );
    }

    #[test]
    fn raw_nonchars_replaced() {
        check(
            for_python_raw_string,
            &[("\u{FDD0}", " "), ("\u{FFFE}", " ")],
        );
    }

    #[test]
    fn raw_non_ascii_passes_through() {
        check_unchanged(for_python_raw_string, &["café", "日本語", "😀"]);
    }

    #[test]
    fn raw_writer_matches() {
        let input = "test\x00\"'\\path\\to";
        let mut written = String::new();
        write_python_raw_string(&mut written, input).unwrap();
        assert_eq!(for_python_raw_string(input), written);
    }
}
421}