// rustpython_literal/escape.rs

1#[derive(Debug, PartialEq, Eq, Copy, Clone, Hash, is_macro::Is)]
2pub enum Quote {
3    Single,
4    Double,
5}
6
7impl Quote {
8    #[inline]
9    pub const fn swap(self) -> Quote {
10        match self {
11            Quote::Single => Quote::Double,
12            Quote::Double => Quote::Single,
13        }
14    }
15
16    #[inline]
17    pub const fn to_byte(&self) -> u8 {
18        match self {
19            Quote::Single => b'\'',
20            Quote::Double => b'"',
21        }
22    }
23
24    #[inline]
25    pub const fn to_char(&self) -> char {
26        match self {
27            Quote::Single => '\'',
28            Quote::Double => '"',
29        }
30    }
31}
32
33pub struct EscapeLayout {
34    pub quote: Quote,
35    pub len: Option<usize>,
36}
37
38pub trait Escape {
39    fn source_len(&self) -> usize;
40    fn layout(&self) -> &EscapeLayout;
41    fn changed(&self) -> bool {
42        self.layout().len != Some(self.source_len())
43    }
44
45    fn write_source(&self, formatter: &mut impl std::fmt::Write) -> std::fmt::Result;
46    fn write_body_slow(&self, formatter: &mut impl std::fmt::Write) -> std::fmt::Result;
47    fn write_body(&self, formatter: &mut impl std::fmt::Write) -> std::fmt::Result {
48        if self.changed() {
49            self.write_body_slow(formatter)
50        } else {
51            self.write_source(formatter)
52        }
53    }
54}
55
56/// Returns the outer quotes to use and the number of quotes that need to be
57/// escaped.
58pub(crate) const fn choose_quote(
59    single_count: usize,
60    double_count: usize,
61    preferred_quote: Quote,
62) -> (Quote, usize) {
63    let (primary_count, secondary_count) = match preferred_quote {
64        Quote::Single => (single_count, double_count),
65        Quote::Double => (double_count, single_count),
66    };
67
68    // always use primary unless we have primary but no secondary
69    let use_secondary = primary_count > 0 && secondary_count == 0;
70    if use_secondary {
71        (preferred_quote.swap(), secondary_count)
72    } else {
73        (preferred_quote, primary_count)
74    }
75}
76
77pub struct UnicodeEscape<'a> {
78    source: &'a str,
79    layout: EscapeLayout,
80}
81
82impl<'a> UnicodeEscape<'a> {
83    #[inline]
84    pub fn with_forced_quote(source: &'a str, quote: Quote) -> Self {
85        let layout = EscapeLayout { quote, len: None };
86        Self { source, layout }
87    }
88    #[inline]
89    pub fn with_preferred_quote(source: &'a str, quote: Quote) -> Self {
90        let layout = Self::repr_layout(source, quote);
91        Self { source, layout }
92    }
93    #[inline]
94    pub fn new_repr(source: &'a str) -> Self {
95        Self::with_preferred_quote(source, Quote::Single)
96    }
97    #[inline]
98    pub fn str_repr<'r>(&'a self) -> StrRepr<'r, 'a> {
99        StrRepr(self)
100    }
101}
102
103pub struct StrRepr<'r, 'a>(&'r UnicodeEscape<'a>);
104
105impl StrRepr<'_, '_> {
106    pub fn write(&self, formatter: &mut impl std::fmt::Write) -> std::fmt::Result {
107        let quote = self.0.layout().quote.to_char();
108        formatter.write_char(quote)?;
109        self.0.write_body(formatter)?;
110        formatter.write_char(quote)
111    }
112
113    pub fn to_string(&self) -> Option<String> {
114        let mut s = String::with_capacity(self.0.layout().len?);
115        self.write(&mut s).unwrap();
116        Some(s)
117    }
118}
119
120impl std::fmt::Display for StrRepr<'_, '_> {
121    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
122        self.write(formatter)
123    }
124}
125
126impl UnicodeEscape<'_> {
127    const REPR_RESERVED_LEN: usize = 2; // for quotes
128
129    pub fn repr_layout(source: &str, preferred_quote: Quote) -> EscapeLayout {
130        Self::output_layout_with_checker(source, preferred_quote, |a, b| {
131            Some((a as isize).checked_add(b as isize)? as usize)
132        })
133    }
134
135    fn output_layout_with_checker(
136        source: &str,
137        preferred_quote: Quote,
138        length_add: impl Fn(usize, usize) -> Option<usize>,
139    ) -> EscapeLayout {
140        let mut out_len = Self::REPR_RESERVED_LEN;
141        let mut single_count = 0;
142        let mut double_count = 0;
143
144        for ch in source.chars() {
145            let incr = match ch {
146                '\'' => {
147                    single_count += 1;
148                    1
149                }
150                '"' => {
151                    double_count += 1;
152                    1
153                }
154                c => Self::escaped_char_len(c),
155            };
156            let Some(new_len) = length_add(out_len, incr) else {
157                #[cold]
158                fn stop(
159                    single_count: usize,
160                    double_count: usize,
161                    preferred_quote: Quote,
162                ) -> EscapeLayout {
163                    EscapeLayout {
164                        quote: choose_quote(single_count, double_count, preferred_quote).0,
165                        len: None,
166                    }
167                }
168                return stop(single_count, double_count, preferred_quote);
169            };
170            out_len = new_len;
171        }
172
173        let (quote, num_escaped_quotes) = choose_quote(single_count, double_count, preferred_quote);
174        // we'll be adding backslashes in front of the existing inner quotes
175        let Some(out_len) = length_add(out_len, num_escaped_quotes) else {
176            return EscapeLayout { quote, len: None };
177        };
178
179        EscapeLayout {
180            quote,
181            len: Some(out_len - Self::REPR_RESERVED_LEN),
182        }
183    }
184
185    fn escaped_char_len(ch: char) -> usize {
186        match ch {
187            '\\' | '\t' | '\r' | '\n' => 2,
188            ch if ch < ' ' || ch as u32 == 0x7f => 4, // \xHH
189            ch if ch.is_ascii() => 1,
190            ch if crate::char::is_printable(ch) => {
191                // max = std::cmp::max(ch, max);
192                ch.len_utf8()
193            }
194            ch if (ch as u32) < 0x100 => 4,   // \xHH
195            ch if (ch as u32) < 0x10000 => 6, // \uHHHH
196            _ => 10,                          // \uHHHHHHHH
197        }
198    }
199
200    fn write_char(
201        ch: char,
202        quote: Quote,
203        formatter: &mut impl std::fmt::Write,
204    ) -> std::fmt::Result {
205        match ch {
206            '\n' => formatter.write_str("\\n"),
207            '\t' => formatter.write_str("\\t"),
208            '\r' => formatter.write_str("\\r"),
209            // these 2 branches *would* be handled below, but we shouldn't have to do a
210            // unicodedata lookup just for ascii characters
211            '\x20'..='\x7e' => {
212                // printable ascii range
213                if ch == quote.to_char() || ch == '\\' {
214                    formatter.write_char('\\')?;
215                }
216                formatter.write_char(ch)
217            }
218            ch if ch.is_ascii() => {
219                write!(formatter, "\\x{:02x}", ch as u8)
220            }
221            ch if crate::char::is_printable(ch) => formatter.write_char(ch),
222            '\0'..='\u{ff}' => {
223                write!(formatter, "\\x{:02x}", ch as u32)
224            }
225            '\0'..='\u{ffff}' => {
226                write!(formatter, "\\u{:04x}", ch as u32)
227            }
228            _ => {
229                write!(formatter, "\\U{:08x}", ch as u32)
230            }
231        }
232    }
233}
234
235impl<'a> Escape for UnicodeEscape<'a> {
236    fn source_len(&self) -> usize {
237        self.source.len()
238    }
239
240    fn layout(&self) -> &EscapeLayout {
241        &self.layout
242    }
243
244    fn write_source(&self, formatter: &mut impl std::fmt::Write) -> std::fmt::Result {
245        formatter.write_str(self.source)
246    }
247
248    #[cold]
249    fn write_body_slow(&self, formatter: &mut impl std::fmt::Write) -> std::fmt::Result {
250        for ch in self.source.chars() {
251            Self::write_char(ch, self.layout().quote, formatter)?;
252        }
253        Ok(())
254    }
255}
256
#[cfg(test)]
mod unicode_escape_tests {
    use super::*;

    /// `changed()` must be false exactly when the repr body equals the source.
    #[test]
    fn changed() {
        fn test(s: &str) -> bool {
            UnicodeEscape::new_repr(s).changed()
        }
        // nothing to escape, or the outer quote can be swapped instead
        assert!(!test(""));
        assert!(!test("hello"));
        assert!(!test("'hello'"));
        assert!(!test("\"hello\""));

        // both quote kinds present: one of them has to be escaped
        assert!(test("'\"hello"));
        // characters that are always escaped
        assert!(test("hello\n"));
        assert!(test("back\\slash"));
        assert!(test("\x7f"));
    }
}
274
275pub struct AsciiEscape<'a> {
276    source: &'a [u8],
277    layout: EscapeLayout,
278}
279
280impl<'a> AsciiEscape<'a> {
281    #[inline]
282    pub fn new(source: &'a [u8], layout: EscapeLayout) -> Self {
283        Self { source, layout }
284    }
285    #[inline]
286    pub fn with_forced_quote(source: &'a [u8], quote: Quote) -> Self {
287        let layout = EscapeLayout { quote, len: None };
288        Self { source, layout }
289    }
290    #[inline]
291    pub fn with_preferred_quote(source: &'a [u8], quote: Quote) -> Self {
292        let layout = Self::repr_layout(source, quote);
293        Self { source, layout }
294    }
295    #[inline]
296    pub fn new_repr(source: &'a [u8]) -> Self {
297        Self::with_preferred_quote(source, Quote::Single)
298    }
299    #[inline]
300    pub fn bytes_repr<'r>(&'a self) -> BytesRepr<'r, 'a> {
301        BytesRepr(self)
302    }
303}
304
305impl AsciiEscape<'_> {
306    pub fn repr_layout(source: &[u8], preferred_quote: Quote) -> EscapeLayout {
307        Self::output_layout_with_checker(source, preferred_quote, 3, |a, b| {
308            Some((a as isize).checked_add(b as isize)? as usize)
309        })
310    }
311
312    pub fn named_repr_layout(source: &[u8], name: &str) -> EscapeLayout {
313        Self::output_layout_with_checker(source, Quote::Single, name.len() + 2 + 3, |a, b| {
314            Some((a as isize).checked_add(b as isize)? as usize)
315        })
316    }
317
318    fn output_layout_with_checker(
319        source: &[u8],
320        preferred_quote: Quote,
321        reserved_len: usize,
322        length_add: impl Fn(usize, usize) -> Option<usize>,
323    ) -> EscapeLayout {
324        let mut out_len = reserved_len;
325        let mut single_count = 0;
326        let mut double_count = 0;
327
328        for ch in source.iter() {
329            let incr = match ch {
330                b'\'' => {
331                    single_count += 1;
332                    1
333                }
334                b'"' => {
335                    double_count += 1;
336                    1
337                }
338                c => Self::escaped_char_len(*c),
339            };
340            let Some(new_len) = length_add(out_len, incr) else {
341                #[cold]
342                fn stop(
343                    single_count: usize,
344                    double_count: usize,
345                    preferred_quote: Quote,
346                ) -> EscapeLayout {
347                    EscapeLayout {
348                        quote: choose_quote(single_count, double_count, preferred_quote).0,
349                        len: None,
350                    }
351                }
352                return stop(single_count, double_count, preferred_quote);
353            };
354            out_len = new_len;
355        }
356
357        let (quote, num_escaped_quotes) = choose_quote(single_count, double_count, preferred_quote);
358        // we'll be adding backslashes in front of the existing inner quotes
359        let Some(out_len) = length_add(out_len, num_escaped_quotes) else {
360            return EscapeLayout { quote, len: None };
361        };
362
363        EscapeLayout {
364            quote,
365            len: Some(out_len - reserved_len),
366        }
367    }
368
369    fn escaped_char_len(ch: u8) -> usize {
370        match ch {
371            b'\\' | b'\t' | b'\r' | b'\n' => 2,
372            0x20..=0x7e => 1,
373            _ => 4, // \xHH
374        }
375    }
376
377    fn write_char(ch: u8, quote: Quote, formatter: &mut impl std::fmt::Write) -> std::fmt::Result {
378        match ch {
379            b'\t' => formatter.write_str("\\t"),
380            b'\n' => formatter.write_str("\\n"),
381            b'\r' => formatter.write_str("\\r"),
382            0x20..=0x7e => {
383                // printable ascii range
384                if ch == quote.to_byte() || ch == b'\\' {
385                    formatter.write_char('\\')?;
386                }
387                formatter.write_char(ch as char)
388            }
389            ch => write!(formatter, "\\x{ch:02x}"),
390        }
391    }
392}
393
394impl<'a> Escape for AsciiEscape<'a> {
395    fn source_len(&self) -> usize {
396        self.source.len()
397    }
398
399    fn layout(&self) -> &EscapeLayout {
400        &self.layout
401    }
402
403    fn write_source(&self, formatter: &mut impl std::fmt::Write) -> std::fmt::Result {
404        formatter.write_str(unsafe {
405            // SAFETY: this function must be called only when source is printable ascii characters
406            std::str::from_utf8_unchecked(self.source)
407        })
408    }
409
410    #[cold]
411    fn write_body_slow(&self, formatter: &mut impl std::fmt::Write) -> std::fmt::Result {
412        for ch in self.source.iter() {
413            Self::write_char(*ch, self.layout().quote, formatter)?;
414        }
415        Ok(())
416    }
417}
418
419pub struct BytesRepr<'r, 'a>(&'r AsciiEscape<'a>);
420
421impl BytesRepr<'_, '_> {
422    pub fn write(&self, formatter: &mut impl std::fmt::Write) -> std::fmt::Result {
423        let quote = self.0.layout().quote.to_char();
424        formatter.write_char('b')?;
425        formatter.write_char(quote)?;
426        self.0.write_body(formatter)?;
427        formatter.write_char(quote)
428    }
429
430    pub fn to_string(&self) -> Option<String> {
431        let mut s = String::with_capacity(self.0.layout().len?);
432        self.write(&mut s).unwrap();
433        Some(s)
434    }
435}
436
437impl std::fmt::Display for BytesRepr<'_, '_> {
438    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
439        self.write(formatter)
440    }
441}