Skip to main content

rustpython_vm/vm/
compile.rs

1//! Python code compilation functions.
2//!
3//! For code execution functions, see python_run.rs
4
5use crate::{
6    PyRef, VirtualMachine,
7    builtins::PyCode,
8    compiler::{self, CompileError, CompileOpts},
9};
10
11impl VirtualMachine {
12    pub fn compile(
13        &self,
14        source: &str,
15        mode: compiler::Mode,
16        source_path: String,
17    ) -> Result<PyRef<PyCode>, CompileError> {
18        self.compile_with_opts(source, mode, source_path, self.compile_opts())
19    }
20
21    pub fn compile_with_opts(
22        &self,
23        source: &str,
24        mode: compiler::Mode,
25        source_path: String,
26        opts: CompileOpts,
27    ) -> Result<PyRef<PyCode>, CompileError> {
28        let code =
29            compiler::compile(source, mode, &source_path, opts).map(|code| self.ctx.new_code(code));
30        #[cfg(feature = "parser")]
31        if code.is_ok() {
32            self.emit_string_escape_warnings(source, &source_path);
33        }
34        code
35    }
36}
37
38/// Scan source for invalid escape sequences in all string literals and emit
39/// SyntaxWarning.
40///
41/// Corresponds to:
42/// - `warn_invalid_escape_sequence()` in `Parser/string_parser.c`
43/// - `_PyTokenizer_warn_invalid_escape_sequence()` in `Parser/tokenizer/helpers.c`
44#[cfg(feature = "parser")]
45mod escape_warnings {
46    use super::*;
47    use crate::warn;
48    use ruff_python_ast::{self as ast, visitor::Visitor};
49    use ruff_text_size::TextRange;
50
51    /// Calculate 1-indexed line number at byte offset in source.
52    fn line_number_at(source: &str, offset: usize) -> usize {
53        source[..offset.min(source.len())]
54            .bytes()
55            .filter(|&b| b == b'\n')
56            .count()
57            + 1
58    }
59
60    /// Get content bounds (start, end byte offsets) of a quoted string literal,
61    /// excluding prefix characters and quote delimiters.
62    fn content_bounds(source: &str, range: TextRange) -> Option<(usize, usize)> {
63        let s = range.start().to_usize();
64        let e = range.end().to_usize();
65        if s >= e || e > source.len() {
66            return None;
67        }
68        let bytes = &source.as_bytes()[s..e];
69        // Skip prefix (u, b, r, etc.) to find the first quote character.
70        let qi = bytes.iter().position(|&c| c == b'\'' || c == b'"')?;
71        let qc = bytes[qi];
72        let ql = if bytes.get(qi + 1) == Some(&qc) && bytes.get(qi + 2) == Some(&qc) {
73            3
74        } else {
75            1
76        };
77        let cs = s + qi + ql;
78        let ce = e.checked_sub(ql)?;
79        if cs <= ce { Some((cs, ce)) } else { None }
80    }
81
82    /// Scan `source[start..end]` for the first invalid escape sequence.
83    /// Returns `Some((invalid_char, byte_offset_in_source))` for the first
84    /// invalid escape found, or `None` if all escapes are valid.
85    ///
86    /// When `is_bytes` is true, `\u`, `\U`, and `\N` are treated as invalid
87    /// (bytes literals only support byte-oriented escapes).
88    ///
89    /// Only reports the **first** invalid escape per string literal, matching
90    /// `_PyUnicode_DecodeUnicodeEscapeInternal2` which stores only the first
91    /// `first_invalid_escape_char`.
92    fn first_invalid_escape(
93        source: &str,
94        start: usize,
95        end: usize,
96        is_bytes: bool,
97    ) -> Option<(char, usize)> {
98        let raw = &source[start..end];
99        let mut chars = raw.char_indices().peekable();
100        while let Some((i, ch)) = chars.next() {
101            if ch != '\\' {
102                continue;
103            }
104            let Some((_, next)) = chars.next() else {
105                break;
106            };
107            let valid = match next {
108                '\\' | '\'' | '"' | 'a' | 'b' | 'f' | 'n' | 'r' | 't' | 'v' => true,
109                '\n' => true,
110                '\r' => {
111                    if matches!(chars.peek(), Some(&(_, '\n'))) {
112                        chars.next();
113                    }
114                    true
115                }
116                '0'..='7' => {
117                    for _ in 0..2 {
118                        if matches!(chars.peek(), Some(&(_, '0'..='7'))) {
119                            chars.next();
120                        } else {
121                            break;
122                        }
123                    }
124                    true
125                }
126                'x' | 'u' | 'U' => {
127                    // \u and \U are only valid in string literals, not bytes
128                    if is_bytes && next != 'x' {
129                        false
130                    } else {
131                        let count = match next {
132                            'x' => 2,
133                            'u' => 4,
134                            'U' => 8,
135                            _ => unreachable!(),
136                        };
137                        for _ in 0..count {
138                            if chars.peek().is_some_and(|&(_, c)| c.is_ascii_hexdigit()) {
139                                chars.next();
140                            } else {
141                                break;
142                            }
143                        }
144                        true
145                    }
146                }
147                'N' => {
148                    // \N{name} is only valid in string literals, not bytes
149                    if is_bytes {
150                        false
151                    } else {
152                        if matches!(chars.peek(), Some(&(_, '{'))) {
153                            chars.next();
154                            for (_, c) in chars.by_ref() {
155                                if c == '}' {
156                                    break;
157                                }
158                            }
159                        }
160                        true
161                    }
162                }
163                _ => false,
164            };
165            if !valid {
166                return Some((next, start + i));
167            }
168        }
169        None
170    }
171
172    /// Emit `SyntaxWarning` for an invalid escape sequence.
173    ///
174    /// `warn_invalid_escape_sequence()` in `Parser/string_parser.c`
175    fn warn_invalid_escape_sequence(
176        source: &str,
177        ch: char,
178        offset: usize,
179        filename: &str,
180        vm: &VirtualMachine,
181    ) {
182        let lineno = line_number_at(source, offset);
183        let message = vm.ctx.new_str(format!(
184            "\"\\{ch}\" is an invalid escape sequence. \
185             Such sequences will not work in the future. \
186             Did you mean \"\\\\{ch}\"? A raw string is also an option."
187        ));
188        let fname = vm.ctx.new_str(filename);
189        let _ = warn::warn_explicit(
190            Some(vm.ctx.exceptions.syntax_warning.to_owned()),
191            message.into(),
192            fname,
193            lineno,
194            None,
195            vm.ctx.none(),
196            None,
197            None,
198            vm,
199        );
200    }
201
    /// AST visitor that checks string, bytes and f-string literals for
    /// invalid escape sequences and emits a warning for each offending
    /// literal.
    struct EscapeWarningVisitor<'a> {
        /// Full source text; literal ranges are resolved against it and line
        /// numbers are computed from it.
        source: &'a str,
        /// File name the emitted warnings are attributed to.
        filename: &'a str,
        /// VM used to build the warning objects and emit them.
        vm: &'a VirtualMachine,
    }
207
208    impl<'a> EscapeWarningVisitor<'a> {
209        /// Check a quoted string/bytes literal for invalid escapes.
210        /// The range must include the prefix and quote delimiters.
211        fn check_quoted_literal(&self, range: TextRange, is_bytes: bool) {
212            if let Some((start, end)) = content_bounds(self.source, range)
213                && let Some((ch, offset)) = first_invalid_escape(self.source, start, end, is_bytes)
214            {
215                warn_invalid_escape_sequence(self.source, ch, offset, self.filename, self.vm);
216            }
217        }
218
219        /// Check an f-string literal element for invalid escapes.
220        /// The range covers content only (no prefix/quotes).
221        ///
222        /// Also handles `\{` / `\}` at the literal–interpolation boundary,
223        /// equivalent to `_PyTokenizer_warn_invalid_escape_sequence` handling
224        /// `FSTRING_MIDDLE` / `FSTRING_END` tokens.
225        fn check_fstring_literal(&self, range: TextRange) {
226            let start = range.start().to_usize();
227            let end = range.end().to_usize();
228            if start >= end || end > self.source.len() {
229                return;
230            }
231            if let Some((ch, offset)) = first_invalid_escape(self.source, start, end, false) {
232                warn_invalid_escape_sequence(self.source, ch, offset, self.filename, self.vm);
233                return;
234            }
235            // In CPython, _PyTokenizer_warn_invalid_escape_sequence handles
236            // `\{` and `\}` for FSTRING_MIDDLE/FSTRING_END tokens.  Ruff
237            // splits the literal element before the interpolation delimiter,
238            // so the `\` sits at the end of the literal range and the `{`/`}`
239            // sits just after it.  Only warn when the number of trailing
240            // backslashes is odd (an even count means they are all escaped).
241            let trailing_bs = self.source.as_bytes()[start..end]
242                .iter()
243                .rev()
244                .take_while(|&&b| b == b'\\')
245                .count();
246            if trailing_bs % 2 == 1
247                && let Some(&after) = self.source.as_bytes().get(end)
248                && (after == b'{' || after == b'}')
249            {
250                warn_invalid_escape_sequence(
251                    self.source,
252                    after as char,
253                    end - 1,
254                    self.filename,
255                    self.vm,
256                );
257            }
258        }
259
260        /// Visit f-string elements, checking literals and recursing into
261        /// interpolation expressions and format specs.
262        fn visit_fstring_elements(&mut self, elements: &'a ast::InterpolatedStringElements) {
263            for element in elements {
264                match element {
265                    ast::InterpolatedStringElement::Literal(lit) => {
266                        self.check_fstring_literal(lit.range);
267                    }
268                    ast::InterpolatedStringElement::Interpolation(interp) => {
269                        self.visit_expr(&interp.expression);
270                        if let Some(spec) = &interp.format_spec {
271                            self.visit_fstring_elements(&spec.elements);
272                        }
273                    }
274                }
275            }
276        }
277    }
278
279    impl<'a> Visitor<'a> for EscapeWarningVisitor<'a> {
280        fn visit_expr(&mut self, expr: &'a ast::Expr) {
281            match expr {
282                // Regular string literals — decode_unicode_with_escapes path
283                ast::Expr::StringLiteral(string) => {
284                    for part in string.value.as_slice() {
285                        if !matches!(
286                            part.flags.prefix(),
287                            ast::str_prefix::StringLiteralPrefix::Raw { .. }
288                        ) {
289                            self.check_quoted_literal(part.range, false);
290                        }
291                    }
292                }
293                // Byte string literals — decode_bytes_with_escapes path
294                ast::Expr::BytesLiteral(bytes) => {
295                    for part in bytes.value.as_slice() {
296                        if !matches!(
297                            part.flags.prefix(),
298                            ast::str_prefix::ByteStringPrefix::Raw { .. }
299                        ) {
300                            self.check_quoted_literal(part.range, true);
301                        }
302                    }
303                }
304                // F-string literals — tokenizer + string_parser paths
305                ast::Expr::FString(fstring_expr) => {
306                    for part in fstring_expr.value.as_slice() {
307                        match part {
308                            ast::FStringPart::Literal(string_lit) => {
309                                // Plain string part in f-string concatenation
310                                if !matches!(
311                                    string_lit.flags.prefix(),
312                                    ast::str_prefix::StringLiteralPrefix::Raw { .. }
313                                ) {
314                                    self.check_quoted_literal(string_lit.range, false);
315                                }
316                            }
317                            ast::FStringPart::FString(fstring) => {
318                                if matches!(
319                                    fstring.flags.prefix(),
320                                    ast::str_prefix::FStringPrefix::Raw { .. }
321                                ) {
322                                    continue;
323                                }
324                                self.visit_fstring_elements(&fstring.elements);
325                            }
326                        }
327                    }
328                }
329                _ => ast::visitor::walk_expr(self, expr),
330            }
331        }
332    }
333
334    impl VirtualMachine {
335        /// Walk all string literals in `source` and emit `SyntaxWarning` for
336        /// each that contains an invalid escape sequence.
337        pub(super) fn emit_string_escape_warnings(&self, source: &str, filename: &str) {
338            let Ok(parsed) =
339                ruff_python_parser::parse(source, ruff_python_parser::Mode::Module.into())
340            else {
341                return;
342            };
343            let ast = parsed.into_syntax();
344            let mut visitor = EscapeWarningVisitor {
345                source,
346                filename,
347                vm: self,
348            };
349            match ast {
350                ast::Mod::Module(module) => {
351                    for stmt in &module.body {
352                        visitor.visit_stmt(stmt);
353                    }
354                }
355                ast::Mod::Expression(expr) => {
356                    visitor.visit_expr(&expr.body);
357                }
358            }
359        }
360    }
361}