rustpython_vm/vm/compile.rs
1//! Python code compilation functions.
2//!
3//! For code execution functions, see python_run.rs
4
5use crate::{
6 PyRef, VirtualMachine,
7 builtins::PyCode,
8 compiler::{self, CompileError, CompileOpts},
9};
10
11impl VirtualMachine {
12 pub fn compile(
13 &self,
14 source: &str,
15 mode: compiler::Mode,
16 source_path: String,
17 ) -> Result<PyRef<PyCode>, CompileError> {
18 self.compile_with_opts(source, mode, source_path, self.compile_opts())
19 }
20
21 pub fn compile_with_opts(
22 &self,
23 source: &str,
24 mode: compiler::Mode,
25 source_path: String,
26 opts: CompileOpts,
27 ) -> Result<PyRef<PyCode>, CompileError> {
28 let code =
29 compiler::compile(source, mode, &source_path, opts).map(|code| self.ctx.new_code(code));
30 #[cfg(feature = "parser")]
31 if code.is_ok() {
32 self.emit_string_escape_warnings(source, &source_path);
33 }
34 code
35 }
36}
37
38/// Scan source for invalid escape sequences in all string literals and emit
39/// SyntaxWarning.
40///
41/// Corresponds to:
42/// - `warn_invalid_escape_sequence()` in `Parser/string_parser.c`
43/// - `_PyTokenizer_warn_invalid_escape_sequence()` in `Parser/tokenizer/helpers.c`
44#[cfg(feature = "parser")]
45mod escape_warnings {
46 use super::*;
47 use crate::warn;
48 use ruff_python_ast::{self as ast, visitor::Visitor};
49 use ruff_text_size::TextRange;
50
/// Calculate the 1-indexed line number at byte `offset` in `source`.
///
/// Counts the `\n` bytes that occur strictly before `offset` and adds one.
/// An `offset` past the end of `source` is clamped to `source.len()`.
///
/// The count is done on the raw bytes, so an `offset` that falls inside a
/// multi-byte UTF-8 sequence is handled without panicking (slicing the
/// `&str` itself would panic on a non-char-boundary index).
fn line_number_at(source: &str, offset: usize) -> usize {
    let bytes = source.as_bytes();
    let upto = offset.min(bytes.len());
    bytes[..upto].iter().filter(|&&b| b == b'\n').count() + 1
}
59
60 /// Get content bounds (start, end byte offsets) of a quoted string literal,
61 /// excluding prefix characters and quote delimiters.
62 fn content_bounds(source: &str, range: TextRange) -> Option<(usize, usize)> {
63 let s = range.start().to_usize();
64 let e = range.end().to_usize();
65 if s >= e || e > source.len() {
66 return None;
67 }
68 let bytes = &source.as_bytes()[s..e];
69 // Skip prefix (u, b, r, etc.) to find the first quote character.
70 let qi = bytes.iter().position(|&c| c == b'\'' || c == b'"')?;
71 let qc = bytes[qi];
72 let ql = if bytes.get(qi + 1) == Some(&qc) && bytes.get(qi + 2) == Some(&qc) {
73 3
74 } else {
75 1
76 };
77 let cs = s + qi + ql;
78 let ce = e.checked_sub(ql)?;
79 if cs <= ce { Some((cs, ce)) } else { None }
80 }
81
/// Find the first invalid backslash escape in `source[start..end]`.
///
/// Returns `Some((escaped_char, offset))` where `escaped_char` is the
/// character following the backslash and `offset` is the byte offset of the
/// backslash itself within `source`. Returns `None` when every escape in the
/// slice is recognised, or when the slice ends with a lone backslash.
///
/// With `is_bytes` set, `\u`, `\U` and `\N` are reported as invalid, since
/// bytes literals only support byte-oriented escapes.
///
/// Scanning stops at the first hit, mirroring
/// `_PyUnicode_DecodeUnicodeEscapeInternal2`, which records only the first
/// `first_invalid_escape_char`.
fn first_invalid_escape(
    source: &str,
    start: usize,
    end: usize,
    is_bytes: bool,
) -> Option<(char, usize)> {
    /// Consume at most `limit` upcoming characters for which `accept` holds.
    fn eat_up_to(
        it: &mut std::iter::Peekable<std::str::CharIndices<'_>>,
        limit: usize,
        accept: impl Fn(char) -> bool,
    ) {
        for _ in 0..limit {
            if it.next_if(|&(_, c)| accept(c)).is_none() {
                break;
            }
        }
    }

    let mut it = source[start..end].char_indices().peekable();
    loop {
        // Running out of input anywhere means nothing invalid was found.
        let (pos, ch) = it.next()?;
        if ch != '\\' {
            continue;
        }
        let (_, escaped) = it.next()?;

        let recognised = match escaped {
            // Single-character escapes and backslash-newline continuation.
            '\\' | '\'' | '"' | 'a' | 'b' | 'f' | 'n' | 'r' | 't' | 'v' | '\n' => true,
            // Backslash-CR, optionally followed by LF.
            '\r' => {
                let _ = it.next_if(|&(_, c)| c == '\n');
                true
            }
            // Octal: the digit just read plus up to two more.
            '0'..='7' => {
                eat_up_to(&mut it, 2, |c| matches!(c, '0'..='7'));
                true
            }
            'x' => {
                eat_up_to(&mut it, 2, |c| c.is_ascii_hexdigit());
                true
            }
            // \u and \U are only valid in string literals, not bytes.
            'u' | 'U' if is_bytes => false,
            'u' => {
                eat_up_to(&mut it, 4, |c| c.is_ascii_hexdigit());
                true
            }
            'U' => {
                eat_up_to(&mut it, 8, |c| c.is_ascii_hexdigit());
                true
            }
            // \N{name} is only valid in string literals, not bytes.
            'N' if is_bytes => false,
            'N' => {
                if it.next_if(|&(_, c)| c == '{').is_some() {
                    // Skip through the closing brace (or to the end of input).
                    let _ = it.by_ref().find(|&(_, c)| c == '}');
                }
                true
            }
            _ => false,
        };

        if !recognised {
            return Some((escaped, start + pos));
        }
    }
}
171
172 /// Emit `SyntaxWarning` for an invalid escape sequence.
173 ///
174 /// `warn_invalid_escape_sequence()` in `Parser/string_parser.c`
175 fn warn_invalid_escape_sequence(
176 source: &str,
177 ch: char,
178 offset: usize,
179 filename: &str,
180 vm: &VirtualMachine,
181 ) {
182 let lineno = line_number_at(source, offset);
183 let message = vm.ctx.new_str(format!(
184 "\"\\{ch}\" is an invalid escape sequence. \
185 Such sequences will not work in the future. \
186 Did you mean \"\\\\{ch}\"? A raw string is also an option."
187 ));
188 let fname = vm.ctx.new_str(filename);
189 let _ = warn::warn_explicit(
190 Some(vm.ctx.exceptions.syntax_warning.to_owned()),
191 message.into(),
192 fname,
193 lineno,
194 None,
195 vm.ctx.none(),
196 None,
197 None,
198 vm,
199 );
200 }
201
202 struct EscapeWarningVisitor<'a> {
203 source: &'a str,
204 filename: &'a str,
205 vm: &'a VirtualMachine,
206 }
207
208 impl<'a> EscapeWarningVisitor<'a> {
209 /// Check a quoted string/bytes literal for invalid escapes.
210 /// The range must include the prefix and quote delimiters.
211 fn check_quoted_literal(&self, range: TextRange, is_bytes: bool) {
212 if let Some((start, end)) = content_bounds(self.source, range)
213 && let Some((ch, offset)) = first_invalid_escape(self.source, start, end, is_bytes)
214 {
215 warn_invalid_escape_sequence(self.source, ch, offset, self.filename, self.vm);
216 }
217 }
218
219 /// Check an f-string literal element for invalid escapes.
220 /// The range covers content only (no prefix/quotes).
221 ///
222 /// Also handles `\{` / `\}` at the literal–interpolation boundary,
223 /// equivalent to `_PyTokenizer_warn_invalid_escape_sequence` handling
224 /// `FSTRING_MIDDLE` / `FSTRING_END` tokens.
225 fn check_fstring_literal(&self, range: TextRange) {
226 let start = range.start().to_usize();
227 let end = range.end().to_usize();
228 if start >= end || end > self.source.len() {
229 return;
230 }
231 if let Some((ch, offset)) = first_invalid_escape(self.source, start, end, false) {
232 warn_invalid_escape_sequence(self.source, ch, offset, self.filename, self.vm);
233 return;
234 }
235 // In CPython, _PyTokenizer_warn_invalid_escape_sequence handles
236 // `\{` and `\}` for FSTRING_MIDDLE/FSTRING_END tokens. Ruff
237 // splits the literal element before the interpolation delimiter,
238 // so the `\` sits at the end of the literal range and the `{`/`}`
239 // sits just after it. Only warn when the number of trailing
240 // backslashes is odd (an even count means they are all escaped).
241 let trailing_bs = self.source.as_bytes()[start..end]
242 .iter()
243 .rev()
244 .take_while(|&&b| b == b'\\')
245 .count();
246 if trailing_bs % 2 == 1
247 && let Some(&after) = self.source.as_bytes().get(end)
248 && (after == b'{' || after == b'}')
249 {
250 warn_invalid_escape_sequence(
251 self.source,
252 after as char,
253 end - 1,
254 self.filename,
255 self.vm,
256 );
257 }
258 }
259
260 /// Visit f-string elements, checking literals and recursing into
261 /// interpolation expressions and format specs.
262 fn visit_fstring_elements(&mut self, elements: &'a ast::InterpolatedStringElements) {
263 for element in elements {
264 match element {
265 ast::InterpolatedStringElement::Literal(lit) => {
266 self.check_fstring_literal(lit.range);
267 }
268 ast::InterpolatedStringElement::Interpolation(interp) => {
269 self.visit_expr(&interp.expression);
270 if let Some(spec) = &interp.format_spec {
271 self.visit_fstring_elements(&spec.elements);
272 }
273 }
274 }
275 }
276 }
277 }
278
279 impl<'a> Visitor<'a> for EscapeWarningVisitor<'a> {
280 fn visit_expr(&mut self, expr: &'a ast::Expr) {
281 match expr {
282 // Regular string literals — decode_unicode_with_escapes path
283 ast::Expr::StringLiteral(string) => {
284 for part in string.value.as_slice() {
285 if !matches!(
286 part.flags.prefix(),
287 ast::str_prefix::StringLiteralPrefix::Raw { .. }
288 ) {
289 self.check_quoted_literal(part.range, false);
290 }
291 }
292 }
293 // Byte string literals — decode_bytes_with_escapes path
294 ast::Expr::BytesLiteral(bytes) => {
295 for part in bytes.value.as_slice() {
296 if !matches!(
297 part.flags.prefix(),
298 ast::str_prefix::ByteStringPrefix::Raw { .. }
299 ) {
300 self.check_quoted_literal(part.range, true);
301 }
302 }
303 }
304 // F-string literals — tokenizer + string_parser paths
305 ast::Expr::FString(fstring_expr) => {
306 for part in fstring_expr.value.as_slice() {
307 match part {
308 ast::FStringPart::Literal(string_lit) => {
309 // Plain string part in f-string concatenation
310 if !matches!(
311 string_lit.flags.prefix(),
312 ast::str_prefix::StringLiteralPrefix::Raw { .. }
313 ) {
314 self.check_quoted_literal(string_lit.range, false);
315 }
316 }
317 ast::FStringPart::FString(fstring) => {
318 if matches!(
319 fstring.flags.prefix(),
320 ast::str_prefix::FStringPrefix::Raw { .. }
321 ) {
322 continue;
323 }
324 self.visit_fstring_elements(&fstring.elements);
325 }
326 }
327 }
328 }
329 _ => ast::visitor::walk_expr(self, expr),
330 }
331 }
332 }
333
334 impl VirtualMachine {
335 /// Walk all string literals in `source` and emit `SyntaxWarning` for
336 /// each that contains an invalid escape sequence.
337 pub(super) fn emit_string_escape_warnings(&self, source: &str, filename: &str) {
338 let Ok(parsed) =
339 ruff_python_parser::parse(source, ruff_python_parser::Mode::Module.into())
340 else {
341 return;
342 };
343 let ast = parsed.into_syntax();
344 let mut visitor = EscapeWarningVisitor {
345 source,
346 filename,
347 vm: self,
348 };
349 match ast {
350 ast::Mod::Module(module) => {
351 for stmt in &module.body {
352 visitor.visit_stmt(stmt);
353 }
354 }
355 ast::Mod::Expression(expr) => {
356 visitor.visit_expr(&expr.body);
357 }
358 }
359 }
360 }
361}