python_packaging/python_source.rs
1// Copyright 2022 Gregory Szorc.
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9/*! Utility functions related to Python source code. */
10
11use {anyhow::Result, once_cell::sync::Lazy};
12
13static RE_CODING: Lazy<regex::bytes::Regex> = Lazy::new(|| {
14 regex::bytes::Regex::new(r"^[ \t\f]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)").unwrap()
15});
16
17/// Derive the source encoding from Python source code.
18pub fn python_source_encoding(source: &[u8]) -> Vec<u8> {
19 // Default source encoding is UTF-8. But per PEP 263, the first or second
20 // line of source can match a regular expression to define a custom
21 // encoding.
22 let lines = source.split(|v| v == &b'\n');
23
24 for (i, line) in lines.enumerate() {
25 if i > 1 {
26 break;
27 }
28
29 if let Some(m) = RE_CODING.find(line) {
30 return m.as_bytes().to_vec();
31 }
32 }
33
34 b"utf-8".to_vec()
35}
36
37/// Whether __file__ occurs in Python source code.
38pub fn has_dunder_file(source: &[u8]) -> Result<bool> {
39 // We can't just look for b"__file__ because the source file may be in
40 // encodings like UTF-16. So we need to decode to Unicode first then look for
41 // the code points.
42 let encoding = python_source_encoding(source);
43
44 let encoder = match encoding_rs::Encoding::for_label(&encoding) {
45 Some(encoder) => encoder,
46 None => encoding_rs::UTF_8,
47 };
48
49 let (source, ..) = encoder.decode(source);
50
51 Ok(source.contains("__file__"))
52}