python_packaging/
python_source.rs

1// Copyright 2022 Gregory Szorc.
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9/*! Utility functions related to Python source code. */
10
11use {anyhow::Result, once_cell::sync::Lazy};
12
13static RE_CODING: Lazy<regex::bytes::Regex> = Lazy::new(|| {
14    regex::bytes::Regex::new(r"^[ \t\f]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)").unwrap()
15});
16
17/// Derive the source encoding from Python source code.
18pub fn python_source_encoding(source: &[u8]) -> Vec<u8> {
19    // Default source encoding is UTF-8. But per PEP 263, the first or second
20    // line of source can match a regular expression to define a custom
21    // encoding.
22    let lines = source.split(|v| v == &b'\n');
23
24    for (i, line) in lines.enumerate() {
25        if i > 1 {
26            break;
27        }
28
29        if let Some(m) = RE_CODING.find(line) {
30            return m.as_bytes().to_vec();
31        }
32    }
33
34    b"utf-8".to_vec()
35}
36
37/// Whether __file__ occurs in Python source code.
38pub fn has_dunder_file(source: &[u8]) -> Result<bool> {
39    // We can't just look for b"__file__ because the source file may be in
40    // encodings like UTF-16. So we need to decode to Unicode first then look for
41    // the code points.
42    let encoding = python_source_encoding(source);
43
44    let encoder = match encoding_rs::Encoding::for_label(&encoding) {
45        Some(encoder) => encoder,
46        None => encoding_rs::UTF_8,
47    };
48
49    let (source, ..) = encoder.decode(source);
50
51    Ok(source.contains("__file__"))
52}