pyoxidizerlib/py_packaging/
distribution.rs

1// This Source Code Form is subject to the terms of the Mozilla Public
2// License, v. 2.0. If a copy of the MPL was not distributed with this
3// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4
5/*!
6Defining and manipulating Python distributions.
7*/
8
9use {
10    super::{
11        binary::{LibpythonLinkMode, PythonBinaryBuilder},
12        config::PyembedPythonInterpreterConfig,
13        standalone_distribution::StandaloneDistribution,
14    },
15    crate::{environment::Environment, python_distributions::PYTHON_DISTRIBUTIONS},
16    anyhow::{anyhow, Context, Result},
17    fs2::FileExt,
18    log::info,
19    python_packaging::{
20        bytecode::PythonBytecodeCompiler, module_util::PythonModuleSuffixes,
21        policy::PythonPackagingPolicy, resource::PythonResource,
22    },
23    sha2::{Digest, Sha256},
24    simple_file_manifest::FileEntry,
25    std::{
26        collections::HashMap,
27        fmt::{Display, Formatter},
28        fs,
29        fs::{create_dir_all, File},
30        io::Read,
31        ops::DerefMut,
32        path::{Path, PathBuf},
33        sync::{Arc, Mutex},
34    },
35    tugger_common::http::get_http_client,
36    url::Url,
37    uuid::Uuid,
38};
39
/// How a built binary should link against libpython.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum BinaryLibpythonLinkMode {
    /// Apply the default link mode semantics.
    Default,
    /// Link libpython statically into the binary.
    Static,
    /// Link libpython dynamically.
    Dynamic,
}
50
/// Where a Python distribution archive can be obtained from.
///
/// Both variants carry the hex-encoded SHA-256 digest the archive is
/// expected to have.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum PythonDistributionLocation {
    /// Archive stored at a local filesystem path.
    Local { local_path: String, sha256: String },
    /// Archive fetched from a URL.
    Url { url: String, sha256: String },
}
56
57impl std::fmt::Display for PythonDistributionLocation {
58    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
59        match self {
60            Self::Local { local_path, sha256 } => {
61                write!(f, "{} (sha256={})", local_path, sha256)
62            }
63            Self::Url { url, sha256 } => {
64                write!(f, "{} (sha256={})", url, sha256)
65            }
66        }
67    }
68}
69
70/// Describes an obtainable Python distribution.
71#[derive(Clone, Debug, PartialEq, Eq)]
72pub struct PythonDistributionRecord {
73    /// X.Y major.minor version of Python.
74    pub python_major_minor_version: String,
75
76    /// Where the distribution can be obtained from.
77    pub location: PythonDistributionLocation,
78
79    /// Rust target triple this distribution runs on.
80    pub target_triple: String,
81
82    /// Whether the distribution can load prebuilt extension modules.
83    pub supports_prebuilt_extension_modules: bool,
84}
85
/// Apple SDK build and targeting information.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct AppleSdkInfo {
    /// Canonical name of the Apple SDK that was used.
    pub canonical_name: String,
    /// Name of the SDK platform being targeted.
    pub platform: String,
    /// Version of the Apple SDK that was used.
    pub version: String,
    /// Deployment target version that was used.
    pub deployment_target: String,
}
98
99/// Describes a generic Python distribution.
100pub trait PythonDistribution {
101    /// Clone self into a Box'ed trait object.
102    fn clone_trait(&self) -> Arc<dyn PythonDistribution>;
103
104    /// The Rust machine triple this distribution runs on.
105    fn target_triple(&self) -> &str;
106
107    /// Rust target triples on which this distribution's binaries can run.
108    ///
109    /// For example, an x86 distribution might advertise that it can run on
110    /// 64-bit host triples.
111    ///
112    /// `target_triple()` is always in the result.
113    fn compatible_host_triples(&self) -> Vec<String>;
114
115    /// Obtain the filesystem path to a `python` executable for this distribution.
116    fn python_exe_path(&self) -> &Path;
117
118    /// Obtain the full Python version string.
119    fn python_version(&self) -> &str;
120
121    /// Obtain the X.Y Python version component. e.g. `3.7`.
122    fn python_major_minor_version(&self) -> String;
123
124    /// Obtain the full Python implementation name. e.g. `cpython`.
125    fn python_implementation(&self) -> &str;
126
127    /// Obtain the short Python implementation name. e.g. `cp`
128    fn python_implementation_short(&self) -> &str;
129
130    /// Obtain the PEP 425 Python tag. e.g. `cp38`.
131    fn python_tag(&self) -> &str;
132
133    /// Obtain the PEP 425 Python ABI tag. e.g. `cp38d`.
134    fn python_abi_tag(&self) -> Option<&str>;
135
136    /// Obtain the Python platform tag.
137    fn python_platform_tag(&self) -> &str;
138
139    /// Obtain the Python platform tag used to indicate compatibility.
140    ///
141    /// This is similar to the platform tag. But where `python_platform_tag()`
142    /// exposes the raw value like `linux-x86_64`, this is the normalized
143    /// value that can be used by tools like `pip`. e.g. `manylinux2014_x86_64`.
144    fn python_platform_compatibility_tag(&self) -> &str;
145
146    /// Obtain the cache tag to apply to Python bytecode modules.
147    fn cache_tag(&self) -> &str;
148
149    /// Obtain file suffixes for various Python module flavors.
150    fn python_module_suffixes(&self) -> Result<PythonModuleSuffixes>;
151
152    /// Python configuration variables.
153    fn python_config_vars(&self) -> &HashMap<String, String>;
154
155    /// Obtain Python packages in the standard library that provide tests.
156    fn stdlib_test_packages(&self) -> Vec<String>;
157
158    /// Obtain Apple SDK settings for this distribution.
159    fn apple_sdk_info(&self) -> Option<&AppleSdkInfo>;
160
161    /// Create a `PythonBytecodeCompiler` from this instance.
162    fn create_bytecode_compiler(
163        &self,
164        env: &Environment,
165    ) -> Result<Box<dyn PythonBytecodeCompiler>>;
166
167    /// Construct a `PythonPackagingPolicy` derived from this instance.
168    fn create_packaging_policy(&self) -> Result<PythonPackagingPolicy>;
169
170    /// Construct an `EmbeddedPythonConfig` derived from this instance.
171    fn create_python_interpreter_config(&self) -> Result<PyembedPythonInterpreterConfig>;
172
173    /// Obtain a `PythonBinaryBuilder` for constructing an executable embedding Python.
174    ///
175    /// This method is how you start the process of creating a new executable file
176    /// from a Python distribution. Using the returned `PythonBinaryBuilder` instance,
177    /// you can manipulate resources, etc and then eventually build a new executable
178    /// with it.
179    #[allow(clippy::too_many_arguments)]
180    fn as_python_executable_builder(
181        &self,
182        host_triple: &str,
183        target_triple: &str,
184        name: &str,
185        libpython_link_mode: BinaryLibpythonLinkMode,
186        policy: &PythonPackagingPolicy,
187        config: &PyembedPythonInterpreterConfig,
188        host_distribution: Option<Arc<dyn PythonDistribution>>,
189    ) -> Result<Box<dyn PythonBinaryBuilder>>;
190
191    /// Obtain `PythonResource` instances for every resource in this distribution.
192    fn python_resources<'a>(&self) -> Vec<PythonResource<'a>>;
193
194    /// Ensure pip is available to run in the distribution.
195    ///
196    /// Returns the path to a `pip` executable.
197    fn ensure_pip(&self) -> Result<PathBuf>;
198
199    /// Resolve a `distutils` installation used for building Python packages.
200    ///
201    /// Some distributions may need to use a modified `distutils` to coerce builds to work
202    /// as PyOxidizer desires. This method is used to realize such a `distutils` installation.
203    ///
204    /// Note that we pass in an explicit libpython link mode because the link mode
205    /// we care about may differ from the link mode of the distribution itself (as some
206    /// distributions support multiple link modes).
207    ///
208    /// The return is a map of environment variables to set in the build environment.
209    fn resolve_distutils(
210        &self,
211        libpython_link_mode: LibpythonLinkMode,
212        dest_dir: &Path,
213        extra_python_paths: &[&Path],
214    ) -> Result<HashMap<String, String>>;
215
216    /// Whether this distribution supports loading shared libraries from memory.
217    ///
218    /// This effectively answers whether we can embed a shared library into an
219    /// executable and load it without having to materialize it on a filesystem.
220    fn supports_in_memory_shared_library_loading(&self) -> bool;
221
222    /// Determine whether a named module is in a known standard library test package.
223    fn is_stdlib_test_package(&self, name: &str) -> bool {
224        for package in self.stdlib_test_packages() {
225            let prefix = format!("{}.", package);
226
227            if name == package || name.starts_with(&prefix) {
228                return true;
229            }
230        }
231
232        false
233    }
234
235    /// Obtain support files for tcl/tk.
236    ///
237    /// The returned list of files contains relative file names and the locations
238    /// of file content. If the files are installed in a new directory, it should
239    /// be possible to use that directory joined with `tcl_library_path_directory`
240    /// as the value of `TCL_LIBRARY`.
241    fn tcl_files(&self) -> Result<Vec<(PathBuf, FileEntry)>>;
242
243    /// The name of the directory to use for `TCL_LIBRARY`
244    fn tcl_library_path_directory(&self) -> Option<String>;
245}
246
/// Guard holding an exclusive lock used while extracting a distribution.
///
/// Several threads or processes may race to extract the same archive, so a
/// lock file is used to serialize access.
///
/// TODO use a more granular lock based on the output directory (possibly
/// by putting the lock in the output directory itself).
pub struct DistributionExtractLock {
    /// Open handle on the lock file; the lock is held through this handle.
    file: std::fs::File,
}
254
255impl DistributionExtractLock {
256    pub fn new(extract_dir: &Path) -> Result<Self> {
257        let lock_path = extract_dir
258            .parent()
259            .unwrap()
260            .join("distribution-extract-lock");
261
262        let file = File::create(&lock_path)
263            .context(format!("could not create {}", lock_path.display()))?;
264
265        file.lock_exclusive()
266            .context(format!("failed to obtain lock for {}", lock_path.display()))?;
267
268        Ok(DistributionExtractLock { file })
269    }
270}
271
272impl Drop for DistributionExtractLock {
273    fn drop(&mut self) {
274        self.file.unlock().unwrap();
275    }
276}
277
278fn sha256_path(path: &Path) -> Vec<u8> {
279    let mut hasher = Sha256::new();
280    let fh = File::open(path).unwrap();
281    let mut reader = std::io::BufReader::new(fh);
282
283    let mut buffer = [0; 32768];
284
285    loop {
286        let count = reader.read(&mut buffer).expect("error reading");
287        if count == 0 {
288            break;
289        }
290        hasher.update(&buffer[..count]);
291    }
292
293    hasher.finalize().to_vec()
294}
295
296/// Ensure a Python distribution at a URL is available in a local directory.
297///
298/// The path to the downloaded and validated file is returned.
299pub fn download_distribution(url: &str, sha256: &str, cache_dir: &Path) -> Result<PathBuf> {
300    let expected_hash = hex::decode(sha256)?;
301    let u = Url::parse(url)?;
302
303    let basename = u
304        .path_segments()
305        .expect("cannot be base path")
306        .last()
307        .unwrap()
308        .to_string();
309
310    let cache_path = cache_dir.join(basename);
311
312    if cache_path.exists() {
313        let file_hash = sha256_path(&cache_path);
314
315        // We don't care about timing side-channels from the string compare.
316        if file_hash == expected_hash {
317            return Ok(cache_path);
318        }
319    }
320
321    let mut data: Vec<u8> = Vec::new();
322
323    println!("downloading {}", u);
324    let client = get_http_client()?;
325    let mut response = client.get(u.as_str()).send()?;
326    response.read_to_end(&mut data)?;
327
328    let mut hasher = Sha256::new();
329    hasher.update(&data);
330
331    let url_hash = hasher.finalize().to_vec();
332    if url_hash != expected_hash {
333        return Err(anyhow!("sha256 of Python distribution does not validate"));
334    }
335
336    let mut temp_cache_path = cache_path.clone();
337    temp_cache_path.set_file_name(format!("{}.tmp", Uuid::new_v4()));
338
339    fs::write(&temp_cache_path, data).context("unable to write distribution file")?;
340
341    fs::rename(&temp_cache_path, &cache_path)
342        .or_else(|e| -> Result<()> {
343            fs::remove_file(&temp_cache_path)
344                .context("unable to remove temporary distribution file")?;
345
346            if cache_path.exists() {
347                download_distribution(url, sha256, cache_dir)?;
348                return Ok(());
349            }
350
351            Err(e.into())
352        })
353        .context("unable to rename downloaded distribution file")?;
354
355    Ok(cache_path)
356}
357
358pub fn copy_local_distribution(path: &Path, sha256: &str, cache_dir: &Path) -> Result<PathBuf> {
359    let expected_hash = hex::decode(sha256)?;
360    let basename = path.file_name().unwrap().to_str().unwrap().to_string();
361    let cache_path = cache_dir.join(basename);
362
363    if cache_path.exists() {
364        let file_hash = sha256_path(&cache_path);
365
366        if file_hash == expected_hash {
367            println!(
368                "existing {} passes SHA-256 integrity check",
369                cache_path.display()
370            );
371            return Ok(cache_path);
372        }
373    }
374
375    let source_hash = sha256_path(path);
376
377    if source_hash != expected_hash {
378        return Err(anyhow!("sha256 of Python distribution does not validate"));
379    }
380
381    println!("copying {}", path.display());
382    std::fs::copy(path, &cache_path)?;
383
384    Ok(cache_path)
385}
386
387/// Obtain a local Path for a Python distribution tar archive.
388///
389/// Takes a parsed config and a cache directory as input. Usually the cache
390/// directory is the OUT_DIR for the invocation of a Cargo build script.
391/// A Python distribution will be fetched according to the configuration and a
392/// copy of the archive placed in ``cache_dir``. If the archive already exists
393/// in ``cache_dir``, it will be verified and returned.
394///
395/// Local filesystem paths are preferred over remote URLs if both are defined.
396pub fn resolve_python_distribution_archive(
397    dist: &PythonDistributionLocation,
398    cache_dir: &Path,
399) -> Result<PathBuf> {
400    if !cache_dir.exists() {
401        create_dir_all(cache_dir).unwrap();
402    }
403
404    match dist {
405        PythonDistributionLocation::Local { local_path, sha256 } => {
406            let p = PathBuf::from(local_path);
407            copy_local_distribution(&p, sha256, cache_dir)
408        }
409        PythonDistributionLocation::Url { url, sha256 } => {
410            download_distribution(url, sha256, cache_dir)
411        }
412    }
413}
414
415/// Resolve a Python distribution archive.
416///
417/// Returns a tuple of (archive path, extract directory).
418pub fn resolve_python_distribution_from_location(
419    location: &PythonDistributionLocation,
420    distributions_dir: &Path,
421) -> Result<(PathBuf, PathBuf)> {
422    info!("resolving Python distribution {}", location);
423    let path = resolve_python_distribution_archive(location, distributions_dir)?;
424    info!("Python distribution available at {}", path.display());
425
426    let distribution_hash = match location {
427        PythonDistributionLocation::Local { sha256, .. } => sha256,
428        PythonDistributionLocation::Url { sha256, .. } => sha256,
429    };
430
431    let distribution_path = distributions_dir.join(format!("python.{}", &distribution_hash[0..12]));
432
433    Ok((path, distribution_path))
434}
435
/// The flavor of a Python distribution.
#[allow(clippy::enum_variant_names)]
#[derive(Debug, Eq, PartialEq)]
pub enum DistributionFlavor {
    /// A distribution from the `python-build-standalone` project.
    Standalone,

    /// A statically linked distribution from the `python-build-standalone` project.
    StandaloneStatic,

    /// A dynamically linked distribution from the `python-build-standalone` project.
    StandaloneDynamic,
}
449
450impl Default for DistributionFlavor {
451    fn default() -> Self {
452        DistributionFlavor::Standalone
453    }
454}
455
456impl Display for DistributionFlavor {
457    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
458        f.write_str(match self {
459            Self::Standalone => "standalone",
460            Self::StandaloneStatic => "standalone-static",
461            Self::StandaloneDynamic => "standalone-dynamic",
462        })
463    }
464}
465
466impl TryFrom<&str> for DistributionFlavor {
467    type Error = String;
468
469    fn try_from(value: &str) -> Result<Self, Self::Error> {
470        match value {
471            "standalone" => Ok(Self::Standalone),
472            "standalone_static" | "standalone-static" => Ok(Self::StandaloneStatic),
473            "standalone_dynamic" | "standalone-dynamic" => Ok(Self::StandaloneDynamic),
474            _ => Err(format!("distribution flavor {} not recognized", value)),
475        }
476    }
477}
478
/// Cache key: a (destination directory, distribution location) pair.
type DistributionCacheKey = (PathBuf, PythonDistributionLocation);
/// Cache value: a shared, individually lockable slot that holds `None`
/// until the distribution for its key has been resolved.
type DistributionCacheValue = Arc<Mutex<Option<Arc<StandaloneDistribution>>>>;
481
482/// Holds references to resolved PythonDistribution instances.
483#[derive(Debug)]
484pub struct DistributionCache {
485    cache: Mutex<HashMap<DistributionCacheKey, DistributionCacheValue>>,
486    default_dest_dir: Option<PathBuf>,
487}
488
489impl DistributionCache {
490    pub fn new(default_dest_dir: Option<&Path>) -> Self {
491        Self {
492            cache: Mutex::new(HashMap::new()),
493            default_dest_dir: default_dest_dir.map(|x| x.to_path_buf()),
494        }
495    }
496
497    /// Resolve a `PythonDistribution` given its source and storage locations.
498    pub fn resolve_distribution(
499        &self,
500        location: &PythonDistributionLocation,
501        dest_dir: Option<&Path>,
502    ) -> Result<Arc<StandaloneDistribution>> {
503        let dest_dir = if let Some(p) = dest_dir {
504            p
505        } else if let Some(p) = &self.default_dest_dir {
506            p
507        } else {
508            return Err(anyhow!("no destination directory available"));
509        };
510
511        let key = (dest_dir.to_path_buf(), location.clone());
512
513        // This logic is whack. Surely there's a cleaner way to do this...
514        //
515        // The general problem is instances of this type are Send + Sync. And
516        // we do rely on multiple threads simultaneously accessing it. This
517        // occurs in tests for example, which use a global/static instance to
518        // cache resolved distributions to drastically reduce CPU overhead.
519        //
520        // We need a Mutex of some kind around the HashMap to allow
521        // multi-threaded access. But if that was the only Mutex that existed, we'd
522        // need to hold the Mutex while any thread was resolving a distribution
523        // and this would prevent multi-threaded distribution resolving.
524        //
525        // Or we could release that Mutex after a missing lookup and then each
526        // thread would race to resolve the distribution and insert. That's fine,
527        // but it results in redundancy and wasted CPU (several minutes worth for
528        // debug builds in the test harness).
529        //
530        // What we do instead is have HashMap values be Arc<Mutex<Option<T>>>.
531        // We then perform a 2 phase lookup.
532        //
533        // In the 1st lock, we lock the entire HashMap and do the key lookup.
534        // If it exists, we clone the Arc<T>. Else if it is missing, we insert
535        // a new key with `None` and return a clone of its Arc<T>. Either way,
536        // we have a handle on the Arc<Mutex<Option<T>>> in a populated. We then
537        // release the outer HashMap lock.
538        //
539        // We then lock the inner entry. With that lock hold, we return a clone
540        // of its `Some(T)` entry immediately or proceed to populate it. Only 1
541        // thread can hold this lock, ensuring only 1 thread performs the
542        // value resolution. Multiple threads can resolve different keys in
543        // parallel. By other threads will be blocked resolving a single key.
544
545        let entry = {
546            let mut lock = self
547                .cache
548                .lock()
549                .map_err(|e| anyhow!("cannot obtain distribution cache lock: {}", e))?;
550
551            if let Some(value) = lock.get(&key) {
552                value.clone()
553            } else {
554                let value = Arc::new(Mutex::new(None));
555                lock.insert(key.clone(), value.clone());
556
557                value
558            }
559        };
560
561        let mut lock = entry
562            .lock()
563            .map_err(|e| anyhow!("cannot obtain distribution lock: {}", e))?;
564
565        let value = lock.deref_mut();
566
567        if let Some(dist) = value {
568            Ok(dist.clone())
569        } else {
570            let dist = Arc::new(StandaloneDistribution::from_location(location, dest_dir)?);
571
572            lock.replace(dist.clone());
573
574            Ok(dist)
575        }
576    }
577
578    /// Resolve a Python distribution that runs on the current machine.
579    pub fn host_distribution(
580        &self,
581        python_major_minor_version: Option<&str>,
582        dest_dir: Option<&Path>,
583    ) -> Result<Arc<StandaloneDistribution>> {
584        let location = default_distribution_location(
585            &DistributionFlavor::Standalone,
586            crate::environment::default_target_triple(),
587            python_major_minor_version,
588        )
589        .context("resolving host distribution location")?;
590
591        self.resolve_distribution(&location, dest_dir)
592            .context("resolving host distribution from location")
593    }
594}
595
596/// Obtain a `PythonDistribution` implementation of a flavor and from a location.
597///
598/// The distribution will be written to `dest_dir`.
599#[allow(unused)]
600pub fn resolve_distribution(
601    location: &PythonDistributionLocation,
602    dest_dir: &Path,
603) -> Result<Box<dyn PythonDistribution>> {
604    Ok(
605        Box::new(StandaloneDistribution::from_location(location, dest_dir)?)
606            as Box<dyn PythonDistribution>,
607    )
608}
609
610/// Resolve the location of the default Python distribution of a given flavor and build target.
611pub fn default_distribution_location(
612    flavor: &DistributionFlavor,
613    target: &str,
614    python_major_minor_version: Option<&str>,
615) -> Result<PythonDistributionLocation> {
616    let dist = PYTHON_DISTRIBUTIONS
617        .find_distribution(target, flavor, python_major_minor_version)
618        .ok_or_else(|| anyhow!("could not find default Python distribution for {}", target))?;
619
620    Ok(dist.location)
621}
622
#[cfg(test)]
mod tests {
    use {super::*, crate::testutil::*};

    /// At least one standalone distribution must be resolvable.
    #[test]
    fn test_all_standalone_distributions() -> Result<()> {
        let distributions = get_all_standalone_distributions()?;
        assert!(!distributions.is_empty());

        Ok(())
    }
}
633}